Diffstat (limited to 'arch')
-rw-r--r--  arch/Kconfig | 5
-rw-r--r--  arch/alpha/include/asm/agp.h | 4
-rw-r--r--  arch/alpha/kernel/pci-sysfs.c | 12
-rw-r--r--  arch/arc/Kconfig | 45
-rw-r--r--  arch/arc/include/asm/atomic-llsc.h | 97
-rw-r--r--  arch/arc/include/asm/atomic-spinlock.h | 102
-rw-r--r--  arch/arc/include/asm/atomic.h | 444
-rw-r--r--  arch/arc/include/asm/atomic64-arcv2.h | 250
-rw-r--r--  arch/arc/include/asm/bitops.h | 188
-rw-r--r--  arch/arc/include/asm/cache.h | 4
-rw-r--r--  arch/arc/include/asm/cmpxchg.h | 233
-rw-r--r--  arch/arc/include/asm/entry-compact.h | 8
-rw-r--r--  arch/arc/include/asm/hugepage.h | 8
-rw-r--r--  arch/arc/include/asm/mmu-arcv2.h | 103
-rw-r--r--  arch/arc/include/asm/mmu.h | 87
-rw-r--r--  arch/arc/include/asm/mmu_context.h | 28
-rw-r--r--  arch/arc/include/asm/page.h | 74
-rw-r--r--  arch/arc/include/asm/pgalloc.h | 81
-rw-r--r--  arch/arc/include/asm/pgtable-bits-arcv2.h | 149
-rw-r--r--  arch/arc/include/asm/pgtable-levels.h | 189
-rw-r--r--  arch/arc/include/asm/pgtable.h | 339
-rw-r--r--  arch/arc/include/asm/processor.h | 2
-rw-r--r--  arch/arc/include/asm/setup.h | 12
-rw-r--r--  arch/arc/include/asm/smp.h | 14
-rw-r--r--  arch/arc/include/asm/tlb-mmu1.h | 101
-rw-r--r--  arch/arc/kernel/entry-arcv2.S | 1
-rw-r--r--  arch/arc/kernel/entry.S | 7
-rw-r--r--  arch/arc/kernel/intc-compact.c | 2
-rw-r--r--  arch/arc/kernel/smp.c | 4
-rw-r--r--  arch/arc/kernel/stacktrace.c | 2
-rw-r--r--  arch/arc/kernel/traps.c | 5
-rw-r--r--  arch/arc/mm/cache.c | 112
-rw-r--r--  arch/arc/mm/fault.c | 20
-rw-r--r--  arch/arc/mm/init.c | 5
-rw-r--r--  arch/arc/mm/ioremap.c | 3
-rw-r--r--  arch/arc/mm/tlb.c | 268
-rw-r--r--  arch/arc/mm/tlbex.S | 84
-rw-r--r--  arch/arm/Kconfig | 6
-rw-r--r--  arch/arm/Makefile | 3
-rw-r--r--  arch/arm/boot/Makefile | 14
-rw-r--r--  arch/arm/boot/compressed/Makefile | 2
-rw-r--r--  arch/arm/boot/dts/omap34xx.dtsi | 1
-rw-r--r--  arch/arm/boot/dts/omap36xx.dtsi | 1
-rw-r--r--  arch/arm/configs/dove_defconfig | 1
-rw-r--r--  arch/arm/configs/pxa_defconfig | 1
-rw-r--r--  arch/arm/include/asm/div64.h | 11
-rw-r--r--  arch/arm/include/asm/gpio.h | 4
-rw-r--r--  arch/arm/include/asm/ptrace.h | 1
-rw-r--r--  arch/arm/include/asm/syscall.h | 16
-rw-r--r--  arch/arm/include/asm/thread_info.h | 6
-rw-r--r--  arch/arm/include/asm/uaccess-asm.h | 6
-rw-r--r--  arch/arm/include/asm/uaccess.h | 169
-rw-r--r--  arch/arm/include/asm/unified.h | 4
-rw-r--r--  arch/arm/include/uapi/asm/unistd.h | 1
-rw-r--r--  arch/arm/kernel/asm-offsets.c | 3
-rw-r--r--  arch/arm/kernel/entry-common.S | 20
-rw-r--r--  arch/arm/kernel/process.c | 7
-rw-r--r--  arch/arm/kernel/ptrace.c | 14
-rw-r--r--  arch/arm/kernel/signal.c | 8
-rw-r--r--  arch/arm/kernel/sys_oabi-compat.c | 216
-rw-r--r--  arch/arm/kernel/traps.c | 52
-rw-r--r--  arch/arm/lib/copy_from_user.S | 3
-rw-r--r--  arch/arm/lib/copy_to_user.S | 3
-rw-r--r--  arch/arm/tools/syscall.tbl | 2
-rw-r--r--  arch/arm64/Kconfig | 4
-rw-r--r--  arch/arm64/include/asm/compat.h | 5
-rw-r--r--  arch/arm64/include/asm/cpufeature.h | 18
-rw-r--r--  arch/arm64/include/asm/kvm_arm.h | 54
-rw-r--r--  arch/arm64/include/asm/kvm_asm.h | 7
-rw-r--r--  arch/arm64/include/asm/kvm_host.h | 17
-rw-r--r--  arch/arm64/include/asm/kvm_hyp.h | 2
-rw-r--r--  arch/arm64/include/asm/kvm_mmu.h | 17
-rw-r--r--  arch/arm64/include/asm/kvm_pgtable.h | 168
-rw-r--r--  arch/arm64/include/asm/sysreg.h | 26
-rw-r--r--  arch/arm64/include/asm/uaccess.h | 11
-rw-r--r--  arch/arm64/include/asm/unistd32.h | 10
-rw-r--r--  arch/arm64/kernel/cpufeature.c | 8
-rw-r--r--  arch/arm64/kernel/pci.c | 29
-rw-r--r--  arch/arm64/kernel/vmlinux.lds.S | 4
-rw-r--r--  arch/arm64/kvm/Kconfig | 10
-rw-r--r--  arch/arm64/kvm/arm.c | 161
-rw-r--r--  arch/arm64/kvm/debug.c | 2
-rw-r--r--  arch/arm64/kvm/guest.c | 9
-rw-r--r--  arch/arm64/kvm/handle_exit.c | 43
-rw-r--r--  arch/arm64/kvm/hyp/include/hyp/switch.h | 6
-rw-r--r--  arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 35
-rw-r--r--  arch/arm64/kvm/hyp/include/nvhe/mm.h | 3
-rw-r--r--  arch/arm64/kvm/hyp/include/nvhe/spinlock.h | 25
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/debug-sr.c | 2
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/host.S | 21
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/hyp-main.c | 20
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/mem_protect.c | 244
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/mm.c | 22
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/setup.c | 82
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/switch.c | 17
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/tlb.c | 4
-rw-r--r--  arch/arm64/kvm/hyp/pgtable.c | 247
-rw-r--r--  arch/arm64/kvm/hyp/vhe/debug-sr.c | 2
-rw-r--r--  arch/arm64/kvm/hyp/vhe/switch.c | 18
-rw-r--r--  arch/arm64/kvm/hyp/vhe/sysreg-sr.c | 2
-rw-r--r--  arch/arm64/kvm/hyp/vhe/tlb.c | 4
-rw-r--r--  arch/arm64/kvm/mmu.c | 76
-rw-r--r--  arch/arm64/kvm/perf.c | 2
-rw-r--r--  arch/arm64/kvm/pmu-emul.c | 14
-rw-r--r--  arch/arm64/kvm/psci.c | 15
-rw-r--r--  arch/arm64/kvm/reset.c | 43
-rw-r--r--  arch/arm64/kvm/sys_regs.c | 134
-rw-r--r--  arch/arm64/kvm/sys_regs.h | 31
-rw-r--r--  arch/arm64/kvm/trace_handle_exit.h | 10
-rw-r--r--  arch/arm64/kvm/vgic/vgic-mmio-v2.c | 4
-rw-r--r--  arch/arm64/kvm/vgic/vgic-v2.c | 36
-rw-r--r--  arch/arm64/kvm/vgic/vgic-v3.c | 36
-rw-r--r--  arch/arm64/kvm/vgic/vgic.c | 39
-rw-r--r--  arch/arm64/kvm/vgic/vgic.h | 2
-rw-r--r--  arch/arm64/lib/Makefile | 2
-rw-r--r--  arch/arm64/lib/copy_in_user.S | 77
-rw-r--r--  arch/arm64/mm/init.c | 22
-rw-r--r--  arch/arm64/mm/mmu.c | 3
-rw-r--r--  arch/csky/Kconfig | 4
-rw-r--r--  arch/h8300/kernel/traps.c | 4
-rw-r--r--  arch/hexagon/Kconfig | 4
-rw-r--r--  arch/hexagon/kernel/traps.c | 4
-rw-r--r--  arch/ia64/mm/init.c | 3
-rw-r--r--  arch/microblaze/Kbuild | 4
-rw-r--r--  arch/microblaze/Kconfig | 1
-rw-r--r--  arch/microblaze/Kconfig.debug | 5
-rw-r--r--  arch/microblaze/Makefile | 5
-rw-r--r--  arch/mips/Kconfig | 1
-rw-r--r--  arch/mips/Kconfig.debug | 4
-rw-r--r--  arch/mips/cavium-octeon/octeon-memcpy.S | 2
-rw-r--r--  arch/mips/configs/lemote2f_defconfig | 1
-rw-r--r--  arch/mips/configs/pic32mzda_defconfig | 1
-rw-r--r--  arch/mips/configs/rt305x_defconfig | 1
-rw-r--r--  arch/mips/configs/xway_defconfig | 1
-rw-r--r--  arch/mips/include/asm/compat.h | 8
-rw-r--r--  arch/mips/include/asm/uaccess.h | 26
-rw-r--r--  arch/mips/kernel/syscalls/syscall_n32.tbl | 10
-rw-r--r--  arch/mips/kernel/syscalls/syscall_o32.tbl | 10
-rw-r--r--  arch/mips/kvm/mips.c | 4
-rw-r--r--  arch/mips/kvm/vz.c | 3
-rw-r--r--  arch/mips/lib/memcpy.S | 11
-rw-r--r--  arch/nds32/Kconfig | 4
-rw-r--r--  arch/nds32/kernel/setup.c | 1
-rw-r--r--  arch/nds32/kernel/traps.c | 5
-rw-r--r--  arch/nios2/Kconfig | 3
-rw-r--r--  arch/nios2/kernel/traps.c | 5
-rw-r--r--  arch/openrisc/Kconfig | 4
-rw-r--r--  arch/openrisc/kernel/traps.c | 5
-rw-r--r--  arch/parisc/Kconfig | 3
-rw-r--r--  arch/parisc/Kconfig.debug | 3
-rw-r--r--  arch/parisc/boot/compressed/Makefile | 18
-rw-r--r--  arch/parisc/configs/generic-32bit_defconfig | 1
-rw-r--r--  arch/parisc/include/asm/compat.h | 6
-rw-r--r--  arch/parisc/include/asm/processor.h | 4
-rw-r--r--  arch/parisc/include/asm/rt_sigframe.h | 2
-rw-r--r--  arch/parisc/include/asm/thread_info.h | 2
-rw-r--r--  arch/parisc/include/asm/uaccess.h | 127
-rw-r--r--  arch/parisc/kernel/asm-offsets.c | 1
-rw-r--r--  arch/parisc/kernel/parisc_ksyms.c | 1
-rw-r--r--  arch/parisc/kernel/setup.c | 2
-rw-r--r--  arch/parisc/kernel/signal.c | 45
-rw-r--r--  arch/parisc/kernel/signal32.h | 2
-rw-r--r--  arch/parisc/kernel/syscalls/syscall.tbl | 8
-rw-r--r--  arch/parisc/kernel/time.c | 7
-rw-r--r--  arch/parisc/kernel/traps.c | 4
-rw-r--r--  arch/parisc/lib/lusercopy.S | 52
-rw-r--r--  arch/parisc/lib/memcpy.c | 9
-rw-r--r--  arch/powerpc/Kconfig | 5
-rw-r--r--  arch/powerpc/include/asm/compat.h | 16
-rw-r--r--  arch/powerpc/include/asm/kvm_host.h | 1
-rw-r--r--  arch/powerpc/kernel/syscalls/syscall.tbl | 10
-rw-r--r--  arch/powerpc/kernel/traps.c | 5
-rw-r--r--  arch/powerpc/kvm/book3s.c | 5
-rw-r--r--  arch/powerpc/kvm/book3s_64_vio.c | 2
-rw-r--r--  arch/powerpc/kvm/book3s_64_vio_hv.c | 2
-rw-r--r--  arch/powerpc/kvm/book3s_hv.c | 18
-rw-r--r--  arch/powerpc/kvm/booke.c | 5
-rw-r--r--  arch/powerpc/mm/mem.c | 3
-rw-r--r--  arch/powerpc/platforms/pseries/hotplug-memory.c | 9
-rw-r--r--  arch/riscv/Kconfig | 6
-rw-r--r--  arch/riscv/kernel/traps.c | 5
-rw-r--r--  arch/s390/Kconfig | 2
-rw-r--r--  arch/s390/Kconfig.debug | 3
-rw-r--r--  arch/s390/configs/debug_defconfig | 1
-rw-r--r--  arch/s390/configs/defconfig | 1
-rw-r--r--  arch/s390/configs/zfcpdump_defconfig | 1
-rw-r--r--  arch/s390/include/asm/compat.h | 10
-rw-r--r--  arch/s390/include/asm/cpu_mcf.h | 7
-rw-r--r--  arch/s390/include/asm/kvm_host.h | 2
-rw-r--r--  arch/s390/include/asm/smp.h | 1
-rw-r--r--  arch/s390/include/asm/stacktrace.h | 20
-rw-r--r--  arch/s390/include/asm/uaccess.h | 3
-rw-r--r--  arch/s390/include/asm/unwind.h | 8
-rw-r--r--  arch/s390/kernel/entry.S | 4
-rw-r--r--  arch/s390/kernel/ftrace.c | 4
-rw-r--r--  arch/s390/kernel/perf_cpum_cf.c | 8
-rw-r--r--  arch/s390/kernel/setup.c | 10
-rw-r--r--  arch/s390/kernel/smp.c | 9
-rw-r--r--  arch/s390/kernel/syscalls/syscall.tbl | 10
-rw-r--r--  arch/s390/kernel/topology.c | 13
-rw-r--r--  arch/s390/kvm/interrupt.c | 12
-rw-r--r--  arch/s390/kvm/kvm-s390.c | 12
-rw-r--r--  arch/s390/kvm/kvm-s390.h | 2
-rw-r--r--  arch/s390/kvm/vsie.c | 2
-rw-r--r--  arch/s390/lib/uaccess.c | 63
-rw-r--r--  arch/s390/mm/fault.c | 2
-rw-r--r--  arch/s390/mm/gmap.c | 11
-rw-r--r--  arch/s390/mm/init.c | 3
-rw-r--r--  arch/s390/mm/pgtable.c | 4
-rw-r--r--  arch/s390/pci/pci_clp.c | 9
-rw-r--r--  arch/sh/Kconfig | 1
-rw-r--r--  arch/sh/Kconfig.debug | 3
-rw-r--r--  arch/sh/mm/init.c | 3
-rw-r--r--  arch/sparc/Kconfig | 1
-rw-r--r--  arch/sparc/Kconfig.debug | 4
-rw-r--r--  arch/sparc/include/asm/compat.h | 19
-rw-r--r--  arch/sparc/kernel/process_64.c | 2
-rw-r--r--  arch/sparc/kernel/signal32.c | 12
-rw-r--r--  arch/sparc/kernel/signal_64.c | 8
-rw-r--r--  arch/sparc/kernel/syscalls/syscall.tbl | 10
-rw-r--r--  arch/um/Kconfig | 6
-rw-r--r--  arch/um/drivers/virt-pci.c | 108
-rw-r--r--  arch/um/drivers/virtio_uml.c | 5
-rw-r--r--  arch/um/kernel/skas/clone.c | 3
-rw-r--r--  arch/um/kernel/trap.c | 4
-rw-r--r--  arch/x86/Kconfig | 1
-rw-r--r--  arch/x86/Kconfig.debug | 3
-rw-r--r--  arch/x86/configs/i386_defconfig | 1
-rw-r--r--  arch/x86/configs/x86_64_defconfig | 1
-rw-r--r--  arch/x86/entry/syscalls/syscall_32.tbl | 4
-rw-r--r--  arch/x86/entry/syscalls/syscall_64.tbl | 2
-rw-r--r--  arch/x86/include/asm/compat.h | 13
-rw-r--r--  arch/x86/include/asm/kvm-x86-ops.h | 1
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 96
-rw-r--r--  arch/x86/include/asm/uaccess_64.h | 7
-rw-r--r--  arch/x86/include/uapi/asm/kvm.h | 1
-rw-r--r--  arch/x86/kernel/kvm.c | 5
-rw-r--r--  arch/x86/kvm/debugfs.c | 111
-rw-r--r--  arch/x86/kvm/hyperv.c | 32
-rw-r--r--  arch/x86/kvm/i8254.c | 3
-rw-r--r--  arch/x86/kvm/ioapic.h | 4
-rw-r--r--  arch/x86/kvm/lapic.c | 26
-rw-r--r--  arch/x86/kvm/mmu.h | 25
-rw-r--r--  arch/x86/kvm/mmu/mmu.c | 524
-rw-r--r--  arch/x86/kvm/mmu/mmu_audit.c | 4
-rw-r--r--  arch/x86/kvm/mmu/mmu_internal.h | 18
-rw-r--r--  arch/x86/kvm/mmu/mmutrace.h | 6
-rw-r--r--  arch/x86/kvm/mmu/page_track.c | 1
-rw-r--r--  arch/x86/kvm/mmu/paging_tmpl.h | 6
-rw-r--r--  arch/x86/kvm/mmu/tdp_mmu.c | 139
-rw-r--r--  arch/x86/kvm/mmu/tdp_mmu.h | 29
-rw-r--r--  arch/x86/kvm/pmu.c | 5
-rw-r--r--  arch/x86/kvm/pmu.h | 2
-rw-r--r--  arch/x86/kvm/svm/avic.c | 49
-rw-r--r--  arch/x86/kvm/svm/nested.c | 5
-rw-r--r--  arch/x86/kvm/svm/sev.c | 3
-rw-r--r--  arch/x86/kvm/svm/svm.c | 97
-rw-r--r--  arch/x86/kvm/svm/svm.h | 8
-rw-r--r--  arch/x86/kvm/svm/svm_ops.h | 2
-rw-r--r--  arch/x86/kvm/vmx/evmcs.c | 1
-rw-r--r--  arch/x86/kvm/vmx/evmcs.h | 4
-rw-r--r--  arch/x86/kvm/vmx/nested.c | 56
-rw-r--r--  arch/x86/kvm/vmx/pmu_intel.c | 4
-rw-r--r--  arch/x86/kvm/vmx/vmcs.h | 2
-rw-r--r--  arch/x86/kvm/vmx/vmcs12.c | 1
-rw-r--r--  arch/x86/kvm/vmx/vmcs12.h | 4
-rw-r--r--  arch/x86/kvm/vmx/vmx.c | 333
-rw-r--r--  arch/x86/kvm/vmx/vmx.h | 38
-rw-r--r--  arch/x86/kvm/vmx/vmx_ops.h | 4
-rw-r--r--  arch/x86/kvm/x86.c | 189
-rw-r--r--  arch/x86/kvm/x86.h | 2
-rw-r--r--  arch/x86/kvm/xen.c | 23
-rw-r--r--  arch/x86/kvm/xen.h | 5
-rw-r--r--  arch/x86/mm/init_32.c | 3
-rw-r--r--  arch/x86/mm/init_64.c | 3
-rw-r--r--  arch/x86/pci/numachip.c | 1
-rw-r--r--  arch/x86/pci/sta2x11-fixup.c | 3
-rw-r--r--  arch/x86/um/shared/sysdep/stub_32.h | 12
-rw-r--r--  arch/x86/um/shared/sysdep/stub_64.h | 12
-rw-r--r--  arch/x86/um/stub_segv.c | 3
-rw-r--r--  arch/xtensa/Kconfig | 4
281 files changed, 4159 insertions, 4313 deletions
diff --git a/arch/Kconfig b/arch/Kconfig
index 98db63496bab..8df1c7102643 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -197,6 +197,9 @@ config HAVE_FUNCTION_ERROR_INJECTION
config HAVE_NMI
bool
+config TRACE_IRQFLAGS_SUPPORT
+ bool
+
#
# An arch should select this if it provides all these things:
#
@@ -886,7 +889,7 @@ config HAVE_SOFTIRQ_ON_OWN_STACK
bool
help
Architecture provides a function to run __do_softirq() on a
- seperate stack.
+ separate stack.
config PGTABLE_LEVELS
int
diff --git a/arch/alpha/include/asm/agp.h b/arch/alpha/include/asm/agp.h
index 7173eada1567..7874f063d000 100644
--- a/arch/alpha/include/asm/agp.h
+++ b/arch/alpha/include/asm/agp.h
@@ -6,8 +6,8 @@
/* dummy for now */
-#define map_page_into_agp(page)
-#define unmap_page_from_agp(page)
+#define map_page_into_agp(page) do { } while (0)
+#define unmap_page_from_agp(page) do { } while (0)
#define flush_agp_cache() mb()
/* GATT allocation. Returns/accepts GATT kernel virtual address. */
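The do { } while (0) replacement above matters because an empty macro expansion leaves a bare semicolon at the call site. A minimal sketch of the pitfall, with a hypothetical caller (not part of this patch):

	/* hypothetical caller, illustration only */
	static void agp_setup(bool need_agp, struct page *page)
	{
		if (need_agp)
			map_page_into_agp(page);	/* with the old empty #define this reduces to
							 * "if (need_agp) ;" - an empty body that
							 * -Wempty-body style checks warn about.
							 * do { } while (0) keeps the call a real
							 * statement that requires the ';' and
							 * composes safely inside if/else. */
		else
			flush_agp_cache();
	}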
diff --git a/arch/alpha/kernel/pci-sysfs.c b/arch/alpha/kernel/pci-sysfs.c
index 0021580d79ad..5808a66e2a81 100644
--- a/arch/alpha/kernel/pci-sysfs.c
+++ b/arch/alpha/kernel/pci-sysfs.c
@@ -60,6 +60,8 @@ static int __pci_mmap_fits(struct pci_dev *pdev, int num,
* @sparse: address space type
*
* Use the bus mapping routines to map a PCI resource into userspace.
+ *
+ * Return: %0 on success, negative error code otherwise
*/
static int pci_mmap_resource(struct kobject *kobj,
struct bin_attribute *attr,
@@ -106,7 +108,7 @@ static int pci_mmap_resource_dense(struct file *filp, struct kobject *kobj,
/**
* pci_remove_resource_files - cleanup resource files
- * @dev: dev to cleanup
+ * @pdev: pci_dev to cleanup
*
* If we created resource files for @dev, remove them from sysfs and
* free their resources.
@@ -221,10 +223,12 @@ static int pci_create_attr(struct pci_dev *pdev, int num)
}
/**
- * pci_create_resource_files - create resource files in sysfs for @dev
- * @dev: dev in question
+ * pci_create_resource_files - create resource files in sysfs for @pdev
+ * @pdev: pci_dev in question
*
* Walk the resources in @dev creating files for each resource available.
+ *
+ * Return: %0 on success, or negative error code
*/
int pci_create_resource_files(struct pci_dev *pdev)
{
@@ -296,7 +300,7 @@ int pci_mmap_legacy_page_range(struct pci_bus *bus, struct vm_area_struct *vma,
/**
* pci_adjust_legacy_attr - adjustment of legacy file attributes
- * @b: bus to create files under
+ * @bus: bus to create files under
* @mmap_type: I/O port or memory
*
* Adjust file name and size for sparse mappings.
diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index b5bf68e74732..3a5a80f302e1 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -49,9 +49,7 @@ config ARC
select PERF_USE_VMALLOC if ARC_CACHE_VIPT_ALIASING
select HAVE_ARCH_JUMP_LABEL if ISA_ARCV2 && !CPU_ENDIAN_BE32
select SET_FS
-
-config TRACE_IRQFLAGS_SUPPORT
- def_bool y
+ select TRACE_IRQFLAGS_SUPPORT
config LOCKDEP_SUPPORT
def_bool y
@@ -116,16 +114,9 @@ choice
default ARC_CPU_770 if ISA_ARCOMPACT
default ARC_CPU_HS if ISA_ARCV2
-if ISA_ARCOMPACT
-
-config ARC_CPU_750D
- bool "ARC750D"
- select ARC_CANT_LLSC
- help
- Support for ARC750 core
-
config ARC_CPU_770
bool "ARC770"
+ depends on ISA_ARCOMPACT
select ARC_HAS_SWAPE
help
Support for ARC770 core introduced with Rel 4.10 (Summer 2011)
@@ -135,8 +126,6 @@ config ARC_CPU_770
-Caches: New Prog Model, Region Flush
-Insns: endian swap, load-locked/store-conditional, time-stamp-ctr
-endif #ISA_ARCOMPACT
-
config ARC_CPU_HS
bool "ARC-HS"
depends on ISA_ARCV2
@@ -274,33 +263,17 @@ config ARC_DCCM_BASE
choice
prompt "MMU Version"
- default ARC_MMU_V3 if ARC_CPU_770
- default ARC_MMU_V2 if ARC_CPU_750D
- default ARC_MMU_V4 if ARC_CPU_HS
-
-if ISA_ARCOMPACT
-
-config ARC_MMU_V1
- bool "MMU v1"
- help
- Orig ARC700 MMU
-
-config ARC_MMU_V2
- bool "MMU v2"
- help
- Fixed the deficiency of v1 - possible thrashing in memcpy scenario
- when 2 D-TLB and 1 I-TLB entries index into same 2way set.
+ default ARC_MMU_V3 if ISA_ARCOMPACT
+ default ARC_MMU_V4 if ISA_ARCV2
config ARC_MMU_V3
bool "MMU v3"
- depends on ARC_CPU_770
+ depends on ISA_ARCOMPACT
help
Introduced with ARC700 4.10: New Features
Variable Page size (1k-16k), var JTLB size 128 x (2 or 4)
Shared Address Spaces (SASID)
-endif
-
config ARC_MMU_V4
bool "MMU v4"
depends on ISA_ARCV2
@@ -319,7 +292,6 @@ config ARC_PAGE_SIZE_8K
config ARC_PAGE_SIZE_16K
bool "16KB"
- depends on ARC_MMU_V3 || ARC_MMU_V4
config ARC_PAGE_SIZE_4K
bool "4KB"
@@ -340,6 +312,10 @@ config ARC_HUGEPAGE_16M
endchoice
+config PGTABLE_LEVELS
+ int "Number of Page table levels"
+ default 2
+
config ARC_COMPACT_IRQ_LEVELS
depends on ISA_ARCOMPACT
bool "Setup Timer IRQ as high Priority"
@@ -563,9 +539,6 @@ config ARC_DW2_UNWIND
If you don't debug the kernel, you can say N, but we may not be able
to solve problems without frame unwind information
-config ARC_DBG_TLB_PARANOIA
- bool "Paranoia Checks in Low Level TLB Handlers"
-
config ARC_DBG_JUMP_LABEL
bool "Paranoid checks in Static Keys (jump labels) code"
depends on JUMP_LABEL
diff --git a/arch/arc/include/asm/atomic-llsc.h b/arch/arc/include/asm/atomic-llsc.h
new file mode 100644
index 000000000000..088d348781c1
--- /dev/null
+++ b/arch/arc/include/asm/atomic-llsc.h
@@ -0,0 +1,97 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef _ASM_ARC_ATOMIC_LLSC_H
+#define _ASM_ARC_ATOMIC_LLSC_H
+
+#define arch_atomic_set(v, i) WRITE_ONCE(((v)->counter), (i))
+
+#define ATOMIC_OP(op, c_op, asm_op) \
+static inline void arch_atomic_##op(int i, atomic_t *v) \
+{ \
+ unsigned int val; \
+ \
+ __asm__ __volatile__( \
+ "1: llock %[val], [%[ctr]] \n" \
+ " " #asm_op " %[val], %[val], %[i] \n" \
+ " scond %[val], [%[ctr]] \n" \
+ " bnz 1b \n" \
+ : [val] "=&r" (val) /* Early clobber to prevent reg reuse */ \
+ : [ctr] "r" (&v->counter), /* Not "m": llock only supports reg direct addr mode */ \
+ [i] "ir" (i) \
+ : "cc"); \
+} \
+
+#define ATOMIC_OP_RETURN(op, c_op, asm_op) \
+static inline int arch_atomic_##op##_return_relaxed(int i, atomic_t *v) \
+{ \
+ unsigned int val; \
+ \
+ __asm__ __volatile__( \
+ "1: llock %[val], [%[ctr]] \n" \
+ " " #asm_op " %[val], %[val], %[i] \n" \
+ " scond %[val], [%[ctr]] \n" \
+ " bnz 1b \n" \
+ : [val] "=&r" (val) \
+ : [ctr] "r" (&v->counter), \
+ [i] "ir" (i) \
+ : "cc"); \
+ \
+ return val; \
+}
+
+#define arch_atomic_add_return_relaxed arch_atomic_add_return_relaxed
+#define arch_atomic_sub_return_relaxed arch_atomic_sub_return_relaxed
+
+#define ATOMIC_FETCH_OP(op, c_op, asm_op) \
+static inline int arch_atomic_fetch_##op##_relaxed(int i, atomic_t *v) \
+{ \
+ unsigned int val, orig; \
+ \
+ __asm__ __volatile__( \
+ "1: llock %[orig], [%[ctr]] \n" \
+ " " #asm_op " %[val], %[orig], %[i] \n" \
+ " scond %[val], [%[ctr]] \n" \
+ " bnz 1b \n" \
+ : [val] "=&r" (val), \
+ [orig] "=&r" (orig) \
+ : [ctr] "r" (&v->counter), \
+ [i] "ir" (i) \
+ : "cc"); \
+ \
+ return orig; \
+}
+
+#define arch_atomic_fetch_add_relaxed arch_atomic_fetch_add_relaxed
+#define arch_atomic_fetch_sub_relaxed arch_atomic_fetch_sub_relaxed
+
+#define arch_atomic_fetch_and_relaxed arch_atomic_fetch_and_relaxed
+#define arch_atomic_fetch_andnot_relaxed arch_atomic_fetch_andnot_relaxed
+#define arch_atomic_fetch_or_relaxed arch_atomic_fetch_or_relaxed
+#define arch_atomic_fetch_xor_relaxed arch_atomic_fetch_xor_relaxed
+
+#define ATOMIC_OPS(op, c_op, asm_op) \
+ ATOMIC_OP(op, c_op, asm_op) \
+ ATOMIC_OP_RETURN(op, c_op, asm_op) \
+ ATOMIC_FETCH_OP(op, c_op, asm_op)
+
+ATOMIC_OPS(add, +=, add)
+ATOMIC_OPS(sub, -=, sub)
+
+#undef ATOMIC_OPS
+#define ATOMIC_OPS(op, c_op, asm_op) \
+ ATOMIC_OP(op, c_op, asm_op) \
+ ATOMIC_FETCH_OP(op, c_op, asm_op)
+
+ATOMIC_OPS(and, &=, and)
+ATOMIC_OPS(andnot, &= ~, bic)
+ATOMIC_OPS(or, |=, or)
+ATOMIC_OPS(xor, ^=, xor)
+
+#define arch_atomic_andnot arch_atomic_andnot
+
+#undef ATOMIC_OPS
+#undef ATOMIC_FETCH_OP
+#undef ATOMIC_OP_RETURN
+#undef ATOMIC_OP
+
+#endif
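For reference, ATOMIC_OPS(add, +=, add) above instantiates ATOMIC_OP_RETURN() into the function below (the c_op argument is unused in this LLSC flavour). Note the _relaxed suffix: no barriers are emitted here; stronger ordering variants are layered on top by the generic atomic code.

	static inline int arch_atomic_add_return_relaxed(int i, atomic_t *v)
	{
		unsigned int val;

		__asm__ __volatile__(
		"1:	llock   %[val], [%[ctr]]	\n"
		"	add     %[val], %[val], %[i]	\n"
		"	scond   %[val], [%[ctr]]	\n"
		"	bnz     1b			\n"
		: [val] "=&r" (val)
		: [ctr] "r" (&v->counter),
		  [i] "ir" (i)
		: "cc");

		return val;		/* value after the update */
	}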
diff --git a/arch/arc/include/asm/atomic-spinlock.h b/arch/arc/include/asm/atomic-spinlock.h
new file mode 100644
index 000000000000..2c830347bfb4
--- /dev/null
+++ b/arch/arc/include/asm/atomic-spinlock.h
@@ -0,0 +1,102 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef _ASM_ARC_ATOMIC_SPLOCK_H
+#define _ASM_ARC_ATOMIC_SPLOCK_H
+
+/*
+ * Non hardware assisted Atomic-R-M-W
+ * Locking would change to irq-disabling only (UP) and spinlocks (SMP)
+ */
+
+static inline void arch_atomic_set(atomic_t *v, int i)
+{
+ /*
+ * Independent of hardware support, all of the atomic_xxx() APIs need
+ * to follow the same locking rules to make sure that a "hardware"
+ * atomic insn (e.g. LD) doesn't clobber an "emulated" atomic insn
+ * sequence
+ *
+ * Thus atomic_set() despite being 1 insn (and seemingly atomic)
+ * requires the locking.
+ */
+ unsigned long flags;
+
+ atomic_ops_lock(flags);
+ WRITE_ONCE(v->counter, i);
+ atomic_ops_unlock(flags);
+}
+
+#define arch_atomic_set_release(v, i) arch_atomic_set((v), (i))
+
+#define ATOMIC_OP(op, c_op, asm_op) \
+static inline void arch_atomic_##op(int i, atomic_t *v) \
+{ \
+ unsigned long flags; \
+ \
+ atomic_ops_lock(flags); \
+ v->counter c_op i; \
+ atomic_ops_unlock(flags); \
+}
+
+#define ATOMIC_OP_RETURN(op, c_op, asm_op) \
+static inline int arch_atomic_##op##_return(int i, atomic_t *v) \
+{ \
+ unsigned long flags; \
+ unsigned int temp; \
+ \
+ /* \
+ * spin lock/unlock provides the needed smp_mb() before/after \
+ */ \
+ atomic_ops_lock(flags); \
+ temp = v->counter; \
+ temp c_op i; \
+ v->counter = temp; \
+ atomic_ops_unlock(flags); \
+ \
+ return temp; \
+}
+
+#define ATOMIC_FETCH_OP(op, c_op, asm_op) \
+static inline int arch_atomic_fetch_##op(int i, atomic_t *v) \
+{ \
+ unsigned long flags; \
+ unsigned int orig; \
+ \
+ /* \
+ * spin lock/unlock provides the needed smp_mb() before/after \
+ */ \
+ atomic_ops_lock(flags); \
+ orig = v->counter; \
+ v->counter c_op i; \
+ atomic_ops_unlock(flags); \
+ \
+ return orig; \
+}
+
+#define ATOMIC_OPS(op, c_op, asm_op) \
+ ATOMIC_OP(op, c_op, asm_op) \
+ ATOMIC_OP_RETURN(op, c_op, asm_op) \
+ ATOMIC_FETCH_OP(op, c_op, asm_op)
+
+ATOMIC_OPS(add, +=, add)
+ATOMIC_OPS(sub, -=, sub)
+
+#undef ATOMIC_OPS
+#define ATOMIC_OPS(op, c_op, asm_op) \
+ ATOMIC_OP(op, c_op, asm_op) \
+ ATOMIC_FETCH_OP(op, c_op, asm_op)
+
+ATOMIC_OPS(and, &=, and)
+ATOMIC_OPS(andnot, &= ~, bic)
+ATOMIC_OPS(or, |=, or)
+ATOMIC_OPS(xor, ^=, xor)
+
+#define arch_atomic_andnot arch_atomic_andnot
+#define arch_atomic_fetch_andnot arch_atomic_fetch_andnot
+
+#undef ATOMIC_OPS
+#undef ATOMIC_FETCH_OP
+#undef ATOMIC_OP_RETURN
+#undef ATOMIC_OP
+
+#endif
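atomic_ops_lock()/atomic_ops_unlock() used above are defined elsewhere (asm/smp.h in this tree). As a rough sketch of the assumed arrangement - a global spinlock plus IRQ disable on SMP, plain IRQ disable on UP:

	/* sketch only; the real definitions live in arch/arc/include/asm/smp.h */
	#ifdef CONFIG_SMP
	extern arch_spinlock_t smp_atomic_ops_lock;

	#define atomic_ops_lock(flags)	do {			\
		local_irq_save(flags);				\
		arch_spin_lock(&smp_atomic_ops_lock);		\
	} while (0)

	#define atomic_ops_unlock(flags) do {			\
		arch_spin_unlock(&smp_atomic_ops_lock);		\
		local_irq_restore(flags);			\
	} while (0)
	#else
	#define atomic_ops_lock(flags)		local_irq_save(flags)
	#define atomic_ops_unlock(flags)	local_irq_restore(flags)
	#endif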
diff --git a/arch/arc/include/asm/atomic.h b/arch/arc/include/asm/atomic.h
index 7a36d79b5b2f..52ee51e1ff7c 100644
--- a/arch/arc/include/asm/atomic.h
+++ b/arch/arc/include/asm/atomic.h
@@ -17,435 +17,43 @@
#define arch_atomic_read(v) READ_ONCE((v)->counter)
#ifdef CONFIG_ARC_HAS_LLSC
-
-#define arch_atomic_set(v, i) WRITE_ONCE(((v)->counter), (i))
-
-#define ATOMIC_OP(op, c_op, asm_op) \
-static inline void arch_atomic_##op(int i, atomic_t *v) \
-{ \
- unsigned int val; \
- \
- __asm__ __volatile__( \
- "1: llock %[val], [%[ctr]] \n" \
- " " #asm_op " %[val], %[val], %[i] \n" \
- " scond %[val], [%[ctr]] \n" \
- " bnz 1b \n" \
- : [val] "=&r" (val) /* Early clobber to prevent reg reuse */ \
- : [ctr] "r" (&v->counter), /* Not "m": llock only supports reg direct addr mode */ \
- [i] "ir" (i) \
- : "cc"); \
-} \
-
-#define ATOMIC_OP_RETURN(op, c_op, asm_op) \
-static inline int arch_atomic_##op##_return(int i, atomic_t *v) \
-{ \
- unsigned int val; \
- \
- /* \
- * Explicit full memory barrier needed before/after as \
- * LLOCK/SCOND themselves don't provide any such semantics \
- */ \
- smp_mb(); \
- \
- __asm__ __volatile__( \
- "1: llock %[val], [%[ctr]] \n" \
- " " #asm_op " %[val], %[val], %[i] \n" \
- " scond %[val], [%[ctr]] \n" \
- " bnz 1b \n" \
- : [val] "=&r" (val) \
- : [ctr] "r" (&v->counter), \
- [i] "ir" (i) \
- : "cc"); \
- \
- smp_mb(); \
- \
- return val; \
-}
-
-#define ATOMIC_FETCH_OP(op, c_op, asm_op) \
-static inline int arch_atomic_fetch_##op(int i, atomic_t *v) \
-{ \
- unsigned int val, orig; \
- \
- /* \
- * Explicit full memory barrier needed before/after as \
- * LLOCK/SCOND themselves don't provide any such semantics \
- */ \
- smp_mb(); \
- \
- __asm__ __volatile__( \
- "1: llock %[orig], [%[ctr]] \n" \
- " " #asm_op " %[val], %[orig], %[i] \n" \
- " scond %[val], [%[ctr]] \n" \
- " bnz 1b \n" \
- : [val] "=&r" (val), \
- [orig] "=&r" (orig) \
- : [ctr] "r" (&v->counter), \
- [i] "ir" (i) \
- : "cc"); \
- \
- smp_mb(); \
- \
- return orig; \
-}
-
-#else /* !CONFIG_ARC_HAS_LLSC */
-
-#ifndef CONFIG_SMP
-
- /* violating atomic_xxx API locking protocol in UP for optimization sake */
-#define arch_atomic_set(v, i) WRITE_ONCE(((v)->counter), (i))
-
+#include <asm/atomic-llsc.h>
#else
+#include <asm/atomic-spinlock.h>
+#endif
-static inline void arch_atomic_set(atomic_t *v, int i)
-{
- /*
- * Independent of hardware support, all of the atomic_xxx() APIs need
- * to follow the same locking rules to make sure that a "hardware"
- * atomic insn (e.g. LD) doesn't clobber an "emulated" atomic insn
- * sequence
- *
- * Thus atomic_set() despite being 1 insn (and seemingly atomic)
- * requires the locking.
- */
- unsigned long flags;
+#define arch_atomic_cmpxchg(v, o, n) \
+({ \
+ arch_cmpxchg(&((v)->counter), (o), (n)); \
+})
- atomic_ops_lock(flags);
- WRITE_ONCE(v->counter, i);
- atomic_ops_unlock(flags);
-}
+#ifdef arch_cmpxchg_relaxed
+#define arch_atomic_cmpxchg_relaxed(v, o, n) \
+({ \
+ arch_cmpxchg_relaxed(&((v)->counter), (o), (n)); \
+})
+#endif
-#define arch_atomic_set_release(v, i) arch_atomic_set((v), (i))
+#define arch_atomic_xchg(v, n) \
+({ \
+ arch_xchg(&((v)->counter), (n)); \
+})
+#ifdef arch_xchg_relaxed
+#define arch_atomic_xchg_relaxed(v, n) \
+({ \
+ arch_xchg_relaxed(&((v)->counter), (n)); \
+})
#endif
/*
- * Non hardware assisted Atomic-R-M-W
- * Locking would change to irq-disabling only (UP) and spinlocks (SMP)
+ * 64-bit atomics
*/
-
-#define ATOMIC_OP(op, c_op, asm_op) \
-static inline void arch_atomic_##op(int i, atomic_t *v) \
-{ \
- unsigned long flags; \
- \
- atomic_ops_lock(flags); \
- v->counter c_op i; \
- atomic_ops_unlock(flags); \
-}
-
-#define ATOMIC_OP_RETURN(op, c_op, asm_op) \
-static inline int arch_atomic_##op##_return(int i, atomic_t *v) \
-{ \
- unsigned long flags; \
- unsigned long temp; \
- \
- /* \
- * spin lock/unlock provides the needed smp_mb() before/after \
- */ \
- atomic_ops_lock(flags); \
- temp = v->counter; \
- temp c_op i; \
- v->counter = temp; \
- atomic_ops_unlock(flags); \
- \
- return temp; \
-}
-
-#define ATOMIC_FETCH_OP(op, c_op, asm_op) \
-static inline int arch_atomic_fetch_##op(int i, atomic_t *v) \
-{ \
- unsigned long flags; \
- unsigned long orig; \
- \
- /* \
- * spin lock/unlock provides the needed smp_mb() before/after \
- */ \
- atomic_ops_lock(flags); \
- orig = v->counter; \
- v->counter c_op i; \
- atomic_ops_unlock(flags); \
- \
- return orig; \
-}
-
-#endif /* !CONFIG_ARC_HAS_LLSC */
-
-#define ATOMIC_OPS(op, c_op, asm_op) \
- ATOMIC_OP(op, c_op, asm_op) \
- ATOMIC_OP_RETURN(op, c_op, asm_op) \
- ATOMIC_FETCH_OP(op, c_op, asm_op)
-
-ATOMIC_OPS(add, +=, add)
-ATOMIC_OPS(sub, -=, sub)
-
-#undef ATOMIC_OPS
-#define ATOMIC_OPS(op, c_op, asm_op) \
- ATOMIC_OP(op, c_op, asm_op) \
- ATOMIC_FETCH_OP(op, c_op, asm_op)
-
-ATOMIC_OPS(and, &=, and)
-ATOMIC_OPS(andnot, &= ~, bic)
-ATOMIC_OPS(or, |=, or)
-ATOMIC_OPS(xor, ^=, xor)
-
-#define arch_atomic_andnot arch_atomic_andnot
-#define arch_atomic_fetch_andnot arch_atomic_fetch_andnot
-
-#undef ATOMIC_OPS
-#undef ATOMIC_FETCH_OP
-#undef ATOMIC_OP_RETURN
-#undef ATOMIC_OP
-
#ifdef CONFIG_GENERIC_ATOMIC64
-
#include <asm-generic/atomic64.h>
-
-#else /* Kconfig ensures this is only enabled with needed h/w assist */
-
-/*
- * ARCv2 supports 64-bit exclusive load (LLOCKD) / store (SCONDD)
- * - The address HAS to be 64-bit aligned
- * - There are 2 semantics involved here:
- * = exclusive implies no interim update between load/store to same addr
- * = both words are observed/updated together: this is guaranteed even
- * for regular 64-bit load (LDD) / store (STD). Thus atomic64_set()
- * is NOT required to use LLOCKD+SCONDD, STD suffices
- */
-
-typedef struct {
- s64 __aligned(8) counter;
-} atomic64_t;
-
-#define ATOMIC64_INIT(a) { (a) }
-
-static inline s64 arch_atomic64_read(const atomic64_t *v)
-{
- s64 val;
-
- __asm__ __volatile__(
- " ldd %0, [%1] \n"
- : "=r"(val)
- : "r"(&v->counter));
-
- return val;
-}
-
-static inline void arch_atomic64_set(atomic64_t *v, s64 a)
-{
- /*
- * This could have been a simple assignment in "C" but would need
- * explicit volatile. Otherwise gcc optimizers could elide the store
- * which borked atomic64 self-test
- * In the inline asm version, memory clobber needed for exact same
- * reason, to tell gcc about the store.
- *
- * This however is not needed for sibling atomic64_add() etc since both
- * load/store are explicitly done in inline asm. As long as API is used
- * for each access, gcc has no way to optimize away any load/store
- */
- __asm__ __volatile__(
- " std %0, [%1] \n"
- :
- : "r"(a), "r"(&v->counter)
- : "memory");
-}
-
-#define ATOMIC64_OP(op, op1, op2) \
-static inline void arch_atomic64_##op(s64 a, atomic64_t *v) \
-{ \
- s64 val; \
- \
- __asm__ __volatile__( \
- "1: \n" \
- " llockd %0, [%1] \n" \
- " " #op1 " %L0, %L0, %L2 \n" \
- " " #op2 " %H0, %H0, %H2 \n" \
- " scondd %0, [%1] \n" \
- " bnz 1b \n" \
- : "=&r"(val) \
- : "r"(&v->counter), "ir"(a) \
- : "cc"); \
-} \
-
-#define ATOMIC64_OP_RETURN(op, op1, op2) \
-static inline s64 arch_atomic64_##op##_return(s64 a, atomic64_t *v) \
-{ \
- s64 val; \
- \
- smp_mb(); \
- \
- __asm__ __volatile__( \
- "1: \n" \
- " llockd %0, [%1] \n" \
- " " #op1 " %L0, %L0, %L2 \n" \
- " " #op2 " %H0, %H0, %H2 \n" \
- " scondd %0, [%1] \n" \
- " bnz 1b \n" \
- : [val] "=&r"(val) \
- : "r"(&v->counter), "ir"(a) \
- : "cc"); /* memory clobber comes from smp_mb() */ \
- \
- smp_mb(); \
- \
- return val; \
-}
-
-#define ATOMIC64_FETCH_OP(op, op1, op2) \
-static inline s64 arch_atomic64_fetch_##op(s64 a, atomic64_t *v) \
-{ \
- s64 val, orig; \
- \
- smp_mb(); \
- \
- __asm__ __volatile__( \
- "1: \n" \
- " llockd %0, [%2] \n" \
- " " #op1 " %L1, %L0, %L3 \n" \
- " " #op2 " %H1, %H0, %H3 \n" \
- " scondd %1, [%2] \n" \
- " bnz 1b \n" \
- : "=&r"(orig), "=&r"(val) \
- : "r"(&v->counter), "ir"(a) \
- : "cc"); /* memory clobber comes from smp_mb() */ \
- \
- smp_mb(); \
- \
- return orig; \
-}
-
-#define ATOMIC64_OPS(op, op1, op2) \
- ATOMIC64_OP(op, op1, op2) \
- ATOMIC64_OP_RETURN(op, op1, op2) \
- ATOMIC64_FETCH_OP(op, op1, op2)
-
-ATOMIC64_OPS(add, add.f, adc)
-ATOMIC64_OPS(sub, sub.f, sbc)
-ATOMIC64_OPS(and, and, and)
-ATOMIC64_OPS(andnot, bic, bic)
-ATOMIC64_OPS(or, or, or)
-ATOMIC64_OPS(xor, xor, xor)
-
-#define arch_atomic64_andnot arch_atomic64_andnot
-#define arch_atomic64_fetch_andnot arch_atomic64_fetch_andnot
-
-#undef ATOMIC64_OPS
-#undef ATOMIC64_FETCH_OP
-#undef ATOMIC64_OP_RETURN
-#undef ATOMIC64_OP
-
-static inline s64
-arch_atomic64_cmpxchg(atomic64_t *ptr, s64 expected, s64 new)
-{
- s64 prev;
-
- smp_mb();
-
- __asm__ __volatile__(
- "1: llockd %0, [%1] \n"
- " brne %L0, %L2, 2f \n"
- " brne %H0, %H2, 2f \n"
- " scondd %3, [%1] \n"
- " bnz 1b \n"
- "2: \n"
- : "=&r"(prev)
- : "r"(ptr), "ir"(expected), "r"(new)
- : "cc"); /* memory clobber comes from smp_mb() */
-
- smp_mb();
-
- return prev;
-}
-
-static inline s64 arch_atomic64_xchg(atomic64_t *ptr, s64 new)
-{
- s64 prev;
-
- smp_mb();
-
- __asm__ __volatile__(
- "1: llockd %0, [%1] \n"
- " scondd %2, [%1] \n"
- " bnz 1b \n"
- "2: \n"
- : "=&r"(prev)
- : "r"(ptr), "r"(new)
- : "cc"); /* memory clobber comes from smp_mb() */
-
- smp_mb();
-
- return prev;
-}
-
-/**
- * arch_atomic64_dec_if_positive - decrement by 1 if old value positive
- * @v: pointer of type atomic64_t
- *
- * The function returns the old value of *v minus 1, even if
- * the atomic variable, v, was not decremented.
- */
-
-static inline s64 arch_atomic64_dec_if_positive(atomic64_t *v)
-{
- s64 val;
-
- smp_mb();
-
- __asm__ __volatile__(
- "1: llockd %0, [%1] \n"
- " sub.f %L0, %L0, 1 # w0 - 1, set C on borrow\n"
- " sub.c %H0, %H0, 1 # if C set, w1 - 1\n"
- " brlt %H0, 0, 2f \n"
- " scondd %0, [%1] \n"
- " bnz 1b \n"
- "2: \n"
- : "=&r"(val)
- : "r"(&v->counter)
- : "cc"); /* memory clobber comes from smp_mb() */
-
- smp_mb();
-
- return val;
-}
-#define arch_atomic64_dec_if_positive arch_atomic64_dec_if_positive
-
-/**
- * arch_atomic64_fetch_add_unless - add unless the number is a given value
- * @v: pointer of type atomic64_t
- * @a: the amount to add to v...
- * @u: ...unless v is equal to u.
- *
- * Atomically adds @a to @v, if it was not @u.
- * Returns the old value of @v
- */
-static inline s64 arch_atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u)
-{
- s64 old, temp;
-
- smp_mb();
-
- __asm__ __volatile__(
- "1: llockd %0, [%2] \n"
- " brne %L0, %L4, 2f # continue to add since v != u \n"
- " breq.d %H0, %H4, 3f # return since v == u \n"
- "2: \n"
- " add.f %L1, %L0, %L3 \n"
- " adc %H1, %H0, %H3 \n"
- " scondd %1, [%2] \n"
- " bnz 1b \n"
- "3: \n"
- : "=&r"(old), "=&r" (temp)
- : "r"(&v->counter), "r"(a), "r"(u)
- : "cc"); /* memory clobber comes from smp_mb() */
-
- smp_mb();
-
- return old;
-}
-#define arch_atomic64_fetch_add_unless arch_atomic64_fetch_add_unless
-
-#endif /* !CONFIG_GENERIC_ATOMIC64 */
+#else
+#include <asm/atomic64-arcv2.h>
+#endif
#endif /* !__ASSEMBLY__ */
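Since the LLSC flavour now supplies only the _relaxed forms, the fully ordered variants are synthesized by the generic atomic layer; conceptually it brackets the relaxed op with the same smp_mb() pair the removed code carried inline. A sketch of that pattern (not the actual generic header):

	static inline int atomic_add_return_ordered_sketch(int i, atomic_t *v)
	{
		int ret;

		smp_mb();	/* LLOCK/SCOND provide no ordering of their own ... */
		ret = arch_atomic_add_return_relaxed(i, v);
		smp_mb();	/* ... so a full barrier is needed on either side */

		return ret;
	}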
diff --git a/arch/arc/include/asm/atomic64-arcv2.h b/arch/arc/include/asm/atomic64-arcv2.h
new file mode 100644
index 000000000000..c5a8010fdc97
--- /dev/null
+++ b/arch/arc/include/asm/atomic64-arcv2.h
@@ -0,0 +1,250 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+/*
+ * ARCv2 supports 64-bit exclusive load (LLOCKD) / store (SCONDD)
+ * - The address HAS to be 64-bit aligned
+ */
+
+#ifndef _ASM_ARC_ATOMIC64_ARCV2_H
+#define _ASM_ARC_ATOMIC64_ARCV2_H
+
+typedef struct {
+ s64 __aligned(8) counter;
+} atomic64_t;
+
+#define ATOMIC64_INIT(a) { (a) }
+
+static inline s64 arch_atomic64_read(const atomic64_t *v)
+{
+ s64 val;
+
+ __asm__ __volatile__(
+ " ldd %0, [%1] \n"
+ : "=r"(val)
+ : "r"(&v->counter));
+
+ return val;
+}
+
+static inline void arch_atomic64_set(atomic64_t *v, s64 a)
+{
+ /*
+ * This could have been a simple assignment in "C" but would need
+ * explicit volatile. Otherwise gcc optimizers could elide the store
+ * which borked atomic64 self-test
+ * In the inline asm version, memory clobber needed for exact same
+ * reason, to tell gcc about the store.
+ *
+ * This however is not needed for sibling atomic64_add() etc since both
+ * load/store are explicitly done in inline asm. As long as API is used
+ * for each access, gcc has no way to optimize away any load/store
+ */
+ __asm__ __volatile__(
+ " std %0, [%1] \n"
+ :
+ : "r"(a), "r"(&v->counter)
+ : "memory");
+}
+
+#define ATOMIC64_OP(op, op1, op2) \
+static inline void arch_atomic64_##op(s64 a, atomic64_t *v) \
+{ \
+ s64 val; \
+ \
+ __asm__ __volatile__( \
+ "1: \n" \
+ " llockd %0, [%1] \n" \
+ " " #op1 " %L0, %L0, %L2 \n" \
+ " " #op2 " %H0, %H0, %H2 \n" \
+ " scondd %0, [%1] \n" \
+ " bnz 1b \n" \
+ : "=&r"(val) \
+ : "r"(&v->counter), "ir"(a) \
+ : "cc"); \
+} \
+
+#define ATOMIC64_OP_RETURN(op, op1, op2) \
+static inline s64 arch_atomic64_##op##_return_relaxed(s64 a, atomic64_t *v) \
+{ \
+ s64 val; \
+ \
+ __asm__ __volatile__( \
+ "1: \n" \
+ " llockd %0, [%1] \n" \
+ " " #op1 " %L0, %L0, %L2 \n" \
+ " " #op2 " %H0, %H0, %H2 \n" \
+ " scondd %0, [%1] \n" \
+ " bnz 1b \n" \
+ : [val] "=&r"(val) \
+ : "r"(&v->counter), "ir"(a) \
+ : "cc"); /* memory clobber comes from smp_mb() */ \
+ \
+ return val; \
+}
+
+#define arch_atomic64_add_return_relaxed arch_atomic64_add_return_relaxed
+#define arch_atomic64_sub_return_relaxed arch_atomic64_sub_return_relaxed
+
+#define ATOMIC64_FETCH_OP(op, op1, op2) \
+static inline s64 arch_atomic64_fetch_##op##_relaxed(s64 a, atomic64_t *v) \
+{ \
+ s64 val, orig; \
+ \
+ __asm__ __volatile__( \
+ "1: \n" \
+ " llockd %0, [%2] \n" \
+ " " #op1 " %L1, %L0, %L3 \n" \
+ " " #op2 " %H1, %H0, %H3 \n" \
+ " scondd %1, [%2] \n" \
+ " bnz 1b \n" \
+ : "=&r"(orig), "=&r"(val) \
+ : "r"(&v->counter), "ir"(a) \
+ : "cc"); /* memory clobber comes from smp_mb() */ \
+ \
+ return orig; \
+}
+
+#define arch_atomic64_fetch_add_relaxed arch_atomic64_fetch_add_relaxed
+#define arch_atomic64_fetch_sub_relaxed arch_atomic64_fetch_sub_relaxed
+
+#define arch_atomic64_fetch_and_relaxed arch_atomic64_fetch_and_relaxed
+#define arch_atomic64_fetch_andnot_relaxed arch_atomic64_fetch_andnot_relaxed
+#define arch_atomic64_fetch_or_relaxed arch_atomic64_fetch_or_relaxed
+#define arch_atomic64_fetch_xor_relaxed arch_atomic64_fetch_xor_relaxed
+
+#define ATOMIC64_OPS(op, op1, op2) \
+ ATOMIC64_OP(op, op1, op2) \
+ ATOMIC64_OP_RETURN(op, op1, op2) \
+ ATOMIC64_FETCH_OP(op, op1, op2)
+
+ATOMIC64_OPS(add, add.f, adc)
+ATOMIC64_OPS(sub, sub.f, sbc)
+
+#undef ATOMIC64_OPS
+#define ATOMIC64_OPS(op, op1, op2) \
+ ATOMIC64_OP(op, op1, op2) \
+ ATOMIC64_FETCH_OP(op, op1, op2)
+
+ATOMIC64_OPS(and, and, and)
+ATOMIC64_OPS(andnot, bic, bic)
+ATOMIC64_OPS(or, or, or)
+ATOMIC64_OPS(xor, xor, xor)
+
+#define arch_atomic64_andnot arch_atomic64_andnot
+
+#undef ATOMIC64_OPS
+#undef ATOMIC64_FETCH_OP
+#undef ATOMIC64_OP_RETURN
+#undef ATOMIC64_OP
+
+static inline s64
+arch_atomic64_cmpxchg(atomic64_t *ptr, s64 expected, s64 new)
+{
+ s64 prev;
+
+ smp_mb();
+
+ __asm__ __volatile__(
+ "1: llockd %0, [%1] \n"
+ " brne %L0, %L2, 2f \n"
+ " brne %H0, %H2, 2f \n"
+ " scondd %3, [%1] \n"
+ " bnz 1b \n"
+ "2: \n"
+ : "=&r"(prev)
+ : "r"(ptr), "ir"(expected), "r"(new)
+ : "cc"); /* memory clobber comes from smp_mb() */
+
+ smp_mb();
+
+ return prev;
+}
+
+static inline s64 arch_atomic64_xchg(atomic64_t *ptr, s64 new)
+{
+ s64 prev;
+
+ smp_mb();
+
+ __asm__ __volatile__(
+ "1: llockd %0, [%1] \n"
+ " scondd %2, [%1] \n"
+ " bnz 1b \n"
+ "2: \n"
+ : "=&r"(prev)
+ : "r"(ptr), "r"(new)
+ : "cc"); /* memory clobber comes from smp_mb() */
+
+ smp_mb();
+
+ return prev;
+}
+
+/**
+ * arch_atomic64_dec_if_positive - decrement by 1 if old value positive
+ * @v: pointer of type atomic64_t
+ *
+ * The function returns the old value of *v minus 1, even if
+ * the atomic variable, v, was not decremented.
+ */
+
+static inline s64 arch_atomic64_dec_if_positive(atomic64_t *v)
+{
+ s64 val;
+
+ smp_mb();
+
+ __asm__ __volatile__(
+ "1: llockd %0, [%1] \n"
+ " sub.f %L0, %L0, 1 # w0 - 1, set C on borrow\n"
+ " sub.c %H0, %H0, 1 # if C set, w1 - 1\n"
+ " brlt %H0, 0, 2f \n"
+ " scondd %0, [%1] \n"
+ " bnz 1b \n"
+ "2: \n"
+ : "=&r"(val)
+ : "r"(&v->counter)
+ : "cc"); /* memory clobber comes from smp_mb() */
+
+ smp_mb();
+
+ return val;
+}
+#define arch_atomic64_dec_if_positive arch_atomic64_dec_if_positive
+
+/**
+ * arch_atomic64_fetch_add_unless - add unless the number is a given value
+ * @v: pointer of type atomic64_t
+ * @a: the amount to add to v...
+ * @u: ...unless v is equal to u.
+ *
+ * Atomically adds @a to @v, if it was not @u.
+ * Returns the old value of @v
+ */
+static inline s64 arch_atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u)
+{
+ s64 old, temp;
+
+ smp_mb();
+
+ __asm__ __volatile__(
+ "1: llockd %0, [%2] \n"
+ " brne %L0, %L4, 2f # continue to add since v != u \n"
+ " breq.d %H0, %H4, 3f # return since v == u \n"
+ "2: \n"
+ " add.f %L1, %L0, %L3 \n"
+ " adc %H1, %H0, %H3 \n"
+ " scondd %1, [%2] \n"
+ " bnz 1b \n"
+ "3: \n"
+ : "=&r"(old), "=&r" (temp)
+ : "r"(&v->counter), "r"(a), "r"(u)
+ : "cc"); /* memory clobber comes from smp_mb() */
+
+ smp_mb();
+
+ return old;
+}
+#define arch_atomic64_fetch_add_unless arch_atomic64_fetch_add_unless
+
+#endif
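As a plain-C reference for the last helper above - this is what the LLOCKD/SCONDD loop implements, except atomically:

	/* non-atomic reference semantics, illustration only */
	static s64 atomic64_fetch_add_unless_ref(atomic64_t *v, s64 a, s64 u)
	{
		s64 old = v->counter;

		if (old != u)
			v->counter = old + a;	/* add only when the value differs from @u */

		return old;			/* always return the prior value */
	}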
diff --git a/arch/arc/include/asm/bitops.h b/arch/arc/include/asm/bitops.h
index fb98440c0bd4..a7daaf64ae34 100644
--- a/arch/arc/include/asm/bitops.h
+++ b/arch/arc/include/asm/bitops.h
@@ -14,188 +14,6 @@
#include <linux/types.h>
#include <linux/compiler.h>
-#include <asm/barrier.h>
-#ifndef CONFIG_ARC_HAS_LLSC
-#include <asm/smp.h>
-#endif
-
-#ifdef CONFIG_ARC_HAS_LLSC
-
-/*
- * Hardware assisted Atomic-R-M-W
- */
-
-#define BIT_OP(op, c_op, asm_op) \
-static inline void op##_bit(unsigned long nr, volatile unsigned long *m)\
-{ \
- unsigned int temp; \
- \
- m += nr >> 5; \
- \
- nr &= 0x1f; \
- \
- __asm__ __volatile__( \
- "1: llock %0, [%1] \n" \
- " " #asm_op " %0, %0, %2 \n" \
- " scond %0, [%1] \n" \
- " bnz 1b \n" \
- : "=&r"(temp) /* Early clobber, to prevent reg reuse */ \
- : "r"(m), /* Not "m": llock only supports reg direct addr mode */ \
- "ir"(nr) \
- : "cc"); \
-}
-
-/*
- * Semantically:
- * Test the bit
- * if clear
- * set it and return 0 (old value)
- * else
- * return 1 (old value).
- *
- * Since ARC lacks a equivalent h/w primitive, the bit is set unconditionally
- * and the old value of bit is returned
- */
-#define TEST_N_BIT_OP(op, c_op, asm_op) \
-static inline int test_and_##op##_bit(unsigned long nr, volatile unsigned long *m)\
-{ \
- unsigned long old, temp; \
- \
- m += nr >> 5; \
- \
- nr &= 0x1f; \
- \
- /* \
- * Explicit full memory barrier needed before/after as \
- * LLOCK/SCOND themselves don't provide any such smenatic \
- */ \
- smp_mb(); \
- \
- __asm__ __volatile__( \
- "1: llock %0, [%2] \n" \
- " " #asm_op " %1, %0, %3 \n" \
- " scond %1, [%2] \n" \
- " bnz 1b \n" \
- : "=&r"(old), "=&r"(temp) \
- : "r"(m), "ir"(nr) \
- : "cc"); \
- \
- smp_mb(); \
- \
- return (old & (1 << nr)) != 0; \
-}
-
-#else /* !CONFIG_ARC_HAS_LLSC */
-
-/*
- * Non hardware assisted Atomic-R-M-W
- * Locking would change to irq-disabling only (UP) and spinlocks (SMP)
- *
- * There's "significant" micro-optimization in writing our own variants of
- * bitops (over generic variants)
- *
- * (1) The generic APIs have "signed" @nr while we have it "unsigned"
- * This avoids extra code to be generated for pointer arithmatic, since
- * is "not sure" that index is NOT -ve
- * (2) Utilize the fact that ARCompact bit fidding insn (BSET/BCLR/ASL) etc
- * only consider bottom 5 bits of @nr, so NO need to mask them off.
- * (GCC Quirk: however for constant @nr we still need to do the masking
- * at compile time)
- */
-
-#define BIT_OP(op, c_op, asm_op) \
-static inline void op##_bit(unsigned long nr, volatile unsigned long *m)\
-{ \
- unsigned long temp, flags; \
- m += nr >> 5; \
- \
- /* \
- * spin lock/unlock provide the needed smp_mb() before/after \
- */ \
- bitops_lock(flags); \
- \
- temp = *m; \
- *m = temp c_op (1UL << (nr & 0x1f)); \
- \
- bitops_unlock(flags); \
-}
-
-#define TEST_N_BIT_OP(op, c_op, asm_op) \
-static inline int test_and_##op##_bit(unsigned long nr, volatile unsigned long *m)\
-{ \
- unsigned long old, flags; \
- m += nr >> 5; \
- \
- bitops_lock(flags); \
- \
- old = *m; \
- *m = old c_op (1UL << (nr & 0x1f)); \
- \
- bitops_unlock(flags); \
- \
- return (old & (1UL << (nr & 0x1f))) != 0; \
-}
-
-#endif
-
-/***************************************
- * Non atomic variants
- **************************************/
-
-#define __BIT_OP(op, c_op, asm_op) \
-static inline void __##op##_bit(unsigned long nr, volatile unsigned long *m) \
-{ \
- unsigned long temp; \
- m += nr >> 5; \
- \
- temp = *m; \
- *m = temp c_op (1UL << (nr & 0x1f)); \
-}
-
-#define __TEST_N_BIT_OP(op, c_op, asm_op) \
-static inline int __test_and_##op##_bit(unsigned long nr, volatile unsigned long *m)\
-{ \
- unsigned long old; \
- m += nr >> 5; \
- \
- old = *m; \
- *m = old c_op (1UL << (nr & 0x1f)); \
- \
- return (old & (1UL << (nr & 0x1f))) != 0; \
-}
-
-#define BIT_OPS(op, c_op, asm_op) \
- \
- /* set_bit(), clear_bit(), change_bit() */ \
- BIT_OP(op, c_op, asm_op) \
- \
- /* test_and_set_bit(), test_and_clear_bit(), test_and_change_bit() */\
- TEST_N_BIT_OP(op, c_op, asm_op) \
- \
- /* __set_bit(), __clear_bit(), __change_bit() */ \
- __BIT_OP(op, c_op, asm_op) \
- \
- /* __test_and_set_bit(), __test_and_clear_bit(), __test_and_change_bit() */\
- __TEST_N_BIT_OP(op, c_op, asm_op)
-
-BIT_OPS(set, |, bset)
-BIT_OPS(clear, & ~, bclr)
-BIT_OPS(change, ^, bxor)
-
-/*
- * This routine doesn't need to be atomic.
- */
-static inline int
-test_bit(unsigned int nr, const volatile unsigned long *addr)
-{
- unsigned long mask;
-
- addr += nr >> 5;
-
- mask = 1UL << (nr & 0x1f);
-
- return ((mask & *addr) != 0);
-}
#ifdef CONFIG_ISA_ARCOMPACT
@@ -296,7 +114,7 @@ static inline __attribute__ ((const)) unsigned long __ffs(unsigned long word)
* @result: [1-32]
* fls(1) = 1, fls(0x80000000) = 32, fls(0) = 0
*/
-static inline __attribute__ ((const)) int fls(unsigned long x)
+static inline __attribute__ ((const)) int fls(unsigned int x)
{
int n;
@@ -323,7 +141,7 @@ static inline __attribute__ ((const)) int __fls(unsigned long x)
* ffs = Find First Set in word (LSB to MSB)
* @result: [1-32], 0 if all 0's
*/
-static inline __attribute__ ((const)) int ffs(unsigned long x)
+static inline __attribute__ ((const)) int ffs(unsigned int x)
{
int n;
@@ -368,6 +186,8 @@ static inline __attribute__ ((const)) unsigned long __ffs(unsigned long x)
#include <asm-generic/bitops/fls64.h>
#include <asm-generic/bitops/sched.h>
#include <asm-generic/bitops/lock.h>
+#include <asm-generic/bitops/atomic.h>
+#include <asm-generic/bitops/non-atomic.h>
#include <asm-generic/bitops/find.h>
#include <asm-generic/bitops/le.h>
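The hand-rolled bitops above can be dropped because the generic headers now build them from the arch atomics; roughly, the scheme is the one sketched below (assumed shape of asm-generic/bitops/atomic.h, not a verbatim copy):

	/* sketch: atomic set_bit() expressed via the arch atomic ops */
	static __always_inline void set_bit_sketch(unsigned int nr, volatile unsigned long *p)
	{
		p += BIT_WORD(nr);					/* word holding the bit */
		arch_atomic_long_or(BIT_MASK(nr), (atomic_long_t *)p);	/* one atomic RMW */
	}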
diff --git a/arch/arc/include/asm/cache.h b/arch/arc/include/asm/cache.h
index d8ece4292388..f0f1fc5d62b6 100644
--- a/arch/arc/include/asm/cache.h
+++ b/arch/arc/include/asm/cache.h
@@ -62,10 +62,6 @@
#define ARCH_SLAB_MINALIGN 8
#endif
-extern void arc_cache_init(void);
-extern char *arc_cache_mumbojumbo(int cpu_id, char *buf, int len);
-extern void read_decode_cache_bcr(void);
-
extern int ioc_enable;
extern unsigned long perip_base, perip_end;
diff --git a/arch/arc/include/asm/cmpxchg.h b/arch/arc/include/asm/cmpxchg.h
index d42917e803e1..c5b544a5fe81 100644
--- a/arch/arc/include/asm/cmpxchg.h
+++ b/arch/arc/include/asm/cmpxchg.h
@@ -6,6 +6,7 @@
#ifndef __ASM_ARC_CMPXCHG_H
#define __ASM_ARC_CMPXCHG_H
+#include <linux/build_bug.h>
#include <linux/types.h>
#include <asm/barrier.h>
@@ -13,146 +14,130 @@
#ifdef CONFIG_ARC_HAS_LLSC
-static inline unsigned long
-__cmpxchg(volatile void *ptr, unsigned long expected, unsigned long new)
-{
- unsigned long prev;
-
- /*
- * Explicit full memory barrier needed before/after as
- * LLOCK/SCOND themselves don't provide any such semantics
- */
- smp_mb();
-
- __asm__ __volatile__(
- "1: llock %0, [%1] \n"
- " brne %0, %2, 2f \n"
- " scond %3, [%1] \n"
- " bnz 1b \n"
- "2: \n"
- : "=&r"(prev) /* Early clobber, to prevent reg reuse */
- : "r"(ptr), /* Not "m": llock only supports reg direct addr mode */
- "ir"(expected),
- "r"(new) /* can't be "ir". scond can't take LIMM for "b" */
- : "cc", "memory"); /* so that gcc knows memory is being written here */
-
- smp_mb();
-
- return prev;
-}
-
-#else /* !CONFIG_ARC_HAS_LLSC */
-
-static inline unsigned long
-__cmpxchg(volatile void *ptr, unsigned long expected, unsigned long new)
-{
- unsigned long flags;
- int prev;
- volatile unsigned long *p = ptr;
-
- /*
- * spin lock/unlock provide the needed smp_mb() before/after
- */
- atomic_ops_lock(flags);
- prev = *p;
- if (prev == expected)
- *p = new;
- atomic_ops_unlock(flags);
- return prev;
-}
-
-#endif
+/*
+ * if (*ptr == @old)
+ * *ptr = @new
+ */
+#define __cmpxchg(ptr, old, new) \
+({ \
+ __typeof__(*(ptr)) _prev; \
+ \
+ __asm__ __volatile__( \
+ "1: llock %0, [%1] \n" \
+ " brne %0, %2, 2f \n" \
+ " scond %3, [%1] \n" \
+ " bnz 1b \n" \
+ "2: \n" \
+ : "=&r"(_prev) /* Early clobber prevent reg reuse */ \
+ : "r"(ptr), /* Not "m": llock only supports reg */ \
+ "ir"(old), \
+ "r"(new) /* Not "ir": scond can't take LIMM */ \
+ : "cc", \
+ "memory"); /* gcc knows memory is clobbered */ \
+ \
+ _prev; \
+})
-#define arch_cmpxchg(ptr, o, n) ({ \
- (typeof(*(ptr)))__cmpxchg((ptr), \
- (unsigned long)(o), \
- (unsigned long)(n)); \
+#define arch_cmpxchg_relaxed(ptr, old, new) \
+({ \
+ __typeof__(ptr) _p_ = (ptr); \
+ __typeof__(*(ptr)) _o_ = (old); \
+ __typeof__(*(ptr)) _n_ = (new); \
+ __typeof__(*(ptr)) _prev_; \
+ \
+ switch(sizeof((_p_))) { \
+ case 4: \
+ _prev_ = __cmpxchg(_p_, _o_, _n_); \
+ break; \
+ default: \
+ BUILD_BUG(); \
+ } \
+ _prev_; \
})
-/*
- * atomic_cmpxchg is same as cmpxchg
- * LLSC: only different in data-type, semantics are exactly same
- * !LLSC: cmpxchg() has to use an external lock atomic_ops_lock to guarantee
- * semantics, and this lock also happens to be used by atomic_*()
- */
-#define arch_atomic_cmpxchg(v, o, n) ((int)arch_cmpxchg(&((v)->counter), (o), (n)))
+#else
+#define arch_cmpxchg(ptr, old, new) \
+({ \
+ volatile __typeof__(ptr) _p_ = (ptr); \
+ __typeof__(*(ptr)) _o_ = (old); \
+ __typeof__(*(ptr)) _n_ = (new); \
+ __typeof__(*(ptr)) _prev_; \
+ unsigned long __flags; \
+ \
+ BUILD_BUG_ON(sizeof(_p_) != 4); \
+ \
+ /* \
+ * spin lock/unlock provide the needed smp_mb() before/after \
+ */ \
+ atomic_ops_lock(__flags); \
+ _prev_ = *_p_; \
+ if (_prev_ == _o_) \
+ *_p_ = _n_; \
+ atomic_ops_unlock(__flags); \
+ _prev_; \
+})
+
+#endif
/*
- * xchg (reg with memory) based on "Native atomic" EX insn
+ * xchg
*/
-static inline unsigned long __xchg(unsigned long val, volatile void *ptr,
- int size)
-{
- extern unsigned long __xchg_bad_pointer(void);
-
- switch (size) {
- case 4:
- smp_mb();
-
- __asm__ __volatile__(
- " ex %0, [%1] \n"
- : "+r"(val)
- : "r"(ptr)
- : "memory");
+#ifdef CONFIG_ARC_HAS_LLSC
- smp_mb();
+#define __xchg(ptr, val) \
+({ \
+ __asm__ __volatile__( \
+ " ex %0, [%1] \n" /* set new value */ \
+ : "+r"(val) \
+ : "r"(ptr) \
+ : "memory"); \
+ _val_; /* get old value */ \
+})
- return val;
- }
- return __xchg_bad_pointer();
-}
+#define arch_xchg_relaxed(ptr, val) \
+({ \
+ __typeof__(ptr) _p_ = (ptr); \
+ __typeof__(*(ptr)) _val_ = (val); \
+ \
+ switch(sizeof(*(_p_))) { \
+ case 4: \
+ _val_ = __xchg(_p_, _val_); \
+ break; \
+ default: \
+ BUILD_BUG(); \
+ } \
+ _val_; \
+})
-#define _xchg(ptr, with) ((typeof(*(ptr)))__xchg((unsigned long)(with), (ptr), \
- sizeof(*(ptr))))
+#else /* !CONFIG_ARC_HAS_LLSC */
/*
- * xchg() maps directly to ARC EX instruction which guarantees atomicity.
- * However in !LLSC config, it also needs to be use @atomic_ops_lock spinlock
- * due to a subtle reason:
- * - For !LLSC, cmpxchg() needs to use that lock (see above) and there is lot
- * of kernel code which calls xchg()/cmpxchg() on same data (see llist.h)
- * Hence xchg() needs to follow same locking rules.
- *
- * Technically the lock is also needed for UP (boils down to irq save/restore)
- * but we can cheat a bit since cmpxchg() atomic_ops_lock() would cause irqs to
- * be disabled thus can't possibly be interrupted/preempted/clobbered by xchg()
- * Other way around, xchg is one instruction anyways, so can't be interrupted
- * as such
+ * The EX instruction is baseline and present in !LLSC too. But in this
+ * regime it still needs to use the @atomic_ops_lock spinlock to interoperate
+ * with cmpxchg(), which uses the spinlock in !LLSC
+ * (llist.h uses xchg and cmpxchg on the same data)
*/
-#if !defined(CONFIG_ARC_HAS_LLSC) && defined(CONFIG_SMP)
-
-#define arch_xchg(ptr, with) \
-({ \
- unsigned long flags; \
- typeof(*(ptr)) old_val; \
- \
- atomic_ops_lock(flags); \
- old_val = _xchg(ptr, with); \
- atomic_ops_unlock(flags); \
- old_val; \
+#define arch_xchg(ptr, val) \
+({ \
+ __typeof__(ptr) _p_ = (ptr); \
+ __typeof__(*(ptr)) _val_ = (val); \
+ \
+ unsigned long __flags; \
+ \
+ atomic_ops_lock(__flags); \
+ \
+ __asm__ __volatile__( \
+ " ex %0, [%1] \n" \
+ : "+r"(_val_) \
+ : "r"(_p_) \
+ : "memory"); \
+ \
+ atomic_ops_unlock(__flags); \
+ _val_; \
})
-#else
-
-#define arch_xchg(ptr, with) _xchg(ptr, with)
-
#endif
-/*
- * "atomic" variant of xchg()
- * REQ: It needs to follow the same serialization rules as other atomic_xxx()
- * Since xchg() doesn't always do that, it would seem that following definition
- * is incorrect. But here's the rationale:
- * SMP : Even xchg() takes the atomic_ops_lock, so OK.
- * LLSC: atomic_ops_lock are not relevant at all (even if SMP, since LLSC
- * is natively "SMP safe", no serialization required).
- * UP : other atomics disable IRQ, so no way a difft ctxt atomic_xchg()
- * could clobber them. atomic_xchg() itself would be 1 insn, so it
- * can't be clobbered by others. Thus no serialization required when
- * atomic_xchg is involved.
- */
-#define arch_atomic_xchg(v, new) (arch_xchg(&((v)->counter), new))
-
#endif
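For context, the typical caller-side pattern these primitives serve - a compare-and-swap retry loop - shown as a hypothetical helper built on the atomic_cmpxchg()/atomic_read() wrappers that resolve to the arch_* definitions above:

	/* hypothetical: lock-free "store max" via cmpxchg retry */
	static int store_max(atomic_t *max, int val)
	{
		int old = atomic_read(max);

		while (old < val) {
			int prev = atomic_cmpxchg(max, old, val);

			if (prev == old)	/* our update landed */
				break;
			old = prev;		/* raced with another writer - retry */
		}
		return old;
	}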
diff --git a/arch/arc/include/asm/entry-compact.h b/arch/arc/include/asm/entry-compact.h
index 6dbf5cecc8cc..5aab4f93ab8a 100644
--- a/arch/arc/include/asm/entry-compact.h
+++ b/arch/arc/include/asm/entry-compact.h
@@ -126,19 +126,11 @@
* to be saved again on kernel mode stack, as part of pt_regs.
*-------------------------------------------------------------*/
.macro PROLOG_FREEUP_REG reg, mem
-#ifndef ARC_USE_SCRATCH_REG
- sr \reg, [ARC_REG_SCRATCH_DATA0]
-#else
st \reg, [\mem]
-#endif
.endm
.macro PROLOG_RESTORE_REG reg, mem
-#ifndef ARC_USE_SCRATCH_REG
- lr \reg, [ARC_REG_SCRATCH_DATA0]
-#else
ld \reg, [\mem]
-#endif
.endm
/*--------------------------------------------------------------
diff --git a/arch/arc/include/asm/hugepage.h b/arch/arc/include/asm/hugepage.h
index 4eef17c5c1da..11b0ff26b97b 100644
--- a/arch/arc/include/asm/hugepage.h
+++ b/arch/arc/include/asm/hugepage.h
@@ -58,14 +58,6 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
extern void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmd);
-/* Generic variants assume pgtable_t is struct page *, hence need for these */
-#define __HAVE_ARCH_PGTABLE_DEPOSIT
-extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
- pgtable_t pgtable);
-
-#define __HAVE_ARCH_PGTABLE_WITHDRAW
-extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
-
#define __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
extern void flush_pmd_tlb_range(struct vm_area_struct *vma, unsigned long start,
unsigned long end);
diff --git a/arch/arc/include/asm/mmu-arcv2.h b/arch/arc/include/asm/mmu-arcv2.h
new file mode 100644
index 000000000000..ed9036d4ede3
--- /dev/null
+++ b/arch/arc/include/asm/mmu-arcv2.h
@@ -0,0 +1,103 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2004, 2007-2010, 2011-2012, 2019-20 Synopsys, Inc. (www.synopsys.com)
+ *
+ * MMUv3 (arc700) / MMUv4 (archs) are software page walked and software managed.
+ * This file contains the TLB access registers and commands
+ */
+
+#ifndef _ASM_ARC_MMU_ARCV2_H
+#define _ASM_ARC_MMU_ARCV2_H
+
+/*
+ * TLB Management regs
+ */
+#define ARC_REG_MMU_BCR 0x06f
+
+#ifdef CONFIG_ARC_MMU_V3
+#define ARC_REG_TLBPD0 0x405
+#define ARC_REG_TLBPD1 0x406
+#define ARC_REG_TLBPD1HI 0 /* Dummy: allows common code */
+#define ARC_REG_TLBINDEX 0x407
+#define ARC_REG_TLBCOMMAND 0x408
+#define ARC_REG_PID 0x409
+#define ARC_REG_SCRATCH_DATA0 0x418
+#else
+#define ARC_REG_TLBPD0 0x460
+#define ARC_REG_TLBPD1 0x461
+#define ARC_REG_TLBPD1HI 0x463
+#define ARC_REG_TLBINDEX 0x464
+#define ARC_REG_TLBCOMMAND 0x465
+#define ARC_REG_PID 0x468
+#define ARC_REG_SCRATCH_DATA0 0x46c
+#endif
+
+/* Bits in MMU PID reg */
+#define __TLB_ENABLE (1 << 31)
+#define __PROG_ENABLE (1 << 30)
+#define MMU_ENABLE (__TLB_ENABLE | __PROG_ENABLE)
+
+/* Bits in TLB Index reg */
+#define TLB_LKUP_ERR 0x80000000
+
+#ifdef CONFIG_ARC_MMU_V3
+#define TLB_DUP_ERR (TLB_LKUP_ERR | 0x00000001)
+#else
+#define TLB_DUP_ERR (TLB_LKUP_ERR | 0x40000000)
+#endif
+
+/*
+ * TLB Commands
+ */
+#define TLBWrite 0x1
+#define TLBRead 0x2
+#define TLBGetIndex 0x3
+#define TLBProbe 0x4
+#define TLBWriteNI 0x5 /* write JTLB without inv uTLBs */
+#define TLBIVUTLB 0x6 /* explicitly inv uTLBs */
+
+#ifdef CONFIG_ARC_MMU_V4
+#define TLBInsertEntry 0x7
+#define TLBDeleteEntry 0x8
+#endif
+
+/* Masks for actual TLB "PD"s */
+#define PTE_BITS_IN_PD0 (_PAGE_GLOBAL | _PAGE_PRESENT | _PAGE_HW_SZ)
+#define PTE_BITS_RWX (_PAGE_EXECUTE | _PAGE_WRITE | _PAGE_READ)
+
+#define PTE_BITS_NON_RWX_IN_PD1 (PAGE_MASK_PHYS | _PAGE_CACHEABLE)
+
+#ifndef __ASSEMBLY__
+
+struct mm_struct;
+extern int pae40_exist_but_not_enab(void);
+
+static inline int is_pae40_enabled(void)
+{
+ return IS_ENABLED(CONFIG_ARC_HAS_PAE40);
+}
+
+static inline void mmu_setup_asid(struct mm_struct *mm, unsigned long asid)
+{
+ write_aux_reg(ARC_REG_PID, asid | MMU_ENABLE);
+}
+
+static inline void mmu_setup_pgd(struct mm_struct *mm, void *pgd)
+{
+ /* PGD cached in MMU reg to avoid 3 mem lookups: task->mm->pgd */
+#ifdef CONFIG_ISA_ARCV2
+ write_aux_reg(ARC_REG_SCRATCH_DATA0, (unsigned int)pgd);
+#endif
+}
+
+#else
+
+.macro ARC_MMU_REENABLE reg
+ lr \reg, [ARC_REG_PID]
+ or \reg, \reg, MMU_ENABLE
+ sr \reg, [ARC_REG_PID]
+.endm
+
+#endif /* !__ASSEMBLY__ */
+
+#endif
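To show how the registers and commands above fit together, a rough sketch of programming a single TLB entry (simplified; the real logic, including PAE40 and the MMUv3 probe path, lives in arch/arc/mm/tlb.c):

	/* illustrative sketch only */
	static void tlb_entry_insert_sketch(unsigned int pd0, unsigned int pd1)
	{
		write_aux_reg(ARC_REG_TLBPD0, pd0);		/* vaddr, ASID, ctrl bits */
		write_aux_reg(ARC_REG_TLBPD1, pd1);		/* paddr, permission bits */

	#ifdef CONFIG_ARC_MMU_V4
		write_aux_reg(ARC_REG_TLBCOMMAND, TLBInsertEntry);	/* hw picks the slot */
	#else
		write_aux_reg(ARC_REG_TLBCOMMAND, TLBGetIndex);		/* pick a victim index */
		write_aux_reg(ARC_REG_TLBCOMMAND, TLBWrite);		/* commit PD0/PD1 there */
	#endif
	}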
diff --git a/arch/arc/include/asm/mmu.h b/arch/arc/include/asm/mmu.h
index 26b731d32a2b..ca427c30f70e 100644
--- a/arch/arc/include/asm/mmu.h
+++ b/arch/arc/include/asm/mmu.h
@@ -7,98 +7,15 @@
#define _ASM_ARC_MMU_H
#ifndef __ASSEMBLY__
-#include <linux/threads.h> /* NR_CPUS */
-#endif
-
-#if defined(CONFIG_ARC_MMU_V1)
-#define CONFIG_ARC_MMU_VER 1
-#elif defined(CONFIG_ARC_MMU_V2)
-#define CONFIG_ARC_MMU_VER 2
-#elif defined(CONFIG_ARC_MMU_V3)
-#define CONFIG_ARC_MMU_VER 3
-#elif defined(CONFIG_ARC_MMU_V4)
-#define CONFIG_ARC_MMU_VER 4
-#endif
-
-/* MMU Management regs */
-#define ARC_REG_MMU_BCR 0x06f
-#if (CONFIG_ARC_MMU_VER < 4)
-#define ARC_REG_TLBPD0 0x405
-#define ARC_REG_TLBPD1 0x406
-#define ARC_REG_TLBPD1HI 0 /* Dummy: allows code sharing with ARC700 */
-#define ARC_REG_TLBINDEX 0x407
-#define ARC_REG_TLBCOMMAND 0x408
-#define ARC_REG_PID 0x409
-#define ARC_REG_SCRATCH_DATA0 0x418
-#else
-#define ARC_REG_TLBPD0 0x460
-#define ARC_REG_TLBPD1 0x461
-#define ARC_REG_TLBPD1HI 0x463
-#define ARC_REG_TLBINDEX 0x464
-#define ARC_REG_TLBCOMMAND 0x465
-#define ARC_REG_PID 0x468
-#define ARC_REG_SCRATCH_DATA0 0x46c
-#endif
-
-#if defined(CONFIG_ISA_ARCV2) || !defined(CONFIG_SMP)
-#define ARC_USE_SCRATCH_REG
-#endif
-
-/* Bits in MMU PID register */
-#define __TLB_ENABLE (1 << 31)
-#define __PROG_ENABLE (1 << 30)
-#define MMU_ENABLE (__TLB_ENABLE | __PROG_ENABLE)
-
-/* Error code if probe fails */
-#define TLB_LKUP_ERR 0x80000000
-
-#if (CONFIG_ARC_MMU_VER < 4)
-#define TLB_DUP_ERR (TLB_LKUP_ERR | 0x00000001)
-#else
-#define TLB_DUP_ERR (TLB_LKUP_ERR | 0x40000000)
-#endif
-
-/* TLB Commands */
-#define TLBWrite 0x1
-#define TLBRead 0x2
-#define TLBGetIndex 0x3
-#define TLBProbe 0x4
-
-#if (CONFIG_ARC_MMU_VER >= 2)
-#define TLBWriteNI 0x5 /* write JTLB without inv uTLBs */
-#define TLBIVUTLB 0x6 /* explicitly inv uTLBs */
-#else
-#define TLBWriteNI TLBWrite /* Not present in hardware, fallback */
-#endif
-
-#if (CONFIG_ARC_MMU_VER >= 4)
-#define TLBInsertEntry 0x7
-#define TLBDeleteEntry 0x8
-#endif
-#ifndef __ASSEMBLY__
+#include <linux/threads.h> /* NR_CPUS */
typedef struct {
unsigned long asid[NR_CPUS]; /* 8 bit MMU PID + Generation cycle */
} mm_context_t;
-#ifdef CONFIG_ARC_DBG_TLB_PARANOIA
-void tlb_paranoid_check(unsigned int mm_asid, unsigned long address);
-#else
-#define tlb_paranoid_check(a, b)
#endif
-void arc_mmu_init(void);
-extern char *arc_mmu_mumbojumbo(int cpu_id, char *buf, int len);
-void read_decode_mmu_bcr(void);
-
-static inline int is_pae40_enabled(void)
-{
- return IS_ENABLED(CONFIG_ARC_HAS_PAE40);
-}
-
-extern int pae40_exist_but_not_enab(void);
-
-#endif /* !__ASSEMBLY__ */
+#include <asm/mmu-arcv2.h>
#endif
diff --git a/arch/arc/include/asm/mmu_context.h b/arch/arc/include/asm/mmu_context.h
index df164066e172..dda471f5f05b 100644
--- a/arch/arc/include/asm/mmu_context.h
+++ b/arch/arc/include/asm/mmu_context.h
@@ -15,22 +15,23 @@
#ifndef _ASM_ARC_MMU_CONTEXT_H
#define _ASM_ARC_MMU_CONTEXT_H
-#include <asm/arcregs.h>
-#include <asm/tlb.h>
#include <linux/sched/mm.h>
+#include <asm/tlb.h>
#include <asm-generic/mm_hooks.h>
-/* ARC700 ASID Management
+/* ARC ASID Management
+ *
+ * The MMU tags TLB entries with an 8-bit ASID, avoiding the need to flush
+ * the TLB on every context-switch.
*
- * ARC MMU provides 8-bit ASID (0..255) to TAG TLB entries, allowing entries
- * with same vaddr (different tasks) to co-exit. This provides for
- * "Fast Context Switch" i.e. no TLB flush on ctxt-switch
+ * ASIDs are managed per cpu, so threads of a task running on different CPUs
+ * can have different ASIDs. Global ASID management would only be needed if
+ * the hardware supported TLB shootdown and/or a shared TLB across cores,
+ * which ARC doesn't.
*
- * Linux assigns each task a unique ASID. A simple round-robin allocation
- * of H/w ASID is done using software tracker @asid_cpu.
- * When it reaches max 255, the allocation cycle starts afresh by flushing
- * the entire TLB and wrapping ASID back to zero.
+ * Each task is assigned a unique ASID, using a simple round-robin allocator
+ * tracked in @asid_cpu. When the 8-bit value rolls over, a new cycle starts
+ * from 0 and the TLB is flushed
*
* A new allocation cycle, post rollover, could potentially reassign an ASID
* to a different task. Thus the rule is to refresh the ASID in a new cycle.
@@ -93,7 +94,7 @@ static inline void get_new_mmu_context(struct mm_struct *mm)
asid_mm(mm, cpu) = asid_cpu(cpu);
set_hw:
- write_aux_reg(ARC_REG_PID, hw_pid(mm, cpu) | MMU_ENABLE);
+ mmu_setup_asid(mm, hw_pid(mm, cpu));
local_irq_restore(flags);
}
@@ -146,10 +147,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
*/
cpumask_set_cpu(cpu, mm_cpumask(next));
-#ifdef ARC_USE_SCRATCH_REG
- /* PGD cached in MMU reg to avoid 3 mem lookups: task->mm->pgd */
- write_aux_reg(ARC_REG_SCRATCH_DATA0, next->pgd);
-#endif
+ mmu_setup_pgd(next, next->pgd);
get_new_mmu_context(next);
}
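A simplified sketch of the allocation scheme described in the comment above; illustrative only. The real get_new_mmu_context() additionally keeps a generation count in the upper bits of the 32-bit container so it can tell whether an mm's cached ASID belongs to the current allocation cycle.

    static unsigned long example_asid_cache;       /* per-cpu in the kernel */

    static unsigned int example_alloc_asid(void)
    {
            if (!(++example_asid_cache & 0xff))    /* 8-bit hw ASID rolled over */
                    local_flush_tlb_all();         /* discard the old cycle     */

            return example_asid_cache & 0xff;      /* value fed to mmu_setup_asid() */
    }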
diff --git a/arch/arc/include/asm/page.h b/arch/arc/include/asm/page.h
index 4a9d33372fe2..9a62e1d87967 100644
--- a/arch/arc/include/asm/page.h
+++ b/arch/arc/include/asm/page.h
@@ -34,57 +34,55 @@ void copy_user_highpage(struct page *to, struct page *from,
unsigned long u_vaddr, struct vm_area_struct *vma);
void clear_user_page(void *to, unsigned long u_vaddr, struct page *page);
-#undef STRICT_MM_TYPECHECKS
-
-#ifdef STRICT_MM_TYPECHECKS
-/*
- * These are used to make use of C type-checking..
- */
-typedef struct {
-#ifdef CONFIG_ARC_HAS_PAE40
- unsigned long long pte;
-#else
- unsigned long pte;
-#endif
-} pte_t;
typedef struct {
unsigned long pgd;
} pgd_t;
+
+#define pgd_val(x) ((x).pgd)
+#define __pgd(x) ((pgd_t) { (x) })
+
+#if CONFIG_PGTABLE_LEVELS > 3
+
typedef struct {
- unsigned long pgprot;
-} pgprot_t;
+ unsigned long pud;
+} pud_t;
-#define pte_val(x) ((x).pte)
-#define pgd_val(x) ((x).pgd)
-#define pgprot_val(x) ((x).pgprot)
+#define pud_val(x) ((x).pud)
+#define __pud(x) ((pud_t) { (x) })
-#define __pte(x) ((pte_t) { (x) })
-#define __pgd(x) ((pgd_t) { (x) })
-#define __pgprot(x) ((pgprot_t) { (x) })
+#endif
+
+#if CONFIG_PGTABLE_LEVELS > 2
-#define pte_pgprot(x) __pgprot(pte_val(x))
+typedef struct {
+ unsigned long pmd;
+} pmd_t;
-#else /* !STRICT_MM_TYPECHECKS */
+#define pmd_val(x) ((x).pmd)
+#define __pmd(x) ((pmd_t) { (x) })
+#endif
+
+typedef struct {
#ifdef CONFIG_ARC_HAS_PAE40
-typedef unsigned long long pte_t;
+ unsigned long long pte;
#else
-typedef unsigned long pte_t;
+ unsigned long pte;
#endif
-typedef unsigned long pgd_t;
-typedef unsigned long pgprot_t;
+} pte_t;
-#define pte_val(x) (x)
-#define pgd_val(x) (x)
-#define pgprot_val(x) (x)
-#define __pte(x) (x)
-#define __pgd(x) (x)
-#define __pgprot(x) (x)
-#define pte_pgprot(x) (x)
+#define pte_val(x) ((x).pte)
+#define __pte(x) ((pte_t) { (x) })
-#endif
+typedef struct {
+ unsigned long pgprot;
+} pgprot_t;
+
+#define pgprot_val(x) ((x).pgprot)
+#define __pgprot(x) ((pgprot_t) { (x) })
+#define pte_pgprot(x) __pgprot(pte_val(x))
-typedef pte_t * pgtable_t;
+typedef struct page *pgtable_t;
/*
* Use virt_to_pfn with caution:
@@ -122,8 +120,8 @@ extern int pfn_valid(unsigned long pfn);
* virt here means link-address/program-address as embedded in object code.
* And for ARC, link-addr = physical address
*/
-#define __pa(vaddr) ((unsigned long)(vaddr))
-#define __va(paddr) ((void *)((unsigned long)(paddr)))
+#define __pa(vaddr) ((unsigned long)(vaddr))
+#define __va(paddr) ((void *)((unsigned long)(paddr)))
#define virt_to_page(kaddr) pfn_to_page(virt_to_pfn(kaddr))
#define virt_addr_valid(kaddr) pfn_valid(virt_to_pfn(kaddr))
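The struct wrappers above (now unconditional, with the old STRICT_MM_TYPECHECKS escape hatch gone) exist purely for compile-time type safety. A minimal, purely illustrative example of what they buy:

    /* With distinct struct types, mixing up table levels is a build error */
    static inline void example_typecheck(pmd_t *pmdp, pgd_t pgd)
    {
            set_pmd(pmdp, __pmd(pgd_val(pgd)));    /* explicit conversion: OK  */
            /* set_pmd(pmdp, pgd); */              /* rejected by the compiler */
    }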
diff --git a/arch/arc/include/asm/pgalloc.h b/arch/arc/include/asm/pgalloc.h
index a32ca3104ced..096b8ef58edb 100644
--- a/arch/arc/include/asm/pgalloc.h
+++ b/arch/arc/include/asm/pgalloc.h
@@ -31,30 +31,32 @@
#include <linux/mm.h>
#include <linux/log2.h>
+#include <asm-generic/pgalloc.h>
static inline void
pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte)
{
- pmd_set(pmd, pte);
+ /*
+ * The cast to long below is OK even in the 32-bit PAE40 regime with a
+ * long long pte: despite the "wider" pte, the pte table itself must be
+ * in non-PAE low memory, as all higher levels can only hold (32-bit)
+ * long pointers.
+ *
+ * The cast itself is needed given the simplistic definition of set_pmd()
+ */
+ set_pmd(pmd, __pmd((unsigned long)pte));
}
-static inline void
-pmd_populate(struct mm_struct *mm, pmd_t *pmd, pgtable_t ptep)
+static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, pgtable_t pte_page)
{
- pmd_set(pmd, (pte_t *) ptep);
-}
-
-static inline int __get_order_pgd(void)
-{
- return get_order(PTRS_PER_PGD * sizeof(pgd_t));
+ set_pmd(pmd, __pmd((unsigned long)page_address(pte_page)));
}
static inline pgd_t *pgd_alloc(struct mm_struct *mm)
{
- int num, num2;
- pgd_t *ret = (pgd_t *) __get_free_pages(GFP_KERNEL, __get_order_pgd());
+ pgd_t *ret = (pgd_t *) __get_free_page(GFP_KERNEL);
if (ret) {
+ int num, num2;
num = USER_PTRS_PER_PGD + USER_KERNEL_GUTTER / PGDIR_SIZE;
memzero(ret, num * sizeof(pgd_t));
@@ -68,64 +70,27 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
return ret;
}
-static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
-{
- free_pages((unsigned long)pgd, __get_order_pgd());
-}
-
-
-/*
- * With software-only page-tables, addr-split for traversal is tweakable and
- * that directly governs how big tables would be at each level.
- * Further, the MMU page size is configurable.
- * Thus we need to programatically assert the size constraint
- * All of this is const math, allowing gcc to do constant folding/propagation.
- */
+#if CONFIG_PGTABLE_LEVELS > 3
-static inline int __get_order_pte(void)
+static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4dp, pud_t *pudp)
{
- return get_order(PTRS_PER_PTE * sizeof(pte_t));
+ set_p4d(p4dp, __p4d((unsigned long)pudp));
}
-static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
-{
- pte_t *pte;
+#define __pud_free_tlb(tlb, pmd, addr) pud_free((tlb)->mm, pmd)
- pte = (pte_t *) __get_free_pages(GFP_KERNEL | __GFP_ZERO,
- __get_order_pte());
+#endif
- return pte;
-}
+#if CONFIG_PGTABLE_LEVELS > 2
-static inline pgtable_t
-pte_alloc_one(struct mm_struct *mm)
+static inline void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmdp)
{
- pgtable_t pte_pg;
- struct page *page;
-
- pte_pg = (pgtable_t)__get_free_pages(GFP_KERNEL, __get_order_pte());
- if (!pte_pg)
- return 0;
- memzero((void *)pte_pg, PTRS_PER_PTE * sizeof(pte_t));
- page = virt_to_page(pte_pg);
- if (!pgtable_pte_page_ctor(page)) {
- __free_page(page);
- return 0;
- }
-
- return pte_pg;
+ set_pud(pudp, __pud((unsigned long)pmdp));
}
-static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
-{
- free_pages((unsigned long)pte, __get_order_pte()); /* takes phy addr */
-}
+#define __pmd_free_tlb(tlb, pmd, addr) pmd_free((tlb)->mm, pmd)
-static inline void pte_free(struct mm_struct *mm, pgtable_t ptep)
-{
- pgtable_pte_page_dtor(virt_to_page(ptep));
- free_pages((unsigned long)ptep, __get_order_pte());
-}
+#endif
#define __pte_free_tlb(tlb, pte, addr) pte_free((tlb)->mm, pte)
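A sketch of how the populate helpers above chain a freshly allocated 4-level tree together (the CONFIG_PGTABLE_LEVELS > 3 case), assuming the generic allocators from asm-generic/pgalloc.h now pulled in at the top of this header. Allocation-failure handling is omitted and the function is illustrative only.

    static inline void example_wire_up(struct mm_struct *mm, p4d_t *p4dp,
                                       unsigned long addr)
    {
            pud_t *pudp = pud_alloc_one(mm, addr);
            pmd_t *pmdp = pmd_alloc_one(mm, addr);
            pte_t *ptep = pte_alloc_one_kernel(mm);

            p4d_populate(mm, p4dp, pudp);          /* p4d entry -> pud table */
            pud_populate(mm, pudp, pmdp);          /* pud entry -> pmd table */
            pmd_populate_kernel(mm, pmdp, ptep);   /* pmd entry -> pte table */
    }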
diff --git a/arch/arc/include/asm/pgtable-bits-arcv2.h b/arch/arc/include/asm/pgtable-bits-arcv2.h
new file mode 100644
index 000000000000..183d23bc1e00
--- /dev/null
+++ b/arch/arc/include/asm/pgtable-bits-arcv2.h
@@ -0,0 +1,149 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com)
+ */
+
+/*
+ * page table flags for software walked/managed MMUv3 (ARC700) and MMUv4 (HS)
+ * These correspond to the matching bits in the hardware TLB entry
+ */
+
+#ifndef _ASM_ARC_PGTABLE_BITS_ARCV2_H
+#define _ASM_ARC_PGTABLE_BITS_ARCV2_H
+
+#ifdef CONFIG_ARC_CACHE_PAGES
+#define _PAGE_CACHEABLE (1 << 0) /* Cached (H) */
+#else
+#define _PAGE_CACHEABLE 0
+#endif
+
+#define _PAGE_EXECUTE (1 << 1) /* User Execute (H) */
+#define _PAGE_WRITE (1 << 2) /* User Write (H) */
+#define _PAGE_READ (1 << 3) /* User Read (H) */
+#define _PAGE_ACCESSED (1 << 4) /* Accessed (s) */
+#define _PAGE_DIRTY (1 << 5) /* Modified (s) */
+#define _PAGE_SPECIAL (1 << 6)
+#define _PAGE_GLOBAL (1 << 8) /* ASID agnostic (H) */
+#define _PAGE_PRESENT (1 << 9) /* PTE/TLB Valid (H) */
+
+#ifdef CONFIG_ARC_MMU_V4
+#define _PAGE_HW_SZ (1 << 10) /* Normal/super (H) */
+#else
+#define _PAGE_HW_SZ 0
+#endif
+
+/* Defaults for every user page */
+#define ___DEF (_PAGE_PRESENT | _PAGE_CACHEABLE)
+
+/* Set of bits not changed in pte_modify */
+#define _PAGE_CHG_MASK (PAGE_MASK_PHYS | _PAGE_ACCESSED | _PAGE_DIRTY | \
+ _PAGE_SPECIAL)
+
+/* More abbreviated helpers */
+#define PAGE_U_NONE __pgprot(___DEF)
+#define PAGE_U_R __pgprot(___DEF | _PAGE_READ)
+#define PAGE_U_W_R __pgprot(___DEF | _PAGE_READ | _PAGE_WRITE)
+#define PAGE_U_X_R __pgprot(___DEF | _PAGE_READ | _PAGE_EXECUTE)
+#define PAGE_U_X_W_R __pgprot(___DEF \
+ | _PAGE_READ | _PAGE_WRITE | _PAGE_EXECUTE)
+#define PAGE_KERNEL __pgprot(___DEF | _PAGE_GLOBAL \
+ | _PAGE_READ | _PAGE_WRITE | _PAGE_EXECUTE)
+
+#define PAGE_SHARED PAGE_U_W_R
+
+#define pgprot_noncached(prot) (__pgprot(pgprot_val(prot) & ~_PAGE_CACHEABLE))
+
+/*
+ * Mapping of vm_flags (Generic VM) to PTE flags (arch specific)
+ *
+ * Certain cases have 1:1 mapping
+ * e.g. __P101 means VM_READ, VM_EXEC and !VM_SHARED
+ * which directly corresponds to PAGE_U_X_R
+ *
+ * Other rules which cause the divergence from 1:1 mapping
+ *
+ * 1. Although ARC700 can do exclusive execute/write protection (meaning R
+ * can be tracked independent of X/W unlike some other CPUs), still to
+ * keep things consistent with other archs:
+ * -Write implies Read: W => R
+ * -Execute implies Read: X => R
+ *
+ * 2. Pvt Writable doesn't have Write Enabled initially: Pvt-W => !W
+ * This is to enable COW mechanism
+ */
+ /* xwr */
+#define __P000 PAGE_U_NONE
+#define __P001 PAGE_U_R
+#define __P010 PAGE_U_R /* Pvt-W => !W */
+#define __P011 PAGE_U_R /* Pvt-W => !W */
+#define __P100 PAGE_U_X_R /* X => R */
+#define __P101 PAGE_U_X_R
+#define __P110 PAGE_U_X_R /* Pvt-W => !W and X => R */
+#define __P111 PAGE_U_X_R /* Pvt-W => !W */
+
+#define __S000 PAGE_U_NONE
+#define __S001 PAGE_U_R
+#define __S010 PAGE_U_W_R /* W => R */
+#define __S011 PAGE_U_W_R
+#define __S100 PAGE_U_X_R /* X => R */
+#define __S101 PAGE_U_X_R
+#define __S110 PAGE_U_X_W_R /* X => R */
+#define __S111 PAGE_U_X_W_R
+
+#ifndef __ASSEMBLY__
+
+#define pte_write(pte) (pte_val(pte) & _PAGE_WRITE)
+#define pte_dirty(pte) (pte_val(pte) & _PAGE_DIRTY)
+#define pte_young(pte) (pte_val(pte) & _PAGE_ACCESSED)
+#define pte_special(pte) (pte_val(pte) & _PAGE_SPECIAL)
+
+#define PTE_BIT_FUNC(fn, op) \
+ static inline pte_t pte_##fn(pte_t pte) { pte_val(pte) op; return pte; }
+
+PTE_BIT_FUNC(mknotpresent, &= ~(_PAGE_PRESENT));
+PTE_BIT_FUNC(wrprotect, &= ~(_PAGE_WRITE));
+PTE_BIT_FUNC(mkwrite, |= (_PAGE_WRITE));
+PTE_BIT_FUNC(mkclean, &= ~(_PAGE_DIRTY));
+PTE_BIT_FUNC(mkdirty, |= (_PAGE_DIRTY));
+PTE_BIT_FUNC(mkold, &= ~(_PAGE_ACCESSED));
+PTE_BIT_FUNC(mkyoung, |= (_PAGE_ACCESSED));
+PTE_BIT_FUNC(mkspecial, |= (_PAGE_SPECIAL));
+PTE_BIT_FUNC(mkhuge, |= (_PAGE_HW_SZ));
+
+static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
+{
+ return __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot));
+}
+
+static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pteval)
+{
+ set_pte(ptep, pteval);
+}
+
+void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
+ pte_t *ptep);
+
+/* Encode swap {type,off} tuple into PTE
+ * We reserve 13 bits for 5-bit @type, keeping bits 12-5 zero, ensuring that
+ * PAGE_PRESENT is zero in a PTE holding swap "identifier"
+ */
+#define __swp_entry(type, off) ((swp_entry_t) \
+ { ((type) & 0x1f) | ((off) << 13) })
+
+/* Decode a PTE containing swap "identifier" into constituents */
+#define __swp_type(pte_lookalike) (((pte_lookalike).val) & 0x1f)
+#define __swp_offset(pte_lookalike) ((pte_lookalike).val >> 13)
+
+#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) })
+#define __swp_entry_to_pte(x) ((pte_t) { (x).val })
+
+#define kern_addr_valid(addr) (1)
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#include <asm/hugepage.h>
+#endif
+
+#endif /* __ASSEMBLY__ */
+
+#endif
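A worked example of the swap encoding above; the values are purely illustrative.

    swp_entry_t e = __swp_entry(0x3, 0x1234);

    e.val            == 0x02468003   /* type in bits 4:0, offset from bit 13 up */
    __swp_type(e)    == 0x3
    __swp_offset(e)  == 0x1234

Bits 12:5 stay zero, so _PAGE_PRESENT (bit 9) is clear and such a PTE can never be mistaken for a valid translation.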
diff --git a/arch/arc/include/asm/pgtable-levels.h b/arch/arc/include/asm/pgtable-levels.h
new file mode 100644
index 000000000000..8084ef2f6491
--- /dev/null
+++ b/arch/arc/include/asm/pgtable-levels.h
@@ -0,0 +1,189 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2020 Synopsys, Inc. (www.synopsys.com)
+ */
+
+/*
+ * Helpers for implementing paging levels
+ */
+
+#ifndef _ASM_ARC_PGTABLE_LEVELS_H
+#define _ASM_ARC_PGTABLE_LEVELS_H
+
+#if CONFIG_PGTABLE_LEVELS == 2
+
+/*
+ * 2 level paging setup for software walked MMUv3 (ARC700) and MMUv4 (HS)
+ *
+ * [31] 32 bit virtual address [0]
+ * -------------------------------------------------------
+ * | | <---------- PGDIR_SHIFT ----------> |
+ * | | | <-- PAGE_SHIFT --> |
+ * -------------------------------------------------------
+ * | | |
+ * | | --> off in page frame
+ * | ---> index into Page Table
+ * ----> index into Page Directory
+ *
+ * Given the software walk, the vaddr split is arbitrarily set to 11:8:13.
+ * However, enabling super pages in a 2 level regime pegs PGDIR_SHIFT to the
+ * super page size.
+ */
+
+#if defined(CONFIG_ARC_HUGEPAGE_16M)
+#define PGDIR_SHIFT 24
+#elif defined(CONFIG_ARC_HUGEPAGE_2M)
+#define PGDIR_SHIFT 21
+#else
+/*
+ * No Super page case
+ * Default values provide 11:8:13 (8K page) and 10:10:12 (4K page)
+ * The limit comes from pgtable_t being only PAGE_SIZE long
+ * (so a 4K page table can hold only 1K entries, i.e. 10 index bits)
+ */
+#ifdef CONFIG_ARC_PAGE_SIZE_4K
+#define PGDIR_SHIFT 22
+#else
+#define PGDIR_SHIFT 21
+#endif
+
+#endif
+
+#else /* CONFIG_PGTABLE_LEVELS != 2 */
+
+/*
+ * Default 3 level paging setup (for testing) on the software walked MMU:
+ * MMUv4 (8K page): <4> : <7> : <8> : <13>
+ * Default 4 level paging setup (for testing) on the software walked MMU:
+ * MMUv4 (8K page): <4> : <3> : <4> : <8> : <13>
+ */
+#define PGDIR_SHIFT 28
+#if CONFIG_PGTABLE_LEVELS > 3
+#define PUD_SHIFT 25
+#endif
+#if CONFIG_PGTABLE_LEVELS > 2
+#define PMD_SHIFT 21
+#endif
+
+#endif /* CONFIG_PGTABLE_LEVELS */
+
+#define PGDIR_SIZE BIT(PGDIR_SHIFT)
+#define PGDIR_MASK (~(PGDIR_SIZE - 1))
+#define PTRS_PER_PGD BIT(32 - PGDIR_SHIFT)
+
+#if CONFIG_PGTABLE_LEVELS > 3
+#define PUD_SIZE BIT(PUD_SHIFT)
+#define PUD_MASK (~(PUD_SIZE - 1))
+#define PTRS_PER_PUD BIT(PGDIR_SHIFT - PUD_SHIFT)
+#endif
+
+#if CONFIG_PGTABLE_LEVELS > 2
+#define PMD_SIZE BIT(PMD_SHIFT)
+#define PMD_MASK (~(PMD_SIZE - 1))
+#define PTRS_PER_PMD BIT(PUD_SHIFT - PMD_SHIFT)
+#endif
+
+#define PTRS_PER_PTE BIT(PMD_SHIFT - PAGE_SHIFT)
+
+#ifndef __ASSEMBLY__
+
+#if CONFIG_PGTABLE_LEVELS > 3
+#include <asm-generic/pgtable-nop4d.h>
+#elif CONFIG_PGTABLE_LEVELS > 2
+#include <asm-generic/pgtable-nopud.h>
+#else
+#include <asm-generic/pgtable-nopmd.h>
+#endif
+
+/*
+ * 1st level paging: pgd
+ */
+#define pgd_index(addr) ((addr) >> PGDIR_SHIFT)
+#define pgd_offset(mm, addr) (((mm)->pgd) + pgd_index(addr))
+#define pgd_offset_k(addr) pgd_offset(&init_mm, addr)
+#define pgd_ERROR(e) \
+ pr_crit("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e))
+
+#if CONFIG_PGTABLE_LEVELS > 3
+
+/* In 4 level paging, p4d_* macros work on pgd */
+#define p4d_none(x) (!p4d_val(x))
+#define p4d_bad(x) ((p4d_val(x) & ~PAGE_MASK))
+#define p4d_present(x) (p4d_val(x))
+#define p4d_clear(xp) do { p4d_val(*(xp)) = 0; } while (0)
+#define p4d_pgtable(p4d) ((pud_t *)(p4d_val(p4d) & PAGE_MASK))
+#define p4d_page(p4d) virt_to_page(p4d_pgtable(p4d))
+#define set_p4d(p4dp, p4d) (*(p4dp) = p4d)
+
+/*
+ * 2nd level paging: pud
+ */
+#define pud_ERROR(e) \
+ pr_crit("%s:%d: bad pud %08lx.\n", __FILE__, __LINE__, pud_val(e))
+
+#endif
+
+#if CONFIG_PGTABLE_LEVELS > 2
+
+/*
+ * In 3 level paging, pud_* macros work on pgd
+ * In 4 level paging, pud_* macros work on pud
+ */
+#define pud_none(x) (!pud_val(x))
+#define pud_bad(x) ((pud_val(x) & ~PAGE_MASK))
+#define pud_present(x) (pud_val(x))
+#define pud_clear(xp) do { pud_val(*(xp)) = 0; } while (0)
+#define pud_pgtable(pud) ((pmd_t *)(pud_val(pud) & PAGE_MASK))
+#define pud_page(pud) virt_to_page(pud_pgtable(pud))
+#define set_pud(pudp, pud) (*(pudp) = pud)
+
+/*
+ * 3rd level paging: pmd
+ */
+#define pmd_ERROR(e) \
+ pr_crit("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__, pmd_val(e))
+
+#define pmd_pfn(pmd) ((pmd_val(pmd) & PMD_MASK) >> PAGE_SHIFT)
+#define pfn_pmd(pfn,prot) __pmd(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
+#define mk_pmd(page,prot) pfn_pmd(page_to_pfn(page),prot)
+
+#endif
+
+/*
+ * Due to the strange way generic pgtable level folding works, the pmd_* macros
+ * - are valid even for 2 levels (which supposedly only has pgd - pte)
+ * - behave differently for 2 vs. 3
+ * In 2 level paging (pgd -> pte), pmd_* macros work on pgd
+ * In 3+ level paging (pgd -> pmd -> pte), pmd_* macros work on pmd
+ */
+#define pmd_none(x) (!pmd_val(x))
+#define pmd_bad(x) ((pmd_val(x) & ~PAGE_MASK))
+#define pmd_present(x) (pmd_val(x))
+#define pmd_clear(xp) do { pmd_val(*(xp)) = 0; } while (0)
+#define pmd_page_vaddr(pmd) (pmd_val(pmd) & PAGE_MASK)
+#define pmd_page(pmd) virt_to_page(pmd_page_vaddr(pmd))
+#define set_pmd(pmdp, pmd) (*(pmdp) = pmd)
+#define pmd_pgtable(pmd) ((pgtable_t) pmd_page_vaddr(pmd))
+
+/*
+ * 4th level paging: pte
+ */
+#define pte_ERROR(e) \
+ pr_crit("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, pte_val(e))
+
+#define pte_none(x) (!pte_val(x))
+#define pte_present(x) (pte_val(x) & _PAGE_PRESENT)
+#define pte_clear(mm,addr,ptep) set_pte_at(mm, addr, ptep, __pte(0))
+#define pte_page(pte) pfn_to_page(pte_pfn(pte))
+#define set_pte(ptep, pte) ((*(ptep)) = (pte))
+#define pte_pfn(pte) (pte_val(pte) >> PAGE_SHIFT)
+#define pfn_pte(pfn, prot) __pte(__pfn_to_phys(pfn) | pgprot_val(prot))
+#define mk_pte(page, prot) pfn_pte(page_to_pfn(page), prot)
+
+#ifdef CONFIG_ISA_ARCV2
+#define pmd_leaf(x) (pmd_val(x) & _PAGE_HW_SZ)
+#endif
+
+#endif /* !__ASSEMBLY__ */
+
+#endif
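A worked example of the default 2-level split above, with 8K pages (PAGE_SHIFT == 13, PGDIR_SHIFT == 21, i.e. the 11:8:13 layout):

    PGDIR_SIZE   = 1 << 21        = 2 MB of vaddr per pgd entry
    PTRS_PER_PGD = 1 << (32 - 21) = 2048 pgd entries
    PTRS_PER_PTE = 1 << (21 - 13) = 256 pte entries per table

    For vaddr 0x40002468:
        pgd index   = 0x40002468 >> 21          = 0x200
        pte index   = (0x40002468 >> 13) & 0xff = 0x01
        page offset = 0x40002468 & 0x1fff       = 0x468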
diff --git a/arch/arc/include/asm/pgtable.h b/arch/arc/include/asm/pgtable.h
index 320cc0ae8a08..9320b04c04bf 100644
--- a/arch/arc/include/asm/pgtable.h
+++ b/arch/arc/include/asm/pgtable.h
@@ -1,220 +1,17 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com)
- *
- * vineetg: May 2011
- * -Folded PAGE_PRESENT (used by VM) and PAGE_VALID (used by MMU) into 1.
- * They are semantically the same although in different contexts
- * VALID marks a TLB entry exists and it will only happen if PRESENT
- * - Utilise some unused free bits to confine PTE flags to 12 bits
- * This is a must for 4k pg-sz
- *
- * vineetg: Mar 2011 - changes to accommodate MMU TLB Page Descriptor mods
- * -TLB Locking never really existed, except for initial specs
- * -SILENT_xxx not needed for our port
- * -Per my request, MMU V3 changes the layout of some of the bits
- * to avoid a few shifts in TLB Miss handlers.
- *
- * vineetg: April 2010
- * -PGD entry no longer contains any flags. If empty it is 0, otherwise has
- * Pg-Tbl ptr. Thus pmd_present(), pmd_valid(), pmd_set( ) become simpler
- *
- * vineetg: April 2010
- * -Switched form 8:11:13 split for page table lookup to 11:8:13
- * -this speeds up page table allocation itself as we now have to memset 1K
- * instead of 8k per page table.
- * -TODO: Right now page table alloc is 8K and rest 7K is unused
- * need to optimise it
- *
- * Amit Bhor, Sameer Dhavale: Codito Technologies 2004
*/
#ifndef _ASM_ARC_PGTABLE_H
#define _ASM_ARC_PGTABLE_H
#include <linux/bits.h>
-#include <asm-generic/pgtable-nopmd.h>
-#include <asm/page.h>
-#include <asm/mmu.h> /* to propagate CONFIG_ARC_MMU_VER <n> */
-
-/**************************************************************************
- * Page Table Flags
- *
- * ARC700 MMU only deals with softare managed TLB entries.
- * Page Tables are purely for Linux VM's consumption and the bits below are
- * suited to that (uniqueness). Hence some are not implemented in the TLB and
- * some have different value in TLB.
- * e.g. MMU v2: K_READ bit is 8 and so is GLOBAL (possible because they live in
- * seperate PD0 and PD1, which combined forms a translation entry)
- * while for PTE perspective, they are 8 and 9 respectively
- * with MMU v3: Most bits (except SHARED) represent the exact hardware pos
- * (saves some bit shift ops in TLB Miss hdlrs)
- */
-
-#if (CONFIG_ARC_MMU_VER <= 2)
-
-#define _PAGE_ACCESSED (1<<1) /* Page is accessed (S) */
-#define _PAGE_CACHEABLE (1<<2) /* Page is cached (H) */
-#define _PAGE_EXECUTE (1<<3) /* Page has user execute perm (H) */
-#define _PAGE_WRITE (1<<4) /* Page has user write perm (H) */
-#define _PAGE_READ (1<<5) /* Page has user read perm (H) */
-#define _PAGE_DIRTY (1<<6) /* Page modified (dirty) (S) */
-#define _PAGE_SPECIAL (1<<7)
-#define _PAGE_GLOBAL (1<<8) /* Page is global (H) */
-#define _PAGE_PRESENT (1<<10) /* TLB entry is valid (H) */
-
-#else /* MMU v3 onwards */
-
-#define _PAGE_CACHEABLE (1<<0) /* Page is cached (H) */
-#define _PAGE_EXECUTE (1<<1) /* Page has user execute perm (H) */
-#define _PAGE_WRITE (1<<2) /* Page has user write perm (H) */
-#define _PAGE_READ (1<<3) /* Page has user read perm (H) */
-#define _PAGE_ACCESSED (1<<4) /* Page is accessed (S) */
-#define _PAGE_DIRTY (1<<5) /* Page modified (dirty) (S) */
-#define _PAGE_SPECIAL (1<<6)
-
-#if (CONFIG_ARC_MMU_VER >= 4)
-#define _PAGE_WTHRU (1<<7) /* Page cache mode write-thru (H) */
-#endif
-
-#define _PAGE_GLOBAL (1<<8) /* Page is global (H) */
-#define _PAGE_PRESENT (1<<9) /* TLB entry is valid (H) */
-
-#if (CONFIG_ARC_MMU_VER >= 4)
-#define _PAGE_HW_SZ (1<<10) /* Page Size indicator (H): 0 normal, 1 super */
-#endif
-
-#define _PAGE_SHARED_CODE (1<<11) /* Shared Code page with cmn vaddr
- usable for shared TLB entries (H) */
-
-#define _PAGE_UNUSED_BIT (1<<12)
-#endif
-
-/* vmalloc permissions */
-#define _K_PAGE_PERMS (_PAGE_EXECUTE | _PAGE_WRITE | _PAGE_READ | \
- _PAGE_GLOBAL | _PAGE_PRESENT)
-
-#ifndef CONFIG_ARC_CACHE_PAGES
-#undef _PAGE_CACHEABLE
-#define _PAGE_CACHEABLE 0
-#endif
-#ifndef _PAGE_HW_SZ
-#define _PAGE_HW_SZ 0
-#endif
-
-/* Defaults for every user page */
-#define ___DEF (_PAGE_PRESENT | _PAGE_CACHEABLE)
-
-/* Set of bits not changed in pte_modify */
-#define _PAGE_CHG_MASK (PAGE_MASK_PHYS | _PAGE_ACCESSED | _PAGE_DIRTY | \
- _PAGE_SPECIAL)
-/* More Abbrevaited helpers */
-#define PAGE_U_NONE __pgprot(___DEF)
-#define PAGE_U_R __pgprot(___DEF | _PAGE_READ)
-#define PAGE_U_W_R __pgprot(___DEF | _PAGE_READ | _PAGE_WRITE)
-#define PAGE_U_X_R __pgprot(___DEF | _PAGE_READ | _PAGE_EXECUTE)
-#define PAGE_U_X_W_R __pgprot(___DEF | _PAGE_READ | _PAGE_WRITE | \
- _PAGE_EXECUTE)
-
-#define PAGE_SHARED PAGE_U_W_R
-
-/* While kernel runs out of unstranslated space, vmalloc/modules use a chunk of
- * user vaddr space - visible in all addr spaces, but kernel mode only
- * Thus Global, all-kernel-access, no-user-access, cached
- */
-#define PAGE_KERNEL __pgprot(_K_PAGE_PERMS | _PAGE_CACHEABLE)
-
-/* ioremap */
-#define PAGE_KERNEL_NO_CACHE __pgprot(_K_PAGE_PERMS)
-
-/* Masks for actual TLB "PD"s */
-#define PTE_BITS_IN_PD0 (_PAGE_GLOBAL | _PAGE_PRESENT | _PAGE_HW_SZ)
-#define PTE_BITS_RWX (_PAGE_EXECUTE | _PAGE_WRITE | _PAGE_READ)
-
-#define PTE_BITS_NON_RWX_IN_PD1 (PAGE_MASK_PHYS | _PAGE_CACHEABLE)
-
-/**************************************************************************
- * Mapping of vm_flags (Generic VM) to PTE flags (arch specific)
- *
- * Certain cases have 1:1 mapping
- * e.g. __P101 means VM_READ, VM_EXEC and !VM_SHARED
- * which directly corresponds to PAGE_U_X_R
- *
- * Other rules which cause the divergence from 1:1 mapping
- *
- * 1. Although ARC700 can do exclusive execute/write protection (meaning R
- * can be tracked independet of X/W unlike some other CPUs), still to
- * keep things consistent with other archs:
- * -Write implies Read: W => R
- * -Execute implies Read: X => R
- *
- * 2. Pvt Writable doesn't have Write Enabled initially: Pvt-W => !W
- * This is to enable COW mechanism
- */
- /* xwr */
-#define __P000 PAGE_U_NONE
-#define __P001 PAGE_U_R
-#define __P010 PAGE_U_R /* Pvt-W => !W */
-#define __P011 PAGE_U_R /* Pvt-W => !W */
-#define __P100 PAGE_U_X_R /* X => R */
-#define __P101 PAGE_U_X_R
-#define __P110 PAGE_U_X_R /* Pvt-W => !W and X => R */
-#define __P111 PAGE_U_X_R /* Pvt-W => !W */
-
-#define __S000 PAGE_U_NONE
-#define __S001 PAGE_U_R
-#define __S010 PAGE_U_W_R /* W => R */
-#define __S011 PAGE_U_W_R
-#define __S100 PAGE_U_X_R /* X => R */
-#define __S101 PAGE_U_X_R
-#define __S110 PAGE_U_X_W_R /* X => R */
-#define __S111 PAGE_U_X_W_R
-
-/****************************************************************
- * 2 tier (PGD:PTE) software page walker
- *
- * [31] 32 bit virtual address [0]
- * -------------------------------------------------------
- * | | <------------ PGDIR_SHIFT ----------> |
- * | | |
- * | BITS_FOR_PGD | BITS_FOR_PTE | <-- PAGE_SHIFT --> |
- * -------------------------------------------------------
- * | | |
- * | | --> off in page frame
- * | ---> index into Page Table
- * ----> index into Page Directory
- *
- * In a single page size configuration, only PAGE_SHIFT is fixed
- * So both PGD and PTE sizing can be tweaked
- * e.g. 8K page (PAGE_SHIFT 13) can have
- * - PGDIR_SHIFT 21 -> 11:8:13 address split
- * - PGDIR_SHIFT 24 -> 8:11:13 address split
- *
- * If Super Page is configured, PGDIR_SHIFT becomes fixed too,
- * so the sizing flexibility is gone.
- */
-
-#if defined(CONFIG_ARC_HUGEPAGE_16M)
-#define PGDIR_SHIFT 24
-#elif defined(CONFIG_ARC_HUGEPAGE_2M)
-#define PGDIR_SHIFT 21
-#else
-/*
- * Only Normal page support so "hackable" (see comment above)
- * Default value provides 11:8:13 (8K), 11:9:12 (4K)
- */
-#define PGDIR_SHIFT 21
-#endif
-
-#define BITS_FOR_PTE (PGDIR_SHIFT - PAGE_SHIFT)
-#define BITS_FOR_PGD (32 - PGDIR_SHIFT)
-
-#define PGDIR_SIZE BIT(PGDIR_SHIFT) /* vaddr span, not PDG sz */
-#define PGDIR_MASK (~(PGDIR_SIZE-1))
-
-#define PTRS_PER_PTE BIT(BITS_FOR_PTE)
-#define PTRS_PER_PGD BIT(BITS_FOR_PGD)
+#include <asm/pgtable-levels.h>
+#include <asm/pgtable-bits-arcv2.h>
+#include <asm/page.h>
+#include <asm/mmu.h>
/*
* Number of entries a user land program use.
@@ -222,143 +19,17 @@
*/
#define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE)
-
-/****************************************************************
- * Bucket load of VM Helpers
- */
-
#ifndef __ASSEMBLY__
-#define pte_ERROR(e) \
- pr_crit("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, pte_val(e))
-#define pgd_ERROR(e) \
- pr_crit("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e))
-
-/* the zero page used for uninitialized and anonymous pages */
extern char empty_zero_page[PAGE_SIZE];
#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
-#define set_pte(pteptr, pteval) ((*(pteptr)) = (pteval))
-#define set_pmd(pmdptr, pmdval) (*(pmdptr) = pmdval)
-
-/* find the page descriptor of the Page Tbl ref by PMD entry */
-#define pmd_page(pmd) virt_to_page(pmd_val(pmd) & PAGE_MASK)
-
-/* find the logical addr (phy for ARC) of the Page Tbl ref by PMD entry */
-#define pmd_page_vaddr(pmd) (pmd_val(pmd) & PAGE_MASK)
-
-/* In a 2 level sys, setup the PGD entry with PTE value */
-static inline void pmd_set(pmd_t *pmdp, pte_t *ptep)
-{
- pmd_val(*pmdp) = (unsigned long)ptep;
-}
-
-#define pte_none(x) (!pte_val(x))
-#define pte_present(x) (pte_val(x) & _PAGE_PRESENT)
-#define pte_clear(mm, addr, ptep) set_pte_at(mm, addr, ptep, __pte(0))
-
-#define pmd_none(x) (!pmd_val(x))
-#define pmd_bad(x) ((pmd_val(x) & ~PAGE_MASK))
-#define pmd_present(x) (pmd_val(x))
-#define pmd_leaf(x) (pmd_val(x) & _PAGE_HW_SZ)
-#define pmd_clear(xp) do { pmd_val(*(xp)) = 0; } while (0)
-
-#define pte_page(pte) pfn_to_page(pte_pfn(pte))
-#define mk_pte(page, prot) pfn_pte(page_to_pfn(page), prot)
-#define pfn_pte(pfn, prot) __pte(__pfn_to_phys(pfn) | pgprot_val(prot))
-
-/* Don't use virt_to_pfn for macros below: could cause truncations for PAE40*/
-#define pte_pfn(pte) (pte_val(pte) >> PAGE_SHIFT)
-
-/* Zoo of pte_xxx function */
-#define pte_read(pte) (pte_val(pte) & _PAGE_READ)
-#define pte_write(pte) (pte_val(pte) & _PAGE_WRITE)
-#define pte_dirty(pte) (pte_val(pte) & _PAGE_DIRTY)
-#define pte_young(pte) (pte_val(pte) & _PAGE_ACCESSED)
-#define pte_special(pte) (pte_val(pte) & _PAGE_SPECIAL)
-
-#define PTE_BIT_FUNC(fn, op) \
- static inline pte_t pte_##fn(pte_t pte) { pte_val(pte) op; return pte; }
-
-PTE_BIT_FUNC(mknotpresent, &= ~(_PAGE_PRESENT));
-PTE_BIT_FUNC(wrprotect, &= ~(_PAGE_WRITE));
-PTE_BIT_FUNC(mkwrite, |= (_PAGE_WRITE));
-PTE_BIT_FUNC(mkclean, &= ~(_PAGE_DIRTY));
-PTE_BIT_FUNC(mkdirty, |= (_PAGE_DIRTY));
-PTE_BIT_FUNC(mkold, &= ~(_PAGE_ACCESSED));
-PTE_BIT_FUNC(mkyoung, |= (_PAGE_ACCESSED));
-PTE_BIT_FUNC(exprotect, &= ~(_PAGE_EXECUTE));
-PTE_BIT_FUNC(mkexec, |= (_PAGE_EXECUTE));
-PTE_BIT_FUNC(mkspecial, |= (_PAGE_SPECIAL));
-PTE_BIT_FUNC(mkhuge, |= (_PAGE_HW_SZ));
-
-static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
-{
- return __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot));
-}
+extern pgd_t swapper_pg_dir[] __aligned(PAGE_SIZE);
/* Macro to mark a page protection as uncacheable */
#define pgprot_noncached(prot) (__pgprot(pgprot_val(prot) & ~_PAGE_CACHEABLE))
-static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pteval)
-{
- set_pte(ptep, pteval);
-}
-
-/*
- * Macro to quickly access the PGD entry, utlising the fact that some
- * arch may cache the pointer to Page Directory of "current" task
- * in a MMU register
- *
- * Thus task->mm->pgd (3 pointer dereferences, cache misses etc simply
- * becomes read a register
- *
- * ********CAUTION*******:
- * Kernel code might be dealing with some mm_struct of NON "current"
- * Thus use this macro only when you are certain that "current" is current
- * e.g. when dealing with signal frame setup code etc
- */
-#ifdef ARC_USE_SCRATCH_REG
-#define pgd_offset_fast(mm, addr) \
-({ \
- pgd_t *pgd_base = (pgd_t *) read_aux_reg(ARC_REG_SCRATCH_DATA0); \
- pgd_base + pgd_index(addr); \
-})
-#else
-#define pgd_offset_fast(mm, addr) pgd_offset(mm, addr)
-#endif
-
extern pgd_t swapper_pg_dir[] __aligned(PAGE_SIZE);
-void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
- pte_t *ptep);
-
-/* Encode swap {type,off} tuple into PTE
- * We reserve 13 bits for 5-bit @type, keeping bits 12-5 zero, ensuring that
- * PAGE_PRESENT is zero in a PTE holding swap "identifier"
- */
-#define __swp_entry(type, off) ((swp_entry_t) { \
- ((type) & 0x1f) | ((off) << 13) })
-
-/* Decode a PTE containing swap "identifier "into constituents */
-#define __swp_type(pte_lookalike) (((pte_lookalike).val) & 0x1f)
-#define __swp_offset(pte_lookalike) ((pte_lookalike).val >> 13)
-
-/* NOPs, to keep generic kernel happy */
-#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) })
-#define __swp_entry_to_pte(x) ((pte_t) { (x).val })
-
-#define kern_addr_valid(addr) (1)
-
-#define pmd_pgtable(pmd) ((pgtable_t) pmd_page_vaddr(pmd))
-
-/*
- * remap a physical page `pfn' of size `size' with page protection `prot'
- * into virtual address `from'
- */
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#include <asm/hugepage.h>
-#endif
/* to cope with aliasing VIPT cache */
#define HAVE_ARCH_UNMAPPED_AREA
diff --git a/arch/arc/include/asm/processor.h b/arch/arc/include/asm/processor.h
index e4031ecd3c8c..f28afcf5c6d1 100644
--- a/arch/arc/include/asm/processor.h
+++ b/arch/arc/include/asm/processor.h
@@ -93,7 +93,7 @@ extern unsigned int get_wchan(struct task_struct *p);
#define VMALLOC_START (PAGE_OFFSET - (CONFIG_ARC_KVADDR_SIZE << 20))
/* 1 PGDIR_SIZE each for fixmap/pkmap, 2 PGDIR_SIZE gutter (see asm/highmem.h) */
-#define VMALLOC_SIZE ((CONFIG_ARC_KVADDR_SIZE << 20) - PGDIR_SIZE * 4)
+#define VMALLOC_SIZE ((CONFIG_ARC_KVADDR_SIZE << 20) - PMD_SIZE * 4)
#define VMALLOC_END (VMALLOC_START + VMALLOC_SIZE)
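A worked example of the sizing above, assuming the default CONFIG_ARC_KVADDR_SIZE of 256 (MB) and 8K pages, i.e. PMD_SIZE = 2 MB:

    VMALLOC_SIZE = (256 << 20) - 4 * (2 << 20) = 256 MB - 8 MB = 248 MB

Basing the gutter on PMD_SIZE keeps it at 2 MB even in the 3/4-level configurations introduced by this series, where PGDIR_SHIFT moves to 28 and PGDIR_SIZE grows to 256 MB.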
diff --git a/arch/arc/include/asm/setup.h b/arch/arc/include/asm/setup.h
index 01f85478170d..028a8cf76206 100644
--- a/arch/arc/include/asm/setup.h
+++ b/arch/arc/include/asm/setup.h
@@ -2,8 +2,8 @@
/*
* Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com)
*/
-#ifndef __ASMARC_SETUP_H
-#define __ASMARC_SETUP_H
+#ifndef __ASM_ARC_SETUP_H
+#define __ASM_ARC_SETUP_H
#include <linux/types.h>
@@ -34,4 +34,12 @@ long __init arc_get_mem_sz(void);
#define IS_AVAIL2(v, s, cfg) IS_AVAIL1(v, s), IS_AVAIL1(v, IS_USED_CFG(cfg))
#define IS_AVAIL3(v, v2, s) IS_AVAIL1(v, s), IS_AVAIL1(v, IS_DISABLED_RUN(v2))
+extern void arc_mmu_init(void);
+extern char *arc_mmu_mumbojumbo(int cpu_id, char *buf, int len);
+extern void read_decode_mmu_bcr(void);
+
+extern void arc_cache_init(void);
+extern char *arc_cache_mumbojumbo(int cpu_id, char *buf, int len);
+extern void read_decode_cache_bcr(void);
+
#endif /* __ASMARC_SETUP_H */
diff --git a/arch/arc/include/asm/smp.h b/arch/arc/include/asm/smp.h
index c5de4008d19f..d856491606ac 100644
--- a/arch/arc/include/asm/smp.h
+++ b/arch/arc/include/asm/smp.h
@@ -105,7 +105,6 @@ static inline const char *arc_platform_smp_cpuinfo(void)
#include <asm/spinlock.h>
extern arch_spinlock_t smp_atomic_ops_lock;
-extern arch_spinlock_t smp_bitops_lock;
#define atomic_ops_lock(flags) do { \
local_irq_save(flags); \
@@ -117,24 +116,11 @@ extern arch_spinlock_t smp_bitops_lock;
local_irq_restore(flags); \
} while (0)
-#define bitops_lock(flags) do { \
- local_irq_save(flags); \
- arch_spin_lock(&smp_bitops_lock); \
-} while (0)
-
-#define bitops_unlock(flags) do { \
- arch_spin_unlock(&smp_bitops_lock); \
- local_irq_restore(flags); \
-} while (0)
-
#else /* !CONFIG_SMP */
#define atomic_ops_lock(flags) local_irq_save(flags)
#define atomic_ops_unlock(flags) local_irq_restore(flags)
-#define bitops_lock(flags) local_irq_save(flags)
-#define bitops_unlock(flags) local_irq_restore(flags)
-
#endif /* !CONFIG_SMP */
#endif /* !CONFIG_ARC_HAS_LLSC */
diff --git a/arch/arc/include/asm/tlb-mmu1.h b/arch/arc/include/asm/tlb-mmu1.h
deleted file mode 100644
index a3083b36f5f4..000000000000
--- a/arch/arc/include/asm/tlb-mmu1.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com)
- */
-
-#ifndef __ASM_TLB_MMU_V1_H__
-#define __ASM_TLB_MMU_V1_H__
-
-#include <asm/mmu.h>
-
-#if defined(__ASSEMBLY__) && (CONFIG_ARC_MMU_VER == 1)
-
-.macro TLB_WRITE_HEURISTICS
-
-#define JH_HACK1
-#undef JH_HACK2
-#undef JH_HACK3
-
-#ifdef JH_HACK3
-; Calculate set index for 2-way MMU
-; -avoiding use of GetIndex from MMU
-; and its unpleasant LFSR pseudo-random sequence
-;
-; r1 = TLBPD0 from TLB_RELOAD above
-;
-; -- jh_ex_way_set not cleared on startup
-; didn't want to change setup.c
-; hence extra instruction to clean
-;
-; -- should be in cache since in same line
-; as r0/r1 saves above
-;
-ld r0,[jh_ex_way_sel] ; victim pointer
-and r0,r0,1 ; clean
-xor.f r0,r0,1 ; flip
-st r0,[jh_ex_way_sel] ; store back
-asr r0,r1,12 ; get set # <<1, note bit 12=R=0
-or.nz r0,r0,1 ; set way bit
-and r0,r0,0xff ; clean
-sr r0,[ARC_REG_TLBINDEX]
-#endif
-
-#ifdef JH_HACK2
-; JH hack #2
-; Faster than hack #1 in non-thrash case, but hard-coded for 2-way MMU
-; Slower in thrash case (where it matters) because more code is executed
-; Inefficient due to two-register paradigm of this miss handler
-;
-/* r1 = data TLBPD0 at this point */
-lr r0,[eret] /* instruction address */
-xor r0,r0,r1 /* compare set # */
-and.f r0,r0,0x000fe000 /* 2-way MMU mask */
-bne 88f /* not in same set - no need to probe */
-
-lr r0,[eret] /* instruction address */
-and r0,r0,PAGE_MASK /* VPN of instruction address */
-; lr r1,[ARC_REG_TLBPD0] /* Data VPN+ASID - already in r1 from TLB_RELOAD*/
-and r1,r1,0xff /* Data ASID */
-or r0,r0,r1 /* Instruction address + Data ASID */
-
-lr r1,[ARC_REG_TLBPD0] /* save TLBPD0 containing data TLB*/
-sr r0,[ARC_REG_TLBPD0] /* write instruction address to TLBPD0 */
-sr TLBProbe, [ARC_REG_TLBCOMMAND] /* Look for instruction */
-lr r0,[ARC_REG_TLBINDEX] /* r0 = index where instruction is, if at all */
-sr r1,[ARC_REG_TLBPD0] /* restore TLBPD0 */
-
-xor r0,r0,1 /* flip bottom bit of data index */
-b.d 89f
-sr r0,[ARC_REG_TLBINDEX] /* and put it back */
-88:
-sr TLBGetIndex, [ARC_REG_TLBCOMMAND]
-89:
-#endif
-
-#ifdef JH_HACK1
-;
-; Always checks whether instruction will be kicked out by dtlb miss
-;
-mov_s r3, r1 ; save PD0 prepared by TLB_RELOAD in r3
-lr r0,[eret] /* instruction address */
-and r0,r0,PAGE_MASK /* VPN of instruction address */
-bmsk r1,r3,7 /* Data ASID, bits 7-0 */
-or_s r0,r0,r1 /* Instruction address + Data ASID */
-
-sr r0,[ARC_REG_TLBPD0] /* write instruction address to TLBPD0 */
-sr TLBProbe, [ARC_REG_TLBCOMMAND] /* Look for instruction */
-lr r0,[ARC_REG_TLBINDEX] /* r0 = index where instruction is, if at all */
-sr r3,[ARC_REG_TLBPD0] /* restore TLBPD0 */
-
-sr TLBGetIndex, [ARC_REG_TLBCOMMAND]
-lr r1,[ARC_REG_TLBINDEX] /* r1 = index where MMU wants to put data */
-cmp r0,r1 /* if no match on indices, go around */
-xor.eq r1,r1,1 /* flip bottom bit of data index */
-sr r1,[ARC_REG_TLBINDEX] /* and put it back */
-#endif
-
-.endm
-
-#endif
-
-#endif
diff --git a/arch/arc/kernel/entry-arcv2.S b/arch/arc/kernel/entry-arcv2.S
index 12d5f12d10d2..a7e6a2174187 100644
--- a/arch/arc/kernel/entry-arcv2.S
+++ b/arch/arc/kernel/entry-arcv2.S
@@ -10,6 +10,7 @@
#include <asm/errno.h>
#include <asm/arcregs.h>
#include <asm/irqflags.h>
+#include <asm/mmu.h>
; A maximum number of supported interrupts in the core interrupt controller.
; This number is not equal to the maximum interrupt number (256) because
diff --git a/arch/arc/kernel/entry.S b/arch/arc/kernel/entry.S
index 2cb8dfe866b6..dd77a0c8f740 100644
--- a/arch/arc/kernel/entry.S
+++ b/arch/arc/kernel/entry.S
@@ -101,11 +101,8 @@ ENTRY(EV_MachineCheck)
lr r0, [efa]
mov r1, sp
- ; hardware auto-disables MMU, re-enable it to allow kernel vaddr
- ; access for say stack unwinding of modules for crash dumps
- lr r3, [ARC_REG_PID]
- or r3, r3, MMU_ENABLE
- sr r3, [ARC_REG_PID]
+	; MC exceptions disable MMU
+ ARC_MMU_REENABLE r3
lsr r3, r2, 8
bmsk r3, r3, 7
diff --git a/arch/arc/kernel/intc-compact.c b/arch/arc/kernel/intc-compact.c
index a86641b91e65..6885e422870e 100644
--- a/arch/arc/kernel/intc-compact.c
+++ b/arch/arc/kernel/intc-compact.c
@@ -142,7 +142,7 @@ IRQCHIP_DECLARE(arc_intc, "snps,arc700-intc", init_onchip_IRQ);
* Time hard-ISR, timer_interrupt( ) calls spin_unlock_irq several times.
* Here local_irq_enable( ) shd not re-enable lower priority interrupts
* -If called from soft-ISR, it must re-enable all interrupts
- * soft ISR are low prioity jobs which can be very slow, thus all IRQs
+ * soft ISRs are low priority jobs which can be very slow, thus all IRQs
* must be enabled while they run.
* Now hardware context wise we may still be in L2 ISR (not done rtie)
* still we must re-enable both L1 and L2 IRQs
diff --git a/arch/arc/kernel/smp.c b/arch/arc/kernel/smp.c
index db0e104d6835..78e6d069b1c1 100644
--- a/arch/arc/kernel/smp.c
+++ b/arch/arc/kernel/smp.c
@@ -29,10 +29,8 @@
#ifndef CONFIG_ARC_HAS_LLSC
arch_spinlock_t smp_atomic_ops_lock = __ARCH_SPIN_LOCK_UNLOCKED;
-arch_spinlock_t smp_bitops_lock = __ARCH_SPIN_LOCK_UNLOCKED;
EXPORT_SYMBOL_GPL(smp_atomic_ops_lock);
-EXPORT_SYMBOL_GPL(smp_bitops_lock);
#endif
struct plat_smp_ops __weak plat_smp_ops;
@@ -283,7 +281,7 @@ static void ipi_send_msg_one(int cpu, enum ipi_msg_type msg)
/*
* Call the platform specific IPI kick function, but avoid if possible:
* Only do so if there's no pending msg from other concurrent sender(s).
- * Otherwise, recevier will see this msg as well when it takes the
+ * Otherwise, receiver will see this msg as well when it takes the
* IPI corresponding to that msg. This is true, even if it is already in
* IPI handler, because !@old means it has not yet dequeued the msg(s)
* so @new msg can be a free-loader
diff --git a/arch/arc/kernel/stacktrace.c b/arch/arc/kernel/stacktrace.c
index 1b9576d21e24..c376ff3147e7 100644
--- a/arch/arc/kernel/stacktrace.c
+++ b/arch/arc/kernel/stacktrace.c
@@ -149,7 +149,7 @@ arc_unwind_core(struct task_struct *tsk, struct pt_regs *regs,
#else
/* On ARC, only Dward based unwinder works. fp based backtracing is
* not possible (-fno-omit-frame-pointer) because of the way function
- * prelogue is setup (callee regs saved and then fp set and not other
+ * prologue is setup (callee regs saved and then fp set and not other
* way around
*/
pr_warn_once("CONFIG_ARC_DW2_UNWIND needs to be enabled\n");
diff --git a/arch/arc/kernel/traps.c b/arch/arc/kernel/traps.c
index 57235e5c0cea..6b83e3f2b41c 100644
--- a/arch/arc/kernel/traps.c
+++ b/arch/arc/kernel/traps.c
@@ -20,11 +20,6 @@
#include <asm/unaligned.h>
#include <asm/kprobes.h>
-void __init trap_init(void)
-{
- return;
-}
-
void die(const char *str, struct pt_regs *regs, unsigned long address)
{
show_kernel_fault_diag(str, regs, address);
diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c
index a2fbea3ee07c..8aa1231865d1 100644
--- a/arch/arc/mm/cache.c
+++ b/arch/arc/mm/cache.c
@@ -205,93 +205,24 @@ slc_chk:
#define OP_INV_IC 0x4
/*
- * I-Cache Aliasing in ARC700 VIPT caches (MMU v1-v3)
+ * Cache Flush programming model
*
- * ARC VIPT I-cache uses vaddr to index into cache and paddr to match the tag.
- * The orig Cache Management Module "CDU" only required paddr to invalidate a
- * certain line since it sufficed as index in Non-Aliasing VIPT cache-geometry.
- * Infact for distinct V1,V2,P: all of {V1-P},{V2-P},{P-P} would end up fetching
- * the exact same line.
+ * ARC700 MMUv3 I$ and D$ are both VIPT and can potentially alias.
+ * Programming model requires both paddr and vaddr irrespective of aliasing
+ * considerations:
+ * - vaddr in {I,D}C_IV?L
+ * - paddr in {I,D}C_PTAG
*
- * However for larger Caches (way-size > page-size) - i.e. in Aliasing config,
- * paddr alone could not be used to correctly index the cache.
+ * In HS38x (MMUv4), D$ is PIPT, I$ is VIPT and can still alias.
+ * Programming model is different for aliasing vs. non-aliasing I$
+ * - D$ / Non-aliasing I$: only paddr in {I,D}C_IV?L
+ * - Aliasing I$: same as ARC700 above (so MMUv3 routine used for MMUv4 I$)
*
- * ------------------
- * MMU v1/v2 (Fixed Page Size 8k)
- * ------------------
- * The solution was to provide CDU with these additonal vaddr bits. These
- * would be bits [x:13], x would depend on cache-geometry, 13 comes from
- * standard page size of 8k.
- * H/w folks chose [17:13] to be a future safe range, and moreso these 5 bits
- * of vaddr could easily be "stuffed" in the paddr as bits [4:0] since the
- * orig 5 bits of paddr were anyways ignored by CDU line ops, as they
- * represent the offset within cache-line. The adv of using this "clumsy"
- * interface for additional info was no new reg was needed in CDU programming
- * model.
- *
- * 17:13 represented the max num of bits passable, actual bits needed were
- * fewer, based on the num-of-aliases possible.
- * -for 2 alias possibility, only bit 13 needed (32K cache)
- * -for 4 alias possibility, bits 14:13 needed (64K cache)
- *
- * ------------------
- * MMU v3
- * ------------------
- * This ver of MMU supports variable page sizes (1k-16k): although Linux will
- * only support 8k (default), 16k and 4k.
- * However from hardware perspective, smaller page sizes aggravate aliasing
- * meaning more vaddr bits needed to disambiguate the cache-line-op ;
- * the existing scheme of piggybacking won't work for certain configurations.
- * Two new registers IC_PTAG and DC_PTAG inttoduced.
- * "tag" bits are provided in PTAG, index bits in existing IVIL/IVDL/FLDL regs
+ * - If PAE40 is enabled, independent of aliasing considerations, the higher
+ * bits need to be written into PTAG_HI
*/
static inline
-void __cache_line_loop_v2(phys_addr_t paddr, unsigned long vaddr,
- unsigned long sz, const int op, const int full_page)
-{
- unsigned int aux_cmd;
- int num_lines;
-
- if (op == OP_INV_IC) {
- aux_cmd = ARC_REG_IC_IVIL;
- } else {
- /* d$ cmd: INV (discard or wback-n-discard) OR FLUSH (wback) */
- aux_cmd = op & OP_INV ? ARC_REG_DC_IVDL : ARC_REG_DC_FLDL;
- }
-
- /* Ensure we properly floor/ceil the non-line aligned/sized requests
- * and have @paddr - aligned to cache line and integral @num_lines.
- * This however can be avoided for page sized since:
- * -@paddr will be cache-line aligned already (being page aligned)
- * -@sz will be integral multiple of line size (being page sized).
- */
- if (!full_page) {
- sz += paddr & ~CACHE_LINE_MASK;
- paddr &= CACHE_LINE_MASK;
- vaddr &= CACHE_LINE_MASK;
- }
-
- num_lines = DIV_ROUND_UP(sz, L1_CACHE_BYTES);
-
- /* MMUv2 and before: paddr contains stuffed vaddrs bits */
- paddr |= (vaddr >> PAGE_SHIFT) & 0x1F;
-
- while (num_lines-- > 0) {
- write_aux_reg(aux_cmd, paddr);
- paddr += L1_CACHE_BYTES;
- }
-}
-
-/*
- * For ARC700 MMUv3 I-cache and D-cache flushes
- * - ARC700 programming model requires paddr and vaddr be passed in seperate
- * AUX registers (*_IV*L and *_PTAG respectively) irrespective of whether the
- * caches actually alias or not.
- * - For HS38, only the aliasing I-cache configuration uses the PTAG reg
- * (non aliasing I-cache version doesn't; while D-cache can't possibly alias)
- */
-static inline
void __cache_line_loop_v3(phys_addr_t paddr, unsigned long vaddr,
unsigned long sz, const int op, const int full_page)
{
@@ -350,17 +281,6 @@ void __cache_line_loop_v3(phys_addr_t paddr, unsigned long vaddr,
#ifndef USE_RGN_FLSH
/*
- * In HS38x (MMU v4), I-cache is VIPT (can alias), D-cache is PIPT
- * Here's how cache ops are implemented
- *
- * - D-cache: only paddr needed (in DC_IVDL/DC_FLDL)
- * - I-cache Non Aliasing: Despite VIPT, only paddr needed (in IC_IVIL)
- * - I-cache Aliasing: Both vaddr and paddr needed (in IC_IVIL, IC_PTAG
- * respectively, similar to MMU v3 programming model, hence
- * __cache_line_loop_v3() is used)
- *
- * If PAE40 is enabled, independent of aliasing considerations, the higher bits
- * needs to be written into PTAG_HI
*/
static inline
void __cache_line_loop_v4(phys_addr_t paddr, unsigned long vaddr,
@@ -460,11 +380,9 @@ void __cache_line_loop_v4(phys_addr_t paddr, unsigned long vaddr,
#endif
-#if (CONFIG_ARC_MMU_VER < 3)
-#define __cache_line_loop __cache_line_loop_v2
-#elif (CONFIG_ARC_MMU_VER == 3)
+#ifdef CONFIG_ARC_MMU_V3
#define __cache_line_loop __cache_line_loop_v3
-#elif (CONFIG_ARC_MMU_VER > 3)
+#else
#define __cache_line_loop __cache_line_loop_v4
#endif
@@ -1123,7 +1041,7 @@ void clear_user_page(void *to, unsigned long u_vaddr, struct page *page)
clear_page(to);
clear_bit(PG_dc_clean, &page->flags);
}
-
+EXPORT_SYMBOL(clear_user_page);
/**********************************************************************
* Explicit Cache flush request from user space via syscall
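A simplified sketch of the vaddr-in-IV?L / paddr-in-PTAG programming model described in the rewritten comment above, for an I-cache invalidate spanning sz bytes. It is illustrative only: unlike the kernel's __cache_line_loop_v3() it skips the line-alignment fix-up for partial-page requests and assumes !PAE40.

    static void example_ic_inv_lines(phys_addr_t paddr, unsigned long vaddr,
                                     unsigned long sz)
    {
            int num_lines = DIV_ROUND_UP(sz, L1_CACHE_BYTES);

            while (num_lines-- > 0) {
                    write_aux_reg(ARC_REG_IC_PTAG, paddr);   /* tag:   physical */
                    write_aux_reg(ARC_REG_IC_IVIL, vaddr);   /* index: virtual  */
                    paddr += L1_CACHE_BYTES;
                    vaddr += L1_CACHE_BYTES;
            }
    }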
diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c
index f5657cb68e4f..5787c261c9a4 100644
--- a/arch/arc/mm/fault.c
+++ b/arch/arc/mm/fault.c
@@ -33,28 +33,34 @@ noinline static int handle_kernel_vaddr_fault(unsigned long address)
pud_t *pud, *pud_k;
pmd_t *pmd, *pmd_k;
- pgd = pgd_offset_fast(current->active_mm, address);
+ pgd = pgd_offset(current->active_mm, address);
pgd_k = pgd_offset_k(address);
- if (!pgd_present(*pgd_k))
+	if (pgd_none(*pgd_k))
goto bad_area;
+ if (!pgd_present(*pgd))
+ set_pgd(pgd, *pgd_k);
p4d = p4d_offset(pgd, address);
p4d_k = p4d_offset(pgd_k, address);
- if (!p4d_present(*p4d_k))
+ if (p4d_none(*p4d_k))
goto bad_area;
+ if (!p4d_present(*p4d))
+ set_p4d(p4d, *p4d_k);
pud = pud_offset(p4d, address);
pud_k = pud_offset(p4d_k, address);
- if (!pud_present(*pud_k))
+ if (pud_none(*pud_k))
goto bad_area;
+ if (!pud_present(*pud))
+ set_pud(pud, *pud_k);
pmd = pmd_offset(pud, address);
pmd_k = pmd_offset(pud_k, address);
- if (!pmd_present(*pmd_k))
+ if (pmd_none(*pmd_k))
goto bad_area;
-
- set_pmd(pmd, *pmd_k);
+ if (!pmd_present(*pmd))
+ set_pmd(pmd, *pmd_k);
/* XXX: create the TLB entry here */
return 0;
diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c
index c083bf660cec..699ecf119641 100644
--- a/arch/arc/mm/init.c
+++ b/arch/arc/mm/init.c
@@ -189,6 +189,11 @@ void __init mem_init(void)
{
memblock_free_all();
highmem_init();
+
+ BUILD_BUG_ON((PTRS_PER_PGD * sizeof(pgd_t)) > PAGE_SIZE);
+ BUILD_BUG_ON((PTRS_PER_PUD * sizeof(pud_t)) > PAGE_SIZE);
+ BUILD_BUG_ON((PTRS_PER_PMD * sizeof(pmd_t)) > PAGE_SIZE);
+ BUILD_BUG_ON((PTRS_PER_PTE * sizeof(pte_t)) > PAGE_SIZE);
}
#ifdef CONFIG_HIGHMEM
diff --git a/arch/arc/mm/ioremap.c b/arch/arc/mm/ioremap.c
index 95c649fbc95a..0ee75aca6e10 100644
--- a/arch/arc/mm/ioremap.c
+++ b/arch/arc/mm/ioremap.c
@@ -39,7 +39,8 @@ void __iomem *ioremap(phys_addr_t paddr, unsigned long size)
if (arc_uncached_addr_space(paddr))
return (void __iomem *)(u32)paddr;
- return ioremap_prot(paddr, size, PAGE_KERNEL_NO_CACHE);
+ return ioremap_prot(paddr, size,
+ pgprot_val(pgprot_noncached(PAGE_KERNEL)));
}
EXPORT_SYMBOL(ioremap);
diff --git a/arch/arc/mm/tlb.c b/arch/arc/mm/tlb.c
index 9c7c68247289..5f71445f26bd 100644
--- a/arch/arc/mm/tlb.c
+++ b/arch/arc/mm/tlb.c
@@ -1,51 +1,9 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * TLB Management (flush/create/diagnostics) for ARC700
+ * TLB Management (flush/create/diagnostics) for MMUv3 and MMUv4
*
* Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com)
*
- * vineetg: Aug 2011
- * -Reintroduce duplicate PD fixup - some customer chips still have the issue
- *
- * vineetg: May 2011
- * -No need to flush_cache_page( ) for each call to update_mmu_cache()
- * some of the LMBench tests improved amazingly
- * = page-fault thrice as fast (75 usec to 28 usec)
- * = mmap twice as fast (9.6 msec to 4.6 msec),
- * = fork (5.3 msec to 3.7 msec)
- *
- * vineetg: April 2011 :
- * -MMU v3: PD{0,1} bits layout changed: They don't overlap anymore,
- * helps avoid a shift when preparing PD0 from PTE
- *
- * vineetg: April 2011 : Preparing for MMU V3
- * -MMU v2/v3 BCRs decoded differently
- * -Remove TLB_SIZE hardcoding as it's variable now: 256 or 512
- * -tlb_entry_erase( ) can be void
- * -local_flush_tlb_range( ):
- * = need not "ceil" @end
- * = walks MMU only if range spans < 32 entries, as opposed to 256
- *
- * Vineetg: Sept 10th 2008
- * -Changes related to MMU v2 (Rel 4.8)
- *
- * Vineetg: Aug 29th 2008
- * -In TLB Flush operations (Metal Fix MMU) there is a explicit command to
- * flush Micro-TLBS. If TLB Index Reg is invalid prior to TLBIVUTLB cmd,
- * it fails. Thus need to load it with ANY valid value before invoking
- * TLBIVUTLB cmd
- *
- * Vineetg: Aug 21th 2008:
- * -Reduced the duration of IRQ lockouts in TLB Flush routines
- * -Multiple copies of TLB erase code separated into a "single" function
- * -In TLB Flush routines, interrupt disabling moved UP to retrieve ASID
- * in interrupt-safe region.
- *
- * Vineetg: April 23rd Bug #93131
- * Problem: tlb_flush_kernel_range() doesn't do anything if the range to
- * flush is more than the size of TLB itself.
- *
- * Rahul Trivedi : Codito Technologies 2004
*/
#include <linux/module.h>
@@ -57,47 +15,6 @@
#include <asm/mmu_context.h>
#include <asm/mmu.h>
-/* Need for ARC MMU v2
- *
- * ARC700 MMU-v1 had a Joint-TLB for Code and Data and is 2 way set-assoc.
- * For a memcpy operation with 3 players (src/dst/code) such that all 3 pages
- * map into same set, there would be contention for the 2 ways causing severe
- * Thrashing.
- *
- * Although J-TLB is 2 way set assoc, ARC700 caches J-TLB into uTLBS which has
- * much higher associativity. u-D-TLB is 8 ways, u-I-TLB is 4 ways.
- * Given this, the thrashing problem should never happen because once the 3
- * J-TLB entries are created (even though 3rd will knock out one of the prev
- * two), the u-D-TLB and u-I-TLB will have what is required to accomplish memcpy
- *
- * Yet we still see the Thrashing because a J-TLB Write cause flush of u-TLBs.
- * This is a simple design for keeping them in sync. So what do we do?
- * The solution which James came up was pretty neat. It utilised the assoc
- * of uTLBs by not invalidating always but only when absolutely necessary.
- *
- * - Existing TLB commands work as before
- * - New command (TLBWriteNI) for TLB write without clearing uTLBs
- * - New command (TLBIVUTLB) to invalidate uTLBs.
- *
- * The uTLBs need only be invalidated when pages are being removed from the
- * OS page table. If a 'victim' TLB entry is being overwritten in the main TLB
- * as a result of a miss, the removed entry is still allowed to exist in the
- * uTLBs as it is still valid and present in the OS page table. This allows the
- * full associativity of the uTLBs to hide the limited associativity of the main
- * TLB.
- *
- * During a miss handler, the new "TLBWriteNI" command is used to load
- * entries without clearing the uTLBs.
- *
- * When the OS page table is updated, TLB entries that may be associated with a
- * removed page are removed (flushed) from the TLB using TLBWrite. In this
- * circumstance, the uTLBs must also be cleared. This is done by using the
- * existing TLBWrite command. An explicit IVUTLB is also required for those
- * corner cases when TLBWrite was not executed at all because the corresp
- * J-TLB entry got evicted/replaced.
- */
-
-
/* A copy of the ASID from the PID reg is kept in asid_cache */
DEFINE_PER_CPU(unsigned int, asid_cache) = MM_CTXT_FIRST_CYCLE;
@@ -120,32 +37,10 @@ static inline void __tlb_entry_erase(void)
static void utlb_invalidate(void)
{
-#if (CONFIG_ARC_MMU_VER >= 2)
-
-#if (CONFIG_ARC_MMU_VER == 2)
- /* MMU v2 introduced the uTLB Flush command.
- * There was however an obscure hardware bug, where uTLB flush would
- * fail when a prior probe for J-TLB (both totally unrelated) would
- * return lkup err - because the entry didn't exist in MMU.
- * The Workaround was to set Index reg with some valid value, prior to
- * flush. This was fixed in MMU v3
- */
- unsigned int idx;
-
- /* make sure INDEX Reg is valid */
- idx = read_aux_reg(ARC_REG_TLBINDEX);
-
- /* If not write some dummy val */
- if (unlikely(idx & TLB_LKUP_ERR))
- write_aux_reg(ARC_REG_TLBINDEX, 0xa);
-#endif
-
write_aux_reg(ARC_REG_TLBCOMMAND, TLBIVUTLB);
-#endif
-
}
-#if (CONFIG_ARC_MMU_VER < 4)
+#ifdef CONFIG_ARC_MMU_V3
static inline unsigned int tlb_entry_lkup(unsigned long vaddr_n_asid)
{
@@ -176,7 +71,7 @@ static void tlb_entry_erase(unsigned int vaddr_n_asid)
}
}
-static void tlb_entry_insert(unsigned int pd0, pte_t pd1)
+static void tlb_entry_insert(unsigned int pd0, phys_addr_t pd1)
{
unsigned int idx;
@@ -206,7 +101,7 @@ static void tlb_entry_insert(unsigned int pd0, pte_t pd1)
write_aux_reg(ARC_REG_TLBCOMMAND, TLBWrite);
}
-#else /* CONFIG_ARC_MMU_VER >= 4) */
+#else /* MMUv4 */
static void tlb_entry_erase(unsigned int vaddr_n_asid)
{
@@ -214,13 +109,16 @@ static void tlb_entry_erase(unsigned int vaddr_n_asid)
write_aux_reg(ARC_REG_TLBCOMMAND, TLBDeleteEntry);
}
-static void tlb_entry_insert(unsigned int pd0, pte_t pd1)
+static void tlb_entry_insert(unsigned int pd0, phys_addr_t pd1)
{
write_aux_reg(ARC_REG_TLBPD0, pd0);
- write_aux_reg(ARC_REG_TLBPD1, pd1);
- if (is_pae40_enabled())
+ if (!is_pae40_enabled()) {
+ write_aux_reg(ARC_REG_TLBPD1, pd1);
+ } else {
+ write_aux_reg(ARC_REG_TLBPD1, pd1 & 0xFFFFFFFF);
write_aux_reg(ARC_REG_TLBPD1HI, (u64)pd1 >> 32);
+ }
write_aux_reg(ARC_REG_TLBCOMMAND, TLBInsertEntry);
}
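With PAE40 the physical frame number no longer fits in 32 bits, so the MMUv4 tlb_entry_insert() above now programs the low word of pd1 into TLBPD1 and the remaining bits into TLBPD1HI. Below is a minimal standalone sketch of just that split; write_reg() is a stand-in for write_aux_reg(), and only the mask/shift arithmetic mirrors the code above.

#include <stdio.h>
#include <stdint.h>

/* Stand-in for write_aux_reg(): records what would be programmed. */
static void write_reg(const char *name, uint32_t val)
{
	printf("%-10s <= 0x%08x\n", name, (unsigned int)val);
}

/* Mirrors the PAE40 branch of tlb_entry_insert(). */
static void program_pd1(uint64_t pd1, int pae40_enabled)
{
	if (!pae40_enabled) {
		write_reg("TLBPD1", (uint32_t)pd1);
	} else {
		write_reg("TLBPD1", (uint32_t)(pd1 & 0xFFFFFFFF));
		write_reg("TLBPD1HI", (uint32_t)(pd1 >> 32));
	}
}

int main(void)
{
	/* A 40-bit physical address plus low protection bits, purely illustrative. */
	program_pd1(0x00000091ABCDE321ULL, 1);
	return 0;
}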
@@ -496,7 +394,7 @@ void create_tlb(struct vm_area_struct *vma, unsigned long vaddr, pte_t *ptep)
unsigned long flags;
unsigned int asid_or_sasid, rwx;
unsigned long pd0;
- pte_t pd1;
+ phys_addr_t pd1;
/*
* create_tlb() assumes that current->mm == vma->mm, since
@@ -505,7 +403,6 @@ void create_tlb(struct vm_area_struct *vma, unsigned long vaddr, pte_t *ptep)
*
* Removing the assumption involves
* -Using vma->mm->context{ASID,SASID}, as opposed to MMU reg.
- * -Fix the TLB paranoid debug code to not trigger false negatives.
* -More importantly it makes this handler inconsistent with fast-path
* TLB Refill handler which always deals with "current"
*
@@ -528,8 +425,6 @@ void create_tlb(struct vm_area_struct *vma, unsigned long vaddr, pte_t *ptep)
local_irq_save(flags);
- tlb_paranoid_check(asid_mm(vma->vm_mm, smp_processor_id()), vaddr);
-
vaddr &= PAGE_MASK;
/* update this PTE credentials */
@@ -639,43 +534,6 @@ void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
update_mmu_cache(vma, addr, &pte);
}
-void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
- pgtable_t pgtable)
-{
- struct list_head *lh = (struct list_head *) pgtable;
-
- assert_spin_locked(&mm->page_table_lock);
-
- /* FIFO */
- if (!pmd_huge_pte(mm, pmdp))
- INIT_LIST_HEAD(lh);
- else
- list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
- pmd_huge_pte(mm, pmdp) = pgtable;
-}
-
-pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
-{
- struct list_head *lh;
- pgtable_t pgtable;
-
- assert_spin_locked(&mm->page_table_lock);
-
- pgtable = pmd_huge_pte(mm, pmdp);
- lh = (struct list_head *) pgtable;
- if (list_empty(lh))
- pmd_huge_pte(mm, pmdp) = NULL;
- else {
- pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
- list_del(lh);
- }
-
- pte_val(pgtable[0]) = 0;
- pte_val(pgtable[1]) = 0;
-
- return pgtable;
-}
-
void local_flush_pmd_tlb_range(struct vm_area_struct *vma, unsigned long start,
unsigned long end)
{
@@ -706,14 +564,6 @@ void read_decode_mmu_bcr(void)
{
struct cpuinfo_arc_mmu *mmu = &cpuinfo_arc700[smp_processor_id()].mmu;
unsigned int tmp;
- struct bcr_mmu_1_2 {
-#ifdef CONFIG_CPU_BIG_ENDIAN
- unsigned int ver:8, ways:4, sets:4, u_itlb:8, u_dtlb:8;
-#else
- unsigned int u_dtlb:8, u_itlb:8, sets:4, ways:4, ver:8;
-#endif
- } *mmu2;
-
struct bcr_mmu_3 {
#ifdef CONFIG_CPU_BIG_ENDIAN
unsigned int ver:8, ways:4, sets:4, res:3, sasid:1, pg_sz:4,
@@ -738,23 +588,14 @@ void read_decode_mmu_bcr(void)
tmp = read_aux_reg(ARC_REG_MMU_BCR);
mmu->ver = (tmp >> 24);
- if (is_isa_arcompact()) {
- if (mmu->ver <= 2) {
- mmu2 = (struct bcr_mmu_1_2 *)&tmp;
- mmu->pg_sz_k = TO_KB(0x2000);
- mmu->sets = 1 << mmu2->sets;
- mmu->ways = 1 << mmu2->ways;
- mmu->u_dtlb = mmu2->u_dtlb;
- mmu->u_itlb = mmu2->u_itlb;
- } else {
- mmu3 = (struct bcr_mmu_3 *)&tmp;
- mmu->pg_sz_k = 1 << (mmu3->pg_sz - 1);
- mmu->sets = 1 << mmu3->sets;
- mmu->ways = 1 << mmu3->ways;
- mmu->u_dtlb = mmu3->u_dtlb;
- mmu->u_itlb = mmu3->u_itlb;
- mmu->sasid = mmu3->sasid;
- }
+ if (is_isa_arcompact() && mmu->ver == 3) {
+ mmu3 = (struct bcr_mmu_3 *)&tmp;
+ mmu->pg_sz_k = 1 << (mmu3->pg_sz - 1);
+ mmu->sets = 1 << mmu3->sets;
+ mmu->ways = 1 << mmu3->ways;
+ mmu->u_dtlb = mmu3->u_dtlb;
+ mmu->u_itlb = mmu3->u_itlb;
+ mmu->sasid = mmu3->sasid;
} else {
mmu4 = (struct bcr_mmu_4 *)&tmp;
mmu->pg_sz_k = 1 << (mmu4->sz0 - 1);
@@ -780,8 +621,8 @@ char *arc_mmu_mumbojumbo(int cpu_id, char *buf, int len)
IS_USED_CFG(CONFIG_TRANSPARENT_HUGEPAGE));
n += scnprintf(buf + n, len - n,
- "MMU [v%x]\t: %dk PAGE, %sJTLB %d (%dx%d), uDTLB %d, uITLB %d%s%s\n",
- p_mmu->ver, p_mmu->pg_sz_k, super_pg,
+ "MMU [v%x]\t: %dk PAGE, %s, swalk %d lvl, JTLB %d (%dx%d), uDTLB %d, uITLB %d%s%s\n",
+ p_mmu->ver, p_mmu->pg_sz_k, super_pg, CONFIG_PGTABLE_LEVELS,
p_mmu->sets * p_mmu->ways, p_mmu->sets, p_mmu->ways,
p_mmu->u_dtlb, p_mmu->u_itlb,
IS_AVAIL2(p_mmu->pae, ", PAE40 ", CONFIG_ARC_HAS_PAE40));
@@ -815,22 +656,17 @@ void arc_mmu_init(void)
/*
* Ensure that MMU features assumed by kernel exist in hardware.
- * For older ARC700 cpus, it has to be an exact match, since the MMU
- * revisions were not backwards compatible (MMUv3 TLB layout changed
- * so even if a kernel for v2 didn't use any new cmds of v3, it would
- * still not work).
- * For HS cpus, MMUv4 was baseline and v5 is backwards compatible
- * (will run older software).
+ * - For older ARC700 cpus, only v3 supported
+ * - For HS cpus, v4 was baseline and v5 is backwards compatible
+ * (will run older software).
*/
- if (is_isa_arcompact() && mmu->ver == CONFIG_ARC_MMU_VER)
+ if (is_isa_arcompact() && mmu->ver == 3)
compat = 1;
- else if (is_isa_arcv2() && mmu->ver >= CONFIG_ARC_MMU_VER)
+ else if (is_isa_arcv2() && mmu->ver >= 4)
compat = 1;
- if (!compat) {
- panic("MMU ver %d doesn't match kernel built for %d...\n",
- mmu->ver, CONFIG_ARC_MMU_VER);
- }
+ if (!compat)
+ panic("MMU ver %d doesn't match kernel built for\n", mmu->ver);
if (mmu->pg_sz_k != TO_KB(PAGE_SIZE))
panic("MMU pg size != PAGE_SIZE (%luk)\n", TO_KB(PAGE_SIZE));
@@ -843,14 +679,11 @@ void arc_mmu_init(void)
if (IS_ENABLED(CONFIG_ARC_HAS_PAE40) && !mmu->pae)
panic("Hardware doesn't support PAE40\n");
- /* Enable the MMU */
- write_aux_reg(ARC_REG_PID, MMU_ENABLE);
+ /* Enable the MMU with ASID 0 */
+ mmu_setup_asid(NULL, 0);
- /* In smp we use this reg for interrupt 1 scratch */
-#ifdef ARC_USE_SCRATCH_REG
- /* swapper_pg_dir is the pgd for the kernel, used by vmalloc */
- write_aux_reg(ARC_REG_SCRATCH_DATA0, swapper_pg_dir);
-#endif
+ /* cache the pgd pointer in MMU SCRATCH reg (ARCv2 only) */
+ mmu_setup_pgd(NULL, swapper_pg_dir);
if (pae40_exist_but_not_enab())
write_aux_reg(ARC_REG_TLBPD1HI, 0);
@@ -945,40 +778,3 @@ void do_tlb_overlap_fault(unsigned long cause, unsigned long address,
local_irq_restore(flags);
}
-
-/***********************************************************************
- * Diagnostic Routines
- * -Called from Low Level TLB Handlers if things don't look good
- **********************************************************************/
-
-#ifdef CONFIG_ARC_DBG_TLB_PARANOIA
-
-/*
- * Low Level ASM TLB handler calls this if it finds that HW and SW ASIDS
- * don't match
- */
-void print_asid_mismatch(int mm_asid, int mmu_asid, int is_fast_path)
-{
- pr_emerg("ASID Mismatch in %s Path Handler: sw-pid=0x%x hw-pid=0x%x\n",
- is_fast_path ? "Fast" : "Slow", mm_asid, mmu_asid);
-
- __asm__ __volatile__("flag 1");
-}
-
-void tlb_paranoid_check(unsigned int mm_asid, unsigned long addr)
-{
- unsigned int mmu_asid;
-
- mmu_asid = read_aux_reg(ARC_REG_PID) & 0xff;
-
- /*
- * At the time of a TLB miss/installation
- * - HW version needs to match SW version
- * - SW needs to have a valid ASID
- */
- if (addr < 0x70000000 &&
- ((mm_asid == MM_CTXT_NO_ASID) ||
- (mmu_asid != (mm_asid & MM_CTXT_ASID_MASK))))
- print_asid_mismatch(mm_asid, mmu_asid, 0);
-}
-#endif
diff --git a/arch/arc/mm/tlbex.S b/arch/arc/mm/tlbex.S
index 062fae46c3f8..e054780a8fe0 100644
--- a/arch/arc/mm/tlbex.S
+++ b/arch/arc/mm/tlbex.S
@@ -39,7 +39,6 @@
#include <asm/arcregs.h>
#include <asm/cache.h>
#include <asm/processor.h>
-#include <asm/tlb-mmu1.h>
#ifdef CONFIG_ISA_ARCOMPACT
;-----------------------------------------------------------------
@@ -94,11 +93,6 @@ ex_saved_reg1:
st_s r1, [r0, 4]
st_s r2, [r0, 8]
st_s r3, [r0, 12]
-
- ; VERIFY if the ASID in MMU-PID Reg is same as
- ; one in Linux data structures
-
- tlb_paranoid_check_asm
.endm
.macro TLBMISS_RESTORE_REGS
@@ -148,53 +142,16 @@ ex_saved_reg1:
#endif
;============================================================================
-; Troubleshooting Stuff
+;TLB Miss handling Code
;============================================================================
-; Linux keeps ASID (Address Space ID) in task->active_mm->context.asid
-; When Creating TLB Entries, instead of doing 3 dependent loads from memory,
-; we use the MMU PID Reg to get current ASID.
-; In bizarre scenarios SW and HW ASID can get out-of-sync which is trouble.
-; So we try to detect this in TLB Miss handler
-
-.macro tlb_paranoid_check_asm
-
-#ifdef CONFIG_ARC_DBG_TLB_PARANOIA
-
- GET_CURR_TASK_ON_CPU r3
- ld r0, [r3, TASK_ACT_MM]
- ld r0, [r0, MM_CTXT+MM_CTXT_ASID]
- breq r0, 0, 55f ; Error if no ASID allocated
-
- lr r1, [ARC_REG_PID]
- and r1, r1, 0xFF
-
- and r2, r0, 0xFF ; MMU PID bits only for comparison
- breq r1, r2, 5f
-
-55:
- ; Error if H/w and S/w ASID don't match, but NOT if in kernel mode
- lr r2, [erstatus]
- bbit0 r2, STATUS_U_BIT, 5f
-
- ; We sure are in troubled waters, Flag the error, but to do so
- ; need to switch to kernel mode stack to call error routine
- GET_TSK_STACK_BASE r3, sp
-
- ; Call printk to shoutout aloud
- mov r2, 1
- j print_asid_mismatch
-
-5: ; ASIDs match so proceed normally
- nop
-
+#ifndef PMD_SHIFT
+#define PMD_SHIFT PUD_SHIFT
#endif
-.endm
-
-;============================================================================
-;TLB Miss handling Code
-;============================================================================
+#ifndef PUD_SHIFT
+#define PUD_SHIFT PGDIR_SHIFT
+#endif
;-----------------------------------------------------------------------------
; This macro does the page-table lookup for the faulting address.
@@ -203,7 +160,7 @@ ex_saved_reg1:
lr r2, [efa]
-#ifdef ARC_USE_SCRATCH_REG
+#ifdef CONFIG_ISA_ARCV2
lr r1, [ARC_REG_SCRATCH_DATA0] ; current pgd
#else
GET_CURR_TASK_ON_CPU r1
@@ -216,6 +173,24 @@ ex_saved_reg1:
tst r3, r3
bz do_slow_path_pf ; if no Page Table, do page fault
+#if CONFIG_PGTABLE_LEVELS > 3
+ lsr r0, r2, PUD_SHIFT ; Bits for indexing into PUD
+ and r0, r0, (PTRS_PER_PUD - 1)
+ ld.as r1, [r3, r0] ; PMD entry
+ tst r1, r1
+ bz do_slow_path_pf
+ mov r3, r1
+#endif
+
+#if CONFIG_PGTABLE_LEVELS > 2
+ lsr r0, r2, PMD_SHIFT ; Bits for indexing into PMD
+ and r0, r0, (PTRS_PER_PMD - 1)
+ ld.as r1, [r3, r0] ; PMD entry
+ tst r1, r1
+ bz do_slow_path_pf
+ mov r3, r1
+#endif
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
and.f 0, r3, _PAGE_HW_SZ ; Is this Huge PMD (thp)
add2.nz r1, r1, r0
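The lookup macro now walks up to four levels: at each level the faulting address (efa) is shifted right by that level's SHIFT, masked with PTRS_PER_<level> - 1, and used to index the table loaded at the previous level, with the PMD_SHIFT/PUD_SHIFT fallbacks added above collapsing folded levels onto the next one up. A hedged C rendering of just the index arithmetic; the shift and table-size values are placeholders, not ARC's real configuration.

#include <stdio.h>

/* Placeholder geometry for illustration only. */
#define PGDIR_SHIFT	30
#define PUD_SHIFT	PGDIR_SHIFT	/* PUD folded into PGD */
#define PMD_SHIFT	21
#define PAGE_SHIFT	12
#define PTRS_PER_PGD	512
#define PTRS_PER_PUD	1		/* a folded level has a single entry */
#define PTRS_PER_PMD	512
#define PTRS_PER_PTE	512

static unsigned long level_index(unsigned long vaddr, int shift, int nr_ptrs)
{
	return (vaddr >> shift) & (nr_ptrs - 1);
}

int main(void)
{
	unsigned long efa = 0x76543210UL;	/* pretend faulting address */

	printf("pgd=%lu pud=%lu pmd=%lu pte=%lu\n",
	       level_index(efa, PGDIR_SHIFT, PTRS_PER_PGD),
	       level_index(efa, PUD_SHIFT, PTRS_PER_PUD),
	       level_index(efa, PMD_SHIFT, PTRS_PER_PMD),
	       level_index(efa, PAGE_SHIFT, PTRS_PER_PTE));
	return 0;
}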
@@ -279,7 +254,7 @@ ex_saved_reg1:
; Commit the TLB entry into MMU
.macro COMMIT_ENTRY_TO_MMU
-#if (CONFIG_ARC_MMU_VER < 4)
+#ifdef CONFIG_ARC_MMU_V3
/* Get free TLB slot: Set = computed from vaddr, way = random */
sr TLBGetIndex, [ARC_REG_TLBCOMMAND]
@@ -375,13 +350,6 @@ ENTRY(EV_TLBMissD)
CONV_PTE_TO_TLB
-#if (CONFIG_ARC_MMU_VER == 1)
- ; MMU with 2 way set assoc J-TLB, needs some help in pathetic case of
-	; memcpy where 3 parties contend for 2 ways, resulting in a livelock.
- ; But only for old MMU or one with Metal Fix
- TLB_WRITE_HEURISTICS
-#endif
-
COMMIT_ENTRY_TO_MMU
TLBMISS_RESTORE_REGS
EV_TLBMissD_fast_ret: ; additional label for VDK OS-kit instrumentation
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index f1d6531b5ce5..fc196421b2ce 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -124,8 +124,8 @@ config ARM
select PCI_SYSCALL if PCI
select PERF_USE_VMALLOC
select RTC_LIB
- select SET_FS
select SYS_SUPPORTS_APM_EMULATION
+ select TRACE_IRQFLAGS_SUPPORT if !CPU_V7M
# Above selects are sorted alphabetically; please add new ones
# according to that. Thanks.
help
@@ -189,10 +189,6 @@ config LOCKDEP_SUPPORT
bool
default y
-config TRACE_IRQFLAGS_SUPPORT
- bool
- default !CPU_V7M
-
config ARCH_HAS_ILOG2_U32
bool
diff --git a/arch/arm/Makefile b/arch/arm/Makefile
index 173da685a52e..847c31e7c368 100644
--- a/arch/arm/Makefile
+++ b/arch/arm/Makefile
@@ -308,7 +308,8 @@ $(BOOT_TARGETS): vmlinux
@$(kecho) ' Kernel: $(boot)/$@ is ready'
$(INSTALL_TARGETS):
- $(Q)$(MAKE) $(build)=$(boot) MACHINE=$(MACHINE) $@
+ $(CONFIG_SHELL) $(srctree)/$(boot)/install.sh "$(KERNELRELEASE)" \
+ $(boot)/$(patsubst %install,%Image,$@) System.map "$(INSTALL_PATH)"
PHONY += vdso_install
vdso_install:
diff --git a/arch/arm/boot/Makefile b/arch/arm/boot/Makefile
index 0b3cd7a33a26..54a09f9464fb 100644
--- a/arch/arm/boot/Makefile
+++ b/arch/arm/boot/Makefile
@@ -96,23 +96,11 @@ $(obj)/bootp/bootp: $(obj)/zImage initrd FORCE
$(obj)/bootpImage: $(obj)/bootp/bootp FORCE
$(call if_changed,objcopy)
-PHONY += initrd install zinstall uinstall
+PHONY += initrd
initrd:
@test "$(INITRD_PHYS)" != "" || \
(echo This machine does not support INITRD; exit -1)
@test "$(INITRD)" != "" || \
(echo You must specify INITRD; exit -1)
-install:
- $(CONFIG_SHELL) $(srctree)/$(src)/install.sh "$(KERNELRELEASE)" \
- $(obj)/Image System.map "$(INSTALL_PATH)"
-
-zinstall:
- $(CONFIG_SHELL) $(srctree)/$(src)/install.sh "$(KERNELRELEASE)" \
- $(obj)/zImage System.map "$(INSTALL_PATH)"
-
-uinstall:
- $(CONFIG_SHELL) $(srctree)/$(src)/install.sh "$(KERNELRELEASE)" \
- $(obj)/uImage System.map "$(INSTALL_PATH)"
-
subdir- := bootp compressed dts
diff --git a/arch/arm/boot/compressed/Makefile b/arch/arm/boot/compressed/Makefile
index 9d91ae1091b0..91265e7ff672 100644
--- a/arch/arm/boot/compressed/Makefile
+++ b/arch/arm/boot/compressed/Makefile
@@ -85,6 +85,8 @@ compress-$(CONFIG_KERNEL_LZ4) = lz4
libfdt_objs := fdt_rw.o fdt_ro.o fdt_wip.o fdt.o
ifeq ($(CONFIG_ARM_ATAG_DTB_COMPAT),y)
+CFLAGS_REMOVE_atags_to_fdt.o += -Wframe-larger-than=${CONFIG_FRAME_WARN}
+CFLAGS_atags_to_fdt.o += -Wframe-larger-than=1280
OBJS += $(libfdt_objs) atags_to_fdt.o
endif
ifeq ($(CONFIG_USE_OF),y)
diff --git a/arch/arm/boot/dts/omap34xx.dtsi b/arch/arm/boot/dts/omap34xx.dtsi
index feaa43b78535..8b8451399784 100644
--- a/arch/arm/boot/dts/omap34xx.dtsi
+++ b/arch/arm/boot/dts/omap34xx.dtsi
@@ -24,7 +24,6 @@
};
};
- /* see Documentation/devicetree/bindings/opp/opp.txt */
cpu0_opp_table: opp-table {
compatible = "operating-points-v2-ti-cpu";
syscon = <&scm_conf>;
diff --git a/arch/arm/boot/dts/omap36xx.dtsi b/arch/arm/boot/dts/omap36xx.dtsi
index 20844dbc002e..22b33098b1a2 100644
--- a/arch/arm/boot/dts/omap36xx.dtsi
+++ b/arch/arm/boot/dts/omap36xx.dtsi
@@ -29,7 +29,6 @@
};
};
- /* see Documentation/devicetree/bindings/opp/opp.txt */
cpu0_opp_table: opp-table {
compatible = "operating-points-v2-ti-cpu";
syscon = <&scm_conf>;
diff --git a/arch/arm/configs/dove_defconfig b/arch/arm/configs/dove_defconfig
index b935162a8bba..33074fdab2ea 100644
--- a/arch/arm/configs/dove_defconfig
+++ b/arch/arm/configs/dove_defconfig
@@ -56,7 +56,6 @@ CONFIG_ATA=y
CONFIG_SATA_MV=y
CONFIG_NETDEVICES=y
CONFIG_MV643XX_ETH=y
-CONFIG_INPUT_POLLDEV=y
# CONFIG_INPUT_MOUSEDEV is not set
CONFIG_INPUT_EVDEV=y
# CONFIG_KEYBOARD_ATKBD is not set
diff --git a/arch/arm/configs/pxa_defconfig b/arch/arm/configs/pxa_defconfig
index 363f1b1b08e3..58f4834289e6 100644
--- a/arch/arm/configs/pxa_defconfig
+++ b/arch/arm/configs/pxa_defconfig
@@ -284,7 +284,6 @@ CONFIG_RT2800USB=m
CONFIG_MWIFIEX=m
CONFIG_MWIFIEX_SDIO=m
CONFIG_INPUT_FF_MEMLESS=m
-CONFIG_INPUT_POLLDEV=y
CONFIG_INPUT_MATRIXKMAP=y
CONFIG_INPUT_MOUSEDEV=m
CONFIG_INPUT_MOUSEDEV_SCREEN_X=640
diff --git a/arch/arm/include/asm/div64.h b/arch/arm/include/asm/div64.h
index 595e538f5bfb..4b69cf850451 100644
--- a/arch/arm/include/asm/div64.h
+++ b/arch/arm/include/asm/div64.h
@@ -52,17 +52,6 @@ static inline uint32_t __div64_32(uint64_t *n, uint32_t base)
#else
-/*
- * gcc versions earlier than 4.0 are simply too problematic for the
- * __div64_const32() code in asm-generic/div64.h. First there is
- * gcc PR 15089 that tends to trigger on more complex constructs, spurious
- * .global __udivsi3 are inserted even if none of those symbols are
- * referenced in the generated code, and those gcc versions are not able
- * to do constant propagation on long long values anyway.
- */
-
-#define __div64_const32_is_OK (__GNUC__ >= 4)
-
static inline uint64_t __arch_xprod_64(uint64_t m, uint64_t n, bool bias)
{
unsigned long long res;
diff --git a/arch/arm/include/asm/gpio.h b/arch/arm/include/asm/gpio.h
index c50e383358c4..f3bb8a2bf788 100644
--- a/arch/arm/include/asm/gpio.h
+++ b/arch/arm/include/asm/gpio.h
@@ -2,10 +2,6 @@
#ifndef _ARCH_ARM_GPIO_H
#define _ARCH_ARM_GPIO_H
-#if CONFIG_ARCH_NR_GPIO > 0
-#define ARCH_NR_GPIOS CONFIG_ARCH_NR_GPIO
-#endif
-
/* Note: this may rely upon the value of ARCH_NR_GPIOS set in mach/gpio.h */
#include <asm-generic/gpio.h>
diff --git a/arch/arm/include/asm/ptrace.h b/arch/arm/include/asm/ptrace.h
index 91d6b7856be4..93051e2f402c 100644
--- a/arch/arm/include/asm/ptrace.h
+++ b/arch/arm/include/asm/ptrace.h
@@ -19,7 +19,6 @@ struct pt_regs {
struct svc_pt_regs {
struct pt_regs regs;
u32 dacr;
- u32 addr_limit;
};
#define to_svc_pt_regs(r) container_of(r, struct svc_pt_regs, regs)
diff --git a/arch/arm/include/asm/syscall.h b/arch/arm/include/asm/syscall.h
index fd02761ba06c..24c19d63ff0a 100644
--- a/arch/arm/include/asm/syscall.h
+++ b/arch/arm/include/asm/syscall.h
@@ -22,7 +22,21 @@ extern const unsigned long sys_call_table[];
static inline int syscall_get_nr(struct task_struct *task,
struct pt_regs *regs)
{
- return task_thread_info(task)->syscall;
+ if (IS_ENABLED(CONFIG_AEABI) && !IS_ENABLED(CONFIG_OABI_COMPAT))
+ return task_thread_info(task)->abi_syscall;
+
+ return task_thread_info(task)->abi_syscall & __NR_SYSCALL_MASK;
+}
+
+static inline bool __in_oabi_syscall(struct task_struct *task)
+{
+ return IS_ENABLED(CONFIG_OABI_COMPAT) &&
+ (task_thread_info(task)->abi_syscall & __NR_OABI_SYSCALL_BASE);
+}
+
+static inline bool in_oabi_syscall(void)
+{
+ return __in_oabi_syscall(current);
}
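abi_syscall now encodes both the ABI and the number: the OABI entry path stores the raw value, which still contains __NR_OABI_SYSCALL_BASE, syscall_get_nr() strips that tag with __NR_SYSCALL_MASK, and __in_oabi_syscall() tests the base bits. Both constants are added to uapi/asm/unistd.h later in this patch; the short check below only exercises that arithmetic, and the example syscall number is arbitrary.

#include <assert.h>
#include <stdio.h>

#define __NR_OABI_SYSCALL_BASE	0x900000
#define __NR_SYSCALL_MASK	0x0fffff

int main(void)
{
	unsigned int nr = 4;					/* some EABI syscall number */
	unsigned int oabi_nr = __NR_OABI_SYSCALL_BASE + nr;	/* as stored by the OABI path */

	/* Both forms decode to the same syscall number... */
	assert((nr & __NR_SYSCALL_MASK) == (oabi_nr & __NR_SYSCALL_MASK));
	/* ...while the base bits still identify an OABI caller. */
	assert(oabi_nr & __NR_OABI_SYSCALL_BASE);
	assert(!(nr & __NR_OABI_SYSCALL_BASE));

	printf("nr=%u oabi=%d\n", oabi_nr & __NR_SYSCALL_MASK,
	       !!(oabi_nr & __NR_OABI_SYSCALL_BASE));
	return 0;
}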
static inline void syscall_rollback(struct task_struct *task,
diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h
index a02799bd0cdf..9a18da3e10cc 100644
--- a/arch/arm/include/asm/thread_info.h
+++ b/arch/arm/include/asm/thread_info.h
@@ -31,8 +31,6 @@ struct task_struct;
#include <asm/types.h>
-typedef unsigned long mm_segment_t;
-
struct cpu_context_save {
__u32 r4;
__u32 r5;
@@ -54,7 +52,6 @@ struct cpu_context_save {
struct thread_info {
unsigned long flags; /* low level flags */
int preempt_count; /* 0 => preemptable, <0 => bug */
- mm_segment_t addr_limit; /* address limit */
struct task_struct *task; /* main task structure */
__u32 cpu; /* cpu */
__u32 cpu_domain; /* cpu domain */
@@ -62,7 +59,7 @@ struct thread_info {
unsigned long stack_canary;
#endif
struct cpu_context_save cpu_context; /* cpu context */
- __u32 syscall; /* syscall number */
+ __u32 abi_syscall; /* ABI type and syscall nr */
__u8 used_cp[16]; /* thread used copro */
unsigned long tp_value[2]; /* TLS registers */
union fp_state fpstate __attribute__((aligned(8)));
@@ -77,7 +74,6 @@ struct thread_info {
.task = &tsk, \
.flags = 0, \
.preempt_count = INIT_PREEMPT_COUNT, \
- .addr_limit = KERNEL_DS, \
}
/*
diff --git a/arch/arm/include/asm/uaccess-asm.h b/arch/arm/include/asm/uaccess-asm.h
index e6eb7a2aaf1e..6451a433912c 100644
--- a/arch/arm/include/asm/uaccess-asm.h
+++ b/arch/arm/include/asm/uaccess-asm.h
@@ -84,12 +84,8 @@
* if \disable is set.
*/
.macro uaccess_entry, tsk, tmp0, tmp1, tmp2, disable
- ldr \tmp1, [\tsk, #TI_ADDR_LIMIT]
- ldr \tmp2, =TASK_SIZE
- str \tmp2, [\tsk, #TI_ADDR_LIMIT]
DACR( mrc p15, 0, \tmp0, c3, c0, 0)
DACR( str \tmp0, [sp, #SVC_DACR])
- str \tmp1, [sp, #SVC_ADDR_LIMIT]
.if \disable && IS_ENABLED(CONFIG_CPU_SW_DOMAIN_PAN)
/* kernel=client, user=no access */
mov \tmp2, #DACR_UACCESS_DISABLE
@@ -106,9 +102,7 @@
/* Restore the user access state previously saved by uaccess_entry */
.macro uaccess_exit, tsk, tmp0, tmp1
- ldr \tmp1, [sp, #SVC_ADDR_LIMIT]
DACR( ldr \tmp0, [sp, #SVC_DACR])
- str \tmp1, [\tsk, #TI_ADDR_LIMIT]
DACR( mcr p15, 0, \tmp0, c3, c0, 0)
.endm
diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h
index a13d90206472..084d1c07c2d0 100644
--- a/arch/arm/include/asm/uaccess.h
+++ b/arch/arm/include/asm/uaccess.h
@@ -52,32 +52,8 @@ static __always_inline void uaccess_restore(unsigned int flags)
extern int __get_user_bad(void);
extern int __put_user_bad(void);
-/*
- * Note that this is actually 0x1,0000,0000
- */
-#define KERNEL_DS 0x00000000
-
#ifdef CONFIG_MMU
-#define USER_DS TASK_SIZE
-#define get_fs() (current_thread_info()->addr_limit)
-
-static inline void set_fs(mm_segment_t fs)
-{
- current_thread_info()->addr_limit = fs;
-
- /*
- * Prevent a mispredicted conditional call to set_fs from forwarding
- * the wrong address limit to access_ok under speculation.
- */
- dsb(nsh);
- isb();
-
- modify_domain(DOMAIN_KERNEL, fs ? DOMAIN_CLIENT : DOMAIN_MANAGER);
-}
-
-#define uaccess_kernel() (get_fs() == KERNEL_DS)
-
/*
* We use 33-bit arithmetic here. Success returns zero, failure returns
* addr_limit. We take advantage that addr_limit will be zero for KERNEL_DS,
@@ -89,7 +65,7 @@ static inline void set_fs(mm_segment_t fs)
__asm__(".syntax unified\n" \
"adds %1, %2, %3; sbcscc %1, %1, %0; movcc %0, #0" \
: "=&r" (flag), "=&r" (roksum) \
- : "r" (addr), "Ir" (size), "0" (current_thread_info()->addr_limit) \
+ : "r" (addr), "Ir" (size), "0" (TASK_SIZE) \
: "cc"); \
flag; })
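The 33-bit trick is unchanged, only the limit is now the constant TASK_SIZE rather than the per-thread addr_limit: adds produces a carry if addr + size wraps past 32 bits, and the conditional subtract-with-carry of the limit leaves a non-zero flag whenever the range reaches past it. The same check written in portable C with a 64-bit sum, as a hedged equivalent; the TASK_SIZE value and the helper name are made up for the sketch.

#include <stdio.h>
#include <stdint.h>

#define TASK_SIZE	0xbf000000u	/* illustrative value only */

/* Returns 0 when [addr, addr + size) fits below TASK_SIZE, non-zero otherwise. */
static unsigned long range_not_ok(uint32_t addr, uint32_t size)
{
	uint64_t end = (uint64_t)addr + size;	/* the "33-bit" sum, cannot wrap */

	return end > TASK_SIZE ? 1 : 0;
}

int main(void)
{
	printf("%lu %lu %lu\n",
	       range_not_ok(0x1000, 0x100),		/* well inside user space */
	       range_not_ok(TASK_SIZE - 4, 8),		/* crosses the limit */
	       range_not_ok(0xfffffff0u, 0x100));	/* would wrap in 32 bits */
	return 0;
}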
@@ -120,7 +96,7 @@ static inline void __user *__uaccess_mask_range_ptr(const void __user *ptr,
" subshs %1, %1, %2\n"
" movlo %0, #0\n"
: "+r" (safe_ptr), "=&r" (tmp)
- : "r" (size), "r" (current_thread_info()->addr_limit)
+ : "r" (size), "r" (TASK_SIZE)
: "cc");
csdb();
@@ -194,7 +170,7 @@ extern int __get_user_64t_4(void *);
#define __get_user_check(x, p) \
({ \
- unsigned long __limit = current_thread_info()->addr_limit - 1; \
+ unsigned long __limit = TASK_SIZE - 1; \
register typeof(*(p)) __user *__p asm("r0") = (p); \
register __inttype(x) __r2 asm("r2"); \
register unsigned long __l asm("r1") = __limit; \
@@ -245,7 +221,7 @@ extern int __put_user_8(void *, unsigned long long);
#define __put_user_check(__pu_val, __ptr, __err, __s) \
({ \
- unsigned long __limit = current_thread_info()->addr_limit - 1; \
+ unsigned long __limit = TASK_SIZE - 1; \
register typeof(__pu_val) __r2 asm("r2") = __pu_val; \
register const void __user *__p asm("r0") = __ptr; \
register unsigned long __l asm("r1") = __limit; \
@@ -262,19 +238,8 @@ extern int __put_user_8(void *, unsigned long long);
#else /* CONFIG_MMU */
-/*
- * uClinux has only one addr space, so has simplified address limits.
- */
-#define USER_DS KERNEL_DS
-
-#define uaccess_kernel() (true)
#define __addr_ok(addr) ((void)(addr), 1)
#define __range_ok(addr, size) ((void)(addr), 0)
-#define get_fs() (KERNEL_DS)
-
-static inline void set_fs(mm_segment_t fs)
-{
-}
#define get_user(x, p) __get_user(x, p)
#define __put_user_check __put_user_nocheck
@@ -283,9 +248,6 @@ static inline void set_fs(mm_segment_t fs)
#define access_ok(addr, size) (__range_ok(addr, size) == 0)
-#define user_addr_max() \
- (uaccess_kernel() ? ~0UL : get_fs())
-
#ifdef CONFIG_CPU_SPECTRE
/*
* When mitigating Spectre variant 1, it is not worth fixing the non-
@@ -308,11 +270,11 @@ static inline void set_fs(mm_segment_t fs)
#define __get_user(x, ptr) \
({ \
long __gu_err = 0; \
- __get_user_err((x), (ptr), __gu_err); \
+ __get_user_err((x), (ptr), __gu_err, TUSER()); \
__gu_err; \
})
-#define __get_user_err(x, ptr, err) \
+#define __get_user_err(x, ptr, err, __t) \
do { \
unsigned long __gu_addr = (unsigned long)(ptr); \
unsigned long __gu_val; \
@@ -321,18 +283,19 @@ do { \
might_fault(); \
__ua_flags = uaccess_save_and_enable(); \
switch (sizeof(*(ptr))) { \
- case 1: __get_user_asm_byte(__gu_val, __gu_addr, err); break; \
- case 2: __get_user_asm_half(__gu_val, __gu_addr, err); break; \
- case 4: __get_user_asm_word(__gu_val, __gu_addr, err); break; \
+ case 1: __get_user_asm_byte(__gu_val, __gu_addr, err, __t); break; \
+ case 2: __get_user_asm_half(__gu_val, __gu_addr, err, __t); break; \
+ case 4: __get_user_asm_word(__gu_val, __gu_addr, err, __t); break; \
default: (__gu_val) = __get_user_bad(); \
} \
uaccess_restore(__ua_flags); \
(x) = (__typeof__(*(ptr)))__gu_val; \
} while (0)
+#endif
#define __get_user_asm(x, addr, err, instr) \
__asm__ __volatile__( \
- "1: " TUSER(instr) " %1, [%2], #0\n" \
+ "1: " instr " %1, [%2], #0\n" \
"2:\n" \
" .pushsection .text.fixup,\"ax\"\n" \
" .align 2\n" \
@@ -348,40 +311,38 @@ do { \
: "r" (addr), "i" (-EFAULT) \
: "cc")
-#define __get_user_asm_byte(x, addr, err) \
- __get_user_asm(x, addr, err, ldrb)
+#define __get_user_asm_byte(x, addr, err, __t) \
+ __get_user_asm(x, addr, err, "ldrb" __t)
#if __LINUX_ARM_ARCH__ >= 6
-#define __get_user_asm_half(x, addr, err) \
- __get_user_asm(x, addr, err, ldrh)
+#define __get_user_asm_half(x, addr, err, __t) \
+ __get_user_asm(x, addr, err, "ldrh" __t)
#else
#ifndef __ARMEB__
-#define __get_user_asm_half(x, __gu_addr, err) \
+#define __get_user_asm_half(x, __gu_addr, err, __t) \
({ \
unsigned long __b1, __b2; \
- __get_user_asm_byte(__b1, __gu_addr, err); \
- __get_user_asm_byte(__b2, __gu_addr + 1, err); \
+ __get_user_asm_byte(__b1, __gu_addr, err, __t); \
+ __get_user_asm_byte(__b2, __gu_addr + 1, err, __t); \
(x) = __b1 | (__b2 << 8); \
})
#else
-#define __get_user_asm_half(x, __gu_addr, err) \
+#define __get_user_asm_half(x, __gu_addr, err, __t) \
({ \
unsigned long __b1, __b2; \
- __get_user_asm_byte(__b1, __gu_addr, err); \
- __get_user_asm_byte(__b2, __gu_addr + 1, err); \
+ __get_user_asm_byte(__b1, __gu_addr, err, __t); \
+ __get_user_asm_byte(__b2, __gu_addr + 1, err, __t); \
(x) = (__b1 << 8) | __b2; \
})
#endif
#endif /* __LINUX_ARM_ARCH__ >= 6 */
-#define __get_user_asm_word(x, addr, err) \
- __get_user_asm(x, addr, err, ldr)
-#endif
-
+#define __get_user_asm_word(x, addr, err, __t) \
+ __get_user_asm(x, addr, err, "ldr" __t)
#define __put_user_switch(x, ptr, __err, __fn) \
do { \
@@ -425,7 +386,7 @@ do { \
#define __put_user_nocheck(x, __pu_ptr, __err, __size) \
do { \
unsigned long __pu_addr = (unsigned long)__pu_ptr; \
- __put_user_nocheck_##__size(x, __pu_addr, __err); \
+ __put_user_nocheck_##__size(x, __pu_addr, __err, TUSER());\
} while (0)
#define __put_user_nocheck_1 __put_user_asm_byte
@@ -433,9 +394,11 @@ do { \
#define __put_user_nocheck_4 __put_user_asm_word
#define __put_user_nocheck_8 __put_user_asm_dword
+#endif /* !CONFIG_CPU_SPECTRE */
+
#define __put_user_asm(x, __pu_addr, err, instr) \
__asm__ __volatile__( \
- "1: " TUSER(instr) " %1, [%2], #0\n" \
+ "1: " instr " %1, [%2], #0\n" \
"2:\n" \
" .pushsection .text.fixup,\"ax\"\n" \
" .align 2\n" \
@@ -450,36 +413,36 @@ do { \
: "r" (x), "r" (__pu_addr), "i" (-EFAULT) \
: "cc")
-#define __put_user_asm_byte(x, __pu_addr, err) \
- __put_user_asm(x, __pu_addr, err, strb)
+#define __put_user_asm_byte(x, __pu_addr, err, __t) \
+ __put_user_asm(x, __pu_addr, err, "strb" __t)
#if __LINUX_ARM_ARCH__ >= 6
-#define __put_user_asm_half(x, __pu_addr, err) \
- __put_user_asm(x, __pu_addr, err, strh)
+#define __put_user_asm_half(x, __pu_addr, err, __t) \
+ __put_user_asm(x, __pu_addr, err, "strh" __t)
#else
#ifndef __ARMEB__
-#define __put_user_asm_half(x, __pu_addr, err) \
+#define __put_user_asm_half(x, __pu_addr, err, __t) \
({ \
unsigned long __temp = (__force unsigned long)(x); \
- __put_user_asm_byte(__temp, __pu_addr, err); \
- __put_user_asm_byte(__temp >> 8, __pu_addr + 1, err); \
+ __put_user_asm_byte(__temp, __pu_addr, err, __t); \
+ __put_user_asm_byte(__temp >> 8, __pu_addr + 1, err, __t);\
})
#else
-#define __put_user_asm_half(x, __pu_addr, err) \
+#define __put_user_asm_half(x, __pu_addr, err, __t) \
({ \
unsigned long __temp = (__force unsigned long)(x); \
- __put_user_asm_byte(__temp >> 8, __pu_addr, err); \
- __put_user_asm_byte(__temp, __pu_addr + 1, err); \
+ __put_user_asm_byte(__temp >> 8, __pu_addr, err, __t); \
+ __put_user_asm_byte(__temp, __pu_addr + 1, err, __t); \
})
#endif
#endif /* __LINUX_ARM_ARCH__ >= 6 */
-#define __put_user_asm_word(x, __pu_addr, err) \
- __put_user_asm(x, __pu_addr, err, str)
+#define __put_user_asm_word(x, __pu_addr, err, __t) \
+ __put_user_asm(x, __pu_addr, err, "str" __t)
#ifndef __ARMEB__
#define __reg_oper0 "%R2"
@@ -489,12 +452,12 @@ do { \
#define __reg_oper1 "%R2"
#endif
-#define __put_user_asm_dword(x, __pu_addr, err) \
+#define __put_user_asm_dword(x, __pu_addr, err, __t) \
__asm__ __volatile__( \
- ARM( "1: " TUSER(str) " " __reg_oper1 ", [%1], #4\n" ) \
- ARM( "2: " TUSER(str) " " __reg_oper0 ", [%1]\n" ) \
- THUMB( "1: " TUSER(str) " " __reg_oper1 ", [%1]\n" ) \
- THUMB( "2: " TUSER(str) " " __reg_oper0 ", [%1, #4]\n" ) \
+ ARM( "1: str" __t " " __reg_oper1 ", [%1], #4\n" ) \
+ ARM( "2: str" __t " " __reg_oper0 ", [%1]\n" ) \
+ THUMB( "1: str" __t " " __reg_oper1 ", [%1]\n" ) \
+ THUMB( "2: str" __t " " __reg_oper0 ", [%1, #4]\n" ) \
"3:\n" \
" .pushsection .text.fixup,\"ax\"\n" \
" .align 2\n" \
@@ -510,7 +473,49 @@ do { \
: "r" (x), "i" (-EFAULT) \
: "cc")
-#endif /* !CONFIG_CPU_SPECTRE */
+#define HAVE_GET_KERNEL_NOFAULT
+
+#define __get_kernel_nofault(dst, src, type, err_label) \
+do { \
+ const type *__pk_ptr = (src); \
+ unsigned long __src = (unsigned long)(__pk_ptr); \
+ type __val; \
+ int __err = 0; \
+ switch (sizeof(type)) { \
+ case 1: __get_user_asm_byte(__val, __src, __err, ""); break; \
+ case 2: __get_user_asm_half(__val, __src, __err, ""); break; \
+ case 4: __get_user_asm_word(__val, __src, __err, ""); break; \
+ case 8: { \
+ u32 *__v32 = (u32*)&__val; \
+ __get_user_asm_word(__v32[0], __src, __err, ""); \
+ if (__err) \
+ break; \
+ __get_user_asm_word(__v32[1], __src+4, __err, ""); \
+ break; \
+ } \
+ default: __err = __get_user_bad(); break; \
+ } \
+ *(type *)(dst) = __val; \
+ if (__err) \
+ goto err_label; \
+} while (0)
+
+#define __put_kernel_nofault(dst, src, type, err_label) \
+do { \
+ const type *__pk_ptr = (dst); \
+ unsigned long __dst = (unsigned long)__pk_ptr; \
+ int __err = 0; \
+ type __val = *(type *)src; \
+ switch (sizeof(type)) { \
+ case 1: __put_user_asm_byte(__val, __dst, __err, ""); break; \
+ case 2: __put_user_asm_half(__val, __dst, __err, ""); break; \
+ case 4: __put_user_asm_word(__val, __dst, __err, ""); break; \
+ case 8: __put_user_asm_dword(__val, __dst, __err, ""); break; \
+ default: __err = __put_user_bad(); break; \
+ } \
+ if (__err) \
+ goto err_label; \
+} while (0)
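These two macros reuse the __get_user/__put_user assembly with an empty suffix instead of TUSER(), so the same exception fixups now also cover plain kernel addresses; a caller simply jumps to err_label on a fault, which is how dump_mem() and dump_instr() are converted later in this patch via get_kernel_nofault(). The snippet below only models the calling pattern; the macro body is a userspace stand-in, not the kernel implementation.

#include <stdio.h>
#include <string.h>

/*
 * Stand-in for __get_kernel_nofault(): the kernel uses fixup-protected
 * loads, here a NULL check plus memcpy models success and failure.
 */
#define __get_kernel_nofault(dst, src, type, err_label)		\
do {								\
	if (!(src))						\
		goto err_label;	/* pretend the access faulted */	\
	memcpy((dst), (src), sizeof(type));			\
} while (0)

static int read_word(const unsigned long *src, unsigned long *out)
{
	__get_kernel_nofault(out, src, unsigned long, fault);
	return 0;
fault:
	return -1;	/* a kernel caller would return -EFAULT */
}

int main(void)
{
	unsigned long v = 42, r = 0;
	int ret = read_word(&v, &r);

	printf("%d %lu\n", ret, r);
	printf("%d\n", read_word(NULL, &r));
	return 0;
}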
#ifdef CONFIG_MMU
extern unsigned long __must_check
diff --git a/arch/arm/include/asm/unified.h b/arch/arm/include/asm/unified.h
index 1e2c3eb04353..ce9689118dbb 100644
--- a/arch/arm/include/asm/unified.h
+++ b/arch/arm/include/asm/unified.h
@@ -24,10 +24,6 @@ __asm__(".syntax unified");
#ifdef CONFIG_THUMB2_KERNEL
-#if __GNUC__ < 4
-#error Thumb-2 kernel requires gcc >= 4
-#endif
-
/* The CPSR bit describing the instruction set (Thumb) */
#define PSR_ISETSTATE PSR_T_BIT
diff --git a/arch/arm/include/uapi/asm/unistd.h b/arch/arm/include/uapi/asm/unistd.h
index ae7749e15726..a1149911464c 100644
--- a/arch/arm/include/uapi/asm/unistd.h
+++ b/arch/arm/include/uapi/asm/unistd.h
@@ -15,6 +15,7 @@
#define _UAPI__ASM_ARM_UNISTD_H
#define __NR_OABI_SYSCALL_BASE 0x900000
+#define __NR_SYSCALL_MASK 0x0fffff
#if defined(__thumb__) || defined(__ARM_EABI__)
#define __NR_SYSCALL_BASE 0
diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c
index 64944701bf6a..a646a3f6440f 100644
--- a/arch/arm/kernel/asm-offsets.c
+++ b/arch/arm/kernel/asm-offsets.c
@@ -43,11 +43,11 @@ int main(void)
BLANK();
DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count));
- DEFINE(TI_ADDR_LIMIT, offsetof(struct thread_info, addr_limit));
DEFINE(TI_TASK, offsetof(struct thread_info, task));
DEFINE(TI_CPU, offsetof(struct thread_info, cpu));
DEFINE(TI_CPU_DOMAIN, offsetof(struct thread_info, cpu_domain));
DEFINE(TI_CPU_SAVE, offsetof(struct thread_info, cpu_context));
+ DEFINE(TI_ABI_SYSCALL, offsetof(struct thread_info, abi_syscall));
DEFINE(TI_USED_CP, offsetof(struct thread_info, used_cp));
DEFINE(TI_TP_VALUE, offsetof(struct thread_info, tp_value));
DEFINE(TI_FPSTATE, offsetof(struct thread_info, fpstate));
@@ -88,7 +88,6 @@ int main(void)
DEFINE(S_OLD_R0, offsetof(struct pt_regs, ARM_ORIG_r0));
DEFINE(PT_REGS_SIZE, sizeof(struct pt_regs));
DEFINE(SVC_DACR, offsetof(struct svc_pt_regs, dacr));
- DEFINE(SVC_ADDR_LIMIT, offsetof(struct svc_pt_regs, addr_limit));
DEFINE(SVC_REGS_SIZE, sizeof(struct svc_pt_regs));
BLANK();
DEFINE(SIGFRAME_RC3_OFFSET, offsetof(struct sigframe, retcode[3]));
diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
index 7f0b7aba1498..d9c99db50243 100644
--- a/arch/arm/kernel/entry-common.S
+++ b/arch/arm/kernel/entry-common.S
@@ -49,10 +49,6 @@ __ret_fast_syscall:
UNWIND(.fnstart )
UNWIND(.cantunwind )
disable_irq_notrace @ disable interrupts
- ldr r2, [tsk, #TI_ADDR_LIMIT]
- ldr r1, =TASK_SIZE
- cmp r2, r1
- blne addr_limit_check_failed
ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing
movs r1, r1, lsl #16
bne fast_work_pending
@@ -87,10 +83,6 @@ __ret_fast_syscall:
bl do_rseq_syscall
#endif
disable_irq_notrace @ disable interrupts
- ldr r2, [tsk, #TI_ADDR_LIMIT]
- ldr r1, =TASK_SIZE
- cmp r2, r1
- blne addr_limit_check_failed
ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing
movs r1, r1, lsl #16
beq no_work_pending
@@ -129,10 +121,6 @@ ret_slow_syscall:
#endif
disable_irq_notrace @ disable interrupts
ENTRY(ret_to_user_from_irq)
- ldr r2, [tsk, #TI_ADDR_LIMIT]
- ldr r1, =TASK_SIZE
- cmp r2, r1
- blne addr_limit_check_failed
ldr r1, [tsk, #TI_FLAGS]
movs r1, r1, lsl #16
bne slow_work_pending
@@ -226,6 +214,7 @@ ENTRY(vector_swi)
/* saved_psr and saved_pc are now dead */
uaccess_disable tbl
+ get_thread_info tsk
adr tbl, sys_call_table @ load syscall table pointer
@@ -237,13 +226,17 @@ ENTRY(vector_swi)
* get the old ABI syscall table address.
*/
bics r10, r10, #0xff000000
+ strne r10, [tsk, #TI_ABI_SYSCALL]
+ streq scno, [tsk, #TI_ABI_SYSCALL]
eorne scno, r10, #__NR_OABI_SYSCALL_BASE
ldrne tbl, =sys_oabi_call_table
#elif !defined(CONFIG_AEABI)
bic scno, scno, #0xff000000 @ mask off SWI op-code
+ str scno, [tsk, #TI_ABI_SYSCALL]
eor scno, scno, #__NR_SYSCALL_BASE @ check OS number
+#else
+ str scno, [tsk, #TI_ABI_SYSCALL]
#endif
- get_thread_info tsk
/*
* Reload the registers that may have been corrupted on entry to
* the syscall assembly (by tracing or context tracking.)
@@ -288,7 +281,6 @@ ENDPROC(vector_swi)
* context switches, and waiting for our parent to respond.
*/
__sys_trace:
- mov r1, scno
add r0, sp, #S_OFF
bl syscall_trace_enter
mov scno, r0
diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index bb5ad8a6a4c3..0e2d3051741e 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -106,7 +106,7 @@ void __show_regs(struct pt_regs *regs)
unsigned long flags;
char buf[64];
#ifndef CONFIG_CPU_V7M
- unsigned int domain, fs;
+ unsigned int domain;
#ifdef CONFIG_CPU_SW_DOMAIN_PAN
/*
* Get the domain register for the parent context. In user
@@ -115,14 +115,11 @@ void __show_regs(struct pt_regs *regs)
*/
if (user_mode(regs)) {
domain = DACR_UACCESS_ENABLE;
- fs = get_fs();
} else {
domain = to_svc_pt_regs(regs)->dacr;
- fs = to_svc_pt_regs(regs)->addr_limit;
}
#else
domain = get_domain();
- fs = get_fs();
#endif
#endif
@@ -158,8 +155,6 @@ void __show_regs(struct pt_regs *regs)
if ((domain & domain_mask(DOMAIN_USER)) ==
domain_val(DOMAIN_USER, DOMAIN_NOACCESS))
segment = "none";
- else if (fs == KERNEL_DS)
- segment = "kernel";
else
segment = "user";
diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c
index b008859680bc..43b963ea4a0e 100644
--- a/arch/arm/kernel/ptrace.c
+++ b/arch/arm/kernel/ptrace.c
@@ -25,6 +25,7 @@
#include <linux/tracehook.h>
#include <linux/unistd.h>
+#include <asm/syscall.h>
#include <asm/traps.h>
#define CREATE_TRACE_POINTS
@@ -785,7 +786,8 @@ long arch_ptrace(struct task_struct *child, long request,
break;
case PTRACE_SET_SYSCALL:
- task_thread_info(child)->syscall = data;
+ task_thread_info(child)->abi_syscall = data &
+ __NR_SYSCALL_MASK;
ret = 0;
break;
@@ -844,14 +846,14 @@ static void tracehook_report_syscall(struct pt_regs *regs,
if (dir == PTRACE_SYSCALL_EXIT)
tracehook_report_syscall_exit(regs, 0);
else if (tracehook_report_syscall_entry(regs))
- current_thread_info()->syscall = -1;
+ current_thread_info()->abi_syscall = -1;
regs->ARM_ip = ip;
}
-asmlinkage int syscall_trace_enter(struct pt_regs *regs, int scno)
+asmlinkage int syscall_trace_enter(struct pt_regs *regs)
{
- current_thread_info()->syscall = scno;
+ int scno;
if (test_thread_flag(TIF_SYSCALL_TRACE))
tracehook_report_syscall(regs, PTRACE_SYSCALL_ENTER);
@@ -862,11 +864,11 @@ asmlinkage int syscall_trace_enter(struct pt_regs *regs, int scno)
return -1;
#else
/* XXX: remove this once OABI gets fixed */
- secure_computing_strict(current_thread_info()->syscall);
+ secure_computing_strict(syscall_get_nr(current, regs));
#endif
/* Tracer or seccomp may have changed syscall. */
- scno = current_thread_info()->syscall;
+ scno = syscall_get_nr(current, regs);
if (test_thread_flag(TIF_SYSCALL_TRACEPOINT))
trace_sys_enter(regs, scno);
diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
index 4e0dcff3f5b0..d0a800be0486 100644
--- a/arch/arm/kernel/signal.c
+++ b/arch/arm/kernel/signal.c
@@ -669,14 +669,6 @@ struct page *get_signal_page(void)
return page;
}
-/* Defer to generic check */
-asmlinkage void addr_limit_check_failed(void)
-{
-#ifdef CONFIG_MMU
- addr_limit_user_check();
-#endif
-}
-
#ifdef CONFIG_DEBUG_RSEQ
asmlinkage void do_rseq_syscall(struct pt_regs *regs)
{
diff --git a/arch/arm/kernel/sys_oabi-compat.c b/arch/arm/kernel/sys_oabi-compat.c
index 075a2e0ed2c1..68112c172025 100644
--- a/arch/arm/kernel/sys_oabi-compat.c
+++ b/arch/arm/kernel/sys_oabi-compat.c
@@ -80,9 +80,12 @@
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/ipc.h>
+#include <linux/ipc_namespace.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
+#include <asm/syscall.h>
+
struct oldabi_stat64 {
unsigned long long st_dev;
unsigned int __pad1;
@@ -191,60 +194,87 @@ struct oabi_flock64 {
pid_t l_pid;
} __attribute__ ((packed,aligned(4)));
-static long do_locks(unsigned int fd, unsigned int cmd,
- unsigned long arg)
+static int get_oabi_flock(struct flock64 *kernel, struct oabi_flock64 __user *arg)
{
- struct flock64 kernel;
struct oabi_flock64 user;
- mm_segment_t fs;
- long ret;
if (copy_from_user(&user, (struct oabi_flock64 __user *)arg,
sizeof(user)))
return -EFAULT;
- kernel.l_type = user.l_type;
- kernel.l_whence = user.l_whence;
- kernel.l_start = user.l_start;
- kernel.l_len = user.l_len;
- kernel.l_pid = user.l_pid;
-
- fs = get_fs();
- set_fs(KERNEL_DS);
- ret = sys_fcntl64(fd, cmd, (unsigned long)&kernel);
- set_fs(fs);
-
- if (!ret && (cmd == F_GETLK64 || cmd == F_OFD_GETLK)) {
- user.l_type = kernel.l_type;
- user.l_whence = kernel.l_whence;
- user.l_start = kernel.l_start;
- user.l_len = kernel.l_len;
- user.l_pid = kernel.l_pid;
- if (copy_to_user((struct oabi_flock64 __user *)arg,
- &user, sizeof(user)))
- ret = -EFAULT;
- }
- return ret;
+
+ kernel->l_type = user.l_type;
+ kernel->l_whence = user.l_whence;
+ kernel->l_start = user.l_start;
+ kernel->l_len = user.l_len;
+ kernel->l_pid = user.l_pid;
+
+ return 0;
+}
+
+static int put_oabi_flock(struct flock64 *kernel, struct oabi_flock64 __user *arg)
+{
+ struct oabi_flock64 user;
+
+ user.l_type = kernel->l_type;
+ user.l_whence = kernel->l_whence;
+ user.l_start = kernel->l_start;
+ user.l_len = kernel->l_len;
+ user.l_pid = kernel->l_pid;
+
+ if (copy_to_user((struct oabi_flock64 __user *)arg,
+ &user, sizeof(user)))
+ return -EFAULT;
+
+ return 0;
}
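The two helpers above exist because struct oabi_flock64 is packed and only 4-byte aligned, so its 64-bit l_start/l_len members land at different offsets than in the kernel's naturally aligned struct flock64; converting field by field keeps the packed layout out of the core locking code. A standalone illustration of the offset difference; the field types below are assumed stand-ins (the full OABI definition sits above this hunk) and the second struct merely approximates the kernel's flock64.

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

/* Approximation of the packed, 4-byte-aligned OABI layout. */
struct oabi_flock64 {
	int16_t l_type;
	int16_t l_whence;
	int64_t l_start;
	int64_t l_len;
	int32_t l_pid;
} __attribute__((packed, aligned(4)));

/* Approximation of a naturally aligned flock64. */
struct aligned_flock64 {
	int16_t l_type;
	int16_t l_whence;
	int64_t l_start;
	int64_t l_len;
	int32_t l_pid;
};

int main(void)
{
	printf("oabi:    size %2zu, l_start at offset %zu\n",
	       sizeof(struct oabi_flock64),
	       offsetof(struct oabi_flock64, l_start));
	printf("aligned: size %2zu, l_start at offset %zu\n",
	       sizeof(struct aligned_flock64),
	       offsetof(struct aligned_flock64, l_start));
	return 0;
}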
asmlinkage long sys_oabi_fcntl64(unsigned int fd, unsigned int cmd,
unsigned long arg)
{
+ void __user *argp = (void __user *)arg;
+ struct fd f = fdget_raw(fd);
+ struct flock64 flock;
+ long err = -EBADF;
+
+ if (!f.file)
+ goto out;
+
switch (cmd) {
- case F_OFD_GETLK:
- case F_OFD_SETLK:
- case F_OFD_SETLKW:
case F_GETLK64:
+ case F_OFD_GETLK:
+ err = security_file_fcntl(f.file, cmd, arg);
+ if (err)
+ break;
+ err = get_oabi_flock(&flock, argp);
+ if (err)
+ break;
+ err = fcntl_getlk64(f.file, cmd, &flock);
+ if (!err)
+ err = put_oabi_flock(&flock, argp);
+ break;
case F_SETLK64:
case F_SETLKW64:
- return do_locks(fd, cmd, arg);
-
+ case F_OFD_SETLK:
+ case F_OFD_SETLKW:
+ err = security_file_fcntl(f.file, cmd, arg);
+ if (err)
+ break;
+ err = get_oabi_flock(&flock, argp);
+ if (err)
+ break;
+ err = fcntl_setlk64(fd, f.file, cmd, &flock);
+ break;
default:
- return sys_fcntl64(fd, cmd, arg);
+ err = sys_fcntl64(fd, cmd, arg);
+ break;
}
+ fdput(f);
+out:
+ return err;
}
struct oabi_epoll_event {
- __u32 events;
+ __poll_t events;
__u64 data;
} __attribute__ ((packed,aligned(4)));
@@ -264,55 +294,34 @@ asmlinkage long sys_oabi_epoll_ctl(int epfd, int op, int fd,
return do_epoll_ctl(epfd, op, fd, &kernel, false);
}
-
-asmlinkage long sys_oabi_epoll_wait(int epfd,
- struct oabi_epoll_event __user *events,
- int maxevents, int timeout)
-{
- struct epoll_event *kbuf;
- struct oabi_epoll_event e;
- mm_segment_t fs;
- long ret, err, i;
-
- if (maxevents <= 0 ||
- maxevents > (INT_MAX/sizeof(*kbuf)) ||
- maxevents > (INT_MAX/sizeof(*events)))
- return -EINVAL;
- if (!access_ok(events, sizeof(*events) * maxevents))
- return -EFAULT;
- kbuf = kmalloc_array(maxevents, sizeof(*kbuf), GFP_KERNEL);
- if (!kbuf)
- return -ENOMEM;
- fs = get_fs();
- set_fs(KERNEL_DS);
- ret = sys_epoll_wait(epfd, kbuf, maxevents, timeout);
- set_fs(fs);
- err = 0;
- for (i = 0; i < ret; i++) {
- e.events = kbuf[i].events;
- e.data = kbuf[i].data;
- err = __copy_to_user(events, &e, sizeof(e));
- if (err)
- break;
- events++;
- }
- kfree(kbuf);
- return err ? -EFAULT : ret;
-}
#else
asmlinkage long sys_oabi_epoll_ctl(int epfd, int op, int fd,
struct oabi_epoll_event __user *event)
{
return -EINVAL;
}
+#endif
-asmlinkage long sys_oabi_epoll_wait(int epfd,
- struct oabi_epoll_event __user *events,
- int maxevents, int timeout)
+struct epoll_event __user *
+epoll_put_uevent(__poll_t revents, __u64 data,
+ struct epoll_event __user *uevent)
{
- return -EINVAL;
+ if (in_oabi_syscall()) {
+ struct oabi_epoll_event __user *oevent = (void __user *)uevent;
+
+ if (__put_user(revents, &oevent->events) ||
+ __put_user(data, &oevent->data))
+ return NULL;
+
+ return (void __user *)(oevent+1);
+ }
+
+ if (__put_user(revents, &uevent->events) ||
+ __put_user(data, &uevent->data))
+ return NULL;
+
+ return uevent+1;
}
-#endif
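epoll_put_uevent() becomes the single place where a ready event is written back to user space: it stores events/data in whichever layout the current syscall ABI expects and returns the advanced destination pointer, or NULL so the caller can bail out with -EFAULT, which is why the separate sys_oabi_epoll_wait() copy loop can be dropped. A hedged sketch of how a copy loop consumes such a helper; the loop and the toy helper below are illustrative, not the eventpoll code.

#include <stdio.h>
#include <stdint.h>

struct uevent { uint32_t events; uint64_t data; };

/* Toy writeback helper modelled on epoll_put_uevent(): NULL means fault. */
static struct uevent *put_uevent(uint32_t revents, uint64_t data,
				 struct uevent *uevent)
{
	if (!uevent)
		return NULL;
	uevent->events = revents;
	uevent->data = data;
	return uevent + 1;	/* advance to the next user slot */
}

int main(void)
{
	struct uevent buf[2], *uev = buf;
	int copied = 0;

	for (int i = 0; i < 2; i++) {
		uev = put_uevent(0x1, 100 + i, uev);	/* 0x1 ~ EPOLLIN */
		if (!uev)
			break;	/* a kernel caller would return -EFAULT */
		copied++;
	}
	printf("copied %d events, last data %llu\n", copied,
	       (unsigned long long)buf[1].data);
	return 0;
}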
struct oabi_sembuf {
unsigned short sem_num;
@@ -321,46 +330,52 @@ struct oabi_sembuf {
unsigned short __pad;
};
+#define sc_semopm sem_ctls[2]
+
+#ifdef CONFIG_SYSVIPC
asmlinkage long sys_oabi_semtimedop(int semid,
struct oabi_sembuf __user *tsops,
unsigned nsops,
const struct old_timespec32 __user *timeout)
{
+ struct ipc_namespace *ns;
struct sembuf *sops;
- struct old_timespec32 local_timeout;
long err;
int i;
+ ns = current->nsproxy->ipc_ns;
+ if (nsops > ns->sc_semopm)
+ return -E2BIG;
if (nsops < 1 || nsops > SEMOPM)
return -EINVAL;
- if (!access_ok(tsops, sizeof(*tsops) * nsops))
- return -EFAULT;
- sops = kmalloc_array(nsops, sizeof(*sops), GFP_KERNEL);
+ sops = kvmalloc_array(nsops, sizeof(*sops), GFP_KERNEL);
if (!sops)
return -ENOMEM;
err = 0;
for (i = 0; i < nsops; i++) {
struct oabi_sembuf osb;
- err |= __copy_from_user(&osb, tsops, sizeof(osb));
+ err |= copy_from_user(&osb, tsops, sizeof(osb));
sops[i].sem_num = osb.sem_num;
sops[i].sem_op = osb.sem_op;
sops[i].sem_flg = osb.sem_flg;
tsops++;
}
- if (timeout) {
- /* copy this as well before changing domain protection */
- err |= copy_from_user(&local_timeout, timeout, sizeof(*timeout));
- timeout = &local_timeout;
- }
if (err) {
err = -EFAULT;
- } else {
- mm_segment_t fs = get_fs();
- set_fs(KERNEL_DS);
- err = sys_semtimedop_time32(semid, sops, nsops, timeout);
- set_fs(fs);
+ goto out;
}
- kfree(sops);
+
+ if (timeout) {
+ struct timespec64 ts;
+ err = get_old_timespec32(&ts, timeout);
+ if (err)
+ goto out;
+ err = __do_semtimedop(semid, sops, nsops, &ts, ns);
+ goto out;
+ }
+ err = __do_semtimedop(semid, sops, nsops, NULL, ns);
+out:
+ kvfree(sops);
return err;
}
@@ -387,6 +402,27 @@ asmlinkage int sys_oabi_ipc(uint call, int first, int second, int third,
return sys_ipc(call, first, second, third, ptr, fifth);
}
}
+#else
+asmlinkage long sys_oabi_semtimedop(int semid,
+ struct oabi_sembuf __user *tsops,
+ unsigned nsops,
+ const struct old_timespec32 __user *timeout)
+{
+ return -ENOSYS;
+}
+
+asmlinkage long sys_oabi_semop(int semid, struct oabi_sembuf __user *tsops,
+ unsigned nsops)
+{
+ return -ENOSYS;
+}
+
+asmlinkage int sys_oabi_ipc(uint call, int first, int second, int third,
+ void __user *ptr, long fifth)
+{
+ return -ENOSYS;
+}
+#endif
asmlinkage long sys_oabi_bind(int fd, struct sockaddr __user *addr, int addrlen)
{
diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c
index 64308e3a5d0c..4a7edc6e848f 100644
--- a/arch/arm/kernel/traps.c
+++ b/arch/arm/kernel/traps.c
@@ -122,17 +122,8 @@ static void dump_mem(const char *lvl, const char *str, unsigned long bottom,
unsigned long top)
{
unsigned long first;
- mm_segment_t fs;
int i;
- /*
- * We need to switch to kernel mode so that we can use __get_user
- * to safely read from kernel space. Note that we now dump the
- * code first, just in case the backtrace kills us.
- */
- fs = get_fs();
- set_fs(KERNEL_DS);
-
printk("%s%s(0x%08lx to 0x%08lx)\n", lvl, str, bottom, top);
for (first = bottom & ~31; first < top; first += 32) {
@@ -145,7 +136,7 @@ static void dump_mem(const char *lvl, const char *str, unsigned long bottom,
for (p = first, i = 0; i < 8 && p < top; i++, p += 4) {
if (p >= bottom && p < top) {
unsigned long val;
- if (__get_user(val, (unsigned long *)p) == 0)
+ if (get_kernel_nofault(val, (unsigned long *)p))
sprintf(str + i * 9, " %08lx", val);
else
sprintf(str + i * 9, " ????????");
@@ -153,11 +144,9 @@ static void dump_mem(const char *lvl, const char *str, unsigned long bottom,
}
printk("%s%04lx:%s\n", lvl, first & 0xffff, str);
}
-
- set_fs(fs);
}
-static void __dump_instr(const char *lvl, struct pt_regs *regs)
+static void dump_instr(const char *lvl, struct pt_regs *regs)
{
unsigned long addr = instruction_pointer(regs);
const int thumb = thumb_mode(regs);
@@ -173,10 +162,20 @@ static void __dump_instr(const char *lvl, struct pt_regs *regs)
for (i = -4; i < 1 + !!thumb; i++) {
unsigned int val, bad;
- if (thumb)
- bad = get_user(val, &((u16 *)addr)[i]);
- else
- bad = get_user(val, &((u32 *)addr)[i]);
+ if (!user_mode(regs)) {
+ if (thumb) {
+ u16 val16;
+ bad = get_kernel_nofault(val16, &((u16 *)addr)[i]);
+ val = val16;
+ } else {
+ bad = get_kernel_nofault(val, &((u32 *)addr)[i]);
+ }
+ } else {
+ if (thumb)
+ bad = get_user(val, &((u16 *)addr)[i]);
+ else
+ bad = get_user(val, &((u32 *)addr)[i]);
+ }
if (!bad)
p += sprintf(p, i == 0 ? "(%0*x) " : "%0*x ",
@@ -189,20 +188,6 @@ static void __dump_instr(const char *lvl, struct pt_regs *regs)
printk("%sCode: %s\n", lvl, str);
}
-static void dump_instr(const char *lvl, struct pt_regs *regs)
-{
- mm_segment_t fs;
-
- if (!user_mode(regs)) {
- fs = get_fs();
- set_fs(KERNEL_DS);
- __dump_instr(lvl, regs);
- set_fs(fs);
- } else {
- __dump_instr(lvl, regs);
- }
-}
-
#ifdef CONFIG_ARM_UNWIND
static inline void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk,
const char *loglvl)
@@ -781,11 +766,6 @@ void abort(void)
panic("Oops failed to kill thread");
}
-void __init trap_init(void)
-{
- return;
-}
-
#ifdef CONFIG_KUSER_HELPERS
static void __init kuser_init(void *vectors)
{
diff --git a/arch/arm/lib/copy_from_user.S b/arch/arm/lib/copy_from_user.S
index f8016e3db65d..480a20766137 100644
--- a/arch/arm/lib/copy_from_user.S
+++ b/arch/arm/lib/copy_from_user.S
@@ -109,8 +109,7 @@
ENTRY(arm_copy_from_user)
#ifdef CONFIG_CPU_SPECTRE
- get_thread_info r3
- ldr r3, [r3, #TI_ADDR_LIMIT]
+ ldr r3, =TASK_SIZE
uaccess_mask_range_ptr r1, r2, r3, ip
#endif
diff --git a/arch/arm/lib/copy_to_user.S b/arch/arm/lib/copy_to_user.S
index ebfe4cb3d912..842ea5ede485 100644
--- a/arch/arm/lib/copy_to_user.S
+++ b/arch/arm/lib/copy_to_user.S
@@ -109,8 +109,7 @@
ENTRY(__copy_to_user_std)
WEAK(arm_copy_to_user)
#ifdef CONFIG_CPU_SPECTRE
- get_thread_info r3
- ldr r3, [r3, #TI_ADDR_LIMIT]
+ ldr r3, =TASK_SIZE
uaccess_mask_range_ptr r0, r2, r3, ip
#endif
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index 7e0a9b692d87..e842209e135d 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -266,7 +266,7 @@
249 common lookup_dcookie sys_lookup_dcookie
250 common epoll_create sys_epoll_create
251 common epoll_ctl sys_epoll_ctl sys_oabi_epoll_ctl
-252 common epoll_wait sys_epoll_wait sys_oabi_epoll_wait
+252 common epoll_wait sys_epoll_wait
253 common remap_file_pages sys_remap_file_pages
# 254 for set_thread_area
# 255 for get_thread_area
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 9dc1720a909f..077f2ec4eeb2 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -220,6 +220,7 @@ config ARM64
select SYSCTL_EXCEPTION_TRACE
select THREAD_INFO_IN_TASK
select HAVE_ARCH_USERFAULTFD_MINOR if USERFAULTFD
+ select TRACE_IRQFLAGS_SUPPORT
help
ARM 64-bit (AArch64) Linux support.
@@ -287,9 +288,6 @@ config ILLEGAL_POINTER_VALUE
config LOCKDEP_SUPPORT
def_bool y
-config TRACE_IRQFLAGS_SUPPORT
- def_bool y
-
config GENERIC_BUG
def_bool y
depends on BUG
diff --git a/arch/arm64/include/asm/compat.h b/arch/arm64/include/asm/compat.h
index 79c1a750e357..eaa6ca062d89 100644
--- a/arch/arm64/include/asm/compat.h
+++ b/arch/arm64/include/asm/compat.h
@@ -107,11 +107,6 @@ struct compat_statfs {
#define compat_user_stack_pointer() (user_stack_pointer(task_pt_regs(current)))
#define COMPAT_MINSIGSTKSZ 2048
-static inline void __user *arch_compat_alloc_user_space(long len)
-{
- return (void __user *)compat_user_stack_pointer() - len;
-}
-
struct compat_ipc64_perm {
compat_key_t key;
__compat_uid32_t uid;
diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index cdfa2a242e9f..ef6be92b1921 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -602,14 +602,14 @@ static inline bool id_aa64pfr0_32bit_el1(u64 pfr0)
{
u32 val = cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL1_SHIFT);
- return val == ID_AA64PFR0_EL1_32BIT_64BIT;
+ return val == ID_AA64PFR0_ELx_32BIT_64BIT;
}
static inline bool id_aa64pfr0_32bit_el0(u64 pfr0)
{
u32 val = cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL0_SHIFT);
- return val == ID_AA64PFR0_EL0_32BIT_64BIT;
+ return val == ID_AA64PFR0_ELx_32BIT_64BIT;
}
static inline bool id_aa64pfr0_sve(u64 pfr0)
@@ -784,13 +784,13 @@ extern int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt);
static inline u32 id_aa64mmfr0_parange_to_phys_shift(int parange)
{
switch (parange) {
- case 0: return 32;
- case 1: return 36;
- case 2: return 40;
- case 3: return 42;
- case 4: return 44;
- case 5: return 48;
- case 6: return 52;
+ case ID_AA64MMFR0_PARANGE_32: return 32;
+ case ID_AA64MMFR0_PARANGE_36: return 36;
+ case ID_AA64MMFR0_PARANGE_40: return 40;
+ case ID_AA64MMFR0_PARANGE_42: return 42;
+ case ID_AA64MMFR0_PARANGE_44: return 44;
+ case ID_AA64MMFR0_PARANGE_48: return 48;
+ case ID_AA64MMFR0_PARANGE_52: return 52;
/*
* A future PE could use a value unknown to the kernel.
* However, by the "D10.1.4 Principles of the ID scheme
diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index d436831dd706..327120c0089f 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -12,8 +12,13 @@
#include <asm/types.h>
/* Hyp Configuration Register (HCR) bits */
+
+#define HCR_TID5 (UL(1) << 58)
+#define HCR_DCT (UL(1) << 57)
#define HCR_ATA_SHIFT 56
#define HCR_ATA (UL(1) << HCR_ATA_SHIFT)
+#define HCR_AMVOFFEN (UL(1) << 51)
+#define HCR_FIEN (UL(1) << 47)
#define HCR_FWB (UL(1) << 46)
#define HCR_API (UL(1) << 41)
#define HCR_APK (UL(1) << 40)
@@ -32,9 +37,9 @@
#define HCR_TVM (UL(1) << 26)
#define HCR_TTLB (UL(1) << 25)
#define HCR_TPU (UL(1) << 24)
-#define HCR_TPC (UL(1) << 23)
+#define HCR_TPC (UL(1) << 23) /* HCR_TPCP if FEAT_DPB */
#define HCR_TSW (UL(1) << 22)
-#define HCR_TAC (UL(1) << 21)
+#define HCR_TACR (UL(1) << 21)
#define HCR_TIDCP (UL(1) << 20)
#define HCR_TSC (UL(1) << 19)
#define HCR_TID3 (UL(1) << 18)
@@ -56,12 +61,13 @@
#define HCR_PTW (UL(1) << 2)
#define HCR_SWIO (UL(1) << 1)
#define HCR_VM (UL(1) << 0)
+#define HCR_RES0 ((UL(1) << 48) | (UL(1) << 39))
/*
* The bits we set in HCR:
* TLOR: Trap LORegion register accesses
* RW: 64bit by default, can be overridden for 32bit VMs
- * TAC: Trap ACTLR
+ * TACR: Trap ACTLR
* TSC: Trap SMC
* TSW: Trap cache operations by set/way
* TWE: Trap WFE
@@ -76,7 +82,7 @@
* PTW: Take a stage2 fault if a stage1 walk steps in device memory
*/
#define HCR_GUEST_FLAGS (HCR_TSC | HCR_TSW | HCR_TWE | HCR_TWI | HCR_VM | \
- HCR_BSU_IS | HCR_FB | HCR_TAC | \
+ HCR_BSU_IS | HCR_FB | HCR_TACR | \
HCR_AMO | HCR_SWIO | HCR_TIDCP | HCR_RW | HCR_TLOR | \
HCR_FMO | HCR_IMO | HCR_PTW )
#define HCR_VIRT_EXCP_MASK (HCR_VSE | HCR_VI | HCR_VF)
@@ -275,24 +281,40 @@
#define CPTR_EL2_TTA (1 << 20)
#define CPTR_EL2_TFP (1 << CPTR_EL2_TFP_SHIFT)
#define CPTR_EL2_TZ (1 << 8)
-#define CPTR_EL2_RES1 0x000032ff /* known RES1 bits in CPTR_EL2 */
-#define CPTR_EL2_DEFAULT CPTR_EL2_RES1
+#define CPTR_NVHE_EL2_RES1 0x000032ff /* known RES1 bits in CPTR_EL2 (nVHE) */
+#define CPTR_EL2_DEFAULT CPTR_NVHE_EL2_RES1
+#define CPTR_NVHE_EL2_RES0 (GENMASK(63, 32) | \
+ GENMASK(29, 21) | \
+ GENMASK(19, 14) | \
+ BIT(11))
/* Hyp Debug Configuration Register bits */
#define MDCR_EL2_E2TB_MASK (UL(0x3))
#define MDCR_EL2_E2TB_SHIFT (UL(24))
-#define MDCR_EL2_TTRF (1 << 19)
-#define MDCR_EL2_TPMS (1 << 14)
+#define MDCR_EL2_HPMFZS (UL(1) << 36)
+#define MDCR_EL2_HPMFZO (UL(1) << 29)
+#define MDCR_EL2_MTPME (UL(1) << 28)
+#define MDCR_EL2_TDCC (UL(1) << 27)
+#define MDCR_EL2_HCCD (UL(1) << 23)
+#define MDCR_EL2_TTRF (UL(1) << 19)
+#define MDCR_EL2_HPMD (UL(1) << 17)
+#define MDCR_EL2_TPMS (UL(1) << 14)
#define MDCR_EL2_E2PB_MASK (UL(0x3))
#define MDCR_EL2_E2PB_SHIFT (UL(12))
-#define MDCR_EL2_TDRA (1 << 11)
-#define MDCR_EL2_TDOSA (1 << 10)
-#define MDCR_EL2_TDA (1 << 9)
-#define MDCR_EL2_TDE (1 << 8)
-#define MDCR_EL2_HPME (1 << 7)
-#define MDCR_EL2_TPM (1 << 6)
-#define MDCR_EL2_TPMCR (1 << 5)
-#define MDCR_EL2_HPMN_MASK (0x1F)
+#define MDCR_EL2_TDRA (UL(1) << 11)
+#define MDCR_EL2_TDOSA (UL(1) << 10)
+#define MDCR_EL2_TDA (UL(1) << 9)
+#define MDCR_EL2_TDE (UL(1) << 8)
+#define MDCR_EL2_HPME (UL(1) << 7)
+#define MDCR_EL2_TPM (UL(1) << 6)
+#define MDCR_EL2_TPMCR (UL(1) << 5)
+#define MDCR_EL2_HPMN_MASK (UL(0x1F))
+#define MDCR_EL2_RES0 (GENMASK(63, 37) | \
+ GENMASK(35, 30) | \
+ GENMASK(25, 24) | \
+ GENMASK(22, 20) | \
+ BIT(18) | \
+ GENMASK(16, 15))
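The new RES0 masks are built from GENMASK()/BIT() ranges rather than spelled out as hex. A quick standalone computation of MDCR_EL2_RES0; GENMASK and BIT are re-defined locally in simplified 64-bit form only so the snippet builds outside the kernel tree.

#include <stdio.h>

/* Simplified local stand-ins for the kernel's BIT()/GENMASK() macros. */
#define BIT(n)		(1ULL << (n))
#define GENMASK(h, l)	(((~0ULL) << (l)) & (~0ULL >> (63 - (h))))

#define MDCR_EL2_RES0	(GENMASK(63, 37) |	\
			 GENMASK(35, 30) |	\
			 GENMASK(25, 24) |	\
			 GENMASK(22, 20) |	\
			 BIT(18) |		\
			 GENMASK(16, 15))

int main(void)
{
	printf("MDCR_EL2_RES0 = 0x%016llx\n",
	       (unsigned long long)MDCR_EL2_RES0);
	return 0;
}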
/* For compatibility with fault code shared with 32-bit */
#define FSC_FAULT ESR_ELx_FSC_FAULT
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 9f0bf2109be7..e86045ac43ba 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -59,12 +59,11 @@
#define __KVM_HOST_SMCCC_FUNC___vgic_v3_save_aprs 13
#define __KVM_HOST_SMCCC_FUNC___vgic_v3_restore_aprs 14
#define __KVM_HOST_SMCCC_FUNC___pkvm_init 15
-#define __KVM_HOST_SMCCC_FUNC___pkvm_create_mappings 16
+#define __KVM_HOST_SMCCC_FUNC___pkvm_host_share_hyp 16
#define __KVM_HOST_SMCCC_FUNC___pkvm_create_private_mapping 17
#define __KVM_HOST_SMCCC_FUNC___pkvm_cpu_set_vector 18
#define __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize 19
-#define __KVM_HOST_SMCCC_FUNC___pkvm_mark_hyp 20
-#define __KVM_HOST_SMCCC_FUNC___kvm_adjust_pc 21
+#define __KVM_HOST_SMCCC_FUNC___kvm_adjust_pc 20
#ifndef __ASSEMBLY__
@@ -210,7 +209,7 @@ extern u64 __vgic_v3_read_vmcr(void);
extern void __vgic_v3_write_vmcr(u32 vmcr);
extern void __vgic_v3_init_lrs(void);
-extern u32 __kvm_get_mdcr_el2(void);
+extern u64 __kvm_get_mdcr_el2(void);
#define __KVM_EXTABLE(from, to) \
" .pushsection __kvm_ex_table, \"a\"\n" \
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 41911585ae0c..f8be56d5342b 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -66,7 +66,7 @@ DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use);
extern unsigned int kvm_sve_max_vl;
int kvm_arm_init_sve(void);
-int __attribute_const__ kvm_target_cpu(void);
+u32 __attribute_const__ kvm_target_cpu(void);
int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu);
@@ -185,7 +185,6 @@ enum vcpu_sysreg {
PMCNTENSET_EL0, /* Count Enable Set Register */
PMINTENSET_EL1, /* Interrupt Enable Set Register */
PMOVSSET_EL0, /* Overflow Flag Status Set Register */
- PMSWINC_EL0, /* Software Increment Register */
PMUSERENR_EL0, /* User Enable Register */
/* Pointer Authentication Registers in a strict increasing order. */
@@ -287,9 +286,13 @@ struct kvm_vcpu_arch {
/* Stage 2 paging state used by the hardware on next switch */
struct kvm_s2_mmu *hw_mmu;
- /* HYP configuration */
+ /* Values of trap registers for the guest. */
u64 hcr_el2;
- u32 mdcr_el2;
+ u64 mdcr_el2;
+ u64 cptr_el2;
+
+ /* Values of trap registers for the host before guest entry. */
+ u64 mdcr_el2_host;
/* Exception Information */
struct kvm_vcpu_fault_info fault;
@@ -576,6 +579,7 @@ struct kvm_vcpu_stat {
u64 wfi_exit_stat;
u64 mmio_exit_user;
u64 mmio_exit_kernel;
+ u64 signal_exits;
u64 exits;
};
@@ -771,6 +775,11 @@ void kvm_arch_free_vm(struct kvm *kvm);
int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type);
+static inline bool kvm_vm_is_protected(struct kvm *kvm)
+{
+ return false;
+}
+
int kvm_arm_vcpu_finalize(struct kvm_vcpu *vcpu, int feature);
bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu);
diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
index 9d60b3006efc..657d0c94cf82 100644
--- a/arch/arm64/include/asm/kvm_hyp.h
+++ b/arch/arm64/include/asm/kvm_hyp.h
@@ -95,7 +95,7 @@ void __sve_restore_state(void *sve_pffr, u32 *fpsr);
#ifndef __KVM_NVHE_HYPERVISOR__
void activate_traps_vhe_load(struct kvm_vcpu *vcpu);
-void deactivate_traps_vhe_put(void);
+void deactivate_traps_vhe_put(struct kvm_vcpu *vcpu);
#endif
u64 __guest_enter(struct kvm_vcpu *vcpu);
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index b52c5c4b9a3d..02d378887743 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -252,6 +252,11 @@ static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa,
#define kvm_phys_to_vttbr(addr) phys_to_ttbr(addr)
+/*
+ * When this is (directly or indirectly) used on the TLB invalidation
+ * path, we rely on a previously issued DSB so that page table updates
+ * and VMID reads are correctly ordered.
+ */
static __always_inline u64 kvm_get_vttbr(struct kvm_s2_mmu *mmu)
{
struct kvm_vmid *vmid = &mmu->vmid;
@@ -259,7 +264,7 @@ static __always_inline u64 kvm_get_vttbr(struct kvm_s2_mmu *mmu)
u64 cnp = system_supports_cnp() ? VTTBR_CNP_BIT : 0;
baddr = mmu->pgd_phys;
- vmid_field = (u64)vmid->vmid << VTTBR_VMID_SHIFT;
+ vmid_field = (u64)READ_ONCE(vmid->vmid) << VTTBR_VMID_SHIFT;
return kvm_phys_to_vttbr(baddr) | vmid_field | cnp;
}
@@ -267,9 +272,10 @@ static __always_inline u64 kvm_get_vttbr(struct kvm_s2_mmu *mmu)
* Must be called from hyp code running at EL2 with an updated VTTBR
* and interrupts disabled.
*/
-static __always_inline void __load_stage2(struct kvm_s2_mmu *mmu, unsigned long vtcr)
+static __always_inline void __load_stage2(struct kvm_s2_mmu *mmu,
+ struct kvm_arch *arch)
{
- write_sysreg(vtcr, vtcr_el2);
+ write_sysreg(arch->vtcr, vtcr_el2);
write_sysreg(kvm_get_vttbr(mmu), vttbr_el2);
/*
@@ -280,11 +286,6 @@ static __always_inline void __load_stage2(struct kvm_s2_mmu *mmu, unsigned long
asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT));
}
-static __always_inline void __load_guest_stage2(struct kvm_s2_mmu *mmu)
-{
- __load_stage2(mmu, kern_hyp_va(mmu->arch)->vtcr);
-}
-
static inline struct kvm *kvm_s2_mmu_to_kvm(struct kvm_s2_mmu *mmu)
{
return container_of(mmu->arch, struct kvm, arch);
diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index f004c0115d89..027783829584 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -25,6 +25,46 @@ static inline u64 kvm_get_parange(u64 mmfr0)
typedef u64 kvm_pte_t;
+#define KVM_PTE_VALID BIT(0)
+
+#define KVM_PTE_ADDR_MASK GENMASK(47, PAGE_SHIFT)
+#define KVM_PTE_ADDR_51_48 GENMASK(15, 12)
+
+static inline bool kvm_pte_valid(kvm_pte_t pte)
+{
+ return pte & KVM_PTE_VALID;
+}
+
+static inline u64 kvm_pte_to_phys(kvm_pte_t pte)
+{
+ u64 pa = pte & KVM_PTE_ADDR_MASK;
+
+ if (PAGE_SHIFT == 16)
+ pa |= FIELD_GET(KVM_PTE_ADDR_51_48, pte) << 48;
+
+ return pa;
+}
+
+static inline u64 kvm_granule_shift(u32 level)
+{
+ /* Assumes KVM_PGTABLE_MAX_LEVELS is 4 */
+ return ARM64_HW_PGTABLE_LEVEL_SHIFT(level);
+}
+
+static inline u64 kvm_granule_size(u32 level)
+{
+ return BIT(kvm_granule_shift(level));
+}
+
+static inline bool kvm_level_supports_block_mapping(u32 level)
+{
+ /*
+ * Reject invalid block mappings and don't bother with 4TB mappings for
+ * 52-bit PAs.
+ */
+ return !(level == 0 || (PAGE_SIZE != SZ_4K && level == 1));
+}
+
/**
* struct kvm_pgtable_mm_ops - Memory management callbacks.
* @zalloc_page: Allocate a single zeroed memory page.
@@ -76,30 +116,15 @@ enum kvm_pgtable_stage2_flags {
};
/**
- * struct kvm_pgtable - KVM page-table.
- * @ia_bits: Maximum input address size, in bits.
- * @start_level: Level at which the page-table walk starts.
- * @pgd: Pointer to the first top-level entry of the page-table.
- * @mm_ops: Memory management callbacks.
- * @mmu: Stage-2 KVM MMU struct. Unused for stage-1 page-tables.
- */
-struct kvm_pgtable {
- u32 ia_bits;
- u32 start_level;
- kvm_pte_t *pgd;
- struct kvm_pgtable_mm_ops *mm_ops;
-
- /* Stage-2 only */
- struct kvm_s2_mmu *mmu;
- enum kvm_pgtable_stage2_flags flags;
-};
-
-/**
* enum kvm_pgtable_prot - Page-table permissions and attributes.
* @KVM_PGTABLE_PROT_X: Execute permission.
* @KVM_PGTABLE_PROT_W: Write permission.
* @KVM_PGTABLE_PROT_R: Read permission.
* @KVM_PGTABLE_PROT_DEVICE: Device attributes.
+ * @KVM_PGTABLE_PROT_SW0: Software bit 0.
+ * @KVM_PGTABLE_PROT_SW1: Software bit 1.
+ * @KVM_PGTABLE_PROT_SW2: Software bit 2.
+ * @KVM_PGTABLE_PROT_SW3: Software bit 3.
*/
enum kvm_pgtable_prot {
KVM_PGTABLE_PROT_X = BIT(0),
@@ -107,21 +132,48 @@ enum kvm_pgtable_prot {
KVM_PGTABLE_PROT_R = BIT(2),
KVM_PGTABLE_PROT_DEVICE = BIT(3),
+
+ KVM_PGTABLE_PROT_SW0 = BIT(55),
+ KVM_PGTABLE_PROT_SW1 = BIT(56),
+ KVM_PGTABLE_PROT_SW2 = BIT(57),
+ KVM_PGTABLE_PROT_SW3 = BIT(58),
};
-#define PAGE_HYP (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W)
+#define KVM_PGTABLE_PROT_RW (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W)
+#define KVM_PGTABLE_PROT_RWX (KVM_PGTABLE_PROT_RW | KVM_PGTABLE_PROT_X)
+
+#define PKVM_HOST_MEM_PROT KVM_PGTABLE_PROT_RWX
+#define PKVM_HOST_MMIO_PROT KVM_PGTABLE_PROT_RW
+
+#define PAGE_HYP KVM_PGTABLE_PROT_RW
#define PAGE_HYP_EXEC (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_X)
#define PAGE_HYP_RO (KVM_PGTABLE_PROT_R)
#define PAGE_HYP_DEVICE (PAGE_HYP | KVM_PGTABLE_PROT_DEVICE)
+typedef bool (*kvm_pgtable_force_pte_cb_t)(u64 addr, u64 end,
+ enum kvm_pgtable_prot prot);
+
/**
- * struct kvm_mem_range - Range of Intermediate Physical Addresses
- * @start: Start of the range.
- * @end: End of the range.
+ * struct kvm_pgtable - KVM page-table.
+ * @ia_bits: Maximum input address size, in bits.
+ * @start_level: Level at which the page-table walk starts.
+ * @pgd: Pointer to the first top-level entry of the page-table.
+ * @mm_ops: Memory management callbacks.
+ * @mmu: Stage-2 KVM MMU struct. Unused for stage-1 page-tables.
+ * @flags: Stage-2 page-table flags.
+ * @force_pte_cb: Function that returns true if page level mappings must
+ * be used instead of block mappings.
*/
-struct kvm_mem_range {
- u64 start;
- u64 end;
+struct kvm_pgtable {
+ u32 ia_bits;
+ u32 start_level;
+ kvm_pte_t *pgd;
+ struct kvm_pgtable_mm_ops *mm_ops;
+
+ /* Stage-2 only */
+ struct kvm_s2_mmu *mmu;
+ enum kvm_pgtable_stage2_flags flags;
+ kvm_pgtable_force_pte_cb_t force_pte_cb;
};
/**
@@ -216,21 +268,24 @@ int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift);
/**
- * kvm_pgtable_stage2_init_flags() - Initialise a guest stage-2 page-table.
+ * __kvm_pgtable_stage2_init() - Initialise a guest stage-2 page-table.
* @pgt: Uninitialised page-table structure to initialise.
* @arch: Arch-specific KVM structure representing the guest virtual
* machine.
* @mm_ops: Memory management callbacks.
* @flags: Stage-2 configuration flags.
+ * @force_pte_cb: Function that returns true if page level mappings must
+ * be used instead of block mappings.
*
* Return: 0 on success, negative error code on failure.
*/
-int kvm_pgtable_stage2_init_flags(struct kvm_pgtable *pgt, struct kvm_arch *arch,
- struct kvm_pgtable_mm_ops *mm_ops,
- enum kvm_pgtable_stage2_flags flags);
+int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_arch *arch,
+ struct kvm_pgtable_mm_ops *mm_ops,
+ enum kvm_pgtable_stage2_flags flags,
+ kvm_pgtable_force_pte_cb_t force_pte_cb);
#define kvm_pgtable_stage2_init(pgt, arch, mm_ops) \
- kvm_pgtable_stage2_init_flags(pgt, arch, mm_ops, 0)
+ __kvm_pgtable_stage2_init(pgt, arch, mm_ops, 0, NULL)
/**
* kvm_pgtable_stage2_destroy() - Destroy an unused guest stage-2 page-table.
@@ -374,7 +429,8 @@ kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr);
* If there is a valid, leaf page-table entry used to translate @addr, then
* relax the permissions in that entry according to the read, write and
* execute permissions specified by @prot. No permissions are removed, and
- * TLB invalidation is performed after updating the entry.
+ * TLB invalidation is performed after updating the entry. Software bits cannot
+ * be set or cleared using kvm_pgtable_stage2_relax_perms().
*
* Return: 0 on success, negative error code on failure.
*/
@@ -433,22 +489,42 @@ int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
struct kvm_pgtable_walker *walker);
/**
- * kvm_pgtable_stage2_find_range() - Find a range of Intermediate Physical
- * Addresses with compatible permission
- * attributes.
- * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*().
- * @addr: Address that must be covered by the range.
- * @prot: Protection attributes that the range must be compatible with.
- * @range: Range structure used to limit the search space at call time and
- * that will hold the result.
+ * kvm_pgtable_get_leaf() - Walk a page-table and retrieve the leaf entry
+ * with its level.
+ * @pgt: Page-table structure initialised by kvm_pgtable_*_init()
+ * or a similar initialiser.
+ * @addr: Input address for the start of the walk.
+ * @ptep: Pointer to storage for the retrieved PTE.
+ * @level: Pointer to storage for the level of the retrieved PTE.
+ *
+ * The offset of @addr within a page is ignored.
*
- * The offset of @addr within a page is ignored. An IPA is compatible with @prot
- * iff its corresponding stage-2 page-table entry has default ownership and, if
- * valid, is mapped with protection attributes identical to @prot.
+ * The walker will walk the page-table entries corresponding to the input
+ * address specified, retrieving the leaf corresponding to this address.
+ * Invalid entries are treated as leaf entries.
*
* Return: 0 on success, negative error code on failure.
*/
-int kvm_pgtable_stage2_find_range(struct kvm_pgtable *pgt, u64 addr,
- enum kvm_pgtable_prot prot,
- struct kvm_mem_range *range);
+int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr,
+ kvm_pte_t *ptep, u32 *level);
+
+/**
+ * kvm_pgtable_stage2_pte_prot() - Retrieve the protection attributes of a
+ * stage-2 Page-Table Entry.
+ * @pte: Page-table entry
+ *
+ * Return: protection attributes of the page-table entry in the enum
+ * kvm_pgtable_prot format.
+ */
+enum kvm_pgtable_prot kvm_pgtable_stage2_pte_prot(kvm_pte_t pte);
+
+/**
+ * kvm_pgtable_hyp_pte_prot() - Retrieve the protection attributes of a stage-1
+ * Page-Table Entry.
+ * @pte: Page-table entry
+ *
+ * Return: protection attributes of the page-table entry in the enum
+ * kvm_pgtable_prot format.
+ */
+enum kvm_pgtable_prot kvm_pgtable_hyp_pte_prot(kvm_pte_t pte);
#endif /* __ARM64_KVM_PGTABLE_H__ */
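The switch from kvm_pgtable_stage2_find_range() to kvm_pgtable_get_leaf() plus the *_pte_prot() accessors gives callers a generic way to read back a leaf entry and its attributes. A minimal usage sketch, assuming only the declarations above; the example_read_leaf_prot() helper and its error handling are illustrative, not part of the patch:

#include <linux/errno.h>
#include <asm/kvm_pgtable.h>

/* Read the protection attributes of the stage-2 leaf entry mapping @ipa. */
static int example_read_leaf_prot(struct kvm_pgtable *pgt, u64 ipa,
				  enum kvm_pgtable_prot *prot)
{
	kvm_pte_t pte;
	u32 level;
	int ret;

	ret = kvm_pgtable_get_leaf(pgt, ipa, &pte, &level);
	if (ret)
		return ret;

	/* Invalid entries are returned as leaves; treat them as unmapped. */
	if (!kvm_pte_valid(pte))
		return -ENOENT;

	*prot = kvm_pgtable_stage2_pte_prot(pte);
	return 0;
}

This is essentially the shape of host_stage2_adjust_range() and __pkvm_host_share_hyp() later in this series, which combine the leaf lookup with the page-state bits described in nvhe/mem_protect.h.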
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index f2e06e7c0a31..b268082d67ed 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -784,14 +784,13 @@
#define ID_AA64PFR0_AMU 0x1
#define ID_AA64PFR0_SVE 0x1
#define ID_AA64PFR0_RAS_V1 0x1
+#define ID_AA64PFR0_RAS_V1P1 0x2
#define ID_AA64PFR0_FP_NI 0xf
#define ID_AA64PFR0_FP_SUPPORTED 0x0
#define ID_AA64PFR0_ASIMD_NI 0xf
#define ID_AA64PFR0_ASIMD_SUPPORTED 0x0
-#define ID_AA64PFR0_EL1_64BIT_ONLY 0x1
-#define ID_AA64PFR0_EL1_32BIT_64BIT 0x2
-#define ID_AA64PFR0_EL0_64BIT_ONLY 0x1
-#define ID_AA64PFR0_EL0_32BIT_64BIT 0x2
+#define ID_AA64PFR0_ELx_64BIT_ONLY 0x1
+#define ID_AA64PFR0_ELx_32BIT_64BIT 0x2
/* id_aa64pfr1 */
#define ID_AA64PFR1_MPAMFRAC_SHIFT 16
@@ -847,6 +846,9 @@
#define ID_AA64MMFR0_ASID_SHIFT 4
#define ID_AA64MMFR0_PARANGE_SHIFT 0
+#define ID_AA64MMFR0_ASID_8 0x0
+#define ID_AA64MMFR0_ASID_16 0x2
+
#define ID_AA64MMFR0_TGRAN4_NI 0xf
#define ID_AA64MMFR0_TGRAN4_SUPPORTED_MIN 0x0
#define ID_AA64MMFR0_TGRAN4_SUPPORTED_MAX 0x7
@@ -857,9 +859,16 @@
#define ID_AA64MMFR0_TGRAN16_SUPPORTED_MIN 0x1
#define ID_AA64MMFR0_TGRAN16_SUPPORTED_MAX 0xf
+#define ID_AA64MMFR0_PARANGE_32 0x0
+#define ID_AA64MMFR0_PARANGE_36 0x1
+#define ID_AA64MMFR0_PARANGE_40 0x2
+#define ID_AA64MMFR0_PARANGE_42 0x3
+#define ID_AA64MMFR0_PARANGE_44 0x4
#define ID_AA64MMFR0_PARANGE_48 0x5
#define ID_AA64MMFR0_PARANGE_52 0x6
+#define ARM64_MIN_PARANGE_BITS 32
+
#define ID_AA64MMFR0_TGRAN_2_SUPPORTED_DEFAULT 0x0
#define ID_AA64MMFR0_TGRAN_2_SUPPORTED_NONE 0x1
#define ID_AA64MMFR0_TGRAN_2_SUPPORTED_MIN 0x2
@@ -904,6 +913,7 @@
#define ID_AA64MMFR2_CNP_SHIFT 0
/* id_aa64dfr0 */
+#define ID_AA64DFR0_MTPMU_SHIFT 48
#define ID_AA64DFR0_TRBE_SHIFT 44
#define ID_AA64DFR0_TRACE_FILT_SHIFT 40
#define ID_AA64DFR0_DOUBLELOCK_SHIFT 36
@@ -1034,14 +1044,17 @@
#define ID_AA64MMFR0_TGRAN_SHIFT ID_AA64MMFR0_TGRAN4_SHIFT
#define ID_AA64MMFR0_TGRAN_SUPPORTED_MIN ID_AA64MMFR0_TGRAN4_SUPPORTED_MIN
#define ID_AA64MMFR0_TGRAN_SUPPORTED_MAX ID_AA64MMFR0_TGRAN4_SUPPORTED_MAX
+#define ID_AA64MMFR0_TGRAN_2_SHIFT ID_AA64MMFR0_TGRAN4_2_SHIFT
#elif defined(CONFIG_ARM64_16K_PAGES)
#define ID_AA64MMFR0_TGRAN_SHIFT ID_AA64MMFR0_TGRAN16_SHIFT
#define ID_AA64MMFR0_TGRAN_SUPPORTED_MIN ID_AA64MMFR0_TGRAN16_SUPPORTED_MIN
#define ID_AA64MMFR0_TGRAN_SUPPORTED_MAX ID_AA64MMFR0_TGRAN16_SUPPORTED_MAX
+#define ID_AA64MMFR0_TGRAN_2_SHIFT ID_AA64MMFR0_TGRAN16_2_SHIFT
#elif defined(CONFIG_ARM64_64K_PAGES)
#define ID_AA64MMFR0_TGRAN_SHIFT ID_AA64MMFR0_TGRAN64_SHIFT
#define ID_AA64MMFR0_TGRAN_SUPPORTED_MIN ID_AA64MMFR0_TGRAN64_SUPPORTED_MIN
#define ID_AA64MMFR0_TGRAN_SUPPORTED_MAX ID_AA64MMFR0_TGRAN64_SUPPORTED_MAX
+#define ID_AA64MMFR0_TGRAN_2_SHIFT ID_AA64MMFR0_TGRAN64_2_SHIFT
#endif
#define MVFR2_FPMISC_SHIFT 4
@@ -1172,6 +1185,11 @@
#define ICH_VTR_A3V_SHIFT 21
#define ICH_VTR_A3V_MASK (1 << ICH_VTR_A3V_SHIFT)
+#define ARM64_FEATURE_FIELD_BITS 4
+
+/* Create a mask for the feature bits of the specified feature. */
+#define ARM64_FEATURE_MASK(x) (GENMASK_ULL(x##_SHIFT + ARM64_FEATURE_FIELD_BITS - 1, x##_SHIFT))
+
#ifdef __ASSEMBLY__
.irp num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30
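ARM64_FEATURE_MASK() builds a 64-bit mask over the 4-bit ID-register field named by its _SHIFT suffix. A hedged sketch of how such a mask pairs with FIELD_GET() from <linux/bitfield.h>; the example_el1_has_aarch32() helper and the pre-read pfr0 value are assumptions made for illustration only:

#include <linux/bitfield.h>
#include <asm/sysreg.h>

/* Does EL1 support AArch32, according to a previously read ID_AA64PFR0_EL1? */
static bool example_el1_has_aarch32(u64 pfr0)
{
	u64 el1 = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1), pfr0);

	return el1 >= ID_AA64PFR0_ELx_32BIT_64BIT;
}

The >= comparison mirrors the min_field_value logic used by the cpufeature entries updated above.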
diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
index b5f08621fa29..190b494e22ab 100644
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -430,17 +430,6 @@ extern unsigned long __must_check __arch_copy_to_user(void __user *to, const voi
__actu_ret; \
})
-extern unsigned long __must_check __arch_copy_in_user(void __user *to, const void __user *from, unsigned long n);
-#define raw_copy_in_user(to, from, n) \
-({ \
- unsigned long __aciu_ret; \
- uaccess_ttbr0_enable(); \
- __aciu_ret = __arch_copy_in_user(__uaccess_mask_ptr(to), \
- __uaccess_mask_ptr(from), (n)); \
- uaccess_ttbr0_disable(); \
- __aciu_ret; \
-})
-
#define INLINE_COPY_TO_USER
#define INLINE_COPY_FROM_USER
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index 4e99e4b912ef..844f6ae58662 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -649,11 +649,11 @@ __SYSCALL(__NR_inotify_add_watch, sys_inotify_add_watch)
#define __NR_inotify_rm_watch 318
__SYSCALL(__NR_inotify_rm_watch, sys_inotify_rm_watch)
#define __NR_mbind 319
-__SYSCALL(__NR_mbind, compat_sys_mbind)
+__SYSCALL(__NR_mbind, sys_mbind)
#define __NR_get_mempolicy 320
-__SYSCALL(__NR_get_mempolicy, compat_sys_get_mempolicy)
+__SYSCALL(__NR_get_mempolicy, sys_get_mempolicy)
#define __NR_set_mempolicy 321
-__SYSCALL(__NR_set_mempolicy, compat_sys_set_mempolicy)
+__SYSCALL(__NR_set_mempolicy, sys_set_mempolicy)
#define __NR_openat 322
__SYSCALL(__NR_openat, compat_sys_openat)
#define __NR_mkdirat 323
@@ -699,7 +699,7 @@ __SYSCALL(__NR_tee, sys_tee)
#define __NR_vmsplice 343
__SYSCALL(__NR_vmsplice, sys_vmsplice)
#define __NR_move_pages 344
-__SYSCALL(__NR_move_pages, compat_sys_move_pages)
+__SYSCALL(__NR_move_pages, sys_move_pages)
#define __NR_getcpu 345
__SYSCALL(__NR_getcpu, sys_getcpu)
#define __NR_epoll_pwait 346
@@ -811,7 +811,7 @@ __SYSCALL(__NR_rseq, sys_rseq)
#define __NR_io_pgetevents 399
__SYSCALL(__NR_io_pgetevents, compat_sys_io_pgetevents)
#define __NR_migrate_pages 400
-__SYSCALL(__NR_migrate_pages, compat_sys_migrate_pages)
+__SYSCALL(__NR_migrate_pages, sys_migrate_pages)
#define __NR_kexec_file_load 401
__SYSCALL(__NR_kexec_file_load, sys_kexec_file_load)
/* 402 is unused */
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index b2770d753ba3..f8a3067d10c6 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -240,8 +240,8 @@ static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = {
S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_FP_SHIFT, 4, ID_AA64PFR0_FP_NI),
ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL3_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL2_SHIFT, 4, 0),
- ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_SHIFT, 4, ID_AA64PFR0_EL1_64BIT_ONLY),
- ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL0_SHIFT, 4, ID_AA64PFR0_EL0_64BIT_ONLY),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_SHIFT, 4, ID_AA64PFR0_ELx_64BIT_ONLY),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL0_SHIFT, 4, ID_AA64PFR0_ELx_64BIT_ONLY),
ARM64_FTR_END,
};
@@ -1983,7 +1983,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
.sys_reg = SYS_ID_AA64PFR0_EL1,
.sign = FTR_UNSIGNED,
.field_pos = ID_AA64PFR0_EL0_SHIFT,
- .min_field_value = ID_AA64PFR0_EL0_32BIT_64BIT,
+ .min_field_value = ID_AA64PFR0_ELx_32BIT_64BIT,
},
#ifdef CONFIG_KVM
{
@@ -1994,7 +1994,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
.sys_reg = SYS_ID_AA64PFR0_EL1,
.sign = FTR_UNSIGNED,
.field_pos = ID_AA64PFR0_EL1_SHIFT,
- .min_field_value = ID_AA64PFR0_EL1_32BIT_64BIT,
+ .min_field_value = ID_AA64PFR0_ELx_32BIT_64BIT,
},
{
.desc = "Protected KVM",
diff --git a/arch/arm64/kernel/pci.c b/arch/arm64/kernel/pci.c
index 1006ed2d7c60..2276689b5411 100644
--- a/arch/arm64/kernel/pci.c
+++ b/arch/arm64/kernel/pci.c
@@ -82,14 +82,29 @@ int acpi_pci_bus_find_domain_nr(struct pci_bus *bus)
int pcibios_root_bridge_prepare(struct pci_host_bridge *bridge)
{
- if (!acpi_disabled) {
- struct pci_config_window *cfg = bridge->bus->sysdata;
- struct acpi_device *adev = to_acpi_device(cfg->parent);
- struct device *bus_dev = &bridge->bus->dev;
+ struct pci_config_window *cfg;
+ struct acpi_device *adev;
+ struct device *bus_dev;
- ACPI_COMPANION_SET(&bridge->dev, adev);
- set_dev_node(bus_dev, acpi_get_node(acpi_device_handle(adev)));
- }
+ if (acpi_disabled)
+ return 0;
+
+ cfg = bridge->bus->sysdata;
+
+ /*
+ * On Hyper-V there is no corresponding ACPI device for a root bridge,
+ * therefore ->parent is set to NULL by the driver. In that case, set
+ * 'adev' to NULL as well, since there is no proper ACPI device.
+ */
+ if (!cfg->parent)
+ adev = NULL;
+ else
+ adev = to_acpi_device(cfg->parent);
+
+ bus_dev = &bridge->bus->dev;
+
+ ACPI_COMPANION_SET(&bridge->dev, adev);
+ set_dev_node(bus_dev, acpi_get_node(acpi_device_handle(adev)));
return 0;
}
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
index 709d2c433c5e..f6b1a88245db 100644
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -181,6 +181,8 @@ SECTIONS
/* everything from this point to __init_begin will be marked RO NX */
RO_DATA(PAGE_SIZE)
+ HYPERVISOR_DATA_SECTIONS
+
idmap_pg_dir = .;
. += IDMAP_DIR_SIZE;
idmap_pg_end = .;
@@ -260,8 +262,6 @@ SECTIONS
_sdata = .;
RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_ALIGN)
- HYPERVISOR_DATA_SECTIONS
-
/*
* Data written with the MMU off but read with the MMU on requires
* cache lines to be invalidated, discarding up to a Cache Writeback
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index a4eba0908bfa..d7eec0b43744 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -26,6 +26,7 @@ menuconfig KVM
select HAVE_KVM_ARCH_TLB_FLUSH_ALL
select KVM_MMIO
select KVM_GENERIC_DIRTYLOG_READ_PROTECT
+ select KVM_XFER_TO_GUEST_WORK
select SRCU
select KVM_VFIO
select HAVE_KVM_EVENTFD
@@ -46,6 +47,15 @@ if KVM
source "virt/kvm/Kconfig"
+config NVHE_EL2_DEBUG
+ bool "Debug mode for non-VHE EL2 object"
+ help
+ Say Y here to enable the debug mode for the non-VHE KVM EL2 object.
+ Failure reports will BUG() in the hypervisor. This is intended for
+ local EL2 hypervisor development.
+
+ If unsure, say N.
+
endif # KVM
endif # VIRTUALIZATION
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 0ca72f5cda41..fe102cd2e518 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -6,6 +6,7 @@
#include <linux/bug.h>
#include <linux/cpu_pm.h>
+#include <linux/entry-kvm.h>
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/kvm_host.h>
@@ -15,6 +16,7 @@
#include <linux/fs.h>
#include <linux/mman.h>
#include <linux/sched.h>
+#include <linux/kmemleak.h>
#include <linux/kvm.h>
#include <linux/kvm_irqfd.h>
#include <linux/irqbypass.h>
@@ -42,10 +44,6 @@
#include <kvm/arm_pmu.h>
#include <kvm/arm_psci.h>
-#ifdef REQUIRES_VIRT
-__asm__(".arch_extension virt");
-#endif
-
static enum kvm_mode kvm_mode = KVM_MODE_DEFAULT;
DEFINE_STATIC_KEY_FALSE(kvm_protected_mode_initialized);
@@ -575,7 +573,7 @@ static void update_vmid(struct kvm_vmid *vmid)
kvm_call_hyp(__kvm_flush_vm_context);
}
- vmid->vmid = kvm_next_vmid;
+ WRITE_ONCE(vmid->vmid, kvm_next_vmid);
kvm_next_vmid++;
kvm_next_vmid &= (1 << kvm_get_vmid_bits()) - 1;
@@ -719,6 +717,45 @@ static bool vcpu_mode_is_bad_32bit(struct kvm_vcpu *vcpu)
}
/**
+ * kvm_vcpu_exit_request - returns true if the VCPU should *not* enter the guest
+ * @vcpu: The VCPU pointer
+ * @ret: Pointer to write optional return code
+ *
+ * Returns: true if the VCPU needs to return to a preemptible + interruptible
+ * kernel context and skip guest entry.
+ *
+ * This function disambiguates between two different types of exits: exits to a
+ * preemptible + interruptible kernel context and exits to userspace. For an
+ * exit to userspace, this function will write the return code to ret and return
+ * true. For an exit to preemptible + interruptible kernel context (i.e. check
+ * for pending work and re-enter), return true without writing to ret.
+ */
+static bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu, int *ret)
+{
+ struct kvm_run *run = vcpu->run;
+
+ /*
+ * If we're using a userspace irqchip, then check if we need
+ * to tell a userspace irqchip about timer or PMU level
+ * changes and if so, exit to userspace (the actual level
+ * state gets updated in kvm_timer_update_run and
+ * kvm_pmu_update_run below).
+ */
+ if (static_branch_unlikely(&userspace_irqchip_in_use)) {
+ if (kvm_timer_should_notify_user(vcpu) ||
+ kvm_pmu_should_notify_user(vcpu)) {
+ *ret = -EINTR;
+ run->exit_reason = KVM_EXIT_INTR;
+ return true;
+ }
+ }
+
+ return kvm_request_pending(vcpu) ||
+ need_new_vmid_gen(&vcpu->arch.hw_mmu->vmid) ||
+ xfer_to_guest_mode_work_pending();
+}
+
+/**
* kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code
* @vcpu: The VCPU pointer
*
@@ -761,7 +798,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
/*
* Check conditions before entering the guest
*/
- cond_resched();
+ ret = xfer_to_guest_mode_handle_work(vcpu);
+ if (!ret)
+ ret = 1;
update_vmid(&vcpu->arch.hw_mmu->vmid);
@@ -781,30 +820,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
kvm_vgic_flush_hwstate(vcpu);
/*
- * Exit if we have a signal pending so that we can deliver the
- * signal to user space.
- */
- if (signal_pending(current)) {
- ret = -EINTR;
- run->exit_reason = KVM_EXIT_INTR;
- }
-
- /*
- * If we're using a userspace irqchip, then check if we need
- * to tell a userspace irqchip about timer or PMU level
- * changes and if so, exit to userspace (the actual level
- * state gets updated in kvm_timer_update_run and
- * kvm_pmu_update_run below).
- */
- if (static_branch_unlikely(&userspace_irqchip_in_use)) {
- if (kvm_timer_should_notify_user(vcpu) ||
- kvm_pmu_should_notify_user(vcpu)) {
- ret = -EINTR;
- run->exit_reason = KVM_EXIT_INTR;
- }
- }
-
- /*
* Ensure we set mode to IN_GUEST_MODE after we disable
* interrupts and before the final VCPU requests check.
* See the comment in kvm_vcpu_exiting_guest_mode() and
@@ -812,8 +827,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
*/
smp_store_mb(vcpu->mode, IN_GUEST_MODE);
- if (ret <= 0 || need_new_vmid_gen(&vcpu->arch.hw_mmu->vmid) ||
- kvm_request_pending(vcpu)) {
+ if (ret <= 0 || kvm_vcpu_exit_request(vcpu, &ret)) {
vcpu->mode = OUTSIDE_GUEST_MODE;
isb(); /* Ensure work in x_flush_hwstate is committed */
kvm_pmu_sync_hwstate(vcpu);
@@ -1039,7 +1053,7 @@ static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
const struct kvm_vcpu_init *init)
{
unsigned int i, ret;
- int phys_target = kvm_target_cpu();
+ u32 phys_target = kvm_target_cpu();
if (init->target != phys_target)
return -EINVAL;
@@ -1108,6 +1122,7 @@ static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
}
vcpu_reset_hcr(vcpu);
+ vcpu->arch.cptr_el2 = CPTR_EL2_DEFAULT;
/*
* Handle the "start in power-off" case.
@@ -1219,6 +1234,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
if (copy_from_user(&reg, argp, sizeof(reg)))
break;
+ /*
+ * We could owe a reset due to PSCI. Handle the pending reset
+ * here to ensure userspace register accesses are ordered after
+ * the reset.
+ */
+ if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu))
+ kvm_reset_vcpu(vcpu);
+
if (ioctl == KVM_SET_ONE_REG)
r = kvm_arm_set_reg(vcpu, &reg);
else
@@ -1700,11 +1723,6 @@ static bool init_psci_relay(void)
return true;
}
-static int init_common_resources(void)
-{
- return kvm_set_ipa_limit();
-}
-
static int init_subsystems(void)
{
int err = 0;
@@ -1958,56 +1976,17 @@ static void _kvm_host_prot_finalize(void *discard)
WARN_ON(kvm_call_hyp_nvhe(__pkvm_prot_finalize));
}
-static inline int pkvm_mark_hyp(phys_addr_t start, phys_addr_t end)
-{
- return kvm_call_hyp_nvhe(__pkvm_mark_hyp, start, end);
-}
-
-#define pkvm_mark_hyp_section(__section) \
- pkvm_mark_hyp(__pa_symbol(__section##_start), \
- __pa_symbol(__section##_end))
-
static int finalize_hyp_mode(void)
{
- int cpu, ret;
-
if (!is_protected_kvm_enabled())
return 0;
- ret = pkvm_mark_hyp_section(__hyp_idmap_text);
- if (ret)
- return ret;
-
- ret = pkvm_mark_hyp_section(__hyp_text);
- if (ret)
- return ret;
-
- ret = pkvm_mark_hyp_section(__hyp_rodata);
- if (ret)
- return ret;
-
- ret = pkvm_mark_hyp_section(__hyp_bss);
- if (ret)
- return ret;
-
- ret = pkvm_mark_hyp(hyp_mem_base, hyp_mem_base + hyp_mem_size);
- if (ret)
- return ret;
-
- for_each_possible_cpu(cpu) {
- phys_addr_t start = virt_to_phys((void *)kvm_arm_hyp_percpu_base[cpu]);
- phys_addr_t end = start + (PAGE_SIZE << nvhe_percpu_order());
-
- ret = pkvm_mark_hyp(start, end);
- if (ret)
- return ret;
-
- start = virt_to_phys((void *)per_cpu(kvm_arm_hyp_stack_page, cpu));
- end = start + PAGE_SIZE;
- ret = pkvm_mark_hyp(start, end);
- if (ret)
- return ret;
- }
+ /*
+ * Exclude HYP BSS from kmemleak so that it doesn't get peeked
+ * at, which would end badly once the section is inaccessible.
+ * None of the other sections should ever be introspected.
+ */
+ kmemleak_free_part(__hyp_bss_start, __hyp_bss_end - __hyp_bss_start);
/*
* Flip the static key upfront as that may no longer be possible
@@ -2019,11 +1998,6 @@ static int finalize_hyp_mode(void)
return 0;
}
-static void check_kvm_target_cpu(void *ret)
-{
- *(int *)ret = kvm_target_cpu();
-}
-
struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr)
{
struct kvm_vcpu *vcpu;
@@ -2083,7 +2057,6 @@ void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *cons)
int kvm_arch_init(void *opaque)
{
int err;
- int ret, cpu;
bool in_hyp_mode;
if (!is_hyp_mode_available()) {
@@ -2098,15 +2071,7 @@ int kvm_arch_init(void *opaque)
kvm_info("Guests without required CPU erratum workarounds can deadlock system!\n" \
"Only trusted guests should be used on this system.\n");
- for_each_online_cpu(cpu) {
- smp_call_function_single(cpu, check_kvm_target_cpu, &ret, 1);
- if (ret < 0) {
- kvm_err("Error, CPU %d not supported!\n", cpu);
- return -ENODEV;
- }
- }
-
- err = init_common_resources();
+ err = kvm_set_ipa_limit();
if (err)
return err;
diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c
index d5e79d7ee6e9..db9361338b2a 100644
--- a/arch/arm64/kvm/debug.c
+++ b/arch/arm64/kvm/debug.c
@@ -21,7 +21,7 @@
DBG_MDSCR_KDE | \
DBG_MDSCR_MDE)
-static DEFINE_PER_CPU(u32, mdcr_el2);
+static DEFINE_PER_CPU(u64, mdcr_el2);
/**
* save/restore_guest_debug_regs
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index 1dfb83578277..5ce26bedf23c 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -31,8 +31,6 @@
const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
KVM_GENERIC_VM_STATS()
};
-static_assert(ARRAY_SIZE(kvm_vm_stats_desc) ==
- sizeof(struct kvm_vm_stat) / sizeof(u64));
const struct kvm_stats_header kvm_vm_stats_header = {
.name_size = KVM_STATS_NAME_SIZE,
@@ -50,10 +48,9 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
STATS_DESC_COUNTER(VCPU, wfi_exit_stat),
STATS_DESC_COUNTER(VCPU, mmio_exit_user),
STATS_DESC_COUNTER(VCPU, mmio_exit_kernel),
+ STATS_DESC_COUNTER(VCPU, signal_exits),
STATS_DESC_COUNTER(VCPU, exits)
};
-static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) ==
- sizeof(struct kvm_vcpu_stat) / sizeof(u64));
const struct kvm_stats_header kvm_vcpu_stats_header = {
.name_size = KVM_STATS_NAME_SIZE,
@@ -842,7 +839,7 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
return 0;
}
-int __attribute_const__ kvm_target_cpu(void)
+u32 __attribute_const__ kvm_target_cpu(void)
{
unsigned long implementor = read_cpuid_implementor();
unsigned long part_number = read_cpuid_part_number();
@@ -874,7 +871,7 @@ int __attribute_const__ kvm_target_cpu(void)
int kvm_vcpu_preferred_target(struct kvm_vcpu_init *init)
{
- int target = kvm_target_cpu();
+ u32 target = kvm_target_cpu();
if (target < 0)
return -ENODEV;
diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
index 6f48336b1d86..275a27368a04 100644
--- a/arch/arm64/kvm/handle_exit.c
+++ b/arch/arm64/kvm/handle_exit.c
@@ -113,34 +113,20 @@ static int kvm_handle_wfx(struct kvm_vcpu *vcpu)
* guest and host are using the same debug facilities it will be up to
* userspace to re-inject the correct exception for guest delivery.
*
- * @return: 0 (while setting vcpu->run->exit_reason), -1 for error
+ * @return: 0 (while setting vcpu->run->exit_reason)
*/
static int kvm_handle_guest_debug(struct kvm_vcpu *vcpu)
{
struct kvm_run *run = vcpu->run;
u32 esr = kvm_vcpu_get_esr(vcpu);
- int ret = 0;
run->exit_reason = KVM_EXIT_DEBUG;
run->debug.arch.hsr = esr;
- switch (ESR_ELx_EC(esr)) {
- case ESR_ELx_EC_WATCHPT_LOW:
+ if (ESR_ELx_EC(esr) == ESR_ELx_EC_WATCHPT_LOW)
run->debug.arch.far = vcpu->arch.fault.far_el2;
- fallthrough;
- case ESR_ELx_EC_SOFTSTP_LOW:
- case ESR_ELx_EC_BREAKPT_LOW:
- case ESR_ELx_EC_BKPT32:
- case ESR_ELx_EC_BRK64:
- break;
- default:
- kvm_err("%s: un-handled case esr: %#08x\n",
- __func__, (unsigned int) esr);
- ret = -1;
- break;
- }
- return ret;
+ return 0;
}
static int kvm_handle_unknown_ec(struct kvm_vcpu *vcpu)
@@ -292,11 +278,12 @@ void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index)
kvm_handle_guest_serror(vcpu, kvm_vcpu_get_esr(vcpu));
}
-void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, u64 elr,
+void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr,
+ u64 elr_virt, u64 elr_phys,
u64 par, uintptr_t vcpu,
u64 far, u64 hpfar) {
- u64 elr_in_kimg = __phys_to_kimg(__hyp_pa(elr));
- u64 hyp_offset = elr_in_kimg - kaslr_offset() - elr;
+ u64 elr_in_kimg = __phys_to_kimg(elr_phys);
+ u64 hyp_offset = elr_in_kimg - kaslr_offset() - elr_virt;
u64 mode = spsr & PSR_MODE_MASK;
/*
@@ -309,20 +296,24 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, u64 elr,
kvm_err("Invalid host exception to nVHE hyp!\n");
} else if (ESR_ELx_EC(esr) == ESR_ELx_EC_BRK64 &&
(esr & ESR_ELx_BRK64_ISS_COMMENT_MASK) == BUG_BRK_IMM) {
- struct bug_entry *bug = find_bug(elr_in_kimg);
const char *file = NULL;
unsigned int line = 0;
/* All hyp bugs, including warnings, are treated as fatal. */
- if (bug)
- bug_get_file_line(bug, &file, &line);
+ if (!is_protected_kvm_enabled() ||
+ IS_ENABLED(CONFIG_NVHE_EL2_DEBUG)) {
+ struct bug_entry *bug = find_bug(elr_in_kimg);
+
+ if (bug)
+ bug_get_file_line(bug, &file, &line);
+ }
if (file)
kvm_err("nVHE hyp BUG at: %s:%u!\n", file, line);
else
- kvm_err("nVHE hyp BUG at: %016llx!\n", elr + hyp_offset);
+ kvm_err("nVHE hyp BUG at: %016llx!\n", elr_virt + hyp_offset);
} else {
- kvm_err("nVHE hyp panic at: %016llx!\n", elr + hyp_offset);
+ kvm_err("nVHE hyp panic at: %016llx!\n", elr_virt + hyp_offset);
}
/*
@@ -334,5 +325,5 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, u64 elr,
kvm_err("Hyp Offset: 0x%llx\n", hyp_offset);
panic("HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%016lx\n",
- spsr, elr, esr, far, hpfar, par, vcpu);
+ spsr, elr_virt, esr, far, hpfar, par, vcpu);
}
diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
index e4a2f295a394..a0e78a6027be 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -92,11 +92,15 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu)
write_sysreg(0, pmselr_el0);
write_sysreg(ARMV8_PMU_USERENR_MASK, pmuserenr_el0);
}
+
+ vcpu->arch.mdcr_el2_host = read_sysreg(mdcr_el2);
write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2);
}
-static inline void __deactivate_traps_common(void)
+static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu)
{
+ write_sysreg(vcpu->arch.mdcr_el2_host, mdcr_el2);
+
write_sysreg(0, hstr_el2);
if (kvm_arm_support_pmu_v3())
write_sysreg(0, pmuserenr_el0);
diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
index 9c227d87c36d..b58c910babaf 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
@@ -12,6 +12,32 @@
#include <asm/virt.h>
#include <nvhe/spinlock.h>
+/*
+ * SW bits 0-1 are reserved to track the memory ownership state of each page:
+ * 00: The page is owned exclusively by the page-table owner.
+ * 01: The page is owned by the page-table owner, but is shared
+ * with another entity.
+ * 10: The page is shared with, but not owned by the page-table owner.
+ * 11: Reserved for future use (lending).
+ */
+enum pkvm_page_state {
+ PKVM_PAGE_OWNED = 0ULL,
+ PKVM_PAGE_SHARED_OWNED = KVM_PGTABLE_PROT_SW0,
+ PKVM_PAGE_SHARED_BORROWED = KVM_PGTABLE_PROT_SW1,
+};
+
+#define PKVM_PAGE_STATE_PROT_MASK (KVM_PGTABLE_PROT_SW0 | KVM_PGTABLE_PROT_SW1)
+static inline enum kvm_pgtable_prot pkvm_mkstate(enum kvm_pgtable_prot prot,
+ enum pkvm_page_state state)
+{
+ return (prot & ~PKVM_PAGE_STATE_PROT_MASK) | state;
+}
+
+static inline enum pkvm_page_state pkvm_getstate(enum kvm_pgtable_prot prot)
+{
+ return prot & PKVM_PAGE_STATE_PROT_MASK;
+}
+
struct host_kvm {
struct kvm_arch arch;
struct kvm_pgtable pgt;
@@ -20,16 +46,21 @@ struct host_kvm {
};
extern struct host_kvm host_kvm;
+extern const u8 pkvm_hyp_id;
+
int __pkvm_prot_finalize(void);
-int __pkvm_mark_hyp(phys_addr_t start, phys_addr_t end);
+int __pkvm_host_share_hyp(u64 pfn);
+bool addr_is_memory(phys_addr_t phys);
+int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot);
+int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id);
int kvm_host_prepare_stage2(void *pgt_pool_base);
void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt);
static __always_inline void __load_host_stage2(void)
{
if (static_branch_likely(&kvm_protected_mode_initialized))
- __load_stage2(&host_kvm.arch.mmu, host_kvm.arch.vtcr);
+ __load_stage2(&host_kvm.arch.mmu, &host_kvm.arch);
else
write_sysreg(0, vttbr_el2);
}
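The two software bits carry the pKVM ownership state alongside the regular permission bits, and pkvm_mkstate()/pkvm_getstate() are the only helpers expected to touch them. A small sketch of the intended round-trip, assuming nothing beyond the declarations above (the example_ function name is hypothetical):

#include <asm/kvm_pgtable.h>
#include <nvhe/mem_protect.h>

/* Annotate a hyp RW mapping as borrowed and check the encoding round-trips. */
static bool example_state_roundtrip_ok(void)
{
	enum kvm_pgtable_prot prot;

	prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_SHARED_BORROWED);

	/* SW0/SW1 hold the state; the R/W/X/DEVICE bits are left untouched. */
	return pkvm_getstate(prot) == PKVM_PAGE_SHARED_BORROWED &&
	       (prot & ~PKVM_PAGE_STATE_PROT_MASK) == PAGE_HYP;
}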
diff --git a/arch/arm64/kvm/hyp/include/nvhe/mm.h b/arch/arm64/kvm/hyp/include/nvhe/mm.h
index 8ec3a5a7744b..c9a8f535212e 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/mm.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/mm.h
@@ -23,8 +23,7 @@ int hyp_map_vectors(void);
int hyp_back_vmemmap(phys_addr_t phys, unsigned long size, phys_addr_t back);
int pkvm_cpu_set_vector(enum arm64_hyp_spectre_vector slot);
int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot);
-int __pkvm_create_mappings(unsigned long start, unsigned long size,
- unsigned long phys, enum kvm_pgtable_prot prot);
+int pkvm_create_mappings_locked(void *from, void *to, enum kvm_pgtable_prot prot);
unsigned long __pkvm_create_private_mapping(phys_addr_t phys, size_t size,
enum kvm_pgtable_prot prot);
diff --git a/arch/arm64/kvm/hyp/include/nvhe/spinlock.h b/arch/arm64/kvm/hyp/include/nvhe/spinlock.h
index 76b537f8d1c6..4652fd04bdbe 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/spinlock.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/spinlock.h
@@ -15,6 +15,7 @@
#include <asm/alternative.h>
#include <asm/lse.h>
+#include <asm/rwonce.h>
typedef union hyp_spinlock {
u32 __val;
@@ -89,4 +90,28 @@ static inline void hyp_spin_unlock(hyp_spinlock_t *lock)
: "memory");
}
+static inline bool hyp_spin_is_locked(hyp_spinlock_t *lock)
+{
+ hyp_spinlock_t lockval = READ_ONCE(*lock);
+
+ return lockval.owner != lockval.next;
+}
+
+#ifdef CONFIG_NVHE_EL2_DEBUG
+static inline void hyp_assert_lock_held(hyp_spinlock_t *lock)
+{
+ /*
+ * The __pkvm_init() path accesses protected data structures without
+ * holding locks, as the other CPUs are guaranteed not to enter EL2
+ * concurrently at this point in time. The point at which EL2 is
+ * initialized on all CPUs is reflected in the pkvm static key, so
+ * wait until it is set before checking the lock state.
+ */
+ if (static_branch_likely(&kvm_protected_mode_initialized))
+ BUG_ON(!hyp_spin_is_locked(lock));
+}
+#else
+static inline void hyp_assert_lock_held(hyp_spinlock_t *lock) { }
+#endif
+
#endif /* __ARM64_KVM_NVHE_SPINLOCK_H__ */
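hyp_assert_lock_held() turns a locking contract into a hard check when CONFIG_NVHE_EL2_DEBUG is set and compiles away otherwise. A hedged sketch of the calling convention it backs, mirroring the _locked() helpers in mem_protect.c; the example_ functions are hypothetical:

#include <nvhe/spinlock.h>

/* Callers must hold *lock; the assert documents and (in debug) enforces it. */
static void example_update_locked(hyp_spinlock_t *lock)
{
	hyp_assert_lock_held(lock);

	/* ... mutate state protected by *lock ... */
}

static void example_update(hyp_spinlock_t *lock)
{
	hyp_spin_lock(lock);
	example_update_locked(lock);
	hyp_spin_unlock(lock);
}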
diff --git a/arch/arm64/kvm/hyp/nvhe/debug-sr.c b/arch/arm64/kvm/hyp/nvhe/debug-sr.c
index 7d3f25868cae..df361d839902 100644
--- a/arch/arm64/kvm/hyp/nvhe/debug-sr.c
+++ b/arch/arm64/kvm/hyp/nvhe/debug-sr.c
@@ -109,7 +109,7 @@ void __debug_switch_to_host(struct kvm_vcpu *vcpu)
__debug_switch_to_host_common(vcpu);
}
-u32 __kvm_get_mdcr_el2(void)
+u64 __kvm_get_mdcr_el2(void)
{
return read_sysreg(mdcr_el2);
}
diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S
index 2b23400e0fb3..4b652ffb591d 100644
--- a/arch/arm64/kvm/hyp/nvhe/host.S
+++ b/arch/arm64/kvm/hyp/nvhe/host.S
@@ -7,6 +7,7 @@
#include <linux/linkage.h>
#include <asm/assembler.h>
+#include <asm/kvm_arm.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_mmu.h>
@@ -85,12 +86,24 @@ SYM_FUNC_START(__hyp_do_panic)
mov x29, x0
+#ifdef CONFIG_NVHE_EL2_DEBUG
+ /* Ensure host stage-2 is disabled */
+ mrs x0, hcr_el2
+ bic x0, x0, #HCR_VM
+ msr hcr_el2, x0
+ isb
+ tlbi vmalls12e1
+ dsb nsh
+#endif
+
/* Load the panic arguments into x0-7 */
mrs x0, esr_el2
- get_vcpu_ptr x4, x5
- mrs x5, far_el2
- mrs x6, hpfar_el2
- mov x7, xzr // Unused argument
+ mov x4, x3
+ mov x3, x2
+ hyp_pa x3, x6
+ get_vcpu_ptr x5, x6
+ mrs x6, far_el2
+ mrs x7, hpfar_el2
/* Enter the host, conditionally restoring the host context. */
cbz x29, __host_enter_without_restoring
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
index 1632f001f4ed..2da6aa8da868 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -140,14 +140,11 @@ static void handle___pkvm_cpu_set_vector(struct kvm_cpu_context *host_ctxt)
cpu_reg(host_ctxt, 1) = pkvm_cpu_set_vector(slot);
}
-static void handle___pkvm_create_mappings(struct kvm_cpu_context *host_ctxt)
+static void handle___pkvm_host_share_hyp(struct kvm_cpu_context *host_ctxt)
{
- DECLARE_REG(unsigned long, start, host_ctxt, 1);
- DECLARE_REG(unsigned long, size, host_ctxt, 2);
- DECLARE_REG(unsigned long, phys, host_ctxt, 3);
- DECLARE_REG(enum kvm_pgtable_prot, prot, host_ctxt, 4);
+ DECLARE_REG(u64, pfn, host_ctxt, 1);
- cpu_reg(host_ctxt, 1) = __pkvm_create_mappings(start, size, phys, prot);
+ cpu_reg(host_ctxt, 1) = __pkvm_host_share_hyp(pfn);
}
static void handle___pkvm_create_private_mapping(struct kvm_cpu_context *host_ctxt)
@@ -163,14 +160,6 @@ static void handle___pkvm_prot_finalize(struct kvm_cpu_context *host_ctxt)
{
cpu_reg(host_ctxt, 1) = __pkvm_prot_finalize();
}
-
-static void handle___pkvm_mark_hyp(struct kvm_cpu_context *host_ctxt)
-{
- DECLARE_REG(phys_addr_t, start, host_ctxt, 1);
- DECLARE_REG(phys_addr_t, end, host_ctxt, 2);
-
- cpu_reg(host_ctxt, 1) = __pkvm_mark_hyp(start, end);
-}
typedef void (*hcall_t)(struct kvm_cpu_context *);
#define HANDLE_FUNC(x) [__KVM_HOST_SMCCC_FUNC_##x] = (hcall_t)handle_##x
@@ -193,10 +182,9 @@ static const hcall_t host_hcall[] = {
HANDLE_FUNC(__vgic_v3_restore_aprs),
HANDLE_FUNC(__pkvm_init),
HANDLE_FUNC(__pkvm_cpu_set_vector),
- HANDLE_FUNC(__pkvm_create_mappings),
+ HANDLE_FUNC(__pkvm_host_share_hyp),
HANDLE_FUNC(__pkvm_create_private_mapping),
HANDLE_FUNC(__pkvm_prot_finalize),
- HANDLE_FUNC(__pkvm_mark_hyp),
};
static void handle_host_hcall(struct kvm_cpu_context *host_ctxt)
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index a6ce991b1467..bacd493a4eac 100644
--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -31,7 +31,7 @@ static struct hyp_pool host_s2_pool;
u64 id_aa64mmfr0_el1_sys_val;
u64 id_aa64mmfr1_el1_sys_val;
-static const u8 pkvm_hyp_id = 1;
+const u8 pkvm_hyp_id = 1;
static void *host_s2_zalloc_pages_exact(size_t size)
{
@@ -89,6 +89,8 @@ static void prepare_host_vtcr(void)
id_aa64mmfr1_el1_sys_val, phys_shift);
}
+static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot prot);
+
int kvm_host_prepare_stage2(void *pgt_pool_base)
{
struct kvm_s2_mmu *mmu = &host_kvm.arch.mmu;
@@ -101,16 +103,17 @@ int kvm_host_prepare_stage2(void *pgt_pool_base)
if (ret)
return ret;
- ret = kvm_pgtable_stage2_init_flags(&host_kvm.pgt, &host_kvm.arch,
- &host_kvm.mm_ops, KVM_HOST_S2_FLAGS);
+ ret = __kvm_pgtable_stage2_init(&host_kvm.pgt, &host_kvm.arch,
+ &host_kvm.mm_ops, KVM_HOST_S2_FLAGS,
+ host_stage2_force_pte_cb);
if (ret)
return ret;
mmu->pgd_phys = __hyp_pa(host_kvm.pgt.pgd);
mmu->arch = &host_kvm.arch;
mmu->pgt = &host_kvm.pgt;
- mmu->vmid.vmid_gen = 0;
- mmu->vmid.vmid = 0;
+ WRITE_ONCE(mmu->vmid.vmid_gen, 0);
+ WRITE_ONCE(mmu->vmid.vmid, 0);
return 0;
}
@@ -126,7 +129,7 @@ int __pkvm_prot_finalize(void)
kvm_flush_dcache_to_poc(params, sizeof(*params));
write_sysreg(params->hcr_el2, hcr_el2);
- __load_stage2(&host_kvm.arch.mmu, host_kvm.arch.vtcr);
+ __load_stage2(&host_kvm.arch.mmu, &host_kvm.arch);
/*
* Make sure to have an ISB before the TLB maintenance below but only
@@ -159,6 +162,11 @@ static int host_stage2_unmap_dev_all(void)
return kvm_pgtable_stage2_unmap(pgt, addr, BIT(pgt->ia_bits) - addr);
}
+struct kvm_mem_range {
+ u64 start;
+ u64 end;
+};
+
static bool find_mem_range(phys_addr_t addr, struct kvm_mem_range *range)
{
int cur, left = 0, right = hyp_memblock_nr;
@@ -189,16 +197,26 @@ static bool find_mem_range(phys_addr_t addr, struct kvm_mem_range *range)
return false;
}
+bool addr_is_memory(phys_addr_t phys)
+{
+ struct kvm_mem_range range;
+
+ return find_mem_range(phys, &range);
+}
+
+static bool is_in_mem_range(u64 addr, struct kvm_mem_range *range)
+{
+ return range->start <= addr && addr < range->end;
+}
+
static bool range_is_memory(u64 start, u64 end)
{
- struct kvm_mem_range r1, r2;
+ struct kvm_mem_range r;
- if (!find_mem_range(start, &r1) || !find_mem_range(end - 1, &r2))
- return false;
- if (r1.start != r2.start)
+ if (!find_mem_range(start, &r))
return false;
- return true;
+ return is_in_mem_range(end - 1, &r);
}
static inline int __host_stage2_idmap(u64 start, u64 end,
@@ -208,60 +226,208 @@ static inline int __host_stage2_idmap(u64 start, u64 end,
prot, &host_s2_pool);
}
+/*
+ * The pool has been provided with enough pages to cover all of memory with
+ * page granularity, but it is difficult to know how much of the MMIO range
+ * we will need to cover upfront, so we may need to 'recycle' the pages if we
+ * run out.
+ */
+#define host_stage2_try(fn, ...) \
+ ({ \
+ int __ret; \
+ hyp_assert_lock_held(&host_kvm.lock); \
+ __ret = fn(__VA_ARGS__); \
+ if (__ret == -ENOMEM) { \
+ __ret = host_stage2_unmap_dev_all(); \
+ if (!__ret) \
+ __ret = fn(__VA_ARGS__); \
+ } \
+ __ret; \
+ })
+
+static inline bool range_included(struct kvm_mem_range *child,
+ struct kvm_mem_range *parent)
+{
+ return parent->start <= child->start && child->end <= parent->end;
+}
+
+static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range)
+{
+ struct kvm_mem_range cur;
+ kvm_pte_t pte;
+ u32 level;
+ int ret;
+
+ hyp_assert_lock_held(&host_kvm.lock);
+ ret = kvm_pgtable_get_leaf(&host_kvm.pgt, addr, &pte, &level);
+ if (ret)
+ return ret;
+
+ if (kvm_pte_valid(pte))
+ return -EAGAIN;
+
+ if (pte)
+ return -EPERM;
+
+ do {
+ u64 granule = kvm_granule_size(level);
+ cur.start = ALIGN_DOWN(addr, granule);
+ cur.end = cur.start + granule;
+ level++;
+ } while ((level < KVM_PGTABLE_MAX_LEVELS) &&
+ !(kvm_level_supports_block_mapping(level) &&
+ range_included(&cur, range)));
+
+ *range = cur;
+
+ return 0;
+}
+
+int host_stage2_idmap_locked(phys_addr_t addr, u64 size,
+ enum kvm_pgtable_prot prot)
+{
+ hyp_assert_lock_held(&host_kvm.lock);
+
+ return host_stage2_try(__host_stage2_idmap, addr, addr + size, prot);
+}
+
+int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id)
+{
+ hyp_assert_lock_held(&host_kvm.lock);
+
+ return host_stage2_try(kvm_pgtable_stage2_set_owner, &host_kvm.pgt,
+ addr, size, &host_s2_pool, owner_id);
+}
+
+static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot prot)
+{
+ /*
+ * Block mappings must be used with care in the host stage-2 as a
+ * kvm_pgtable_stage2_map() operation targeting a page in the range of
+ * an existing block will delete the block under the assumption that
+ * mappings in the rest of the block range can always be rebuilt lazily.
+ * That assumption is correct for the host stage-2 with RWX mappings
+ * targeting memory or RW mappings targeting MMIO ranges (see
+ * host_stage2_idmap() below which implements some of the host memory
+ * abort logic). However, this is not safe for any other mappings where
+ * the host stage-2 page-table is in fact the only place where this
+ * state is stored. In all those cases, it is safer to use page-level
+ * mappings, so that the state is not lost through side-effects in
+ * kvm_pgtable_stage2_map().
+ */
+ if (range_is_memory(addr, end))
+ return prot != PKVM_HOST_MEM_PROT;
+ else
+ return prot != PKVM_HOST_MMIO_PROT;
+}
+
static int host_stage2_idmap(u64 addr)
{
- enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W;
struct kvm_mem_range range;
bool is_memory = find_mem_range(addr, &range);
+ enum kvm_pgtable_prot prot;
int ret;
- if (is_memory)
- prot |= KVM_PGTABLE_PROT_X;
+ prot = is_memory ? PKVM_HOST_MEM_PROT : PKVM_HOST_MMIO_PROT;
hyp_spin_lock(&host_kvm.lock);
- ret = kvm_pgtable_stage2_find_range(&host_kvm.pgt, addr, prot, &range);
+ ret = host_stage2_adjust_range(addr, &range);
if (ret)
goto unlock;
- ret = __host_stage2_idmap(range.start, range.end, prot);
- if (ret != -ENOMEM)
+ ret = host_stage2_idmap_locked(range.start, range.end - range.start, prot);
+unlock:
+ hyp_spin_unlock(&host_kvm.lock);
+
+ return ret;
+}
+
+static inline bool check_prot(enum kvm_pgtable_prot prot,
+ enum kvm_pgtable_prot required,
+ enum kvm_pgtable_prot denied)
+{
+ return (prot & (required | denied)) == required;
+}
+
+int __pkvm_host_share_hyp(u64 pfn)
+{
+ phys_addr_t addr = hyp_pfn_to_phys(pfn);
+ enum kvm_pgtable_prot prot, cur;
+ void *virt = __hyp_va(addr);
+ enum pkvm_page_state state;
+ kvm_pte_t pte;
+ int ret;
+
+ if (!addr_is_memory(addr))
+ return -EINVAL;
+
+ hyp_spin_lock(&host_kvm.lock);
+ hyp_spin_lock(&pkvm_pgd_lock);
+
+ ret = kvm_pgtable_get_leaf(&host_kvm.pgt, addr, &pte, NULL);
+ if (ret)
goto unlock;
+ if (!pte)
+ goto map_shared;
/*
- * The pool has been provided with enough pages to cover all of memory
- * with page granularity, but it is difficult to know how much of the
- * MMIO range we will need to cover upfront, so we may need to 'recycle'
- * the pages if we run out.
+ * Check attributes in the host stage-2 PTE. We need the page to be:
+ * - mapped RWX as we're sharing memory;
+ * - not borrowed, as that implies absence of ownership.
+ * Otherwise, we can't let it go through.
*/
- ret = host_stage2_unmap_dev_all();
- if (ret)
+ cur = kvm_pgtable_stage2_pte_prot(pte);
+ prot = pkvm_mkstate(0, PKVM_PAGE_SHARED_BORROWED);
+ if (!check_prot(cur, PKVM_HOST_MEM_PROT, prot)) {
+ ret = -EPERM;
goto unlock;
+ }
- ret = __host_stage2_idmap(range.start, range.end, prot);
+ state = pkvm_getstate(cur);
+ if (state == PKVM_PAGE_OWNED)
+ goto map_shared;
-unlock:
- hyp_spin_unlock(&host_kvm.lock);
+ /*
+ * Tolerate double-sharing the same page, but this requires
+ * cross-checking the hypervisor stage-1.
+ */
+ if (state != PKVM_PAGE_SHARED_OWNED) {
+ ret = -EPERM;
+ goto unlock;
+ }
- return ret;
-}
+ ret = kvm_pgtable_get_leaf(&pkvm_pgtable, (u64)virt, &pte, NULL);
+ if (ret)
+ goto unlock;
-int __pkvm_mark_hyp(phys_addr_t start, phys_addr_t end)
-{
- int ret;
+ /*
+ * If the page has been shared with the hypervisor, it must already
+ * be mapped as SHARED_BORROWED in its stage-1.
+ */
+ cur = kvm_pgtable_hyp_pte_prot(pte);
+ prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_SHARED_BORROWED);
+ if (!check_prot(cur, prot, ~prot))
+ ret = -EPERM;
+ goto unlock;
+map_shared:
/*
- * host_stage2_unmap_dev_all() currently relies on MMIO mappings being
- * non-persistent, so don't allow changing page ownership in MMIO range.
+ * If the page is not yet shared, adjust mappings in both page-tables
+ * while both locks are held.
*/
- if (!range_is_memory(start, end))
- return -EINVAL;
+ prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_SHARED_BORROWED);
+ ret = pkvm_create_mappings_locked(virt, virt + PAGE_SIZE, prot);
+ BUG_ON(ret);
- hyp_spin_lock(&host_kvm.lock);
- ret = kvm_pgtable_stage2_set_owner(&host_kvm.pgt, start, end - start,
- &host_s2_pool, pkvm_hyp_id);
+ prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, PKVM_PAGE_SHARED_OWNED);
+ ret = host_stage2_idmap_locked(addr, PAGE_SIZE, prot);
+ BUG_ON(ret);
+
+unlock:
+ hyp_spin_unlock(&pkvm_pgd_lock);
hyp_spin_unlock(&host_kvm.lock);
- return ret != -EAGAIN ? ret : 0;
+ return ret;
}
void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt)
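check_prot() condenses the sharing checks into a single predicate: a PTE's attributes pass iff every 'required' bit is present and no 'denied' bit is set. The two call sites in __pkvm_host_share_hyp() above, written out as hypothetical example_ helpers that would live next to check_prot() in mem_protect.c:

/* Host stage-2 side: the page must be RWX and must not be marked borrowed. */
static bool example_host_pte_shareable(enum kvm_pgtable_prot cur)
{
	return check_prot(cur, PKVM_HOST_MEM_PROT,
			  pkvm_mkstate(0, PKVM_PAGE_SHARED_BORROWED));
}

/* Hyp stage-1 side: exactly PAGE_HYP plus SHARED_BORROWED, nothing else. */
static bool example_hyp_pte_already_shared(enum kvm_pgtable_prot cur)
{
	enum kvm_pgtable_prot prot = pkvm_mkstate(PAGE_HYP,
						  PKVM_PAGE_SHARED_BORROWED);

	return check_prot(cur, prot, ~prot);
}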
diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c
index a8efdf0f9003..2fabeceb889a 100644
--- a/arch/arm64/kvm/hyp/nvhe/mm.c
+++ b/arch/arm64/kvm/hyp/nvhe/mm.c
@@ -23,8 +23,8 @@ u64 __io_map_base;
struct memblock_region hyp_memory[HYP_MEMBLOCK_REGIONS];
unsigned int hyp_memblock_nr;
-int __pkvm_create_mappings(unsigned long start, unsigned long size,
- unsigned long phys, enum kvm_pgtable_prot prot)
+static int __pkvm_create_mappings(unsigned long start, unsigned long size,
+ unsigned long phys, enum kvm_pgtable_prot prot)
{
int err;
@@ -67,13 +67,15 @@ out:
return addr;
}
-int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
+int pkvm_create_mappings_locked(void *from, void *to, enum kvm_pgtable_prot prot)
{
unsigned long start = (unsigned long)from;
unsigned long end = (unsigned long)to;
unsigned long virt_addr;
phys_addr_t phys;
+ hyp_assert_lock_held(&pkvm_pgd_lock);
+
start = start & PAGE_MASK;
end = PAGE_ALIGN(end);
@@ -81,7 +83,8 @@ int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
int err;
phys = hyp_virt_to_phys((void *)virt_addr);
- err = __pkvm_create_mappings(virt_addr, PAGE_SIZE, phys, prot);
+ err = kvm_pgtable_hyp_map(&pkvm_pgtable, virt_addr, PAGE_SIZE,
+ phys, prot);
if (err)
return err;
}
@@ -89,6 +92,17 @@ int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
return 0;
}
+int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
+{
+ int ret;
+
+ hyp_spin_lock(&pkvm_pgd_lock);
+ ret = pkvm_create_mappings_locked(from, to, prot);
+ hyp_spin_unlock(&pkvm_pgd_lock);
+
+ return ret;
+}
+
int hyp_back_vmemmap(phys_addr_t phys, unsigned long size, phys_addr_t back)
{
unsigned long start, end;
diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c
index 0b574d106519..57c27846320f 100644
--- a/arch/arm64/kvm/hyp/nvhe/setup.c
+++ b/arch/arm64/kvm/hyp/nvhe/setup.c
@@ -58,6 +58,7 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size,
{
void *start, *end, *virt = hyp_phys_to_virt(phys);
unsigned long pgt_size = hyp_s1_pgtable_pages() << PAGE_SHIFT;
+ enum kvm_pgtable_prot prot;
int ret, i;
/* Recreate the hyp page-table using the early page allocator */
@@ -83,10 +84,6 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size,
if (ret)
return ret;
- ret = pkvm_create_mappings(__start_rodata, __end_rodata, PAGE_HYP_RO);
- if (ret)
- return ret;
-
ret = pkvm_create_mappings(__hyp_rodata_start, __hyp_rodata_end, PAGE_HYP_RO);
if (ret)
return ret;
@@ -95,10 +92,6 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size,
if (ret)
return ret;
- ret = pkvm_create_mappings(__hyp_bss_end, __bss_stop, PAGE_HYP_RO);
- if (ret)
- return ret;
-
ret = pkvm_create_mappings(virt, virt + size, PAGE_HYP);
if (ret)
return ret;
@@ -117,6 +110,24 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size,
return ret;
}
+ /*
+ * Map the host's .bss and .rodata sections RO in the hypervisor, but
+ * transfer the ownership from the host to the hypervisor itself to
+ * make sure it can't be donated or shared with another entity.
+ *
+ * The ownership transition requires matching changes in the host
+ * stage-2. This will be done later (see finalize_host_mappings()) once
+ * the hyp_vmemmap is addressable.
+ */
+ prot = pkvm_mkstate(PAGE_HYP_RO, PKVM_PAGE_SHARED_OWNED);
+ ret = pkvm_create_mappings(__start_rodata, __end_rodata, prot);
+ if (ret)
+ return ret;
+
+ ret = pkvm_create_mappings(__hyp_bss_end, __bss_stop, prot);
+ if (ret)
+ return ret;
+
return 0;
}
@@ -148,6 +159,57 @@ static void hpool_put_page(void *addr)
hyp_put_page(&hpool, addr);
}
+static int finalize_host_mappings_walker(u64 addr, u64 end, u32 level,
+ kvm_pte_t *ptep,
+ enum kvm_pgtable_walk_flags flag,
+ void * const arg)
+{
+ enum kvm_pgtable_prot prot;
+ enum pkvm_page_state state;
+ kvm_pte_t pte = *ptep;
+ phys_addr_t phys;
+
+ if (!kvm_pte_valid(pte))
+ return 0;
+
+ if (level != (KVM_PGTABLE_MAX_LEVELS - 1))
+ return -EINVAL;
+
+ phys = kvm_pte_to_phys(pte);
+ if (!addr_is_memory(phys))
+ return 0;
+
+ /*
+ * Adjust the host stage-2 mappings to match the ownership attributes
+ * configured in the hypervisor stage-1.
+ */
+ state = pkvm_getstate(kvm_pgtable_hyp_pte_prot(pte));
+ switch (state) {
+ case PKVM_PAGE_OWNED:
+ return host_stage2_set_owner_locked(phys, PAGE_SIZE, pkvm_hyp_id);
+ case PKVM_PAGE_SHARED_OWNED:
+ prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, PKVM_PAGE_SHARED_BORROWED);
+ break;
+ case PKVM_PAGE_SHARED_BORROWED:
+ prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, PKVM_PAGE_SHARED_OWNED);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return host_stage2_idmap_locked(phys, PAGE_SIZE, prot);
+}
+
+static int finalize_host_mappings(void)
+{
+ struct kvm_pgtable_walker walker = {
+ .cb = finalize_host_mappings_walker,
+ .flags = KVM_PGTABLE_WALK_LEAF,
+ };
+
+ return kvm_pgtable_walk(&pkvm_pgtable, 0, BIT(pkvm_pgtable.ia_bits), &walker);
+}
+
void __noreturn __pkvm_init_finalise(void)
{
struct kvm_host_data *host_data = this_cpu_ptr(&kvm_host_data);
@@ -167,6 +229,10 @@ void __noreturn __pkvm_init_finalise(void)
if (ret)
goto out;
+ ret = finalize_host_mappings();
+ if (ret)
+ goto out;
+
pkvm_pgtable_mm_ops = (struct kvm_pgtable_mm_ops) {
.zalloc_page = hyp_zalloc_hyp_page,
.phys_to_virt = hyp_phys_to_virt,
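The ownership annotations used above ride in the PTE software bits. A minimal sketch of how pkvm_mkstate()/pkvm_getstate() are assumed to round-trip the state through the KVM_PTE_LEAF_ATTR_HI_SW field added in hyp/pgtable.c later in this diff (the real helpers are introduced elsewhere in this series and are not part of this hunk):

/* Sketch only: helper names and bit handling are assumptions. */
static inline enum kvm_pgtable_prot sketch_mkstate(enum kvm_pgtable_prot prot,
						   enum pkvm_page_state state)
{
	return (prot & ~KVM_PTE_LEAF_ATTR_HI_SW) | state;
}

static inline enum pkvm_page_state sketch_getstate(enum kvm_pgtable_prot prot)
{
	return prot & KVM_PTE_LEAF_ATTR_HI_SW;
}

finalize_host_mappings_walker() then reads the state back via kvm_pgtable_hyp_pte_prot() and mirrors it into the host stage-2, as shown above.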
diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
index f7af9688c1f7..a34b01cc8ab9 100644
--- a/arch/arm64/kvm/hyp/nvhe/switch.c
+++ b/arch/arm64/kvm/hyp/nvhe/switch.c
@@ -41,7 +41,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
___activate_traps(vcpu);
__activate_traps_common(vcpu);
- val = CPTR_EL2_DEFAULT;
+ val = vcpu->arch.cptr_el2;
val |= CPTR_EL2_TTA | CPTR_EL2_TAM;
if (!update_fp_enabled(vcpu)) {
val |= CPTR_EL2_TFP | CPTR_EL2_TZ;
@@ -69,12 +69,10 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
static void __deactivate_traps(struct kvm_vcpu *vcpu)
{
extern char __kvm_hyp_host_vector[];
- u64 mdcr_el2, cptr;
+ u64 cptr;
___deactivate_traps(vcpu);
- mdcr_el2 = read_sysreg(mdcr_el2);
-
if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) {
u64 val;
@@ -92,13 +90,8 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu)
isb();
}
- __deactivate_traps_common();
-
- mdcr_el2 &= MDCR_EL2_HPMN_MASK;
- mdcr_el2 |= MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT;
- mdcr_el2 |= MDCR_EL2_E2TB_MASK << MDCR_EL2_E2TB_SHIFT;
+ __deactivate_traps_common(vcpu);
- write_sysreg(mdcr_el2, mdcr_el2);
write_sysreg(this_cpu_ptr(&kvm_init_params)->hcr_el2, hcr_el2);
cptr = CPTR_EL2_DEFAULT;
@@ -170,6 +163,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
{
struct kvm_cpu_context *host_ctxt;
struct kvm_cpu_context *guest_ctxt;
+ struct kvm_s2_mmu *mmu;
bool pmu_switch_needed;
u64 exit_code;
@@ -213,7 +207,8 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
__sysreg32_restore_state(vcpu);
__sysreg_restore_state_nvhe(guest_ctxt);
- __load_guest_stage2(kern_hyp_va(vcpu->arch.hw_mmu));
+ mmu = kern_hyp_va(vcpu->arch.hw_mmu);
+ __load_stage2(mmu, kern_hyp_va(mmu->arch));
__activate_traps(vcpu);
__hyp_vgic_restore_state(vcpu);
diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c
index 38ed0f6f2703..d296d617f589 100644
--- a/arch/arm64/kvm/hyp/nvhe/tlb.c
+++ b/arch/arm64/kvm/hyp/nvhe/tlb.c
@@ -34,12 +34,12 @@ static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu,
}
/*
- * __load_guest_stage2() includes an ISB only when the AT
+ * __load_stage2() includes an ISB only when the AT
* workaround is applied. Take care of the opposite condition,
* ensuring that we always have an ISB, but not two ISBs back
* to back.
*/
- __load_guest_stage2(mmu);
+ __load_stage2(mmu, kern_hyp_va(mmu->arch));
asm(ALTERNATIVE("isb", "nop", ARM64_WORKAROUND_SPECULATIVE_AT));
}
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 05321f4165e3..f8ceebe4982e 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -11,16 +11,12 @@
#include <asm/kvm_pgtable.h>
#include <asm/stage2_pgtable.h>
-#define KVM_PTE_VALID BIT(0)
#define KVM_PTE_TYPE BIT(1)
#define KVM_PTE_TYPE_BLOCK 0
#define KVM_PTE_TYPE_PAGE 1
#define KVM_PTE_TYPE_TABLE 1
-#define KVM_PTE_ADDR_MASK GENMASK(47, PAGE_SHIFT)
-#define KVM_PTE_ADDR_51_48 GENMASK(15, 12)
-
#define KVM_PTE_LEAF_ATTR_LO GENMASK(11, 2)
#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2)
@@ -40,6 +36,8 @@
#define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 51)
+#define KVM_PTE_LEAF_ATTR_HI_SW GENMASK(58, 55)
+
#define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54)
#define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54)
@@ -48,9 +46,7 @@
KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \
KVM_PTE_LEAF_ATTR_HI_S2_XN)
-#define KVM_PTE_LEAF_ATTR_S2_IGNORED GENMASK(58, 55)
-
-#define KVM_INVALID_PTE_OWNER_MASK GENMASK(63, 56)
+#define KVM_INVALID_PTE_OWNER_MASK GENMASK(9, 2)
#define KVM_MAX_OWNER_ID 1
struct kvm_pgtable_walk_data {
@@ -61,17 +57,6 @@ struct kvm_pgtable_walk_data {
u64 end;
};
-static u64 kvm_granule_shift(u32 level)
-{
- /* Assumes KVM_PGTABLE_MAX_LEVELS is 4 */
- return ARM64_HW_PGTABLE_LEVEL_SHIFT(level);
-}
-
-static u64 kvm_granule_size(u32 level)
-{
- return BIT(kvm_granule_shift(level));
-}
-
#define KVM_PHYS_INVALID (-1ULL)
static bool kvm_phys_is_valid(u64 phys)
@@ -79,15 +64,6 @@ static bool kvm_phys_is_valid(u64 phys)
return phys < BIT(id_aa64mmfr0_parange_to_phys_shift(ID_AA64MMFR0_PARANGE_MAX));
}
-static bool kvm_level_supports_block_mapping(u32 level)
-{
- /*
- * Reject invalid block mappings and don't bother with 4TB mappings for
- * 52-bit PAs.
- */
- return !(level == 0 || (PAGE_SIZE != SZ_4K && level == 1));
-}
-
static bool kvm_block_mapping_supported(u64 addr, u64 end, u64 phys, u32 level)
{
u64 granule = kvm_granule_size(level);
@@ -135,11 +111,6 @@ static u32 kvm_pgd_pages(u32 ia_bits, u32 start_level)
return __kvm_pgd_page_idx(&pgt, -1ULL) + 1;
}
-static bool kvm_pte_valid(kvm_pte_t pte)
-{
- return pte & KVM_PTE_VALID;
-}
-
static bool kvm_pte_table(kvm_pte_t pte, u32 level)
{
if (level == KVM_PGTABLE_MAX_LEVELS - 1)
@@ -151,16 +122,6 @@ static bool kvm_pte_table(kvm_pte_t pte, u32 level)
return FIELD_GET(KVM_PTE_TYPE, pte) == KVM_PTE_TYPE_TABLE;
}
-static u64 kvm_pte_to_phys(kvm_pte_t pte)
-{
- u64 pa = pte & KVM_PTE_ADDR_MASK;
-
- if (PAGE_SHIFT == 16)
- pa |= FIELD_GET(KVM_PTE_ADDR_51_48, pte) << 48;
-
- return pa;
-}
-
static kvm_pte_t kvm_phys_to_pte(u64 pa)
{
kvm_pte_t pte = pa & KVM_PTE_ADDR_MASK;
@@ -326,6 +287,45 @@ int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
return _kvm_pgtable_walk(&walk_data);
}
+struct leaf_walk_data {
+ kvm_pte_t pte;
+ u32 level;
+};
+
+static int leaf_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+ enum kvm_pgtable_walk_flags flag, void * const arg)
+{
+ struct leaf_walk_data *data = arg;
+
+ data->pte = *ptep;
+ data->level = level;
+
+ return 0;
+}
+
+int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr,
+ kvm_pte_t *ptep, u32 *level)
+{
+ struct leaf_walk_data data;
+ struct kvm_pgtable_walker walker = {
+ .cb = leaf_walker,
+ .flags = KVM_PGTABLE_WALK_LEAF,
+ .arg = &data,
+ };
+ int ret;
+
+ ret = kvm_pgtable_walk(pgt, ALIGN_DOWN(addr, PAGE_SIZE),
+ PAGE_SIZE, &walker);
+ if (!ret) {
+ if (ptep)
+ *ptep = data.pte;
+ if (level)
+ *level = data.level;
+ }
+
+ return ret;
+}
+
struct hyp_map_data {
u64 phys;
kvm_pte_t attr;
@@ -357,11 +357,47 @@ static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep)
attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_AP, ap);
attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh);
attr |= KVM_PTE_LEAF_ATTR_LO_S1_AF;
+ attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW;
*ptep = attr;
return 0;
}
+enum kvm_pgtable_prot kvm_pgtable_hyp_pte_prot(kvm_pte_t pte)
+{
+ enum kvm_pgtable_prot prot = pte & KVM_PTE_LEAF_ATTR_HI_SW;
+ u32 ap;
+
+ if (!kvm_pte_valid(pte))
+ return prot;
+
+ if (!(pte & KVM_PTE_LEAF_ATTR_HI_S1_XN))
+ prot |= KVM_PGTABLE_PROT_X;
+
+ ap = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S1_AP, pte);
+ if (ap == KVM_PTE_LEAF_ATTR_LO_S1_AP_RO)
+ prot |= KVM_PGTABLE_PROT_R;
+ else if (ap == KVM_PTE_LEAF_ATTR_LO_S1_AP_RW)
+ prot |= KVM_PGTABLE_PROT_RW;
+
+ return prot;
+}
+
+static bool hyp_pte_needs_update(kvm_pte_t old, kvm_pte_t new)
+{
+ /*
+ * Tolerate KVM recreating the exact same mapping, or changing software
+ * bits if the existing mapping was valid.
+ */
+ if (old == new)
+ return false;
+
+ if (!kvm_pte_valid(old))
+ return true;
+
+ return !WARN_ON((old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW);
+}
+
static bool hyp_map_walker_try_leaf(u64 addr, u64 end, u32 level,
kvm_pte_t *ptep, struct hyp_map_data *data)
{
@@ -371,9 +407,8 @@ static bool hyp_map_walker_try_leaf(u64 addr, u64 end, u32 level,
if (!kvm_block_mapping_supported(addr, end, phys, level))
return false;
- /* Tolerate KVM recreating the exact same mapping */
new = kvm_init_valid_leaf_pte(phys, data->attr, level);
- if (old != new && !WARN_ON(kvm_pte_valid(old)))
+ if (hyp_pte_needs_update(old, new))
smp_store_release(ptep, new);
data->phys += granule;
@@ -438,6 +473,8 @@ int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits,
pgt->start_level = KVM_PGTABLE_MAX_LEVELS - levels;
pgt->mm_ops = mm_ops;
pgt->mmu = NULL;
+ pgt->force_pte_cb = NULL;
+
return 0;
}
@@ -475,6 +512,9 @@ struct stage2_map_data {
void *memcache;
struct kvm_pgtable_mm_ops *mm_ops;
+
+ /* Force mappings to page granularity */
+ bool force_pte;
};
u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift)
@@ -539,11 +579,29 @@ static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot p
attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh);
attr |= KVM_PTE_LEAF_ATTR_LO_S2_AF;
+ attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW;
*ptep = attr;
return 0;
}
+enum kvm_pgtable_prot kvm_pgtable_stage2_pte_prot(kvm_pte_t pte)
+{
+ enum kvm_pgtable_prot prot = pte & KVM_PTE_LEAF_ATTR_HI_SW;
+
+ if (!kvm_pte_valid(pte))
+ return prot;
+
+ if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R)
+ prot |= KVM_PGTABLE_PROT_R;
+ if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W)
+ prot |= KVM_PGTABLE_PROT_W;
+ if (!(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN))
+ prot |= KVM_PGTABLE_PROT_X;
+
+ return prot;
+}
+
static bool stage2_pte_needs_update(kvm_pte_t old, kvm_pte_t new)
{
if (!kvm_pte_valid(old) || !kvm_pte_valid(new))
@@ -588,6 +646,15 @@ static bool stage2_pte_executable(kvm_pte_t pte)
return !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN);
}
+static bool stage2_leaf_mapping_allowed(u64 addr, u64 end, u32 level,
+ struct stage2_map_data *data)
+{
+ if (data->force_pte && (level < (KVM_PGTABLE_MAX_LEVELS - 1)))
+ return false;
+
+ return kvm_block_mapping_supported(addr, end, data->phys, level);
+}
+
static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
kvm_pte_t *ptep,
struct stage2_map_data *data)
@@ -597,7 +664,7 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
struct kvm_pgtable *pgt = data->mmu->pgt;
struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops;
- if (!kvm_block_mapping_supported(addr, end, phys, level))
+ if (!stage2_leaf_mapping_allowed(addr, end, level, data))
return -E2BIG;
if (kvm_phys_is_valid(phys))
@@ -641,7 +708,7 @@ static int stage2_map_walk_table_pre(u64 addr, u64 end, u32 level,
if (data->anchor)
return 0;
- if (!kvm_block_mapping_supported(addr, end, data->phys, level))
+ if (!stage2_leaf_mapping_allowed(addr, end, level, data))
return 0;
data->childp = kvm_pte_follow(*ptep, data->mm_ops);
@@ -771,6 +838,7 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
.mmu = pgt->mmu,
.memcache = mc,
.mm_ops = pgt->mm_ops,
+ .force_pte = pgt->force_pte_cb && pgt->force_pte_cb(addr, addr + size, prot),
};
struct kvm_pgtable_walker walker = {
.cb = stage2_map_walker,
@@ -802,6 +870,7 @@ int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
.memcache = mc,
.mm_ops = pgt->mm_ops,
.owner_id = owner_id,
+ .force_pte = true,
};
struct kvm_pgtable_walker walker = {
.cb = stage2_map_walker,
@@ -995,6 +1064,9 @@ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
u32 level;
kvm_pte_t set = 0, clr = 0;
+ if (prot & KVM_PTE_LEAF_ATTR_HI_SW)
+ return -EINVAL;
+
if (prot & KVM_PGTABLE_PROT_R)
set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;
@@ -1043,9 +1115,11 @@ int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size)
return kvm_pgtable_walk(pgt, addr, size, &walker);
}
-int kvm_pgtable_stage2_init_flags(struct kvm_pgtable *pgt, struct kvm_arch *arch,
- struct kvm_pgtable_mm_ops *mm_ops,
- enum kvm_pgtable_stage2_flags flags)
+
+int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_arch *arch,
+ struct kvm_pgtable_mm_ops *mm_ops,
+ enum kvm_pgtable_stage2_flags flags,
+ kvm_pgtable_force_pte_cb_t force_pte_cb)
{
size_t pgd_sz;
u64 vtcr = arch->vtcr;
@@ -1063,6 +1137,7 @@ int kvm_pgtable_stage2_init_flags(struct kvm_pgtable *pgt, struct kvm_arch *arch
pgt->mm_ops = mm_ops;
pgt->mmu = &arch->mmu;
pgt->flags = flags;
+ pgt->force_pte_cb = force_pte_cb;
/* Ensure zeroed PGD pages are visible to the hardware walker */
dsb(ishst);
@@ -1102,77 +1177,3 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
pgt->mm_ops->free_pages_exact(pgt->pgd, pgd_sz);
pgt->pgd = NULL;
}
-
-#define KVM_PTE_LEAF_S2_COMPAT_MASK (KVM_PTE_LEAF_ATTR_S2_PERMS | \
- KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR | \
- KVM_PTE_LEAF_ATTR_S2_IGNORED)
-
-static int stage2_check_permission_walker(u64 addr, u64 end, u32 level,
- kvm_pte_t *ptep,
- enum kvm_pgtable_walk_flags flag,
- void * const arg)
-{
- kvm_pte_t old_attr, pte = *ptep, *new_attr = arg;
-
- /*
- * Compatible mappings are either invalid and owned by the page-table
- * owner (whose id is 0), or valid with matching permission attributes.
- */
- if (kvm_pte_valid(pte)) {
- old_attr = pte & KVM_PTE_LEAF_S2_COMPAT_MASK;
- if (old_attr != *new_attr)
- return -EEXIST;
- } else if (pte) {
- return -EEXIST;
- }
-
- return 0;
-}
-
-int kvm_pgtable_stage2_find_range(struct kvm_pgtable *pgt, u64 addr,
- enum kvm_pgtable_prot prot,
- struct kvm_mem_range *range)
-{
- kvm_pte_t attr;
- struct kvm_pgtable_walker check_perm_walker = {
- .cb = stage2_check_permission_walker,
- .flags = KVM_PGTABLE_WALK_LEAF,
- .arg = &attr,
- };
- u64 granule, start, end;
- u32 level;
- int ret;
-
- ret = stage2_set_prot_attr(pgt, prot, &attr);
- if (ret)
- return ret;
- attr &= KVM_PTE_LEAF_S2_COMPAT_MASK;
-
- for (level = pgt->start_level; level < KVM_PGTABLE_MAX_LEVELS; level++) {
- granule = kvm_granule_size(level);
- start = ALIGN_DOWN(addr, granule);
- end = start + granule;
-
- if (!kvm_level_supports_block_mapping(level))
- continue;
-
- if (start < range->start || range->end < end)
- continue;
-
- /*
- * Check the presence of existing mappings with incompatible
- * permissions within the current block range, and try one level
- * deeper if one is found.
- */
- ret = kvm_pgtable_walk(pgt, start, granule, &check_perm_walker);
- if (ret != -EEXIST)
- break;
- }
-
- if (!ret) {
- range->start = start;
- range->end = end;
- }
-
- return ret;
-}
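For reference, a hypothetical caller of the new kvm_pgtable_get_leaf() helper added above (get_user_mapping_size() in mmu.c, further down in this diff, is the in-tree user). It assumes kvm_pte_valid() and kvm_granule_size(), removed from this file, are now available from the kvm_pgtable.h header:

/* Sketch: report the granule size of the leaf entry mapping @addr, or 0. */
static u64 sketch_leaf_granule(struct kvm_pgtable *pgt, u64 addr)
{
	kvm_pte_t pte = 0;
	u32 level = ~0;

	if (kvm_pgtable_get_leaf(pgt, addr, &pte, &level))
		return 0;

	return kvm_pte_valid(pte) ? kvm_granule_size(level) : 0;
}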
diff --git a/arch/arm64/kvm/hyp/vhe/debug-sr.c b/arch/arm64/kvm/hyp/vhe/debug-sr.c
index f1e2e5a00933..289689b2682d 100644
--- a/arch/arm64/kvm/hyp/vhe/debug-sr.c
+++ b/arch/arm64/kvm/hyp/vhe/debug-sr.c
@@ -20,7 +20,7 @@ void __debug_switch_to_host(struct kvm_vcpu *vcpu)
__debug_switch_to_host_common(vcpu);
}
-u32 __kvm_get_mdcr_el2(void)
+u64 __kvm_get_mdcr_el2(void)
{
return read_sysreg(mdcr_el2);
}
diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c
index b3229924d243..ded2c66675f0 100644
--- a/arch/arm64/kvm/hyp/vhe/switch.c
+++ b/arch/arm64/kvm/hyp/vhe/switch.c
@@ -91,17 +91,9 @@ void activate_traps_vhe_load(struct kvm_vcpu *vcpu)
__activate_traps_common(vcpu);
}
-void deactivate_traps_vhe_put(void)
+void deactivate_traps_vhe_put(struct kvm_vcpu *vcpu)
{
- u64 mdcr_el2 = read_sysreg(mdcr_el2);
-
- mdcr_el2 &= MDCR_EL2_HPMN_MASK |
- MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT |
- MDCR_EL2_TPMS;
-
- write_sysreg(mdcr_el2, mdcr_el2);
-
- __deactivate_traps_common();
+ __deactivate_traps_common(vcpu);
}
/* Switch to the guest for VHE systems running in EL2 */
@@ -124,11 +116,11 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
*
* We have already configured the guest's stage 1 translation in
* kvm_vcpu_load_sysregs_vhe above. We must now call
- * __load_guest_stage2 before __activate_traps, because
- * __load_guest_stage2 configures stage 2 translation, and
+ * __load_stage2 before __activate_traps, because
+ * __load_stage2 configures stage 2 translation, and
* __activate_traps clears HCR_EL2.TGE (among other things).
*/
- __load_guest_stage2(vcpu->arch.hw_mmu);
+ __load_stage2(vcpu->arch.hw_mmu, vcpu->arch.hw_mmu->arch);
__activate_traps(vcpu);
__kvm_adjust_pc(vcpu);
diff --git a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c
index 2a0b8c88d74f..007a12dd4351 100644
--- a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c
+++ b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c
@@ -101,7 +101,7 @@ void kvm_vcpu_put_sysregs_vhe(struct kvm_vcpu *vcpu)
struct kvm_cpu_context *host_ctxt;
host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
- deactivate_traps_vhe_put();
+ deactivate_traps_vhe_put(vcpu);
__sysreg_save_el1_state(guest_ctxt);
__sysreg_save_user_state(guest_ctxt);
diff --git a/arch/arm64/kvm/hyp/vhe/tlb.c b/arch/arm64/kvm/hyp/vhe/tlb.c
index 66f17349f0c3..24cef9b87f9e 100644
--- a/arch/arm64/kvm/hyp/vhe/tlb.c
+++ b/arch/arm64/kvm/hyp/vhe/tlb.c
@@ -50,10 +50,10 @@ static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu,
*
* ARM erratum 1165522 requires some special handling (again),
* as we need to make sure both stages of translation are in
- * place before clearing TGE. __load_guest_stage2() already
+ * place before clearing TGE. __load_stage2() already
* has an ISB in order to deal with this.
*/
- __load_guest_stage2(mmu);
+ __load_stage2(mmu, mmu->arch);
val = read_sysreg(hcr_el2);
val &= ~HCR_TGE;
write_sysreg(val, hcr_el2);
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 0625bf2353c2..1a94a7ca48f2 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -80,6 +80,7 @@ static bool memslot_is_logging(struct kvm_memory_slot *memslot)
*/
void kvm_flush_remote_tlbs(struct kvm *kvm)
{
+ ++kvm->stat.generic.remote_tlb_flush_requests;
kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu);
}
@@ -259,10 +260,8 @@ static int __create_hyp_mappings(unsigned long start, unsigned long size,
{
int err;
- if (!kvm_host_owns_hyp_mappings()) {
- return kvm_call_hyp_nvhe(__pkvm_create_mappings,
- start, size, phys, prot);
- }
+ if (WARN_ON(!kvm_host_owns_hyp_mappings()))
+ return -EINVAL;
mutex_lock(&kvm_hyp_pgd_mutex);
err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot);
@@ -282,6 +281,21 @@ static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
}
}
+static int pkvm_share_hyp(phys_addr_t start, phys_addr_t end)
+{
+ phys_addr_t addr;
+ int ret;
+
+ for (addr = ALIGN_DOWN(start, PAGE_SIZE); addr < end; addr += PAGE_SIZE) {
+ ret = kvm_call_hyp_nvhe(__pkvm_host_share_hyp,
+ __phys_to_pfn(addr));
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
/**
* create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
* @from: The virtual kernel start address of the range
@@ -302,6 +316,13 @@ int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
if (is_kernel_in_hyp_mode())
return 0;
+ if (!kvm_host_owns_hyp_mappings()) {
+ if (WARN_ON(prot != PAGE_HYP))
+ return -EPERM;
+ return pkvm_share_hyp(kvm_kaddr_to_phys(from),
+ kvm_kaddr_to_phys(to));
+ }
+
start = start & PAGE_MASK;
end = PAGE_ALIGN(end);
@@ -433,6 +454,32 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
return 0;
}
+static struct kvm_pgtable_mm_ops kvm_user_mm_ops = {
+ /* We shouldn't need any other callback to walk the PT */
+ .phys_to_virt = kvm_host_va,
+};
+
+static int get_user_mapping_size(struct kvm *kvm, u64 addr)
+{
+ struct kvm_pgtable pgt = {
+ .pgd = (kvm_pte_t *)kvm->mm->pgd,
+ .ia_bits = VA_BITS,
+ .start_level = (KVM_PGTABLE_MAX_LEVELS -
+ CONFIG_PGTABLE_LEVELS),
+ .mm_ops = &kvm_user_mm_ops,
+ };
+ kvm_pte_t pte = 0; /* Keep GCC quiet... */
+ u32 level = ~0;
+ int ret;
+
+ ret = kvm_pgtable_get_leaf(&pgt, addr, &pte, &level);
+ VM_BUG_ON(ret);
+ VM_BUG_ON(level >= KVM_PGTABLE_MAX_LEVELS);
+ VM_BUG_ON(!(pte & PTE_VALID));
+
+ return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level));
+}
+
static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
.zalloc_page = stage2_memcache_zalloc_page,
.zalloc_pages_exact = kvm_host_zalloc_pages_exact,
@@ -485,7 +532,7 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
mmu->arch = &kvm->arch;
mmu->pgt = pgt;
mmu->pgd_phys = __pa(pgt->pgd);
- mmu->vmid.vmid_gen = 0;
+ WRITE_ONCE(mmu->vmid.vmid_gen, 0);
return 0;
out_destroy_pgtable:
@@ -780,7 +827,7 @@ static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
* Returns the size of the mapping.
*/
static unsigned long
-transparent_hugepage_adjust(struct kvm_memory_slot *memslot,
+transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot,
unsigned long hva, kvm_pfn_t *pfnp,
phys_addr_t *ipap)
{
@@ -791,8 +838,8 @@ transparent_hugepage_adjust(struct kvm_memory_slot *memslot,
* sure that the HVA and IPA are sufficiently aligned and that the
* block map is contained within the memslot.
*/
- if (kvm_is_transparent_hugepage(pfn) &&
- fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) {
+ if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE) &&
+ get_user_mapping_size(kvm, hva) >= PMD_SIZE) {
/*
* The address we faulted on is backed by a transparent huge
* page. However, because we map the compound huge page and
@@ -814,7 +861,7 @@ transparent_hugepage_adjust(struct kvm_memory_slot *memslot,
*ipap &= PMD_MASK;
kvm_release_pfn_clean(pfn);
pfn &= ~(PTRS_PER_PMD - 1);
- kvm_get_pfn(pfn);
+ get_page(pfn_to_page(pfn));
*pfnp = pfn;
return PMD_SIZE;
@@ -1050,9 +1097,14 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
* If we are not forced to use page mapping, check if we are
* backed by a THP and thus use block mapping if possible.
*/
- if (vma_pagesize == PAGE_SIZE && !(force_pte || device))
- vma_pagesize = transparent_hugepage_adjust(memslot, hva,
- &pfn, &fault_ipa);
+ if (vma_pagesize == PAGE_SIZE && !(force_pte || device)) {
+ if (fault_status == FSC_PERM && fault_granule > PAGE_SIZE)
+ vma_pagesize = fault_granule;
+ else
+ vma_pagesize = transparent_hugepage_adjust(kvm, memslot,
+ hva, &pfn,
+ &fault_ipa);
+ }
if (fault_status != FSC_PERM && !device && kvm_has_mte(kvm)) {
/* Check the VMM hasn't introduced a new VM_SHARED VMA */
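A short sketch of what the create_hyp_mappings() change above means for callers once the hypervisor owns its own stage-1: anything other than plain PAGE_HYP sharing is refused, and the range is shared page by page through the __pkvm_host_share_hyp hypercall. The helper below is hypothetical, not part of this patch:

/* Sketch: share a kernel object with EL2 under protected KVM. */
static int sketch_share_with_hyp(void *obj, size_t len)
{
	return create_hyp_mappings(obj, (char *)obj + len, PAGE_HYP);
}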
diff --git a/arch/arm64/kvm/perf.c b/arch/arm64/kvm/perf.c
index 151c31fb9860..f9bb3b14130e 100644
--- a/arch/arm64/kvm/perf.c
+++ b/arch/arm64/kvm/perf.c
@@ -50,7 +50,7 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = {
int kvm_perf_init(void)
{
- if (kvm_pmu_probe_pmuver() != 0xf && !is_protected_kvm_enabled())
+ if (kvm_pmu_probe_pmuver() != ID_AA64DFR0_PMUVER_IMP_DEF && !is_protected_kvm_enabled())
static_branch_enable(&kvm_arm_pmu_available);
return perf_register_guest_info_callbacks(&kvm_guest_cbs);
diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c
index f33825c995cb..f5065f23b413 100644
--- a/arch/arm64/kvm/pmu-emul.c
+++ b/arch/arm64/kvm/pmu-emul.c
@@ -373,7 +373,6 @@ static u64 kvm_pmu_overflow_status(struct kvm_vcpu *vcpu)
reg = __vcpu_sys_reg(vcpu, PMOVSSET_EL0);
reg &= __vcpu_sys_reg(vcpu, PMCNTENSET_EL0);
reg &= __vcpu_sys_reg(vcpu, PMINTENSET_EL1);
- reg &= kvm_pmu_valid_counter_mask(vcpu);
}
return reg;
@@ -564,20 +563,21 @@ void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val)
*/
void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val)
{
- unsigned long mask = kvm_pmu_valid_counter_mask(vcpu);
int i;
if (val & ARMV8_PMU_PMCR_E) {
kvm_pmu_enable_counter_mask(vcpu,
- __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & mask);
+ __vcpu_sys_reg(vcpu, PMCNTENSET_EL0));
} else {
- kvm_pmu_disable_counter_mask(vcpu, mask);
+ kvm_pmu_disable_counter_mask(vcpu,
+ __vcpu_sys_reg(vcpu, PMCNTENSET_EL0));
}
if (val & ARMV8_PMU_PMCR_C)
kvm_pmu_set_counter_value(vcpu, ARMV8_PMU_CYCLE_IDX, 0);
if (val & ARMV8_PMU_PMCR_P) {
+ unsigned long mask = kvm_pmu_valid_counter_mask(vcpu);
mask &= ~BIT(ARMV8_PMU_CYCLE_IDX);
for_each_set_bit(i, &mask, 32)
kvm_pmu_set_counter_value(vcpu, i, 0);
@@ -745,7 +745,7 @@ int kvm_pmu_probe_pmuver(void)
struct perf_event_attr attr = { };
struct perf_event *event;
struct arm_pmu *pmu;
- int pmuver = 0xf;
+ int pmuver = ID_AA64DFR0_PMUVER_IMP_DEF;
/*
* Create a dummy event that only counts user cycles. As we'll never
@@ -770,7 +770,7 @@ int kvm_pmu_probe_pmuver(void)
if (IS_ERR(event)) {
pr_err_once("kvm: pmu event creation failed %ld\n",
PTR_ERR(event));
- return 0xf;
+ return ID_AA64DFR0_PMUVER_IMP_DEF;
}
if (event->pmu) {
@@ -923,7 +923,7 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
if (!vcpu->kvm->arch.pmuver)
vcpu->kvm->arch.pmuver = kvm_pmu_probe_pmuver();
- if (vcpu->kvm->arch.pmuver == 0xf)
+ if (vcpu->kvm->arch.pmuver == ID_AA64DFR0_PMUVER_IMP_DEF)
return -ENODEV;
switch (attr->attr) {
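The masking dropped above relies on PMCNTENSET_EL0 never carrying bits for unimplemented counters; reset_pmu_reg() in sys_regs.c (later in this diff) enforces that at vcpu reset. A worked example, assuming ARMV8_PMU_CYCLE_IDX == 31 and a host PMU advertising PMCR_EL0.N == 6:

	u64 n = 6;				/* PMCR_EL0.N for this example */
	u64 mask = BIT(ARMV8_PMU_CYCLE_IDX);	/* cycle counter, bit 31 */

	if (n)
		mask |= GENMASK(n - 1, 0);	/* event counters 0..5 */

	/* mask == 0x8000003f: the only bits reset_pmu_reg() leaves settable. */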
diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c
index db4056ecccfd..74c47d420253 100644
--- a/arch/arm64/kvm/psci.c
+++ b/arch/arm64/kvm/psci.c
@@ -59,6 +59,12 @@ static void kvm_psci_vcpu_off(struct kvm_vcpu *vcpu)
kvm_vcpu_kick(vcpu);
}
+static inline bool kvm_psci_valid_affinity(struct kvm_vcpu *vcpu,
+ unsigned long affinity)
+{
+ return !(affinity & ~MPIDR_HWID_BITMASK);
+}
+
static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
{
struct vcpu_reset_state *reset_state;
@@ -66,9 +72,9 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
struct kvm_vcpu *vcpu = NULL;
unsigned long cpu_id;
- cpu_id = smccc_get_arg1(source_vcpu) & MPIDR_HWID_BITMASK;
- if (vcpu_mode_is_32bit(source_vcpu))
- cpu_id &= ~((u32) 0);
+ cpu_id = smccc_get_arg1(source_vcpu);
+ if (!kvm_psci_valid_affinity(source_vcpu, cpu_id))
+ return PSCI_RET_INVALID_PARAMS;
vcpu = kvm_mpidr_to_vcpu(kvm, cpu_id);
@@ -126,6 +132,9 @@ static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu)
target_affinity = smccc_get_arg1(vcpu);
lowest_affinity_level = smccc_get_arg2(vcpu);
+ if (!kvm_psci_valid_affinity(vcpu, target_affinity))
+ return PSCI_RET_INVALID_PARAMS;
+
/* Determine target affinity mask */
target_affinity_mask = psci_affinity_mask(lowest_affinity_level);
if (!target_affinity_mask)
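A worked example of the new validation, assuming the usual arm64 definition MPIDR_HWID_BITMASK == 0xff00ffffff (Aff3 in bits 39:32, Aff2..Aff0 in bits 23:0):

/* Sketch: affinity values a guest might pass to CPU_ON/AFFINITY_INFO. */
kvm_psci_valid_affinity(vcpu, 0x0000000001);	/* true:  Aff0 = 1 */
kvm_psci_valid_affinity(vcpu, 0x0100000100);	/* true:  Aff3 = 1, Aff1 = 1 */
kvm_psci_valid_affinity(vcpu, 0x0080000000);	/* false: bit 31 is not an
						 * affinity field */

So a raw MPIDR_EL1 value (which has bit 31 set) is now rejected with PSCI_RET_INVALID_PARAMS rather than silently masked, as the old code did.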
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index cba7872d69a8..5ce36b0a3343 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -210,10 +210,16 @@ static bool vcpu_allowed_register_width(struct kvm_vcpu *vcpu)
*/
int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
{
+ struct vcpu_reset_state reset_state;
int ret;
bool loaded;
u32 pstate;
+ mutex_lock(&vcpu->kvm->lock);
+ reset_state = vcpu->arch.reset_state;
+ WRITE_ONCE(vcpu->arch.reset_state.reset, false);
+ mutex_unlock(&vcpu->kvm->lock);
+
/* Reset PMU outside of the non-preemptible section */
kvm_pmu_vcpu_reset(vcpu);
@@ -276,8 +282,8 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
* Additional reset state handling that PSCI may have imposed on us.
* Must be done after all the sys_reg reset.
*/
- if (vcpu->arch.reset_state.reset) {
- unsigned long target_pc = vcpu->arch.reset_state.pc;
+ if (reset_state.reset) {
+ unsigned long target_pc = reset_state.pc;
/* Gracefully handle Thumb2 entry point */
if (vcpu_mode_is_32bit(vcpu) && (target_pc & 1)) {
@@ -286,13 +292,11 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
}
/* Propagate caller endianness */
- if (vcpu->arch.reset_state.be)
+ if (reset_state.be)
kvm_vcpu_set_be(vcpu);
*vcpu_pc(vcpu) = target_pc;
- vcpu_set_reg(vcpu, 0, vcpu->arch.reset_state.r0);
-
- vcpu->arch.reset_state.reset = false;
+ vcpu_set_reg(vcpu, 0, reset_state.r0);
}
/* Reset timer */
@@ -311,31 +315,26 @@ u32 get_kvm_ipa_limit(void)
int kvm_set_ipa_limit(void)
{
- unsigned int parange, tgran_2;
+ unsigned int parange;
u64 mmfr0;
mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
parange = cpuid_feature_extract_unsigned_field(mmfr0,
ID_AA64MMFR0_PARANGE_SHIFT);
+ /*
+ * IPA sizes beyond 48 bits cannot be supported with
+ * either 4K or 16K pages, so cap the value at 48 bits
+ * in case the system reports something larger.
+ */
+ if (PAGE_SIZE != SZ_64K)
+ parange = min(parange, (unsigned int)ID_AA64MMFR0_PARANGE_48);
/*
* Check with ARMv8.5-GTG that our PAGE_SIZE is supported at
* Stage-2. If not, things will stop very quickly.
*/
- switch (PAGE_SIZE) {
- default:
- case SZ_4K:
- tgran_2 = ID_AA64MMFR0_TGRAN4_2_SHIFT;
- break;
- case SZ_16K:
- tgran_2 = ID_AA64MMFR0_TGRAN16_2_SHIFT;
- break;
- case SZ_64K:
- tgran_2 = ID_AA64MMFR0_TGRAN64_2_SHIFT;
- break;
- }
-
- switch (cpuid_feature_extract_unsigned_field(mmfr0, tgran_2)) {
+ switch (cpuid_feature_extract_unsigned_field(mmfr0, ID_AA64MMFR0_TGRAN_2_SHIFT)) {
case ID_AA64MMFR0_TGRAN_2_SUPPORTED_NONE:
kvm_err("PAGE_SIZE not supported at Stage-2, giving up\n");
return -EINVAL;
@@ -369,7 +368,7 @@ int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type)
phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type);
if (phys_shift) {
if (phys_shift > kvm_ipa_limit ||
- phys_shift < 32)
+ phys_shift < ARM64_MIN_PARANGE_BITS)
return -EINVAL;
} else {
phys_shift = KVM_PHYS_SHIFT;
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index f6f126eb6ac1..1d46e185f31e 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -44,10 +44,6 @@
* 64bit interface.
*/
-#define reg_to_encoding(x) \
- sys_reg((u32)(x)->Op0, (u32)(x)->Op1, \
- (u32)(x)->CRn, (u32)(x)->CRm, (u32)(x)->Op2)
-
static bool read_from_write_only(struct kvm_vcpu *vcpu,
struct sys_reg_params *params,
const struct sys_reg_desc *r)
@@ -318,14 +314,14 @@ static bool trap_dbgauthstatus_el1(struct kvm_vcpu *vcpu,
/*
* We want to avoid world-switching all the DBG registers all the
* time:
- *
+ *
* - If we've touched any debug register, it is likely that we're
* going to touch more of them. It then makes sense to disable the
* traps and start doing the save/restore dance
* - If debug is active (DBG_MDSCR_KDE or DBG_MDSCR_MDE set), it is
* then mandatory to save/restore the registers, as the guest
* depends on them.
- *
+ *
* For this, we use a DIRTY bit, indicating the guest has modified the
* debug registers, used as follow:
*
@@ -603,6 +599,41 @@ static unsigned int pmu_visibility(const struct kvm_vcpu *vcpu,
return REG_HIDDEN;
}
+static void reset_pmu_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+{
+ u64 n, mask = BIT(ARMV8_PMU_CYCLE_IDX);
+
+ /* No PMU available, any PMU reg may UNDEF... */
+ if (!kvm_arm_support_pmu_v3())
+ return;
+
+ n = read_sysreg(pmcr_el0) >> ARMV8_PMU_PMCR_N_SHIFT;
+ n &= ARMV8_PMU_PMCR_N_MASK;
+ if (n)
+ mask |= GENMASK(n - 1, 0);
+
+ reset_unknown(vcpu, r);
+ __vcpu_sys_reg(vcpu, r->reg) &= mask;
+}
+
+static void reset_pmevcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+{
+ reset_unknown(vcpu, r);
+ __vcpu_sys_reg(vcpu, r->reg) &= GENMASK(31, 0);
+}
+
+static void reset_pmevtyper(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+{
+ reset_unknown(vcpu, r);
+ __vcpu_sys_reg(vcpu, r->reg) &= ARMV8_PMU_EVTYPE_MASK;
+}
+
+static void reset_pmselr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+{
+ reset_unknown(vcpu, r);
+ __vcpu_sys_reg(vcpu, r->reg) &= ARMV8_PMU_COUNTER_MASK;
+}
+
static void reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
u64 pmcr, val;
@@ -845,7 +876,7 @@ static bool access_pmcnten(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
kvm_pmu_disable_counter_mask(vcpu, val);
}
} else {
- p->regval = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & mask;
+ p->regval = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0);
}
return true;
@@ -869,7 +900,7 @@ static bool access_pminten(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
/* accessing PMINTENCLR_EL1 */
__vcpu_sys_reg(vcpu, PMINTENSET_EL1) &= ~val;
} else {
- p->regval = __vcpu_sys_reg(vcpu, PMINTENSET_EL1) & mask;
+ p->regval = __vcpu_sys_reg(vcpu, PMINTENSET_EL1);
}
return true;
@@ -891,7 +922,7 @@ static bool access_pmovs(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
/* accessing PMOVSCLR_EL0 */
__vcpu_sys_reg(vcpu, PMOVSSET_EL0) &= ~(p->regval & mask);
} else {
- p->regval = __vcpu_sys_reg(vcpu, PMOVSSET_EL0) & mask;
+ p->regval = __vcpu_sys_reg(vcpu, PMOVSSET_EL0);
}
return true;
@@ -944,16 +975,18 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
trap_wcr, reset_wcr, 0, 0, get_wcr, set_wcr }
#define PMU_SYS_REG(r) \
- SYS_DESC(r), .reset = reset_unknown, .visibility = pmu_visibility
+ SYS_DESC(r), .reset = reset_pmu_reg, .visibility = pmu_visibility
/* Macro to expand the PMEVCNTRn_EL0 register */
#define PMU_PMEVCNTR_EL0(n) \
{ PMU_SYS_REG(SYS_PMEVCNTRn_EL0(n)), \
+ .reset = reset_pmevcntr, \
.access = access_pmu_evcntr, .reg = (PMEVCNTR0_EL0 + n), }
/* Macro to expand the PMEVTYPERn_EL0 register */
#define PMU_PMEVTYPER_EL0(n) \
{ PMU_SYS_REG(SYS_PMEVTYPERn_EL0(n)), \
+ .reset = reset_pmevtyper, \
.access = access_pmu_evtyper, .reg = (PMEVTYPER0_EL0 + n), }
static bool undef_access(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
@@ -1026,8 +1059,6 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu,
return true;
}
-#define FEATURE(x) (GENMASK_ULL(x##_SHIFT + 3, x##_SHIFT))
-
/* Read a sanitised cpufeature ID register by sys_reg_desc */
static u64 read_id_reg(const struct kvm_vcpu *vcpu,
struct sys_reg_desc const *r, bool raz)
@@ -1038,40 +1069,40 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu,
switch (id) {
case SYS_ID_AA64PFR0_EL1:
if (!vcpu_has_sve(vcpu))
- val &= ~FEATURE(ID_AA64PFR0_SVE);
- val &= ~FEATURE(ID_AA64PFR0_AMU);
- val &= ~FEATURE(ID_AA64PFR0_CSV2);
- val |= FIELD_PREP(FEATURE(ID_AA64PFR0_CSV2), (u64)vcpu->kvm->arch.pfr0_csv2);
- val &= ~FEATURE(ID_AA64PFR0_CSV3);
- val |= FIELD_PREP(FEATURE(ID_AA64PFR0_CSV3), (u64)vcpu->kvm->arch.pfr0_csv3);
+ val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_SVE);
+ val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_AMU);
+ val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_CSV2);
+ val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_CSV2), (u64)vcpu->kvm->arch.pfr0_csv2);
+ val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_CSV3);
+ val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_CSV3), (u64)vcpu->kvm->arch.pfr0_csv3);
break;
case SYS_ID_AA64PFR1_EL1:
- val &= ~FEATURE(ID_AA64PFR1_MTE);
+ val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_MTE);
if (kvm_has_mte(vcpu->kvm)) {
u64 pfr, mte;
pfr = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1);
mte = cpuid_feature_extract_unsigned_field(pfr, ID_AA64PFR1_MTE_SHIFT);
- val |= FIELD_PREP(FEATURE(ID_AA64PFR1_MTE), mte);
+ val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR1_MTE), mte);
}
break;
case SYS_ID_AA64ISAR1_EL1:
if (!vcpu_has_ptrauth(vcpu))
- val &= ~(FEATURE(ID_AA64ISAR1_APA) |
- FEATURE(ID_AA64ISAR1_API) |
- FEATURE(ID_AA64ISAR1_GPA) |
- FEATURE(ID_AA64ISAR1_GPI));
+ val &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR1_APA) |
+ ARM64_FEATURE_MASK(ID_AA64ISAR1_API) |
+ ARM64_FEATURE_MASK(ID_AA64ISAR1_GPA) |
+ ARM64_FEATURE_MASK(ID_AA64ISAR1_GPI));
break;
case SYS_ID_AA64DFR0_EL1:
/* Limit debug to ARMv8.0 */
- val &= ~FEATURE(ID_AA64DFR0_DEBUGVER);
- val |= FIELD_PREP(FEATURE(ID_AA64DFR0_DEBUGVER), 6);
+ val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_DEBUGVER);
+ val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64DFR0_DEBUGVER), 6);
/* Limit guests to PMUv3 for ARMv8.4 */
val = cpuid_feature_cap_perfmon_field(val,
ID_AA64DFR0_PMUVER_SHIFT,
kvm_vcpu_has_pmu(vcpu) ? ID_AA64DFR0_PMUVER_8_4 : 0);
/* Hide SPE from guests */
- val &= ~FEATURE(ID_AA64DFR0_PMSVER);
+ val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_PMSVER);
break;
case SYS_ID_DFR0_EL1:
/* Limit guests to PMUv3 for ARMv8.4 */
@@ -1249,6 +1280,20 @@ static int set_raz_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
return __set_id_reg(vcpu, rd, uaddr, true);
}
+static int set_wi_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+ const struct kvm_one_reg *reg, void __user *uaddr)
+{
+ int err;
+ u64 val;
+
+ /* Perform the access even if we are going to ignore the value */
+ err = reg_from_user(&val, uaddr, sys_reg_to_index(rd));
+ if (err)
+ return err;
+
+ return 0;
+}
+
static bool access_ctr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
const struct sys_reg_desc *r)
{
@@ -1592,16 +1637,21 @@ static const struct sys_reg_desc sys_reg_descs[] = {
.access = access_pmcnten, .reg = PMCNTENSET_EL0 },
{ PMU_SYS_REG(SYS_PMOVSCLR_EL0),
.access = access_pmovs, .reg = PMOVSSET_EL0 },
+ /*
+ * PM_SWINC_EL0 is exposed to userspace as RAZ/WI, as it was
+ * (pointlessly) advertised in the past...
+ */
{ PMU_SYS_REG(SYS_PMSWINC_EL0),
- .access = access_pmswinc, .reg = PMSWINC_EL0 },
+ .get_user = get_raz_id_reg, .set_user = set_wi_reg,
+ .access = access_pmswinc, .reset = NULL },
{ PMU_SYS_REG(SYS_PMSELR_EL0),
- .access = access_pmselr, .reg = PMSELR_EL0 },
+ .access = access_pmselr, .reset = reset_pmselr, .reg = PMSELR_EL0 },
{ PMU_SYS_REG(SYS_PMCEID0_EL0),
.access = access_pmceid, .reset = NULL },
{ PMU_SYS_REG(SYS_PMCEID1_EL0),
.access = access_pmceid, .reset = NULL },
{ PMU_SYS_REG(SYS_PMCCNTR_EL0),
- .access = access_pmu_evcntr, .reg = PMCCNTR_EL0 },
+ .access = access_pmu_evcntr, .reset = reset_unknown, .reg = PMCCNTR_EL0 },
{ PMU_SYS_REG(SYS_PMXEVTYPER_EL0),
.access = access_pmu_evtyper, .reset = NULL },
{ PMU_SYS_REG(SYS_PMXEVCNTR_EL0),
@@ -2106,23 +2156,6 @@ static int check_sysreg_table(const struct sys_reg_desc *table, unsigned int n,
return 0;
}
-static int match_sys_reg(const void *key, const void *elt)
-{
- const unsigned long pval = (unsigned long)key;
- const struct sys_reg_desc *r = elt;
-
- return pval - reg_to_encoding(r);
-}
-
-static const struct sys_reg_desc *find_reg(const struct sys_reg_params *params,
- const struct sys_reg_desc table[],
- unsigned int num)
-{
- unsigned long pval = reg_to_encoding(params);
-
- return bsearch((void *)pval, table, num, sizeof(table[0]), match_sys_reg);
-}
-
int kvm_handle_cp14_load_store(struct kvm_vcpu *vcpu)
{
kvm_inject_undefined(vcpu);
@@ -2365,13 +2398,8 @@ int kvm_handle_sys_reg(struct kvm_vcpu *vcpu)
trace_kvm_handle_sys_reg(esr);
- params.Op0 = (esr >> 20) & 3;
- params.Op1 = (esr >> 14) & 0x7;
- params.CRn = (esr >> 10) & 0xf;
- params.CRm = (esr >> 1) & 0xf;
- params.Op2 = (esr >> 17) & 0x7;
+ params = esr_sys64_to_params(esr);
params.regval = vcpu_get_reg(vcpu, Rt);
- params.is_write = !(esr & 1);
ret = emulate_sys_reg(vcpu, &params);
diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h
index 9d0621417c2a..cc0cc95a0280 100644
--- a/arch/arm64/kvm/sys_regs.h
+++ b/arch/arm64/kvm/sys_regs.h
@@ -11,6 +11,12 @@
#ifndef __ARM64_KVM_SYS_REGS_LOCAL_H__
#define __ARM64_KVM_SYS_REGS_LOCAL_H__
+#include <linux/bsearch.h>
+
+#define reg_to_encoding(x) \
+ sys_reg((u32)(x)->Op0, (u32)(x)->Op1, \
+ (u32)(x)->CRn, (u32)(x)->CRm, (u32)(x)->Op2)
+
struct sys_reg_params {
u8 Op0;
u8 Op1;
@@ -21,6 +27,14 @@ struct sys_reg_params {
bool is_write;
};
+#define esr_sys64_to_params(esr) \
+ ((struct sys_reg_params){ .Op0 = ((esr) >> 20) & 3, \
+ .Op1 = ((esr) >> 14) & 0x7, \
+ .CRn = ((esr) >> 10) & 0xf, \
+ .CRm = ((esr) >> 1) & 0xf, \
+ .Op2 = ((esr) >> 17) & 0x7, \
+ .is_write = !((esr) & 1) })
+
struct sys_reg_desc {
/* Sysreg string for debug */
const char *name;
@@ -152,6 +166,23 @@ static inline int cmp_sys_reg(const struct sys_reg_desc *i1,
return i1->Op2 - i2->Op2;
}
+static inline int match_sys_reg(const void *key, const void *elt)
+{
+ const unsigned long pval = (unsigned long)key;
+ const struct sys_reg_desc *r = elt;
+
+ return pval - reg_to_encoding(r);
+}
+
+static inline const struct sys_reg_desc *
+find_reg(const struct sys_reg_params *params, const struct sys_reg_desc table[],
+ unsigned int num)
+{
+ unsigned long pval = reg_to_encoding(params);
+
+ return __inline_bsearch((void *)pval, table, num, sizeof(table[0]), match_sys_reg);
+}
+
const struct sys_reg_desc *find_reg_by_id(u64 id,
struct sys_reg_params *params,
const struct sys_reg_desc table[],
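As a quick illustration of the macro above, decoding a hypothetical trapped read of CNTVCT_EL0 (the value is the ISS-only part of ESR_EL2, EC bits omitted, constructed purely for this example):

	struct sys_reg_params p = esr_sys64_to_params(0x34f801);

	/* p.Op0 == 3, p.Op1 == 3, p.CRn == 14, p.CRm == 0, p.Op2 == 2,
	 * p.is_write == false -- i.e. sys_reg(3, 3, 14, 0, 2), aka
	 * CNTVCT_EL0, ready to be matched by find_reg() above. */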
diff --git a/arch/arm64/kvm/trace_handle_exit.h b/arch/arm64/kvm/trace_handle_exit.h
index 8d78acc4fba7..064a58c19f48 100644
--- a/arch/arm64/kvm/trace_handle_exit.h
+++ b/arch/arm64/kvm/trace_handle_exit.h
@@ -78,13 +78,17 @@ TRACE_EVENT(kvm_arm_clear_debug,
TP_printk("flags: 0x%08x", __entry->guest_debug)
);
+/*
+ * The dreg32 name is a leftover from a distant past. This will really
+ * output a 64bit value...
+ */
TRACE_EVENT(kvm_arm_set_dreg32,
- TP_PROTO(const char *name, __u32 value),
+ TP_PROTO(const char *name, __u64 value),
TP_ARGS(name, value),
TP_STRUCT__entry(
__field(const char *, name)
- __field(__u32, value)
+ __field(__u64, value)
),
TP_fast_assign(
@@ -92,7 +96,7 @@ TRACE_EVENT(kvm_arm_set_dreg32,
__entry->value = value;
),
- TP_printk("%s: 0x%08x", __entry->name, __entry->value)
+ TP_printk("%s: 0x%llx", __entry->name, __entry->value)
);
TRACE_DEFINE_SIZEOF(__u64);
diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v2.c b/arch/arm64/kvm/vgic/vgic-mmio-v2.c
index a016f07adc28..5f9014ae595b 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio-v2.c
+++ b/arch/arm64/kvm/vgic/vgic-mmio-v2.c
@@ -282,7 +282,7 @@ static unsigned long vgic_mmio_read_vcpuif(struct kvm_vcpu *vcpu,
case GIC_CPU_PRIMASK:
/*
* Our KVM_DEV_TYPE_ARM_VGIC_V2 device ABI exports the
- * the PMR field as GICH_VMCR.VMPriMask rather than
+ * PMR field as GICH_VMCR.VMPriMask rather than
* GICC_PMR.Priority, so we expose the upper five bits of
* priority mask to userspace using the lower bits in the
* unsigned long.
@@ -329,7 +329,7 @@ static void vgic_mmio_write_vcpuif(struct kvm_vcpu *vcpu,
case GIC_CPU_PRIMASK:
/*
* Our KVM_DEV_TYPE_ARM_VGIC_V2 device ABI exports the
- * the PMR field as GICH_VMCR.VMPriMask rather than
+ * PMR field as GICH_VMCR.VMPriMask rather than
* GICC_PMR.Priority, so we expose the upper five bits of
* priority mask to userspace using the lower bits in the
* unsigned long.
diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c
index 2c580204f1dc..95a18cec14a3 100644
--- a/arch/arm64/kvm/vgic/vgic-v2.c
+++ b/arch/arm64/kvm/vgic/vgic-v2.c
@@ -60,6 +60,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
u32 val = cpuif->vgic_lr[lr];
u32 cpuid, intid = val & GICH_LR_VIRTUALID;
struct vgic_irq *irq;
+ bool deactivated;
/* Extract the source vCPU id from the LR */
cpuid = val & GICH_LR_PHYSID_CPUID;
@@ -75,7 +76,8 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
raw_spin_lock(&irq->irq_lock);
- /* Always preserve the active bit */
+ /* Always preserve the active bit, note deactivation */
+ deactivated = irq->active && !(val & GICH_LR_ACTIVE_BIT);
irq->active = !!(val & GICH_LR_ACTIVE_BIT);
if (irq->active && vgic_irq_is_sgi(intid))
@@ -96,36 +98,8 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
if (irq->config == VGIC_CONFIG_LEVEL && !(val & GICH_LR_STATE))
irq->pending_latch = false;
- /*
- * Level-triggered mapped IRQs are special because we only
- * observe rising edges as input to the VGIC.
- *
- * If the guest never acked the interrupt we have to sample
- * the physical line and set the line level, because the
- * device state could have changed or we simply need to
- * process the still pending interrupt later.
- *
- * If this causes us to lower the level, we have to also clear
- * the physical active state, since we will otherwise never be
- * told when the interrupt becomes asserted again.
- *
- * Another case is when the interrupt requires a helping hand
- * on deactivation (no HW deactivation, for example).
- */
- if (vgic_irq_is_mapped_level(irq)) {
- bool resample = false;
-
- if (val & GICH_LR_PENDING_BIT) {
- irq->line_level = vgic_get_phys_line_level(irq);
- resample = !irq->line_level;
- } else if (vgic_irq_needs_resampling(irq) &&
- !(irq->active || irq->pending_latch)) {
- resample = true;
- }
-
- if (resample)
- vgic_irq_set_phys_active(irq, false);
- }
+ /* Handle resampling for mapped interrupts if required */
+ vgic_irq_handle_resampling(irq, deactivated, val & GICH_LR_PENDING_BIT);
raw_spin_unlock(&irq->irq_lock);
vgic_put_irq(vcpu->kvm, irq);
diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
index 66004f61cd83..21a6207fb2ee 100644
--- a/arch/arm64/kvm/vgic/vgic-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -46,6 +46,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
u32 intid, cpuid;
struct vgic_irq *irq;
bool is_v2_sgi = false;
+ bool deactivated;
cpuid = val & GICH_LR_PHYSID_CPUID;
cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT;
@@ -68,7 +69,8 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
raw_spin_lock(&irq->irq_lock);
- /* Always preserve the active bit */
+ /* Always preserve the active bit, note deactivation */
+ deactivated = irq->active && !(val & ICH_LR_ACTIVE_BIT);
irq->active = !!(val & ICH_LR_ACTIVE_BIT);
if (irq->active && is_v2_sgi)
@@ -89,36 +91,8 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
if (irq->config == VGIC_CONFIG_LEVEL && !(val & ICH_LR_STATE))
irq->pending_latch = false;
- /*
- * Level-triggered mapped IRQs are special because we only
- * observe rising edges as input to the VGIC.
- *
- * If the guest never acked the interrupt we have to sample
- * the physical line and set the line level, because the
- * device state could have changed or we simply need to
- * process the still pending interrupt later.
- *
- * If this causes us to lower the level, we have to also clear
- * the physical active state, since we will otherwise never be
- * told when the interrupt becomes asserted again.
- *
- * Another case is when the interrupt requires a helping hand
- * on deactivation (no HW deactivation, for example).
- */
- if (vgic_irq_is_mapped_level(irq)) {
- bool resample = false;
-
- if (val & ICH_LR_PENDING_BIT) {
- irq->line_level = vgic_get_phys_line_level(irq);
- resample = !irq->line_level;
- } else if (vgic_irq_needs_resampling(irq) &&
- !(irq->active || irq->pending_latch)) {
- resample = true;
- }
-
- if (resample)
- vgic_irq_set_phys_active(irq, false);
- }
+ /* Handle resampling for mapped interrupts if required */
+ vgic_irq_handle_resampling(irq, deactivated, val & ICH_LR_PENDING_BIT);
raw_spin_unlock(&irq->irq_lock);
vgic_put_irq(vcpu->kvm, irq);
diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c
index 111bff47e471..5dad4996cfb2 100644
--- a/arch/arm64/kvm/vgic/vgic.c
+++ b/arch/arm64/kvm/vgic/vgic.c
@@ -106,7 +106,6 @@ struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
if (intid >= VGIC_MIN_LPI)
return vgic_get_lpi(kvm, intid);
- WARN(1, "Looking up struct vgic_irq for reserved INTID");
return NULL;
}
@@ -1022,3 +1021,41 @@ bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int vintid)
return map_is_active;
}
+
+/*
+ * Level-triggered mapped IRQs are special because we only observe rising
+ * edges as input to the VGIC.
+ *
+ * If the guest never acked the interrupt we have to sample the physical
+ * line and set the line level, because the device state could have changed
+ * or we simply need to process the still pending interrupt later.
+ *
+ * We could also have entered the guest with the interrupt active+pending.
+ * On the next exit, we need to re-evaluate the pending state, as it could
+ * otherwise result in a spurious interrupt by injecting a now potentially
+ * stale pending state.
+ *
+ * If this causes us to lower the level, we have to also clear the physical
+ * active state, since we will otherwise never be told when the interrupt
+ * becomes asserted again.
+ *
+ * Another case is when the interrupt requires a helping hand on
+ * deactivation (no HW deactivation, for example).
+ */
+void vgic_irq_handle_resampling(struct vgic_irq *irq,
+ bool lr_deactivated, bool lr_pending)
+{
+ if (vgic_irq_is_mapped_level(irq)) {
+ bool resample = false;
+
+ if (unlikely(vgic_irq_needs_resampling(irq))) {
+ resample = !(irq->active || irq->pending_latch);
+ } else if (lr_pending || (lr_deactivated && irq->line_level)) {
+ irq->line_level = vgic_get_phys_line_level(irq);
+ resample = !irq->line_level;
+ }
+
+ if (resample)
+ vgic_irq_set_phys_active(irq, false);
+ }
+}
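Both callers above (vgic_v2_fold_lr_state() and vgic_v3_fold_lr_state()) follow the same pattern; condensed here with the GICv3 LR bit names and with irq->irq_lock held, as the helper expects:

	raw_spin_lock(&irq->irq_lock);

	/* Note whether this exit deactivated the interrupt... */
	deactivated = irq->active && !(val & ICH_LR_ACTIVE_BIT);
	irq->active = !!(val & ICH_LR_ACTIVE_BIT);

	/* ...then let the common code decide whether to resample the line. */
	vgic_irq_handle_resampling(irq, deactivated, val & ICH_LR_PENDING_BIT);

	raw_spin_unlock(&irq->irq_lock);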
diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h
index dc1f3d1657ee..14a9218641f5 100644
--- a/arch/arm64/kvm/vgic/vgic.h
+++ b/arch/arm64/kvm/vgic/vgic.h
@@ -169,6 +169,8 @@ void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active);
bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq,
unsigned long flags);
void vgic_kick_vcpus(struct kvm *kvm);
+void vgic_irq_handle_resampling(struct vgic_irq *irq,
+ bool lr_deactivated, bool lr_pending);
int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr,
phys_addr_t addr, phys_addr_t alignment);
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 6dd56a49790a..0941180a86d3 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: GPL-2.0
lib-y := clear_user.o delay.o copy_from_user.o \
- copy_to_user.o copy_in_user.o copy_page.o \
+ copy_to_user.o copy_page.o \
clear_page.o csum.o insn.o memchr.o memcpy.o \
memset.o memcmp.o strcmp.o strncmp.o strlen.o \
strnlen.o strchr.o strrchr.o tishift.o
diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S
deleted file mode 100644
index dbea3799c3ef..000000000000
--- a/arch/arm64/lib/copy_in_user.S
+++ /dev/null
@@ -1,77 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copy from user space to user space
- *
- * Copyright (C) 2012 ARM Ltd.
- */
-
-#include <linux/linkage.h>
-
-#include <asm/asm-uaccess.h>
-#include <asm/assembler.h>
-#include <asm/cache.h>
-
-/*
- * Copy from user space to user space (alignment handled by the hardware)
- *
- * Parameters:
- * x0 - to
- * x1 - from
- * x2 - n
- * Returns:
- * x0 - bytes not copied
- */
- .macro ldrb1 reg, ptr, val
- user_ldst 9998f, ldtrb, \reg, \ptr, \val
- .endm
-
- .macro strb1 reg, ptr, val
- user_ldst 9998f, sttrb, \reg, \ptr, \val
- .endm
-
- .macro ldrh1 reg, ptr, val
- user_ldst 9997f, ldtrh, \reg, \ptr, \val
- .endm
-
- .macro strh1 reg, ptr, val
- user_ldst 9997f, sttrh, \reg, \ptr, \val
- .endm
-
- .macro ldr1 reg, ptr, val
- user_ldst 9997f, ldtr, \reg, \ptr, \val
- .endm
-
- .macro str1 reg, ptr, val
- user_ldst 9997f, sttr, \reg, \ptr, \val
- .endm
-
- .macro ldp1 reg1, reg2, ptr, val
- user_ldp 9997f, \reg1, \reg2, \ptr, \val
- .endm
-
- .macro stp1 reg1, reg2, ptr, val
- user_stp 9997f, \reg1, \reg2, \ptr, \val
- .endm
-
-end .req x5
-srcin .req x15
-SYM_FUNC_START(__arch_copy_in_user)
- add end, x0, x2
- mov srcin, x1
-#include "copy_template.S"
- mov x0, #0
- ret
-SYM_FUNC_END(__arch_copy_in_user)
-EXPORT_SYMBOL(__arch_copy_in_user)
-
- .section .fixup,"ax"
- .align 2
-9997: cmp dst, dstin
- b.ne 9998f
- // Before being absolutely sure we couldn't copy anything, try harder
-USER(9998f, ldtrb tmp1w, [srcin])
-USER(9998f, sttrb tmp1w, [dst])
- add dst, dst, #1
-9998: sub x0, end, dst // bytes not copied
- ret
- .previous
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index b16be52233c6..37a81754d9b6 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -30,6 +30,7 @@
#include <linux/crash_dump.h>
#include <linux/hugetlb.h>
#include <linux/acpi_iort.h>
+#include <linux/kmemleak.h>
#include <asm/boot.h>
#include <asm/fixmap.h>
@@ -101,6 +102,11 @@ static void __init reserve_crashkernel(void)
pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n",
crash_base, crash_base + crash_size, crash_size >> 20);
+ /*
+ * The crashkernel memory will be removed from the kernel linear
+ * map. Inform kmemleak so that it won't try to access it.
+ */
+ kmemleak_ignore_phys(crash_base);
crashk_res.start = crash_base;
crashk_res.end = crash_base + crash_size - 1;
}
@@ -222,7 +228,21 @@ early_param("mem", early_mem);
void __init arm64_memblock_init(void)
{
- const s64 linear_region_size = PAGE_END - _PAGE_OFFSET(vabits_actual);
+ s64 linear_region_size = PAGE_END - _PAGE_OFFSET(vabits_actual);
+
+ /*
+ * Corner case: 52-bit VA capable systems running KVM in nVHE mode may
+ * be limited in their ability to support a linear map that exceeds 51
+ * bits of VA space, depending on the placement of the ID map. Given
+ * that the placement of the ID map may be randomized, let's simply
+ * limit the kernel's linear map to 51 bits as well if we detect this
+ * configuration.
+ */
+ if (IS_ENABLED(CONFIG_KVM) && vabits_actual == 52 &&
+ is_hyp_mode_available() && !is_kernel_in_hyp_mode()) {
+ pr_info("Capping linear region to 51 bits for KVM in nVHE mode on LVA capable hardware.\n");
+ linear_region_size = min_t(u64, linear_region_size, BIT(51));
+ }
/* Remove memory above our supported physical address size */
memblock_remove(1ULL << PHYS_MASK_SHIFT, ULLONG_MAX);
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 9ff0de1b2b93..cfd9deb347c3 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1502,8 +1502,7 @@ int arch_add_memory(int nid, u64 start, u64 size,
return ret;
}
-void arch_remove_memory(int nid, u64 start, u64 size,
- struct vmem_altmap *altmap)
+void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig
index 2716f6395ba7..9d4d898df76b 100644
--- a/arch/csky/Kconfig
+++ b/arch/csky/Kconfig
@@ -82,6 +82,7 @@ config CSKY
select PCI_SYSCALL if PCI
select PCI_MSI if PCI
select SET_FS
+ select TRACE_IRQFLAGS_SUPPORT
config LOCKDEP_SUPPORT
def_bool y
@@ -139,9 +140,6 @@ config STACKTRACE_SUPPORT
config TIME_LOW_RES
def_bool y
-config TRACE_IRQFLAGS_SUPPORT
- def_bool y
-
config CPU_TLB_SIZE
int
default "128" if (CPU_CK610 || CPU_CK807 || CPU_CK810)
diff --git a/arch/h8300/kernel/traps.c b/arch/h8300/kernel/traps.c
index 5d8b969cd8f3..bdbe988d8dbc 100644
--- a/arch/h8300/kernel/traps.c
+++ b/arch/h8300/kernel/traps.c
@@ -39,10 +39,6 @@ void __init base_trap_init(void)
{
}
-void __init trap_init(void)
-{
-}
-
asmlinkage void set_esp0(unsigned long ssp)
{
current->thread.esp0 = ssp;
diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig
index aab1a40eb653..15dd8f38b698 100644
--- a/arch/hexagon/Kconfig
+++ b/arch/hexagon/Kconfig
@@ -32,6 +32,7 @@ config HEXAGON
select GENERIC_CPU_DEVICES
select SET_FS
select ARCH_WANT_LD_ORPHAN_WARN
+ select TRACE_IRQFLAGS_SUPPORT
help
Qualcomm Hexagon is a processor architecture designed for high
performance and low power across a wide variety of applications.
@@ -53,9 +54,6 @@ config EARLY_PRINTK
config MMU
def_bool y
-config TRACE_IRQFLAGS_SUPPORT
- def_bool y
-
config GENERIC_CSUM
def_bool y
diff --git a/arch/hexagon/kernel/traps.c b/arch/hexagon/kernel/traps.c
index 904134b37232..edfc35dafeb1 100644
--- a/arch/hexagon/kernel/traps.c
+++ b/arch/hexagon/kernel/traps.c
@@ -28,10 +28,6 @@
#define TRAP_SYSCALL 1
#define TRAP_DEBUG 0xdb
-void __init trap_init(void)
-{
-}
-
#ifdef CONFIG_GENERIC_BUG
/* Maybe should resemble arch/sh/kernel/traps.c ?? */
int is_valid_bugaddr(unsigned long addr)
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 064a967a7b6e..5c6da8d83c1a 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -484,8 +484,7 @@ int arch_add_memory(int nid, u64 start, u64 size,
return ret;
}
-void arch_remove_memory(int nid, u64 start, u64 size,
- struct vmem_altmap *altmap)
+void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
diff --git a/arch/microblaze/Kbuild b/arch/microblaze/Kbuild
index a4e40e534e6a..a1c597889319 100644
--- a/arch/microblaze/Kbuild
+++ b/arch/microblaze/Kbuild
@@ -1 +1,5 @@
# SPDX-License-Identifier: GPL-2.0-only
+obj-y += kernel/
+obj-y += mm/
+obj-$(CONFIG_PCI) += pci/
+obj-y += boot/dts/
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
index 14a67a42fcae..59798e43cdb0 100644
--- a/arch/microblaze/Kconfig
+++ b/arch/microblaze/Kconfig
@@ -44,6 +44,7 @@ config MICROBLAZE
select SPARSE_IRQ
select SET_FS
select ZONE_DMA
+ select TRACE_IRQFLAGS_SUPPORT
# Endianness selection
choice
diff --git a/arch/microblaze/Kconfig.debug b/arch/microblaze/Kconfig.debug
index 865527ac332a..a4e40e534e6a 100644
--- a/arch/microblaze/Kconfig.debug
+++ b/arch/microblaze/Kconfig.debug
@@ -1,6 +1 @@
# SPDX-License-Identifier: GPL-2.0-only
-# For a description of the syntax of this configuration file,
-# see Documentation/kbuild/kconfig-language.rst.
-
-config TRACE_IRQFLAGS_SUPPORT
- def_bool y
diff --git a/arch/microblaze/Makefile b/arch/microblaze/Makefile
index 6d4af39e3890..9adc6b6434df 100644
--- a/arch/microblaze/Makefile
+++ b/arch/microblaze/Makefile
@@ -50,17 +50,12 @@ KBUILD_CFLAGS += -ffixed-r31 $(CPUFLAGS-y) $(CPUFLAGS-1) $(CPUFLAGS-2)
head-y := arch/microblaze/kernel/head.o
libs-y += arch/microblaze/lib/
-core-y += arch/microblaze/kernel/
-core-y += arch/microblaze/mm/
-core-$(CONFIG_PCI) += arch/microblaze/pci/
boot := arch/microblaze/boot
# Are we making a simpleImage.<boardname> target? If so, crack out the boardname
DTB:=$(subst simpleImage.,,$(filter simpleImage.%, $(MAKECMDGOALS)))
-core-y += $(boot)/dts/
-
export DTB
all: linux.bin
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 9b8ff6c2c1e3..771ca53af06d 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -98,6 +98,7 @@ config MIPS
select PCI_MSI_ARCH_FALLBACKS if PCI_MSI
select RTC_LIB
select SYSCTL_EXCEPTION_TRACE
+ select TRACE_IRQFLAGS_SUPPORT
select VIRT_TO_BUS
select ARCH_HAS_ELFCORE_COMPAT
diff --git a/arch/mips/Kconfig.debug b/arch/mips/Kconfig.debug
index 43dbf5930796..f4ae7900fcd3 100644
--- a/arch/mips/Kconfig.debug
+++ b/arch/mips/Kconfig.debug
@@ -1,9 +1,5 @@
# SPDX-License-Identifier: GPL-2.0
-config TRACE_IRQFLAGS_SUPPORT
- bool
- default y
-
config EARLY_PRINTK
bool "Early printk" if EXPERT
depends on SYS_HAS_EARLY_PRINTK
diff --git a/arch/mips/cavium-octeon/octeon-memcpy.S b/arch/mips/cavium-octeon/octeon-memcpy.S
index 600d018cf354..0a515cde1c18 100644
--- a/arch/mips/cavium-octeon/octeon-memcpy.S
+++ b/arch/mips/cavium-octeon/octeon-memcpy.S
@@ -154,8 +154,6 @@ FEXPORT(__raw_copy_from_user)
EXPORT_SYMBOL(__raw_copy_from_user)
FEXPORT(__raw_copy_to_user)
EXPORT_SYMBOL(__raw_copy_to_user)
-FEXPORT(__raw_copy_in_user)
-EXPORT_SYMBOL(__raw_copy_in_user)
/*
* Note: dst & src may be unaligned, len may be 0
* Temps
diff --git a/arch/mips/configs/lemote2f_defconfig b/arch/mips/configs/lemote2f_defconfig
index aaf9d5e0aa2c..791894c4d8fb 100644
--- a/arch/mips/configs/lemote2f_defconfig
+++ b/arch/mips/configs/lemote2f_defconfig
@@ -116,7 +116,6 @@ CONFIG_8139TOO=y
CONFIG_R8169=y
CONFIG_USB_USBNET=m
CONFIG_USB_NET_CDC_EEM=m
-CONFIG_INPUT_POLLDEV=m
CONFIG_INPUT_EVDEV=y
# CONFIG_MOUSE_PS2_ALPS is not set
# CONFIG_MOUSE_PS2_LOGIPS2PP is not set
diff --git a/arch/mips/configs/pic32mzda_defconfig b/arch/mips/configs/pic32mzda_defconfig
index 63fe2da1b37f..fd567247adc7 100644
--- a/arch/mips/configs/pic32mzda_defconfig
+++ b/arch/mips/configs/pic32mzda_defconfig
@@ -34,7 +34,6 @@ CONFIG_SCSI_CONSTANTS=y
CONFIG_SCSI_SCAN_ASYNC=y
# CONFIG_SCSI_LOWLEVEL is not set
CONFIG_INPUT_LEDS=m
-CONFIG_INPUT_POLLDEV=y
CONFIG_INPUT_MOUSEDEV=m
CONFIG_INPUT_EVDEV=y
CONFIG_INPUT_EVBUG=m
diff --git a/arch/mips/configs/rt305x_defconfig b/arch/mips/configs/rt305x_defconfig
index fec5851c164b..eb359db15dba 100644
--- a/arch/mips/configs/rt305x_defconfig
+++ b/arch/mips/configs/rt305x_defconfig
@@ -90,7 +90,6 @@ CONFIG_PPPOE=m
CONFIG_PPP_ASYNC=m
CONFIG_ISDN=y
CONFIG_INPUT=m
-CONFIG_INPUT_POLLDEV=m
# CONFIG_KEYBOARD_ATKBD is not set
# CONFIG_INPUT_MOUSE is not set
CONFIG_INPUT_MISC=y
diff --git a/arch/mips/configs/xway_defconfig b/arch/mips/configs/xway_defconfig
index 9abbc0debc2a..eeb689f715cb 100644
--- a/arch/mips/configs/xway_defconfig
+++ b/arch/mips/configs/xway_defconfig
@@ -96,7 +96,6 @@ CONFIG_PPPOE=m
CONFIG_PPP_ASYNC=m
CONFIG_ISDN=y
CONFIG_INPUT=m
-CONFIG_INPUT_POLLDEV=m
# CONFIG_KEYBOARD_ATKBD is not set
# CONFIG_INPUT_MOUSE is not set
CONFIG_INPUT_MISC=y
diff --git a/arch/mips/include/asm/compat.h b/arch/mips/include/asm/compat.h
index 53f015a1b0a7..bbb3bc5a42fd 100644
--- a/arch/mips/include/asm/compat.h
+++ b/arch/mips/include/asm/compat.h
@@ -96,14 +96,6 @@ struct compat_statfs {
#define COMPAT_OFF_T_MAX 0x7fffffff
-static inline void __user *arch_compat_alloc_user_space(long len)
-{
- struct pt_regs *regs = (struct pt_regs *)
- ((unsigned long) current_thread_info() + THREAD_SIZE - 32) - 1;
-
- return (void __user *) (regs->regs[29] - len);
-}
-
struct compat_ipc64_perm {
compat_key_t key;
__compat_uid32_t uid;
diff --git a/arch/mips/include/asm/uaccess.h b/arch/mips/include/asm/uaccess.h
index 783fecce65c8..f8f74f9f5883 100644
--- a/arch/mips/include/asm/uaccess.h
+++ b/arch/mips/include/asm/uaccess.h
@@ -428,7 +428,6 @@ do { \
extern size_t __raw_copy_from_user(void *__to, const void *__from, size_t __n);
extern size_t __raw_copy_to_user(void *__to, const void *__from, size_t __n);
-extern size_t __raw_copy_in_user(void *__to, const void *__from, size_t __n);
static inline unsigned long
raw_copy_from_user(void *to, const void __user *from, unsigned long n)
@@ -480,31 +479,6 @@ raw_copy_to_user(void __user *to, const void *from, unsigned long n)
#define INLINE_COPY_FROM_USER
#define INLINE_COPY_TO_USER
-static inline unsigned long
-raw_copy_in_user(void __user *to, const void __user *from, unsigned long n)
-{
- register void __user *__cu_to_r __asm__("$4");
- register const void __user *__cu_from_r __asm__("$5");
- register long __cu_len_r __asm__("$6");
-
- __cu_to_r = to;
- __cu_from_r = from;
- __cu_len_r = n;
-
- __asm__ __volatile__(
- ".set\tnoreorder\n\t"
- __MODULE_JAL(__raw_copy_in_user)
- ".set\tnoat\n\t"
- __UA_ADDU "\t$1, %1, %2\n\t"
- ".set\tat\n\t"
- ".set\treorder"
- : "+r" (__cu_to_r), "+r" (__cu_from_r), "+r" (__cu_len_r)
- :
- : "$8", "$9", "$10", "$11", "$12", "$14", "$15", "$24", "$31",
- DADDI_SCRATCH, "memory");
- return __cu_len_r;
-}
-
extern __kernel_size_t __bzero(void __user *addr, __kernel_size_t size);
/*
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index 56c8d3cf42ed..70e32de2bcaa 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -239,9 +239,9 @@
228 n32 clock_nanosleep sys_clock_nanosleep_time32
229 n32 tgkill sys_tgkill
230 n32 utimes sys_utimes_time32
-231 n32 mbind compat_sys_mbind
-232 n32 get_mempolicy compat_sys_get_mempolicy
-233 n32 set_mempolicy compat_sys_set_mempolicy
+231 n32 mbind sys_mbind
+232 n32 get_mempolicy sys_get_mempolicy
+233 n32 set_mempolicy sys_set_mempolicy
234 n32 mq_open compat_sys_mq_open
235 n32 mq_unlink sys_mq_unlink
236 n32 mq_timedsend sys_mq_timedsend_time32
@@ -258,7 +258,7 @@
247 n32 inotify_init sys_inotify_init
248 n32 inotify_add_watch sys_inotify_add_watch
249 n32 inotify_rm_watch sys_inotify_rm_watch
-250 n32 migrate_pages compat_sys_migrate_pages
+250 n32 migrate_pages sys_migrate_pages
251 n32 openat sys_openat
252 n32 mkdirat sys_mkdirat
253 n32 mknodat sys_mknodat
@@ -279,7 +279,7 @@
268 n32 sync_file_range sys_sync_file_range
269 n32 tee sys_tee
270 n32 vmsplice sys_vmsplice
-271 n32 move_pages compat_sys_move_pages
+271 n32 move_pages sys_move_pages
272 n32 set_robust_list compat_sys_set_robust_list
273 n32 get_robust_list compat_sys_get_robust_list
274 n32 kexec_load compat_sys_kexec_load
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index 201237fd0f43..a61c35edaa74 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -279,9 +279,9 @@
265 o32 clock_nanosleep sys_clock_nanosleep_time32
266 o32 tgkill sys_tgkill
267 o32 utimes sys_utimes_time32
-268 o32 mbind sys_mbind compat_sys_mbind
-269 o32 get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy
-270 o32 set_mempolicy sys_set_mempolicy compat_sys_set_mempolicy
+268 o32 mbind sys_mbind
+269 o32 get_mempolicy sys_get_mempolicy
+270 o32 set_mempolicy sys_set_mempolicy
271 o32 mq_open sys_mq_open compat_sys_mq_open
272 o32 mq_unlink sys_mq_unlink
273 o32 mq_timedsend sys_mq_timedsend_time32
@@ -298,7 +298,7 @@
284 o32 inotify_init sys_inotify_init
285 o32 inotify_add_watch sys_inotify_add_watch
286 o32 inotify_rm_watch sys_inotify_rm_watch
-287 o32 migrate_pages sys_migrate_pages compat_sys_migrate_pages
+287 o32 migrate_pages sys_migrate_pages
288 o32 openat sys_openat compat_sys_openat
289 o32 mkdirat sys_mkdirat
290 o32 mknodat sys_mknodat
@@ -319,7 +319,7 @@
305 o32 sync_file_range sys_sync_file_range sys32_sync_file_range
306 o32 tee sys_tee
307 o32 vmsplice sys_vmsplice
-308 o32 move_pages sys_move_pages compat_sys_move_pages
+308 o32 move_pages sys_move_pages
309 o32 set_robust_list sys_set_robust_list compat_sys_set_robust_list
310 o32 get_robust_list sys_get_robust_list compat_sys_get_robust_list
311 o32 kexec_load sys_kexec_load compat_sys_kexec_load
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index af9dd029a4e1..75c6f264c626 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -41,8 +41,6 @@
const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
KVM_GENERIC_VM_STATS()
};
-static_assert(ARRAY_SIZE(kvm_vm_stats_desc) ==
- sizeof(struct kvm_vm_stat) / sizeof(u64));
const struct kvm_stats_header kvm_vm_stats_header = {
.name_size = KVM_STATS_NAME_SIZE,
@@ -85,8 +83,6 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
STATS_DESC_COUNTER(VCPU, vz_cpucfg_exits),
#endif
};
-static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) ==
- sizeof(struct kvm_vcpu_stat) / sizeof(u64));
const struct kvm_stats_header kvm_vcpu_stats_header = {
.name_size = KVM_STATS_NAME_SIZE,
diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c
index 43cad10b877d..4adca5abbc72 100644
--- a/arch/mips/kvm/vz.c
+++ b/arch/mips/kvm/vz.c
@@ -388,7 +388,6 @@ static void _kvm_vz_restore_htimer(struct kvm_vcpu *vcpu,
u32 compare, u32 cause)
{
u32 start_count, after_count;
- ktime_t freeze_time;
unsigned long flags;
/*
@@ -396,7 +395,7 @@ static void _kvm_vz_restore_htimer(struct kvm_vcpu *vcpu,
* this with interrupts disabled to avoid latency.
*/
local_irq_save(flags);
- freeze_time = kvm_mips_freeze_hrtimer(vcpu, &start_count);
+ kvm_mips_freeze_hrtimer(vcpu, &start_count);
write_c0_gtoffset(start_count - read_c0_count());
local_irq_restore(flags);
diff --git a/arch/mips/lib/memcpy.S b/arch/mips/lib/memcpy.S
index e19fb98b5d38..277c32296636 100644
--- a/arch/mips/lib/memcpy.S
+++ b/arch/mips/lib/memcpy.S
@@ -666,8 +666,6 @@ FEXPORT(__raw_copy_from_user)
EXPORT_SYMBOL(__raw_copy_from_user)
FEXPORT(__raw_copy_to_user)
EXPORT_SYMBOL(__raw_copy_to_user)
-FEXPORT(__raw_copy_in_user)
-EXPORT_SYMBOL(__raw_copy_in_user)
#endif
/* Legacy Mode, user <-> user */
__BUILD_COPY_USER LEGACY_MODE USEROP USEROP
@@ -703,13 +701,4 @@ EXPORT_SYMBOL(__raw_copy_to_user)
__BUILD_COPY_USER EVA_MODE KERNELOP USEROP
END(__raw_copy_to_user)
-/*
- * __copy_in_user (EVA)
- */
-
-LEAF(__raw_copy_in_user)
-EXPORT_SYMBOL(__raw_copy_in_user)
-__BUILD_COPY_USER EVA_MODE USEROP USEROP
-END(__raw_copy_in_user)
-
#endif
diff --git a/arch/nds32/Kconfig b/arch/nds32/Kconfig
index 9c9f3877abf9..aea26e739543 100644
--- a/arch/nds32/Kconfig
+++ b/arch/nds32/Kconfig
@@ -46,6 +46,7 @@ config NDS32
select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_DYNAMIC_FTRACE
select SET_FS
+ select TRACE_IRQFLAGS_SUPPORT
help
Andes(nds32) Linux support.
@@ -62,9 +63,6 @@ config GENERIC_LOCKBREAK
def_bool y
depends on PREEMPTION
-config TRACE_IRQFLAGS_SUPPORT
- def_bool y
-
config STACKTRACE_SUPPORT
def_bool y
diff --git a/arch/nds32/kernel/setup.c b/arch/nds32/kernel/setup.c
index 41725eaf8bac..b3d34d646652 100644
--- a/arch/nds32/kernel/setup.c
+++ b/arch/nds32/kernel/setup.c
@@ -244,7 +244,6 @@ static void __init setup_memory(void)
unsigned long ram_start_pfn;
unsigned long free_ram_start_pfn;
phys_addr_t memory_start, memory_end;
- struct memblock_region *region;
memory_end = memory_start = 0;
diff --git a/arch/nds32/kernel/traps.c b/arch/nds32/kernel/traps.c
index ee0d9ae192a5..f06421c645af 100644
--- a/arch/nds32/kernel/traps.c
+++ b/arch/nds32/kernel/traps.c
@@ -183,11 +183,6 @@ void __pgd_error(const char *file, int line, unsigned long val)
}
extern char *exception_vector, *exception_vector_end;
-void __init trap_init(void)
-{
- return;
-}
-
void __init early_trap_init(void)
{
unsigned long ivb = 0;
diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig
index 3efe5533ea1c..33fd06f5fa41 100644
--- a/arch/nios2/Kconfig
+++ b/arch/nios2/Kconfig
@@ -41,9 +41,6 @@ config NO_IOPORT_MAP
config FPU
def_bool n
-config TRACE_IRQFLAGS_SUPPORT
- def_bool n
-
menu "Kernel features"
source "kernel/Kconfig.hz"
diff --git a/arch/nios2/kernel/traps.c b/arch/nios2/kernel/traps.c
index b172da4eb1a9..596986a74a26 100644
--- a/arch/nios2/kernel/traps.c
+++ b/arch/nios2/kernel/traps.c
@@ -105,11 +105,6 @@ void show_stack(struct task_struct *task, unsigned long *stack,
printk("%s\n", loglvl);
}
-void __init trap_init(void)
-{
- /* Nothing to do here */
-}
-
/* Breakpoint handler */
asmlinkage void breakpoint_c(struct pt_regs *fp)
{
diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig
index 50035a9816c8..e804026b4797 100644
--- a/arch/openrisc/Kconfig
+++ b/arch/openrisc/Kconfig
@@ -37,6 +37,7 @@ config OPENRISC
select GENERIC_IRQ_MULTI_HANDLER
select MMU_GATHER_NO_RANGE if MMU
select SET_FS
+ select TRACE_IRQFLAGS_SUPPORT
config CPU_BIG_ENDIAN
def_bool y
@@ -50,9 +51,6 @@ config GENERIC_HWEIGHT
config NO_IOPORT_MAP
def_bool y
-config TRACE_IRQFLAGS_SUPPORT
- def_bool y
-
# For now, use generic checksum functions
#These can be reimplemented in assembly later if so inclined
config GENERIC_CSUM
diff --git a/arch/openrisc/kernel/traps.c b/arch/openrisc/kernel/traps.c
index 4d61333c2623..aa1e709405ac 100644
--- a/arch/openrisc/kernel/traps.c
+++ b/arch/openrisc/kernel/traps.c
@@ -231,11 +231,6 @@ void unhandled_exception(struct pt_regs *regs, int ea, int vector)
die("Oops", regs, 9);
}
-void __init trap_init(void)
-{
- /* Nothing needs to be done */
-}
-
asmlinkage void do_trap(struct pt_regs *regs, unsigned long address)
{
force_sig_fault(SIGTRAP, TRAP_BRKPT, (void __user *)regs->pc);
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index 95d4bbf4e455..4742b6f169b7 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -10,7 +10,6 @@ config PARISC
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_STRICT_KERNEL_RWX
select ARCH_HAS_UBSAN_SANITIZE_ALL
- select ARCH_HAS_STRNLEN_USER
select ARCH_NO_SG_CHAIN
select ARCH_SUPPORTS_HUGETLBFS if PA20
select ARCH_SUPPORTS_MEMORY_FAILURE
@@ -65,7 +64,7 @@ config PARISC
select HAVE_KPROBES_ON_FTRACE
select HAVE_DYNAMIC_FTRACE_WITH_REGS
select HAVE_SOFTIRQ_ON_OWN_STACK if IRQSTACKS
- select SET_FS
+ select TRACE_IRQFLAGS_SUPPORT
help
The PA-RISC microprocessor is designed by Hewlett-Packard and used
diff --git a/arch/parisc/Kconfig.debug b/arch/parisc/Kconfig.debug
index 1478ded0e247..f66554cd5c45 100644
--- a/arch/parisc/Kconfig.debug
+++ b/arch/parisc/Kconfig.debug
@@ -1,4 +1 @@
# SPDX-License-Identifier: GPL-2.0
-
-config TRACE_IRQFLAGS_SUPPORT
- def_bool y
diff --git a/arch/parisc/boot/compressed/Makefile b/arch/parisc/boot/compressed/Makefile
index dff453687530..9fe54878167d 100644
--- a/arch/parisc/boot/compressed/Makefile
+++ b/arch/parisc/boot/compressed/Makefile
@@ -26,7 +26,7 @@ endif
OBJECTS += $(obj)/head.o $(obj)/real2.o $(obj)/firmware.o $(obj)/misc.o $(obj)/piggy.o
LDFLAGS_vmlinux := -X -e startup --as-needed -T
-$(obj)/vmlinux: $(obj)/vmlinux.lds $(OBJECTS) $(LIBGCC)
+$(obj)/vmlinux: $(obj)/vmlinux.lds $(OBJECTS) $(LIBGCC) FORCE
$(call if_changed,ld)
sed-sizes := -e 's/^\([0-9a-fA-F]*\) . \(__bss_start\|_end\|parisc_kernel_start\)$$/\#define SZ\2 0x\1/p'
@@ -34,7 +34,7 @@ sed-sizes := -e 's/^\([0-9a-fA-F]*\) . \(__bss_start\|_end\|parisc_kernel_start\
quiet_cmd_sizes = GEN $@
cmd_sizes = $(NM) $< | sed -n $(sed-sizes) > $@
-$(obj)/sizes.h: vmlinux
+$(obj)/sizes.h: vmlinux FORCE
$(call if_changed,sizes)
AFLAGS_head.o += -I$(objtree)/$(obj) -DBOOTLOADER
@@ -70,19 +70,19 @@ suffix-$(CONFIG_KERNEL_LZMA) := lzma
suffix-$(CONFIG_KERNEL_LZO) := lzo
suffix-$(CONFIG_KERNEL_XZ) := xz
-$(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y)
+$(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE
$(call if_changed,gzip)
-$(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y)
+$(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE
$(call if_changed,bzip2)
-$(obj)/vmlinux.bin.lz4: $(vmlinux.bin.all-y)
+$(obj)/vmlinux.bin.lz4: $(vmlinux.bin.all-y) FORCE
$(call if_changed,lz4)
-$(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y)
+$(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE
$(call if_changed,lzma)
-$(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y)
+$(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE
$(call if_changed,lzo)
-$(obj)/vmlinux.bin.xz: $(vmlinux.bin.all-y)
+$(obj)/vmlinux.bin.xz: $(vmlinux.bin.all-y) FORCE
$(call if_changed,xzkern)
LDFLAGS_piggy.o := -r --format binary --oformat $(LD_BFD) -T
-$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.$(suffix-y)
+$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.$(suffix-y) FORCE
$(call if_changed,ld)
diff --git a/arch/parisc/configs/generic-32bit_defconfig b/arch/parisc/configs/generic-32bit_defconfig
index 7611d48c599e..dd14e3131325 100644
--- a/arch/parisc/configs/generic-32bit_defconfig
+++ b/arch/parisc/configs/generic-32bit_defconfig
@@ -111,7 +111,6 @@ CONFIG_PPP_BSDCOMP=m
CONFIG_PPP_DEFLATE=m
CONFIG_PPPOE=m
# CONFIG_WLAN is not set
-CONFIG_INPUT_POLLDEV=y
CONFIG_KEYBOARD_HIL_OLD=m
CONFIG_KEYBOARD_HIL=m
CONFIG_MOUSE_SERIAL=y
diff --git a/arch/parisc/include/asm/compat.h b/arch/parisc/include/asm/compat.h
index b5d90e82b65d..c04f5a637c39 100644
--- a/arch/parisc/include/asm/compat.h
+++ b/arch/parisc/include/asm/compat.h
@@ -163,12 +163,6 @@ struct compat_shmid64_ds {
#define COMPAT_ELF_NGREG 80
typedef compat_ulong_t compat_elf_gregset_t[COMPAT_ELF_NGREG];
-static __inline__ void __user *arch_compat_alloc_user_space(long len)
-{
- struct pt_regs *regs = &current->thread.regs;
- return (void __user *)regs->gr[30];
-}
-
static inline int __is_compat_task(struct task_struct *t)
{
return test_tsk_thread_flag(t, TIF_32BIT);
diff --git a/arch/parisc/include/asm/processor.h b/arch/parisc/include/asm/processor.h
index b5fbcd2c1780..eeb7da064289 100644
--- a/arch/parisc/include/asm/processor.h
+++ b/arch/parisc/include/asm/processor.h
@@ -101,10 +101,6 @@ DECLARE_PER_CPU(struct cpuinfo_parisc, cpu_data);
#define CPU_HVERSION ((boot_cpu_data.hversion >> 4) & 0x0FFF)
-typedef struct {
- int seg;
-} mm_segment_t;
-
#define ARCH_MIN_TASKALIGN 8
struct thread_struct {
diff --git a/arch/parisc/include/asm/rt_sigframe.h b/arch/parisc/include/asm/rt_sigframe.h
index 2b3010ade00e..4b9e3d707571 100644
--- a/arch/parisc/include/asm/rt_sigframe.h
+++ b/arch/parisc/include/asm/rt_sigframe.h
@@ -2,7 +2,7 @@
#ifndef _ASM_PARISC_RT_SIGFRAME_H
#define _ASM_PARISC_RT_SIGFRAME_H
-#define SIGRETURN_TRAMP 4
+#define SIGRETURN_TRAMP 3
#define SIGRESTARTBLOCK_TRAMP 5
#define TRAMP_SIZE (SIGRETURN_TRAMP + SIGRESTARTBLOCK_TRAMP)
diff --git a/arch/parisc/include/asm/thread_info.h b/arch/parisc/include/asm/thread_info.h
index 0bd38a972cea..00ad50fef769 100644
--- a/arch/parisc/include/asm/thread_info.h
+++ b/arch/parisc/include/asm/thread_info.h
@@ -11,7 +11,6 @@
struct thread_info {
struct task_struct *task; /* main task structure */
unsigned long flags; /* thread_info flags (see TIF_*) */
- mm_segment_t addr_limit; /* user-level address space limit */
__u32 cpu; /* current CPU */
int preempt_count; /* 0=premptable, <0=BUG; will also serve as bh-counter */
};
@@ -21,7 +20,6 @@ struct thread_info {
.task = &tsk, \
.flags = 0, \
.cpu = 0, \
- .addr_limit = KERNEL_DS, \
.preempt_count = INIT_PREEMPT_COUNT, \
}
diff --git a/arch/parisc/include/asm/uaccess.h b/arch/parisc/include/asm/uaccess.h
index ed2cd4fb479b..192ad9e11b25 100644
--- a/arch/parisc/include/asm/uaccess.h
+++ b/arch/parisc/include/asm/uaccess.h
@@ -11,14 +11,6 @@
#include <linux/bug.h>
#include <linux/string.h>
-#define KERNEL_DS ((mm_segment_t){0})
-#define USER_DS ((mm_segment_t){1})
-
-#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg)
-
-#define get_fs() (current_thread_info()->addr_limit)
-#define set_fs(x) (current_thread_info()->addr_limit = (x))
-
/*
* Note that since kernel addresses are in a separate address space on
* parisc, we don't need to do anything for access_ok().
@@ -33,11 +25,11 @@
#define get_user __get_user
#if !defined(CONFIG_64BIT)
-#define LDD_USER(val, ptr) __get_user_asm64(val, ptr)
-#define STD_USER(x, ptr) __put_user_asm64(x, ptr)
+#define LDD_USER(sr, val, ptr) __get_user_asm64(sr, val, ptr)
+#define STD_USER(sr, x, ptr) __put_user_asm64(sr, x, ptr)
#else
-#define LDD_USER(val, ptr) __get_user_asm(val, "ldd", ptr)
-#define STD_USER(x, ptr) __put_user_asm("std", x, ptr)
+#define LDD_USER(sr, val, ptr) __get_user_asm(sr, val, "ldd", ptr)
+#define STD_USER(sr, x, ptr) __put_user_asm(sr, "std", x, ptr)
#endif
/*
@@ -67,28 +59,15 @@ struct exception_table_entry {
#define ASM_EXCEPTIONTABLE_ENTRY_EFAULT( fault_addr, except_addr )\
ASM_EXCEPTIONTABLE_ENTRY( fault_addr, except_addr + 1)
-/*
- * load_sr2() preloads the space register %%sr2 - based on the value of
- * get_fs() - with either a value of 0 to access kernel space (KERNEL_DS which
- * is 0), or with the current value of %%sr3 to access user space (USER_DS)
- * memory. The following __get_user_asm() and __put_user_asm() functions have
- * %%sr2 hard-coded to access the requested memory.
- */
-#define load_sr2() \
- __asm__(" or,= %0,%%r0,%%r0\n\t" \
- " mfsp %%sr3,%0\n\t" \
- " mtsp %0,%%sr2\n\t" \
- : : "r"(get_fs()) : )
-
-#define __get_user_internal(val, ptr) \
+#define __get_user_internal(sr, val, ptr) \
({ \
register long __gu_err __asm__ ("r8") = 0; \
\
switch (sizeof(*(ptr))) { \
- case 1: __get_user_asm(val, "ldb", ptr); break; \
- case 2: __get_user_asm(val, "ldh", ptr); break; \
- case 4: __get_user_asm(val, "ldw", ptr); break; \
- case 8: LDD_USER(val, ptr); break; \
+ case 1: __get_user_asm(sr, val, "ldb", ptr); break; \
+ case 2: __get_user_asm(sr, val, "ldh", ptr); break; \
+ case 4: __get_user_asm(sr, val, "ldw", ptr); break; \
+ case 8: LDD_USER(sr, val, ptr); break; \
default: BUILD_BUG(); \
} \
\
@@ -97,15 +76,14 @@ struct exception_table_entry {
#define __get_user(val, ptr) \
({ \
- load_sr2(); \
- __get_user_internal(val, ptr); \
+ __get_user_internal("%%sr3,", val, ptr); \
})
-#define __get_user_asm(val, ldx, ptr) \
+#define __get_user_asm(sr, val, ldx, ptr) \
{ \
register long __gu_val; \
\
- __asm__("1: " ldx " 0(%%sr2,%2),%0\n" \
+ __asm__("1: " ldx " 0(" sr "%2),%0\n" \
"9:\n" \
ASM_EXCEPTIONTABLE_ENTRY_EFAULT(1b, 9b) \
: "=r"(__gu_val), "=r"(__gu_err) \
@@ -114,9 +92,22 @@ struct exception_table_entry {
(val) = (__force __typeof__(*(ptr))) __gu_val; \
}
+#define HAVE_GET_KERNEL_NOFAULT
+#define __get_kernel_nofault(dst, src, type, err_label) \
+{ \
+ type __z; \
+ long __err; \
+ __err = __get_user_internal("%%sr0,", __z, (type *)(src)); \
+ if (unlikely(__err)) \
+ goto err_label; \
+ else \
+ *(type *)(dst) = __z; \
+}
+
+
#if !defined(CONFIG_64BIT)
-#define __get_user_asm64(val, ptr) \
+#define __get_user_asm64(sr, val, ptr) \
{ \
union { \
unsigned long long l; \
@@ -124,8 +115,8 @@ struct exception_table_entry {
} __gu_tmp; \
\
__asm__(" copy %%r0,%R0\n" \
- "1: ldw 0(%%sr2,%2),%0\n" \
- "2: ldw 4(%%sr2,%2),%R0\n" \
+ "1: ldw 0(" sr "%2),%0\n" \
+ "2: ldw 4(" sr "%2),%R0\n" \
"9:\n" \
ASM_EXCEPTIONTABLE_ENTRY_EFAULT(1b, 9b) \
ASM_EXCEPTIONTABLE_ENTRY_EFAULT(2b, 9b) \
@@ -138,16 +129,16 @@ struct exception_table_entry {
#endif /* !defined(CONFIG_64BIT) */
-#define __put_user_internal(x, ptr) \
+#define __put_user_internal(sr, x, ptr) \
({ \
register long __pu_err __asm__ ("r8") = 0; \
__typeof__(*(ptr)) __x = (__typeof__(*(ptr)))(x); \
\
switch (sizeof(*(ptr))) { \
- case 1: __put_user_asm("stb", __x, ptr); break; \
- case 2: __put_user_asm("sth", __x, ptr); break; \
- case 4: __put_user_asm("stw", __x, ptr); break; \
- case 8: STD_USER(__x, ptr); break; \
+ case 1: __put_user_asm(sr, "stb", __x, ptr); break; \
+ case 2: __put_user_asm(sr, "sth", __x, ptr); break; \
+ case 4: __put_user_asm(sr, "stw", __x, ptr); break; \
+ case 8: STD_USER(sr, __x, ptr); break; \
default: BUILD_BUG(); \
} \
\
@@ -156,10 +147,20 @@ struct exception_table_entry {
#define __put_user(x, ptr) \
({ \
- load_sr2(); \
- __put_user_internal(x, ptr); \
+ __put_user_internal("%%sr3,", x, ptr); \
})
+#define __put_kernel_nofault(dst, src, type, err_label) \
+{ \
+ type __z = *(type *)(src); \
+ long __err; \
+ __err = __put_user_internal("%%sr0,", __z, (type *)(dst)); \
+ if (unlikely(__err)) \
+ goto err_label; \
+}
+
+
+
/*
* The "__put_user/kernel_asm()" macros tell gcc they read from memory
@@ -170,26 +171,26 @@ struct exception_table_entry {
* r8 is already listed as err.
*/
-#define __put_user_asm(stx, x, ptr) \
- __asm__ __volatile__ ( \
- "1: " stx " %2,0(%%sr2,%1)\n" \
- "9:\n" \
- ASM_EXCEPTIONTABLE_ENTRY_EFAULT(1b, 9b) \
- : "=r"(__pu_err) \
+#define __put_user_asm(sr, stx, x, ptr) \
+ __asm__ __volatile__ ( \
+ "1: " stx " %2,0(" sr "%1)\n" \
+ "9:\n" \
+ ASM_EXCEPTIONTABLE_ENTRY_EFAULT(1b, 9b) \
+ : "=r"(__pu_err) \
: "r"(ptr), "r"(x), "0"(__pu_err))
#if !defined(CONFIG_64BIT)
-#define __put_user_asm64(__val, ptr) do { \
- __asm__ __volatile__ ( \
- "1: stw %2,0(%%sr2,%1)\n" \
- "2: stw %R2,4(%%sr2,%1)\n" \
- "9:\n" \
- ASM_EXCEPTIONTABLE_ENTRY_EFAULT(1b, 9b) \
- ASM_EXCEPTIONTABLE_ENTRY_EFAULT(2b, 9b) \
- : "=r"(__pu_err) \
- : "r"(ptr), "r"(__val), "0"(__pu_err)); \
+#define __put_user_asm64(sr, __val, ptr) do { \
+ __asm__ __volatile__ ( \
+ "1: stw %2,0(" sr "%1)\n" \
+ "2: stw %R2,4(" sr "%1)\n" \
+ "9:\n" \
+ ASM_EXCEPTIONTABLE_ENTRY_EFAULT(1b, 9b) \
+ ASM_EXCEPTIONTABLE_ENTRY_EFAULT(2b, 9b) \
+ : "=r"(__pu_err) \
+ : "r"(ptr), "r"(__val), "0"(__pu_err)); \
} while (0)
#endif /* !defined(CONFIG_64BIT) */
@@ -200,14 +201,12 @@ struct exception_table_entry {
*/
extern long strncpy_from_user(char *, const char __user *, long);
-extern unsigned lclear_user(void __user *, unsigned long);
-extern long lstrnlen_user(const char __user *, long);
+extern __must_check unsigned lclear_user(void __user *, unsigned long);
+extern __must_check long strnlen_user(const char __user *src, long n);
/*
* Complex access routines -- macros
*/
-#define user_addr_max() (~0UL)
-#define strnlen_user lstrnlen_user
#define clear_user lclear_user
#define __clear_user lclear_user
@@ -215,8 +214,6 @@ unsigned long __must_check raw_copy_to_user(void __user *dst, const void *src,
unsigned long len);
unsigned long __must_check raw_copy_from_user(void *dst, const void __user *src,
unsigned long len);
-unsigned long __must_check raw_copy_in_user(void __user *dst, const void __user *src,
- unsigned long len);
#define INLINE_COPY_TO_USER
#define INLINE_COPY_FROM_USER
diff --git a/arch/parisc/kernel/asm-offsets.c b/arch/parisc/kernel/asm-offsets.c
index 33113ba24054..22924a3f1728 100644
--- a/arch/parisc/kernel/asm-offsets.c
+++ b/arch/parisc/kernel/asm-offsets.c
@@ -230,7 +230,6 @@ int main(void)
DEFINE(TI_TASK, offsetof(struct thread_info, task));
DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
DEFINE(TI_CPU, offsetof(struct thread_info, cpu));
- DEFINE(TI_SEGMENT, offsetof(struct thread_info, addr_limit));
DEFINE(TI_PRE_COUNT, offsetof(struct thread_info, preempt_count));
DEFINE(THREAD_SZ, sizeof(struct thread_info));
/* THREAD_SZ_ALGN includes space for a stack frame. */
diff --git a/arch/parisc/kernel/parisc_ksyms.c b/arch/parisc/kernel/parisc_ksyms.c
index e8a6a751dfd8..00297e8e1c88 100644
--- a/arch/parisc/kernel/parisc_ksyms.c
+++ b/arch/parisc/kernel/parisc_ksyms.c
@@ -32,7 +32,6 @@ EXPORT_SYMBOL(__xchg64);
#include <linux/uaccess.h>
EXPORT_SYMBOL(lclear_user);
-EXPORT_SYMBOL(lstrnlen_user);
#ifndef CONFIG_64BIT
/* Needed so insmod can set dp value */
diff --git a/arch/parisc/kernel/setup.c b/arch/parisc/kernel/setup.c
index 3fb86ee507dd..cceb09855e03 100644
--- a/arch/parisc/kernel/setup.c
+++ b/arch/parisc/kernel/setup.c
@@ -150,8 +150,6 @@ void __init setup_arch(char **cmdline_p)
#ifdef CONFIG_PA11
dma_ops_init();
#endif
-
- clear_sched_clock_stable();
}
/*
diff --git a/arch/parisc/kernel/signal.c b/arch/parisc/kernel/signal.c
index db1a47cf424d..bbfe23c40c01 100644
--- a/arch/parisc/kernel/signal.c
+++ b/arch/parisc/kernel/signal.c
@@ -237,18 +237,22 @@ setup_rt_frame(struct ksignal *ksig, sigset_t *set, struct pt_regs *regs,
#endif
usp = (regs->gr[30] & ~(0x01UL));
+ sigframe_size = PARISC_RT_SIGFRAME_SIZE;
#ifdef CONFIG_64BIT
if (is_compat_task()) {
/* The gcc alloca implementation leaves garbage in the upper 32 bits of sp */
usp = (compat_uint_t)usp;
+ sigframe_size = PARISC_RT_SIGFRAME_SIZE32;
}
#endif
- /*FIXME: frame_size parameter is unused, remove it. */
- frame = get_sigframe(&ksig->ka, usp, sizeof(*frame));
+ frame = get_sigframe(&ksig->ka, usp, sigframe_size);
DBG(1,"SETUP_RT_FRAME: START\n");
DBG(1,"setup_rt_frame: frame %p info %p\n", frame, ksig->info);
+ start = (unsigned long) frame;
+ if (start >= user_addr_max() - sigframe_size)
+ return -EFAULT;
#ifdef CONFIG_64BIT
@@ -284,32 +288,21 @@ setup_rt_frame(struct ksignal *ksig, sigset_t *set, struct pt_regs *regs,
already in userspace. The first words of tramp are used to
save the previous sigrestartblock trampoline that might be
on the stack. We start the sigreturn trampoline at
- SIGRESTARTBLOCK_TRAMP+X. */
+ SIGRESTARTBLOCK_TRAMP. */
err |= __put_user(in_syscall ? INSN_LDI_R25_1 : INSN_LDI_R25_0,
&frame->tramp[SIGRESTARTBLOCK_TRAMP+0]);
- err |= __put_user(INSN_LDI_R20,
- &frame->tramp[SIGRESTARTBLOCK_TRAMP+1]);
err |= __put_user(INSN_BLE_SR2_R0,
+ &frame->tramp[SIGRESTARTBLOCK_TRAMP+1]);
+ err |= __put_user(INSN_LDI_R20,
&frame->tramp[SIGRESTARTBLOCK_TRAMP+2]);
- err |= __put_user(INSN_NOP, &frame->tramp[SIGRESTARTBLOCK_TRAMP+3]);
-
-#if DEBUG_SIG
- /* Assert that we're flushing in the correct space... */
- {
- unsigned long sid;
- asm ("mfsp %%sr3,%0" : "=r" (sid));
- DBG(1,"setup_rt_frame: Flushing 64 bytes at space %#x offset %p\n",
- sid, frame->tramp);
- }
-#endif
- start = (unsigned long) &frame->tramp[0];
- end = (unsigned long) &frame->tramp[TRAMP_SIZE];
+ start = (unsigned long) &frame->tramp[SIGRESTARTBLOCK_TRAMP+0];
+ end = (unsigned long) &frame->tramp[SIGRESTARTBLOCK_TRAMP+3];
flush_user_dcache_range_asm(start, end);
flush_user_icache_range_asm(start, end);
/* TRAMP Words 0-4, Length 5 = SIGRESTARTBLOCK_TRAMP
- * TRAMP Words 5-9, Length 4 = SIGRETURN_TRAMP
+ * TRAMP Words 5-7, Length 3 = SIGRETURN_TRAMP
* So the SIGRETURN_TRAMP is at the end of SIGRESTARTBLOCK_TRAMP
*/
rp = (unsigned long) &frame->tramp[SIGRESTARTBLOCK_TRAMP];
@@ -353,11 +346,6 @@ setup_rt_frame(struct ksignal *ksig, sigset_t *set, struct pt_regs *regs,
/* The syscall return path will create IAOQ values from r31.
*/
- sigframe_size = PARISC_RT_SIGFRAME_SIZE;
-#ifdef CONFIG_64BIT
- if (is_compat_task())
- sigframe_size = PARISC_RT_SIGFRAME_SIZE32;
-#endif
if (in_syscall) {
regs->gr[31] = haddr;
#ifdef CONFIG_64BIT
@@ -501,7 +489,6 @@ syscall_restart(struct pt_regs *regs, struct k_sigaction *ka)
DBG(1,"ERESTARTNOHAND: returning -EINTR\n");
regs->gr[28] = -EINTR;
break;
-
case -ERESTARTSYS:
if (!(ka->sa.sa_flags & SA_RESTART)) {
DBG(1,"ERESTARTSYS: putting -EINTR\n");
@@ -529,6 +516,10 @@ insert_restart_trampoline(struct pt_regs *regs)
unsigned long end = (unsigned long) &usp[5];
long err = 0;
+ /* check that we don't exceed the stack */
+ if (A(&usp[0]) >= user_addr_max() - 5 * sizeof(int))
+ return;
+
/* Setup a trampoline to restart the syscall
* with __NR_restart_syscall
*
@@ -569,10 +560,6 @@ insert_restart_trampoline(struct pt_regs *regs)
}
/*
- * Note that 'init' is a special process: it doesn't get signals it doesn't
- * want to handle. Thus you cannot kill init even with a SIGKILL even by
- * mistake.
- *
* We need to be able to restore the syscall arguments (r21-r26) to
* restart syscalls. Thus, the syscall path should save them in the
* pt_regs structure (it's okay to do so since they are caller-save
diff --git a/arch/parisc/kernel/signal32.h b/arch/parisc/kernel/signal32.h
index f166250f2d06..a5bdbb5678b7 100644
--- a/arch/parisc/kernel/signal32.h
+++ b/arch/parisc/kernel/signal32.h
@@ -36,7 +36,7 @@ struct compat_regfile {
compat_int_t rf_sar;
};
-#define COMPAT_SIGRETURN_TRAMP 4
+#define COMPAT_SIGRETURN_TRAMP 3
#define COMPAT_SIGRESTARTBLOCK_TRAMP 5
#define COMPAT_TRAMP_SIZE (COMPAT_SIGRETURN_TRAMP + \
COMPAT_SIGRESTARTBLOCK_TRAMP)
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index 0bf854b70612..bf751e0732b7 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -292,9 +292,9 @@
258 32 clock_nanosleep sys_clock_nanosleep_time32
258 64 clock_nanosleep sys_clock_nanosleep
259 common tgkill sys_tgkill
-260 common mbind sys_mbind compat_sys_mbind
-261 common get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy
-262 common set_mempolicy sys_set_mempolicy compat_sys_set_mempolicy
+260 common mbind sys_mbind
+261 common get_mempolicy sys_get_mempolicy
+262 common set_mempolicy sys_set_mempolicy
# 263 was vserver
264 common add_key sys_add_key
265 common request_key sys_request_key
@@ -331,7 +331,7 @@
292 64 sync_file_range sys_sync_file_range
293 common tee sys_tee
294 common vmsplice sys_vmsplice
-295 common move_pages sys_move_pages compat_sys_move_pages
+295 common move_pages sys_move_pages
296 common getcpu sys_getcpu
297 common epoll_pwait sys_epoll_pwait compat_sys_epoll_pwait
298 common statfs64 sys_statfs64 compat_sys_statfs64
diff --git a/arch/parisc/kernel/time.c b/arch/parisc/kernel/time.c
index 08e4d480abe1..9fb1e794831b 100644
--- a/arch/parisc/kernel/time.c
+++ b/arch/parisc/kernel/time.c
@@ -265,6 +265,9 @@ static int __init init_cr16_clocksource(void)
(cpu0_loc == per_cpu(cpu_data, cpu).cpu_loc))
continue;
+ /* mark sched_clock unstable */
+ clear_sched_clock_stable();
+
clocksource_cr16.name = "cr16_unstable";
clocksource_cr16.flags = CLOCK_SOURCE_UNSTABLE;
clocksource_cr16.rating = 0;
@@ -272,10 +275,6 @@ static int __init init_cr16_clocksource(void)
}
}
- /* XXX: We may want to mark sched_clock stable here if cr16 clocks are
- * in sync:
- * (clocksource_cr16.flags == CLOCK_SOURCE_IS_CONTINUOUS) */
-
/* register at clocksource framework */
clocksource_register_hz(&clocksource_cr16,
100 * PAGE0->mem_10msec);
diff --git a/arch/parisc/kernel/traps.c b/arch/parisc/kernel/traps.c
index 8d8441d4562a..747c328fb886 100644
--- a/arch/parisc/kernel/traps.c
+++ b/arch/parisc/kernel/traps.c
@@ -859,7 +859,3 @@ void __init early_trap_init(void)
initialize_ivt(&fault_vector_20);
}
-
-void __init trap_init(void)
-{
-}
diff --git a/arch/parisc/lib/lusercopy.S b/arch/parisc/lib/lusercopy.S
index 36d6a8638ead..b428d29e45fb 100644
--- a/arch/parisc/lib/lusercopy.S
+++ b/arch/parisc/lib/lusercopy.S
@@ -28,21 +28,6 @@
#include <linux/linkage.h>
/*
- * get_sr gets the appropriate space value into
- * sr1 for kernel/user space access, depending
- * on the flag stored in the task structure.
- */
-
- .macro get_sr
- mfctl %cr30,%r1
- ldw TI_SEGMENT(%r1),%r22
- mfsp %sr3,%r1
- or,<> %r22,%r0,%r0
- copy %r0,%r1
- mtsp %r1,%sr1
- .endm
-
- /*
* unsigned long lclear_user(void *to, unsigned long n)
*
* Returns 0 for success.
@@ -51,10 +36,9 @@
ENTRY_CFI(lclear_user)
comib,=,n 0,%r25,$lclu_done
- get_sr
$lclu_loop:
addib,<> -1,%r25,$lclu_loop
-1: stbs,ma %r0,1(%sr1,%r26)
+1: stbs,ma %r0,1(%sr3,%r26)
$lclu_done:
bv %r0(%r2)
@@ -67,40 +51,6 @@ $lclu_done:
ENDPROC_CFI(lclear_user)
- /*
- * long lstrnlen_user(char *s, long n)
- *
- * Returns 0 if exception before zero byte or reaching N,
- * N+1 if N would be exceeded,
- * else strlen + 1 (i.e. includes zero byte).
- */
-
-ENTRY_CFI(lstrnlen_user)
- comib,= 0,%r25,$lslen_nzero
- copy %r26,%r24
- get_sr
-1: ldbs,ma 1(%sr1,%r26),%r1
-$lslen_loop:
- comib,=,n 0,%r1,$lslen_done
- addib,<> -1,%r25,$lslen_loop
-2: ldbs,ma 1(%sr1,%r26),%r1
-$lslen_done:
- bv %r0(%r2)
- sub %r26,%r24,%r28
-
-$lslen_nzero:
- b $lslen_done
- ldo 1(%r26),%r26 /* special case for N == 0 */
-
-3: b $lslen_done
- copy %r24,%r26 /* reset r26 so 0 is returned on fault */
-
- ASM_EXCEPTIONTABLE_ENTRY(1b,3b)
- ASM_EXCEPTIONTABLE_ENTRY(2b,3b)
-
-ENDPROC_CFI(lstrnlen_user)
-
-
/*
* unsigned long pa_memcpy(void *dstp, const void *srcp, unsigned long len)
*
diff --git a/arch/parisc/lib/memcpy.c b/arch/parisc/lib/memcpy.c
index 4b75388190b4..ea70a0e08321 100644
--- a/arch/parisc/lib/memcpy.c
+++ b/arch/parisc/lib/memcpy.c
@@ -38,14 +38,6 @@ unsigned long raw_copy_from_user(void *dst, const void __user *src,
}
EXPORT_SYMBOL(raw_copy_from_user);
-unsigned long raw_copy_in_user(void __user *dst, const void __user *src, unsigned long len)
-{
- mtsp(get_user_space(), 1);
- mtsp(get_user_space(), 2);
- return pa_memcpy((void __force *)dst, (void __force *)src, len);
-}
-
-
void * memcpy(void * dst,const void *src, size_t count)
{
mtsp(get_kernel_space(), 1);
@@ -54,7 +46,6 @@ void * memcpy(void * dst,const void *src, size_t count)
return dst;
}
-EXPORT_SYMBOL(raw_copy_in_user);
EXPORT_SYMBOL(memcpy);
bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size)
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 8a93b509337d..ba5b66189358 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -94,10 +94,6 @@ config STACKTRACE_SUPPORT
bool
default y
-config TRACE_IRQFLAGS_SUPPORT
- bool
- default y
-
config LOCKDEP_SUPPORT
bool
default y
@@ -270,6 +266,7 @@ config PPC
select STRICT_KERNEL_RWX if STRICT_MODULE_RWX
select SYSCTL_EXCEPTION_TRACE
select THREAD_INFO_IN_TASK
+ select TRACE_IRQFLAGS_SUPPORT
select VIRT_TO_BUS if !PPC64
#
# Please keep this list sorted alphabetically.
diff --git a/arch/powerpc/include/asm/compat.h b/arch/powerpc/include/asm/compat.h
index e33dcf134cdd..7afc96fb6524 100644
--- a/arch/powerpc/include/asm/compat.h
+++ b/arch/powerpc/include/asm/compat.h
@@ -83,22 +83,6 @@ struct compat_statfs {
#define COMPAT_OFF_T_MAX 0x7fffffff
-static inline void __user *arch_compat_alloc_user_space(long len)
-{
- struct pt_regs *regs = current->thread.regs;
- unsigned long usp = regs->gpr[1];
-
- /*
- * We can't access below the stack pointer in the 32bit ABI and
- * can access 288 bytes in the 64bit big-endian ABI,
- * or 512 bytes with the new ELFv2 little-endian ABI.
- */
- if (!is_32bit_task())
- usp -= USER_REDZONE_SIZE;
-
- return (void __user *) (usp - len);
-}
-
/*
* ipc64_perm is actually 32/64bit clean but since the compat layer refers to
* it we may as well define it.
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index a779f7849cfb..080a7feb7731 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -103,7 +103,6 @@ struct kvm_vcpu_stat {
u64 emulated_inst_exits;
u64 dec_exits;
u64 ext_intr_exits;
- u64 halt_wait_ns;
u64 halt_successful_wait;
u64 dbell_exits;
u64 gdbell_exits;
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index 29b55e2e035c..7bef917cc84e 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -330,10 +330,10 @@
256 64 sys_debug_setcontext sys_ni_syscall
256 spu sys_debug_setcontext sys_ni_syscall
# 257 reserved for vserver
-258 nospu migrate_pages sys_migrate_pages compat_sys_migrate_pages
-259 nospu mbind sys_mbind compat_sys_mbind
-260 nospu get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy
-261 nospu set_mempolicy sys_set_mempolicy compat_sys_set_mempolicy
+258 nospu migrate_pages sys_migrate_pages
+259 nospu mbind sys_mbind
+260 nospu get_mempolicy sys_get_mempolicy
+261 nospu set_mempolicy sys_set_mempolicy
262 nospu mq_open sys_mq_open compat_sys_mq_open
263 nospu mq_unlink sys_mq_unlink
264 32 mq_timedsend sys_mq_timedsend_time32
@@ -381,7 +381,7 @@
298 common faccessat sys_faccessat
299 common get_robust_list sys_get_robust_list compat_sys_get_robust_list
300 common set_robust_list sys_set_robust_list compat_sys_set_robust_list
-301 common move_pages sys_move_pages compat_sys_move_pages
+301 common move_pages sys_move_pages
302 common getcpu sys_getcpu
303 nospu epoll_pwait sys_epoll_pwait compat_sys_epoll_pwait
304 32 utimensat sys_utimensat_time32
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 4390f8d72126..aac8c0412ff9 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -2219,11 +2219,6 @@ DEFINE_INTERRUPT_HANDLER(kernel_bad_stack)
die("Bad kernel stack pointer", regs, SIGABRT);
}
-void __init trap_init(void)
-{
-}
-
-
#ifdef CONFIG_PPC_EMULATED_STATS
#define WARN_EMULATED_SETUP(type) .type = { .name = #type }
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 79833f78d1da..b785f6772391 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -43,8 +43,6 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
STATS_DESC_ICOUNTER(VM, num_2M_pages),
STATS_DESC_ICOUNTER(VM, num_1G_pages)
};
-static_assert(ARRAY_SIZE(kvm_vm_stats_desc) ==
- sizeof(struct kvm_vm_stat) / sizeof(u64));
const struct kvm_stats_header kvm_vm_stats_header = {
.name_size = KVM_STATS_NAME_SIZE,
@@ -71,7 +69,6 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
STATS_DESC_COUNTER(VCPU, emulated_inst_exits),
STATS_DESC_COUNTER(VCPU, dec_exits),
STATS_DESC_COUNTER(VCPU, ext_intr_exits),
- STATS_DESC_TIME_NSEC(VCPU, halt_wait_ns),
STATS_DESC_COUNTER(VCPU, halt_successful_wait),
STATS_DESC_COUNTER(VCPU, dbell_exits),
STATS_DESC_COUNTER(VCPU, gdbell_exits),
@@ -88,8 +85,6 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
STATS_DESC_COUNTER(VCPU, pthru_host),
STATS_DESC_COUNTER(VCPU, pthru_bad_aff)
};
-static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) ==
- sizeof(struct kvm_vcpu_stat) / sizeof(u64));
const struct kvm_stats_header kvm_vcpu_stats_header = {
.name_size = KVM_STATS_NAME_SIZE,
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index 8da93fdfa59e..6365087f3160 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -346,7 +346,7 @@ static long kvmppc_tce_to_ua(struct kvm *kvm, unsigned long tce,
unsigned long gfn = tce >> PAGE_SHIFT;
struct kvm_memory_slot *memslot;
- memslot = search_memslots(kvm_memslots(kvm), gfn);
+ memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn);
if (!memslot)
return -EINVAL;
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
index 636c6ae0939b..870b7f0c7ea5 100644
--- a/arch/powerpc/kvm/book3s_64_vio_hv.c
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -80,7 +80,7 @@ static long kvmppc_rm_tce_to_ua(struct kvm *kvm,
unsigned long gfn = tce >> PAGE_SHIFT;
struct kvm_memory_slot *memslot;
- memslot = search_memslots(kvm_memslots_raw(kvm), gfn);
+ memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
if (!memslot)
return -EINVAL;
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index bb0dacf7cbec..2acb1c96cfaf 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -4202,19 +4202,31 @@ out:
/* Attribute wait time */
if (do_sleep) {
- vc->runner->stat.halt_wait_ns +=
+ vc->runner->stat.generic.halt_wait_ns +=
ktime_to_ns(cur) - ktime_to_ns(start_wait);
+ KVM_STATS_LOG_HIST_UPDATE(
+ vc->runner->stat.generic.halt_wait_hist,
+ ktime_to_ns(cur) - ktime_to_ns(start_wait));
/* Attribute failed poll time */
- if (vc->halt_poll_ns)
+ if (vc->halt_poll_ns) {
vc->runner->stat.generic.halt_poll_fail_ns +=
ktime_to_ns(start_wait) -
ktime_to_ns(start_poll);
+ KVM_STATS_LOG_HIST_UPDATE(
+ vc->runner->stat.generic.halt_poll_fail_hist,
+ ktime_to_ns(start_wait) -
+ ktime_to_ns(start_poll));
+ }
} else {
/* Attribute successful poll time */
- if (vc->halt_poll_ns)
+ if (vc->halt_poll_ns) {
vc->runner->stat.generic.halt_poll_success_ns +=
ktime_to_ns(cur) -
ktime_to_ns(start_poll);
+ KVM_STATS_LOG_HIST_UPDATE(
+ vc->runner->stat.generic.halt_poll_success_hist,
+ ktime_to_ns(cur) - ktime_to_ns(start_poll));
+ }
}
/* Adjust poll time */
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 551b30d84aee..977801c83aff 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -41,8 +41,6 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
STATS_DESC_ICOUNTER(VM, num_2M_pages),
STATS_DESC_ICOUNTER(VM, num_1G_pages)
};
-static_assert(ARRAY_SIZE(kvm_vm_stats_desc) ==
- sizeof(struct kvm_vm_stat) / sizeof(u64));
const struct kvm_stats_header kvm_vm_stats_header = {
.name_size = KVM_STATS_NAME_SIZE,
@@ -69,7 +67,6 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
STATS_DESC_COUNTER(VCPU, emulated_inst_exits),
STATS_DESC_COUNTER(VCPU, dec_exits),
STATS_DESC_COUNTER(VCPU, ext_intr_exits),
- STATS_DESC_TIME_NSEC(VCPU, halt_wait_ns),
STATS_DESC_COUNTER(VCPU, halt_successful_wait),
STATS_DESC_COUNTER(VCPU, dbell_exits),
STATS_DESC_COUNTER(VCPU, gdbell_exits),
@@ -79,8 +76,6 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
STATS_DESC_COUNTER(VCPU, pthru_host),
STATS_DESC_COUNTER(VCPU, pthru_bad_aff)
};
-static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) ==
- sizeof(struct kvm_vcpu_stat) / sizeof(u64));
const struct kvm_stats_header kvm_vcpu_stats_header = {
.name_size = KVM_STATS_NAME_SIZE,
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index ad198b439222..c3c4e31462ec 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -119,8 +119,7 @@ int __ref arch_add_memory(int nid, u64 start, u64 size,
return rc;
}
-void __ref arch_remove_memory(int nid, u64 start, u64 size,
- struct vmem_altmap *altmap)
+void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 44246ba59768..91cf23495ccb 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -286,7 +286,7 @@ static int pseries_remove_memblock(unsigned long base, unsigned long memblock_si
{
unsigned long block_sz, start_pfn;
int sections_per_block;
- int i, nid;
+ int i;
start_pfn = base >> PAGE_SHIFT;
@@ -297,10 +297,9 @@ static int pseries_remove_memblock(unsigned long base, unsigned long memblock_si
block_sz = pseries_memory_block_size();
sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;
- nid = memory_add_physaddr_to_nid(base);
for (i = 0; i < sections_per_block; i++) {
- __remove_memory(nid, base, MIN_MEMORY_BLOCK_SIZE);
+ __remove_memory(base, MIN_MEMORY_BLOCK_SIZE);
base += MIN_MEMORY_BLOCK_SIZE;
}
@@ -387,7 +386,7 @@ static int dlpar_remove_lmb(struct drmem_lmb *lmb)
block_sz = pseries_memory_block_size();
- __remove_memory(mem_block->nid, lmb->base_addr, block_sz);
+ __remove_memory(lmb->base_addr, block_sz);
put_device(&mem_block->dev);
/* Update memory regions for memory remove */
@@ -660,7 +659,7 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb)
rc = dlpar_online_lmb(lmb);
if (rc) {
- __remove_memory(nid, lmb->base_addr, block_sz);
+ __remove_memory(lmb->base_addr, block_sz);
invalidate_lmb_associativity_index(lmb);
} else {
lmb->flags |= DRCONF_MEM_ASSIGNED;
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index c1abbc876e5b..c3f3fd583e04 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -52,7 +52,7 @@ config RISCV
select GENERIC_EARLY_IOREMAP
select GENERIC_GETTIMEOFDAY if HAVE_GENERIC_VDSO
select GENERIC_IDLE_POLL_SETUP
- select GENERIC_IOREMAP
+ select GENERIC_IOREMAP if MMU
select GENERIC_IRQ_MULTI_HANDLER
select GENERIC_IRQ_SHOW
select GENERIC_IRQ_SHOW_LEVEL
@@ -114,6 +114,7 @@ config RISCV
select SPARSE_IRQ
select SYSCTL_EXCEPTION_TRACE
select THREAD_INFO_IN_TASK
+ select TRACE_IRQFLAGS_SUPPORT
select UACCESS_MEMCPY if !MMU
select ZONE_DMA32 if 64BIT
@@ -183,9 +184,6 @@ config ARCH_SUPPORTS_UPROBES
config STACKTRACE_SUPPORT
def_bool y
-config TRACE_IRQFLAGS_SUPPORT
- def_bool y
-
config GENERIC_BUG
def_bool y
depends on BUG
diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c
index 0a98fd0ddfe9..0daaa3e4630d 100644
--- a/arch/riscv/kernel/traps.c
+++ b/arch/riscv/kernel/traps.c
@@ -199,11 +199,6 @@ int is_valid_bugaddr(unsigned long pc)
}
#endif /* CONFIG_GENERIC_BUG */
-/* stvec & scratch is already set from head.S */
-void __init trap_init(void)
-{
-}
-
#ifdef CONFIG_VMAP_STACK
static DEFINE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)],
overflow_stack)__aligned(16);
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 92c0a1b4c528..2bd90c51efd3 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -110,6 +110,7 @@ config S390
select ARCH_STACKWALK
select ARCH_SUPPORTS_ATOMIC_RMW
select ARCH_SUPPORTS_DEBUG_PAGEALLOC
+ select ARCH_SUPPORTS_HUGETLBFS
select ARCH_SUPPORTS_NUMA_BALANCING
select ARCH_USE_BUILTIN_BSWAP
select ARCH_USE_CMPXCHG_LOCKREF
@@ -209,6 +210,7 @@ config S390
select SWIOTLB
select SYSCTL_EXCEPTION_TRACE
select THREAD_INFO_IN_TASK
+ select TRACE_IRQFLAGS_SUPPORT
select TTY
select VIRT_CPU_ACCOUNTING
select ZONE_DMA
diff --git a/arch/s390/Kconfig.debug b/arch/s390/Kconfig.debug
index 9ea6e61d5858..e94a2a7f6bf4 100644
--- a/arch/s390/Kconfig.debug
+++ b/arch/s390/Kconfig.debug
@@ -1,8 +1,5 @@
# SPDX-License-Identifier: GPL-2.0
-config TRACE_IRQFLAGS_SUPPORT
- def_bool y
-
config EARLY_PRINTK
def_bool y
diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig
index 11ffc7c37ada..37b6115ed80e 100644
--- a/arch/s390/configs/debug_defconfig
+++ b/arch/s390/configs/debug_defconfig
@@ -804,6 +804,7 @@ CONFIG_DEBUG_VM_PGFLAGS=y
CONFIG_DEBUG_MEMORY_INIT=y
CONFIG_MEMORY_NOTIFIER_ERROR_INJECT=m
CONFIG_DEBUG_PER_CPU_MAPS=y
+CONFIG_KFENCE=y
CONFIG_DEBUG_SHIRQ=y
CONFIG_PANIC_ON_OOPS=y
CONFIG_DETECT_HUNG_TASK=y
diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig
index e1642d2cba59..56a1cc85c5d7 100644
--- a/arch/s390/configs/defconfig
+++ b/arch/s390/configs/defconfig
@@ -397,7 +397,6 @@ CONFIG_BLK_DEV_DRBD=m
CONFIG_BLK_DEV_NBD=m
CONFIG_BLK_DEV_RAM=y
CONFIG_BLK_DEV_RAM_SIZE=32768
-# CONFIG_BLK_DEV_XPRAM is not set
CONFIG_VIRTIO_BLK=y
CONFIG_BLK_DEV_RBD=m
CONFIG_BLK_DEV_NVME=m
diff --git a/arch/s390/configs/zfcpdump_defconfig b/arch/s390/configs/zfcpdump_defconfig
index d576aaab27c9..aceccf3b9a88 100644
--- a/arch/s390/configs/zfcpdump_defconfig
+++ b/arch/s390/configs/zfcpdump_defconfig
@@ -35,7 +35,6 @@ CONFIG_NET=y
# CONFIG_ETHTOOL_NETLINK is not set
CONFIG_DEVTMPFS=y
CONFIG_BLK_DEV_RAM=y
-# CONFIG_BLK_DEV_XPRAM is not set
# CONFIG_DCSSBLK is not set
# CONFIG_DASD is not set
CONFIG_ENCLOSURE_SERVICES=y
diff --git a/arch/s390/include/asm/compat.h b/arch/s390/include/asm/compat.h
index 8d49505b4a43..cdc7ae72529d 100644
--- a/arch/s390/include/asm/compat.h
+++ b/arch/s390/include/asm/compat.h
@@ -176,16 +176,6 @@ static inline int is_compat_task(void)
return test_thread_flag(TIF_31BIT);
}
-static inline void __user *arch_compat_alloc_user_space(long len)
-{
- unsigned long stack;
-
- stack = KSTK_ESP(current);
- if (is_compat_task())
- stack &= 0x7fffffffUL;
- return (void __user *) (stack - len);
-}
-
#endif
struct compat_ipc64_perm {
diff --git a/arch/s390/include/asm/cpu_mcf.h b/arch/s390/include/asm/cpu_mcf.h
index ca0e0e5ddbc4..f87a4788c19c 100644
--- a/arch/s390/include/asm/cpu_mcf.h
+++ b/arch/s390/include/asm/cpu_mcf.h
@@ -24,13 +24,6 @@ enum cpumf_ctr_set {
#define CPUMF_LCCTL_ENABLE_SHIFT 16
#define CPUMF_LCCTL_ACTCTL_SHIFT 0
-static const u64 cpumf_ctr_ctl[CPUMF_CTR_SET_MAX] = {
- [CPUMF_CTR_SET_BASIC] = 0x02,
- [CPUMF_CTR_SET_USER] = 0x04,
- [CPUMF_CTR_SET_CRYPTO] = 0x08,
- [CPUMF_CTR_SET_EXT] = 0x01,
- [CPUMF_CTR_SET_MT_DIAG] = 0x20,
-};
static inline void ctr_set_enable(u64 *state, u64 ctrsets)
{
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index d681ae462350..a604d51acfc8 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -244,6 +244,7 @@ struct kvm_s390_sie_block {
__u8 fpf; /* 0x0060 */
#define ECB_GS 0x40
#define ECB_TE 0x10
+#define ECB_SPECI 0x08
#define ECB_SRSI 0x04
#define ECB_HOSTPROTINT 0x02
__u8 ecb; /* 0x0061 */
@@ -955,6 +956,7 @@ struct kvm_arch{
atomic64_t cmma_dirty_pages;
/* subset of available cpu features enabled by user space */
DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS);
+ /* indexed by vcpu_idx */
DECLARE_BITMAP(idle_mask, KVM_MAX_VCPUS);
struct kvm_s390_gisa_interrupt gisa_int;
struct kvm_s390_pv pv;
diff --git a/arch/s390/include/asm/smp.h b/arch/s390/include/asm/smp.h
index e317fd4866c1..f16f4d054ae2 100644
--- a/arch/s390/include/asm/smp.h
+++ b/arch/s390/include/asm/smp.h
@@ -18,6 +18,7 @@ extern struct mutex smp_cpu_state_mutex;
extern unsigned int smp_cpu_mt_shift;
extern unsigned int smp_cpu_mtid;
extern __vector128 __initdata boot_cpu_vector_save_area[__NUM_VXRS];
+extern cpumask_t cpu_setup_mask;
extern int __cpu_up(unsigned int cpu, struct task_struct *tidle);
diff --git a/arch/s390/include/asm/stacktrace.h b/arch/s390/include/asm/stacktrace.h
index 3d8a4b94c620..dd00d98804ec 100644
--- a/arch/s390/include/asm/stacktrace.h
+++ b/arch/s390/include/asm/stacktrace.h
@@ -34,16 +34,6 @@ static inline bool on_stack(struct stack_info *info,
return addr >= info->begin && addr + len <= info->end;
}
-static __always_inline unsigned long get_stack_pointer(struct task_struct *task,
- struct pt_regs *regs)
-{
- if (regs)
- return (unsigned long) kernel_stack_pointer(regs);
- if (task == current)
- return current_stack_pointer();
- return (unsigned long) task->thread.ksp;
-}
-
/*
* Stack layout of a C stack frame.
*/
@@ -74,6 +64,16 @@ struct stack_frame {
((unsigned long)__builtin_frame_address(0) - \
offsetof(struct stack_frame, back_chain))
+static __always_inline unsigned long get_stack_pointer(struct task_struct *task,
+ struct pt_regs *regs)
+{
+ if (regs)
+ return (unsigned long)kernel_stack_pointer(regs);
+ if (task == current)
+ return current_frame_address();
+ return (unsigned long)task->thread.ksp;
+}
+
/*
* To keep this simple mark register 2-6 as being changed (volatile)
* by the called function, even though register 6 is saved/nonvolatile.
diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h
index 9ed9aa37e836..ce550d06abc3 100644
--- a/arch/s390/include/asm/uaccess.h
+++ b/arch/s390/include/asm/uaccess.h
@@ -227,9 +227,6 @@ static inline int __get_user_fn(void *x, const void __user *ptr, unsigned long s
__get_user(x, ptr); \
})
-unsigned long __must_check
-raw_copy_in_user(void __user *to, const void __user *from, unsigned long n);
-
/*
* Copy a null terminated string from userspace.
*/
diff --git a/arch/s390/include/asm/unwind.h b/arch/s390/include/asm/unwind.h
index de9006b0cfeb..5ebf534ef753 100644
--- a/arch/s390/include/asm/unwind.h
+++ b/arch/s390/include/asm/unwind.h
@@ -55,10 +55,10 @@ static inline bool unwind_error(struct unwind_state *state)
return state->error;
}
-static inline void unwind_start(struct unwind_state *state,
- struct task_struct *task,
- struct pt_regs *regs,
- unsigned long first_frame)
+static __always_inline void unwind_start(struct unwind_state *state,
+ struct task_struct *task,
+ struct pt_regs *regs,
+ unsigned long first_frame)
{
task = task ?: current;
first_frame = first_frame ?: get_stack_pointer(task, regs);
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index b9716a7e326d..4c9b967290ae 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -140,10 +140,10 @@ _LPP_OFFSET = __LC_LPP
TSTMSK __LC_MCCK_CODE,(MCCK_CODE_STG_ERROR|MCCK_CODE_STG_KEY_ERROR)
jnz \errlabel
TSTMSK __LC_MCCK_CODE,MCCK_CODE_STG_DEGRAD
- jz oklabel\@
+ jz .Loklabel\@
TSTMSK __LC_MCCK_CODE,MCCK_CODE_STG_FAIL_ADDR
jnz \errlabel
-oklabel\@:
+.Loklabel\@:
.endm
#if IS_ENABLED(CONFIG_KVM)
diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c
index 0a464d328467..1d94ffdf347b 100644
--- a/arch/s390/kernel/ftrace.c
+++ b/arch/s390/kernel/ftrace.c
@@ -341,13 +341,13 @@ NOKPROBE_SYMBOL(prepare_ftrace_return);
*/
int ftrace_enable_ftrace_graph_caller(void)
{
- brcl_disable(__va(ftrace_graph_caller));
+ brcl_disable(ftrace_graph_caller);
return 0;
}
int ftrace_disable_ftrace_graph_caller(void)
{
- brcl_enable(__va(ftrace_graph_caller));
+ brcl_enable(ftrace_graph_caller);
return 0;
}
diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c
index 2e3bb633acf6..4a99154fe651 100644
--- a/arch/s390/kernel/perf_cpum_cf.c
+++ b/arch/s390/kernel/perf_cpum_cf.c
@@ -158,6 +158,14 @@ static size_t cfdiag_getctrset(struct cf_ctrset_entry *ctrdata, int ctrset,
return need;
}
+static const u64 cpumf_ctr_ctl[CPUMF_CTR_SET_MAX] = {
+ [CPUMF_CTR_SET_BASIC] = 0x02,
+ [CPUMF_CTR_SET_USER] = 0x04,
+ [CPUMF_CTR_SET_CRYPTO] = 0x08,
+ [CPUMF_CTR_SET_EXT] = 0x01,
+ [CPUMF_CTR_SET_MT_DIAG] = 0x20,
+};
+
/* Read out all counter sets and save them in the provided data buffer.
* The last 64 byte host an artificial trailer entry.
*/
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index 5a01872f5984..67e5fff96ee0 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -50,6 +50,7 @@
#include <linux/compat.h>
#include <linux/start_kernel.h>
#include <linux/hugetlb.h>
+#include <linux/kmemleak.h>
#include <asm/boot_data.h>
#include <asm/ipl.h>
@@ -356,9 +357,12 @@ void *restart_stack;
unsigned long stack_alloc(void)
{
#ifdef CONFIG_VMAP_STACK
- return (unsigned long)__vmalloc_node(THREAD_SIZE, THREAD_SIZE,
- THREADINFO_GFP, NUMA_NO_NODE,
- __builtin_return_address(0));
+ void *ret;
+
+ ret = __vmalloc_node(THREAD_SIZE, THREAD_SIZE, THREADINFO_GFP,
+ NUMA_NO_NODE, __builtin_return_address(0));
+ kmemleak_not_leak(ret);
+ return (unsigned long)ret;
#else
return __get_free_pages(GFP_KERNEL, THREAD_SIZE_ORDER);
#endif
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index 2a991e43ead3..1a04e5bdf655 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -95,6 +95,7 @@ __vector128 __initdata boot_cpu_vector_save_area[__NUM_VXRS];
#endif
static unsigned int smp_max_threads __initdata = -1U;
+cpumask_t cpu_setup_mask;
static int __init early_nosmt(char *s)
{
@@ -902,13 +903,14 @@ static void smp_start_secondary(void *cpuvoid)
vtime_init();
vdso_getcpu_init();
pfault_init();
+ cpumask_set_cpu(cpu, &cpu_setup_mask);
+ update_cpu_masks();
notify_cpu_starting(cpu);
if (topology_cpu_dedicated(cpu))
set_cpu_flag(CIF_DEDICATED_CPU);
else
clear_cpu_flag(CIF_DEDICATED_CPU);
set_cpu_online(cpu, true);
- update_cpu_masks();
inc_irq_stat(CPU_RST);
local_irq_enable();
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
@@ -950,10 +952,13 @@ early_param("possible_cpus", _setup_possible_cpus);
int __cpu_disable(void)
{
unsigned long cregs[16];
+ int cpu;
/* Handle possible pending IPIs */
smp_handle_ext_call();
- set_cpu_online(smp_processor_id(), false);
+ cpu = smp_processor_id();
+ set_cpu_online(cpu, false);
+ cpumask_clear_cpu(cpu, &cpu_setup_mask);
update_cpu_masks();
/* Disable pseudo page faults on this cpu. */
pfault_fini();
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index aa9d68b8ee14..df5261e5cfe1 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -274,9 +274,9 @@
265 common statfs64 sys_statfs64 compat_sys_statfs64
266 common fstatfs64 sys_fstatfs64 compat_sys_fstatfs64
267 common remap_file_pages sys_remap_file_pages sys_remap_file_pages
-268 common mbind sys_mbind compat_sys_mbind
-269 common get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy
-270 common set_mempolicy sys_set_mempolicy compat_sys_set_mempolicy
+268 common mbind sys_mbind sys_mbind
+269 common get_mempolicy sys_get_mempolicy sys_get_mempolicy
+270 common set_mempolicy sys_set_mempolicy sys_set_mempolicy
271 common mq_open sys_mq_open compat_sys_mq_open
272 common mq_unlink sys_mq_unlink sys_mq_unlink
273 common mq_timedsend sys_mq_timedsend sys_mq_timedsend_time32
@@ -293,7 +293,7 @@
284 common inotify_init sys_inotify_init sys_inotify_init
285 common inotify_add_watch sys_inotify_add_watch sys_inotify_add_watch
286 common inotify_rm_watch sys_inotify_rm_watch sys_inotify_rm_watch
-287 common migrate_pages sys_migrate_pages compat_sys_migrate_pages
+287 common migrate_pages sys_migrate_pages sys_migrate_pages
288 common openat sys_openat compat_sys_openat
289 common mkdirat sys_mkdirat sys_mkdirat
290 common mknodat sys_mknodat sys_mknodat
@@ -317,7 +317,7 @@
307 common sync_file_range sys_sync_file_range compat_sys_s390_sync_file_range
308 common tee sys_tee sys_tee
309 common vmsplice sys_vmsplice sys_vmsplice
-310 common move_pages sys_move_pages compat_sys_move_pages
+310 common move_pages sys_move_pages sys_move_pages
311 common getcpu sys_getcpu sys_getcpu
312 common epoll_pwait sys_epoll_pwait compat_sys_epoll_pwait
313 common utimes sys_utimes sys_utimes_time32
diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c
index d2458a29618f..58f8291950cb 100644
--- a/arch/s390/kernel/topology.c
+++ b/arch/s390/kernel/topology.c
@@ -67,7 +67,7 @@ static void cpu_group_map(cpumask_t *dst, struct mask_info *info, unsigned int c
static cpumask_t mask;
cpumask_clear(&mask);
- if (!cpu_online(cpu))
+ if (!cpumask_test_cpu(cpu, &cpu_setup_mask))
goto out;
cpumask_set_cpu(cpu, &mask);
switch (topology_mode) {
@@ -88,7 +88,7 @@ static void cpu_group_map(cpumask_t *dst, struct mask_info *info, unsigned int c
case TOPOLOGY_MODE_SINGLE:
break;
}
- cpumask_and(&mask, &mask, cpu_online_mask);
+ cpumask_and(&mask, &mask, &cpu_setup_mask);
out:
cpumask_copy(dst, &mask);
}
@@ -99,16 +99,16 @@ static void cpu_thread_map(cpumask_t *dst, unsigned int cpu)
int i;
cpumask_clear(&mask);
- if (!cpu_online(cpu))
+ if (!cpumask_test_cpu(cpu, &cpu_setup_mask))
goto out;
cpumask_set_cpu(cpu, &mask);
if (topology_mode != TOPOLOGY_MODE_HW)
goto out;
cpu -= cpu % (smp_cpu_mtid + 1);
- for (i = 0; i <= smp_cpu_mtid; i++)
- if (cpu_present(cpu + i))
+ for (i = 0; i <= smp_cpu_mtid; i++) {
+ if (cpumask_test_cpu(cpu + i, &cpu_setup_mask))
cpumask_set_cpu(cpu + i, &mask);
- cpumask_and(&mask, &mask, cpu_online_mask);
+ }
out:
cpumask_copy(dst, &mask);
}
@@ -569,6 +569,7 @@ void __init topology_init_early(void)
alloc_masks(info, &book_info, 2);
alloc_masks(info, &drawer_info, 3);
out:
+ cpumask_set_cpu(0, &cpu_setup_mask);
__arch_update_cpu_topology();
__arch_update_dedicated_flag(NULL);
}
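
The topology code above now derives group and thread masks from cpu_setup_mask instead of cpu_online_mask, so a CPU that has finished early setup but is not yet marked online still lands in the right groups. A small sketch of that filtering step with plain 64-bit masks; the values are illustrative and the real code uses the kernel cpumask API:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Bit n set => CPU n is in that state (made-up values). */
	uint64_t cpu_setup_mask  = 0x0f;	/* CPUs 0-3 finished early setup */
	uint64_t cpu_online_mask = 0x07;	/* CPU 3 not yet marked online */
	uint64_t core_mask       = 0x0c;	/* CPUs 2 and 3 share a core */

	/* Filtering against the online mask drops the not-yet-online CPU 3. */
	printf("core & online: 0x%llx\n",
	       (unsigned long long)(core_mask & cpu_online_mask));

	/* Filtering against the setup mask, as the patched code does, keeps it. */
	printf("core & setup:  0x%llx\n",
	       (unsigned long long)(core_mask & cpu_setup_mask));
	return 0;
}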
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index d548d60caed2..16256e17a544 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -419,13 +419,13 @@ static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu)
static void __set_cpu_idle(struct kvm_vcpu *vcpu)
{
kvm_s390_set_cpuflags(vcpu, CPUSTAT_WAIT);
- set_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask);
+ set_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask);
}
static void __unset_cpu_idle(struct kvm_vcpu *vcpu)
{
kvm_s390_clear_cpuflags(vcpu, CPUSTAT_WAIT);
- clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask);
+ clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask);
}
static void __reset_intercept_indicators(struct kvm_vcpu *vcpu)
@@ -3050,18 +3050,18 @@ int kvm_s390_get_irq_state(struct kvm_vcpu *vcpu, __u8 __user *buf, int len)
static void __airqs_kick_single_vcpu(struct kvm *kvm, u8 deliverable_mask)
{
- int vcpu_id, online_vcpus = atomic_read(&kvm->online_vcpus);
+ int vcpu_idx, online_vcpus = atomic_read(&kvm->online_vcpus);
struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int;
struct kvm_vcpu *vcpu;
- for_each_set_bit(vcpu_id, kvm->arch.idle_mask, online_vcpus) {
- vcpu = kvm_get_vcpu(kvm, vcpu_id);
+ for_each_set_bit(vcpu_idx, kvm->arch.idle_mask, online_vcpus) {
+ vcpu = kvm_get_vcpu(kvm, vcpu_idx);
if (psw_ioint_disabled(vcpu))
continue;
deliverable_mask &= (u8)(vcpu->arch.sie_block->gcr[6] >> 24);
if (deliverable_mask) {
/* lately kicked but not yet running */
- if (test_and_set_bit(vcpu_id, gi->kicked_mask))
+ if (test_and_set_bit(vcpu_idx, gi->kicked_mask))
return;
kvm_s390_vcpu_wakeup(vcpu);
return;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index efda0615741f..752a0ffab9bf 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -66,8 +66,6 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
STATS_DESC_COUNTER(VM, inject_service_signal),
STATS_DESC_COUNTER(VM, inject_virtio)
};
-static_assert(ARRAY_SIZE(kvm_vm_stats_desc) ==
- sizeof(struct kvm_vm_stat) / sizeof(u64));
const struct kvm_stats_header kvm_vm_stats_header = {
.name_size = KVM_STATS_NAME_SIZE,
@@ -174,8 +172,6 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
STATS_DESC_COUNTER(VCPU, instruction_diagnose_other),
STATS_DESC_COUNTER(VCPU, pfault_sync)
};
-static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) ==
- sizeof(struct kvm_vcpu_stat) / sizeof(u64));
const struct kvm_stats_header kvm_vcpu_stats_header = {
.name_size = KVM_STATS_NAME_SIZE,
@@ -1953,7 +1949,7 @@ out:
static int gfn_to_memslot_approx(struct kvm_memslots *slots, gfn_t gfn)
{
int start = 0, end = slots->used_slots;
- int slot = atomic_read(&slots->lru_slot);
+ int slot = atomic_read(&slots->last_used_slot);
struct kvm_memory_slot *memslots = slots->memslots;
if (gfn >= memslots[slot].base_gfn &&
@@ -1974,7 +1970,7 @@ static int gfn_to_memslot_approx(struct kvm_memslots *slots, gfn_t gfn)
if (gfn >= memslots[start].base_gfn &&
gfn < memslots[start].base_gfn + memslots[start].npages) {
- atomic_set(&slots->lru_slot, start);
+ atomic_set(&slots->last_used_slot, start);
}
return start;
@@ -3224,6 +3220,8 @@ static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu)
vcpu->arch.sie_block->ecb |= ECB_SRSI;
if (test_kvm_facility(vcpu->kvm, 73))
vcpu->arch.sie_block->ecb |= ECB_TE;
+ if (!kvm_is_ucontrol(vcpu->kvm))
+ vcpu->arch.sie_block->ecb |= ECB_SPECI;
if (test_kvm_facility(vcpu->kvm, 8) && vcpu->kvm->arch.use_pfmfi)
vcpu->arch.sie_block->ecb2 |= ECB2_PFMFI;
@@ -4068,7 +4066,7 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu)
kvm_s390_patch_guest_per_regs(vcpu);
}
- clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.gisa_int.kicked_mask);
+ clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.gisa_int.kicked_mask);
vcpu->arch.sie_block->icptcode = 0;
cpuflags = atomic_read(&vcpu->arch.sie_block->cpuflags);
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 9fad25109b0d..ecd741ee3276 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -79,7 +79,7 @@ static inline int is_vcpu_stopped(struct kvm_vcpu *vcpu)
static inline int is_vcpu_idle(struct kvm_vcpu *vcpu)
{
- return test_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask);
+ return test_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask);
}
static inline int kvm_is_ucontrol(struct kvm *kvm)
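
idle_mask and kicked_mask are walked with for_each_set_bit() bounded by online_vcpus, so they must be indexed by the dense vCPU index rather than the user-assigned (possibly sparse) vcpu_id, which is what the interrupt.c and kvm-s390.h hunks switch to. A standalone sketch of why a sparse id breaks such a bounded bitmap walk; plain shifts stand in for the kernel bitmap helpers:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t idle_mask = 0;
	int online_vcpus = 2;
	int bit;

	/* Two vCPUs: dense indexes 0 and 1, but a user-chosen id of 37. */
	int vcpu_idx = 1, vcpu_id = 37;

	/* Indexing by id sets a bit beyond online_vcpus ... */
	idle_mask |= (uint64_t)1 << vcpu_id;
	/* ... so a walk bounded by online_vcpus never finds it. */
	for (bit = 0; bit < online_vcpus; bit++)
		if (idle_mask & ((uint64_t)1 << bit))
			printf("by id:  idle vcpu at bit %d\n", bit);

	/* Indexing by the dense index keeps the bit inside the walked range. */
	idle_mask = (uint64_t)1 << vcpu_idx;
	for (bit = 0; bit < online_vcpus; bit++)
		if (idle_mask & ((uint64_t)1 << bit))
			printf("by idx: idle vcpu at bit %d\n", bit);
	return 0;
}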
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index 4002a24bc43a..acda4b6fc851 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -510,6 +510,8 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
prefix_unmapped(vsie_page);
scb_s->ecb |= ECB_TE;
}
+ /* specification exception interpretation */
+ scb_s->ecb |= scb_o->ecb & ECB_SPECI;
/* branch prediction */
if (test_kvm_facility(vcpu->kvm, 82))
scb_s->fpf |= scb_o->fpf & FPF_BPBC;
diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c
index 94ca99bde59d..a596e69d3c47 100644
--- a/arch/s390/lib/uaccess.c
+++ b/arch/s390/lib/uaccess.c
@@ -204,69 +204,6 @@ unsigned long raw_copy_to_user(void __user *to, const void *from, unsigned long
}
EXPORT_SYMBOL(raw_copy_to_user);
-static inline unsigned long copy_in_user_mvcos(void __user *to, const void __user *from,
- unsigned long size)
-{
- unsigned long tmp1, tmp2;
-
- tmp1 = -4096UL;
- /* FIXME: copy with reduced length. */
- asm volatile(
- " lgr 0,%[spec]\n"
- "0: .insn ss,0xc80000000000,0(%0,%1),0(%2),0\n"
- " jz 2f\n"
- "1: algr %0,%3\n"
- " slgr %1,%3\n"
- " slgr %2,%3\n"
- " j 0b\n"
- "2:slgr %0,%0\n"
- "3: \n"
- EX_TABLE(0b,3b)
- : "+a" (size), "+a" (to), "+a" (from), "+a" (tmp1), "=a" (tmp2)
- : [spec] "d" (0x810081UL)
- : "cc", "memory", "0");
- return size;
-}
-
-static inline unsigned long copy_in_user_mvc(void __user *to, const void __user *from,
- unsigned long size)
-{
- unsigned long tmp1;
-
- asm volatile(
- " sacf 256\n"
- " aghi %0,-1\n"
- " jo 5f\n"
- " bras %3,3f\n"
- "0: aghi %0,257\n"
- "1: mvc 0(1,%1),0(%2)\n"
- " la %1,1(%1)\n"
- " la %2,1(%2)\n"
- " aghi %0,-1\n"
- " jnz 1b\n"
- " j 5f\n"
- "2: mvc 0(256,%1),0(%2)\n"
- " la %1,256(%1)\n"
- " la %2,256(%2)\n"
- "3: aghi %0,-256\n"
- " jnm 2b\n"
- "4: ex %0,1b-0b(%3)\n"
- "5: slgr %0,%0\n"
- "6: sacf 768\n"
- EX_TABLE(1b,6b) EX_TABLE(2b,0b) EX_TABLE(4b,0b)
- : "+a" (size), "+a" (to), "+a" (from), "=a" (tmp1)
- : : "cc", "memory");
- return size;
-}
-
-unsigned long raw_copy_in_user(void __user *to, const void __user *from, unsigned long n)
-{
- if (copy_with_mvcos())
- return copy_in_user_mvcos(to, from, n);
- return copy_in_user_mvc(to, from, n);
-}
-EXPORT_SYMBOL(raw_copy_in_user);
-
static inline unsigned long clear_user_mvcos(void __user *to, unsigned long size)
{
unsigned long tmp1, tmp2;
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index a834e4672f72..212632d57db9 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -822,7 +822,7 @@ void do_secure_storage_access(struct pt_regs *regs)
break;
case KERNEL_FAULT:
page = phys_to_page(addr);
- if (unlikely(!try_get_compound_head(page, 1)))
+ if (unlikely(!try_get_page(page)))
break;
rc = arch_make_page_accessible(page);
put_page(page);
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 9bb2c7512cd5..4d3b33ce81c6 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -27,7 +27,6 @@
/**
* gmap_alloc - allocate and initialize a guest address space
- * @mm: pointer to the parent mm_struct
* @limit: maximum address of the gmap address space
*
* Returns a guest address space structure.
@@ -504,7 +503,7 @@ EXPORT_SYMBOL_GPL(gmap_translate);
/**
* gmap_unlink - disconnect a page table from the gmap shadow tables
- * @gmap: pointer to guest mapping meta data structure
+ * @mm: pointer to the parent mm_struct
* @table: pointer to the host page table
* @vmaddr: vm address associated with the host page table
*/
@@ -527,7 +526,7 @@ static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *old, pmd_t new,
unsigned long gaddr);
/**
- * gmap_link - set up shadow page tables to connect a host to a guest address
+ * __gmap_link - set up shadow page tables to connect a host to a guest address
* @gmap: pointer to guest mapping meta data structure
* @gaddr: guest address
* @vmaddr: vm address
@@ -1971,7 +1970,7 @@ out_free:
EXPORT_SYMBOL_GPL(gmap_shadow_sgt);
/**
- * gmap_shadow_lookup_pgtable - find a shadow page table
+ * gmap_shadow_pgt_lookup - find a shadow page table
* @sg: pointer to the shadow guest address space structure
 * @saddr: the address in the shadow guest address space
* @pgt: parent gmap address of the page table to get shadowed
@@ -2165,7 +2164,7 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte)
}
EXPORT_SYMBOL_GPL(gmap_shadow_page);
-/**
+/*
* gmap_shadow_notify - handle notifications for shadow gmap
*
* Called with sg->parent->shadow_lock.
@@ -2225,7 +2224,7 @@ static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr,
/**
* ptep_notify - call all invalidation callbacks for a specific pte.
* @mm: pointer to the process mm_struct
- * @addr: virtual address in the process address space
+ * @vmaddr: virtual address in the process address space
* @pte: pointer to the page table entry
* @bits: bits from the pgste that caused the notify call
*
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index f14e7e61cd8e..a04faf49001a 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -307,8 +307,7 @@ int arch_add_memory(int nid, u64 start, u64 size,
return rc;
}
-void arch_remove_memory(int nid, u64 start, u64 size,
- struct vmem_altmap *altmap)
+void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index eec3a9d7176e..034721a68d8f 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -834,7 +834,7 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
}
EXPORT_SYMBOL(set_guest_storage_key);
-/**
+/*
* Conditionally set a guest storage key (handling csske).
* oldkey will be updated when either mr or mc is set and a pointer is given.
*
@@ -867,7 +867,7 @@ int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
}
EXPORT_SYMBOL(cond_set_guest_storage_key);
-/**
+/*
* Reset a guest reference bit (rrbe), returning the reference and changed bit.
*
* Returns < 0 in case of error, otherwise the cc to be reported to the guest.
diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c
index 51dc2215a2b7..be077b39da33 100644
--- a/arch/s390/pci/pci_clp.c
+++ b/arch/s390/pci/pci_clp.c
@@ -383,8 +383,8 @@ static int clp_find_pci(struct clp_req_rsp_list_pci *rrb, u32 fid,
rc = clp_list_pci_req(rrb, &resume_token, &nentries);
if (rc)
return rc;
+ fh_list = rrb->response.fh_list;
for (i = 0; i < nentries; i++) {
- fh_list = rrb->response.fh_list;
if (fh_list[i].fid == fid) {
*entry = fh_list[i];
return 0;
@@ -449,14 +449,17 @@ int clp_get_state(u32 fid, enum zpci_state *state)
struct clp_fh_list_entry entry;
int rc;
- *state = ZPCI_FN_STATE_RESERVED;
rrb = clp_alloc_block(GFP_ATOMIC);
if (!rrb)
return -ENOMEM;
rc = clp_find_pci(rrb, fid, &entry);
- if (!rc)
+ if (!rc) {
*state = entry.config_state;
+ } else if (rc == -ENODEV) {
+ *state = ZPCI_FN_STATE_RESERVED;
+ rc = 0;
+ }
clp_free_block(rrb);
return rc;
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index f37280e805ea..6904f4bdbf00 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -69,6 +69,7 @@ config SUPERH
select RTC_LIB
select SET_FS
select SPARSE_IRQ
+ select TRACE_IRQFLAGS_SUPPORT
help
The SuperH is a RISC processor targeted for use in embedded systems
and consumer electronics; it was also used in the Sega Dreamcast
diff --git a/arch/sh/Kconfig.debug b/arch/sh/Kconfig.debug
index 28a43d63bde1..958f790273ab 100644
--- a/arch/sh/Kconfig.debug
+++ b/arch/sh/Kconfig.debug
@@ -1,8 +1,5 @@
# SPDX-License-Identifier: GPL-2.0
-config TRACE_IRQFLAGS_SUPPORT
- def_bool y
-
config SH_STANDARD_BIOS
bool "Use LinuxSH standard BIOS"
help
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index ce26c7f8950a..506784702430 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -414,8 +414,7 @@ int arch_add_memory(int nid, u64 start, u64 size,
return ret;
}
-void arch_remove_memory(int nid, u64 start, u64 size,
- struct vmem_altmap *altmap)
+void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
{
unsigned long start_pfn = PFN_DOWN(start);
unsigned long nr_pages = size >> PAGE_SHIFT;
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index fa650e4eadba..b120ed947f50 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -47,6 +47,7 @@ config SPARC
select NEED_DMA_MAP_STATE
select NEED_SG_DMA_LENGTH
select SET_FS
+ select TRACE_IRQFLAGS_SUPPORT
config SPARC32
def_bool !64BIT
diff --git a/arch/sparc/Kconfig.debug b/arch/sparc/Kconfig.debug
index 50a918d496c8..6b2bec1888b3 100644
--- a/arch/sparc/Kconfig.debug
+++ b/arch/sparc/Kconfig.debug
@@ -1,9 +1,5 @@
# SPDX-License-Identifier: GPL-2.0
-config TRACE_IRQFLAGS_SUPPORT
- bool
- default y
-
config DEBUG_DCFLUSH
bool "D-cache flush debugging"
depends on SPARC64 && DEBUG_KERNEL
diff --git a/arch/sparc/include/asm/compat.h b/arch/sparc/include/asm/compat.h
index 8b63410e830f..bd949fcf9d63 100644
--- a/arch/sparc/include/asm/compat.h
+++ b/arch/sparc/include/asm/compat.h
@@ -116,25 +116,6 @@ struct compat_statfs {
#define COMPAT_OFF_T_MAX 0x7fffffff
-#ifdef CONFIG_COMPAT
-static inline void __user *arch_compat_alloc_user_space(long len)
-{
- struct pt_regs *regs = current_thread_info()->kregs;
- unsigned long usp = regs->u_regs[UREG_I6];
-
- if (test_thread_64bit_stack(usp))
- usp += STACK_BIAS;
-
- if (test_thread_flag(TIF_32BIT))
- usp &= 0xffffffffUL;
-
- usp -= len;
- usp &= ~0x7UL;
-
- return (void __user *) usp;
-}
-#endif
-
struct compat_ipc64_perm {
compat_key_t key;
__compat_uid32_t uid;
diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c
index 093849bfda50..d1cc410d2f64 100644
--- a/arch/sparc/kernel/process_64.c
+++ b/arch/sparc/kernel/process_64.c
@@ -455,7 +455,7 @@ static unsigned long clone_stackframe(unsigned long csp, unsigned long psp)
distance = fp - psp;
rval = (csp - distance);
- if (copy_in_user((void __user *) rval, (void __user *) psp, distance))
+ if (raw_copy_in_user((void __user *)rval, (void __user *)psp, distance))
rval = 0;
else if (!stack_64bit) {
if (put_user(((u32)csp),
diff --git a/arch/sparc/kernel/signal32.c b/arch/sparc/kernel/signal32.c
index 4276b9e003ca..6cc124a3bb98 100644
--- a/arch/sparc/kernel/signal32.c
+++ b/arch/sparc/kernel/signal32.c
@@ -435,9 +435,9 @@ static int setup_frame32(struct ksignal *ksig, struct pt_regs *regs,
(_COMPAT_NSIG_WORDS - 1) * sizeof(unsigned int));
if (!wsaved) {
- err |= copy_in_user((u32 __user *)sf,
- (u32 __user *)(regs->u_regs[UREG_FP]),
- sizeof(struct reg_window32));
+ err |= raw_copy_in_user((u32 __user *)sf,
+ (u32 __user *)(regs->u_regs[UREG_FP]),
+ sizeof(struct reg_window32));
} else {
struct reg_window *rp;
@@ -567,9 +567,9 @@ static int setup_rt_frame32(struct ksignal *ksig, struct pt_regs *regs,
err |= put_compat_sigset(&sf->mask, oldset, sizeof(compat_sigset_t));
if (!wsaved) {
- err |= copy_in_user((u32 __user *)sf,
- (u32 __user *)(regs->u_regs[UREG_FP]),
- sizeof(struct reg_window32));
+ err |= raw_copy_in_user((u32 __user *)sf,
+ (u32 __user *)(regs->u_regs[UREG_FP]),
+ sizeof(struct reg_window32));
} else {
struct reg_window *rp;
diff --git a/arch/sparc/kernel/signal_64.c b/arch/sparc/kernel/signal_64.c
index cea23cf95600..2a78d2af1265 100644
--- a/arch/sparc/kernel/signal_64.c
+++ b/arch/sparc/kernel/signal_64.c
@@ -406,10 +406,10 @@ setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
err |= copy_to_user(&sf->mask, sigmask_to_save(), sizeof(sigset_t));
if (!wsaved) {
- err |= copy_in_user((u64 __user *)sf,
- (u64 __user *)(regs->u_regs[UREG_FP] +
- STACK_BIAS),
- sizeof(struct reg_window));
+ err |= raw_copy_in_user((u64 __user *)sf,
+ (u64 __user *)(regs->u_regs[UREG_FP] +
+ STACK_BIAS),
+ sizeof(struct reg_window));
} else {
struct reg_window *rp;
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index 7893104718c2..c37764dc764d 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -365,12 +365,12 @@
299 common unshare sys_unshare
300 common set_robust_list sys_set_robust_list compat_sys_set_robust_list
301 common get_robust_list sys_get_robust_list compat_sys_get_robust_list
-302 common migrate_pages sys_migrate_pages compat_sys_migrate_pages
-303 common mbind sys_mbind compat_sys_mbind
-304 common get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy
-305 common set_mempolicy sys_set_mempolicy compat_sys_set_mempolicy
+302 common migrate_pages sys_migrate_pages
+303 common mbind sys_mbind
+304 common get_mempolicy sys_get_mempolicy
+305 common set_mempolicy sys_set_mempolicy
306 common kexec_load sys_kexec_load compat_sys_kexec_load
-307 common move_pages sys_move_pages compat_sys_move_pages
+307 common move_pages sys_move_pages
308 common getcpu sys_getcpu
309 common epoll_pwait sys_epoll_pwait compat_sys_epoll_pwait
310 32 utimensat sys_utimensat_time32
diff --git a/arch/um/Kconfig b/arch/um/Kconfig
index 77e66d3719f6..c18b45f75d41 100644
--- a/arch/um/Kconfig
+++ b/arch/um/Kconfig
@@ -22,7 +22,9 @@ config UML
select GENERIC_CPU_DEVICES
select HAVE_GCC_PLUGINS
select SET_FS
+ select TRACE_IRQFLAGS_SUPPORT
select TTY # Needed for line.c
+ select HAVE_ARCH_VMAP_STACK
config MMU
bool
@@ -52,10 +54,6 @@ config ISA
config SBUS
bool
-config TRACE_IRQFLAGS_SUPPORT
- bool
- default y
-
config LOCKDEP_SUPPORT
bool
default y
diff --git a/arch/um/drivers/virt-pci.c b/arch/um/drivers/virt-pci.c
index 0b802834f40a..c08066633023 100644
--- a/arch/um/drivers/virt-pci.c
+++ b/arch/um/drivers/virt-pci.c
@@ -56,6 +56,13 @@ static unsigned long um_pci_msi_used[BITS_TO_LONGS(MAX_MSI_VECTORS)];
#define UM_VIRT_PCI_MAXDELAY 40000
+struct um_pci_message_buffer {
+ struct virtio_pcidev_msg hdr;
+ u8 data[8];
+};
+
+static struct um_pci_message_buffer __percpu *um_pci_msg_bufs;
+
static int um_pci_send_cmd(struct um_pci_device *dev,
struct virtio_pcidev_msg *cmd,
unsigned int cmd_size,
@@ -68,11 +75,12 @@ static int um_pci_send_cmd(struct um_pci_device *dev,
[1] = extra ? &extra_sg : &in_sg,
[2] = extra ? &in_sg : NULL,
};
+ struct um_pci_message_buffer *buf;
int delay_count = 0;
int ret, len;
bool posted;
- if (WARN_ON(cmd_size < sizeof(*cmd)))
+ if (WARN_ON(cmd_size < sizeof(*cmd) || cmd_size > sizeof(*buf)))
return -EINVAL;
switch (cmd->op) {
@@ -88,6 +96,9 @@ static int um_pci_send_cmd(struct um_pci_device *dev,
break;
}
+ buf = get_cpu_var(um_pci_msg_bufs);
+ memcpy(buf, cmd, cmd_size);
+
if (posted) {
u8 *ncmd = kmalloc(cmd_size + extra_size, GFP_ATOMIC);
@@ -102,7 +113,10 @@ static int um_pci_send_cmd(struct um_pci_device *dev,
} else {
/* try without allocating memory */
posted = false;
+ cmd = (void *)buf;
}
+ } else {
+ cmd = (void *)buf;
}
sg_init_one(&out_sg, cmd, cmd_size);
@@ -118,11 +132,12 @@ static int um_pci_send_cmd(struct um_pci_device *dev,
posted ? cmd : HANDLE_NO_FREE(cmd),
GFP_ATOMIC);
if (ret)
- return ret;
+ goto out;
if (posted) {
virtqueue_kick(dev->cmd_vq);
- return 0;
+ ret = 0;
+ goto out;
}
/* kick and poll for getting a response on the queue */
@@ -148,6 +163,8 @@ static int um_pci_send_cmd(struct um_pci_device *dev,
}
clear_bit(UM_PCI_STAT_WAITING, &dev->status);
+out:
+ put_cpu_var(um_pci_msg_bufs);
return ret;
}
@@ -161,12 +178,17 @@ static unsigned long um_pci_cfgspace_read(void *priv, unsigned int offset,
.size = size,
.addr = offset,
};
- /* maximum size - we may only use parts of it */
- u8 data[8];
+ /* buf->data is maximum size - we may only use parts of it */
+ struct um_pci_message_buffer *buf;
+ u8 *data;
+ unsigned long ret = ~0ULL;
if (!dev)
return ~0ULL;
+ buf = get_cpu_var(um_pci_msg_bufs);
+ data = buf->data;
+
memset(data, 0xff, sizeof(data));
switch (size) {
@@ -179,27 +201,34 @@ static unsigned long um_pci_cfgspace_read(void *priv, unsigned int offset,
break;
default:
WARN(1, "invalid config space read size %d\n", size);
- return ~0ULL;
+ goto out;
}
- if (um_pci_send_cmd(dev, &hdr, sizeof(hdr), NULL, 0,
- data, sizeof(data)))
- return ~0ULL;
+ if (um_pci_send_cmd(dev, &hdr, sizeof(hdr), NULL, 0, data, 8))
+ goto out;
switch (size) {
case 1:
- return data[0];
+ ret = data[0];
+ break;
case 2:
- return le16_to_cpup((void *)data);
+ ret = le16_to_cpup((void *)data);
+ break;
case 4:
- return le32_to_cpup((void *)data);
+ ret = le32_to_cpup((void *)data);
+ break;
#ifdef CONFIG_64BIT
case 8:
- return le64_to_cpup((void *)data);
+ ret = le64_to_cpup((void *)data);
+ break;
#endif
default:
- return ~0ULL;
+ break;
}
+
+out:
+ put_cpu_var(um_pci_msg_bufs);
+ return ret;
}
static void um_pci_cfgspace_write(void *priv, unsigned int offset, int size,
@@ -272,8 +301,13 @@ static void um_pci_bar_copy_from(void *priv, void *buffer,
static unsigned long um_pci_bar_read(void *priv, unsigned int offset,
int size)
{
- /* maximum size - we may only use parts of it */
- u8 data[8];
+ /* buf->data is maximum size - we may only use parts of it */
+ struct um_pci_message_buffer *buf;
+ u8 *data;
+ unsigned long ret = ~0ULL;
+
+ buf = get_cpu_var(um_pci_msg_bufs);
+ data = buf->data;
switch (size) {
case 1:
@@ -285,25 +319,33 @@ static unsigned long um_pci_bar_read(void *priv, unsigned int offset,
break;
default:
WARN(1, "invalid config space read size %d\n", size);
- return ~0ULL;
+ goto out;
}
um_pci_bar_copy_from(priv, data, offset, size);
switch (size) {
case 1:
- return data[0];
+ ret = data[0];
+ break;
case 2:
- return le16_to_cpup((void *)data);
+ ret = le16_to_cpup((void *)data);
+ break;
case 4:
- return le32_to_cpup((void *)data);
+ ret = le32_to_cpup((void *)data);
+ break;
#ifdef CONFIG_64BIT
case 8:
- return le64_to_cpup((void *)data);
+ ret = le64_to_cpup((void *)data);
+ break;
#endif
default:
- return ~0ULL;
+ break;
}
+
+out:
+ put_cpu_var(um_pci_msg_bufs);
+ return ret;
}
static void um_pci_bar_copy_to(void *priv, unsigned int offset,
@@ -810,7 +852,7 @@ void *pci_root_bus_fwnode(struct pci_bus *bus)
return um_pci_fwnode;
}
-int um_pci_init(void)
+static int um_pci_init(void)
{
int err, i;
@@ -823,10 +865,16 @@ int um_pci_init(void)
"No virtio device ID configured for PCI - no PCI support\n"))
return 0;
- bridge = pci_alloc_host_bridge(0);
- if (!bridge)
+ um_pci_msg_bufs = alloc_percpu(struct um_pci_message_buffer);
+ if (!um_pci_msg_bufs)
return -ENOMEM;
+ bridge = pci_alloc_host_bridge(0);
+ if (!bridge) {
+ err = -ENOMEM;
+ goto free;
+ }
+
um_pci_fwnode = irq_domain_alloc_named_fwnode("um-pci");
if (!um_pci_fwnode) {
err = -ENOMEM;
@@ -878,18 +926,22 @@ free:
irq_domain_remove(um_pci_inner_domain);
if (um_pci_fwnode)
irq_domain_free_fwnode(um_pci_fwnode);
- pci_free_resource_list(&bridge->windows);
- pci_free_host_bridge(bridge);
+ if (bridge) {
+ pci_free_resource_list(&bridge->windows);
+ pci_free_host_bridge(bridge);
+ }
+ free_percpu(um_pci_msg_bufs);
return err;
}
module_init(um_pci_init);
-void um_pci_exit(void)
+static void um_pci_exit(void)
{
unregister_virtio_driver(&um_pci_virtio_driver);
irq_domain_remove(um_pci_msi_domain);
irq_domain_remove(um_pci_inner_domain);
pci_free_resource_list(&bridge->windows);
pci_free_host_bridge(bridge);
+ free_percpu(um_pci_msg_bufs);
}
module_exit(um_pci_exit);
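
um_pci_send_cmd() and the config/BAR readers now use per-CPU message buffers (um_pci_msg_bufs) obtained with get_cpu_var()/put_cpu_var() instead of on-stack buffers, which matters once the stack may be vmalloc'ed (HAVE_ARCH_VMAP_STACK is also selected in this series) and stack addresses are no longer safe to hand to the scatter-gather path. The kernel per-CPU machinery has no userspace equivalent; the sketch below only mirrors the shape of the change with a thread-local buffer and the same size check, and the message-header layout is a made-up stand-in:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

/* Stand-in for the virtio message header; the layout is illustrative only. */
struct virtio_pcidev_msg {
	uint8_t op;
	uint8_t bar;
	uint16_t size;
	uint64_t addr;
};

/* Mirrors struct um_pci_message_buffer from the hunk above. */
struct um_pci_message_buffer {
	struct virtio_pcidev_msg hdr;
	uint8_t data[8];
};

/* One buffer per execution context; _Thread_local stands in for per-CPU data. */
static _Thread_local struct um_pci_message_buffer msg_buf;

static int send_cmd(const void *cmd, size_t cmd_size)
{
	/* Same bounds check as the patched um_pci_send_cmd(). */
	if (cmd_size < sizeof(struct virtio_pcidev_msg) ||
	    cmd_size > sizeof(msg_buf))
		return -1;

	/* Copy the caller's command off its stack into the stable buffer. */
	memcpy(&msg_buf, cmd, cmd_size);
	printf("queued %zu bytes from a non-stack buffer at %p\n",
	       cmd_size, (void *)&msg_buf);
	return 0;
}

int main(void)
{
	struct virtio_pcidev_msg cmd = { .op = 1, .size = 4, .addr = 0x10 };

	return send_cmd(&cmd, sizeof(cmd));
}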
diff --git a/arch/um/drivers/virtio_uml.c b/arch/um/drivers/virtio_uml.c
index 4412d6febade..d51e445df797 100644
--- a/arch/um/drivers/virtio_uml.c
+++ b/arch/um/drivers/virtio_uml.c
@@ -27,6 +27,7 @@
#include <linux/virtio_config.h>
#include <linux/virtio_ring.h>
#include <linux/time-internal.h>
+#include <linux/virtio-uml.h>
#include <shared/as-layout.h>
#include <irq_kern.h>
#include <init.h>
@@ -1139,7 +1140,7 @@ static int virtio_uml_probe(struct platform_device *pdev)
rc = os_connect_socket(pdata->socket_path);
} while (rc == -EINTR);
if (rc < 0)
- return rc;
+ goto error_free;
vu_dev->sock = rc;
spin_lock_init(&vu_dev->sock_lock);
@@ -1160,6 +1161,8 @@ static int virtio_uml_probe(struct platform_device *pdev)
error_init:
os_close_file(vu_dev->sock);
+error_free:
+ kfree(vu_dev);
return rc;
}
diff --git a/arch/um/kernel/skas/clone.c b/arch/um/kernel/skas/clone.c
index 5afac0fef24e..ff5061f29167 100644
--- a/arch/um/kernel/skas/clone.c
+++ b/arch/um/kernel/skas/clone.c
@@ -24,8 +24,7 @@
void __attribute__ ((__section__ (".__syscall_stub")))
stub_clone_handler(void)
{
- int stack;
- struct stub_data *data = (void *) ((unsigned long)&stack & ~(UM_KERN_PAGE_SIZE - 1));
+ struct stub_data *data = get_stub_page();
long err;
err = stub_syscall2(__NR_clone, CLONE_PARENT | CLONE_FILES | SIGCHLD,
diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c
index ad12f78bda7e..3198c4767387 100644
--- a/arch/um/kernel/trap.c
+++ b/arch/um/kernel/trap.c
@@ -311,7 +311,3 @@ void winch(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs)
{
do_IRQ(WINCH_IRQ, regs);
}
-
-void trap_init(void)
-{
-}
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 1146b85d708b..4e001bbbb425 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -259,6 +259,7 @@ config X86
select STACK_VALIDATION if HAVE_STACK_VALIDATION && (HAVE_STATIC_CALL_INLINE || RETPOLINE)
select SYSCTL_EXCEPTION_TRACE
select THREAD_INFO_IN_TASK
+ select TRACE_IRQFLAGS_SUPPORT
select USER_STACKTRACE_SUPPORT
select VIRT_TO_BUS
select HAVE_ARCH_KCSAN if X86_64
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 80b57e7f4947..d3a6f74a94bd 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -1,8 +1,5 @@
# SPDX-License-Identifier: GPL-2.0
-config TRACE_IRQFLAGS_SUPPORT
- def_bool y
-
config TRACE_IRQFLAGS_NMI_SUPPORT
def_bool y
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 9c9c4a888b1d..e81885384f60 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -156,7 +156,6 @@ CONFIG_FORCEDETH=y
CONFIG_8139TOO=y
# CONFIG_8139TOO_PIO is not set
CONFIG_R8169=y
-CONFIG_INPUT_POLLDEV=y
CONFIG_INPUT_EVDEV=y
CONFIG_INPUT_JOYSTICK=y
CONFIG_INPUT_TABLET=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index b60bd2d86034..e8a7a0af2bda 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -148,7 +148,6 @@ CONFIG_SKY2=y
CONFIG_FORCEDETH=y
CONFIG_8139TOO=y
CONFIG_R8169=y
-CONFIG_INPUT_POLLDEV=y
CONFIG_INPUT_EVDEV=y
CONFIG_INPUT_JOYSTICK=y
CONFIG_INPUT_TABLET=y
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 61f18b72552b..960a021d543e 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -286,7 +286,7 @@
272 i386 fadvise64_64 sys_ia32_fadvise64_64
273 i386 vserver
274 i386 mbind sys_mbind
-275 i386 get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy
+275 i386 get_mempolicy sys_get_mempolicy
276 i386 set_mempolicy sys_set_mempolicy
277 i386 mq_open sys_mq_open compat_sys_mq_open
278 i386 mq_unlink sys_mq_unlink
@@ -328,7 +328,7 @@
314 i386 sync_file_range sys_ia32_sync_file_range
315 i386 tee sys_tee
316 i386 vmsplice sys_vmsplice
-317 i386 move_pages sys_move_pages compat_sys_move_pages
+317 i386 move_pages sys_move_pages
318 i386 getcpu sys_getcpu
319 i386 epoll_pwait sys_epoll_pwait
320 i386 utimensat sys_utimensat_time32
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 807b6a1de8e8..18b5500ea8bf 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -398,7 +398,7 @@
530 x32 set_robust_list compat_sys_set_robust_list
531 x32 get_robust_list compat_sys_get_robust_list
532 x32 vmsplice sys_vmsplice
-533 x32 move_pages compat_sys_move_pages
+533 x32 move_pages sys_move_pages
534 x32 preadv compat_sys_preadv64
535 x32 pwritev compat_sys_pwritev64
536 x32 rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index 4ae01cdb99de..7516e4199b3c 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -156,19 +156,6 @@ struct compat_shmid64_ds {
(!!(task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT))
#endif
-static inline void __user *arch_compat_alloc_user_space(long len)
-{
- compat_uptr_t sp = task_pt_regs(current)->sp;
-
- /*
- * -128 for the x32 ABI redzone. For IA32, it is not strictly
- * necessary, but not harmful.
- */
- sp -= 128;
-
- return (void __user *)round_down(sp - len, 16);
-}
-
static inline bool in_x32_syscall(void)
{
#ifdef CONFIG_X86_X32_ABI
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index a12a4987154e..cefe1d81e2e8 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -72,7 +72,6 @@ KVM_X86_OP(enable_nmi_window)
KVM_X86_OP(enable_irq_window)
KVM_X86_OP(update_cr8_intercept)
KVM_X86_OP(check_apicv_inhibit_reasons)
-KVM_X86_OP_NULL(pre_update_apicv_exec_ctrl)
KVM_X86_OP(refresh_apicv_exec_ctrl)
KVM_X86_OP(hwapic_irr_update)
KVM_X86_OP(hwapic_isr_update)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index af6ce8d4c86a..f8f48a7ec577 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -37,9 +37,21 @@
#define __KVM_HAVE_ARCH_VCPU_DEBUGFS
-#define KVM_MAX_VCPUS 288
-#define KVM_SOFT_MAX_VCPUS 240
-#define KVM_MAX_VCPU_ID 1023
+#define KVM_MAX_VCPUS 1024
+#define KVM_SOFT_MAX_VCPUS 710
+
+/*
+ * In x86, the VCPU ID corresponds to the APIC ID, and APIC IDs
+ * might be larger than the actual number of VCPUs because the
+ * APIC ID encodes CPU topology information.
+ *
+ * In the worst case, we'll need less than one extra bit for the
+ * Core ID, and less than one extra bit for the Package (Die) ID,
+ * so ratio of 4 should be enough.
+ */
+#define KVM_VCPU_ID_RATIO 4
+#define KVM_MAX_VCPU_ID (KVM_MAX_VCPUS * KVM_VCPU_ID_RATIO)
+
/* memory slots that are not exposed to userspace */
#define KVM_PRIVATE_MEM_SLOTS 3
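
With the new constants, KVM_MAX_VCPU_ID works out to 1024 * 4 = 4096: at most two extra APIC-ID topology bits (Core and Package/Die) inflate the ID space by a factor of four over the vCPU count. A compile-time restatement of that arithmetic; the macro values are copied from the hunk above, the static assert itself is only illustrative:

#include <assert.h>
#include <stdio.h>

#define KVM_MAX_VCPUS		1024
#define KVM_VCPU_ID_RATIO	4
#define KVM_MAX_VCPU_ID		(KVM_MAX_VCPUS * KVM_VCPU_ID_RATIO)

static_assert(KVM_MAX_VCPU_ID == 4096, "APIC ID space is 4x the vCPU count");

int main(void)
{
	printf("KVM_MAX_VCPUS=%d KVM_MAX_VCPU_ID=%d\n",
	       KVM_MAX_VCPUS, KVM_MAX_VCPU_ID);
	return 0;
}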
@@ -124,13 +136,6 @@
#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1))
#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE)
-static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
-{
- /* KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K) must be 0. */
- return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
- (base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
-}
-
#define KVM_PERMILLE_MMU_PAGES 20
#define KVM_MIN_ALLOC_MMU_PAGES 64UL
#define KVM_MMU_HASH_SHIFT 12
@@ -229,7 +234,8 @@ enum x86_intercept_stage;
KVM_GUESTDBG_USE_HW_BP | \
KVM_GUESTDBG_USE_SW_BP | \
KVM_GUESTDBG_INJECT_BP | \
- KVM_GUESTDBG_INJECT_DB)
+ KVM_GUESTDBG_INJECT_DB | \
+ KVM_GUESTDBG_BLOCKIRQ)
#define PFERR_PRESENT_BIT 0
@@ -447,6 +453,7 @@ struct kvm_mmu {
u64 *pae_root;
u64 *pml4_root;
+ u64 *pml5_root;
/*
* check zero bits on shadow page table entries, these
@@ -482,6 +489,7 @@ struct kvm_pmc {
* ctrl value for fixed counters.
*/
u64 current_config;
+ bool is_paused;
};
struct kvm_pmu {
@@ -522,7 +530,6 @@ struct kvm_pmu_ops;
enum {
KVM_DEBUGREG_BP_ENABLED = 1,
KVM_DEBUGREG_WONT_EXIT = 2,
- KVM_DEBUGREG_RELOAD = 4,
};
struct kvm_mtrr_range {
@@ -723,7 +730,6 @@ struct kvm_vcpu_arch {
u64 reserved_gpa_bits;
int maxphyaddr;
- int max_tdp_level;
/* emulate context */
@@ -988,6 +994,12 @@ struct kvm_hv {
/* How many vCPUs have VP index != vCPU index */
atomic_t num_mismatched_vp_indexes;
+ /*
+ * How many SynICs use 'AutoEOI' feature
+ * (protected by arch.apicv_update_lock)
+ */
+ unsigned int synic_auto_eoi_used;
+
struct hv_partition_assist_pg *hv_pa_pg;
struct kvm_hv_syndbg hv_syndbg;
};
@@ -1002,9 +1014,8 @@ struct msr_bitmap_range {
/* Xen emulation context */
struct kvm_xen {
bool long_mode;
- bool shinfo_set;
u8 upcall_vector;
- struct gfn_to_hva_cache shinfo_cache;
+ gfn_t shinfo_gfn;
};
enum kvm_irqchip_mode {
@@ -1061,6 +1072,9 @@ struct kvm_arch {
struct kvm_apic_map __rcu *apic_map;
atomic_t apic_map_dirty;
+ /* Protects apic_access_memslot_enabled and apicv_inhibit_reasons */
+ struct mutex apicv_update_lock;
+
bool apic_access_memslot_enabled;
unsigned long apicv_inhibit_reasons;
@@ -1213,9 +1227,17 @@ struct kvm_vm_stat {
u64 mmu_recycled;
u64 mmu_cache_miss;
u64 mmu_unsync;
- u64 lpages;
+ union {
+ struct {
+ atomic64_t pages_4k;
+ atomic64_t pages_2m;
+ atomic64_t pages_1g;
+ };
+ atomic64_t pages[KVM_NR_PAGE_SIZES];
+ };
u64 nx_lpage_splits;
u64 max_mmu_page_hash_collisions;
+ u64 max_mmu_rmap_size;
};
struct kvm_vcpu_stat {
@@ -1359,7 +1381,6 @@ struct kvm_x86_ops {
void (*enable_irq_window)(struct kvm_vcpu *vcpu);
void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
bool (*check_apicv_inhibit_reasons)(ulong bit);
- void (*pre_update_apicv_exec_ctrl)(struct kvm *kvm, bool activate);
void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu);
void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr);
@@ -1543,12 +1564,12 @@ void kvm_mmu_uninit_vm(struct kvm *kvm);
void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu);
void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
- struct kvm_memory_slot *memslot,
+ const struct kvm_memory_slot *memslot,
int start_level);
void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
const struct kvm_memory_slot *memslot);
void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
- struct kvm_memory_slot *memslot);
+ const struct kvm_memory_slot *memslot);
void kvm_mmu_zap_all(struct kvm *kvm);
void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen);
unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm);
@@ -1744,6 +1765,9 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu);
void kvm_request_apicv_update(struct kvm *kvm, bool activate,
unsigned long bit);
+void __kvm_request_apicv_update(struct kvm *kvm, bool activate,
+ unsigned long bit);
+
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
@@ -1754,8 +1778,8 @@ void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid);
void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd);
-void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level,
- int tdp_huge_page_level);
+void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
+ int tdp_max_root_level, int tdp_huge_page_level);
static inline u16 kvm_read_ldt(void)
{
@@ -1779,11 +1803,6 @@ static inline unsigned long read_msr(unsigned long msr)
}
#endif
-static inline u32 get_rdx_init_val(void)
-{
- return 0x600; /* P6 family */
-}
-
static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
{
kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
@@ -1816,31 +1835,6 @@ enum {
#define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0)
#define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm)
-asmlinkage void kvm_spurious_fault(void);
-
-/*
- * Hardware virtualization extension instructions may fault if a
- * reboot turns off virtualization while processes are running.
- * Usually after catching the fault we just panic; during reboot
- * instead the instruction is ignored.
- */
-#define __kvm_handle_fault_on_reboot(insn) \
- "666: \n\t" \
- insn "\n\t" \
- "jmp 668f \n\t" \
- "667: \n\t" \
- "1: \n\t" \
- ".pushsection .discard.instr_begin \n\t" \
- ".long 1b - . \n\t" \
- ".popsection \n\t" \
- "call kvm_spurious_fault \n\t" \
- "1: \n\t" \
- ".pushsection .discard.instr_end \n\t" \
- ".long 1b - . \n\t" \
- ".popsection \n\t" \
- "668: \n\t" \
- _ASM_EXTABLE(666b, 667b)
-
#define KVM_ARCH_WANT_MMU_NOTIFIER
int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
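
The kvm_vm_stat hunk above replaces the single lpages counter with per-page-size counters that are addressable both by name and by index: an anonymous union overlays pages_4k/pages_2m/pages_1g on pages[KVM_NR_PAGE_SIZES], which is what lets kvm_update_page_stats() index by level. A standalone illustration of that aliasing; plain long long stands in for atomic64_t:

#include <stdio.h>
#include <string.h>

#define KVM_NR_PAGE_SIZES 3

struct vm_stat {
	union {
		struct {
			long long pages_4k;
			long long pages_2m;
			long long pages_1g;
		};
		long long pages[KVM_NR_PAGE_SIZES];
	};
};

/* Mirrors kvm_update_page_stats(): level 1 = 4K, 2 = 2M, 3 = 1G. */
static void update_page_stats(struct vm_stat *stat, int level, int count)
{
	stat->pages[level - 1] += count;
}

int main(void)
{
	struct vm_stat stat;

	memset(&stat, 0, sizeof(stat));
	update_page_stats(&stat, 1, 512);	/* map 512 4K pages */
	update_page_stats(&stat, 2, 1);		/* map one 2M page */
	update_page_stats(&stat, 1, -512);	/* unmap the 4K pages again */

	printf("4k=%lld 2m=%lld 1g=%lld\n",
	       stat.pages_4k, stat.pages_2m, stat.pages_1g);
	return 0;
}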
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index e7265a552f4f..45697e04d771 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -58,13 +58,6 @@ raw_copy_to_user(void __user *dst, const void *src, unsigned long size)
return copy_user_generic((__force void *)dst, src, size);
}
-static __always_inline __must_check
-unsigned long raw_copy_in_user(void __user *dst, const void __user *src, unsigned long size)
-{
- return copy_user_generic((__force void *)dst,
- (__force void *)src, size);
-}
-
extern long __copy_user_nocache(void *dst, const void __user *src,
unsigned size, int zerorest);
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index a6c327f8ad9e..2ef1f6513c68 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -295,6 +295,7 @@ struct kvm_debug_exit_arch {
#define KVM_GUESTDBG_USE_HW_BP 0x00020000
#define KVM_GUESTDBG_INJECT_DB 0x00040000
#define KVM_GUESTDBG_INJECT_BP 0x00080000
+#define KVM_GUESTDBG_BLOCKIRQ 0x00100000
/* for KVM_SET_GUEST_DEBUG */
struct kvm_guest_debug_arch {
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index a26643dc6bd6..b656456c3a94 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -884,10 +884,11 @@ static void kvm_wait(u8 *ptr, u8 val)
} else {
local_irq_disable();
+ /* safe_halt() will enable IRQ */
if (READ_ONCE(*ptr) == val)
safe_halt();
-
- local_irq_enable();
+ else
+ local_irq_enable();
}
}
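
The kvm_wait() hunk closes a lost-wakeup window: the byte is re-checked with interrupts disabled, and interrupts are re-enabled either by safe_halt() itself or explicitly on the early-out path, so the halt can no longer race with the waker. In userspace the same discipline is the usual "re-check the predicate before blocking" rule; a minimal pthread sketch of that shape, offered as an analogy rather than the paravirt spinlock code:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int byte_val;	/* stands in for the *ptr == val check */

static void *waker(void *arg)
{
	pthread_mutex_lock(&lock);
	byte_val = 1;			/* publish the new value ... */
	pthread_cond_signal(&cond);	/* ... then kick the waiter  */
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, waker, NULL);

	pthread_mutex_lock(&lock);
	/*
	 * Re-check the condition while the wakeup cannot slip past us and
	 * only block if the value is still the one we are waiting to see
	 * change - the analogue of halting only if READ_ONCE(*ptr) == val.
	 */
	while (byte_val == 0)
		pthread_cond_wait(&cond, &lock);
	pthread_mutex_unlock(&lock);

	pthread_join(t, NULL);
	printf("woken with byte_val=%d\n", byte_val);
	return 0;
}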
diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c
index 95a98413dc32..54a83a744538 100644
--- a/arch/x86/kvm/debugfs.c
+++ b/arch/x86/kvm/debugfs.c
@@ -7,6 +7,8 @@
#include <linux/kvm_host.h>
#include <linux/debugfs.h>
#include "lapic.h"
+#include "mmu.h"
+#include "mmu/mmu_internal.h"
static int vcpu_get_timer_advance_ns(void *data, u64 *val)
{
@@ -73,3 +75,112 @@ void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu, struct dentry *debugfs_
&vcpu_tsc_scaling_frac_fops);
}
}
+
+/*
+ * This covers statistics <1024 (11=log(1024)+1), which should be enough to
+ * cover RMAP_RECYCLE_THRESHOLD.
+ */
+#define RMAP_LOG_SIZE 11
+
+static const char *kvm_lpage_str[KVM_NR_PAGE_SIZES] = { "4K", "2M", "1G" };
+
+static int kvm_mmu_rmaps_stat_show(struct seq_file *m, void *v)
+{
+ struct kvm_rmap_head *rmap;
+ struct kvm *kvm = m->private;
+ struct kvm_memory_slot *slot;
+ struct kvm_memslots *slots;
+ unsigned int lpage_size, index;
+ /* Still small enough to be on the stack */
+ unsigned int *log[KVM_NR_PAGE_SIZES], *cur;
+ int i, j, k, l, ret;
+
+ ret = -ENOMEM;
+ memset(log, 0, sizeof(log));
+ for (i = 0; i < KVM_NR_PAGE_SIZES; i++) {
+ log[i] = kcalloc(RMAP_LOG_SIZE, sizeof(unsigned int), GFP_KERNEL);
+ if (!log[i])
+ goto out;
+ }
+
+ mutex_lock(&kvm->slots_lock);
+ write_lock(&kvm->mmu_lock);
+
+ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ slots = __kvm_memslots(kvm, i);
+ for (j = 0; j < slots->used_slots; j++) {
+ slot = &slots->memslots[j];
+ for (k = 0; k < KVM_NR_PAGE_SIZES; k++) {
+ rmap = slot->arch.rmap[k];
+ lpage_size = kvm_mmu_slot_lpages(slot, k + 1);
+ cur = log[k];
+ for (l = 0; l < lpage_size; l++) {
+ index = ffs(pte_list_count(&rmap[l]));
+ if (WARN_ON_ONCE(index >= RMAP_LOG_SIZE))
+ index = RMAP_LOG_SIZE - 1;
+ cur[index]++;
+ }
+ }
+ }
+ }
+
+ write_unlock(&kvm->mmu_lock);
+ mutex_unlock(&kvm->slots_lock);
+
+ /* index=0 counts no rmap; index=1 counts 1 rmap */
+ seq_printf(m, "Rmap_Count:\t0\t1\t");
+ for (i = 2; i < RMAP_LOG_SIZE; i++) {
+ j = 1 << (i - 1);
+ k = (1 << i) - 1;
+ seq_printf(m, "%d-%d\t", j, k);
+ }
+ seq_printf(m, "\n");
+
+ for (i = 0; i < KVM_NR_PAGE_SIZES; i++) {
+ seq_printf(m, "Level=%s:\t", kvm_lpage_str[i]);
+ cur = log[i];
+ for (j = 0; j < RMAP_LOG_SIZE; j++)
+ seq_printf(m, "%d\t", cur[j]);
+ seq_printf(m, "\n");
+ }
+
+ ret = 0;
+out:
+ for (i = 0; i < KVM_NR_PAGE_SIZES; i++)
+ kfree(log[i]);
+
+ return ret;
+}
+
+static int kvm_mmu_rmaps_stat_open(struct inode *inode, struct file *file)
+{
+ struct kvm *kvm = inode->i_private;
+
+ if (!kvm_get_kvm_safe(kvm))
+ return -ENOENT;
+
+ return single_open(file, kvm_mmu_rmaps_stat_show, kvm);
+}
+
+static int kvm_mmu_rmaps_stat_release(struct inode *inode, struct file *file)
+{
+ struct kvm *kvm = inode->i_private;
+
+ kvm_put_kvm(kvm);
+
+ return single_release(inode, file);
+}
+
+static const struct file_operations mmu_rmaps_stat_fops = {
+ .open = kvm_mmu_rmaps_stat_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = kvm_mmu_rmaps_stat_release,
+};
+
+int kvm_arch_create_vm_debugfs(struct kvm *kvm)
+{
+ debugfs_create_file("mmu_rmaps_stat", 0644, kvm->debugfs_dentry, kvm,
+ &mmu_rmaps_stat_fops);
+ return 0;
+}
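
kvm_mmu_rmaps_stat_show() above builds a small histogram of rmap list lengths: ffs() of each count picks the column (0 and 1 get their own columns, anything at or beyond RMAP_LOG_SIZE is clamped into the last one), and the header prints the 2^(i-1)..2^i-1 ranges. A standalone version of that bucketing with a made-up sample of counts in place of pte_list_count():

#include <stdio.h>
#include <strings.h>	/* ffs() */

#define RMAP_LOG_SIZE 11

int main(void)
{
	/* Made-up rmap list lengths in place of pte_list_count(). */
	int counts[] = { 0, 1, 1, 2, 4, 4, 8, 1024 };
	unsigned int log[RMAP_LOG_SIZE] = { 0 };
	unsigned int i, index;

	for (i = 0; i < sizeof(counts) / sizeof(counts[0]); i++) {
		/* 1-based position of the lowest set bit; ffs(0) == 0. */
		index = ffs(counts[i]);
		if (index >= RMAP_LOG_SIZE)
			index = RMAP_LOG_SIZE - 1;
		log[index]++;
	}

	/* Same header as the debugfs file: 0, 1, then 2^(i-1)..2^i-1 ranges. */
	printf("Rmap_Count:\t0\t1\t");
	for (i = 2; i < RMAP_LOG_SIZE; i++)
		printf("%d-%d\t", 1 << (i - 1), (1 << i) - 1);
	printf("\nSamples:\t");
	for (i = 0; i < RMAP_LOG_SIZE; i++)
		printf("%u\t", log[i]);
	printf("\n");
	return 0;
}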
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 41d2a53c5dea..232a86a6faaf 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -88,6 +88,10 @@ static bool synic_has_vector_auto_eoi(struct kvm_vcpu_hv_synic *synic,
static void synic_update_vector(struct kvm_vcpu_hv_synic *synic,
int vector)
{
+ struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic);
+ struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
+ int auto_eoi_old, auto_eoi_new;
+
if (vector < HV_SYNIC_FIRST_VALID_VECTOR)
return;
@@ -96,10 +100,30 @@ static void synic_update_vector(struct kvm_vcpu_hv_synic *synic,
else
__clear_bit(vector, synic->vec_bitmap);
+ auto_eoi_old = bitmap_weight(synic->auto_eoi_bitmap, 256);
+
if (synic_has_vector_auto_eoi(synic, vector))
__set_bit(vector, synic->auto_eoi_bitmap);
else
__clear_bit(vector, synic->auto_eoi_bitmap);
+
+ auto_eoi_new = bitmap_weight(synic->auto_eoi_bitmap, 256);
+
+ if (!!auto_eoi_old == !!auto_eoi_new)
+ return;
+
+ mutex_lock(&vcpu->kvm->arch.apicv_update_lock);
+
+ if (auto_eoi_new)
+ hv->synic_auto_eoi_used++;
+ else
+ hv->synic_auto_eoi_used--;
+
+ __kvm_request_apicv_update(vcpu->kvm,
+ !hv->synic_auto_eoi_used,
+ APICV_INHIBIT_REASON_HYPERV);
+
+ mutex_unlock(&vcpu->kvm->arch.apicv_update_lock);
}
static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint,
@@ -933,12 +957,6 @@ int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages)
synic = to_hv_synic(vcpu);
- /*
- * Hyper-V SynIC auto EOI SINT's are
- * not compatible with APICV, so request
- * to deactivate APICV permanently.
- */
- kvm_request_apicv_update(vcpu->kvm, false, APICV_INHIBIT_REASON_HYPERV);
synic->active = true;
synic->dont_zero_synic_pages = dont_zero_synic_pages;
synic->control = HV_SYNIC_CONTROL_ENABLE;
@@ -2476,6 +2494,8 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
ent->eax |= HV_X64_ENLIGHTENED_VMCS_RECOMMENDED;
if (!cpu_smt_possible())
ent->eax |= HV_X64_NO_NONARCH_CORESHARING;
+
+ ent->eax |= HV_DEPRECATING_AEOI_RECOMMENDED;
/*
* Default number of spinlock retry attempts, matches
* HyperV 2016.
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index a6e218c6140d..5a69cce4d72d 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -220,7 +220,8 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
struct kvm_pit *pit = vcpu->kvm->arch.vpit;
struct hrtimer *timer;
- if (!kvm_vcpu_is_bsp(vcpu) || !pit)
+ /* Somewhat arbitrarily make vcpu0 the owner of the PIT. */
+ if (vcpu->vcpu_id || !pit)
return;
timer = &pit->pit_state.timer;
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
index 11e4065e1617..bbd4a5d18b5d 100644
--- a/arch/x86/kvm/ioapic.h
+++ b/arch/x86/kvm/ioapic.h
@@ -35,11 +35,7 @@ struct kvm_vcpu;
#define IOAPIC_INIT 0x5
#define IOAPIC_EXTINT 0x7
-#ifdef CONFIG_X86
#define RTC_GSI 8
-#else
-#define RTC_GSI -1U
-#endif
struct dest_map {
/* vcpu bitmap where IRQ has been sent */
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index ba5a27879f1d..76fb00921203 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -192,6 +192,9 @@ void kvm_recalculate_apic_map(struct kvm *kvm)
if (atomic_read_acquire(&kvm->arch.apic_map_dirty) == CLEAN)
return;
+ WARN_ONCE(!irqchip_in_kernel(kvm),
+ "Dirty APIC map without an in-kernel local APIC");
+
mutex_lock(&kvm->arch.apic_map_lock);
/*
* Read kvm->arch.apic_map_dirty before kvm->arch.apic_map
@@ -2265,9 +2268,6 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
u64 old_value = vcpu->arch.apic_base;
struct kvm_lapic *apic = vcpu->arch.apic;
- if (!apic)
- value |= MSR_IA32_APICBASE_BSP;
-
vcpu->arch.apic_base = value;
if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE)
@@ -2323,6 +2323,13 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
struct kvm_lapic *apic = vcpu->arch.apic;
int i;
+ if (!init_event) {
+ vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE |
+ MSR_IA32_APICBASE_ENABLE;
+ if (kvm_vcpu_is_reset_bsp(vcpu))
+ vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
+ }
+
if (!apic)
return;
@@ -2330,8 +2337,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
hrtimer_cancel(&apic->lapic_timer.timer);
if (!init_event) {
- kvm_lapic_set_base(vcpu, APIC_DEFAULT_PHYS_BASE |
- MSR_IA32_APICBASE_ENABLE);
+ apic->base_address = APIC_DEFAULT_PHYS_BASE;
+
kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
}
kvm_apic_set_version(apic->vcpu);
@@ -2364,9 +2371,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
apic->highest_isr_cache = -1;
update_divide_count(apic);
atomic_set(&apic->lapic_timer.pending, 0);
- if (kvm_vcpu_is_bsp(vcpu))
- kvm_lapic_set_base(vcpu,
- vcpu->arch.apic_base | MSR_IA32_APICBASE_BSP);
+
vcpu->arch.pv_eoi.msr_val = 0;
apic_update_ppr(apic);
if (vcpu->arch.apicv_active) {
@@ -2476,11 +2481,6 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns)
lapic_timer_advance_dynamic = false;
}
- /*
- * APIC is created enabled. This will prevent kvm_lapic_set_base from
- * thinking that APIC state has changed.
- */
- vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE;
static_branch_inc(&apic_sw_disabled.key); /* sw disabled at reset */
kvm_iodevice_init(&apic->dev, &apic_mmio_ops);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 83e6c6965f1e..e9688a9f7b57 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -240,4 +240,29 @@ static inline bool kvm_memslots_have_rmaps(struct kvm *kvm)
return smp_load_acquire(&kvm->arch.memslots_have_rmaps);
}
+static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
+{
+ /* KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K) must be 0. */
+ return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
+ (base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
+}
+
+static inline unsigned long
+__kvm_mmu_slot_lpages(struct kvm_memory_slot *slot, unsigned long npages,
+ int level)
+{
+ return gfn_to_index(slot->base_gfn + npages - 1,
+ slot->base_gfn, level) + 1;
+}
+
+static inline unsigned long
+kvm_mmu_slot_lpages(struct kvm_memory_slot *slot, int level)
+{
+ return __kvm_mmu_slot_lpages(slot, slot->npages, level);
+}
+
+static inline void kvm_update_page_stats(struct kvm *kvm, int level, int count)
+{
+ atomic64_add(count, &kvm->stat.pages[level - 1]);
+}
#endif
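
kvm_mmu_slot_lpages() computes how many large-page-granular entries a memslot needs at a given level from the first and last gfn it covers. A standalone version of the arithmetic with a worked example; KVM_HPAGE_GFN_SHIFT(level) is assumed to be 9 * (level - 1), as for 4K/2M/1G pages on x86:

#include <stdio.h>
#include <stdint.h>

/* Assumed x86 values: 9 extra gfn bits per level above 4K. */
#define HPAGE_GFN_SHIFT(level)	(((level) - 1) * 9)

static uint64_t gfn_to_index(uint64_t gfn, uint64_t base_gfn, int level)
{
	return (gfn >> HPAGE_GFN_SHIFT(level)) -
	       (base_gfn >> HPAGE_GFN_SHIFT(level));
}

/* Same computation as __kvm_mmu_slot_lpages(). */
static uint64_t slot_lpages(uint64_t base_gfn, uint64_t npages, int level)
{
	return gfn_to_index(base_gfn + npages - 1, base_gfn, level) + 1;
}

int main(void)
{
	uint64_t base_gfn = 0x800, npages = 1030;
	int level;

	for (level = 1; level <= 3; level++)	/* 1 = 4K, 2 = 2M, 3 = 1G */
		printf("level %d: %llu rmap/lpage_info entries\n", level,
		       (unsigned long long)slot_lpages(base_gfn, npages, level));
	return 0;
}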
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 47b765270239..2d7e61122af8 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -97,6 +97,7 @@ module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);
bool tdp_enabled = false;
static int max_huge_page_level __read_mostly;
+static int tdp_root_level __read_mostly;
static int max_tdp_level __read_mostly;
enum {
@@ -137,12 +138,22 @@ module_param(dbg, bool, 0644);
#include <trace/events/kvm.h>
-/* make pte_list_desc fit well in cache line */
-#define PTE_LIST_EXT 3
+/* make pte_list_desc fit well in cache lines */
+#define PTE_LIST_EXT 14
+/*
+ * Slight optimization of cacheline layout, by putting `more' and `spte_count'
+ * at the start; then accessing it will only use one single cacheline for
+ * either full (entries==PTE_LIST_EXT) case or entries<=6.
+ */
struct pte_list_desc {
- u64 *sptes[PTE_LIST_EXT];
struct pte_list_desc *more;
+ /*
+ * Stores number of entries stored in the pte_list_desc. No need to be
+ * u64 but just for easier alignment. When PTE_LIST_EXT, means full.
+ */
+ u64 spte_count;
+ u64 *sptes[PTE_LIST_EXT];
};
struct kvm_shadow_walk_iterator {
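
With PTE_LIST_EXT raised to 14 and 'more'/'spte_count' moved to the front, the descriptor is 8 + 8 + 14 * 8 = 128 bytes, exactly two 64-byte cache lines, and an entry count of six or less keeps all touched data in the first line, matching the comment in the hunk. A quick standalone size check; uint64_t stands in for u64:

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

#define PTE_LIST_EXT 14

struct pte_list_desc {
	struct pte_list_desc *more;
	uint64_t spte_count;
	uint64_t *sptes[PTE_LIST_EXT];
};

int main(void)
{
	printf("sizeof = %zu bytes (%zu cache lines of 64 bytes)\n",
	       sizeof(struct pte_list_desc),
	       sizeof(struct pte_list_desc) / 64);
	/* sptes[0..5] end at offset 16 + 6*8 = 64: still in the first line. */
	printf("sptes[6] starts at offset %zu\n",
	       offsetof(struct pte_list_desc, sptes[6]));
	return 0;
}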
@@ -193,7 +204,7 @@ struct kvm_mmu_role_regs {
* the single source of truth for the MMU's state.
*/
#define BUILD_MMU_ROLE_REGS_ACCESSOR(reg, name, flag) \
-static inline bool ____is_##reg##_##name(struct kvm_mmu_role_regs *regs)\
+static inline bool __maybe_unused ____is_##reg##_##name(struct kvm_mmu_role_regs *regs)\
{ \
return !!(regs->reg & flag); \
}
@@ -215,7 +226,7 @@ BUILD_MMU_ROLE_REGS_ACCESSOR(efer, lma, EFER_LMA);
* and the vCPU may be incorrect/irrelevant.
*/
#define BUILD_MMU_ROLE_ACCESSOR(base_or_ext, reg, name) \
-static inline bool is_##reg##_##name(struct kvm_mmu *mmu) \
+static inline bool __maybe_unused is_##reg##_##name(struct kvm_mmu *mmu) \
{ \
return !!(mmu->mmu_role. base_or_ext . reg##_##name); \
}
@@ -323,12 +334,6 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
struct x86_exception *exception)
{
- /* Check if guest physical address doesn't exceed guest maximum */
- if (kvm_vcpu_is_illegal_gpa(vcpu, gpa)) {
- exception->error_code |= PFERR_RSVD_MASK;
- return UNMAPPED_GVA;
- }
-
return gpa;
}
@@ -592,12 +597,13 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
* Rules for using mmu_spte_clear_track_bits:
* It sets the sptep from present to nonpresent, and track the
* state bits, it is used to clear the last level sptep.
- * Returns non-zero if the PTE was previously valid.
+ * Returns the old PTE.
*/
-static int mmu_spte_clear_track_bits(u64 *sptep)
+static int mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
{
kvm_pfn_t pfn;
u64 old_spte = *sptep;
+ int level = sptep_to_sp(sptep)->role.level;
if (!spte_has_volatile_bits(old_spte))
__update_clear_spte_fast(sptep, 0ull);
@@ -605,7 +611,9 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
old_spte = __update_clear_spte_slow(sptep, 0ull);
if (!is_shadow_present_pte(old_spte))
- return 0;
+ return old_spte;
+
+ kvm_update_page_stats(kvm, level, -1);
pfn = spte_to_pfn(old_spte);
@@ -622,7 +630,7 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
if (is_dirty_spte(old_spte))
kvm_set_pfn_dirty(pfn);
- return 1;
+ return old_spte;
}
/*
@@ -686,28 +694,36 @@ static bool mmu_spte_age(u64 *sptep)
static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
{
- /*
- * Prevent page table teardown by making any free-er wait during
- * kvm_flush_remote_tlbs() IPI to all active vcpus.
- */
- local_irq_disable();
+ if (is_tdp_mmu(vcpu->arch.mmu)) {
+ kvm_tdp_mmu_walk_lockless_begin();
+ } else {
+ /*
+ * Prevent page table teardown by making any free-er wait during
+ * kvm_flush_remote_tlbs() IPI to all active vcpus.
+ */
+ local_irq_disable();
- /*
- * Make sure a following spte read is not reordered ahead of the write
- * to vcpu->mode.
- */
- smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
+ /*
+ * Make sure a following spte read is not reordered ahead of the write
+ * to vcpu->mode.
+ */
+ smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
+ }
}
static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
{
- /*
- * Make sure the write to vcpu->mode is not reordered in front of
- * reads to sptes. If it does, kvm_mmu_commit_zap_page() can see us
- * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
- */
- smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
- local_irq_enable();
+ if (is_tdp_mmu(vcpu->arch.mmu)) {
+ kvm_tdp_mmu_walk_lockless_end();
+ } else {
+ /*
+ * Make sure the write to vcpu->mode is not reordered in front of
+ * reads to sptes. If it does, kvm_mmu_commit_zap_page() can see us
+ * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
+ */
+ smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
+ local_irq_enable();
+ }
}
static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
@@ -786,7 +802,7 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
return &slot->arch.lpage_info[level - 2][idx];
}
-static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot,
+static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot,
gfn_t gfn, int count)
{
struct kvm_lpage_info *linfo;
@@ -799,12 +815,12 @@ static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot,
}
}
-void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
+void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn)
{
update_gfn_disallow_lpage_count(slot, gfn, 1);
}
-void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
+void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn)
{
update_gfn_disallow_lpage_count(slot, gfn, -1);
}
@@ -893,7 +909,7 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
struct kvm_rmap_head *rmap_head)
{
struct pte_list_desc *desc;
- int i, count = 0;
+ int count = 0;
if (!rmap_head->val) {
rmap_printk("%p %llx 0->1\n", spte, *spte);
@@ -903,24 +919,24 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
desc = mmu_alloc_pte_list_desc(vcpu);
desc->sptes[0] = (u64 *)rmap_head->val;
desc->sptes[1] = spte;
+ desc->spte_count = 2;
rmap_head->val = (unsigned long)desc | 1;
++count;
} else {
rmap_printk("%p %llx many->many\n", spte, *spte);
desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
- while (desc->sptes[PTE_LIST_EXT-1]) {
+ while (desc->spte_count == PTE_LIST_EXT) {
count += PTE_LIST_EXT;
-
if (!desc->more) {
desc->more = mmu_alloc_pte_list_desc(vcpu);
desc = desc->more;
+ desc->spte_count = 0;
break;
}
desc = desc->more;
}
- for (i = 0; desc->sptes[i]; ++i)
- ++count;
- desc->sptes[i] = spte;
+ count += desc->spte_count;
+ desc->sptes[desc->spte_count++] = spte;
}
return count;
}
@@ -930,13 +946,12 @@ pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
struct pte_list_desc *desc, int i,
struct pte_list_desc *prev_desc)
{
- int j;
+ int j = desc->spte_count - 1;
- for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j)
- ;
desc->sptes[i] = desc->sptes[j];
desc->sptes[j] = NULL;
- if (j != 0)
+ desc->spte_count--;
+ if (desc->spte_count)
return;
if (!prev_desc && !desc->more)
rmap_head->val = 0;
@@ -969,7 +984,7 @@ static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
prev_desc = NULL;
while (desc) {
- for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) {
+ for (i = 0; i < desc->spte_count; ++i) {
if (desc->sptes[i] == spte) {
pte_list_desc_remove_entry(rmap_head,
desc, i, prev_desc);
@@ -984,30 +999,68 @@ static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
}
}
-static void pte_list_remove(struct kvm_rmap_head *rmap_head, u64 *sptep)
+static void pte_list_remove(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ u64 *sptep)
{
- mmu_spte_clear_track_bits(sptep);
+ mmu_spte_clear_track_bits(kvm, sptep);
__pte_list_remove(sptep, rmap_head);
}
-static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level,
- struct kvm_memory_slot *slot)
+/* Return true if rmap existed, false otherwise */
+static bool pte_list_destroy(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
{
- unsigned long idx;
+ struct pte_list_desc *desc, *next;
+ int i;
- idx = gfn_to_index(gfn, slot->base_gfn, level);
- return &slot->arch.rmap[level - PG_LEVEL_4K][idx];
+ if (!rmap_head->val)
+ return false;
+
+ if (!(rmap_head->val & 1)) {
+ mmu_spte_clear_track_bits(kvm, (u64 *)rmap_head->val);
+ goto out;
+ }
+
+ desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+
+ for (; desc; desc = next) {
+ for (i = 0; i < desc->spte_count; i++)
+ mmu_spte_clear_track_bits(kvm, desc->sptes[i]);
+ next = desc->more;
+ mmu_free_pte_list_desc(desc);
+ }
+out:
+ /* rmap_head is meaningless now, remember to reset it */
+ rmap_head->val = 0;
+ return true;
}
-static struct kvm_rmap_head *gfn_to_rmap(struct kvm *kvm, gfn_t gfn,
- struct kvm_mmu_page *sp)
+unsigned int pte_list_count(struct kvm_rmap_head *rmap_head)
{
- struct kvm_memslots *slots;
- struct kvm_memory_slot *slot;
+ struct pte_list_desc *desc;
+ unsigned int count = 0;
- slots = kvm_memslots_for_spte_role(kvm, sp->role);
- slot = __gfn_to_memslot(slots, gfn);
- return __gfn_to_rmap(gfn, sp->role.level, slot);
+ if (!rmap_head->val)
+ return 0;
+ else if (!(rmap_head->val & 1))
+ return 1;
+
+ desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+
+ while (desc) {
+ count += desc->spte_count;
+ desc = desc->more;
+ }
+
+ return count;
+}
+
+static struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level,
+ const struct kvm_memory_slot *slot)
+{
+ unsigned long idx;
+
+ idx = gfn_to_index(gfn, slot->base_gfn, level);
+ return &slot->arch.rmap[level - PG_LEVEL_4K][idx];
}
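The hunks above replace the NULL-terminated scan of desc->sptes[] with an explicit spte_count, and pte_list_count()/pte_list_destroy() lean on the same head encoding: 0 means empty, a raw pointer means a single spte stored inline, and a pointer with the low bit set points at a descriptor chain. A minimal user-space sketch of that scheme is below; the names, sizes and allocator are illustrative, not KVM's, and error/cleanup handling is omitted for brevity.

/* Toy model of the rmap head encoding and the cached spte_count. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define TOY_LIST_EXT 3

struct toy_desc {
	uint64_t *sptes[TOY_LIST_EXT];
	struct toy_desc *more;
	unsigned int spte_count;	/* mirrors the new spte_count field */
};

struct toy_rmap_head {
	unsigned long val;
};

static unsigned int toy_pte_list_count(struct toy_rmap_head *head)
{
	struct toy_desc *desc;
	unsigned int count = 0;

	if (!head->val)
		return 0;
	if (!(head->val & 1))
		return 1;			/* single spte, stored inline */

	desc = (struct toy_desc *)(head->val & ~1ul);
	for (; desc; desc = desc->more)
		count += desc->spte_count;	/* no NULL scan needed */
	return count;
}

static void toy_pte_list_add(struct toy_rmap_head *head, uint64_t *spte)
{
	struct toy_desc *desc;

	if (!head->val) {
		head->val = (unsigned long)spte;	/* 0 -> 1 */
		return;
	}
	if (!(head->val & 1)) {
		desc = calloc(1, sizeof(*desc));	/* 1 -> many */
		desc->sptes[0] = (uint64_t *)head->val;
		desc->sptes[1] = spte;
		desc->spte_count = 2;
		head->val = (unsigned long)desc | 1;
		return;
	}
	desc = (struct toy_desc *)(head->val & ~1ul);
	while (desc->spte_count == TOY_LIST_EXT) {	/* many -> many */
		if (!desc->more)
			desc->more = calloc(1, sizeof(*desc));
		desc = desc->more;
	}
	desc->sptes[desc->spte_count++] = spte;
}

int main(void)
{
	struct toy_rmap_head head = { 0 };
	uint64_t sptes[5];

	for (int i = 0; i < 5; i++)
		toy_pte_list_add(&head, &sptes[i]);
	assert(toy_pte_list_count(&head) == 5);
	printf("count = %u\n", toy_pte_list_count(&head));
	return 0;
}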
static bool rmap_can_add(struct kvm_vcpu *vcpu)
@@ -1020,24 +1073,39 @@ static bool rmap_can_add(struct kvm_vcpu *vcpu)
static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
{
+ struct kvm_memory_slot *slot;
struct kvm_mmu_page *sp;
struct kvm_rmap_head *rmap_head;
sp = sptep_to_sp(spte);
kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
- rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
return pte_list_add(vcpu, spte, rmap_head);
}
+
static void rmap_remove(struct kvm *kvm, u64 *spte)
{
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *slot;
struct kvm_mmu_page *sp;
gfn_t gfn;
struct kvm_rmap_head *rmap_head;
sp = sptep_to_sp(spte);
gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
- rmap_head = gfn_to_rmap(kvm, gfn, sp);
+
+ /*
+ * Unlike rmap_add and rmap_recycle, rmap_remove does not run in the
 * context of a vCPU, so it has to determine which memslots to use based
+ * on context information in sp->role.
+ */
+ slots = kvm_memslots_for_spte_role(kvm, sp->role);
+
+ slot = __gfn_to_memslot(slots, gfn);
+ rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
+
__pte_list_remove(spte, rmap_head);
}
@@ -1119,7 +1187,9 @@ out:
static void drop_spte(struct kvm *kvm, u64 *sptep)
{
- if (mmu_spte_clear_track_bits(sptep))
+ u64 old_spte = mmu_spte_clear_track_bits(kvm, sptep);
+
+ if (is_shadow_present_pte(old_spte))
rmap_remove(kvm, sptep);
}
@@ -1129,7 +1199,6 @@ static bool __drop_large_spte(struct kvm *kvm, u64 *sptep)
if (is_large_pte(*sptep)) {
WARN_ON(sptep_to_sp(sptep)->role.level == PG_LEVEL_4K);
drop_spte(kvm, sptep);
- --kvm->stat.lpages;
return true;
}
@@ -1218,7 +1287,7 @@ static bool spte_wrprot_for_clear_dirty(u64 *sptep)
* Returns true iff any D or W bits were cleared.
*/
static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot)
+ const struct kvm_memory_slot *slot)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -1256,8 +1325,8 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
return;
while (mask) {
- rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
- PG_LEVEL_4K, slot);
+ rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
+ PG_LEVEL_4K, slot);
__rmap_write_protect(kvm, rmap_head, false);
/* clear the first set bit */
@@ -1289,8 +1358,8 @@ static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
return;
while (mask) {
- rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
- PG_LEVEL_4K, slot);
+ rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
+ PG_LEVEL_4K, slot);
__rmap_clear_dirty(kvm, rmap_head, slot);
/* clear the first set bit */
@@ -1356,7 +1425,7 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
if (kvm_memslots_have_rmaps(kvm)) {
for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
- rmap_head = __gfn_to_rmap(gfn, i, slot);
+ rmap_head = gfn_to_rmap(gfn, i, slot);
write_protected |= __rmap_write_protect(kvm, rmap_head, true);
}
}
@@ -1377,20 +1446,9 @@ static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
}
static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot)
+ const struct kvm_memory_slot *slot)
{
- u64 *sptep;
- struct rmap_iterator iter;
- bool flush = false;
-
- while ((sptep = rmap_get_first(rmap_head, &iter))) {
- rmap_printk("spte %p %llx.\n", sptep, *sptep);
-
- pte_list_remove(rmap_head, sptep);
- flush = true;
- }
-
- return flush;
+ return pte_list_destroy(kvm, rmap_head);
}
static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
@@ -1421,13 +1479,13 @@ restart:
need_flush = 1;
if (pte_write(pte)) {
- pte_list_remove(rmap_head, sptep);
+ pte_list_remove(kvm, rmap_head, sptep);
goto restart;
} else {
new_spte = kvm_mmu_changed_pte_notifier_make_spte(
*sptep, new_pfn);
- mmu_spte_clear_track_bits(sptep);
+ mmu_spte_clear_track_bits(kvm, sptep);
mmu_spte_set(sptep, new_spte);
}
}
@@ -1442,7 +1500,7 @@ restart:
struct slot_rmap_walk_iterator {
/* input fields. */
- struct kvm_memory_slot *slot;
+ const struct kvm_memory_slot *slot;
gfn_t start_gfn;
gfn_t end_gfn;
int start_level;
@@ -1462,14 +1520,13 @@ rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level)
{
iterator->level = level;
iterator->gfn = iterator->start_gfn;
- iterator->rmap = __gfn_to_rmap(iterator->gfn, level, iterator->slot);
- iterator->end_rmap = __gfn_to_rmap(iterator->end_gfn, level,
- iterator->slot);
+ iterator->rmap = gfn_to_rmap(iterator->gfn, level, iterator->slot);
+ iterator->end_rmap = gfn_to_rmap(iterator->end_gfn, level, iterator->slot);
}
static void
slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
- struct kvm_memory_slot *slot, int start_level,
+ const struct kvm_memory_slot *slot, int start_level,
int end_level, gfn_t start_gfn, gfn_t end_gfn)
{
iterator->slot = slot;
@@ -1584,12 +1641,13 @@ static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
{
+ struct kvm_memory_slot *slot;
struct kvm_rmap_head *rmap_head;
struct kvm_mmu_page *sp;
sp = sptep_to_sp(spte);
-
- rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, __pte(0));
kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
@@ -2232,8 +2290,6 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
if (is_shadow_present_pte(pte)) {
if (is_last_spte(pte, sp->role.level)) {
drop_spte(kvm, spte);
- if (is_large_pte(pte))
- --kvm->stat.lpages;
} else {
child = to_shadow_page(pte & PT64_BASE_ADDR_MASK);
drop_parent_pte(child, spte);
@@ -2716,15 +2772,12 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
pgprintk("%s: setting spte %llx\n", __func__, *sptep);
trace_kvm_mmu_set_spte(level, gfn, sptep);
- if (!was_rmapped && is_large_pte(*sptep))
- ++vcpu->kvm->stat.lpages;
- if (is_shadow_present_pte(*sptep)) {
- if (!was_rmapped) {
- rmap_count = rmap_add(vcpu, sptep, gfn);
- if (rmap_count > RMAP_RECYCLE_THRESHOLD)
- rmap_recycle(vcpu, sptep, gfn);
- }
+ if (!was_rmapped) {
+ kvm_update_page_stats(vcpu->kvm, level, 1);
+ rmap_count = rmap_add(vcpu, sptep, gfn);
+ if (rmap_count > RMAP_RECYCLE_THRESHOLD)
+ rmap_recycle(vcpu, sptep, gfn);
}
return ret;
@@ -2852,6 +2905,7 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm,
kvm_pfn_t pfn, int max_level)
{
struct kvm_lpage_info *linfo;
+ int host_level;
max_level = min(max_level, max_huge_page_level);
for ( ; max_level > PG_LEVEL_4K; max_level--) {
@@ -2863,7 +2917,8 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm,
if (max_level == PG_LEVEL_4K)
return PG_LEVEL_4K;
- return host_pfn_mapping_level(kvm, gfn, pfn, slot);
+ host_level = host_pfn_mapping_level(kvm, gfn, pfn, slot);
+ return min(host_level, max_level);
}
int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
@@ -2887,17 +2942,12 @@ int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
if (!slot)
return PG_LEVEL_4K;
- level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, gfn, pfn, max_level);
- if (level == PG_LEVEL_4K)
- return level;
-
- *req_level = level = min(level, max_level);
-
/*
* Enforce the iTLB multihit workaround after capturing the requested
* level, which will be used to do precise, accurate accounting.
*/
- if (huge_page_disallowed)
+ *req_level = level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, gfn, pfn, max_level);
+ if (level == PG_LEVEL_4K || huge_page_disallowed)
return PG_LEVEL_4K;
/*
@@ -2965,15 +3015,16 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
break;
drop_large_spte(vcpu, it.sptep);
- if (!is_shadow_present_pte(*it.sptep)) {
- sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr,
- it.level - 1, true, ACC_ALL);
-
- link_shadow_page(vcpu, it.sptep, sp);
- if (is_tdp && huge_page_disallowed &&
- req_level >= it.level)
- account_huge_nx_page(vcpu->kvm, sp);
- }
+ if (is_shadow_present_pte(*it.sptep))
+ continue;
+
+ sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr,
+ it.level - 1, true, ACC_ALL);
+
+ link_shadow_page(vcpu, it.sptep, sp);
+ if (is_tdp && huge_page_disallowed &&
+ req_level >= it.level)
+ account_huge_nx_page(vcpu->kvm, sp);
}
ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL,
@@ -3122,15 +3173,40 @@ static bool is_access_allowed(u32 fault_err_code, u64 spte)
}
/*
- * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS.
+ * Returns the last level spte pointer of the shadow page walk for the given
+ * gpa, and sets *spte to the spte value. This spte may be non-present. If no
+ * walk could be performed, returns NULL and *spte does not contain valid data.
+ *
+ * Contract:
+ * - Must be called between walk_shadow_page_lockless_{begin,end}.
+ * - The returned sptep must not be used after walk_shadow_page_lockless_end.
*/
-static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
- u32 error_code)
+static u64 *fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gpa_t gpa, u64 *spte)
{
struct kvm_shadow_walk_iterator iterator;
+ u64 old_spte;
+ u64 *sptep = NULL;
+
+ for_each_shadow_entry_lockless(vcpu, gpa, iterator, old_spte) {
+ sptep = iterator.sptep;
+ *spte = old_spte;
+
+ if (!is_shadow_present_pte(old_spte))
+ break;
+ }
+
+ return sptep;
+}
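fast_pf_get_last_sptep() above (and its TDP MMU twin further down) follow the same idiom: iterate down the paging structure, keep overwriting the remembered sptep/spte pair, and stop at the first non-present entry, so the caller always gets the last visited entry even when the walk terminates early. A toy two-level version, with a made-up table layout, looks roughly like this:

/* Illustrative walk over a fake two-level table; not KVM's iterator. */
#include <stdint.h>
#include <stdio.h>

#define TOY_PRESENT	0x1ull
#define TOY_ENTRIES	4

static uint64_t toy_l1[TOY_ENTRIES];		/* leaf level */
static uint64_t toy_l2[TOY_ENTRIES];		/* root level */

static uint64_t *toy_get_last_sptep(unsigned long gfn, uint64_t *spte)
{
	uint64_t *levels[] = { toy_l2, toy_l1 };
	uint64_t *sptep = NULL;

	for (int i = 0; i < 2; i++) {
		sptep = &levels[i][gfn % TOY_ENTRIES];
		*spte = *sptep;
		if (!(*spte & TOY_PRESENT))
			break;		/* last entry may be non-present */
	}
	return sptep;
}

int main(void)
{
	uint64_t spte;

	toy_l2[1] = TOY_PRESENT;	/* root entry present */
	toy_l1[1] = 0;			/* leaf not yet mapped */

	uint64_t *sptep = toy_get_last_sptep(1, &spte);
	printf("last sptep %p, value 0x%llx\n", (void *)sptep,
	       (unsigned long long)spte);
	return 0;
}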
+
+/*
+ * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS.
+ */
+static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code)
+{
struct kvm_mmu_page *sp;
int ret = RET_PF_INVALID;
u64 spte = 0ull;
+ u64 *sptep = NULL;
uint retry_count = 0;
if (!page_fault_can_be_fast(error_code))
@@ -3141,14 +3217,15 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
do {
u64 new_spte;
- for_each_shadow_entry_lockless(vcpu, cr2_or_gpa, iterator, spte)
- if (!is_shadow_present_pte(spte))
- break;
+ if (is_tdp_mmu(vcpu->arch.mmu))
+ sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, gpa, &spte);
+ else
+ sptep = fast_pf_get_last_sptep(vcpu, gpa, &spte);
if (!is_shadow_present_pte(spte))
break;
- sp = sptep_to_sp(iterator.sptep);
+ sp = sptep_to_sp(sptep);
if (!is_last_spte(spte, sp->role.level))
break;
@@ -3206,8 +3283,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
* since the gfn is not stable for indirect shadow page. See
* Documentation/virt/kvm/locking.rst to get more detail.
*/
- if (fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte,
- new_spte)) {
+ if (fast_pf_fix_direct_spte(vcpu, sp, sptep, spte, new_spte)) {
ret = RET_PF_FIXED;
break;
}
@@ -3220,8 +3296,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
} while (true);
- trace_fast_page_fault(vcpu, cr2_or_gpa, error_code, iterator.sptep,
- spte, ret);
+ trace_fast_page_fault(vcpu, gpa, error_code, sptep, spte, ret);
walk_shadow_page_lockless_end(vcpu);
return ret;
@@ -3455,15 +3530,22 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
* the shadow page table may be a PAE or a long mode page table.
*/
pm_mask = PT_PRESENT_MASK | shadow_me_mask;
- if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) {
+ if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL) {
pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
if (WARN_ON_ONCE(!mmu->pml4_root)) {
r = -EIO;
goto out_unlock;
}
-
mmu->pml4_root[0] = __pa(mmu->pae_root) | pm_mask;
+
+ if (mmu->shadow_root_level == PT64_ROOT_5LEVEL) {
+ if (WARN_ON_ONCE(!mmu->pml5_root)) {
+ r = -EIO;
+ goto out_unlock;
+ }
+ mmu->pml5_root[0] = __pa(mmu->pml4_root) | pm_mask;
+ }
}
for (i = 0; i < 4; ++i) {
@@ -3482,7 +3564,9 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
mmu->pae_root[i] = root | pm_mask;
}
- if (mmu->shadow_root_level == PT64_ROOT_4LEVEL)
+ if (mmu->shadow_root_level == PT64_ROOT_5LEVEL)
+ mmu->root_hpa = __pa(mmu->pml5_root);
+ else if (mmu->shadow_root_level == PT64_ROOT_4LEVEL)
mmu->root_hpa = __pa(mmu->pml4_root);
else
mmu->root_hpa = __pa(mmu->pae_root);
@@ -3498,7 +3582,10 @@ out_unlock:
static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *mmu = vcpu->arch.mmu;
- u64 *pml4_root, *pae_root;
+ bool need_pml5 = mmu->shadow_root_level > PT64_ROOT_4LEVEL;
+ u64 *pml5_root = NULL;
+ u64 *pml4_root = NULL;
+ u64 *pae_root;
/*
* When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP
@@ -3511,20 +3598,21 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
return 0;
/*
- * This mess only works with 4-level paging and needs to be updated to
- * work with 5-level paging.
+ * NPT, the only paging mode that uses this horror, uses a fixed number
+ * of levels for the shadow page tables, e.g. all MMUs are 4-level or
+ * all MMUs are 5-level. Thus, this can safely require that pml5_root
+ * is allocated if the other roots are valid and pml5 is needed, as any
+ * prior MMU would also have required pml5.
*/
- if (WARN_ON_ONCE(mmu->shadow_root_level != PT64_ROOT_4LEVEL))
- return -EIO;
-
- if (mmu->pae_root && mmu->pml4_root)
+ if (mmu->pae_root && mmu->pml4_root && (!need_pml5 || mmu->pml5_root))
return 0;
/*
* The special roots should always be allocated in concert. Yell and
* bail if KVM ends up in a state where only one of the roots is valid.
*/
- if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root))
+ if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root ||
+ (need_pml5 && mmu->pml5_root)))
return -EIO;
/*
@@ -3535,16 +3623,31 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
if (!pae_root)
return -ENOMEM;
+#ifdef CONFIG_X86_64
pml4_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
- if (!pml4_root) {
- free_page((unsigned long)pae_root);
- return -ENOMEM;
+ if (!pml4_root)
+ goto err_pml4;
+
+ if (need_pml5) {
+ pml5_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+ if (!pml5_root)
+ goto err_pml5;
}
+#endif
mmu->pae_root = pae_root;
mmu->pml4_root = pml4_root;
+ mmu->pml5_root = pml5_root;
return 0;
+
+#ifdef CONFIG_X86_64
+err_pml5:
+ free_page((unsigned long)pml4_root);
+err_pml4:
+ free_page((unsigned long)pae_root);
+ return -ENOMEM;
+#endif
}
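The new error path in mmu_alloc_special_roots() is the usual allocate-in-order / unwind-on-failure goto pattern, with pml5_root only allocated when a 5-level root is actually needed. A stripped-down sketch of the same shape is below, with plain calloc() standing in for get_zeroed_page() and purely illustrative names.

/* Sketch of allocate-three-with-unwind; not the kernel implementation. */
#include <stdio.h>
#include <stdlib.h>

struct toy_roots {
	void *pae_root, *pml4_root, *pml5_root;
};

static int toy_alloc_special_roots(struct toy_roots *r, int need_pml5)
{
	r->pae_root = calloc(1, 4096);
	if (!r->pae_root)
		return -1;

	r->pml4_root = calloc(1, 4096);
	if (!r->pml4_root)
		goto err_pml4;

	if (need_pml5) {
		r->pml5_root = calloc(1, 4096);
		if (!r->pml5_root)
			goto err_pml5;
	}
	return 0;

err_pml5:
	free(r->pml4_root);		/* undo only what succeeded */
err_pml4:
	free(r->pae_root);
	return -1;
}

int main(void)
{
	struct toy_roots r = { 0 };

	if (!toy_alloc_special_roots(&r, 1))
		puts("allocated pae/pml4/pml5 roots");
	free(r.pml5_root);
	free(r.pml4_root);
	free(r.pae_root);
	return 0;
}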
void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
@@ -3640,6 +3743,8 @@ static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
/*
* Return the level of the lowest level SPTE added to sptes.
* That SPTE may be non-present.
+ *
+ * Must be called between walk_shadow_page_lockless_{begin,end}.
*/
static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level)
{
@@ -3647,8 +3752,6 @@ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level
int leaf = -1;
u64 spte;
- walk_shadow_page_lockless_begin(vcpu);
-
for (shadow_walk_init(&iterator, vcpu, addr),
*root_level = iterator.level;
shadow_walk_okay(&iterator);
@@ -3662,8 +3765,6 @@ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level
break;
}
- walk_shadow_page_lockless_end(vcpu);
-
return leaf;
}
@@ -3675,11 +3776,15 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
int root, leaf, level;
bool reserved = false;
+ walk_shadow_page_lockless_begin(vcpu);
+
if (is_tdp_mmu(vcpu->arch.mmu))
leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root);
else
leaf = get_walk(vcpu, addr, sptes, &root);
+ walk_shadow_page_lockless_end(vcpu);
+
if (unlikely(leaf < 0)) {
*sptep = 0ull;
return reserved;
@@ -3795,9 +3900,9 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
}
-static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
+static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
gpa_t cr2_or_gpa, kvm_pfn_t *pfn, hva_t *hva,
- bool write, bool *writable)
+ bool write, bool *writable, int *r)
{
struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
bool async;
@@ -3808,13 +3913,26 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
* be zapped before KVM inserts a new MMIO SPTE for the gfn.
*/
if (slot && (slot->flags & KVM_MEMSLOT_INVALID))
- return true;
-
- /* Don't expose private memslots to L2. */
- if (is_guest_mode(vcpu) && !kvm_is_visible_memslot(slot)) {
- *pfn = KVM_PFN_NOSLOT;
- *writable = false;
- return false;
+ goto out_retry;
+
+ if (!kvm_is_visible_memslot(slot)) {
+ /* Don't expose private memslots to L2. */
+ if (is_guest_mode(vcpu)) {
+ *pfn = KVM_PFN_NOSLOT;
+ *writable = false;
+ return false;
+ }
+ /*
+ * If the APIC access page exists but is disabled, go directly
+ * to emulation without caching the MMIO access or creating a
+ * MMIO SPTE. That way the cache doesn't need to be purged
+ * when the AVIC is re-enabled.
+ */
+ if (slot && slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT &&
+ !kvm_apicv_activated(vcpu->kvm)) {
+ *r = RET_PF_EMULATE;
+ return true;
+ }
}
async = false;
@@ -3828,14 +3946,17 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
if (kvm_find_async_pf_gfn(vcpu, gfn)) {
trace_kvm_async_pf_doublefault(cr2_or_gpa, gfn);
kvm_make_request(KVM_REQ_APF_HALT, vcpu);
- return true;
+ goto out_retry;
} else if (kvm_arch_setup_async_pf(vcpu, cr2_or_gpa, gfn))
- return true;
+ goto out_retry;
}
*pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL,
write, writable, hva);
- return false;
+
+out_retry:
+ *r = RET_PF_RETRY;
+ return true;
}
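kvm_faultin_pfn() changes the calling convention from "return true to retry" to "return true when the fault is fully handled, with the RET_PF_* result passed back through *r", which is what lets both callers simply "return r". A minimal illustration of that convention, using placeholder slot states and return codes rather than KVM's:

/* Illustrative "handled + out-parameter result" convention. */
#include <stdbool.h>
#include <stdio.h>

enum { TOY_PF_RETRY, TOY_PF_EMULATE, TOY_PF_CONTINUE };

static bool toy_faultin_pfn(int slot_state, int *r)
{
	if (slot_state == 0) {		/* e.g. invalid memslot: retry */
		*r = TOY_PF_RETRY;
		return true;
	}
	if (slot_state == 1) {		/* e.g. disabled APIC page: emulate */
		*r = TOY_PF_EMULATE;
		return true;
	}
	return false;			/* not handled, keep faulting in */
}

static int toy_page_fault(int slot_state)
{
	int r = TOY_PF_CONTINUE;

	if (toy_faultin_pfn(slot_state, &r))
		return r;		/* mirrors "return r" in the callers */
	return TOY_PF_CONTINUE;
}

int main(void)
{
	printf("invalid slot  -> %d\n", toy_page_fault(0));
	printf("disabled apic -> %d\n", toy_page_fault(1));
	printf("normal slot   -> %d\n", toy_page_fault(2));
	return 0;
}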
static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
@@ -3854,11 +3975,9 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
if (page_fault_handle_page_track(vcpu, error_code, gfn))
return RET_PF_EMULATE;
- if (!is_tdp_mmu_fault) {
- r = fast_page_fault(vcpu, gpa, error_code);
- if (r != RET_PF_INVALID)
- return r;
- }
+ r = fast_page_fault(vcpu, gpa, error_code);
+ if (r != RET_PF_INVALID)
+ return r;
r = mmu_topup_memory_caches(vcpu, false);
if (r)
@@ -3867,9 +3986,9 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
- if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, &hva,
- write, &map_writable))
- return RET_PF_RETRY;
+ if (kvm_faultin_pfn(vcpu, prefault, gfn, gpa, &pfn, &hva,
+ write, &map_writable, &r))
+ return r;
if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, pfn, ACC_ALL, &r))
return r;
@@ -4588,6 +4707,10 @@ static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu,
static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
{
+ /* tdp_root_level is architecture forced level, use it if nonzero */
+ if (tdp_root_level)
+ return tdp_root_level;
+
/* Use 5-level TDP if and only if it's useful/necessary. */
if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48)
return 4;
@@ -5160,7 +5283,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
if (r == RET_PF_INVALID) {
r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa,
lower_32_bits(error_code), false);
- if (WARN_ON_ONCE(r == RET_PF_INVALID))
+ if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm))
return -EIO;
}
@@ -5279,10 +5402,11 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
*/
}
-void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level,
- int tdp_huge_page_level)
+void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
+ int tdp_max_root_level, int tdp_huge_page_level)
{
tdp_enabled = enable_tdp;
+ tdp_root_level = tdp_forced_root_level;
max_tdp_level = tdp_max_root_level;
/*
@@ -5302,12 +5426,13 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level,
EXPORT_SYMBOL_GPL(kvm_configure_mmu);
/* The return value indicates if tlb flush on all vcpus is needed. */
-typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot);
+typedef bool (*slot_level_handler) (struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head,
+ const struct kvm_memory_slot *slot);
/* The caller should hold mmu-lock before calling this function. */
static __always_inline bool
-slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
+slot_handle_level_range(struct kvm *kvm, const struct kvm_memory_slot *memslot,
slot_level_handler fn, int start_level, int end_level,
gfn_t start_gfn, gfn_t end_gfn, bool flush_on_yield,
bool flush)
@@ -5334,7 +5459,7 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
}
static __always_inline bool
-slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
+slot_handle_level(struct kvm *kvm, const struct kvm_memory_slot *memslot,
slot_level_handler fn, int start_level, int end_level,
bool flush_on_yield)
{
@@ -5345,7 +5470,7 @@ slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
}
static __always_inline bool
-slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
+slot_handle_leaf(struct kvm *kvm, const struct kvm_memory_slot *memslot,
slot_level_handler fn, bool flush_on_yield)
{
return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K,
@@ -5358,6 +5483,7 @@ static void free_mmu_pages(struct kvm_mmu *mmu)
set_memory_encrypted((unsigned long)mmu->pae_root, 1);
free_page((unsigned long)mmu->pae_root);
free_page((unsigned long)mmu->pml4_root);
+ free_page((unsigned long)mmu->pml5_root);
}
static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
@@ -5587,6 +5713,10 @@ void kvm_mmu_uninit_vm(struct kvm *kvm)
kvm_mmu_uninit_tdp_mmu(kvm);
}
+/*
+ * Invalidate (zap) SPTEs that cover GFNs from gfn_start and up to gfn_end
+ * (not including it)
+ */
void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
{
struct kvm_memslots *slots;
@@ -5594,8 +5724,11 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
int i;
bool flush = false;
+ write_lock(&kvm->mmu_lock);
+
+ kvm_inc_notifier_count(kvm, gfn_start, gfn_end);
+
if (kvm_memslots_have_rmaps(kvm)) {
- write_lock(&kvm->mmu_lock);
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
slots = __kvm_memslots(kvm, i);
kvm_for_each_memslot(memslot, slots) {
@@ -5606,41 +5739,44 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
if (start >= end)
continue;
- flush = slot_handle_level_range(kvm, memslot,
+ flush = slot_handle_level_range(kvm,
+ (const struct kvm_memory_slot *) memslot,
kvm_zap_rmapp, PG_LEVEL_4K,
KVM_MAX_HUGEPAGE_LEVEL, start,
end - 1, true, flush);
}
}
if (flush)
- kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end);
- write_unlock(&kvm->mmu_lock);
+ kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
+ gfn_end - gfn_start);
}
if (is_tdp_mmu_enabled(kvm)) {
- flush = false;
-
- read_lock(&kvm->mmu_lock);
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, gfn_start,
- gfn_end, flush, true);
+ gfn_end, flush);
if (flush)
kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
- gfn_end);
-
- read_unlock(&kvm->mmu_lock);
+ gfn_end - gfn_start);
}
+
+ if (flush)
+ kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end);
+
+ kvm_dec_notifier_count(kvm, gfn_start, gfn_end);
+
+ write_unlock(&kvm->mmu_lock);
}
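kvm_zap_gfn_range() now brackets the whole operation: it takes the write lock once, raises the notifier in-progress count before touching anything, flushes as needed, then drops the count and the lock. A rough user-space model of that bracketing is below; the pthread rwlock and the plain atomic are stand-ins for mmu_lock and the notifier count, and the zap itself is elided.

/* Loose model of lock/inc ... work ... dec/unlock bracketing. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_rwlock_t toy_mmu_lock = PTHREAD_RWLOCK_INITIALIZER;
static _Atomic int toy_range_in_progress;

static void toy_zap_gfn_range(unsigned long start, unsigned long end)
{
	int flush = 0;

	pthread_rwlock_wrlock(&toy_mmu_lock);
	atomic_fetch_add(&toy_range_in_progress, 1);

	/* ... zap rmaps and TDP MMU ranges here, setting flush ... */
	flush = 1;
	if (flush)
		printf("flush gfns [%lu, %lu)\n", start, end);

	atomic_fetch_sub(&toy_range_in_progress, 1);
	pthread_rwlock_unlock(&toy_mmu_lock);
}

int main(void)
{
	toy_zap_gfn_range(0, 512);
	return 0;
}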
static bool slot_rmap_write_protect(struct kvm *kvm,
struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot)
+ const struct kvm_memory_slot *slot)
{
return __rmap_write_protect(kvm, rmap_head, false);
}
void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
- struct kvm_memory_slot *memslot,
+ const struct kvm_memory_slot *memslot,
int start_level)
{
bool flush = false;
@@ -5676,7 +5812,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot)
+ const struct kvm_memory_slot *slot)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -5699,7 +5835,7 @@ restart:
if (sp->role.direct && !kvm_is_reserved_pfn(pfn) &&
sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn,
pfn, PG_LEVEL_NUM)) {
- pte_list_remove(rmap_head, sptep);
+ pte_list_remove(kvm, rmap_head, sptep);
if (kvm_available_flush_tlb_with_range())
kvm_flush_remote_tlbs_with_address(kvm, sp->gfn,
@@ -5715,10 +5851,8 @@ restart:
}
void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
- const struct kvm_memory_slot *memslot)
+ const struct kvm_memory_slot *slot)
{
- /* FIXME: const-ify all uses of struct kvm_memory_slot. */
- struct kvm_memory_slot *slot = (struct kvm_memory_slot *)memslot;
bool flush = false;
if (kvm_memslots_have_rmaps(kvm)) {
@@ -5754,7 +5888,7 @@ void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
}
void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
- struct kvm_memory_slot *memslot)
+ const struct kvm_memory_slot *memslot)
{
bool flush = false;
diff --git a/arch/x86/kvm/mmu/mmu_audit.c b/arch/x86/kvm/mmu/mmu_audit.c
index cedc17b2f60e..9e7dcf999f08 100644
--- a/arch/x86/kvm/mmu/mmu_audit.c
+++ b/arch/x86/kvm/mmu/mmu_audit.c
@@ -147,7 +147,7 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
return;
}
- rmap_head = __gfn_to_rmap(gfn, rev_sp->role.level, slot);
+ rmap_head = gfn_to_rmap(gfn, rev_sp->role.level, slot);
if (!rmap_head->val) {
if (!__ratelimit(&ratelimit_state))
return;
@@ -200,7 +200,7 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
slots = kvm_memslots_for_spte_role(kvm, sp->role);
slot = __gfn_to_memslot(slots, sp->gfn);
- rmap_head = __gfn_to_rmap(sp->gfn, PG_LEVEL_4K, slot);
+ rmap_head = gfn_to_rmap(sp->gfn, PG_LEVEL_4K, slot);
for_each_rmap_spte(rmap_head, &iter, sptep) {
if (is_writable_pte(*sptep))
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 35567293c1fd..bf2bdbf333c2 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -31,13 +31,16 @@ extern bool dbg;
#define IS_VALID_PAE_ROOT(x) (!!(x))
struct kvm_mmu_page {
+ /*
+ * Note, "link" through "spt" fit in a single 64 byte cache line on
+ * 64-bit kernels, keep it that way unless there's a reason not to.
+ */
struct list_head link;
struct hlist_node hash_link;
- struct list_head lpage_disallowed_link;
+ bool tdp_mmu_page;
bool unsync;
u8 mmu_valid_gen;
- bool mmio_cached;
bool lpage_disallowed; /* Can't be replaced by an equiv large page */
/*
@@ -59,6 +62,7 @@ struct kvm_mmu_page {
struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
DECLARE_BITMAP(unsync_child_bitmap, 512);
+ struct list_head lpage_disallowed_link;
#ifdef CONFIG_X86_32
/*
* Used out of the mmu-lock to avoid reading spte values while an
@@ -71,8 +75,6 @@ struct kvm_mmu_page {
atomic_t write_flooding_count;
#ifdef CONFIG_X86_64
- bool tdp_mmu_page;
-
/* Used for freeing the page asynchronously if it is a TDP MMU page. */
struct rcu_head rcu_head;
#endif
@@ -124,13 +126,14 @@ static inline bool is_nx_huge_page_enabled(void)
int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync);
-void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
-void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
+void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn);
+void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn);
bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
struct kvm_memory_slot *slot, u64 gfn,
int min_level);
void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
u64 start_gfn, u64 pages);
+unsigned int pte_list_count(struct kvm_rmap_head *rmap_head);
/*
* Return values of handle_mmio_page_fault, mmu.page_fault, and fast_page_fault().
@@ -140,6 +143,9 @@ void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
* RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
* RET_PF_FIXED: The faulting entry has been fixed.
* RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU.
+ *
+ * Any names added to this enum should be exported to userspace for use in
+ * tracepoints via TRACE_DEFINE_ENUM() in mmutrace.h
*/
enum {
RET_PF_RETRY = 0,
diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h
index efbad33a0645..2924a4081a19 100644
--- a/arch/x86/kvm/mmu/mmutrace.h
+++ b/arch/x86/kvm/mmu/mmutrace.h
@@ -54,6 +54,12 @@
{ PFERR_RSVD_MASK, "RSVD" }, \
{ PFERR_FETCH_MASK, "F" }
+TRACE_DEFINE_ENUM(RET_PF_RETRY);
+TRACE_DEFINE_ENUM(RET_PF_EMULATE);
+TRACE_DEFINE_ENUM(RET_PF_INVALID);
+TRACE_DEFINE_ENUM(RET_PF_FIXED);
+TRACE_DEFINE_ENUM(RET_PF_SPURIOUS);
+
/*
* A pagetable walk has started
*/
diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c
index 91a9f7e0fd91..269f11f92fd0 100644
--- a/arch/x86/kvm/mmu/page_track.c
+++ b/arch/x86/kvm/mmu/page_track.c
@@ -16,6 +16,7 @@
#include <asm/kvm_page_track.h>
+#include "mmu.h"
#include "mmu_internal.h"
void kvm_page_track_free_memslot(struct kvm_memory_slot *slot)
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index ee044d357b5f..7d03e9b7ccfa 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -881,9 +881,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
- if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, &hva,
- write_fault, &map_writable))
- return RET_PF_RETRY;
+ if (kvm_faultin_pfn(vcpu, prefault, walker.gfn, addr, &pfn, &hva,
+ write_fault, &map_writable, &r))
+ return r;
if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r))
return r;
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index d80cb122b5f3..64ccfc1fa553 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -10,7 +10,7 @@
#include <asm/cmpxchg.h>
#include <trace/events/kvm.h>
-static bool __read_mostly tdp_mmu_enabled = false;
+static bool __read_mostly tdp_mmu_enabled = true;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
/* Initializes the TDP MMU for the VM, if enabled. */
@@ -255,26 +255,17 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
*
* @kvm: kvm instance
* @sp: the new page
- * @shared: This operation may not be running under the exclusive use of
- * the MMU lock and the operation must synchronize with other
- * threads that might be adding or removing pages.
* @account_nx: This page replaces a NX large page and should be marked for
* eventual reclaim.
*/
static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
- bool shared, bool account_nx)
+ bool account_nx)
{
- if (shared)
- spin_lock(&kvm->arch.tdp_mmu_pages_lock);
- else
- lockdep_assert_held_write(&kvm->mmu_lock);
-
+ spin_lock(&kvm->arch.tdp_mmu_pages_lock);
list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
if (account_nx)
account_huge_nx_page(kvm, sp);
-
- if (shared)
- spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+ spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}
/**
@@ -445,13 +436,6 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
- if (is_large_pte(old_spte) != is_large_pte(new_spte)) {
- if (is_large_pte(old_spte))
- atomic64_sub(1, (atomic64_t*)&kvm->stat.lpages);
- else
- atomic64_add(1, (atomic64_t*)&kvm->stat.lpages);
- }
-
/*
* The only times a SPTE should be changed from a non-present to
* non-present state is when an MMIO entry is installed/modified/
@@ -477,6 +461,8 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
return;
}
+ if (is_leaf != was_leaf)
+ kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
if (was_leaf && is_dirty_spte(old_spte) &&
(!is_present || !is_dirty_spte(new_spte) || pfn_changed))
@@ -526,6 +512,10 @@ static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm,
if (is_removed_spte(iter->old_spte))
return false;
+ /*
+ * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
+ * does not hold the mmu_lock.
+ */
if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
new_spte) != iter->old_spte)
return false;
@@ -537,15 +527,40 @@ static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm,
return true;
}
-static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
- struct tdp_iter *iter,
- u64 new_spte)
+/*
+ * tdp_mmu_map_set_spte_atomic - Set a leaf TDP MMU SPTE atomically to resolve a
+ * TDP page fault.
+ *
+ * @vcpu: The vcpu instance that took the TDP page fault.
+ * @iter: a tdp_iter instance currently on the SPTE that should be set
+ * @new_spte: The value the SPTE should be set to
+ *
+ * Returns: true if the SPTE was set, false if it was not. If false is returned,
+ * this function will have no side-effects.
+ */
+static inline bool tdp_mmu_map_set_spte_atomic(struct kvm_vcpu *vcpu,
+ struct tdp_iter *iter,
+ u64 new_spte)
{
+ struct kvm *kvm = vcpu->kvm;
+
if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, new_spte))
return false;
- handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
- iter->old_spte, new_spte, iter->level);
+ /*
+ * Use kvm_vcpu_gfn_to_memslot() instead of going through
+ * handle_changed_spte_dirty_log() to leverage vcpu->last_used_slot.
+ */
+ if (is_writable_pte(new_spte)) {
+ struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, iter->gfn);
+
+ if (slot && kvm_slot_dirty_track_enabled(slot)) {
+ /* Enforced by kvm_mmu_hugepage_adjust. */
+ WARN_ON_ONCE(iter->level > PG_LEVEL_4K);
+ mark_page_dirty_in_slot(kvm, slot, iter->gfn);
+ }
+ }
+
return true;
}
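Underneath both tdp_mmu_set_spte_atomic_no_dirty_log() and the new tdp_mmu_map_set_spte_atomic() sits a plain compare-and-exchange: the SPTE is written only if it still holds the value observed during the lockless walk, otherwise the caller retries the fault. A minimal user-space model of that retry rule, with cmpxchg64 replaced by C11 atomics:

/* Illustrative CAS-or-retry update; not KVM's SPTE code. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool toy_set_spte_atomic(_Atomic uint64_t *sptep, uint64_t old_spte,
				uint64_t new_spte)
{
	/* fails if another walker changed the SPTE since it was read */
	return atomic_compare_exchange_strong(sptep, &old_spte, new_spte);
}

int main(void)
{
	_Atomic uint64_t spte = 0;

	if (toy_set_spte_atomic(&spte, 0, 0xabcull | 1))
		puts("install succeeded");
	if (!toy_set_spte_atomic(&spte, 0, 0xdefull | 1))
		puts("stale old value, caller must retry");
	return 0;
}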
@@ -558,7 +573,7 @@ static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
* immediately installing a present entry in its place
* before the TLBs are flushed.
*/
- if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
+ if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, REMOVED_SPTE))
return false;
kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
@@ -789,21 +804,15 @@ retry:
* non-root pages mapping GFNs strictly within that range. Returns true if
* SPTEs have been cleared and a TLB flush is needed before releasing the
* MMU lock.
- *
- * If shared is true, this thread holds the MMU lock in read mode and must
- * account for the possibility that other threads are modifying the paging
- * structures concurrently. If shared is false, this thread should hold the
- * MMU in write mode.
*/
bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
- gfn_t end, bool can_yield, bool flush,
- bool shared)
+ gfn_t end, bool can_yield, bool flush)
{
struct kvm_mmu_page *root;
- for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, shared)
+ for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, false)
flush = zap_gfn_range(kvm, root, start, end, can_yield, flush,
- shared);
+ false);
return flush;
}
@@ -814,8 +823,7 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm)
int i;
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
- flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, -1ull,
- flush, false);
+ flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, -1ull, flush);
if (flush)
kvm_flush_remote_tlbs(kvm);
@@ -940,7 +948,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
if (new_spte == iter->old_spte)
ret = RET_PF_SPURIOUS;
- else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
+ else if (!tdp_mmu_map_set_spte_atomic(vcpu, iter, new_spte))
return RET_PF_RETRY;
/*
@@ -1044,9 +1052,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
new_spte = make_nonleaf_spte(child_pt,
!shadow_accessed_mask);
- if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter,
- new_spte)) {
- tdp_mmu_link_page(vcpu->kvm, sp, true,
+ if (tdp_mmu_set_spte_atomic_no_dirty_log(vcpu->kvm, &iter, new_spte)) {
+ tdp_mmu_link_page(vcpu->kvm, sp,
huge_page_disallowed &&
req_level >= iter.level);
@@ -1255,8 +1262,8 @@ retry:
* only affect leaf SPTEs down to min_level.
* Returns true if an SPTE has been changed and the TLBs need to be flushed.
*/
-bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
- int min_level)
+bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
+ const struct kvm_memory_slot *slot, int min_level)
{
struct kvm_mmu_page *root;
bool spte_set = false;
@@ -1326,7 +1333,8 @@ retry:
* each SPTE. Returns true if an SPTE has been changed and the TLBs need to
* be flushed.
*/
-bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
+bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
+ const struct kvm_memory_slot *slot)
{
struct kvm_mmu_page *root;
bool spte_set = false;
@@ -1529,6 +1537,8 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
/*
* Return the level of the lowest level SPTE added to sptes.
* That SPTE may be non-present.
+ *
+ * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
*/
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
int *root_level)
@@ -1540,14 +1550,47 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
*root_level = vcpu->arch.mmu->shadow_root_level;
- rcu_read_lock();
-
tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
leaf = iter.level;
sptes[leaf] = iter.old_spte;
}
- rcu_read_unlock();
-
return leaf;
}
+
+/*
+ * Returns the last level spte pointer of the shadow page walk for the given
+ * gpa, and sets *spte to the spte value. This spte may be non-present. If no
+ * walk could be performed, returns NULL and *spte does not contain valid data.
+ *
+ * Contract:
+ * - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
+ * - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
+ *
+ * WARNING: This function is only intended to be called during fast_page_fault.
+ */
+u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
+ u64 *spte)
+{
+ struct tdp_iter iter;
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ gfn_t gfn = addr >> PAGE_SHIFT;
+ tdp_ptep_t sptep = NULL;
+
+ tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
+ *spte = iter.old_spte;
+ sptep = iter.sptep;
+ }
+
+ /*
+ * Perform the rcu_dereference to get the raw spte pointer value since
+ * we are passing it up to fast_page_fault, which is shared with the
+ * legacy MMU and thus does not retain the TDP MMU-specific __rcu
+ * annotation.
+ *
+ * This is safe since fast_page_fault obeys the contracts of this
+ * function as well as all TDP MMU contracts around modifying SPTEs
+ * outside of mmu_lock.
+ */
+ return rcu_dereference(sptep);
+}
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 1cae4485b3bc..358f447d4012 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -20,14 +20,11 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
bool shared);
bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
- gfn_t end, bool can_yield, bool flush,
- bool shared);
+ gfn_t end, bool can_yield, bool flush);
static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id,
- gfn_t start, gfn_t end, bool flush,
- bool shared)
+ gfn_t start, gfn_t end, bool flush)
{
- return __kvm_tdp_mmu_zap_gfn_range(kvm, as_id, start, end, true, flush,
- shared);
+ return __kvm_tdp_mmu_zap_gfn_range(kvm, as_id, start, end, true, flush);
}
static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
@@ -44,7 +41,7 @@ static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
*/
lockdep_assert_held_write(&kvm->mmu_lock);
return __kvm_tdp_mmu_zap_gfn_range(kvm, kvm_mmu_page_as_id(sp),
- sp->gfn, end, false, false, false);
+ sp->gfn, end, false, false);
}
void kvm_tdp_mmu_zap_all(struct kvm *kvm);
@@ -61,10 +58,10 @@ bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
-bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
- int min_level);
+bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
+ const struct kvm_memory_slot *slot, int min_level);
bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
- struct kvm_memory_slot *slot);
+ const struct kvm_memory_slot *slot);
void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn, unsigned long mask,
@@ -77,8 +74,20 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn,
int min_level);
+static inline void kvm_tdp_mmu_walk_lockless_begin(void)
+{
+ rcu_read_lock();
+}
+
+static inline void kvm_tdp_mmu_walk_lockless_end(void)
+{
+ rcu_read_unlock();
+}
+
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
int *root_level);
+u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
+ u64 *spte);
#ifdef CONFIG_X86_64
bool kvm_mmu_init_tdp_mmu(struct kvm *kvm);
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 827886c12c16..0772bad9165c 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -137,18 +137,20 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
pmc->perf_event = event;
pmc_to_pmu(pmc)->event_count++;
clear_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi);
+ pmc->is_paused = false;
}
static void pmc_pause_counter(struct kvm_pmc *pmc)
{
u64 counter = pmc->counter;
- if (!pmc->perf_event)
+ if (!pmc->perf_event || pmc->is_paused)
return;
/* update counter, reset event value to avoid redundant accumulation */
counter += perf_event_pause(pmc->perf_event, true);
pmc->counter = counter & pmc_bitmask(pmc);
+ pmc->is_paused = true;
}
static bool pmc_resume_counter(struct kvm_pmc *pmc)
@@ -163,6 +165,7 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc)
/* reuse perf_event to serve as pmc_reprogram_counter() does*/
perf_event_enable(pmc->perf_event);
+ pmc->is_paused = false;
clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi);
return true;
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index 67e753edfa22..0e4f2b1fa9fb 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -55,7 +55,7 @@ static inline u64 pmc_read_counter(struct kvm_pmc *pmc)
u64 counter, enabled, running;
counter = pmc->counter;
- if (pmc->perf_event)
+ if (pmc->perf_event && !pmc->is_paused)
counter += perf_event_read_value(pmc->perf_event,
&enabled, &running);
/* FIXME: Scaling needed? */
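The is_paused flag threaded through pmu.c/pmu.h guards against double counting: once a counter is paused its hardware value has already been folded into pmc->counter, so neither a second pause nor a read should add the stopped event's value again. A small user-space sketch of that accounting, with a plain variable standing in for the perf_event:

/* Illustrative pause/read accounting; perf_event internals omitted. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_pmc {
	uint64_t counter;	/* accumulated value */
	uint64_t hw_count;	/* stands in for perf_event_read_value() */
	bool is_paused;
};

static void toy_pmc_pause(struct toy_pmc *pmc)
{
	if (pmc->is_paused)
		return;			/* already folded in, do nothing */
	pmc->counter += pmc->hw_count;	/* fold in and reset the event */
	pmc->hw_count = 0;
	pmc->is_paused = true;
}

static uint64_t toy_pmc_read(struct toy_pmc *pmc)
{
	uint64_t counter = pmc->counter;

	if (!pmc->is_paused)		/* only add live hardware counts */
		counter += pmc->hw_count;
	return counter;
}

int main(void)
{
	struct toy_pmc pmc = { .counter = 0, .hw_count = 100, .is_paused = false };

	toy_pmc_pause(&pmc);
	toy_pmc_pause(&pmc);		/* second pause is now a no-op */
	printf("counter after double pause: %llu\n",
	       (unsigned long long)toy_pmc_read(&pmc));
	return 0;
}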
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index a8ad78a2faa1..8052d92069e0 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -197,6 +197,8 @@ void avic_init_vmcb(struct vcpu_svm *svm)
vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK;
vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK;
vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT;
+ vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE & VMCB_AVIC_APIC_BAR_MASK;
+
if (kvm_apicv_activated(svm->vcpu.kvm))
vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
else
@@ -225,31 +227,26 @@ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
* field of the VMCB. Therefore, we set up the
* APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (4KB) here.
*/
-static int avic_update_access_page(struct kvm *kvm, bool activate)
+static int avic_alloc_access_page(struct kvm *kvm)
{
void __user *ret;
int r = 0;
mutex_lock(&kvm->slots_lock);
- /*
- * During kvm_destroy_vm(), kvm_pit_set_reinject() could trigger
- * APICv mode change, which update APIC_ACCESS_PAGE_PRIVATE_MEMSLOT
- * memory region. So, we need to ensure that kvm->mm == current->mm.
- */
- if ((kvm->arch.apic_access_memslot_enabled == activate) ||
- (kvm->mm != current->mm))
+
+ if (kvm->arch.apic_access_memslot_enabled)
goto out;
ret = __x86_set_memory_region(kvm,
APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
APIC_DEFAULT_PHYS_BASE,
- activate ? PAGE_SIZE : 0);
+ PAGE_SIZE);
if (IS_ERR(ret)) {
r = PTR_ERR(ret);
goto out;
}
- kvm->arch.apic_access_memslot_enabled = activate;
+ kvm->arch.apic_access_memslot_enabled = true;
out:
mutex_unlock(&kvm->slots_lock);
return r;
@@ -270,7 +267,7 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
if (kvm_apicv_activated(vcpu->kvm)) {
int ret;
- ret = avic_update_access_page(vcpu->kvm, true);
+ ret = avic_alloc_access_page(vcpu->kvm);
if (ret)
return ret;
}
@@ -587,17 +584,6 @@ void avic_post_state_restore(struct kvm_vcpu *vcpu)
avic_handle_ldr_update(vcpu);
}
-void svm_toggle_avic_for_irq_window(struct kvm_vcpu *vcpu, bool activate)
-{
- if (!enable_apicv || !lapic_in_kernel(vcpu))
- return;
-
- srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
- kvm_request_apicv_update(vcpu->kvm, activate,
- APICV_INHIBIT_REASON_IRQWIN);
- vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
-}
-
void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
{
return;
@@ -667,6 +653,11 @@ void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
}
vmcb_mark_dirty(vmcb, VMCB_AVIC);
+ if (activated)
+ avic_vcpu_load(vcpu, vcpu->cpu);
+ else
+ avic_vcpu_put(vcpu);
+
svm_set_pi_irte_mode(vcpu, activated);
}
@@ -918,10 +909,6 @@ bool svm_check_apicv_inhibit_reasons(ulong bit)
return supported & BIT(bit);
}
-void svm_pre_update_apicv_exec_ctrl(struct kvm *kvm, bool activate)
-{
- avic_update_access_page(kvm, activate);
-}
static inline int
avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
@@ -960,9 +947,6 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
int h_physical_id = kvm_cpu_get_apicid(cpu);
struct vcpu_svm *svm = to_svm(vcpu);
- if (!kvm_vcpu_apicv_active(vcpu))
- return;
-
/*
* Since the host physical APIC id is 8 bits,
* we can support host APIC ID upto 255.
@@ -990,9 +974,6 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu)
u64 entry;
struct vcpu_svm *svm = to_svm(vcpu);
- if (!kvm_vcpu_apicv_active(vcpu))
- return;
-
entry = READ_ONCE(*(svm->avic_physical_id_cache));
if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
@@ -1009,6 +990,10 @@ static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
struct vcpu_svm *svm = to_svm(vcpu);
svm->avic_is_running = is_run;
+
+ if (!kvm_vcpu_apicv_active(vcpu))
+ return;
+
if (is_run)
avic_vcpu_load(vcpu, vcpu->cpu);
else
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index e5515477c30a..2545d0c61985 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -666,11 +666,6 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu)
goto out;
}
-
- /* Clear internal status */
- kvm_clear_exception_queue(vcpu);
- kvm_clear_interrupt_queue(vcpu);
-
/*
* Since vmcb01 is not in use, we can use it to store some of the L1
* state.
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 7fbce342eec4..75e0b21ad07c 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -28,8 +28,6 @@
#include "cpuid.h"
#include "trace.h"
-#define __ex(x) __kvm_handle_fault_on_reboot(x)
-
#ifndef CONFIG_KVM_AMD_SEV
/*
* When this config is not defined, SEV feature is not supported and APIs in
@@ -584,6 +582,7 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm)
save->xcr0 = svm->vcpu.arch.xcr0;
save->pkru = svm->vcpu.arch.pkru;
save->xss = svm->vcpu.arch.ia32_xss;
+ save->dr6 = svm->vcpu.arch.dr6;
/*
* SEV-ES will use a VMSA that is pointed to by the VMCB, not
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 69639f9624f5..05e8d4d27969 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -46,8 +46,6 @@
#include "kvm_onhyperv.h"
#include "svm_onhyperv.h"
-#define __ex(x) __kvm_handle_fault_on_reboot(x)
-
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");
@@ -261,7 +259,7 @@ u32 svm_msrpm_offset(u32 msr)
static int get_max_npt_level(void)
{
#ifdef CONFIG_X86_64
- return PT64_ROOT_4LEVEL;
+ return pgtable_l5_enabled() ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
#else
return PT32E_ROOT_LEVEL;
#endif
@@ -462,11 +460,6 @@ static int has_svm(void)
return 0;
}
- if (pgtable_l5_enabled()) {
- pr_info("KVM doesn't yet support 5-level paging on AMD SVM\n");
- return 0;
- }
-
return 1;
}
@@ -1015,7 +1008,9 @@ static __init int svm_hardware_setup(void)
if (!boot_cpu_has(X86_FEATURE_NPT))
npt_enabled = false;
- kvm_configure_mmu(npt_enabled, get_max_npt_level(), PG_LEVEL_1G);
+ /* Force VM NPT level equal to the host's max NPT level */
+ kvm_configure_mmu(npt_enabled, get_max_npt_level(),
+ get_max_npt_level(), PG_LEVEL_1G);
pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
/* Note, SEV setup consumes npt_enabled. */
@@ -1161,8 +1156,6 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
struct vmcb_control_area *control = &svm->vmcb->control;
struct vmcb_save_area *save = &svm->vmcb->save;
- vcpu->arch.hflags = 0;
-
svm_set_intercept(svm, INTERCEPT_CR0_READ);
svm_set_intercept(svm, INTERCEPT_CR3_READ);
svm_set_intercept(svm, INTERCEPT_CR4_READ);
@@ -1241,29 +1234,14 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
save->cs.limit = 0xffff;
+ save->gdtr.base = 0;
save->gdtr.limit = 0xffff;
+ save->idtr.base = 0;
save->idtr.limit = 0xffff;
init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
- svm_set_cr4(vcpu, 0);
- svm_set_efer(vcpu, 0);
- save->dr6 = 0xffff0ff0;
- kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
- save->rip = 0x0000fff0;
- vcpu->arch.regs[VCPU_REGS_RIP] = save->rip;
-
- /*
- * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
- * It also updates the guest-visible cr0 value.
- */
- svm_set_cr0(vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
- kvm_mmu_reset_context(vcpu);
-
- save->cr4 = X86_CR4_PAE;
- /* rdx = ?? */
-
if (npt_enabled) {
/* Setup VMCB for Nested Paging */
control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
@@ -1273,14 +1251,12 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
svm_clr_intercept(svm, INTERCEPT_CR3_WRITE);
save->g_pat = vcpu->arch.pat;
save->cr3 = 0;
- save->cr4 = 0;
}
svm->current_vmcb->asid_generation = 0;
svm->asid = 0;
svm->nested.vmcb12_gpa = INVALID_GPA;
svm->nested.last_vmcb12_gpa = INVALID_GPA;
- vcpu->arch.hflags = 0;
if (!kvm_pause_in_guest(vcpu->kvm)) {
control->pause_filter_count = pause_filter_count;
@@ -1330,25 +1306,11 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
struct vcpu_svm *svm = to_svm(vcpu);
- u32 dummy;
- u32 eax = 1;
svm->spec_ctrl = 0;
svm->virt_spec_ctrl = 0;
- if (!init_event) {
- vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE |
- MSR_IA32_APICBASE_ENABLE;
- if (kvm_vcpu_is_reset_bsp(vcpu))
- vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
- }
init_vmcb(vcpu);
-
- kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, false);
- kvm_rdx_write(vcpu, eax);
-
- if (kvm_vcpu_apicv_active(vcpu) && !init_event)
- avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE);
}
void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
@@ -1513,12 +1475,15 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
sd->current_vmcb = svm->vmcb;
indirect_branch_prediction_barrier();
}
- avic_vcpu_load(vcpu, cpu);
+ if (kvm_vcpu_apicv_active(vcpu))
+ avic_vcpu_load(vcpu, cpu);
}
static void svm_vcpu_put(struct kvm_vcpu *vcpu)
{
- avic_vcpu_put(vcpu);
+ if (kvm_vcpu_apicv_active(vcpu))
+ avic_vcpu_put(vcpu);
+
svm_prepare_host_switch(vcpu);
++vcpu->stat.host_state_reload;
@@ -1560,7 +1525,7 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
break;
default:
- WARN_ON_ONCE(1);
+ KVM_BUG_ON(1, vcpu->kvm);
}
}
@@ -2078,11 +2043,15 @@ static int shutdown_interception(struct kvm_vcpu *vcpu)
return -EINVAL;
/*
- * VMCB is undefined after a SHUTDOWN intercept
- * so reinitialize it.
+ * VMCB is undefined after a SHUTDOWN intercept. INIT the vCPU to put
+ * the VMCB in a known good state. Unfortunately, KVM doesn't have
+ * KVM_MP_STATE_SHUTDOWN and can't add it without potentially breaking
+ * userspace. At a platform view, INIT is acceptable behavior as
+ * there exist bare metal platforms that automatically INIT the CPU
+ * in response to shutdown.
*/
clear_page(svm->vmcb);
- init_vmcb(vcpu);
+ kvm_vcpu_reset(vcpu, true);
kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
return 0;
@@ -2993,10 +2962,6 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
svm->msr_decfg = data;
break;
}
- case MSR_IA32_APICBASE:
- if (kvm_vcpu_apicv_active(vcpu))
- avic_update_vapic_bar(to_svm(vcpu), data);
- fallthrough;
default:
return kvm_set_msr_common(vcpu, msr);
}
@@ -3021,7 +2986,7 @@ static int interrupt_window_interception(struct kvm_vcpu *vcpu)
* In this case AVIC was temporarily disabled for
* requesting the IRQ window and we have to re-enable it.
*/
- svm_toggle_avic_for_irq_window(vcpu, true);
+ kvm_request_apicv_update(vcpu->kvm, true, APICV_INHIBIT_REASON_IRQWIN);
++vcpu->stat.irq_window_exits;
return 1;
@@ -3269,12 +3234,14 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
"excp_to:", save->last_excp_to);
}
-static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code)
+static bool svm_check_exit_valid(struct kvm_vcpu *vcpu, u64 exit_code)
{
- if (exit_code < ARRAY_SIZE(svm_exit_handlers) &&
- svm_exit_handlers[exit_code])
- return 0;
+ return (exit_code < ARRAY_SIZE(svm_exit_handlers) &&
+ svm_exit_handlers[exit_code]);
+}
+static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code)
+{
vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%llx\n", exit_code);
dump_vmcb(vcpu);
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
@@ -3282,14 +3249,13 @@ static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code)
vcpu->run->internal.ndata = 2;
vcpu->run->internal.data[0] = exit_code;
vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
-
- return -EINVAL;
+ return 0;
}
int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code)
{
- if (svm_handle_invalid_exit(vcpu, exit_code))
- return 0;
+ if (!svm_check_exit_valid(vcpu, exit_code))
+ return svm_handle_invalid_exit(vcpu, exit_code);
#ifdef CONFIG_RETPOLINE
if (exit_code == SVM_EXIT_MSR)
@@ -3573,7 +3539,7 @@ static void svm_enable_irq_window(struct kvm_vcpu *vcpu)
* via AVIC. In that case, we need to temporarily disable AVIC
* and fall back to injecting the IRQ via V_IRQ.
*/
- svm_toggle_avic_for_irq_window(vcpu, false);
+ kvm_request_apicv_update(vcpu->kvm, false, APICV_INHIBIT_REASON_IRQWIN);
svm_set_vintr(svm);
}
}
@@ -3808,6 +3774,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
pre_svm_run(vcpu);
+ WARN_ON_ONCE(kvm_apicv_activated(vcpu->kvm) != kvm_vcpu_apicv_active(vcpu));
+
sync_lapic_to_cr8(vcpu);
if (unlikely(svm->asid != svm->vmcb->control.asid)) {
@@ -4610,7 +4578,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.set_virtual_apic_mode = svm_set_virtual_apic_mode,
.refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl,
.check_apicv_inhibit_reasons = svm_check_apicv_inhibit_reasons,
- .pre_update_apicv_exec_ctrl = svm_pre_update_apicv_exec_ctrl,
.load_eoi_exitmap = svm_load_eoi_exitmap,
.hwapic_irr_update = svm_hwapic_irr_update,
.hwapic_isr_update = svm_hwapic_isr_update,
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index bd0fe94c2920..524d943f3efc 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -503,12 +503,6 @@ extern struct kvm_x86_nested_ops svm_nested_ops;
#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL
-static inline void avic_update_vapic_bar(struct vcpu_svm *svm, u64 data)
-{
- svm->vmcb->control.avic_vapic_bar = data & VMCB_AVIC_APIC_BAR_MASK;
- vmcb_mark_dirty(svm->vmcb, VMCB_AVIC);
-}
-
static inline bool avic_vcpu_is_running(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -524,7 +518,6 @@ int avic_ga_log_notifier(u32 ga_tag);
void avic_vm_destroy(struct kvm *kvm);
int avic_vm_init(struct kvm *kvm);
void avic_init_vmcb(struct vcpu_svm *svm);
-void svm_toggle_avic_for_irq_window(struct kvm_vcpu *vcpu, bool activate);
int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu);
int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu);
int avic_init_vcpu(struct vcpu_svm *svm);
@@ -534,7 +527,6 @@ void avic_post_state_restore(struct kvm_vcpu *vcpu);
void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu);
bool svm_check_apicv_inhibit_reasons(ulong bit);
-void svm_pre_update_apicv_exec_ctrl(struct kvm *kvm, bool activate);
void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr);
void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr);
diff --git a/arch/x86/kvm/svm/svm_ops.h b/arch/x86/kvm/svm/svm_ops.h
index 8170f2a5a16f..22e2b019de37 100644
--- a/arch/x86/kvm/svm/svm_ops.h
+++ b/arch/x86/kvm/svm/svm_ops.h
@@ -4,7 +4,7 @@
#include <linux/compiler_types.h>
-#include <asm/kvm_host.h>
+#include "x86.h"
#define svm_asm(insn, clobber...) \
do { \
diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c
index 896b2a50b4aa..0dab1b7b529f 100644
--- a/arch/x86/kvm/vmx/evmcs.c
+++ b/arch/x86/kvm/vmx/evmcs.c
@@ -14,7 +14,6 @@ DEFINE_STATIC_KEY_FALSE(enable_evmcs);
#if IS_ENABLED(CONFIG_HYPERV)
-#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n)))))
#define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x)
#define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \
{EVMCS1_OFFSET(name), clean_field}
diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h
index 2ec9b46f0d0c..152ab0aa82cf 100644
--- a/arch/x86/kvm/vmx/evmcs.h
+++ b/arch/x86/kvm/vmx/evmcs.h
@@ -73,8 +73,6 @@ struct evmcs_field {
extern const struct evmcs_field vmcs_field_to_evmcs_1[];
extern const unsigned int nr_evmcs_1_fields;
-#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n)))))
-
static __always_inline int get_evmcs_offset(unsigned long field,
u16 *clean_field)
{
@@ -95,8 +93,6 @@ static __always_inline int get_evmcs_offset(unsigned long field,
return evmcs_field->offset;
}
-#undef ROL16
-
static inline void evmcs_write64(unsigned long field, u64 value)
{
u16 clean_field;
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index b3f77d18eb5a..ccb03d69546c 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -2207,7 +2207,8 @@ static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx,
}
}
-static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
+static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01,
+ struct vmcs12 *vmcs12)
{
u32 exec_control;
u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);
@@ -2218,23 +2219,22 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
/*
* PIN CONTROLS
*/
- exec_control = vmx_pin_based_exec_ctrl(vmx);
+ exec_control = __pin_controls_get(vmcs01);
exec_control |= (vmcs12->pin_based_vm_exec_control &
~PIN_BASED_VMX_PREEMPTION_TIMER);
/* Posted interrupts setting is only taken from vmcs12. */
- if (nested_cpu_has_posted_intr(vmcs12)) {
+ vmx->nested.pi_pending = false;
+ if (nested_cpu_has_posted_intr(vmcs12))
vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
- vmx->nested.pi_pending = false;
- } else {
+ else
exec_control &= ~PIN_BASED_POSTED_INTR;
- }
pin_controls_set(vmx, exec_control);
/*
* EXEC CONTROLS
*/
- exec_control = vmx_exec_control(vmx); /* L0's desires */
+ exec_control = __exec_controls_get(vmcs01); /* L0's desires */
exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING;
exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING;
exec_control &= ~CPU_BASED_TPR_SHADOW;
@@ -2271,10 +2271,11 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
* SECONDARY EXEC CONTROLS
*/
if (cpu_has_secondary_exec_ctrls()) {
- exec_control = vmx->secondary_exec_control;
+ exec_control = __secondary_exec_controls_get(vmcs01);
/* Take the following fields only from vmcs12 */
exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
SECONDARY_EXEC_ENABLE_INVPCID |
SECONDARY_EXEC_ENABLE_RDTSCP |
SECONDARY_EXEC_XSAVES |
@@ -2282,7 +2283,9 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_ENABLE_VMFUNC |
- SECONDARY_EXEC_TSC_SCALING);
+ SECONDARY_EXEC_TSC_SCALING |
+ SECONDARY_EXEC_DESC);
+
if (nested_cpu_has(vmcs12,
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
exec_control |= vmcs12->secondary_vm_exec_control;
@@ -2322,8 +2325,9 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
* on the related bits (if supported by the CPU) in the hope that
* we can avoid VMWrites during vmx_set_efer().
*/
- exec_control = (vmcs12->vm_entry_controls | vmx_vmentry_ctrl()) &
- ~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER;
+ exec_control = __vm_entry_controls_get(vmcs01);
+ exec_control |= vmcs12->vm_entry_controls;
+ exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER);
if (cpu_has_load_ia32_efer()) {
if (guest_efer & EFER_LMA)
exec_control |= VM_ENTRY_IA32E_MODE;
@@ -2339,9 +2343,11 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
* we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
* bits may be modified by vmx_set_efer() in prepare_vmcs02().
*/
- exec_control = vmx_vmexit_ctrl();
+ exec_control = __vm_exit_controls_get(vmcs01);
if (cpu_has_load_ia32_efer() && guest_efer != host_efer)
exec_control |= VM_EXIT_LOAD_IA32_EFER;
+ else
+ exec_control &= ~VM_EXIT_LOAD_IA32_EFER;
vm_exit_controls_set(vmx, exec_control);
/*
@@ -3384,7 +3390,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
- prepare_vmcs02_early(vmx, vmcs12);
+ prepare_vmcs02_early(vmx, &vmx->vmcs01, vmcs12);
if (from_vmentry) {
if (unlikely(!nested_get_vmcs12_pages(vcpu))) {
@@ -4304,7 +4310,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
seg.l = 1;
else
seg.db = 1;
- vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
+ __vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
seg = (struct kvm_segment) {
.base = 0,
.limit = 0xFFFFFFFF,
@@ -4315,17 +4321,17 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
.g = 1
};
seg.selector = vmcs12->host_ds_selector;
- vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
+ __vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
seg.selector = vmcs12->host_es_selector;
- vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
+ __vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
seg.selector = vmcs12->host_ss_selector;
- vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
+ __vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
seg.selector = vmcs12->host_fs_selector;
seg.base = vmcs12->host_fs_base;
- vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
+ __vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
seg.selector = vmcs12->host_gs_selector;
seg.base = vmcs12->host_gs_base;
- vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
+ __vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
seg = (struct kvm_segment) {
.base = vmcs12->host_tr_base,
.limit = 0x67,
@@ -4333,14 +4339,15 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
.type = 11,
.present = 1
};
- vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);
+ __vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);
+
+ memset(&seg, 0, sizeof(seg));
+ seg.unusable = 1;
+ __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR);
kvm_set_dr(vcpu, 7, 0x400);
vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
- if (cpu_has_vmx_msr_bitmap())
- vmx_update_msr_bitmap(vcpu);
-
if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
vmcs12->vm_exit_msr_load_count))
nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
@@ -4419,9 +4426,6 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
kvm_mmu_reset_context(vcpu);
- if (cpu_has_vmx_msr_bitmap())
- vmx_update_msr_bitmap(vcpu);
-
/*
* This nasty bit of open coding is a compromise between blindly
* loading L1's MSRs using the exit load lists (incorrect emulation
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 9efc1a6b8693..10cc4f65c4ef 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -437,13 +437,13 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
!(msr & MSR_PMC_FULL_WIDTH_BIT))
data = (s64)(s32)data;
pmc->counter += data - pmc_read_counter(pmc);
- if (pmc->perf_event)
+ if (pmc->perf_event && !pmc->is_paused)
perf_event_period(pmc->perf_event,
get_sample_period(pmc, data));
return 0;
} else if ((pmc = get_fixed_pmc(pmu, msr))) {
pmc->counter += data - pmc_read_counter(pmc);
- if (pmc->perf_event)
+ if (pmc->perf_event && !pmc->is_paused)
perf_event_period(pmc->perf_event,
get_sample_period(pmc, data));
return 0;
diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h
index 4b9957e2bf5b..6e5de2e2b0da 100644
--- a/arch/x86/kvm/vmx/vmcs.h
+++ b/arch/x86/kvm/vmx/vmcs.h
@@ -11,6 +11,8 @@
#include "capabilities.h"
+#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n)))))
+
struct vmcs_hdr {
u32 revision_id:31;
u32 shadow_vmcs:1;
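For reference, ROL16() is a plain 16-bit rotate-left; the eVMCS and vmcs12 field tables above index their entries with ROL16(number, 6) to fold the VMCS field encoding into a compact array slot. A minimal standalone sketch of the operation (illustrative only, not part of the patch; rol16() and the sample value are invented for the example):

	#include <stdint.h>
	#include <stdio.h>

	/* Same operation as the kernel's ROL16(): rotate a 16-bit value left
	 * by n bits, valid for 0 < n < 16.
	 */
	static uint16_t rol16(uint16_t val, unsigned int n)
	{
		return (uint16_t)(((uint16_t)(val << n)) | (val >> (16 - n)));
	}

	int main(void)
	{
		/* Bit 11 rotated left by 6 wraps around to bit 1: 0x0800 -> 0x0002. */
		printf("%#06x\n", rol16(0x0800, 6));
		return 0;
	}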
diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c
index d9f5d7c56ae3..cab6ba7a5005 100644
--- a/arch/x86/kvm/vmx/vmcs12.c
+++ b/arch/x86/kvm/vmx/vmcs12.c
@@ -2,7 +2,6 @@
#include "vmcs12.h"
-#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n)))))
#define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
#define FIELD(number, name) [ROL16(number, 6)] = VMCS12_OFFSET(name)
#define FIELD64(number, name) \
diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h
index 5e0e1b39f495..2a45f026ee11 100644
--- a/arch/x86/kvm/vmx/vmcs12.h
+++ b/arch/x86/kvm/vmx/vmcs12.h
@@ -364,8 +364,6 @@ static inline void vmx_check_vmcs12_offsets(void)
extern const unsigned short vmcs_field_to_offset_table[];
extern const unsigned int nr_vmcs12_fields;
-#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n)))))
-
static inline short vmcs_field_to_offset(unsigned long field)
{
unsigned short offset;
@@ -385,8 +383,6 @@ static inline short vmcs_field_to_offset(unsigned long field)
return offset;
}
-#undef ROL16
-
static inline u64 vmcs12_read_any(struct vmcs12 *vmcs12, unsigned long field,
u16 offset)
{
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 927a552393b9..0c2c0d5ae873 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -136,8 +136,7 @@ module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);
#define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
#define KVM_VM_CR0_ALWAYS_ON \
- (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | \
- X86_CR0_WP | X86_CR0_PG | X86_CR0_PE)
+ (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
@@ -1648,11 +1647,12 @@ static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr,
}
/*
- * Set up the vmcs to automatically save and restore system
- * msrs. Don't touch the 64-bit msrs if the guest is in legacy
- * mode, as fiddling with msrs is very expensive.
+ * Configure user return MSRs to automatically save, load, and restore MSRs
+ * that need to be shoved into hardware when running the guest. Note, omitting
+ * an MSR here does _NOT_ mean it's not emulated, only that it will not be
+ * loaded into hardware when running the guest.
*/
-static void setup_msrs(struct vcpu_vmx *vmx)
+static void vmx_setup_uret_msrs(struct vcpu_vmx *vmx)
{
#ifdef CONFIG_X86_64
bool load_syscall_msrs;
@@ -1682,9 +1682,6 @@ static void setup_msrs(struct vcpu_vmx *vmx)
*/
vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM));
- if (cpu_has_vmx_msr_bitmap())
- vmx_update_msr_bitmap(&vmx->vcpu);
-
/*
* The set of MSRs to load may have changed, reload MSRs before the
* next VM-Enter.
@@ -2263,8 +2260,11 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits;
break;
case VCPU_EXREG_CR3:
- if (is_unrestricted_guest(vcpu) ||
- (enable_ept && is_paging(vcpu)))
+ /*
+		 * When intercepting CR3 loads, e.g. for shadow paging, KVM's
+ * CR3 is loaded into hardware, not the guest's CR3.
+ */
+ if (!(exec_controls_get(to_vmx(vcpu)) & CPU_BASED_CR3_LOAD_EXITING))
vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
break;
case VCPU_EXREG_CR4:
@@ -2274,7 +2274,7 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits;
break;
default:
- WARN_ON_ONCE(1);
+ KVM_BUG_ON(1, vcpu->kvm);
break;
}
}
@@ -2733,7 +2733,7 @@ static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
save->dpl = save->selector & SEGMENT_RPL_MASK;
save->s = 1;
}
- vmx_set_segment(vcpu, save, seg);
+ __vmx_set_segment(vcpu, save, seg);
}
static void enter_pmode(struct kvm_vcpu *vcpu)
@@ -2754,7 +2754,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
vmx->rmode.vm86_active = 0;
- vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
+ __vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
flags = vmcs_readl(GUEST_RFLAGS);
flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
@@ -2852,8 +2852,6 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
-
- kvm_mmu_reset_context(vcpu);
}
int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
@@ -2874,7 +2872,7 @@ int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
msr->data = efer & ~EFER_LME;
}
- setup_msrs(vmx);
+ vmx_setup_uret_msrs(vmx);
return 0;
}
@@ -2997,42 +2995,24 @@ void ept_save_pdptrs(struct kvm_vcpu *vcpu)
kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
}
-static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
- unsigned long cr0,
- struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
- if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
- vmx_cache_reg(vcpu, VCPU_EXREG_CR3);
- if (!(cr0 & X86_CR0_PG)) {
- /* From paging/starting to nonpaging */
- exec_controls_setbit(vmx, CPU_BASED_CR3_LOAD_EXITING |
- CPU_BASED_CR3_STORE_EXITING);
- vcpu->arch.cr0 = cr0;
- vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
- } else if (!is_paging(vcpu)) {
- /* From nonpaging to paging */
- exec_controls_clearbit(vmx, CPU_BASED_CR3_LOAD_EXITING |
- CPU_BASED_CR3_STORE_EXITING);
- vcpu->arch.cr0 = cr0;
- vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
- }
-
- if (!(cr0 & X86_CR0_WP))
- *hw_cr0 &= ~X86_CR0_WP;
-}
+#define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \
+ CPU_BASED_CR3_STORE_EXITING)
void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned long hw_cr0;
+ unsigned long hw_cr0, old_cr0_pg;
+ u32 tmp;
+
+ old_cr0_pg = kvm_read_cr0_bits(vcpu, X86_CR0_PG);
hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
if (is_unrestricted_guest(vcpu))
hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
else {
hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
+ if (!enable_ept)
+ hw_cr0 |= X86_CR0_WP;
if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
enter_pmode(vcpu);
@@ -3041,22 +3021,60 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
enter_rmode(vcpu);
}
+ vmcs_writel(CR0_READ_SHADOW, cr0);
+ vmcs_writel(GUEST_CR0, hw_cr0);
+ vcpu->arch.cr0 = cr0;
+ kvm_register_mark_available(vcpu, VCPU_EXREG_CR0);
+
#ifdef CONFIG_X86_64
if (vcpu->arch.efer & EFER_LME) {
- if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
+ if (!old_cr0_pg && (cr0 & X86_CR0_PG))
enter_lmode(vcpu);
- if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
+ else if (old_cr0_pg && !(cr0 & X86_CR0_PG))
exit_lmode(vcpu);
}
#endif
- if (enable_ept && !is_unrestricted_guest(vcpu))
- ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
+ if (enable_ept && !is_unrestricted_guest(vcpu)) {
+ /*
+ * Ensure KVM has an up-to-date snapshot of the guest's CR3. If
+ * the below code _enables_ CR3 exiting, vmx_cache_reg() will
+ * (correctly) stop reading vmcs.GUEST_CR3 because it thinks
+ * KVM's CR3 is installed.
+ */
+ if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
+ vmx_cache_reg(vcpu, VCPU_EXREG_CR3);
- vmcs_writel(CR0_READ_SHADOW, cr0);
- vmcs_writel(GUEST_CR0, hw_cr0);
- vcpu->arch.cr0 = cr0;
- kvm_register_mark_available(vcpu, VCPU_EXREG_CR0);
+ /*
+ * When running with EPT but not unrestricted guest, KVM must
+ * intercept CR3 accesses when paging is _disabled_. This is
+ * necessary because restricted guests can't actually run with
+ * paging disabled, and so KVM stuffs its own CR3 in order to
+		 * run the guest with identity mapped page tables.
+ *
+ * Do _NOT_ check the old CR0.PG, e.g. to optimize away the
+		 * update; it may be stale with respect to CR3 interception,
+ * e.g. after nested VM-Enter.
+ *
+ * Lastly, honor L1's desires, i.e. intercept CR3 loads and/or
+ * stores to forward them to L1, even if KVM does not need to
+ * intercept them to preserve its identity mapped page tables.
+ */
+ if (!(cr0 & X86_CR0_PG)) {
+ exec_controls_setbit(vmx, CR3_EXITING_BITS);
+ } else if (!is_guest_mode(vcpu)) {
+ exec_controls_clearbit(vmx, CR3_EXITING_BITS);
+ } else {
+ tmp = exec_controls_get(vmx);
+ tmp &= ~CR3_EXITING_BITS;
+ tmp |= get_vmcs12(vcpu)->cpu_based_vm_exec_control & CR3_EXITING_BITS;
+ exec_controls_set(vmx, tmp);
+ }
+
+ /* Note, vmx_set_cr4() consumes the new vcpu->arch.cr0. */
+ if ((old_cr0_pg ^ cr0) & X86_CR0_PG)
+ vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
+ }
/* depends on vcpu->arch.cr0 to be set to a new value */
vmx->emulation_required = emulation_required(vcpu);
@@ -3271,7 +3289,7 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var)
return ar;
}
-void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
+void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
@@ -3284,7 +3302,7 @@ void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
vmcs_write16(sf->selector, var->selector);
else if (var->s)
fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
- goto out;
+ return;
}
vmcs_writel(sf->base, var->base);
@@ -3306,9 +3324,13 @@ void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
var->type |= 0x1; /* Accessed */
vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
+}
-out:
- vmx->emulation_required = emulation_required(vcpu);
+static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
+{
+ __vmx_set_segment(vcpu, var, seg);
+
+ to_vmx(vcpu)->emulation_required = emulation_required(vcpu);
}
static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
@@ -3790,21 +3812,6 @@ void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
vmx_set_msr_bitmap_write(msr_bitmap, msr);
}
-static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
-{
- u8 mode = 0;
-
- if (cpu_has_secondary_exec_ctrls() &&
- (secondary_exec_controls_get(to_vmx(vcpu)) &
- SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
- mode |= MSR_BITMAP_MODE_X2APIC;
- if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
- mode |= MSR_BITMAP_MODE_X2APIC_APICV;
- }
-
- return mode;
-}
-
static void vmx_reset_x2apic_msrs(struct kvm_vcpu *vcpu, u8 mode)
{
unsigned long *msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;
@@ -3822,11 +3829,29 @@ static void vmx_reset_x2apic_msrs(struct kvm_vcpu *vcpu, u8 mode)
}
}
-static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu, u8 mode)
+static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu)
{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u8 mode;
+
if (!cpu_has_vmx_msr_bitmap())
return;
+ if (cpu_has_secondary_exec_ctrls() &&
+ (secondary_exec_controls_get(vmx) &
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
+ mode = MSR_BITMAP_MODE_X2APIC;
+ if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
+ mode |= MSR_BITMAP_MODE_X2APIC_APICV;
+ } else {
+ mode = 0;
+ }
+
+ if (mode == vmx->x2apic_msr_bitmap_mode)
+ return;
+
+ vmx->x2apic_msr_bitmap_mode = mode;
+
vmx_reset_x2apic_msrs(vcpu, mode);
/*
@@ -3843,21 +3868,6 @@ static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu, u8 mode)
}
}
-void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- u8 mode = vmx_msr_bitmap_mode(vcpu);
- u8 changed = mode ^ vmx->msr_bitmap_mode;
-
- if (!changed)
- return;
-
- if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV))
- vmx_update_msr_bitmap_x2apic(vcpu, mode);
-
- vmx->msr_bitmap_mode = mode;
-}
-
void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -3914,7 +3924,6 @@ static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
}
pt_update_intercept_for_msr(vcpu);
- vmx_update_msr_bitmap_x2apic(vcpu, vmx_msr_bitmap_mode(vcpu));
}
static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
@@ -4086,7 +4095,7 @@ void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
vmcs_writel(CR4_GUEST_HOST_MASK, ~vcpu->arch.cr4_guest_owned_bits);
}
-u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
+static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
{
u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
@@ -4102,6 +4111,30 @@ u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
return pin_based_exec_ctrl;
}
+static u32 vmx_vmentry_ctrl(void)
+{
+ u32 vmentry_ctrl = vmcs_config.vmentry_ctrl;
+
+ if (vmx_pt_mode_is_system())
+ vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP |
+ VM_ENTRY_LOAD_IA32_RTIT_CTL);
+ /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
+ return vmentry_ctrl &
+ ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | VM_ENTRY_LOAD_IA32_EFER);
+}
+
+static u32 vmx_vmexit_ctrl(void)
+{
+ u32 vmexit_ctrl = vmcs_config.vmexit_ctrl;
+
+ if (vmx_pt_mode_is_system())
+ vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP |
+ VM_EXIT_CLEAR_IA32_RTIT_CTL);
+ /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
+ return vmexit_ctrl &
+ ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER);
+}
+
static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -4118,11 +4151,10 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
}
- if (cpu_has_vmx_msr_bitmap())
- vmx_update_msr_bitmap(vcpu);
+ vmx_update_msr_bitmap_x2apic(vcpu);
}
-u32 vmx_exec_control(struct vcpu_vmx *vmx)
+static u32 vmx_exec_control(struct vcpu_vmx *vmx)
{
u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
@@ -4204,7 +4236,7 @@ vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control,
#define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \
vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true)
-static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
+static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
{
struct kvm_vcpu *vcpu = &vmx->vcpu;
@@ -4290,7 +4322,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
if (!vcpu->kvm->arch.bus_lock_detection_enabled)
exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION;
- vmx->secondary_exec_control = exec_control;
+ return exec_control;
}
#define VMX_XSS_EXIT_BITMAP 0
@@ -4314,10 +4346,8 @@ static void init_vmcs(struct vcpu_vmx *vmx)
exec_controls_set(vmx, vmx_exec_control(vmx));
- if (cpu_has_secondary_exec_ctrls()) {
- vmx_compute_secondary_exec_control(vmx);
- secondary_exec_controls_set(vmx, vmx->secondary_exec_control);
- }
+ if (cpu_has_secondary_exec_ctrls())
+ secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx));
if (kvm_vcpu_apicv_active(&vmx->vcpu)) {
vmcs_write64(EOI_EXIT_BITMAP0, 0);
@@ -4388,32 +4418,35 @@ static void init_vmcs(struct vcpu_vmx *vmx)
vmx->pt_desc.guest.output_mask = 0x7F;
vmcs_write64(GUEST_IA32_RTIT_CTL, 0);
}
+
+ vmcs_write32(GUEST_SYSENTER_CS, 0);
+ vmcs_writel(GUEST_SYSENTER_ESP, 0);
+ vmcs_writel(GUEST_SYSENTER_EIP, 0);
+ vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
+
+ if (cpu_has_vmx_tpr_shadow()) {
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
+ if (cpu_need_tpr_shadow(&vmx->vcpu))
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
+ __pa(vmx->vcpu.arch.apic->regs));
+ vmcs_write32(TPR_THRESHOLD, 0);
+ }
+
+ vmx_setup_uret_msrs(vmx);
}
static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct msr_data apic_base_msr;
- u64 cr0;
vmx->rmode.vm86_active = 0;
vmx->spec_ctrl = 0;
vmx->msr_ia32_umwait_control = 0;
- vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
vmx->hv_deadline_tsc = -1;
kvm_set_cr8(vcpu, 0);
- if (!init_event) {
- apic_base_msr.data = APIC_DEFAULT_PHYS_BASE |
- MSR_IA32_APICBASE_ENABLE;
- if (kvm_vcpu_is_reset_bsp(vcpu))
- apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
- apic_base_msr.host_initiated = true;
- kvm_set_apic_base(vcpu, &apic_base_msr);
- }
-
vmx_segment_cache_clear(vmx);
seg_setup(VCPU_SREG_CS);
@@ -4436,16 +4469,6 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
- if (!init_event) {
- vmcs_write32(GUEST_SYSENTER_CS, 0);
- vmcs_writel(GUEST_SYSENTER_ESP, 0);
- vmcs_writel(GUEST_SYSENTER_EIP, 0);
- vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
- }
-
- kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
- kvm_rip_write(vcpu, 0xfff0);
-
vmcs_writel(GUEST_GDTR_BASE, 0);
vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
@@ -4458,31 +4481,11 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
if (kvm_mpx_supported())
vmcs_write64(GUEST_BNDCFGS, 0);
- setup_msrs(vmx);
-
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
- if (cpu_has_vmx_tpr_shadow() && !init_event) {
- vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
- if (cpu_need_tpr_shadow(vcpu))
- vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
- __pa(vcpu->arch.apic->regs));
- vmcs_write32(TPR_THRESHOLD, 0);
- }
-
kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
- cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
- vmx->vcpu.arch.cr0 = cr0;
- vmx_set_cr0(vcpu, cr0); /* enter rmode */
- vmx_set_cr4(vcpu, 0);
- vmx_set_efer(vcpu, 0);
-
- vmx_update_exception_bitmap(vcpu);
-
vpid_sync_context(vmx->vpid);
- if (init_event)
- vmx_clear_hlt(vcpu);
}
static void vmx_enable_irq_window(struct kvm_vcpu *vcpu)
@@ -4996,6 +4999,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
return kvm_complete_insn_gp(vcpu, err);
case 3:
WARN_ON_ONCE(enable_unrestricted_guest);
+
err = kvm_set_cr3(vcpu, val);
return kvm_complete_insn_gp(vcpu, err);
case 4:
@@ -5021,14 +5025,13 @@ static int handle_cr(struct kvm_vcpu *vcpu)
}
break;
case 2: /* clts */
- WARN_ONCE(1, "Guest should always own CR0.TS");
- vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
- trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
- return kvm_skip_emulated_instruction(vcpu);
+ KVM_BUG(1, vcpu->kvm, "Guest always owns CR0.TS");
+ return -EIO;
case 1: /*mov from cr*/
switch (cr) {
case 3:
WARN_ON_ONCE(enable_unrestricted_guest);
+
val = kvm_read_cr3(vcpu);
kvm_register_write(vcpu, reg, val);
trace_kvm_cr_read(cr, val);
@@ -5129,6 +5132,12 @@ static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);
+
+ /*
+	 * exc_debug expects dr6 to be cleared after it runs; avoid letting it see
+	 * a stale dr6 from the guest.
+ */
+ set_debugreg(DR6_RESERVED, 6);
}
static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
@@ -5338,7 +5347,9 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
static int handle_nmi_window(struct kvm_vcpu *vcpu)
{
- WARN_ON_ONCE(!enable_vnmi);
+ if (KVM_BUG_ON(!enable_vnmi, vcpu->kvm))
+ return -EIO;
+
exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
++vcpu->stat.nmi_window_exits;
kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -5896,7 +5907,8 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
* below) should never happen as that means we incorrectly allowed a
* nested VM-Enter with an invalid vmcs12.
*/
- WARN_ON_ONCE(vmx->nested.nested_run_pending);
+ if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm))
+ return -EIO;
/* If guest state is invalid, start emulating */
if (vmx->emulation_required)
@@ -6189,7 +6201,7 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
}
secondary_exec_controls_set(vmx, sec_exec_control);
- vmx_update_msr_bitmap(vcpu);
+ vmx_update_msr_bitmap_x2apic(vcpu);
}
static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
@@ -6274,7 +6286,9 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
int max_irr;
bool max_irr_updated;
- WARN_ON(!vcpu->arch.apicv_active);
+ if (KVM_BUG_ON(!vcpu->arch.apicv_active, vcpu->kvm))
+ return -EIO;
+
if (pi_test_on(&vmx->pi_desc)) {
pi_clear_on(&vmx->pi_desc);
/*
@@ -6357,7 +6371,7 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
gate_desc *desc = (gate_desc *)host_idt_base + vector;
- if (WARN_ONCE(!is_external_intr(intr_info),
+ if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm,
"KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
return;
@@ -6368,6 +6382,9 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ if (vmx->emulation_required)
+ return;
+
if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
handle_external_interrupt_irqoff(vcpu);
else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI)
@@ -6639,6 +6656,10 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx->loaded_vmcs->host_state.cr4 = cr4;
}
+ /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */
+ if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
+ set_debugreg(vcpu->arch.dr6, 6);
+
/* When single-stepping over STI and MOV SS, we must clear the
* corresponding interruptibility bits in the guest state. Otherwise
* vmentry fails as it then expects bit 14 (BS) in pending debug
@@ -6838,7 +6859,6 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
}
- vmx->msr_bitmap_mode = 0;
vmx->loaded_vmcs = &vmx->vmcs01;
cpu = get_cpu();
@@ -6997,7 +7017,7 @@ exit:
return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat;
}
-static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx)
+static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl)
{
/*
* These bits in the secondary execution controls field
@@ -7011,7 +7031,6 @@ static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx)
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_DESC;
- u32 new_ctl = vmx->secondary_exec_control;
u32 cur_ctl = secondary_exec_controls_get(vmx);
secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask));
@@ -7154,10 +7173,11 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
/* xsaves_enabled is recomputed in vmx_compute_secondary_exec_control(). */
vcpu->arch.xsaves_enabled = false;
- if (cpu_has_secondary_exec_ctrls()) {
- vmx_compute_secondary_exec_control(vmx);
- vmcs_set_secondary_exec_control(vmx);
- }
+ vmx_setup_uret_msrs(vmx);
+
+ if (cpu_has_secondary_exec_ctrls())
+ vmcs_set_secondary_exec_control(vmx,
+ vmx_secondary_exec_control(vmx));
if (nested_vmx_allowed(vcpu))
to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
@@ -7803,7 +7823,8 @@ static __init int hardware_setup(void)
ept_lpage_level = PG_LEVEL_2M;
else
ept_lpage_level = PG_LEVEL_4K;
- kvm_configure_mmu(enable_ept, vmx_get_max_tdp_level(), ept_lpage_level);
+ kvm_configure_mmu(enable_ept, 0, vmx_get_max_tdp_level(),
+ ept_lpage_level);
/*
* Only enable PML when hardware supports PML feature, and both EPT
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 17a1cb4b059d..4858c5fd95f2 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -227,7 +227,7 @@ struct nested_vmx {
struct vcpu_vmx {
struct kvm_vcpu vcpu;
u8 fail;
- u8 msr_bitmap_mode;
+ u8 x2apic_msr_bitmap_mode;
/*
* If true, host state has been stored in vmx->loaded_vmcs for
@@ -263,8 +263,6 @@ struct vcpu_vmx {
u64 spec_ctrl;
u32 msr_ia32_umwait_control;
- u32 secondary_exec_control;
-
/*
* loaded_vmcs points to the VMCS currently used in this vcpu. For a
* non-nested (L1) guest, it always points to vmcs01. For a nested
@@ -371,12 +369,11 @@ void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
void set_cr4_guest_host_mask(struct vcpu_vmx *vmx);
void ept_save_pdptrs(struct kvm_vcpu *vcpu);
void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
-void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
+void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu);
void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu);
-void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
bool vmx_nmi_blocked(struct kvm_vcpu *vcpu);
bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu);
bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
@@ -419,9 +416,13 @@ static inline void lname##_controls_set(struct vcpu_vmx *vmx, u32 val) \
vmx->loaded_vmcs->controls_shadow.lname = val; \
} \
} \
+static inline u32 __##lname##_controls_get(struct loaded_vmcs *vmcs) \
+{ \
+ return vmcs->controls_shadow.lname; \
+} \
static inline u32 lname##_controls_get(struct vcpu_vmx *vmx) \
{ \
- return vmx->loaded_vmcs->controls_shadow.lname; \
+ return __##lname##_controls_get(vmx->loaded_vmcs); \
} \
static inline void lname##_controls_setbit(struct vcpu_vmx *vmx, u32 val) \
{ \
@@ -451,31 +452,6 @@ static inline void vmx_register_cache_reset(struct kvm_vcpu *vcpu)
vcpu->arch.regs_dirty = 0;
}
-static inline u32 vmx_vmentry_ctrl(void)
-{
- u32 vmentry_ctrl = vmcs_config.vmentry_ctrl;
- if (vmx_pt_mode_is_system())
- vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP |
- VM_ENTRY_LOAD_IA32_RTIT_CTL);
- /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
- return vmentry_ctrl &
- ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | VM_ENTRY_LOAD_IA32_EFER);
-}
-
-static inline u32 vmx_vmexit_ctrl(void)
-{
- u32 vmexit_ctrl = vmcs_config.vmexit_ctrl;
- if (vmx_pt_mode_is_system())
- vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP |
- VM_EXIT_CLEAR_IA32_RTIT_CTL);
- /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
- return vmexit_ctrl &
- ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER);
-}
-
-u32 vmx_exec_control(struct vcpu_vmx *vmx);
-u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx);
-
static inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm)
{
return container_of(kvm, struct kvm_vmx, kvm);
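To make the BUILD_CONTROLS_SHADOW change above concrete: for lname=pin (the instantiation implied by the __pin_controls_get(vmcs01) call in prepare_vmcs02_early()), the macro body shown above now expands to roughly the following accessor pair; this is a sketch of the generated code, with the member name following directly from the controls_shadow.lname substitution:

	static inline u32 __pin_controls_get(struct loaded_vmcs *vmcs)
	{
		/* Read the cached shadow instead of a VMREAD. */
		return vmcs->controls_shadow.pin;
	}

	static inline u32 pin_controls_get(struct vcpu_vmx *vmx)
	{
		/* The per-vCPU wrapper simply targets the currently loaded VMCS. */
		return __pin_controls_get(vmx->loaded_vmcs);
	}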
diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h
index 164b64f65a8f..9e9ef47e988c 100644
--- a/arch/x86/kvm/vmx/vmx_ops.h
+++ b/arch/x86/kvm/vmx/vmx_ops.h
@@ -4,13 +4,11 @@
#include <linux/nospec.h>
-#include <asm/kvm_host.h>
#include <asm/vmx.h>
#include "evmcs.h"
#include "vmcs.h"
-
-#define __ex(x) __kvm_handle_fault_on_reboot(x)
+#include "x86.h"
asmlinkage void vmread_error(unsigned long field, bool fault);
__attribute__((regparm(0))) void vmread_error_trampoline(unsigned long field,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e5d5c5ed7dd4..28ef14155726 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -233,12 +233,13 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
STATS_DESC_COUNTER(VM, mmu_recycled),
STATS_DESC_COUNTER(VM, mmu_cache_miss),
STATS_DESC_ICOUNTER(VM, mmu_unsync),
- STATS_DESC_ICOUNTER(VM, lpages),
+ STATS_DESC_ICOUNTER(VM, pages_4k),
+ STATS_DESC_ICOUNTER(VM, pages_2m),
+ STATS_DESC_ICOUNTER(VM, pages_1g),
STATS_DESC_ICOUNTER(VM, nx_lpage_splits),
+ STATS_DESC_PCOUNTER(VM, max_mmu_rmap_size),
STATS_DESC_PCOUNTER(VM, max_mmu_page_hash_collisions)
};
-static_assert(ARRAY_SIZE(kvm_vm_stats_desc) ==
- sizeof(struct kvm_vm_stat) / sizeof(u64));
const struct kvm_stats_header kvm_vm_stats_header = {
.name_size = KVM_STATS_NAME_SIZE,
@@ -278,8 +279,6 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
STATS_DESC_COUNTER(VCPU, directed_yield_successful),
STATS_DESC_ICOUNTER(VCPU, guest_mode)
};
-static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) ==
- sizeof(struct kvm_vcpu_stat) / sizeof(u64));
const struct kvm_stats_header kvm_vcpu_stats_header = {
.name_size = KVM_STATS_NAME_SIZE,
@@ -485,7 +484,14 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
}
EXPORT_SYMBOL_GPL(kvm_set_apic_base);
-asmlinkage __visible noinstr void kvm_spurious_fault(void)
+/*
+ * Handle a fault on a hardware virtualization (VMX or SVM) instruction.
+ *
+ * Hardware virtualization extension instructions may fault if a reboot turns
+ * off virtualization while processes are running. Usually after catching the
+ * fault we just panic; during a reboot the instruction is ignored instead.
+ */
+noinstr void kvm_spurious_fault(void)
{
/* Fault while not rebooting. We want the trace. */
BUG_ON(!kvm_rebooting);
@@ -1180,7 +1186,6 @@ static void kvm_update_dr0123(struct kvm_vcpu *vcpu)
if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
for (i = 0; i < KVM_NR_DB_REGS; i++)
vcpu->arch.eff_db[i] = vcpu->arch.db[i];
- vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD;
}
}
@@ -3316,6 +3321,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!msr_info->host_initiated) {
s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
adjust_tsc_offset_guest(vcpu, adj);
+			/* Before returning to the guest, tsc_timestamp must be adjusted
+			 * as well, otherwise the guest's percpu pvclock time could jump.
+			 */
+ */
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
}
vcpu->arch.ia32_tsc_adjust_msr = data;
}
@@ -4310,12 +4319,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
static_call(kvm_x86_vcpu_put)(vcpu);
vcpu->arch.last_host_tsc = rdtsc();
- /*
- * If userspace has set any breakpoints or watchpoints, dr6 is restored
- * on every vmexit, but if not, we might have a stale dr6 from the
- * guest. do_debug expects dr6 to be cleared after it runs, do the same.
- */
- set_debugreg(0, 6);
}
static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
@@ -6567,9 +6570,9 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
* there is no pkey in EPT page table for L1 guest or EPT
* shadow page table for L2 guest.
*/
- if (vcpu_match_mmio_gva(vcpu, gva)
- && !permission_fault(vcpu, vcpu->arch.walk_mmu,
- vcpu->arch.mmio_access, 0, access)) {
+ if (vcpu_match_mmio_gva(vcpu, gva) && (!is_paging(vcpu) ||
+ !permission_fault(vcpu, vcpu->arch.walk_mmu,
+ vcpu->arch.mmio_access, 0, access))) {
*gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
(gva & (PAGE_SIZE - 1));
trace_vcpu_match_mmio(gva, *gpa, write, false);
@@ -8578,6 +8581,8 @@ EXPORT_SYMBOL_GPL(kvm_apicv_activated);
static void kvm_apicv_init(struct kvm *kvm)
{
+ mutex_init(&kvm->arch.apicv_update_lock);
+
if (enable_apicv)
clear_bit(APICV_INHIBIT_REASON_DISABLE,
&kvm->arch.apicv_inhibit_reasons);
@@ -8891,6 +8896,10 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
can_inject = false;
}
+ /* Don't inject interrupts if the user asked to avoid doing so */
+ if (vcpu->guest_debug & KVM_GUESTDBG_BLOCKIRQ)
+ return 0;
+
/*
* Finally, inject interrupt events. If an event cannot be injected
* due to architectural conditions (e.g. IF=0) a window-open exit
@@ -9236,10 +9245,18 @@ void kvm_make_scan_ioapic_request(struct kvm *kvm)
void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
{
+ bool activate;
+
if (!lapic_in_kernel(vcpu))
return;
- vcpu->arch.apicv_active = kvm_apicv_activated(vcpu->kvm);
+ mutex_lock(&vcpu->kvm->arch.apicv_update_lock);
+
+ activate = kvm_apicv_activated(vcpu->kvm);
+ if (vcpu->arch.apicv_active == activate)
+ goto out;
+
+ vcpu->arch.apicv_active = activate;
kvm_apic_update_apicv(vcpu);
static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu);
@@ -9251,54 +9268,45 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
*/
if (!vcpu->arch.apicv_active)
kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+out:
+ mutex_unlock(&vcpu->kvm->arch.apicv_update_lock);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv);
-/*
- * NOTE: Do not hold any lock prior to calling this.
- *
- * In particular, kvm_request_apicv_update() expects kvm->srcu not to be
- * locked, because it calls __x86_set_memory_region() which does
- * synchronize_srcu(&kvm->srcu).
- */
-void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
+void __kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
{
- struct kvm_vcpu *except;
- unsigned long old, new, expected;
+ unsigned long old, new;
if (!kvm_x86_ops.check_apicv_inhibit_reasons ||
!static_call(kvm_x86_check_apicv_inhibit_reasons)(bit))
return;
- old = READ_ONCE(kvm->arch.apicv_inhibit_reasons);
- do {
- expected = new = old;
- if (activate)
- __clear_bit(bit, &new);
- else
- __set_bit(bit, &new);
- if (new == old)
- break;
- old = cmpxchg(&kvm->arch.apicv_inhibit_reasons, expected, new);
- } while (old != expected);
-
- if (!!old == !!new)
- return;
+ old = new = kvm->arch.apicv_inhibit_reasons;
- trace_kvm_apicv_update_request(activate, bit);
- if (kvm_x86_ops.pre_update_apicv_exec_ctrl)
- static_call(kvm_x86_pre_update_apicv_exec_ctrl)(kvm, activate);
+ if (activate)
+ __clear_bit(bit, &new);
+ else
+ __set_bit(bit, &new);
+
+ if (!!old != !!new) {
+ trace_kvm_apicv_update_request(activate, bit);
+ kvm_make_all_cpus_request(kvm, KVM_REQ_APICV_UPDATE);
+ kvm->arch.apicv_inhibit_reasons = new;
+ if (new) {
+ unsigned long gfn = gpa_to_gfn(APIC_DEFAULT_PHYS_BASE);
+ kvm_zap_gfn_range(kvm, gfn, gfn+1);
+ }
+ } else
+ kvm->arch.apicv_inhibit_reasons = new;
+}
+EXPORT_SYMBOL_GPL(__kvm_request_apicv_update);
- /*
- * Sending request to update APICV for all other vcpus,
- * while update the calling vcpu immediately instead of
- * waiting for another #VMEXIT to handle the request.
- */
- except = kvm_get_running_vcpu();
- kvm_make_all_cpus_request_except(kvm, KVM_REQ_APICV_UPDATE,
- except);
- if (except)
- kvm_vcpu_update_apicv(except);
+void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
+{
+ mutex_lock(&kvm->arch.apicv_update_lock);
+ __kvm_request_apicv_update(kvm, activate, bit);
+ mutex_unlock(&kvm->arch.apicv_update_lock);
}
EXPORT_SYMBOL_GPL(kvm_request_apicv_update);
@@ -9395,6 +9403,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
}
if (kvm_request_pending(vcpu)) {
+ if (kvm_check_request(KVM_REQ_VM_BUGGED, vcpu)) {
+ r = -EIO;
+ goto out;
+ }
if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) {
r = 0;
@@ -9608,8 +9620,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
set_debugreg(vcpu->arch.eff_db[1], 1);
set_debugreg(vcpu->arch.eff_db[2], 2);
set_debugreg(vcpu->arch.eff_db[3], 3);
- set_debugreg(vcpu->arch.dr6, 6);
- vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
} else if (unlikely(hw_breakpoint_active())) {
set_debugreg(0, 7);
}
@@ -9639,7 +9649,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
static_call(kvm_x86_sync_dirty_debug_regs)(vcpu);
kvm_update_dr0123(vcpu);
kvm_update_dr7(vcpu);
- vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
}
/*
@@ -9976,7 +9985,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
goto out;
}
- if (kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) {
+ if ((kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) ||
+ (kvm_run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS)) {
r = -EINVAL;
goto out;
}
@@ -10581,9 +10591,6 @@ static void store_regs(struct kvm_vcpu *vcpu)
static int sync_regs(struct kvm_vcpu *vcpu)
{
- if (vcpu->run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS)
- return -EINVAL;
-
if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_REGS) {
__set_regs(vcpu, &vcpu->run->s.regs.regs);
vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_REGS;
@@ -10799,6 +10806,8 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
unsigned long old_cr0 = kvm_read_cr0(vcpu);
+ unsigned long new_cr0;
+ u32 eax, dummy;
kvm_lapic_reset(vcpu, init_event);
@@ -10865,10 +10874,41 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vcpu->arch.regs_avail = ~0;
vcpu->arch.regs_dirty = ~0;
+ /*
+ * Fall back to KVM's default Family/Model/Stepping of 0x600 (P6/Athlon)
+ * if no CPUID match is found. Note, it's impossible to get a match at
+ * RESET since KVM emulates RESET before exposing the vCPU to userspace,
+	 * i.e. it's impossible for kvm_cpuid() to find a valid entry on RESET.
+ * But, go through the motions in case that's ever remedied.
+ */
+ eax = 1;
+ if (!kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, true))
+ eax = 0x600;
+ kvm_rdx_write(vcpu, eax);
+
vcpu->arch.ia32_xss = 0;
static_call(kvm_x86_vcpu_reset)(vcpu, init_event);
+ kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
+ kvm_rip_write(vcpu, 0xfff0);
+
+ /*
+ * CR0.CD/NW are set on RESET, preserved on INIT. Note, some versions
+ * of Intel's SDM list CD/NW as being set on INIT, but they contradict
+ * (or qualify) that with a footnote stating that CD/NW are preserved.
+ */
+ new_cr0 = X86_CR0_ET;
+ if (init_event)
+ new_cr0 |= (old_cr0 & (X86_CR0_NW | X86_CR0_CD));
+ else
+ new_cr0 |= X86_CR0_NW | X86_CR0_CD;
+
+ static_call(kvm_x86_set_cr0)(vcpu, new_cr0);
+ static_call(kvm_x86_set_cr4)(vcpu, 0);
+ static_call(kvm_x86_set_efer)(vcpu, 0);
+ static_call(kvm_x86_update_exception_bitmap)(vcpu);
+
/*
* Reset the MMU context if paging was enabled prior to INIT (which is
* implied if CR0.PG=1 as CR0 will be '0' prior to RESET). Unlike the
@@ -10879,7 +10919,20 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
*/
if (old_cr0 & X86_CR0_PG)
kvm_mmu_reset_context(vcpu);
+
+ /*
+ * Intel's SDM states that all TLB entries are flushed on INIT. AMD's
+ * APM states the TLBs are untouched by INIT, but it also states that
+ * the TLBs are flushed on "External initialization of the processor."
+ * Flush the guest TLB regardless of vendor, there is no meaningful
+ * benefit in relying on the guest to flush the TLB immediately after
+ * INIT. A spurious TLB flush is benign and likely negligible from a
+ * performance perspective.
+ */
+ if (init_event)
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
}
+EXPORT_SYMBOL_GPL(kvm_vcpu_reset);
void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
{
@@ -11123,6 +11176,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
kvm_hv_init_vm(kvm);
kvm_page_track_init(kvm);
kvm_mmu_init_vm(kvm);
+ kvm_xen_init_vm(kvm);
return static_call(kvm_x86_vm_init)(kvm);
}
@@ -11312,8 +11366,7 @@ static int memslot_rmap_alloc(struct kvm_memory_slot *slot,
for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
int level = i + 1;
- int lpages = gfn_to_index(slot->base_gfn + npages - 1,
- slot->base_gfn, level) + 1;
+ int lpages = __kvm_mmu_slot_lpages(slot, npages, level);
WARN_ON(slot->arch.rmap[i]);
@@ -11396,8 +11449,7 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm,
int lpages;
int level = i + 1;
- lpages = gfn_to_index(slot->base_gfn + npages - 1,
- slot->base_gfn, level) + 1;
+ lpages = __kvm_mmu_slot_lpages(slot, npages, level);
linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT);
if (!linfo)
@@ -11481,7 +11533,7 @@ static void kvm_mmu_update_cpu_dirty_logging(struct kvm *kvm, bool enable)
static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
struct kvm_memory_slot *old,
- struct kvm_memory_slot *new,
+ const struct kvm_memory_slot *new,
enum kvm_mr_change change)
{
bool log_dirty_pages = new->flags & KVM_MEM_LOG_DIRTY_PAGES;
@@ -11561,10 +11613,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
kvm_mmu_change_mmu_pages(kvm,
kvm_mmu_calculate_default_mmu_pages(kvm));
- /*
- * FIXME: const-ify all uses of struct kvm_memory_slot.
- */
- kvm_mmu_slot_apply_flags(kvm, old, (struct kvm_memory_slot *) new, change);
+ kvm_mmu_slot_apply_flags(kvm, old, new, change);
/* Free the arrays associated with the old memslot. */
if (change == KVM_MR_MOVE)
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 44ae10312740..7d66d63dc55a 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -8,6 +8,8 @@
#include "kvm_cache_regs.h"
#include "kvm_emulate.h"
+void kvm_spurious_fault(void);
+
static __always_inline void kvm_guest_enter_irqoff(void)
{
/*
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index ae17250e1efe..9ea9c3dabe37 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -25,15 +25,14 @@ static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
{
gpa_t gpa = gfn_to_gpa(gfn);
int wc_ofs, sec_hi_ofs;
- int ret;
+ int ret = 0;
int idx = srcu_read_lock(&kvm->srcu);
- ret = kvm_gfn_to_hva_cache_init(kvm, &kvm->arch.xen.shinfo_cache,
- gpa, PAGE_SIZE);
- if (ret)
+ if (kvm_is_error_hva(gfn_to_hva(kvm, gfn))) {
+ ret = -EFAULT;
goto out;
-
- kvm->arch.xen.shinfo_set = true;
+ }
+ kvm->arch.xen.shinfo_gfn = gfn;
/* Paranoia checks on the 32-bit struct layout */
BUILD_BUG_ON(offsetof(struct compat_shared_info, wc) != 0x900);
@@ -245,7 +244,7 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
case KVM_XEN_ATTR_TYPE_SHARED_INFO:
if (data->u.shared_info.gfn == GPA_INVALID) {
- kvm->arch.xen.shinfo_set = false;
+ kvm->arch.xen.shinfo_gfn = GPA_INVALID;
r = 0;
break;
}
@@ -283,10 +282,7 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
break;
case KVM_XEN_ATTR_TYPE_SHARED_INFO:
- if (kvm->arch.xen.shinfo_set)
- data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_cache.gpa);
- else
- data->u.shared_info.gfn = GPA_INVALID;
+ data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_gfn);
r = 0;
break;
@@ -646,6 +642,11 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
return 0;
}
+void kvm_xen_init_vm(struct kvm *kvm)
+{
+ kvm->arch.xen.shinfo_gfn = GPA_INVALID;
+}
+
void kvm_xen_destroy_vm(struct kvm *kvm)
{
if (kvm->arch.xen_hvm_config.msr)
diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h
index 463a7844a8ca..cc0cf5f37450 100644
--- a/arch/x86/kvm/xen.h
+++ b/arch/x86/kvm/xen.h
@@ -21,6 +21,7 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data);
int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc);
+void kvm_xen_init_vm(struct kvm *kvm);
void kvm_xen_destroy_vm(struct kvm *kvm);
static inline bool kvm_xen_msr_enabled(struct kvm *kvm)
@@ -50,6 +51,10 @@ static inline int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
return 1;
}
+static inline void kvm_xen_init_vm(struct kvm *kvm)
+{
+}
+
static inline void kvm_xen_destroy_vm(struct kvm *kvm)
{
}
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 74b78840182d..bd90b8fe81e4 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -801,8 +801,7 @@ int arch_add_memory(int nid, u64 start, u64 size,
return __add_pages(nid, start_pfn, nr_pages, params);
}
-void arch_remove_memory(int nid, u64 start, u64 size,
- struct vmem_altmap *altmap)
+void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index ddeaba947eb3..a6e11763763f 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1255,8 +1255,7 @@ kernel_physical_mapping_remove(unsigned long start, unsigned long end)
remove_pagetable(start, end, true, NULL);
}
-void __ref arch_remove_memory(int nid, u64 start, u64 size,
- struct vmem_altmap *altmap)
+void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
diff --git a/arch/x86/pci/numachip.c b/arch/x86/pci/numachip.c
index 01a085d9135a..4f0147d4e225 100644
--- a/arch/x86/pci/numachip.c
+++ b/arch/x86/pci/numachip.c
@@ -12,6 +12,7 @@
#include <linux/pci.h>
#include <asm/pci_x86.h>
+#include <asm/numachip/numachip.h>
static u8 limit __read_mostly;
diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c
index 7d2525691854..101081ad64b6 100644
--- a/arch/x86/pci/sta2x11-fixup.c
+++ b/arch/x86/pci/sta2x11-fixup.c
@@ -146,8 +146,7 @@ static void sta2x11_map_ep(struct pci_dev *pdev)
dev_err(dev, "sta2x11: could not set DMA offset\n");
dev->bus_dma_limit = max_amba_addr;
- pci_set_consistent_dma_mask(pdev, max_amba_addr);
- pci_set_dma_mask(pdev, max_amba_addr);
+ dma_set_mask_and_coherent(&pdev->dev, max_amba_addr);
/* Configure AHB mapping */
pci_write_config_dword(pdev, AHB_PEXLBASE(0), 0);
diff --git a/arch/x86/um/shared/sysdep/stub_32.h b/arch/x86/um/shared/sysdep/stub_32.h
index b95db9daf0e8..4c6c2be0c899 100644
--- a/arch/x86/um/shared/sysdep/stub_32.h
+++ b/arch/x86/um/shared/sysdep/stub_32.h
@@ -101,4 +101,16 @@ static inline void remap_stack_and_trap(void)
"memory");
}
+static __always_inline void *get_stub_page(void)
+{
+ unsigned long ret;
+
+ asm volatile (
+ "movl %%esp,%0 ;"
+ "andl %1,%0"
+ : "=a" (ret)
+ : "g" (~(UM_KERN_PAGE_SIZE - 1)));
+
+ return (void *)ret;
+}
#endif
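The new get_stub_page() helpers page-align the current stack pointer to locate the stub page that UML shares with the host process. A rough C equivalent of what the inline asm computes (a sketch only; get_stub_page_c() is a made-up name, and it mirrors the open-coded expression being removed from stub_segv.c below), assuming UM_KERN_PAGE_SIZE is a power of two:

	static __always_inline void *get_stub_page_c(void)
	{
		/* Any address on the current stack lives inside the stub page... */
		unsigned long sp = (unsigned long)__builtin_frame_address(0);

		/* ...so masking off the low bits yields the page's base address. */
		return (void *)(sp & ~(unsigned long)(UM_KERN_PAGE_SIZE - 1));
	}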
diff --git a/arch/x86/um/shared/sysdep/stub_64.h b/arch/x86/um/shared/sysdep/stub_64.h
index 6e2626b77a2e..e9c4b2b38803 100644
--- a/arch/x86/um/shared/sysdep/stub_64.h
+++ b/arch/x86/um/shared/sysdep/stub_64.h
@@ -108,4 +108,16 @@ static inline void remap_stack_and_trap(void)
__syscall_clobber, "r10", "r8", "r9");
}
+static __always_inline void *get_stub_page(void)
+{
+ unsigned long ret;
+
+ asm volatile (
+ "movq %%rsp,%0 ;"
+ "andq %1,%0"
+ : "=a" (ret)
+ : "g" (~(UM_KERN_PAGE_SIZE - 1)));
+
+ return (void *)ret;
+}
#endif
diff --git a/arch/x86/um/stub_segv.c b/arch/x86/um/stub_segv.c
index 21836eaf1725..f7eefba034f9 100644
--- a/arch/x86/um/stub_segv.c
+++ b/arch/x86/um/stub_segv.c
@@ -11,9 +11,8 @@
void __attribute__ ((__section__ (".__syscall_stub")))
stub_segv_handler(int sig, siginfo_t *info, void *p)
{
- int stack;
+ struct faultinfo *f = get_stub_page();
ucontext_t *uc = p;
- struct faultinfo *f = (void *)(((unsigned long)&stack) & ~(UM_KERN_PAGE_SIZE - 1));
GET_FAULTINFO_FROM_MC(*f, &uc->uc_mcontext);
trap_myself();
diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig
index 9bf2a9bc8539..0e56bad058fa 100644
--- a/arch/xtensa/Kconfig
+++ b/arch/xtensa/Kconfig
@@ -42,6 +42,7 @@ config XTENSA
select MODULES_USE_ELF_RELA
select PERF_USE_VMALLOC
select SET_FS
+ select TRACE_IRQFLAGS_SUPPORT
select VIRT_TO_BUS
help
Xtensa processors are 32-bit RISC machines designed by Tensilica
@@ -73,9 +74,6 @@ config LOCKDEP_SUPPORT
config STACKTRACE_SUPPORT
def_bool y
-config TRACE_IRQFLAGS_SUPPORT
- def_bool y
-
config MMU
def_bool n