summaryrefslogtreecommitdiff
path: root/arch/x86/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/mm')
-rw-r--r--arch/x86/mm/Makefile12
-rw-r--r--arch/x86/mm/amdtopology.c1
-rw-r--r--arch/x86/mm/extable.c7
-rw-r--r--arch/x86/mm/fault.c100
-rw-r--r--arch/x86/mm/hugetlbpage.c1
-rw-r--r--arch/x86/mm/ident_map.c1
-rw-r--r--arch/x86/mm/init.c2
-rw-r--r--arch/x86/mm/init_64.c10
-rw-r--r--arch/x86/mm/ioremap.c123
-rw-r--r--arch/x86/mm/kasan_init_64.c102
-rw-r--r--arch/x86/mm/kaslr.c1
-rw-r--r--arch/x86/mm/kmemcheck/error.c1
-rw-r--r--arch/x86/mm/kmemcheck/error.h1
-rw-r--r--arch/x86/mm/kmemcheck/opcode.c1
-rw-r--r--arch/x86/mm/kmemcheck/opcode.h1
-rw-r--r--arch/x86/mm/kmemcheck/pte.c1
-rw-r--r--arch/x86/mm/kmemcheck/pte.h1
-rw-r--r--arch/x86/mm/kmemcheck/selftest.c1
-rw-r--r--arch/x86/mm/kmemcheck/selftest.h1
-rw-r--r--arch/x86/mm/kmemcheck/shadow.h1
-rw-r--r--arch/x86/mm/kmmio.c1
-rw-r--r--arch/x86/mm/mem_encrypt.c303
-rw-r--r--arch/x86/mm/mm_internal.h1
-rw-r--r--arch/x86/mm/mpx.c121
-rw-r--r--arch/x86/mm/numa_64.c1
-rw-r--r--arch/x86/mm/numa_emulation.c1
-rw-r--r--arch/x86/mm/numa_internal.h1
-rw-r--r--arch/x86/mm/pageattr-test.c1
-rw-r--r--arch/x86/mm/pageattr.c4
-rw-r--r--arch/x86/mm/pat_internal.h1
-rw-r--r--arch/x86/mm/pat_rbtree.c1
-rw-r--r--arch/x86/mm/pgtable.c1
-rw-r--r--arch/x86/mm/pgtable_32.c1
-rw-r--r--arch/x86/mm/physaddr.c1
-rw-r--r--arch/x86/mm/physaddr.h1
-rw-r--r--arch/x86/mm/setup_nx.c1
-rw-r--r--arch/x86/mm/srat.c1
-rw-r--r--arch/x86/mm/tlb.c122
38 files changed, 642 insertions, 291 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 72bf8c01c6e3..7ba7f3d7f477 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,5 +1,13 @@
-# Kernel does not boot with instrumentation of tlb.c.
-KCOV_INSTRUMENT_tlb.o := n
+# SPDX-License-Identifier: GPL-2.0
+# Kernel does not boot with instrumentation of tlb.c and mem_encrypt.c
+KCOV_INSTRUMENT_tlb.o := n
+KCOV_INSTRUMENT_mem_encrypt.o := n
+
+KASAN_SANITIZE_mem_encrypt.o := n
+
+ifdef CONFIG_FUNCTION_TRACER
+CFLAGS_REMOVE_mem_encrypt.o = -pg
+endif
obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
pat.o pgtable.o physaddr.o setup_nx.o tlb.o
diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c
index 91f501b2da3b..048c761d97b0 100644
--- a/arch/x86/mm/amdtopology.c
+++ b/arch/x86/mm/amdtopology.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* AMD NUMA support.
* Discover the memory map and associated nodes.
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index c3521e2be396..3321b446b66c 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -67,12 +67,17 @@ bool ex_handler_refcount(const struct exception_table_entry *fixup,
* wrapped around) will be set. Additionally, seeing the refcount
* reach 0 will set ZF (Zero Flag: result was zero). In each of
* these cases we want a report, since it's a boundary condition.
- *
+ * The SF case is not reported since it indicates post-boundary
+ * manipulations below zero or above INT_MAX. And if none of the
+ * flags are set, something has gone very wrong, so report it.
*/
if (regs->flags & (X86_EFLAGS_OF | X86_EFLAGS_ZF)) {
bool zero = regs->flags & X86_EFLAGS_ZF;
refcount_error_report(regs, zero ? "hit zero" : "overflow");
+ } else if ((regs->flags & X86_EFLAGS_SF) == 0) {
+ /* Report if none of OF, ZF, nor SF are set. */
+ refcount_error_report(regs, "unexpected saturation");
}
return true;
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index e2baeaa053a5..3109ba6c6ede 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 1995 Linus Torvalds
* Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
@@ -29,26 +30,6 @@
#include <asm/trace/exceptions.h>
/*
- * Page fault error code bits:
- *
- * bit 0 == 0: no page found 1: protection fault
- * bit 1 == 0: read access 1: write access
- * bit 2 == 0: kernel-mode access 1: user-mode access
- * bit 3 == 1: use of reserved bit detected
- * bit 4 == 1: fault was an instruction fetch
- * bit 5 == 1: protection keys block access
- */
-enum x86_pf_error_code {
-
- PF_PROT = 1 << 0,
- PF_WRITE = 1 << 1,
- PF_USER = 1 << 2,
- PF_RSVD = 1 << 3,
- PF_INSTR = 1 << 4,
- PF_PK = 1 << 5,
-};
-
-/*
* Returns 0 if mmiotrace is disabled, or if the fault is not
* handled by mmiotrace:
*/
@@ -149,7 +130,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
* If it was a exec (instruction fetch) fault on NX page, then
* do not ignore the fault:
*/
- if (error_code & PF_INSTR)
+ if (error_code & X86_PF_INSTR)
return 0;
instr = (void *)convert_ip_to_linear(current, regs);
@@ -179,7 +160,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
* siginfo so userspace can discover which protection key was set
* on the PTE.
*
- * If we get here, we know that the hardware signaled a PF_PK
+ * If we get here, we know that the hardware signaled an X86_PF_PK
* fault and that there was a VMA once we got in the fault
* handler. It does *not* guarantee that the VMA we find here
* was the one that we faulted on.
@@ -204,7 +185,7 @@ static void fill_sig_info_pkey(int si_code, siginfo_t *info, u32 *pkey)
/*
* force_sig_info_fault() is called from a number of
* contexts, some of which have a VMA and some of which
- * do not. The PF_PK handing happens after we have a
+ * do not. The X86_PF_PK handling happens after we have a
* valid VMA, so we should never reach this without a
* valid VMA.
*/
@@ -697,7 +678,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
if (!oops_may_print())
return;
- if (error_code & PF_INSTR) {
+ if (error_code & X86_PF_INSTR) {
unsigned int level;
pgd_t *pgd;
pte_t *pte;
@@ -779,7 +760,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
*/
if (current->thread.sig_on_uaccess_err && signal) {
tsk->thread.trap_nr = X86_TRAP_PF;
- tsk->thread.error_code = error_code | PF_USER;
+ tsk->thread.error_code = error_code | X86_PF_USER;
tsk->thread.cr2 = address;
/* XXX: hwpoison faults will set the wrong code. */
@@ -897,7 +878,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
struct task_struct *tsk = current;
/* User mode accesses just cause a SIGSEGV */
- if (error_code & PF_USER) {
+ if (error_code & X86_PF_USER) {
/*
* It's possible to have interrupts off here:
*/
@@ -918,7 +899,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
* Instruction fetch faults in the vsyscall page might need
* emulation.
*/
- if (unlikely((error_code & PF_INSTR) &&
+ if (unlikely((error_code & X86_PF_INSTR) &&
((address & ~0xfff) == VSYSCALL_ADDR))) {
if (emulate_vsyscall(regs, address))
return;
@@ -931,7 +912,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
* are always protection faults.
*/
if (address >= TASK_SIZE_MAX)
- error_code |= PF_PROT;
+ error_code |= X86_PF_PROT;
if (likely(show_unhandled_signals))
show_signal_msg(regs, error_code, address, tsk);
@@ -992,11 +973,11 @@ static inline bool bad_area_access_from_pkeys(unsigned long error_code,
if (!boot_cpu_has(X86_FEATURE_OSPKE))
return false;
- if (error_code & PF_PK)
+ if (error_code & X86_PF_PK)
return true;
/* this checks permission keys on the VMA: */
- if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE),
- (error_code & PF_INSTR), foreign))
+ if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
+ (error_code & X86_PF_INSTR), foreign))
return true;
return false;
}
@@ -1024,7 +1005,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
int code = BUS_ADRERR;
/* Kernel mode? Handle exceptions or die: */
- if (!(error_code & PF_USER)) {
+ if (!(error_code & X86_PF_USER)) {
no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
return;
}
@@ -1052,14 +1033,14 @@ static noinline void
mm_fault_error(struct pt_regs *regs, unsigned long error_code,
unsigned long address, u32 *pkey, unsigned int fault)
{
- if (fatal_signal_pending(current) && !(error_code & PF_USER)) {
+ if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) {
no_context(regs, error_code, address, 0, 0);
return;
}
if (fault & VM_FAULT_OOM) {
/* Kernel mode? Handle exceptions or die: */
- if (!(error_code & PF_USER)) {
+ if (!(error_code & X86_PF_USER)) {
no_context(regs, error_code, address,
SIGSEGV, SEGV_MAPERR);
return;
@@ -1084,16 +1065,16 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
static int spurious_fault_check(unsigned long error_code, pte_t *pte)
{
- if ((error_code & PF_WRITE) && !pte_write(*pte))
+ if ((error_code & X86_PF_WRITE) && !pte_write(*pte))
return 0;
- if ((error_code & PF_INSTR) && !pte_exec(*pte))
+ if ((error_code & X86_PF_INSTR) && !pte_exec(*pte))
return 0;
/*
* Note: We do not do lazy flushing on protection key
- * changes, so no spurious fault will ever set PF_PK.
+ * changes, so no spurious fault will ever set X86_PF_PK.
*/
- if ((error_code & PF_PK))
+ if ((error_code & X86_PF_PK))
return 1;
return 1;
@@ -1139,8 +1120,8 @@ spurious_fault(unsigned long error_code, unsigned long address)
* change, so user accesses are not expected to cause spurious
* faults.
*/
- if (error_code != (PF_WRITE | PF_PROT)
- && error_code != (PF_INSTR | PF_PROT))
+ if (error_code != (X86_PF_WRITE | X86_PF_PROT) &&
+ error_code != (X86_PF_INSTR | X86_PF_PROT))
return 0;
pgd = init_mm.pgd + pgd_index(address);
@@ -1200,19 +1181,19 @@ access_error(unsigned long error_code, struct vm_area_struct *vma)
* always an unconditional error and can never result in
* a follow-up action to resolve the fault, like a COW.
*/
- if (error_code & PF_PK)
+ if (error_code & X86_PF_PK)
return 1;
/*
* Make sure to check the VMA so that we do not perform
- * faults just to hit a PF_PK as soon as we fill in a
+ * faults just to hit an X86_PF_PK as soon as we fill in a
* page.
*/
- if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE),
- (error_code & PF_INSTR), foreign))
+ if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
+ (error_code & X86_PF_INSTR), foreign))
return 1;
- if (error_code & PF_WRITE) {
+ if (error_code & X86_PF_WRITE) {
/* write, present and write, not present: */
if (unlikely(!(vma->vm_flags & VM_WRITE)))
return 1;
@@ -1220,7 +1201,7 @@ access_error(unsigned long error_code, struct vm_area_struct *vma)
}
/* read, present: */
- if (unlikely(error_code & PF_PROT))
+ if (unlikely(error_code & X86_PF_PROT))
return 1;
/* read, not present: */
@@ -1243,7 +1224,7 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs)
if (!static_cpu_has(X86_FEATURE_SMAP))
return false;
- if (error_code & PF_USER)
+ if (error_code & X86_PF_USER)
return false;
if (!user_mode(regs) && (regs->flags & X86_EFLAGS_AC))
@@ -1296,7 +1277,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
* protection error (error_code & 9) == 0.
*/
if (unlikely(fault_in_kernel_space(address))) {
- if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) {
+ if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
if (vmalloc_fault(address) >= 0)
return;
@@ -1324,7 +1305,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
if (unlikely(kprobes_fault(regs)))
return;
- if (unlikely(error_code & PF_RSVD))
+ if (unlikely(error_code & X86_PF_RSVD))
pgtable_bad(regs, error_code, address);
if (unlikely(smap_violation(error_code, regs))) {
@@ -1350,7 +1331,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
*/
if (user_mode(regs)) {
local_irq_enable();
- error_code |= PF_USER;
+ error_code |= X86_PF_USER;
flags |= FAULT_FLAG_USER;
} else {
if (regs->flags & X86_EFLAGS_IF)
@@ -1359,9 +1340,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
- if (error_code & PF_WRITE)
+ if (error_code & X86_PF_WRITE)
flags |= FAULT_FLAG_WRITE;
- if (error_code & PF_INSTR)
+ if (error_code & X86_PF_INSTR)
flags |= FAULT_FLAG_INSTRUCTION;
/*
@@ -1381,7 +1362,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
* space check, thus avoiding the deadlock:
*/
if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
- if ((error_code & PF_USER) == 0 &&
+ if (!(error_code & X86_PF_USER) &&
!search_exception_tables(regs->ip)) {
bad_area_nosemaphore(regs, error_code, address, NULL);
return;
@@ -1408,7 +1389,7 @@ retry:
bad_area(regs, error_code, address);
return;
}
- if (error_code & PF_USER) {
+ if (error_code & X86_PF_USER) {
/*
* Accessing the stack below %sp is always a bug.
* The large cushion allows instructions like enter
@@ -1440,7 +1421,17 @@ good_area:
* make sure we exit gracefully rather than endlessly redo
* the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if
* we get VM_FAULT_RETRY back, the mmap_sem has been unlocked.
+ *
+ * Note that handle_userfault() may also release and reacquire mmap_sem
+ * (and not return with VM_FAULT_RETRY), when returning to userland to
+ * repeat the page fault later with a VM_FAULT_NOPAGE retval
+ * (potentially after handling any pending signal during the return to
+ * userland). The return to userland is identified whenever
+ * FAULT_FLAG_USER|FAULT_FLAG_KILLABLE are both set in flags.
+ * Thus we have to be careful about not touching vma after handling the
+ * fault, so we read the pkey beforehand.
*/
+ pkey = vma_pkey(vma);
fault = handle_mm_fault(vma, address, flags);
major |= fault & VM_FAULT_MAJOR;
@@ -1467,7 +1458,6 @@ good_area:
return;
}
- pkey = vma_pkey(vma);
up_read(&mm->mmap_sem);
if (unlikely(fault & VM_FAULT_ERROR)) {
mm_fault_error(regs, error_code, address, &pkey, fault);
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 6d06cf33e3de..8ae0000cbdb3 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* IA-32 Huge TLB Page Support for Kernel.
*
diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c
index 31cea988fa36..ab33a32df2a8 100644
--- a/arch/x86/mm/ident_map.c
+++ b/arch/x86/mm/ident_map.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Helper routines for building identity mapping page tables. This is
* included by both the compressed kernel and the regular kernel.
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index af5c1ed21d43..a22c2b95e513 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -671,7 +671,7 @@ void __init init_mem_mapping(void)
load_cr3(swapper_pg_dir);
__flush_tlb_all();
- hypervisor_init_mem_mapping();
+ x86_init.hyper.init_mem_mapping();
early_memtest(0, max_pfn_mapped << PAGE_SHIFT);
}
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 048fbe8fc274..adcea90a2046 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1426,16 +1426,16 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HAVE_BOOTMEM_INFO_NODE)
void register_page_bootmem_memmap(unsigned long section_nr,
- struct page *start_page, unsigned long size)
+ struct page *start_page, unsigned long nr_pages)
{
unsigned long addr = (unsigned long)start_page;
- unsigned long end = (unsigned long)(start_page + size);
+ unsigned long end = (unsigned long)(start_page + nr_pages);
unsigned long next;
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
- unsigned int nr_pages;
+ unsigned int nr_pmd_pages;
struct page *page;
for (; addr < end; addr = next) {
@@ -1482,9 +1482,9 @@ void register_page_bootmem_memmap(unsigned long section_nr,
if (pmd_none(*pmd))
continue;
- nr_pages = 1 << (get_order(PMD_SIZE));
+ nr_pmd_pages = 1 << get_order(PMD_SIZE);
page = pmd_page(*pmd);
- while (nr_pages--)
+ while (nr_pmd_pages--)
get_page_bootmem(section_nr, page++,
SECTION_INFO);
}
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 34f0e1847dd6..6e4573b1da34 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -27,6 +27,11 @@
#include "physaddr.h"
+struct ioremap_mem_flags { /* what a resource walk over a physical range has found */
+bool system_ram; /* set from __ioremap_check_ram(): a SYSTEM_RAM resource with a valid, non-reserved page */
+bool desc_other; /* set when a resource whose desc is not IORES_DESC_NONE was seen */
+};
+
/*
* Fix up the linear direct mapping of the kernel to avoid cache attribute
* conflicts.
@@ -56,17 +61,59 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size,
return err;
}
-static int __ioremap_check_ram(unsigned long start_pfn, unsigned long nr_pages,
- void *arg)
+static bool __ioremap_check_ram(struct resource *res)
{
+ unsigned long start_pfn, stop_pfn;
unsigned long i;
- for (i = 0; i < nr_pages; ++i)
- if (pfn_valid(start_pfn + i) &&
- !PageReserved(pfn_to_page(start_pfn + i)))
- return 1;
+ if ((res->flags & IORESOURCE_SYSTEM_RAM) != IORESOURCE_SYSTEM_RAM)
+ return false;
- return 0;
+ start_pfn = (res->start + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ stop_pfn = (res->end + 1) >> PAGE_SHIFT;
+ if (stop_pfn > start_pfn) {
+ for (i = 0; i < (stop_pfn - start_pfn); ++i)
+ if (pfn_valid(start_pfn + i) &&
+ !PageReserved(pfn_to_page(start_pfn + i)))
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Tell whether @res carries a descriptor other than IORES_DESC_NONE.
+ * Returns 1 if it does, 0 otherwise.
+ */
+static int __ioremap_check_desc_other(struct resource *res)
+{
+	int described = res->desc != IORES_DESC_NONE;
+
+	return described;
+}
+
+/*
+ * Resource-walk callback.  @arg points to the struct ioremap_mem_flags that
+ * accumulates what has been seen so far; a flag that is already set is never
+ * cleared and its check is not run again.
+ *
+ * Returns non-zero once both flags are set (presumably this lets the walker
+ * stop early -- confirm against walk_mem_res()).
+ */
+static int __ioremap_res_check(struct resource *res, void *arg)
+{
+	struct ioremap_mem_flags *seen = arg;
+
+	/* Short-circuit evaluation skips a check whose flag is already set */
+	seen->system_ram = seen->system_ram || __ioremap_check_ram(res);
+	seen->desc_other = seen->desc_other || __ioremap_check_desc_other(res);
+
+	return seen->system_ram && seen->desc_other;
+}
+
+/*
+ * To avoid multiple resource walks, this function walks the resources marked
+ * as IORESOURCE_MEM and IORESOURCE_BUSY once, looking for system RAM and/or
+ * a resource described as something other than IORES_DESC_NONE (e.g.
+ * IORES_DESC_ACPI_TABLES).
+ *
+ * The range examined is [addr, addr + size - 1].  @flags is cleared before
+ * the walk and holds the findings afterwards.
+ */
+static void __ioremap_check_mem(resource_size_t addr, unsigned long size,
+				struct ioremap_mem_flags *flags)
+{
+	u64 first = (u64)addr;
+	u64 last = first + size - 1;
+
+	memset(flags, 0, sizeof(*flags));
+	walk_mem_res(first, last, flags, __ioremap_res_check);
}
/*
@@ -87,9 +134,10 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
unsigned long size, enum page_cache_mode pcm, void *caller)
{
unsigned long offset, vaddr;
- resource_size_t pfn, last_pfn, last_addr;
+ resource_size_t last_addr;
const resource_size_t unaligned_phys_addr = phys_addr;
const unsigned long unaligned_size = size;
+ struct ioremap_mem_flags mem_flags;
struct vm_struct *area;
enum page_cache_mode new_pcm;
pgprot_t prot;
@@ -108,13 +156,12 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
return NULL;
}
+ __ioremap_check_mem(phys_addr, size, &mem_flags);
+
/*
* Don't allow anybody to remap normal RAM that we're using..
*/
- pfn = phys_addr >> PAGE_SHIFT;
- last_pfn = last_addr >> PAGE_SHIFT;
- if (walk_system_ram_range(pfn, last_pfn - pfn + 1, NULL,
- __ioremap_check_ram) == 1) {
+ if (mem_flags.system_ram) {
WARN_ONCE(1, "ioremap on RAM at %pa - %pa\n",
&phys_addr, &last_addr);
return NULL;
@@ -146,7 +193,15 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
pcm = new_pcm;
}
+ /*
+ * If the page being mapped is in memory and SEV is active then
+ * make sure the memory encryption attribute is enabled in the
+ * resulting mapping.
+ */
prot = PAGE_KERNEL_IO;
+ if (sev_active() && mem_flags.desc_other)
+ prot = pgprot_encrypted(prot);
+
switch (pcm) {
case _PAGE_CACHE_MODE_UC:
default:
@@ -422,6 +477,9 @@ void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr)
* areas should be mapped decrypted. And since the encryption key can
* change across reboots, persistent memory should also be mapped
* decrypted.
+ *
+ * If SEV is active, that implies that BIOS/UEFI also ran encrypted so
+ * only persistent memory should be mapped decrypted.
*/
static bool memremap_should_map_decrypted(resource_size_t phys_addr,
unsigned long size)
@@ -458,6 +516,11 @@ static bool memremap_should_map_decrypted(resource_size_t phys_addr,
case E820_TYPE_ACPI:
case E820_TYPE_NVS:
case E820_TYPE_UNUSABLE:
+ /* For SEV, these areas are encrypted */
+ if (sev_active())
+ break;
+ /* Fallthrough */
+
case E820_TYPE_PRAM:
return true;
default:
@@ -581,7 +644,7 @@ static bool __init early_memremap_is_setup_data(resource_size_t phys_addr,
bool arch_memremap_can_ram_remap(resource_size_t phys_addr, unsigned long size,
unsigned long flags)
{
- if (!sme_active())
+ if (!mem_encrypt_active())
return true;
if (flags & MEMREMAP_ENC)
@@ -590,12 +653,13 @@ bool arch_memremap_can_ram_remap(resource_size_t phys_addr, unsigned long size,
if (flags & MEMREMAP_DEC)
return false;
- if (memremap_is_setup_data(phys_addr, size) ||
- memremap_is_efi_data(phys_addr, size) ||
- memremap_should_map_decrypted(phys_addr, size))
- return false;
+ if (sme_active()) {
+ if (memremap_is_setup_data(phys_addr, size) ||
+ memremap_is_efi_data(phys_addr, size))
+ return false;
+ }
- return true;
+ return !memremap_should_map_decrypted(phys_addr, size);
}
/*
@@ -608,17 +672,24 @@ pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr,
unsigned long size,
pgprot_t prot)
{
- if (!sme_active())
+ bool encrypted_prot;
+
+ if (!mem_encrypt_active())
return prot;
- if (early_memremap_is_setup_data(phys_addr, size) ||
- memremap_is_efi_data(phys_addr, size) ||
- memremap_should_map_decrypted(phys_addr, size))
- prot = pgprot_decrypted(prot);
- else
- prot = pgprot_encrypted(prot);
+ encrypted_prot = true;
+
+ if (sme_active()) {
+ if (early_memremap_is_setup_data(phys_addr, size) ||
+ memremap_is_efi_data(phys_addr, size))
+ encrypted_prot = false;
+ }
+
+ if (encrypted_prot && memremap_should_map_decrypted(phys_addr, size))
+ encrypted_prot = false;
- return prot;
+ return encrypted_prot ? pgprot_encrypted(prot)
+ : pgprot_decrypted(prot);
}
bool phys_mem_access_encrypted(unsigned long phys_addr, unsigned long size)
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index bc84b73684b7..2b60dc6e64b1 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#define DISABLE_BRANCH_PROFILING
#define pr_fmt(fmt) "kasan: " fmt
#include <linux/bootmem.h>
@@ -15,6 +16,8 @@
extern struct range pfn_mapped[E820_MAX_ENTRIES];
+static p4d_t tmp_p4d_table[PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE);
+
static int __init map_range(struct range *range)
{
unsigned long start;
@@ -30,8 +33,10 @@ static void __init clear_pgds(unsigned long start,
unsigned long end)
{
pgd_t *pgd;
+ /* See comment in kasan_init() */
+ unsigned long pgd_end = end & PGDIR_MASK;
- for (; start < end; start += PGDIR_SIZE) {
+ for (; start < pgd_end; start += PGDIR_SIZE) {
pgd = pgd_offset_k(start);
/*
* With folded p4d, pgd_clear() is nop, use p4d_clear()
@@ -42,29 +47,61 @@ static void __init clear_pgds(unsigned long start,
else
pgd_clear(pgd);
}
+
+ pgd = pgd_offset_k(start);
+ for (; start < end; start += P4D_SIZE)
+ p4d_clear(p4d_offset(pgd, start));
+}
+
+/*
+ * Early-boot lookup of the p4d entry for @addr under @pgd.
+ *
+ * Without CONFIG_X86_5LEVEL the pgd entry itself is returned, cast to a
+ * p4d_t pointer.  Otherwise the p4d table's physical address is taken from
+ * the pgd entry and turned into a pointer by adding
+ * __START_KERNEL_map - phys_base.
+ */
+static inline p4d_t *early_p4d_offset(pgd_t *pgd, unsigned long addr)
+{
+	unsigned long p4d;
+
+	if (!IS_ENABLED(CONFIG_X86_5LEVEL))
+		return (p4d_t *)pgd;
+
+	/*
+	 * pgd_val() already yields a physical address (plus flag bits), so it
+	 * must not be passed through __pa_nodebug(), which expects a virtual
+	 * address.  Doing so only gives the right answer while the bits it
+	 * disturbs happen to lie outside PTE_PFN_MASK.
+	 */
+	p4d = pgd_val(*pgd) & PTE_PFN_MASK;
+	p4d += __START_KERNEL_map - phys_base;
+	return (p4d_t *)p4d + p4d_index(addr);
+}
+
+static void __init kasan_early_p4d_populate(pgd_t *pgd,
+unsigned long addr,
+unsigned long end)
+{ /* Fill empty p4d entries for [addr, end) under @pgd with kasan_zero_pud. */
+pgd_t pgd_entry;
+p4d_t *p4d, p4d_entry;
+unsigned long next;
+
+if (pgd_none(*pgd)) { /* empty pgd: install kasan_zero_p4d as its p4d table first */
+pgd_entry = __pgd(_KERNPG_TABLE | __pa_nodebug(kasan_zero_p4d));
+set_pgd(pgd, pgd_entry);
+}
+
+p4d = early_p4d_offset(pgd, addr);
+do {
+next = p4d_addr_end(addr, end);
+
+if (!p4d_none(*p4d))
+continue; /* jumps to the while condition below, which still advances p4d and addr */
+
+p4d_entry = __p4d(_KERNPG_TABLE | __pa_nodebug(kasan_zero_pud));
+set_p4d(p4d, p4d_entry);
+} while (p4d++, addr = next, addr != end && p4d_none(*p4d)); /* ends at @end or at the first already-populated entry after advancing */
}
static void __init kasan_map_early_shadow(pgd_t *pgd)
{
- int i;
- unsigned long start = KASAN_SHADOW_START;
+ /* See comment in kasan_init() */
+ unsigned long addr = KASAN_SHADOW_START & PGDIR_MASK;
unsigned long end = KASAN_SHADOW_END;
+ unsigned long next;
- for (i = pgd_index(start); start < end; i++) {
- switch (CONFIG_PGTABLE_LEVELS) {
- case 4:
- pgd[i] = __pgd(__pa_nodebug(kasan_zero_pud) |
- _KERNPG_TABLE);
- break;
- case 5:
- pgd[i] = __pgd(__pa_nodebug(kasan_zero_p4d) |
- _KERNPG_TABLE);
- break;
- default:
- BUILD_BUG();
- }
- start += PGDIR_SIZE;
- }
+ pgd += pgd_index(addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ kasan_early_p4d_populate(pgd, addr, next);
+ } while (pgd++, addr = next, addr != end);
}
#ifdef CONFIG_KASAN_INLINE
@@ -101,7 +138,7 @@ void __init kasan_early_init(void)
for (i = 0; i < PTRS_PER_PUD; i++)
kasan_zero_pud[i] = __pud(pud_val);
- for (i = 0; CONFIG_PGTABLE_LEVELS >= 5 && i < PTRS_PER_P4D; i++)
+ for (i = 0; IS_ENABLED(CONFIG_X86_5LEVEL) && i < PTRS_PER_P4D; i++)
kasan_zero_p4d[i] = __p4d(p4d_val);
kasan_map_early_shadow(early_top_pgt);
@@ -117,12 +154,35 @@ void __init kasan_init(void)
#endif
memcpy(early_top_pgt, init_top_pgt, sizeof(early_top_pgt));
+
+ /*
+ * We use the same shadow offset for 4- and 5-level paging to
+ * facilitate boot-time switching between paging modes.
+ * As a result, in 5-level paging mode KASAN_SHADOW_START and
+ * KASAN_SHADOW_END are not aligned to a PGD boundary.
+ *
+ * KASAN_SHADOW_START doesn't share PGD with anything else.
+ * We claim whole PGD entry to make things easier.
+ *
+ * KASAN_SHADOW_END lands in the last PGD entry and it collides with
+ * a bunch of things like kernel code, modules, EFI mapping, etc.
+ * We need to take extra steps to not overwrite them.
+ */
+ if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+ void *ptr;
+
+ ptr = (void *)pgd_page_vaddr(*pgd_offset_k(KASAN_SHADOW_END));
+ memcpy(tmp_p4d_table, (void *)ptr, sizeof(tmp_p4d_table));
+ set_pgd(&early_top_pgt[pgd_index(KASAN_SHADOW_END)],
+ __pgd(__pa(tmp_p4d_table) | _KERNPG_TABLE));
+ }
+
load_cr3(early_top_pgt);
__flush_tlb_all();
- clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END);
+ clear_pgds(KASAN_SHADOW_START & PGDIR_MASK, KASAN_SHADOW_END);
- kasan_populate_zero_shadow((void *)KASAN_SHADOW_START,
+ kasan_populate_zero_shadow((void *)(KASAN_SHADOW_START & PGDIR_MASK),
kasan_mem_to_shadow((void *)PAGE_OFFSET));
for (i = 0; i < E820_MAX_ENTRIES; i++) {
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index af599167fe3c..879ef930e2c2 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* This file implements KASLR memory randomization for x86_64. It randomizes
* the virtual address space of kernel memory regions (physical memory
diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c
index dab41876cdd5..872ec4159a68 100644
--- a/arch/x86/mm/kmemcheck/error.c
+++ b/arch/x86/mm/kmemcheck/error.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/interrupt.h>
#include <linux/kdebug.h>
#include <linux/kmemcheck.h>
diff --git a/arch/x86/mm/kmemcheck/error.h b/arch/x86/mm/kmemcheck/error.h
index 0efc2e8d0a20..39f80d7a874d 100644
--- a/arch/x86/mm/kmemcheck/error.h
+++ b/arch/x86/mm/kmemcheck/error.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef ARCH__X86__MM__KMEMCHECK__ERROR_H
#define ARCH__X86__MM__KMEMCHECK__ERROR_H
diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c
index 324aa3f07237..df8109ddf7fe 100644
--- a/arch/x86/mm/kmemcheck/opcode.c
+++ b/arch/x86/mm/kmemcheck/opcode.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/types.h>
#include "opcode.h"
diff --git a/arch/x86/mm/kmemcheck/opcode.h b/arch/x86/mm/kmemcheck/opcode.h
index 6956aad66b5b..51a1ce94c24a 100644
--- a/arch/x86/mm/kmemcheck/opcode.h
+++ b/arch/x86/mm/kmemcheck/opcode.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef ARCH__X86__MM__KMEMCHECK__OPCODE_H
#define ARCH__X86__MM__KMEMCHECK__OPCODE_H
diff --git a/arch/x86/mm/kmemcheck/pte.c b/arch/x86/mm/kmemcheck/pte.c
index 4ead26eeaf96..8a03be90272a 100644
--- a/arch/x86/mm/kmemcheck/pte.c
+++ b/arch/x86/mm/kmemcheck/pte.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/mm.h>
#include <asm/pgtable.h>
diff --git a/arch/x86/mm/kmemcheck/pte.h b/arch/x86/mm/kmemcheck/pte.h
index 9f5966456492..b595612382c2 100644
--- a/arch/x86/mm/kmemcheck/pte.h
+++ b/arch/x86/mm/kmemcheck/pte.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef ARCH__X86__MM__KMEMCHECK__PTE_H
#define ARCH__X86__MM__KMEMCHECK__PTE_H
diff --git a/arch/x86/mm/kmemcheck/selftest.c b/arch/x86/mm/kmemcheck/selftest.c
index aef7140c0063..7ce0be1f99eb 100644
--- a/arch/x86/mm/kmemcheck/selftest.c
+++ b/arch/x86/mm/kmemcheck/selftest.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/bug.h>
#include <linux/kernel.h>
diff --git a/arch/x86/mm/kmemcheck/selftest.h b/arch/x86/mm/kmemcheck/selftest.h
index 8fed4fe11f95..8d759aae453d 100644
--- a/arch/x86/mm/kmemcheck/selftest.h
+++ b/arch/x86/mm/kmemcheck/selftest.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef ARCH_X86_MM_KMEMCHECK_SELFTEST_H
#define ARCH_X86_MM_KMEMCHECK_SELFTEST_H
diff --git a/arch/x86/mm/kmemcheck/shadow.h b/arch/x86/mm/kmemcheck/shadow.h
index ff0b2f70fbcb..49768dc18664 100644
--- a/arch/x86/mm/kmemcheck/shadow.h
+++ b/arch/x86/mm/kmemcheck/shadow.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef ARCH__X86__MM__KMEMCHECK__SHADOW_H
#define ARCH__X86__MM__KMEMCHECK__SHADOW_H
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index afc47f5c9531..c21c2ed04612 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/* Support for MMIO probes.
* Benfit many code from kprobes
* (C) 2002 Louis Zhuang <louis.zhuang@intel.com>.
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index 16c5f37933a2..d9a9e9fc75dd 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -30,6 +30,8 @@
#include <asm/msr.h>
#include <asm/cmdline.h>
+#include "mm_internal.h"
+
static char sme_cmdline_arg[] __initdata = "mem_encrypt";
static char sme_cmdline_on[] __initdata = "on";
static char sme_cmdline_off[] __initdata = "off";
@@ -40,7 +42,11 @@ static char sme_cmdline_off[] __initdata = "off";
* section is later cleared.
*/
u64 sme_me_mask __section(.data) = 0;
-EXPORT_SYMBOL_GPL(sme_me_mask);
+EXPORT_SYMBOL(sme_me_mask);
+DEFINE_STATIC_KEY_FALSE(sev_enable_key);
+EXPORT_SYMBOL_GPL(sev_enable_key);
+
+static bool sev_enabled __section(.data);
/* Buffer used for early in-place encryption by BSP, no locking needed */
static char sme_early_buffer[PAGE_SIZE] __aligned(PAGE_SIZE);
@@ -63,7 +69,6 @@ static void __init __sme_early_enc_dec(resource_size_t paddr,
if (!sme_me_mask)
return;
- local_flush_tlb();
wbinvd();
/*
@@ -190,8 +195,238 @@ void __init sme_early_init(void)
/* Update the protection map with memory encryption mask */
for (i = 0; i < ARRAY_SIZE(protection_map); i++)
protection_map[i] = pgprot_encrypted(protection_map[i]);
+
+ if (sev_active())
+ swiotlb_force = SWIOTLB_FORCE;
+}
+
+/*
+ * Allocate a coherent DMA buffer for a device while SEV is active.
+ *
+ * Pages are first requested from the page allocator on the device's node.
+ * If that fails, or the pages cannot be reached under the device's coherent
+ * DMA mask, the request falls back to swiotlb_alloc_coherent(). A buffer
+ * that does not lie in the SWIOTLB bounce area is marked decrypted and
+ * zeroed before it is handed out.
+ *
+ * @dev:        device the buffer is allocated for
+ * @size:       requested size in bytes
+ * @dma_handle: out parameter, receives the DMA address (encryption bit
+ *              cleared for non-SWIOTLB memory)
+ * @gfp:        allocation flags; __GFP_ZERO is dropped, see below
+ * @attrs:      DMA attributes, not used here
+ *
+ * Returns the kernel virtual address of the buffer, or NULL on failure.
+ */
+static void *sev_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
+		       gfp_t gfp, unsigned long attrs)
+{
+	unsigned long dma_mask;
+	unsigned int order;
+	struct page *page;
+	void *vaddr = NULL;
+
+	dma_mask = dma_alloc_coherent_mask(dev, gfp);
+	order = get_order(size);
+
+	/*
+	 * Memory will be memset to zero after marking decrypted, so don't
+	 * bother clearing it before.
+	 */
+	gfp &= ~__GFP_ZERO;
+
+	page = alloc_pages_node(dev_to_node(dev), gfp, order);
+	if (page) {
+		dma_addr_t addr;
+
+		/*
+		 * Since we will be clearing the encryption bit, check the
+		 * mask with it already cleared.
+		 */
+		addr = __sme_clr(phys_to_dma(dev, page_to_phys(page)));
+
+		/*
+		 * The last byte of the buffer is addr + size - 1; only that
+		 * byte has to be covered by the mask. Comparing addr + size
+		 * would wrongly reject a buffer that ends exactly at the
+		 * top of the addressable range.
+		 */
+		if ((addr + size - 1) > dma_mask) {
+			__free_pages(page, order);
+		} else {
+			vaddr = page_address(page);
+			*dma_handle = addr;
+		}
+	}
+
+	/* Page allocator failed or gave unusable pages: try SWIOTLB. */
+	if (!vaddr)
+		vaddr = swiotlb_alloc_coherent(dev, size, dma_handle, gfp);
+
+	if (!vaddr)
+		return NULL;
+
+	/* Clear the SME encryption bit for DMA use if not swiotlb area */
+	if (!is_swiotlb_buffer(dma_to_phys(dev, *dma_handle))) {
+		set_memory_decrypted((unsigned long)vaddr, 1 << order);
+		memset(vaddr, 0, PAGE_SIZE << order);
+		*dma_handle = __sme_clr(*dma_handle);
+	}
+
+	return vaddr;
}
+/*
+ * Release a coherent DMA buffer under SEV.
+ *
+ * A buffer outside the SWIOTLB bounce area has its pages marked encrypted
+ * again before it is handed to swiotlb_free_coherent(); a bounce-area
+ * buffer is passed through untouched.
+ */
+static void sev_free(struct device *dev, size_t size, void *vaddr,
+		     dma_addr_t dma_handle, unsigned long attrs)
+{
+	const int nr_pages = 1 << get_order(size);
+
+	/* Set the SME encryption bit for re-use if not swiotlb area */
+	if (!is_swiotlb_buffer(dma_to_phys(dev, dma_handle))) {
+		unsigned long start = (unsigned long)vaddr;
+
+		set_memory_encrypted(start, nr_pages);
+	}
+
+	swiotlb_free_coherent(dev, size, vaddr, dma_handle);
+}
+
+
+/*
+ * Set or clear the encryption bit (_PAGE_ENC) of a single kernel mapping
+ * and encrypt/decrypt the memory it maps in place.
+ *
+ * @kpte:  page-table entry to change; it is reinterpreted as a pmd_t or
+ *         pud_t entry when @level is PG_LEVEL_2M or PG_LEVEL_1G
+ * @level: page-table level of @kpte; any other level is ignored
+ * @enc:   true to set the encryption bit, false to clear it
+ *
+ * Nothing is done when the entry already has the requested state.
+ */
+static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc)
+{
+	pgprot_t old_prot, new_prot;
+	unsigned long pfn, pa, size;
+	pte_t new_pte;
+
+	switch (level) {
+	case PG_LEVEL_4K:
+		pfn = pte_pfn(*kpte);
+		old_prot = pte_pgprot(*kpte);
+		break;
+	case PG_LEVEL_2M:
+		pfn = pmd_pfn(*(pmd_t *)kpte);
+		old_prot = pmd_pgprot(*(pmd_t *)kpte);
+		break;
+	case PG_LEVEL_1G:
+		pfn = pud_pfn(*(pud_t *)kpte);
+		old_prot = pud_pgprot(*(pud_t *)kpte);
+		break;
+	default:
+		return;
+	}
+
+	new_prot = old_prot;
+	if (enc)
+		pgprot_val(new_prot) |= _PAGE_ENC;
+	else
+		pgprot_val(new_prot) &= ~_PAGE_ENC;
+
+	/* If prot is same then do nothing. */
+	if (pgprot_val(old_prot) == pgprot_val(new_prot))
+		return;
+
+	/*
+	 * The pXX_pfn() helpers return a frame number in PAGE_SIZE units at
+	 * every level, so the physical address is always pfn << PAGE_SHIFT.
+	 * Shifting by page_level_shift(level) would yield a bogus address
+	 * for 2M and 1G mappings. Only the size depends on the level.
+	 */
+	pa = pfn << PAGE_SHIFT;
+	size = page_level_size(level);
+
+	/*
+	 * We are going to perform in-place en-/decryption and change the
+	 * physical page attribute from C=1 to C=0 or vice versa. Flush the
+	 * caches to ensure that data gets accessed with the correct C-bit.
+	 */
+	clflush_cache_range(__va(pa), size);
+
+	/* Encrypt/decrypt the contents in-place */
+	if (enc)
+		sme_early_encrypt(pa, size);
+	else
+		sme_early_decrypt(pa, size);
+
+	/* Change the page encryption mask. */
+	new_pte = pfn_pte(pfn, new_prot);
+	set_pte_atomic(kpte, new_pte);
+}
+
+
+/*
+ * Set (enc == true) or clear (enc == false) the encryption bit on the kernel
+ * mappings covering [vaddr, vaddr + size).
+ *
+ * Each mapping is found with lookup_address(). A 4K mapping, or a large
+ * mapping that is fully covered by the remaining range, is changed directly;
+ * any other large mapping is first split through
+ * kernel_physical_mapping_init() and the same address is then looked up
+ * again.
+ *
+ * Returns 0 on success, or 1 if an address in the range has no mapping.
+ * The TLB is flushed on both paths.
+ */
+static int __init early_set_memory_enc_dec(unsigned long vaddr,
+ unsigned long size, bool enc)
+{
+ unsigned long vaddr_end, vaddr_next;
+ unsigned long psize, pmask;
+ int split_page_size_mask;
+ int level, ret;
+ pte_t *kpte;
+
+ vaddr_next = vaddr;
+ vaddr_end = vaddr + size;
+
+ for (; vaddr < vaddr_end; vaddr = vaddr_next) {
+ kpte = lookup_address(vaddr, &level);
+ if (!kpte || pte_none(*kpte)) {
+ ret = 1;
+ goto out;
+ }
+
+ if (level == PG_LEVEL_4K) {
+ __set_clr_pte_enc(kpte, level, enc);
+ vaddr_next = (vaddr & PAGE_MASK) + PAGE_SIZE;
+ continue;
+ }
+
+ psize = page_level_size(level);
+ pmask = page_level_mask(level);
+
+ /*
+ * Check whether we can change the large page in one go.
+ * We request a split when the address is not aligned or
+ * the number of pages to set/clear encryption bit is smaller
+ * than the number of pages in the large page.
+ */
+ if (vaddr == (vaddr & pmask) &&
+ ((vaddr_end - vaddr) >= psize)) {
+ __set_clr_pte_enc(kpte, level, enc);
+ vaddr_next = (vaddr & pmask) + psize;
+ continue;
+ }
+
+ /*
+ * The virtual address is part of a larger page, create the next
+ * level page table mapping (4K or 2M). If it is part of a 2M
+ * page then we request a split of the large page into 4K
+ * chunks. A 1GB large page is split into 2M pages, resp.
+ *
+ * vaddr_next is deliberately left unchanged here, so the next
+ * iteration looks up the same address in the split mapping.
+ */
+ if (level == PG_LEVEL_2M)
+ split_page_size_mask = 0;
+ else
+ split_page_size_mask = 1 << PG_LEVEL_2M;
+
+ kernel_physical_mapping_init(__pa(vaddr & pmask),
+ __pa((vaddr_end & pmask) + psize),
+ split_page_size_mask);
+ }
+
+ ret = 0;
+
+out:
+ /* Reached on success and on failure alike. */
+ __flush_tlb_all();
+ return ret;
+}
+
+/*
+ * Clear the encryption bit on the kernel mappings of [vaddr, vaddr + size).
+ * Returns whatever early_set_memory_enc_dec() returns.
+ */
+int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size)
+{
+	const bool encrypt = false;
+
+	return early_set_memory_enc_dec(vaddr, size, encrypt);
+}
+
+/*
+ * Set the encryption bit on the kernel mappings of [vaddr, vaddr + size).
+ * Returns whatever early_set_memory_enc_dec() returns.
+ */
+int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size)
+{
+	const bool encrypt = true;
+
+	return early_set_memory_enc_dec(vaddr, size, encrypt);
+}
+
+/*
+ * SME and SEV are very similar but they are not the same, so there are
+ * times that the kernel will need to distinguish between SME and SEV. The
+ * sme_active() and sev_active() functions are used for this. When a
+ * distinction isn't needed, the mem_encrypt_active() function can be used.
+ *
+ * The trampoline code is a good example for this requirement. Before
+ * paging is activated, SME will access all memory as decrypted, but SEV
+ * will access all memory as encrypted. So, when APs are being brought
+ * up under SME the trampoline area cannot be encrypted, whereas under SEV
+ * the trampoline area must be encrypted.
+ */
+bool sme_active(void)
+{
+	/* Without an encryption mask neither SME nor SEV is in use. */
+	if (!sme_me_mask)
+		return false;
+
+	/* A mask is set: it is SME unless SEV has been flagged. */
+	return !sev_enabled;
+}
+EXPORT_SYMBOL_GPL(sme_active);
+
+bool sev_active(void)
+{
+	/* Without an encryption mask neither SME nor SEV is in use. */
+	if (!sme_me_mask)
+		return false;
+
+	/* A mask is set: it is SEV only when SEV has been flagged. */
+	return sev_enabled;
+}
+EXPORT_SYMBOL_GPL(sev_active);
+
+/*
+ * DMA operations used when SEV is active. Coherent allocation and free go
+ * through sev_alloc()/sev_free(); every other operation is the plain
+ * SWIOTLB implementation.
+ */
+static const struct dma_map_ops sev_dma_ops = {
+ .alloc = sev_alloc,
+ .free = sev_free,
+ .map_page = swiotlb_map_page,
+ .unmap_page = swiotlb_unmap_page,
+ .map_sg = swiotlb_map_sg_attrs,
+ .unmap_sg = swiotlb_unmap_sg_attrs,
+ .sync_single_for_cpu = swiotlb_sync_single_for_cpu,
+ .sync_single_for_device = swiotlb_sync_single_for_device,
+ .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
+ .sync_sg_for_device = swiotlb_sync_sg_for_device,
+ .mapping_error = swiotlb_dma_mapping_error,
+};
+
/* Architecture __weak replacement functions */
void __init mem_encrypt_init(void)
{
@@ -201,7 +436,23 @@ void __init mem_encrypt_init(void)
/* Call into SWIOTLB to update the SWIOTLB DMA buffers */
swiotlb_update_mem_attributes();
- pr_info("AMD Secure Memory Encryption (SME) active\n");
+ /*
+ * With SEV, DMA operations cannot use encryption. New DMA ops
+ * are required in order to mark the DMA areas as decrypted or
+ * to use bounce buffers.
+ */
+ if (sev_active())
+ dma_ops = &sev_dma_ops;
+
+ /*
+ * With SEV, we need to unroll the rep string I/O instructions.
+ */
+ if (sev_active())
+ static_branch_enable(&sev_enable_key);
+
+ pr_info("AMD %s active\n",
+ sev_active() ? "Secure Encrypted Virtualization (SEV)"
+ : "Secure Memory Encryption (SME)");
}
void swiotlb_set_mem_attributes(void *vaddr, unsigned long size)
@@ -529,37 +780,63 @@ void __init __nostackprotector sme_enable(struct boot_params *bp)
{
const char *cmdline_ptr, *cmdline_arg, *cmdline_on, *cmdline_off;
unsigned int eax, ebx, ecx, edx;
+ unsigned long feature_mask;
bool active_by_default;
unsigned long me_mask;
char buffer[16];
u64 msr;
- /* Check for the SME support leaf */
+ /* Check for the SME/SEV support leaf */
eax = 0x80000000;
ecx = 0;
native_cpuid(&eax, &ebx, &ecx, &edx);
if (eax < 0x8000001f)
return;
+#define AMD_SME_BIT BIT(0)
+#define AMD_SEV_BIT BIT(1)
/*
- * Check for the SME feature:
- * CPUID Fn8000_001F[EAX] - Bit 0
- * Secure Memory Encryption support
- * CPUID Fn8000_001F[EBX] - Bits 5:0
- * Pagetable bit position used to indicate encryption
+ * Set the feature mask (SME or SEV) based on whether we are
+ * running under a hypervisor.
+ */
+ eax = 1;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+ feature_mask = (ecx & BIT(31)) ? AMD_SEV_BIT : AMD_SME_BIT;
+
+ /*
+ * Check for the SME/SEV feature:
+ * CPUID Fn8000_001F[EAX]
+ * - Bit 0 - Secure Memory Encryption support
+ * - Bit 1 - Secure Encrypted Virtualization support
+ * CPUID Fn8000_001F[EBX]
+ * - Bits 5:0 - Pagetable bit position used to indicate encryption
*/
eax = 0x8000001f;
ecx = 0;
native_cpuid(&eax, &ebx, &ecx, &edx);
- if (!(eax & 1))
+ if (!(eax & feature_mask))
return;
me_mask = 1UL << (ebx & 0x3f);
- /* Check if SME is enabled */
- msr = __rdmsr(MSR_K8_SYSCFG);
- if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT))
+ /* Check if memory encryption is enabled */
+ if (feature_mask == AMD_SME_BIT) {
+ /* For SME, check the SYSCFG MSR */
+ msr = __rdmsr(MSR_K8_SYSCFG);
+ if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT))
+ return;
+ } else {
+ /* For SEV, check the SEV MSR */
+ msr = __rdmsr(MSR_AMD64_SEV);
+ if (!(msr & MSR_AMD64_SEV_ENABLED))
+ return;
+
+ /* SEV state cannot be controlled by a command line option */
+ sme_me_mask = me_mask;
+ sev_enabled = true;
return;
+ }
/*
* Fixups have not been applied to phys_base yet and we're running
diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h
index 62474ba66c8e..4e1f6e1b8159 100644
--- a/arch/x86/mm/mm_internal.h
+++ b/arch/x86/mm/mm_internal.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __X86_MM_INTERNAL_H
#define __X86_MM_INTERNAL_H
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
index 9ceaa955d2ba..e500949bae24 100644
--- a/arch/x86/mm/mpx.c
+++ b/arch/x86/mm/mpx.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* mpx.c - Memory Protection eXtensions
*
@@ -12,6 +13,7 @@
#include <linux/sched/sysctl.h>
#include <asm/insn.h>
+#include <asm/insn-eval.h>
#include <asm/mman.h>
#include <asm/mmu_context.h>
#include <asm/mpx.h>
@@ -60,123 +62,6 @@ static unsigned long mpx_mmap(unsigned long len)
return addr;
}
-enum reg_type {
- REG_TYPE_RM = 0,
- REG_TYPE_INDEX,
- REG_TYPE_BASE,
-};
-
-static int get_reg_offset(struct insn *insn, struct pt_regs *regs,
- enum reg_type type)
-{
- int regno = 0;
-
- static const int regoff[] = {
- offsetof(struct pt_regs, ax),
- offsetof(struct pt_regs, cx),
- offsetof(struct pt_regs, dx),
- offsetof(struct pt_regs, bx),
- offsetof(struct pt_regs, sp),
- offsetof(struct pt_regs, bp),
- offsetof(struct pt_regs, si),
- offsetof(struct pt_regs, di),
-#ifdef CONFIG_X86_64
- offsetof(struct pt_regs, r8),
- offsetof(struct pt_regs, r9),
- offsetof(struct pt_regs, r10),
- offsetof(struct pt_regs, r11),
- offsetof(struct pt_regs, r12),
- offsetof(struct pt_regs, r13),
- offsetof(struct pt_regs, r14),
- offsetof(struct pt_regs, r15),
-#endif
- };
- int nr_registers = ARRAY_SIZE(regoff);
- /*
- * Don't possibly decode a 32-bit instructions as
- * reading a 64-bit-only register.
- */
- if (IS_ENABLED(CONFIG_X86_64) && !insn->x86_64)
- nr_registers -= 8;
-
- switch (type) {
- case REG_TYPE_RM:
- regno = X86_MODRM_RM(insn->modrm.value);
- if (X86_REX_B(insn->rex_prefix.value))
- regno += 8;
- break;
-
- case REG_TYPE_INDEX:
- regno = X86_SIB_INDEX(insn->sib.value);
- if (X86_REX_X(insn->rex_prefix.value))
- regno += 8;
- break;
-
- case REG_TYPE_BASE:
- regno = X86_SIB_BASE(insn->sib.value);
- if (X86_REX_B(insn->rex_prefix.value))
- regno += 8;
- break;
-
- default:
- pr_err("invalid register type");
- BUG();
- break;
- }
-
- if (regno >= nr_registers) {
- WARN_ONCE(1, "decoded an instruction with an invalid register");
- return -EINVAL;
- }
- return regoff[regno];
-}
-
-/*
- * return the address being referenced be instruction
- * for rm=3 returning the content of the rm reg
- * for rm!=3 calculates the address using SIB and Disp
- */
-static void __user *mpx_get_addr_ref(struct insn *insn, struct pt_regs *regs)
-{
- unsigned long addr, base, indx;
- int addr_offset, base_offset, indx_offset;
- insn_byte_t sib;
-
- insn_get_modrm(insn);
- insn_get_sib(insn);
- sib = insn->sib.value;
-
- if (X86_MODRM_MOD(insn->modrm.value) == 3) {
- addr_offset = get_reg_offset(insn, regs, REG_TYPE_RM);
- if (addr_offset < 0)
- goto out_err;
- addr = regs_get_register(regs, addr_offset);
- } else {
- if (insn->sib.nbytes) {
- base_offset = get_reg_offset(insn, regs, REG_TYPE_BASE);
- if (base_offset < 0)
- goto out_err;
-
- indx_offset = get_reg_offset(insn, regs, REG_TYPE_INDEX);
- if (indx_offset < 0)
- goto out_err;
-
- base = regs_get_register(regs, base_offset);
- indx = regs_get_register(regs, indx_offset);
- addr = base + indx * (1 << X86_SIB_SCALE(sib));
- } else {
- addr_offset = get_reg_offset(insn, regs, REG_TYPE_RM);
- if (addr_offset < 0)
- goto out_err;
- addr = regs_get_register(regs, addr_offset);
- }
- addr += insn->displacement.value;
- }
- return (void __user *)addr;
-out_err:
- return (void __user *)-1;
-}
-
static int mpx_insn_decode(struct insn *insn,
struct pt_regs *regs)
{
@@ -289,7 +174,7 @@ siginfo_t *mpx_generate_siginfo(struct pt_regs *regs)
info->si_signo = SIGSEGV;
info->si_errno = 0;
info->si_code = SEGV_BNDERR;
- info->si_addr = mpx_get_addr_ref(&insn, regs);
+ info->si_addr = insn_get_addr_ref(&insn, regs);
/*
* We were not able to extract an address from the instruction,
* probably because there was something invalid in it.
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 9405ffc91502..066f3511d5f1 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Generic VM initialization for x86-64 NUMA setups.
* Copyright 2002,2003 Andi Kleen, SuSE Labs.
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index d805162e6045..34a2a3bfde9c 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* NUMA emulation
*/
diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h
index ad86ec91e640..86860f279662 100644
--- a/arch/x86/mm/numa_internal.h
+++ b/arch/x86/mm/numa_internal.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __X86_MM_NUMA_INTERNAL_H
#define __X86_MM_NUMA_INTERNAL_H
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index 5f169d5d76a8..a25588ad75ef 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* self test for change_page_attr.
*
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index dfb7d657cf43..3fe68483463c 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -1781,8 +1781,8 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
unsigned long start;
int ret;
- /* Nothing to do if the SME is not active */
- if (!sme_active())
+ /* Nothing to do if memory encryption is not active */
+ if (!mem_encrypt_active())
return 0;
/* Should not be working on unaligned addresses */
diff --git a/arch/x86/mm/pat_internal.h b/arch/x86/mm/pat_internal.h
index a739bfc40690..eeb5caeb089b 100644
--- a/arch/x86/mm/pat_internal.h
+++ b/arch/x86/mm/pat_internal.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __PAT_INTERNAL_H_
#define __PAT_INTERNAL_H_
diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c
index d76485b22824..fa16036fa592 100644
--- a/arch/x86/mm/pat_rbtree.c
+++ b/arch/x86/mm/pat_rbtree.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Handle caching attributes in page tables (PAT)
*
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index b372f3442bbf..17ebc5a978cc 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/mm.h>
#include <linux/gfp.h>
#include <asm/pgalloc.h>
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index b9bd5b8b14fa..6b9bf023a700 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
diff --git a/arch/x86/mm/physaddr.c b/arch/x86/mm/physaddr.c
index cfc3b9121ce4..7f9acb68324c 100644
--- a/arch/x86/mm/physaddr.c
+++ b/arch/x86/mm/physaddr.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/bootmem.h>
#include <linux/mmdebug.h>
#include <linux/export.h>
diff --git a/arch/x86/mm/physaddr.h b/arch/x86/mm/physaddr.h
index a3cd5a0c97b3..9f6419cafc32 100644
--- a/arch/x86/mm/physaddr.h
+++ b/arch/x86/mm/physaddr.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#include <asm/processor.h>
static inline int phys_addr_valid(resource_size_t addr)
diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c
index f65a33f505b6..adb3c5784dac 100644
--- a/arch/x86/mm/setup_nx.c
+++ b/arch/x86/mm/setup_nx.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/spinlock.h>
#include <linux/errno.h>
#include <linux/init.h>
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index 3ea20d61b523..dac07e4f5834 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* ACPI 3.0 based NUMA setup
* Copyright 2004 Andi Kleen, SuSE Labs.
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 49d9778376d7..3118392cdf75 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -30,6 +30,7 @@
atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
+
static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
u16 *new_asid, bool *need_flush)
{
@@ -80,10 +81,11 @@ void leave_mm(int cpu)
return;
/* Warn if we're not lazy. */
- WARN_ON(cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm)));
+ WARN_ON(!this_cpu_read(cpu_tlbstate.is_lazy));
switch_mm(NULL, &init_mm, NULL);
}
+EXPORT_SYMBOL_GPL(leave_mm);
void switch_mm(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk)
@@ -142,45 +144,24 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
__flush_tlb_all();
}
#endif
+ this_cpu_write(cpu_tlbstate.is_lazy, false);
if (real_prev == next) {
- VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
- next->context.ctx_id);
-
- if (cpumask_test_cpu(cpu, mm_cpumask(next))) {
- /*
- * There's nothing to do: we weren't lazy, and we
- * aren't changing our mm. We don't need to flush
- * anything, nor do we need to update CR3, CR4, or
- * LDTR.
- */
- return;
- }
-
- /* Resume remote flushes and then read tlb_gen. */
- cpumask_set_cpu(cpu, mm_cpumask(next));
- next_tlb_gen = atomic64_read(&next->context.tlb_gen);
-
- if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) <
- next_tlb_gen) {
- /*
- * Ideally, we'd have a flush_tlb() variant that
- * takes the known CR3 value as input. This would
- * be faster on Xen PV and on hypothetical CPUs
- * on which INVPCID is fast.
- */
- this_cpu_write(cpu_tlbstate.ctxs[prev_asid].tlb_gen,
- next_tlb_gen);
- write_cr3(build_cr3(next, prev_asid));
- trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH,
- TLB_FLUSH_ALL);
- }
+ VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
+ next->context.ctx_id);
/*
- * We just exited lazy mode, which means that CR4 and/or LDTR
- * may be stale. (Changes to the required CR4 and LDTR states
- * are not reflected in tlb_gen.)
+ * We don't currently support having a real mm loaded without
+ * our cpu set in mm_cpumask(). We have all the bookkeeping
+ * in place to figure out whether we would need to flush
+ * if our cpu were cleared in mm_cpumask(), but we don't
+ * currently use it.
*/
+ if (WARN_ON_ONCE(real_prev != &init_mm &&
+ !cpumask_test_cpu(cpu, mm_cpumask(next))))
+ cpumask_set_cpu(cpu, mm_cpumask(next));
+
+ return;
} else {
u16 new_asid;
bool need_flush;
@@ -199,10 +180,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
}
/* Stop remote flushes for the previous mm */
- if (cpumask_test_cpu(cpu, mm_cpumask(real_prev)))
- cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
-
- VM_WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));
+ VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
+ real_prev != &init_mm);
+ cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
/*
* Start remote flushes and then read tlb_gen.
@@ -216,12 +196,22 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
write_cr3(build_cr3(next, new_asid));
- trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH,
- TLB_FLUSH_ALL);
+
+ /*
+ * NB: This gets called via leave_mm() in the idle path
+ * where RCU functions differently. Tracing normally
+ * uses RCU, so we need to use the _rcuidle variant.
+ *
+ * (There is no good reason for this. The idle code should
+ * be rearranged to call this before rcu_idle_enter().)
+ */
+ trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
} else {
/* The new ASID is already up to date. */
write_cr3(build_cr3_noflush(next, new_asid));
- trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0);
+
+ /* See above wrt _rcuidle. */
+ trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
}
this_cpu_write(cpu_tlbstate.loaded_mm, next);
@@ -233,6 +223,40 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
}
/*
+ * Please ignore the name of this function. It should be called
+ * switch_to_kernel_thread().
+ *
+ * enter_lazy_tlb() is a hint from the scheduler that we are entering a
+ * kernel thread or other context without an mm. Acceptable implementations
+ * include doing nothing whatsoever, switching to init_mm, or various clever
+ * lazy tricks to try to minimize TLB flushes.
+ *
+ * The scheduler reserves the right to call enter_lazy_tlb() several times
+ * in a row. It will notify us that we're going back to a real mm by
+ * calling switch_mm_irqs_off().
+ */
+void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
+{
+	/* init_mm is already loaded: there is nothing to be lazy about. */
+	if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
+		return;
+
+	/* Not deferring: switch over to init_mm right away. */
+	if (!tlb_defer_switch_to_init_mm()) {
+		switch_mm(NULL, &init_mm, NULL);
+		return;
+	}
+
+	/*
+	 * Deferring: keep the current mm loaded and only record that this
+	 * CPU is lazy.
+	 *
+	 * There's a significant optimization that may be possible here.
+	 * We have accurate enough TLB flush tracking that we don't need to
+	 * maintain coherence of TLB per se when we're lazy. We do, however,
+	 * need to maintain coherence of paging-structure caches. We could,
+	 * in principle, leave our old mm loaded and only switch to init_mm
+	 * when tlb_remove_page() happens.
+	 */
+	this_cpu_write(cpu_tlbstate.is_lazy, true);
+}
+
+/*
* Call this when reinitializing a CPU. It fixes the following potential
* problems:
*
@@ -303,16 +327,20 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
/* This code cannot presently handle being reentered. */
VM_WARN_ON(!irqs_disabled());
+ if (unlikely(loaded_mm == &init_mm))
+ return;
+
VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
loaded_mm->context.ctx_id);
- if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))) {
+ if (this_cpu_read(cpu_tlbstate.is_lazy)) {
/*
- * We're in lazy mode -- don't flush. We can get here on
- * remote flushes due to races and on local flushes if a
- * kernel thread coincidentally flushes the mm it's lazily
- * still using.
+ * We're in lazy mode. We need to at least flush our
+ * paging-structure cache to avoid speculatively reading
+ * garbage into our TLB. Since switching to init_mm is barely
+ * slower than a minimal flush, just switch to init_mm.
*/
+ switch_mm_irqs_off(NULL, &init_mm, NULL);
return;
}