Diffstat (limited to 'arch/x86/mm/pat/set_memory.c')
-rw-r--r--  arch/x86/mm/pat/set_memory.c | 672
1 file changed, 553 insertions(+), 119 deletions(-)
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index b4072115c8ef..6c6eb486f7a6 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -9,6 +9,7 @@
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
#include <linux/debugfs.h>
#include <linux/pfn.h>
#include <linux/percpu.h>
@@ -19,6 +20,8 @@
#include <linux/vmstat.h>
#include <linux/kernel.h>
#include <linux/cc_platform.h>
+#include <linux/set_memory.h>
+#include <linux/memregion.h>
#include <asm/e820/api.h>
#include <asm/processor.h>
@@ -29,9 +32,6 @@
#include <asm/pgalloc.h>
#include <asm/proto.h>
#include <asm/memtype.h>
-#include <asm/set_memory.h>
-#include <asm/hyperv-tlfs.h>
-#include <asm/mshyperv.h>
#include "../mm_internal.h"
@@ -73,6 +73,7 @@ static DEFINE_SPINLOCK(cpa_lock);
#define CPA_ARRAY 2
#define CPA_PAGES_ARRAY 4
#define CPA_NO_CHECK_ALIAS 8 /* Do not search for aliases */
+#define CPA_COLLAPSE 16 /* try to collapse large pages */
static inline pgprot_t cachemode2pgprot(enum page_cache_mode pcm)
{
@@ -105,6 +106,18 @@ static void split_page_count(int level)
direct_pages_count[level - 1] += PTRS_PER_PTE;
}
+static void collapse_page_count(int level)
+{
+ direct_pages_count[level]++;
+ if (system_state == SYSTEM_RUNNING) {
+ if (level == PG_LEVEL_2M)
+ count_vm_event(DIRECT_MAP_LEVEL2_COLLAPSE);
+ else if (level == PG_LEVEL_1G)
+ count_vm_event(DIRECT_MAP_LEVEL3_COLLAPSE);
+ }
+ direct_pages_count[level - 1] -= PTRS_PER_PTE;
+}
+
void arch_report_meminfo(struct seq_file *m)
{
seq_printf(m, "DirectMap4k: %8lu kB\n",
@@ -122,6 +135,7 @@ void arch_report_meminfo(struct seq_file *m)
}
#else
static inline void split_page_count(int level) { }
+static inline void collapse_page_count(int level) { }
#endif
#ifdef CONFIG_X86_CPA_STATISTICS
@@ -211,13 +225,30 @@ within(unsigned long addr, unsigned long start, unsigned long end)
return addr >= start && addr < end;
}
+#ifdef CONFIG_X86_64
+
static inline int
within_inclusive(unsigned long addr, unsigned long start, unsigned long end)
{
return addr >= start && addr <= end;
}
-#ifdef CONFIG_X86_64
+/*
+ * The kernel image is mapped into two places in the virtual address space
+ * (addresses without KASLR, of course):
+ *
+ * 1. The kernel direct map (0xffff880000000000)
+ * 2. The "high kernel map" (0xffffffff81000000)
+ *
+ * We actually execute out of #2. If we get the address of a kernel symbol, it
+ * points to #2, but almost all physical-to-virtual translations point to #1.
+ *
+ * This is so that we can have both a directmap of all physical memory *and*
+ * take full advantage of the limited (s32) immediate addressing range (2G)
+ * of x86_64.
+ *
+ * See Documentation/arch/x86/x86_64/mm.rst for more detail.
+ */
static inline unsigned long highmap_start_pfn(void)
{
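As a hedged illustration (not part of the patch), the two aliases described in the comment above can be derived for a kernel symbol roughly as follows; this is a sketch, not code from this file:

	/* Illustrative only: two virtual aliases of the same physical text page. */
	unsigned long high_va   = (unsigned long)_text;                    /* #2: high kernel map */
	unsigned long direct_va = (unsigned long)__va(__pa_symbol(_text)); /* #1: kernel direct map */
	/* Both map the same pfn; only high_va is reachable with s32 immediates. */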
@@ -330,6 +361,23 @@ void arch_invalidate_pmem(void *addr, size_t size)
EXPORT_SYMBOL_GPL(arch_invalidate_pmem);
#endif
+#ifdef CONFIG_ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION
+bool cpu_cache_has_invalidate_memregion(void)
+{
+ return !cpu_feature_enabled(X86_FEATURE_HYPERVISOR);
+}
+EXPORT_SYMBOL_NS_GPL(cpu_cache_has_invalidate_memregion, "DEVMEM");
+
+int cpu_cache_invalidate_memregion(phys_addr_t start, size_t len)
+{
+ if (WARN_ON_ONCE(!cpu_cache_has_invalidate_memregion()))
+ return -ENXIO;
+ wbinvd_on_all_cpus();
+ return 0;
+}
+EXPORT_SYMBOL_NS_GPL(cpu_cache_invalidate_memregion, "DEVMEM");
+#endif
+
static void __cpa_flush_all(void *arg)
{
unsigned long cache = (unsigned long)arg;
@@ -351,34 +399,61 @@ static void cpa_flush_all(unsigned long cache)
on_each_cpu(__cpa_flush_all, (void *) cache, 1);
}
-static void __cpa_flush_tlb(void *data)
+static int collapse_large_pages(unsigned long addr, struct list_head *pgtables);
+
+static void cpa_collapse_large_pages(struct cpa_data *cpa)
{
- struct cpa_data *cpa = data;
- unsigned int i;
+ unsigned long start, addr, end;
+ struct ptdesc *ptdesc, *tmp;
+ LIST_HEAD(pgtables);
+ int collapsed = 0;
+ int i;
+
+ if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
+ for (i = 0; i < cpa->numpages; i++)
+ collapsed += collapse_large_pages(__cpa_addr(cpa, i),
+ &pgtables);
+ } else {
+ addr = __cpa_addr(cpa, 0);
+ start = addr & PMD_MASK;
+ end = addr + PAGE_SIZE * cpa->numpages;
+
+ for (addr = start; within(addr, start, end); addr += PMD_SIZE)
+ collapsed += collapse_large_pages(addr, &pgtables);
+ }
- for (i = 0; i < cpa->numpages; i++)
- flush_tlb_one_kernel(fix_addr(__cpa_addr(cpa, i)));
+ if (!collapsed)
+ return;
+
+ flush_tlb_all();
+
+ list_for_each_entry_safe(ptdesc, tmp, &pgtables, pt_list) {
+ list_del(&ptdesc->pt_list);
+ pagetable_free(ptdesc);
+ }
}
-static void cpa_flush(struct cpa_data *data, int cache)
+static void cpa_flush(struct cpa_data *cpa, int cache)
{
- struct cpa_data *cpa = data;
+ unsigned long start, end;
unsigned int i;
BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) {
cpa_flush_all(cache);
- return;
+ goto collapse_large_pages;
}
- if (cpa->force_flush_all || cpa->numpages > tlb_single_page_flush_ceiling)
- flush_tlb_all();
- else
- on_each_cpu(__cpa_flush_tlb, cpa, 1);
+ start = fix_addr(__cpa_addr(cpa, 0));
+ end = start + cpa->numpages * PAGE_SIZE;
+ if (cpa->force_flush_all)
+ end = TLB_FLUSH_ALL;
+
+ flush_tlb_kernel_range(start, end);
if (!cache)
- return;
+ goto collapse_large_pages;
mb();
for (i = 0; i < cpa->numpages; i++) {
@@ -394,6 +469,10 @@ static void cpa_flush(struct cpa_data *data, int cache)
clflush_cache_range_opt((void *)fix_addr(addr), PAGE_SIZE);
}
mb();
+
+collapse_large_pages:
+ if (cpa->flags & CPA_COLLAPSE)
+ cpa_collapse_large_pages(cpa);
}
static bool overlaps(unsigned long r1_start, unsigned long r1_end,
@@ -580,57 +659,128 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long start,
}
/*
+ * Validate strict W^X semantics.
+ */
+static inline pgprot_t verify_rwx(pgprot_t old, pgprot_t new, unsigned long start,
+ unsigned long pfn, unsigned long npg,
+ bool nx, bool rw)
+{
+ unsigned long end;
+
+ /*
+ * 32-bit has some unfixable W+X issues, like EFI code
+ * and writeable data being in the same page. Disable
+ * detection and enforcement there.
+ */
+ if (IS_ENABLED(CONFIG_X86_32))
+ return new;
+
+ /* Only verify when NX is supported: */
+ if (!(__supported_pte_mask & _PAGE_NX))
+ return new;
+
+ if (!((pgprot_val(old) ^ pgprot_val(new)) & (_PAGE_RW | _PAGE_NX)))
+ return new;
+
+ if ((pgprot_val(new) & (_PAGE_RW | _PAGE_NX)) != _PAGE_RW)
+ return new;
+
+ /* Non-leaf translation entries can disable writing or execution. */
+ if (!rw || nx)
+ return new;
+
+ end = start + npg * PAGE_SIZE - 1;
+ WARN_ONCE(1, "CPA detected W^X violation: %016llx -> %016llx range: 0x%016lx - 0x%016lx PFN %lx\n",
+ (unsigned long long)pgprot_val(old),
+ (unsigned long long)pgprot_val(new),
+ start, end, pfn);
+
+ /*
+ * For now, allow all permission change attempts by returning the
+ * attempted permissions. This can 'return old' to actively
+ * refuse the permission change at a later time.
+ */
+ return new;
+}
+
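A hedged illustration of what the new check catches (not part of the patch; 'text_addr' is a placeholder for an address in an executable mapping):

	/*
	 * Illustrative only: the old mapping is RO+X, the requested one would be
	 * RW+X, so verify_rwx() emits the "CPA detected W^X violation" warning
	 * (and, for now, still applies the change).
	 */
	set_memory_rw(text_addr, 1);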
+/*
* Lookup the page table entry for a virtual address in a specific pgd.
- * Return a pointer to the entry and the level of the mapping.
+ * Return a pointer to the entry (or NULL if the entry does not exist),
+ * the level of the entry, and the effective NX and RW bits of all
+ * page table levels.
*/
-pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
- unsigned int *level)
+pte_t *lookup_address_in_pgd_attr(pgd_t *pgd, unsigned long address,
+ unsigned int *level, bool *nx, bool *rw)
{
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
- *level = PG_LEVEL_NONE;
+ *level = PG_LEVEL_256T;
+ *nx = false;
+ *rw = true;
if (pgd_none(*pgd))
return NULL;
+ *level = PG_LEVEL_512G;
+ *nx |= pgd_flags(*pgd) & _PAGE_NX;
+ *rw &= pgd_flags(*pgd) & _PAGE_RW;
+
p4d = p4d_offset(pgd, address);
if (p4d_none(*p4d))
return NULL;
- *level = PG_LEVEL_512G;
- if (p4d_large(*p4d) || !p4d_present(*p4d))
+ if (p4d_leaf(*p4d) || !p4d_present(*p4d))
return (pte_t *)p4d;
+ *level = PG_LEVEL_1G;
+ *nx |= p4d_flags(*p4d) & _PAGE_NX;
+ *rw &= p4d_flags(*p4d) & _PAGE_RW;
+
pud = pud_offset(p4d, address);
if (pud_none(*pud))
return NULL;
- *level = PG_LEVEL_1G;
- if (pud_large(*pud) || !pud_present(*pud))
+ if (pud_leaf(*pud) || !pud_present(*pud))
return (pte_t *)pud;
+ *level = PG_LEVEL_2M;
+ *nx |= pud_flags(*pud) & _PAGE_NX;
+ *rw &= pud_flags(*pud) & _PAGE_RW;
+
pmd = pmd_offset(pud, address);
if (pmd_none(*pmd))
return NULL;
- *level = PG_LEVEL_2M;
- if (pmd_large(*pmd) || !pmd_present(*pmd))
+ if (pmd_leaf(*pmd) || !pmd_present(*pmd))
return (pte_t *)pmd;
*level = PG_LEVEL_4K;
+ *nx |= pmd_flags(*pmd) & _PAGE_NX;
+ *rw &= pmd_flags(*pmd) & _PAGE_RW;
return pte_offset_kernel(pmd, address);
}
/*
+ * Lookup the page table entry for a virtual address in a specific pgd.
+ * Return a pointer to the entry and the level of the mapping.
+ */
+pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
+ unsigned int *level)
+{
+ bool nx, rw;
+
+ return lookup_address_in_pgd_attr(pgd, address, level, &nx, &rw);
+}
+
+/*
* Lookup the page table entry for a virtual address. Return a pointer
* to the entry and the level of the mapping.
*
- * Note: We return pud and pmd either when the entry is marked large
- * or when the present bit is not set. Otherwise we would return a
- * pointer to a nonexisting mapping.
+ * Note: the function returns p4d, pud or pmd either when the entry is marked
+ * large or when the present bit is not set. Otherwise it returns NULL.
*/
pte_t *lookup_address(unsigned long address, unsigned int *level)
{
@@ -638,25 +788,17 @@ pte_t *lookup_address(unsigned long address, unsigned int *level)
}
EXPORT_SYMBOL_GPL(lookup_address);
-/*
- * Lookup the page table entry for a virtual address in a given mm. Return a
- * pointer to the entry and the level of the mapping.
- */
-pte_t *lookup_address_in_mm(struct mm_struct *mm, unsigned long address,
- unsigned int *level)
-{
- return lookup_address_in_pgd(pgd_offset(mm, address), address, level);
-}
-EXPORT_SYMBOL_GPL(lookup_address_in_mm);
-
static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
- unsigned int *level)
+ unsigned int *level, bool *nx, bool *rw)
{
- if (cpa->pgd)
- return lookup_address_in_pgd(cpa->pgd + pgd_index(address),
- address, level);
+ pgd_t *pgd;
- return lookup_address(address, level);
+ if (!cpa->pgd)
+ pgd = pgd_offset_k(address);
+ else
+ pgd = cpa->pgd + pgd_index(address);
+
+ return lookup_address_in_pgd_attr(pgd, address, level, nx, rw);
}
/*
@@ -674,11 +816,11 @@ pmd_t *lookup_pmd_address(unsigned long address)
return NULL;
p4d = p4d_offset(pgd, address);
- if (p4d_none(*p4d) || p4d_large(*p4d) || !p4d_present(*p4d))
+ if (p4d_none(*p4d) || p4d_leaf(*p4d) || !p4d_present(*p4d))
return NULL;
pud = pud_offset(p4d, address);
- if (pud_none(*pud) || pud_large(*pud) || !pud_present(*pud))
+ if (pud_none(*pud) || pud_leaf(*pud) || !pud_present(*pud))
return NULL;
return pmd_offset(pud, address);
@@ -690,10 +832,14 @@ pmd_t *lookup_pmd_address(unsigned long address)
* areas on 32-bit NUMA systems. The percpu areas can
* end up in this kind of memory, for instance.
*
- * This could be optimized, but it is only intended to be
- * used at initialization time, and keeping it
- * unoptimized should increase the testing coverage for
- * the more obscure platforms.
+ * Note that as long as the PTEs are well-formed with correct PFNs, this
+ * works without checking the PRESENT bit in the leaf PTE. This is unlike
+ * the similar vmalloc_to_page() and derivatives. Callers may depend on
+ * this behavior.
+ *
+ * This could be optimized, but it is only used in paths that are not perf
+ * sensitive, and keeping it unoptimized should increase the testing coverage
+ * for the more obscure platforms.
*/
phys_addr_t slow_virt_to_phys(void *__virt_addr)
{
@@ -714,11 +860,11 @@ phys_addr_t slow_virt_to_phys(void *__virt_addr)
switch (level) {
case PG_LEVEL_1G:
phys_addr = (phys_addr_t)pud_pfn(*(pud_t *)pte) << PAGE_SHIFT;
- offset = virt_addr & ~PUD_PAGE_MASK;
+ offset = virt_addr & ~PUD_MASK;
break;
case PG_LEVEL_2M:
phys_addr = (phys_addr_t)pmd_pfn(*(pmd_t *)pte) << PAGE_SHIFT;
- offset = virt_addr & ~PMD_PAGE_MASK;
+ offset = virt_addr & ~PMD_MASK;
break;
default:
phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT;
@@ -737,7 +883,7 @@ static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
/* change init_mm */
set_pte_atomic(kpte, pte);
#ifdef CONFIG_X86_32
- if (!SHARED_KERNEL_PMD) {
+ {
struct page *page;
list_for_each_entry(page, &pgd_list, lru) {
@@ -780,12 +926,13 @@ static int __should_split_large_page(pte_t *kpte, unsigned long address,
pgprot_t old_prot, new_prot, req_prot, chk_prot;
pte_t new_pte, *tmp;
enum pg_level level;
+ bool nx, rw;
/*
* Check for races, another CPU might have split this page
* up already:
*/
- tmp = _lookup_address_cpa(cpa, address, &level);
+ tmp = _lookup_address_cpa(cpa, address, &level, &nx, &rw);
if (tmp != kpte)
return 1;
@@ -896,6 +1043,9 @@ static int __should_split_large_page(pte_t *kpte, unsigned long address,
new_prot = static_protections(req_prot, lpaddr, old_pfn, numpages,
psize, CPA_DETECT);
+ new_prot = verify_rwx(old_prot, new_prot, lpaddr, old_pfn, numpages,
+ nx, rw);
+
/*
* If there is a conflict, split the large page.
*
@@ -975,6 +1125,7 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
pte_t *pbase = (pte_t *)page_address(base);
unsigned int i, level;
pgprot_t ref_prot;
+ bool nx, rw;
pte_t *tmp;
spin_lock(&pgd_lock);
@@ -982,7 +1133,7 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
* Check for races, another CPU might have split this page
* up for us already:
*/
- tmp = _lookup_address_cpa(cpa, address, &level);
+ tmp = _lookup_address_cpa(cpa, address, &level, &nx, &rw);
if (tmp != kpte) {
spin_unlock(&pgd_lock);
return 1;
@@ -1006,13 +1157,13 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
case PG_LEVEL_1G:
ref_prot = pud_pgprot(*(pud_t *)kpte);
ref_pfn = pud_pfn(*(pud_t *)kpte);
- pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
+ pfninc = PMD_SIZE >> PAGE_SHIFT;
lpaddr = address & PUD_MASK;
lpinc = PMD_SIZE;
/*
* Clear the PSE flags if the PRESENT flag is not set
- * otherwise pmd_present/pmd_huge will return true
- * even on a non present pmd.
+ * otherwise pmd_present() will return true even on a non
+ * present pmd.
*/
if (!(pgprot_val(ref_prot) & _PAGE_PRESENT))
pgprot_val(ref_prot) &= ~_PAGE_PSE;
@@ -1091,6 +1242,164 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
return 0;
}
+static int collapse_pmd_page(pmd_t *pmd, unsigned long addr,
+ struct list_head *pgtables)
+{
+ pmd_t _pmd, old_pmd;
+ pte_t *pte, first;
+ unsigned long pfn;
+ pgprot_t pgprot;
+ int i = 0;
+
+ if (!cpu_feature_enabled(X86_FEATURE_PSE))
+ return 0;
+
+ addr &= PMD_MASK;
+ pte = pte_offset_kernel(pmd, addr);
+ first = *pte;
+ pfn = pte_pfn(first);
+
+ /* Make sure alignment is suitable */
+ if (PFN_PHYS(pfn) & ~PMD_MASK)
+ return 0;
+
+ /* The page is 4k intentionally */
+ if (pte_flags(first) & _PAGE_KERNEL_4K)
+ return 0;
+
+ /* Check that the rest of PTEs are compatible with the first one */
+ for (i = 1, pte++; i < PTRS_PER_PTE; i++, pte++) {
+ pte_t entry = *pte;
+
+ if (!pte_present(entry))
+ return 0;
+ if (pte_flags(entry) != pte_flags(first))
+ return 0;
+ if (pte_pfn(entry) != pte_pfn(first) + i)
+ return 0;
+ }
+
+ old_pmd = *pmd;
+
+ /* Success: set up a large page */
+ pgprot = pgprot_4k_2_large(pte_pgprot(first));
+ pgprot_val(pgprot) |= _PAGE_PSE;
+ _pmd = pfn_pmd(pfn, pgprot);
+ set_pmd(pmd, _pmd);
+
+ /* Queue the page table to be freed after TLB flush */
+ list_add(&page_ptdesc(pmd_page(old_pmd))->pt_list, pgtables);
+
+ if (IS_ENABLED(CONFIG_X86_32)) {
+ struct page *page;
+
+ /* Update all PGD tables to use the same large page */
+ list_for_each_entry(page, &pgd_list, lru) {
+ pgd_t *pgd = (pgd_t *)page_address(page) + pgd_index(addr);
+ p4d_t *p4d = p4d_offset(pgd, addr);
+ pud_t *pud = pud_offset(p4d, addr);
+ pmd_t *pmd = pmd_offset(pud, addr);
+ /* Something is wrong if entries don't match */
+ if (WARN_ON(pmd_val(old_pmd) != pmd_val(*pmd)))
+ continue;
+ set_pmd(pmd, _pmd);
+ }
+ }
+
+ if (virt_addr_valid(addr) && pfn_range_is_mapped(pfn, pfn + 1))
+ collapse_page_count(PG_LEVEL_2M);
+
+ return 1;
+}
+
+static int collapse_pud_page(pud_t *pud, unsigned long addr,
+ struct list_head *pgtables)
+{
+ unsigned long pfn;
+ pmd_t *pmd, first;
+ int i;
+
+ if (!direct_gbpages)
+ return 0;
+
+ addr &= PUD_MASK;
+ pmd = pmd_offset(pud, addr);
+ first = *pmd;
+
+ /*
+ * To restore the PUD page, all PMD entries must be large and
+ * have suitable alignment.
+ */
+ pfn = pmd_pfn(first);
+ if (!pmd_leaf(first) || (PFN_PHYS(pfn) & ~PUD_MASK))
+ return 0;
+
+ /*
+ * To restore the PUD page, all following PMDs must be compatible with the
+ * first one.
+ */
+ for (i = 1, pmd++; i < PTRS_PER_PMD; i++, pmd++) {
+ pmd_t entry = *pmd;
+
+ if (!pmd_present(entry) || !pmd_leaf(entry))
+ return 0;
+ if (pmd_flags(entry) != pmd_flags(first))
+ return 0;
+ if (pmd_pfn(entry) != pmd_pfn(first) + i * PTRS_PER_PTE)
+ return 0;
+ }
+
+ /* Restore PUD page and queue page table to be freed after TLB flush */
+ list_add(&page_ptdesc(pud_page(*pud))->pt_list, pgtables);
+ set_pud(pud, pfn_pud(pfn, pmd_pgprot(first)));
+
+ if (virt_addr_valid(addr) && pfn_range_is_mapped(pfn, pfn + 1))
+ collapse_page_count(PG_LEVEL_1G);
+
+ return 1;
+}
+
+/*
+ * Collapse PMD and PUD pages in the kernel mapping around the address where
+ * possible.
+ *
+ * Caller must flush TLB and free page tables queued on the list before
+ * touching the new entries. CPU must not see TLB entries of different size
+ * with different attributes.
+ */
+static int collapse_large_pages(unsigned long addr, struct list_head *pgtables)
+{
+ int collapsed = 0;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ addr &= PMD_MASK;
+
+ spin_lock(&pgd_lock);
+ pgd = pgd_offset_k(addr);
+ if (pgd_none(*pgd))
+ goto out;
+ p4d = p4d_offset(pgd, addr);
+ if (p4d_none(*p4d))
+ goto out;
+ pud = pud_offset(p4d, addr);
+ if (!pud_present(*pud) || pud_leaf(*pud))
+ goto out;
+ pmd = pmd_offset(pud, addr);
+ if (!pmd_present(*pmd) || pmd_leaf(*pmd))
+ goto out;
+
+ collapsed = collapse_pmd_page(pmd, addr, pgtables);
+ if (collapsed)
+ collapsed += collapse_pud_page(pud, addr, pgtables);
+
+out:
+ spin_unlock(&pgd_lock);
+ return collapsed;
+}
+
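A hedged sketch of the caller contract spelled out above; cpa_collapse_large_pages() earlier in this patch follows the same order ('addr' is a placeholder):

	struct ptdesc *ptdesc, *tmp;
	LIST_HEAD(pgtables);

	/* Illustrative only: collapse, then flush the TLB, then free the tables. */
	if (collapse_large_pages(addr, &pgtables)) {
		flush_tlb_all();
		list_for_each_entry_safe(ptdesc, tmp, &pgtables, pt_list) {
			list_del(&ptdesc->pt_list);
			pagetable_free(ptdesc);
		}
	}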
static bool try_to_free_pte_page(pte_t *pte)
{
int i;
@@ -1162,7 +1471,7 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
* Try to unmap in 2M chunks.
*/
while (end - start >= PMD_SIZE) {
- if (pmd_large(*pmd))
+ if (pmd_leaf(*pmd))
pmd_clear(pmd);
else
__unmap_pmd_range(pud, pmd, start, start + PMD_SIZE);
@@ -1207,7 +1516,7 @@ static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end)
*/
while (end - start >= PUD_SIZE) {
- if (pud_large(*pud))
+ if (pud_leaf(*pud))
pud_clear(pud);
else
unmap_pmd_range(pud, start, start + PUD_SIZE);
@@ -1523,10 +1832,11 @@ static int __change_page_attr(struct cpa_data *cpa, int primary)
int do_split, err;
unsigned int level;
pte_t *kpte, old_pte;
+ bool nx, rw;
address = __cpa_addr(cpa, cpa->curpage);
repeat:
- kpte = _lookup_address_cpa(cpa, address, &level);
+ kpte = _lookup_address_cpa(cpa, address, &level, &nx, &rw);
if (!kpte)
return __cpa_process_fault(cpa, address, primary);
@@ -1536,6 +1846,7 @@ repeat:
if (level == PG_LEVEL_4K) {
pte_t new_pte;
+ pgprot_t old_prot = pte_pgprot(old_pte);
pgprot_t new_prot = pte_pgprot(old_pte);
unsigned long pfn = pte_pfn(old_pte);
@@ -1547,11 +1858,14 @@ repeat:
new_prot = static_protections(new_prot, address, pfn, 1, 0,
CPA_PROTECT);
+ new_prot = verify_rwx(old_prot, new_prot, address, pfn, 1,
+ nx, rw);
+
new_prot = pgprot_clear_protnone_bits(new_prot);
/*
* We need to keep the pfn from the existing PTE,
- * after all we're only going to change it's attributes
+ * after all we're only going to change its attributes
* not the memory it points to
*/
new_pte = pfn_pte(pfn, new_prot);
@@ -1590,8 +1904,11 @@ repeat:
return err;
}
-static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias);
+static int __change_page_attr_set_clr(struct cpa_data *cpa, int primary);
+/*
+ * Check the directmap and "high kernel map" 'aliases'.
+ */
static int cpa_process_alias(struct cpa_data *cpa)
{
struct cpa_data alias_cpa;
@@ -1615,6 +1932,12 @@ static int cpa_process_alias(struct cpa_data *cpa)
alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
alias_cpa.curpage = 0;
+ /* Directmap always has NX set, do not modify. */
+ if (__supported_pte_mask & _PAGE_NX) {
+ alias_cpa.mask_clr.pgprot &= ~_PAGE_NX;
+ alias_cpa.mask_set.pgprot &= ~_PAGE_NX;
+ }
+
cpa->force_flush_all = 1;
ret = __change_page_attr_set_clr(&alias_cpa, 0);
@@ -1637,6 +1960,15 @@ static int cpa_process_alias(struct cpa_data *cpa)
alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
alias_cpa.curpage = 0;
+ /*
+ * [_text, _brk_end) also covers data, do not modify NX except
+ * in cases where the highmap is the primary target.
+ */
+ if (__supported_pte_mask & _PAGE_NX) {
+ alias_cpa.mask_clr.pgprot &= ~_PAGE_NX;
+ alias_cpa.mask_set.pgprot &= ~_PAGE_NX;
+ }
+
cpa->force_flush_all = 1;
/*
* The high mapping range is imprecise, so ignore the
@@ -1649,12 +1981,19 @@ static int cpa_process_alias(struct cpa_data *cpa)
return 0;
}
-static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
+static int __change_page_attr_set_clr(struct cpa_data *cpa, int primary)
{
unsigned long numpages = cpa->numpages;
unsigned long rempages = numpages;
int ret = 0;
+ /*
+ * No changes, easy!
+ */
+ if (!(pgprot_val(cpa->mask_set) | pgprot_val(cpa->mask_clr)) &&
+ !cpa->force_split)
+ return ret;
+
while (rempages) {
/*
* Store the remaining nr of pages for the large page
@@ -1667,13 +2006,13 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
if (!debug_pagealloc_enabled())
spin_lock(&cpa_lock);
- ret = __change_page_attr(cpa, checkalias);
+ ret = __change_page_attr(cpa, primary);
if (!debug_pagealloc_enabled())
spin_unlock(&cpa_lock);
if (ret)
goto out;
- if (checkalias) {
+ if (primary && !(cpa->flags & CPA_NO_CHECK_ALIAS)) {
ret = cpa_process_alias(cpa);
if (ret)
goto out;
@@ -1701,7 +2040,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
struct page **pages)
{
struct cpa_data cpa;
- int ret, cache, checkalias;
+ int ret, cache;
memset(&cpa, 0, sizeof(cpa));
@@ -1747,20 +2086,11 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
cpa.numpages = numpages;
cpa.mask_set = mask_set;
cpa.mask_clr = mask_clr;
- cpa.flags = 0;
+ cpa.flags = in_flag;
cpa.curpage = 0;
cpa.force_split = force_split;
- if (in_flag & (CPA_ARRAY | CPA_PAGES_ARRAY))
- cpa.flags |= in_flag;
-
- /* No alias checking for _NX bit modifications */
- checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
- /* Has caller explicitly disabled alias checking? */
- if (in_flag & CPA_NO_CHECK_ALIAS)
- checkalias = 0;
-
- ret = __change_page_attr_set_clr(&cpa, checkalias);
+ ret = __change_page_attr_set_clr(&cpa, 1);
/*
* Check whether we really changed something:
@@ -1816,7 +2146,7 @@ static inline int cpa_clear_pages_array(struct page **pages, int numpages,
}
/*
- * _set_memory_prot is an internal helper for callers that have been passed
+ * __set_memory_prot is an internal helper for callers that have been passed
* a pgprot_t value from upper layers and a reservation has already been taken.
* If you want to set the pgprot to a specific page protocol, use the
* set_memory_xx() functions.
@@ -1925,6 +2255,47 @@ int set_memory_wb(unsigned long addr, int numpages)
}
EXPORT_SYMBOL(set_memory_wb);
+/* Prevent speculative access to a page by marking it not-present */
+#ifdef CONFIG_X86_64
+int set_mce_nospec(unsigned long pfn)
+{
+ unsigned long decoy_addr;
+ int rc;
+
+ /* SGX pages are not in the 1:1 map */
+ if (arch_is_platform_page(pfn << PAGE_SHIFT))
+ return 0;
+ /*
+ * We would like to just call:
+ * set_memory_XX((unsigned long)pfn_to_kaddr(pfn), 1);
+ * but doing that would radically increase the odds of a
+ * speculative access to the poison page because we'd have
+ * the virtual address of the kernel 1:1 mapping sitting
+ * around in registers.
+ * Instead we get tricky. We create a non-canonical address
+ * that looks just like the one we want, but has bit 63 flipped.
+ * This relies on set_memory_XX() properly sanitizing any __pa()
+ * results with __PHYSICAL_MASK or PTE_PFN_MASK.
+ */
+ decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
+
+ rc = set_memory_np(decoy_addr, 1);
+ if (rc)
+ pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(set_mce_nospec);
+
+/* Restore full speculative operation to the pfn. */
+int clear_mce_nospec(unsigned long pfn)
+{
+ unsigned long addr = (unsigned long) pfn_to_kaddr(pfn);
+
+ return set_memory_p(addr, 1);
+}
+EXPORT_SYMBOL_GPL(clear_mce_nospec);
+#endif /* CONFIG_X86_64 */
+
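A hedged worked example of the decoy arithmetic above; the concrete numbers are illustrative and assume a 4-level PAGE_OFFSET of 0xffff888000000000:

	/*
	 * Illustrative only:
	 *   pfn                   = 0x12345 -> pfn << PAGE_SHIFT = 0x12345000
	 *   PAGE_OFFSET ^ BIT(63) = 0x7fff888000000000 (non-canonical)
	 *   decoy_addr            = 0x7fff888012345000
	 * The flipped bit is masked off again by __PHYSICAL_MASK/PTE_PFN_MASK, so
	 * the correct PTE is modified while the canonical 1:1 address never sits
	 * in a register.
	 */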
int set_memory_x(unsigned long addr, int numpages)
{
if (!(__supported_pte_mask & _PAGE_NX))
@@ -1943,7 +2314,18 @@ int set_memory_nx(unsigned long addr, int numpages)
int set_memory_ro(unsigned long addr, int numpages)
{
- return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0);
+ return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW | _PAGE_DIRTY), 0);
+}
+
+int set_memory_rox(unsigned long addr, int numpages)
+{
+ pgprot_t clr = __pgprot(_PAGE_RW | _PAGE_DIRTY);
+
+ if (__supported_pte_mask & _PAGE_NX)
+ clr.pgprot |= _PAGE_NX;
+
+ return change_page_attr_set_clr(&addr, numpages, __pgprot(0), clr, 0,
+ CPA_COLLAPSE, NULL);
}
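A hedged usage sketch (not from this patch) for sealing a region that was just written; 'text_area' and 'npages' are placeholder names:

	/*
	 * Illustrative only: make the region read-only and executable. With
	 * CPA_COLLAPSE set, the CPA code also tries to merge the underlying 4k
	 * mappings back into 2M/1G pages once the attributes are uniform.
	 */
	int err = set_memory_rox((unsigned long)text_area, npages);

	if (err)
		pr_warn("set_memory_rox() failed: %d\n", err);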
int set_memory_rw(unsigned long addr, int numpages)
@@ -1958,16 +2340,20 @@ int set_memory_np(unsigned long addr, int numpages)
int set_memory_np_noalias(unsigned long addr, int numpages)
{
- int cpa_flags = CPA_NO_CHECK_ALIAS;
-
return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
__pgprot(_PAGE_PRESENT), 0,
- cpa_flags, NULL);
+ CPA_NO_CHECK_ALIAS, NULL);
+}
+
+int set_memory_p(unsigned long addr, int numpages)
+{
+ return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_PRESENT), 0);
}
int set_memory_4k(unsigned long addr, int numpages)
{
- return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
+ return change_page_attr_set_clr(&addr, numpages,
+ __pgprot(_PAGE_KERNEL_4K),
__pgprot(0), 1, 0, NULL);
}
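Hedged note: ranges marked this way now carry _PAGE_KERNEL_4K, which collapse_pmd_page() above checks, so an intentional 4k split is never merged back. Illustrative call with placeholder arguments:

	/* Illustrative only: keep this range mapped with 4k pages permanently. */
	set_memory_4k((unsigned long)buf, nr_pages);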
@@ -1989,6 +2375,7 @@ int set_memory_global(unsigned long addr, int numpages)
*/
static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc)
{
+ pgprot_t empty = __pgprot(0);
struct cpa_data cpa;
int ret;
@@ -1999,18 +2386,22 @@ static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc)
memset(&cpa, 0, sizeof(cpa));
cpa.vaddr = &addr;
cpa.numpages = numpages;
- cpa.mask_set = enc ? __pgprot(_PAGE_ENC) : __pgprot(0);
- cpa.mask_clr = enc ? __pgprot(0) : __pgprot(_PAGE_ENC);
+ cpa.mask_set = enc ? pgprot_encrypted(empty) : pgprot_decrypted(empty);
+ cpa.mask_clr = enc ? pgprot_decrypted(empty) : pgprot_encrypted(empty);
cpa.pgd = init_mm.pgd;
/* Must avoid aliasing mappings in the highmem code */
kmap_flush_unused();
vm_unmap_aliases();
- /*
- * Before changing the encryption attribute, we need to flush caches.
- */
- cpa_flush(&cpa, !this_cpu_has(X86_FEATURE_SME_COHERENT));
+ /* Flush the caches as needed before changing the encryption attribute. */
+ if (x86_platform.guest.enc_tlb_flush_required(enc))
+ cpa_flush(&cpa, x86_platform.guest.enc_cache_flush_required());
+
+ /* Notify hypervisor that we are about to set/clr encryption attribute. */
+ ret = x86_platform.guest.enc_status_change_prepare(addr, numpages, enc);
+ if (ret)
+ goto vmm_fail;
ret = __change_page_attr_set_clr(&cpa, 1);
@@ -2023,24 +2414,65 @@ static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc)
*/
cpa_flush(&cpa, 0);
+ if (ret)
+ return ret;
+
+ /* Notify hypervisor that we have successfully set/clr encryption attribute. */
+ ret = x86_platform.guest.enc_status_change_finish(addr, numpages, enc);
+ if (ret)
+ goto vmm_fail;
+
+ return 0;
+
+vmm_fail:
+ WARN_ONCE(1, "CPA VMM failure to convert memory (addr=%p, numpages=%d) to %s: %d\n",
+ (void *)addr, numpages, enc ? "private" : "shared", ret);
+
+ return ret;
+}
+
+/*
+ * The lock serializes conversions between private and shared memory.
+ *
+ * It is taken for read on conversion. A write lock guarantees that no
+ * concurrent conversions are in progress.
+ */
+static DECLARE_RWSEM(mem_enc_lock);
+
+/*
+ * Stop new private<->shared conversions.
+ *
+ * Taking the exclusive mem_enc_lock waits for in-flight conversions to complete.
+ * The lock is not released to prevent new conversions from being started.
+ */
+bool set_memory_enc_stop_conversion(void)
+{
/*
- * Notify hypervisor that a given memory range is mapped encrypted
- * or decrypted.
+ * In a crash scenario, sleep is not allowed. Try to take the lock.
+ * Failure indicates that there is a race with the conversion.
*/
- notify_range_enc_status_changed(addr, numpages, enc);
+ if (oops_in_progress)
+ return down_write_trylock(&mem_enc_lock);
- return ret;
+ down_write(&mem_enc_lock);
+
+ return true;
}
static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
{
- if (hv_is_isolation_supported())
- return hv_set_mem_host_visibility(addr, numpages, !enc);
+ int ret = 0;
- if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
- return __set_memory_enc_pgtable(addr, numpages, enc);
+ if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) {
+ if (!down_read_trylock(&mem_enc_lock))
+ return -EBUSY;
- return 0;
+ ret = __set_memory_enc_pgtable(addr, numpages, enc);
+
+ up_read(&mem_enc_lock);
+ }
+
+ return ret;
}
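A hedged sketch (not part of the patch) of how a crash or kexec path might use the new helper to stabilize private/shared state before proceeding:

	/*
	 * Illustrative only: block further conversions. With oops_in_progress
	 * this degrades to a trylock and may legitimately fail.
	 */
	if (!set_memory_enc_stop_conversion())
		pr_warn("in-flight memory encryption conversion, proceeding anyway\n");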
int set_memory_encrypted(unsigned long addr, int numpages)
@@ -2121,12 +2553,6 @@ int set_pages_array_wc(struct page **pages, int numpages)
}
EXPORT_SYMBOL(set_pages_array_wc);
-int set_pages_array_wt(struct page **pages, int numpages)
-{
- return _set_pages_array(pages, numpages, _PAGE_CACHE_MODE_WT);
-}
-EXPORT_SYMBOL_GPL(set_pages_array_wt);
-
int set_pages_wb(struct page *page, int numpages)
{
unsigned long addr = (unsigned long)page_address(page);
@@ -2182,7 +2608,7 @@ static int __set_pages_p(struct page *page, int numpages)
.numpages = numpages,
.mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
.mask_clr = __pgprot(0),
- .flags = 0};
+ .flags = CPA_NO_CHECK_ALIAS };
/*
* No alias checking needed for setting present flag. Otherwise,
@@ -2190,7 +2616,7 @@ static int __set_pages_p(struct page *page, int numpages)
* mappings (this adds to complexity if we want to do this from
* atomic context especially). Let's keep it simple!
*/
- return __change_page_attr_set_clr(&cpa, 0);
+ return __change_page_attr_set_clr(&cpa, 1);
}
static int __set_pages_np(struct page *page, int numpages)
@@ -2200,8 +2626,8 @@ static int __set_pages_np(struct page *page, int numpages)
.pgd = NULL,
.numpages = numpages,
.mask_set = __pgprot(0),
- .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
- .flags = 0};
+ .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY),
+ .flags = CPA_NO_CHECK_ALIAS };
/*
* No alias checking needed for setting not present flag. Otherwise,
@@ -2209,7 +2635,7 @@ static int __set_pages_np(struct page *page, int numpages)
* mappings (this adds to complexity if we want to do this from
* atomic context especially). Let's keep it simple!
*/
- return __change_page_attr_set_clr(&cpa, 0);
+ return __change_page_attr_set_clr(&cpa, 1);
}
int set_direct_map_invalid_noflush(struct page *page)
@@ -2222,6 +2648,14 @@ int set_direct_map_default_noflush(struct page *page)
return __set_pages_p(page, 1);
}
+int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid)
+{
+ if (valid)
+ return __set_pages_p(page, nr);
+
+ return __set_pages_np(page, nr);
+}
+
#ifdef CONFIG_DEBUG_PAGEALLOC
void __kernel_map_pages(struct page *page, int numpages, int enable)
{
@@ -2279,8 +2713,8 @@ int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
.pgd = pgd,
.numpages = numpages,
.mask_set = __pgprot(0),
- .mask_clr = __pgprot(~page_flags & (_PAGE_NX|_PAGE_RW)),
- .flags = 0,
+ .mask_clr = __pgprot(~page_flags & (_PAGE_NX|_PAGE_RW|_PAGE_DIRTY)),
+ .flags = CPA_NO_CHECK_ALIAS,
};
WARN_ONCE(num_online_cpus() > 1, "Don't call after initializing SMP");
@@ -2293,7 +2727,7 @@ int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags);
- retval = __change_page_attr_set_clr(&cpa, 0);
+ retval = __change_page_attr_set_clr(&cpa, 1);
__flush_tlb_all();
out:
@@ -2313,7 +2747,7 @@ int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address,
/*
* The typical sequence for unmapping is to find a pte through
* lookup_address_in_pgd() (ideally, it should never return NULL because
- * the address is already mapped) and change it's protections. As pfn is
+ * the address is already mapped) and change its protections. As pfn is
* the *target* of a mapping, it's not useful while unmapping.
*/
struct cpa_data cpa = {
@@ -2322,13 +2756,13 @@ int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address,
.pgd = pgd,
.numpages = numpages,
.mask_set = __pgprot(0),
- .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
- .flags = 0,
+ .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY),
+ .flags = CPA_NO_CHECK_ALIAS,
};
WARN_ONCE(num_online_cpus() > 1, "Don't call after initializing SMP");
- retval = __change_page_attr_set_clr(&cpa, 0);
+ retval = __change_page_attr_set_clr(&cpa, 1);
__flush_tlb_all();
return retval;