diff options
Diffstat (limited to 'arch/x86/include/asm/pgtable.h')
| -rw-r--r-- | arch/x86/include/asm/pgtable.h | 726 |
1 files changed, 502 insertions, 224 deletions
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 76aa21e8128d..e33df3da6980 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -15,32 +15,35 @@ cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS))) \ : (prot)) -/* - * Macros to add or remove encryption attribute - */ -#define pgprot_encrypted(prot) __pgprot(__sme_set(pgprot_val(prot))) -#define pgprot_decrypted(prot) __pgprot(__sme_clr(pgprot_val(prot))) - -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ +#include <linux/spinlock.h> #include <asm/x86_init.h> -#include <asm/fpu/xstate.h> +#include <asm/pkru.h> #include <asm/fpu/api.h> +#include <asm/coco.h> #include <asm-generic/pgtable_uffd.h> +#include <linux/page_table_check.h> extern pgd_t early_top_pgt[PTRS_PER_PGD]; -int __init __early_make_pgtable(unsigned long address, pmdval_t pmd); +bool __init __early_make_pgtable(unsigned long address, pmdval_t pmd); +struct seq_file; void ptdump_walk_pgd_level(struct seq_file *m, struct mm_struct *mm); void ptdump_walk_pgd_level_debugfs(struct seq_file *m, struct mm_struct *mm, bool user); -void ptdump_walk_pgd_level_checkwx(void); +bool ptdump_walk_pgd_level_checkwx(void); +#define ptdump_check_wx ptdump_walk_pgd_level_checkwx void ptdump_walk_user_pgd_level_checkwx(void); +/* + * Macros to add or remove encryption attribute + */ +#define pgprot_encrypted(prot) __pgprot(cc_mkenc(pgprot_val(prot))) +#define pgprot_decrypted(prot) __pgprot(cc_mkdec(pgprot_val(prot))) + #ifdef CONFIG_DEBUG_WX -#define debug_checkwx() ptdump_walk_pgd_level_checkwx() #define debug_checkwx_user() ptdump_walk_user_pgd_level_checkwx() #else -#define debug_checkwx() do { } while (0) #define debug_checkwx_user() do { } while (0) #endif @@ -63,7 +66,6 @@ extern pmdval_t early_pmd_flags; #include <asm/paravirt.h> #else /* !CONFIG_PARAVIRT_XXL */ #define set_pte(ptep, pte) native_set_pte(ptep, pte) -#define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte) #define set_pte_atomic(ptep, pte) \ native_set_pte_atomic(ptep, pte) @@ -118,42 +120,47 @@ extern pmdval_t early_pmd_flags; #define arch_end_context_switch(prev) do {} while(0) #endif /* CONFIG_PARAVIRT_XXL */ -/* - * The following only work if pte_present() is true. - * Undefined behaviour if not.. - */ -static inline int pte_dirty(pte_t pte) +static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set) { - return pte_flags(pte) & _PAGE_DIRTY; + pmdval_t v = native_pmd_val(pmd); + + return native_make_pmd(v | set); } +static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear) +{ + pmdval_t v = native_pmd_val(pmd); -static inline u32 read_pkru(void) + return native_make_pmd(v & ~clear); +} + +static inline pud_t pud_set_flags(pud_t pud, pudval_t set) { - if (boot_cpu_has(X86_FEATURE_OSPKE)) - return rdpkru(); - return 0; + pudval_t v = native_pud_val(pud); + + return native_make_pud(v | set); } -static inline void write_pkru(u32 pkru) +static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear) { - struct pkru_state *pk; + pudval_t v = native_pud_val(pud); - if (!boot_cpu_has(X86_FEATURE_OSPKE)) - return; + return native_make_pud(v & ~clear); +} - pk = get_xsave_addr(¤t->thread.fpu.state.xsave, XFEATURE_PKRU); +/* + * The following only work if pte_present() is true. + * Undefined behaviour if not.. + */ +static inline bool pte_dirty(pte_t pte) +{ + return pte_flags(pte) & _PAGE_DIRTY_BITS; +} - /* - * The PKRU value in xstate needs to be in sync with the value that is - * written to the CPU. The FPU restore on return to userland would - * otherwise load the previous value again. - */ - fpregs_lock(); - if (pk) - pk->pkru = pkru; - __write_pkru(pkru); - fpregs_unlock(); +static inline bool pte_shstk(pte_t pte) +{ + return cpu_feature_enabled(X86_FEATURE_SHSTK) && + (pte_flags(pte) & (_PAGE_RW | _PAGE_DIRTY)) == _PAGE_DIRTY; } static inline int pte_young(pte_t pte) @@ -161,19 +168,33 @@ static inline int pte_young(pte_t pte) return pte_flags(pte) & _PAGE_ACCESSED; } -static inline int pmd_dirty(pmd_t pmd) +static inline bool pte_decrypted(pte_t pte) +{ + return cc_mkdec(pte_val(pte)) == pte_val(pte); +} + +#define pmd_dirty pmd_dirty +static inline bool pmd_dirty(pmd_t pmd) { - return pmd_flags(pmd) & _PAGE_DIRTY; + return pmd_flags(pmd) & _PAGE_DIRTY_BITS; } +static inline bool pmd_shstk(pmd_t pmd) +{ + return cpu_feature_enabled(X86_FEATURE_SHSTK) && + (pmd_flags(pmd) & (_PAGE_RW | _PAGE_DIRTY | _PAGE_PSE)) == + (_PAGE_DIRTY | _PAGE_PSE); +} + +#define pmd_young pmd_young static inline int pmd_young(pmd_t pmd) { return pmd_flags(pmd) & _PAGE_ACCESSED; } -static inline int pud_dirty(pud_t pud) +static inline bool pud_dirty(pud_t pud) { - return pud_flags(pud) & _PAGE_DIRTY; + return pud_flags(pud) & _PAGE_DIRTY_BITS; } static inline int pud_young(pud_t pud) @@ -181,9 +202,36 @@ static inline int pud_young(pud_t pud) return pud_flags(pud) & _PAGE_ACCESSED; } +static inline bool pud_shstk(pud_t pud) +{ + return cpu_feature_enabled(X86_FEATURE_SHSTK) && + (pud_flags(pud) & (_PAGE_RW | _PAGE_DIRTY | _PAGE_PSE)) == + (_PAGE_DIRTY | _PAGE_PSE); +} + static inline int pte_write(pte_t pte) { - return pte_flags(pte) & _PAGE_RW; + /* + * Shadow stack pages are logically writable, but do not have + * _PAGE_RW. Check for them separately from _PAGE_RW itself. + */ + return (pte_flags(pte) & _PAGE_RW) || pte_shstk(pte); +} + +#define pmd_write pmd_write +static inline int pmd_write(pmd_t pmd) +{ + /* + * Shadow stack pages are logically writable, but do not have + * _PAGE_RW. Check for them separately from _PAGE_RW itself. + */ + return (pmd_flags(pmd) & _PAGE_RW) || pmd_shstk(pmd); +} + +#define pud_write pud_write +static inline int pud_write(pud_t pud) +{ + return pud_flags(pud) & _PAGE_RW; } static inline int pte_huge(pte_t pte) @@ -210,6 +258,8 @@ static inline int pte_special(pte_t pte) static inline u64 protnone_mask(u64 val); +#define PFN_PTE_SHIFT PAGE_SHIFT + static inline unsigned long pte_pfn(pte_t pte) { phys_addr_t pfn = pte_val(pte); @@ -224,6 +274,7 @@ static inline unsigned long pmd_pfn(pmd_t pmd) return (pfn & pmd_pfn_mask(pmd)) >> PAGE_SHIFT; } +#define pud_pfn pud_pfn static inline unsigned long pud_pfn(pud_t pud) { phys_addr_t pfn = pud_val(pud); @@ -241,32 +292,24 @@ static inline unsigned long pgd_pfn(pgd_t pgd) return (pgd_val(pgd) & PTE_PFN_MASK) >> PAGE_SHIFT; } -#define p4d_leaf p4d_large -static inline int p4d_large(p4d_t p4d) -{ - /* No 512 GiB pages yet */ - return 0; -} - #define pte_page(pte) pfn_to_page(pte_pfn(pte)) -#define pmd_leaf pmd_large -static inline int pmd_large(pmd_t pte) +#define pmd_leaf pmd_leaf +static inline bool pmd_leaf(pmd_t pte) { return pmd_flags(pte) & _PAGE_PSE; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -/* NOTE: when predicate huge page, consider also pmd_devmap, or use pmd_large */ static inline int pmd_trans_huge(pmd_t pmd) { - return (pmd_val(pmd) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE; + return (pmd_val(pmd) & _PAGE_PSE) == _PAGE_PSE; } #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD static inline int pud_trans_huge(pud_t pud) { - return (pud_val(pud) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE; + return (pud_val(pud) & _PAGE_PSE) == _PAGE_PSE; } #endif @@ -276,29 +319,29 @@ static inline int has_transparent_hugepage(void) return boot_cpu_has(X86_FEATURE_PSE); } -#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP -static inline int pmd_devmap(pmd_t pmd) +#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP +static inline bool pmd_special(pmd_t pmd) { - return !!(pmd_val(pmd) & _PAGE_DEVMAP); + return pmd_flags(pmd) & _PAGE_SPECIAL; } -#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD -static inline int pud_devmap(pud_t pud) +static inline pmd_t pmd_mkspecial(pmd_t pmd) { - return !!(pud_val(pud) & _PAGE_DEVMAP); + return pmd_set_flags(pmd, _PAGE_SPECIAL); } -#else -static inline int pud_devmap(pud_t pud) +#endif /* CONFIG_ARCH_SUPPORTS_PMD_PFNMAP */ + +#ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP +static inline bool pud_special(pud_t pud) { - return 0; + return pud_flags(pud) & _PAGE_SPECIAL; } -#endif -static inline int pgd_devmap(pgd_t pgd) +static inline pud_t pud_mkspecial(pud_t pud) { - return 0; + return pud_set_flags(pud, _PAGE_SPECIAL); } -#endif +#endif /* CONFIG_ARCH_SUPPORTS_PUD_PFNMAP */ #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline pte_t pte_set_flags(pte_t pte, pteval_t set) @@ -315,6 +358,65 @@ static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear) return native_make_pte(v & ~clear); } +/* + * Write protection operations can result in Dirty=1,Write=0 PTEs. But in the + * case of X86_FEATURE_USER_SHSTK, these PTEs denote shadow stack memory. So + * when creating dirty, write-protected memory, a software bit is used: + * _PAGE_BIT_SAVED_DIRTY. The following functions take a PTE and transition the + * Dirty bit to SavedDirty, and vice-vesra. + * + * This shifting is only done if needed. In the case of shifting + * Dirty->SavedDirty, the condition is if the PTE is Write=0. In the case of + * shifting SavedDirty->Dirty, the condition is Write=1. + */ +static inline pgprotval_t mksaveddirty_shift(pgprotval_t v) +{ + pgprotval_t cond = (~v >> _PAGE_BIT_RW) & 1; + + v |= ((v >> _PAGE_BIT_DIRTY) & cond) << _PAGE_BIT_SAVED_DIRTY; + v &= ~(cond << _PAGE_BIT_DIRTY); + + return v; +} + +static inline pgprotval_t clear_saveddirty_shift(pgprotval_t v) +{ + pgprotval_t cond = (v >> _PAGE_BIT_RW) & 1; + + v |= ((v >> _PAGE_BIT_SAVED_DIRTY) & cond) << _PAGE_BIT_DIRTY; + v &= ~(cond << _PAGE_BIT_SAVED_DIRTY); + + return v; +} + +static inline pte_t pte_mksaveddirty(pte_t pte) +{ + pteval_t v = native_pte_val(pte); + + v = mksaveddirty_shift(v); + return native_make_pte(v); +} + +static inline pte_t pte_clear_saveddirty(pte_t pte) +{ + pteval_t v = native_pte_val(pte); + + v = clear_saveddirty_shift(v); + return native_make_pte(v); +} + +static inline pte_t pte_wrprotect(pte_t pte) +{ + pte = pte_clear_flags(pte, _PAGE_RW); + + /* + * Blindly clearing _PAGE_RW might accidentally create + * a shadow stack PTE (Write=0,Dirty=1). Move the hardware + * dirty value to the software bit, if present. + */ + return pte_mksaveddirty(pte); +} + #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP static inline int pte_uffd_wp(pte_t pte) { @@ -323,7 +425,7 @@ static inline int pte_uffd_wp(pte_t pte) static inline pte_t pte_mkuffd_wp(pte_t pte) { - return pte_set_flags(pte, _PAGE_UFFD_WP); + return pte_wrprotect(pte_set_flags(pte, _PAGE_UFFD_WP)); } static inline pte_t pte_clear_uffd_wp(pte_t pte) @@ -334,7 +436,7 @@ static inline pte_t pte_clear_uffd_wp(pte_t pte) static inline pte_t pte_mkclean(pte_t pte) { - return pte_clear_flags(pte, _PAGE_DIRTY); + return pte_clear_flags(pte, _PAGE_DIRTY_BITS); } static inline pte_t pte_mkold(pte_t pte) @@ -342,11 +444,6 @@ static inline pte_t pte_mkold(pte_t pte) return pte_clear_flags(pte, _PAGE_ACCESSED); } -static inline pte_t pte_wrprotect(pte_t pte) -{ - return pte_clear_flags(pte, _PAGE_RW); -} - static inline pte_t pte_mkexec(pte_t pte) { return pte_clear_flags(pte, _PAGE_NX); @@ -354,7 +451,16 @@ static inline pte_t pte_mkexec(pte_t pte) static inline pte_t pte_mkdirty(pte_t pte) { - return pte_set_flags(pte, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); + pte = pte_set_flags(pte, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); + + return pte_mksaveddirty(pte); +} + +static inline pte_t pte_mkwrite_shstk(pte_t pte) +{ + pte = pte_clear_flags(pte, _PAGE_RW); + + return pte_set_flags(pte, _PAGE_DIRTY); } static inline pte_t pte_mkyoung(pte_t pte) @@ -362,11 +468,15 @@ static inline pte_t pte_mkyoung(pte_t pte) return pte_set_flags(pte, _PAGE_ACCESSED); } -static inline pte_t pte_mkwrite(pte_t pte) +static inline pte_t pte_mkwrite_novma(pte_t pte) { return pte_set_flags(pte, _PAGE_RW); } +struct vm_area_struct; +pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma); +#define pte_mkwrite pte_mkwrite + static inline pte_t pte_mkhuge(pte_t pte) { return pte_set_flags(pte, _PAGE_PSE); @@ -392,23 +502,34 @@ static inline pte_t pte_mkspecial(pte_t pte) return pte_set_flags(pte, _PAGE_SPECIAL); } -static inline pte_t pte_mkdevmap(pte_t pte) +/* See comments above mksaveddirty_shift() */ +static inline pmd_t pmd_mksaveddirty(pmd_t pmd) { - return pte_set_flags(pte, _PAGE_SPECIAL|_PAGE_DEVMAP); + pmdval_t v = native_pmd_val(pmd); + + v = mksaveddirty_shift(v); + return native_make_pmd(v); } -static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set) +/* See comments above mksaveddirty_shift() */ +static inline pmd_t pmd_clear_saveddirty(pmd_t pmd) { pmdval_t v = native_pmd_val(pmd); - return native_make_pmd(v | set); + v = clear_saveddirty_shift(v); + return native_make_pmd(v); } -static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear) +static inline pmd_t pmd_wrprotect(pmd_t pmd) { - pmdval_t v = native_pmd_val(pmd); + pmd = pmd_clear_flags(pmd, _PAGE_RW); - return native_make_pmd(v & ~clear); + /* + * Blindly clearing _PAGE_RW might accidentally create + * a shadow stack PMD (RW=0, Dirty=1). Move the hardware + * dirty value to the software bit. + */ + return pmd_mksaveddirty(pmd); } #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP @@ -419,7 +540,7 @@ static inline int pmd_uffd_wp(pmd_t pmd) static inline pmd_t pmd_mkuffd_wp(pmd_t pmd) { - return pmd_set_flags(pmd, _PAGE_UFFD_WP); + return pmd_wrprotect(pmd_set_flags(pmd, _PAGE_UFFD_WP)); } static inline pmd_t pmd_clear_uffd_wp(pmd_t pmd) @@ -435,22 +556,21 @@ static inline pmd_t pmd_mkold(pmd_t pmd) static inline pmd_t pmd_mkclean(pmd_t pmd) { - return pmd_clear_flags(pmd, _PAGE_DIRTY); -} - -static inline pmd_t pmd_wrprotect(pmd_t pmd) -{ - return pmd_clear_flags(pmd, _PAGE_RW); + return pmd_clear_flags(pmd, _PAGE_DIRTY_BITS); } static inline pmd_t pmd_mkdirty(pmd_t pmd) { - return pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); + pmd = pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); + + return pmd_mksaveddirty(pmd); } -static inline pmd_t pmd_mkdevmap(pmd_t pmd) +static inline pmd_t pmd_mkwrite_shstk(pmd_t pmd) { - return pmd_set_flags(pmd, _PAGE_DEVMAP); + pmd = pmd_clear_flags(pmd, _PAGE_RW); + + return pmd_set_flags(pmd, _PAGE_DIRTY); } static inline pmd_t pmd_mkhuge(pmd_t pmd) @@ -463,23 +583,30 @@ static inline pmd_t pmd_mkyoung(pmd_t pmd) return pmd_set_flags(pmd, _PAGE_ACCESSED); } -static inline pmd_t pmd_mkwrite(pmd_t pmd) +static inline pmd_t pmd_mkwrite_novma(pmd_t pmd) { return pmd_set_flags(pmd, _PAGE_RW); } -static inline pud_t pud_set_flags(pud_t pud, pudval_t set) +pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); +#define pmd_mkwrite pmd_mkwrite + +/* See comments above mksaveddirty_shift() */ +static inline pud_t pud_mksaveddirty(pud_t pud) { pudval_t v = native_pud_val(pud); - return native_make_pud(v | set); + v = mksaveddirty_shift(v); + return native_make_pud(v); } -static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear) +/* See comments above mksaveddirty_shift() */ +static inline pud_t pud_clear_saveddirty(pud_t pud) { pudval_t v = native_pud_val(pud); - return native_make_pud(v & ~clear); + v = clear_saveddirty_shift(v); + return native_make_pud(v); } static inline pud_t pud_mkold(pud_t pud) @@ -489,22 +616,26 @@ static inline pud_t pud_mkold(pud_t pud) static inline pud_t pud_mkclean(pud_t pud) { - return pud_clear_flags(pud, _PAGE_DIRTY); + return pud_clear_flags(pud, _PAGE_DIRTY_BITS); } static inline pud_t pud_wrprotect(pud_t pud) { - return pud_clear_flags(pud, _PAGE_RW); + pud = pud_clear_flags(pud, _PAGE_RW); + + /* + * Blindly clearing _PAGE_RW might accidentally create + * a shadow stack PUD (RW=0, Dirty=1). Move the hardware + * dirty value to the software bit. + */ + return pud_mksaveddirty(pud); } static inline pud_t pud_mkdirty(pud_t pud) { - return pud_set_flags(pud, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); -} + pud = pud_set_flags(pud, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); -static inline pud_t pud_mkdevmap(pud_t pud) -{ - return pud_set_flags(pud, _PAGE_DEVMAP); + return pud_mksaveddirty(pud); } static inline pud_t pud_mkhuge(pud_t pud) @@ -519,7 +650,9 @@ static inline pud_t pud_mkyoung(pud_t pud) static inline pud_t pud_mkwrite(pud_t pud) { - return pud_set_flags(pud, _PAGE_RW); + pud = pud_set_flags(pud, _PAGE_RW); + + return pud_clear_saveddirty(pud); } #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY @@ -604,6 +737,9 @@ static inline pgprotval_t check_pgprot(pgprot_t pgprot) static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) { phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT; + /* This bit combination is used to mark shadow stacks */ + WARN_ON_ONCE((pgprot_val(pgprot) & (_PAGE_DIRTY | _PAGE_RW)) == + _PAGE_DIRTY); pfn ^= protnone_mask(pgprot_val(pgprot)); pfn &= PTE_PFN_MASK; return __pte(pfn | check_pgprot(pgprot)); @@ -631,11 +767,18 @@ static inline pmd_t pmd_mkinvalid(pmd_t pmd) __pgprot(pmd_flags(pmd) & ~(_PAGE_PRESENT|_PAGE_PROTNONE))); } +static inline pud_t pud_mkinvalid(pud_t pud) +{ + return pfn_pud(pud_pfn(pud), + __pgprot(pud_flags(pud) & ~(_PAGE_PRESENT|_PAGE_PROTNONE))); +} + static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask); static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { pteval_t val = pte_val(pte), oldval = val; + pte_t pte_result; /* * Chop off the NX bit (if present), and add the NX portion of @@ -644,17 +787,71 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) val &= _PAGE_CHG_MASK; val |= check_pgprot(newprot) & ~_PAGE_CHG_MASK; val = flip_protnone_guard(oldval, val, PTE_PFN_MASK); - return __pte(val); + + pte_result = __pte(val); + + /* + * To avoid creating Write=0,Dirty=1 PTEs, pte_modify() needs to avoid: + * 1. Marking Write=0 PTEs Dirty=1 + * 2. Marking Dirty=1 PTEs Write=0 + * + * The first case cannot happen because the _PAGE_CHG_MASK will filter + * out any Dirty bit passed in newprot. Handle the second case by + * going through the mksaveddirty exercise. Only do this if the old + * value was Write=1 to avoid doing this on Shadow Stack PTEs. + */ + if (oldval & _PAGE_RW) + pte_result = pte_mksaveddirty(pte_result); + else + pte_result = pte_clear_saveddirty(pte_result); + + return pte_result; } static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) { pmdval_t val = pmd_val(pmd), oldval = val; + pmd_t pmd_result; - val &= _HPAGE_CHG_MASK; + val &= (_HPAGE_CHG_MASK & ~_PAGE_DIRTY); val |= check_pgprot(newprot) & ~_HPAGE_CHG_MASK; val = flip_protnone_guard(oldval, val, PHYSICAL_PMD_PAGE_MASK); - return __pmd(val); + + pmd_result = __pmd(val); + + /* + * Avoid creating shadow stack PMD by accident. See comment in + * pte_modify(). + */ + if (oldval & _PAGE_RW) + pmd_result = pmd_mksaveddirty(pmd_result); + else + pmd_result = pmd_clear_saveddirty(pmd_result); + + return pmd_result; +} + +static inline pud_t pud_modify(pud_t pud, pgprot_t newprot) +{ + pudval_t val = pud_val(pud), oldval = val; + pud_t pud_result; + + val &= _HPAGE_CHG_MASK; + val |= check_pgprot(newprot) & ~_HPAGE_CHG_MASK; + val = flip_protnone_guard(oldval, val, PHYSICAL_PUD_PAGE_MASK); + + pud_result = __pud(val); + + /* + * Avoid creating shadow stack PUD by accident. See comment in + * pte_modify(). + */ + if (oldval & _PAGE_RW) + pud_result = pud_mksaveddirty(pud_result); + else + pud_result = pud_clear_saveddirty(pud_result); + + return pud_result; } /* @@ -676,11 +873,6 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) #define canon_pgprot(p) __pgprot(massage_pgprot(p)) -static inline pgprot_t arch_filter_pgprot(pgprot_t prot) -{ - return canon_pgprot(prot); -} - static inline int is_new_memtype_allowed(u64 paddr, unsigned long size, enum page_cache_mode pcm, enum page_cache_mode new_pcm) @@ -716,7 +908,7 @@ static inline int is_new_memtype_allowed(u64 paddr, unsigned long size, pmd_t *populate_extra_pmd(unsigned long vaddr); pte_t *populate_extra_pte(unsigned long vaddr); -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd); /* @@ -730,14 +922,14 @@ static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd) return pgd; return __pti_set_user_pgtbl(pgdp, pgd); } -#else /* CONFIG_PAGE_TABLE_ISOLATION */ +#else /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */ static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd) { return pgd; } -#endif /* CONFIG_PAGE_TABLE_ISOLATION */ +#endif /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #ifdef CONFIG_X86_32 @@ -746,7 +938,7 @@ static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd) # include <asm/pgtable_64.h> #endif -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/mm_types.h> #include <linux/mmdebug.h> #include <linux/log2.h> @@ -763,17 +955,18 @@ static inline int pte_same(pte_t a, pte_t b) return a.pte == b.pte; } -static inline int pte_present(pte_t a) +static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) { - return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); + if (__pte_needs_invert(pte_val(pte))) + return __pte(pte_val(pte) - (nr << PFN_PTE_SHIFT)); + return __pte(pte_val(pte) + (nr << PFN_PTE_SHIFT)); } +#define pte_advance_pfn pte_advance_pfn -#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP -static inline int pte_devmap(pte_t a) +static inline int pte_present(pte_t a) { - return (pte_flags(a) & _PAGE_DEVMAP) == _PAGE_DEVMAP; + return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); } -#endif #define pte_accessible pte_accessible static inline bool pte_accessible(struct mm_struct *mm, pte_t a) @@ -782,7 +975,7 @@ static inline bool pte_accessible(struct mm_struct *mm, pte_t a) return true; if ((pte_flags(a) & _PAGE_PROTNONE) && - mm_tlb_flush_pending(mm)) + atomic_read(&mm->tlb_flush_pending)) return true; return false; @@ -836,18 +1029,10 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) */ #define pmd_page(pmd) pfn_to_page(pmd_pfn(pmd)) -/* - * Conversion functions: convert a page and protection to a page entry, - * and a page entry and page directory to the page they refer to. - * - * (Currently stuck as a macro because of indirect forward reference - * to linux/mm.h:page_to_nid()) - */ -#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) - static inline int pmd_bad(pmd_t pmd) { - return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE; + return (pmd_flags(pmd) & ~(_PAGE_USER | _PAGE_ACCESSED)) != + (_KERNPG_TABLE & ~_PAGE_ACCESSED); } static inline unsigned long pages_to_mb(unsigned long npg) @@ -866,9 +1051,9 @@ static inline int pud_present(pud_t pud) return pud_flags(pud) & _PAGE_PRESENT; } -static inline unsigned long pud_page_vaddr(pud_t pud) +static inline pmd_t *pud_pgtable(pud_t pud) { - return (unsigned long)__va(pud_val(pud) & pud_pfn_mask(pud)); + return (pmd_t *)__va(pud_val(pud) & pud_pfn_mask(pud)); } /* @@ -877,23 +1062,16 @@ static inline unsigned long pud_page_vaddr(pud_t pud) */ #define pud_page(pud) pfn_to_page(pud_pfn(pud)) -#define pud_leaf pud_large -static inline int pud_large(pud_t pud) +#define pud_leaf pud_leaf +static inline bool pud_leaf(pud_t pud) { - return (pud_val(pud) & (_PAGE_PSE | _PAGE_PRESENT)) == - (_PAGE_PSE | _PAGE_PRESENT); + return pud_val(pud) & _PAGE_PSE; } static inline int pud_bad(pud_t pud) { return (pud_flags(pud) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0; } -#else -#define pud_leaf pud_large -static inline int pud_large(pud_t pud) -{ - return 0; -} #endif /* CONFIG_PGTABLE_LEVELS > 2 */ #if CONFIG_PGTABLE_LEVELS > 3 @@ -907,9 +1085,9 @@ static inline int p4d_present(p4d_t p4d) return p4d_flags(p4d) & _PAGE_PRESENT; } -static inline unsigned long p4d_page_vaddr(p4d_t p4d) +static inline pud_t *p4d_pgtable(p4d_t p4d) { - return (unsigned long)__va(p4d_val(p4d) & p4d_pfn_mask(p4d)); + return (pud_t *)__va(p4d_val(p4d) & p4d_pfn_mask(p4d)); } /* @@ -922,7 +1100,7 @@ static inline int p4d_bad(p4d_t p4d) { unsigned long ignore_flags = _KERNPG_TABLE | _PAGE_USER; - if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) + if (IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION)) ignore_flags |= _PAGE_NX; return (p4d_flags(p4d) & ~ignore_flags) != 0; @@ -968,7 +1146,7 @@ static inline int pgd_bad(pgd_t pgd) if (!pgtable_l5_enabled()) return 0; - if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) + if (IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION)) ignore_flags |= _PAGE_NX; return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE; @@ -988,26 +1166,22 @@ static inline int pgd_none(pgd_t pgd) } #endif /* CONFIG_PGTABLE_LEVELS > 4 */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #define KERNEL_PGD_BOUNDARY pgd_index(PAGE_OFFSET) #define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_BOUNDARY) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ extern int direct_gbpages; void init_mem_mapping(void); void early_alloc_pgt_buf(void); -extern void memblock_find_dma_reserve(void); - - -#ifdef CONFIG_X86_64 -extern pgd_t trampoline_pgd_entry; - void __init poking_init(void); - unsigned long init_memory_mapping(unsigned long start, unsigned long end, pgprot_t prot); + +#ifdef CONFIG_X86_64 +extern pgd_t trampoline_pgd_entry; #endif /* local pte updates need not use xchg for locking */ @@ -1036,21 +1210,17 @@ static inline pud_t native_local_pudp_get_and_clear(pud_t *pudp) return res; } -static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep , pte_t pte) -{ - native_set_pte(ptep, pte); -} - static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { + page_table_check_pmd_set(mm, pmdp, pmd); set_pmd(pmdp, pmd); } static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, pud_t *pudp, pud_t pud) { + page_table_check_pud_set(mm, pudp, pud); native_set_pud(pudp, pud); } @@ -1081,6 +1251,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { pte_t pte = native_ptep_get_and_clear(ptep); + page_table_check_pte_clear(mm, pte); return pte; } @@ -1096,6 +1267,7 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, * care about updates and native needs no locking */ pte = native_local_ptep_get_and_clear(ptep); + page_table_check_pte_clear(mm, pte); } else { pte = ptep_get_and_clear(mm, addr, ptep); } @@ -1106,12 +1278,20 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - clear_bit(_PAGE_BIT_RW, (unsigned long *)&ptep->pte); -} + /* + * Avoid accidentally creating shadow stack PTEs + * (Write=0,Dirty=1). Use cmpxchg() to prevent races with + * the hardware setting Dirty=1. + */ + pte_t old_pte, new_pte; -#define flush_tlb_fix_spurious_fault(vma, address) do { } while (0) + old_pte = READ_ONCE(*ptep); + do { + new_pte = pte_wrprotect(old_pte); + } while (!try_cmpxchg((long *)&ptep->pte, (long *)&old_pte, *(long *)&new_pte)); +} -#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot)) +#define flush_tlb_fix_spurious_fault(vma, address, ptep) do { } while (0) #define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS extern int pmdp_set_access_flags(struct vm_area_struct *vma, @@ -1132,37 +1312,43 @@ extern int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); -#define pmd_write pmd_write -static inline int pmd_write(pmd_t pmd) -{ - return pmd_flags(pmd) & _PAGE_RW; -} - #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { - return native_pmdp_get_and_clear(pmdp); + pmd_t pmd = native_pmdp_get_and_clear(pmdp); + + page_table_check_pmd_clear(mm, pmd); + + return pmd; } #define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr, pud_t *pudp) { - return native_pudp_get_and_clear(pudp); + pud_t pud = native_pudp_get_and_clear(pudp); + + page_table_check_pud_clear(mm, pud); + + return pud; } #define __HAVE_ARCH_PMDP_SET_WRPROTECT static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { - clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp); -} + /* + * Avoid accidentally creating shadow stack PTEs + * (Write=0,Dirty=1). Use cmpxchg() to prevent races with + * the hardware setting Dirty=1. + */ + pmd_t old_pmd, new_pmd; -#define pud_write pud_write -static inline int pud_write(pud_t pud) -{ - return pud_flags(pud) & _PAGE_RW; + old_pmd = READ_ONCE(*pmdp); + do { + new_pmd = pmd_wrprotect(old_pmd); + } while (!try_cmpxchg((long *)pmdp, (long *)&old_pmd, *(long *)&new_pmd)); } #ifndef pmdp_establish @@ -1170,6 +1356,7 @@ static inline int pud_write(pud_t pud) static inline pmd_t pmdp_establish(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t pmd) { + page_table_check_pmd_set(vma->vm_mm, pmdp, pmd); if (IS_ENABLED(CONFIG_SMP)) { return xchg(pmdp, pmd); } else { @@ -1179,6 +1366,29 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, } } #endif + +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +static inline pud_t pudp_establish(struct vm_area_struct *vma, + unsigned long address, pud_t *pudp, pud_t pud) +{ + page_table_check_pud_set(vma->vm_mm, pudp, pud); + if (IS_ENABLED(CONFIG_SMP)) { + return xchg(pudp, pud); + } else { + pud_t old = *pudp; + WRITE_ONCE(*pudp, pud); + return old; + } +} +#endif + +#define __HAVE_ARCH_PMDP_INVALIDATE_AD +extern pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp); + +pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address, + pud_t *pudp); + /* * Page table pages are page-aligned. The lower half of the top * level is used for userspace and the top half for the kernel. @@ -1193,12 +1403,9 @@ static inline bool pgdp_maps_userspace(void *__ptr) return (((ptr & ~PAGE_MASK) / sizeof(pgd_t)) < PGD_KERNEL_START); } -#define pgd_leaf pgd_large -static inline int pgd_large(pgd_t pgd) { return 0; } - -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION /* - * All top-level PAGE_TABLE_ISOLATION page tables are order-1 pages + * All top-level MITIGATION_PAGE_TABLE_ISOLATION page tables are order-1 pages * (8k-aligned and 8k in size). The kernel one is at the beginning 4k and * the user one is in the last 4k. To switch between them, you * just need to flip the 12th bit in their addresses. @@ -1243,12 +1450,12 @@ static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp) { return ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT); } -#endif /* CONFIG_PAGE_TABLE_ISOLATION */ +#endif /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */ /* * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); * - * dst - pointer to pgd range anwhere on a pgd page + * dst - pointer to pgd range anywhere on a pgd page * src - "" * count - the number of pgds to copy. * @@ -1258,7 +1465,7 @@ static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp) static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) { memcpy(dst, src, count * sizeof(pgd_t)); -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION if (!static_cpu_has(X86_FEATURE_PTI)) return; /* Clone the user space pgd as well */ @@ -1289,6 +1496,11 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { } +static inline void update_mmu_cache_range(struct vm_fault *vmf, + struct vm_area_struct *vma, unsigned long addr, + pte_t *ptep, unsigned int nr) +{ +} static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd) { @@ -1297,6 +1509,20 @@ static inline void update_mmu_cache_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pud) { } +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + return pte_set_flags(pte, _PAGE_SWP_EXCLUSIVE); +} + +static inline bool pte_swp_exclusive(pte_t pte) +{ + return pte_flags(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + return pte_clear_flags(pte, _PAGE_SWP_EXCLUSIVE); +} #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY static inline pte_t pte_swp_mksoft_dirty(pte_t pte) @@ -1364,32 +1590,6 @@ static inline pmd_t pmd_swp_clear_uffd_wp(pmd_t pmd) } #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */ -#define PKRU_AD_BIT 0x1 -#define PKRU_WD_BIT 0x2 -#define PKRU_BITS_PER_PKEY 2 - -#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS -extern u32 init_pkru_value; -#else -#define init_pkru_value 0 -#endif - -static inline bool __pkru_allows_read(u32 pkru, u16 pkey) -{ - int pkru_pkey_bits = pkey * PKRU_BITS_PER_PKEY; - return !(pkru & (PKRU_AD_BIT << pkru_pkey_bits)); -} - -static inline bool __pkru_allows_write(u32 pkru, u16 pkey) -{ - int pkru_pkey_bits = pkey * PKRU_BITS_PER_PKEY; - /* - * Access-disable disables writes too so we need to check - * both bits here. - */ - return !(pkru & ((PKRU_AD_BIT|PKRU_WD_BIT) << pkru_pkey_bits)); -} - static inline u16 pte_flags_pkey(unsigned long pte_flags) { #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS @@ -1421,6 +1621,11 @@ static inline bool __pte_access_permitted(unsigned long pteval, bool write) { unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER; + /* + * Write=0,Dirty=1 PTEs are shadow stack, which the kernel + * shouldn't generally allow access to, but since they + * are already Write=0, the below logic covers both cases. + */ if (write) need_pte_bits |= _PAGE_RW; @@ -1456,12 +1661,85 @@ static inline bool arch_has_pfn_modify_check(void) return boot_cpu_has_bug(X86_BUG_L1TF); } -#define arch_faults_on_old_pte arch_faults_on_old_pte -static inline bool arch_faults_on_old_pte(void) +#define arch_check_zapped_pte arch_check_zapped_pte +void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte); + +#define arch_check_zapped_pmd arch_check_zapped_pmd +void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd); + +#define arch_check_zapped_pud arch_check_zapped_pud +void arch_check_zapped_pud(struct vm_area_struct *vma, pud_t pud); + +#ifdef CONFIG_XEN_PV +#define arch_has_hw_nonleaf_pmd_young arch_has_hw_nonleaf_pmd_young +static inline bool arch_has_hw_nonleaf_pmd_young(void) +{ + return !cpu_feature_enabled(X86_FEATURE_XENPV); +} +#endif + +#ifdef CONFIG_PAGE_TABLE_CHECK +static inline bool pte_user_accessible_page(pte_t pte) { - return false; + return (pte_val(pte) & _PAGE_PRESENT) && (pte_val(pte) & _PAGE_USER); } -#endif /* __ASSEMBLY__ */ +static inline bool pmd_user_accessible_page(pmd_t pmd) +{ + return pmd_leaf(pmd) && (pmd_val(pmd) & _PAGE_PRESENT) && (pmd_val(pmd) & _PAGE_USER); +} + +static inline bool pud_user_accessible_page(pud_t pud) +{ + return pud_leaf(pud) && (pud_val(pud) & _PAGE_PRESENT) && (pud_val(pud) & _PAGE_USER); +} +#endif + +#ifdef CONFIG_X86_SGX +int arch_memory_failure(unsigned long pfn, int flags); +#define arch_memory_failure arch_memory_failure + +bool arch_is_platform_page(u64 paddr); +#define arch_is_platform_page arch_is_platform_page +#endif + +/* + * Use set_p*_safe(), and elide TLB flushing, when confident that *no* + * TLB flush will be required as a result of the "set". For example, use + * in scenarios where it is known ahead of time that the routine is + * setting non-present entries, or re-setting an existing entry to the + * same value. Otherwise, use the typical "set" helpers and flush the + * TLB. + */ +#define set_pte_safe(ptep, pte) \ +({ \ + WARN_ON_ONCE(pte_present(*ptep) && !pte_same(*ptep, pte)); \ + set_pte(ptep, pte); \ +}) + +#define set_pmd_safe(pmdp, pmd) \ +({ \ + WARN_ON_ONCE(pmd_present(*pmdp) && !pmd_same(*pmdp, pmd)); \ + set_pmd(pmdp, pmd); \ +}) + +#define set_pud_safe(pudp, pud) \ +({ \ + WARN_ON_ONCE(pud_present(*pudp) && !pud_same(*pudp, pud)); \ + set_pud(pudp, pud); \ +}) + +#define set_p4d_safe(p4dp, p4d) \ +({ \ + WARN_ON_ONCE(p4d_present(*p4dp) && !p4d_same(*p4dp, p4d)); \ + set_p4d(p4dp, p4d); \ +}) + +#define set_pgd_safe(pgdp, pgd) \ +({ \ + WARN_ON_ONCE(pgd_present(*pgdp) && !pgd_same(*pgdp, pgd)); \ + set_pgd(pgdp, pgd); \ +}) +#endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_PGTABLE_H */ |
