summaryrefslogtreecommitdiff
path: root/arch/x86/mm/init.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/mm/init.c')
-rw-r--r--arch/x86/mm/init.c241
1 files changed, 160 insertions, 81 deletions
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 75ef19aa8903..8bf6ad4b9400 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -7,8 +7,10 @@
#include <linux/swapops.h>
#include <linux/kmemleak.h>
#include <linux/sched/task.h>
+#include <linux/execmem.h>
#include <asm/set_memory.h>
+#include <asm/cpu_device_id.h>
#include <asm/e820/api.h>
#include <asm/init.h>
#include <asm/page.h>
@@ -19,13 +21,14 @@
#include <asm/tlb.h>
#include <asm/proto.h>
#include <asm/dma.h> /* for MAX_DMA_PFN */
-#include <asm/microcode.h>
#include <asm/kaslr.h>
#include <asm/hypervisor.h>
#include <asm/cpufeature.h>
#include <asm/pti.h>
#include <asm/text-patching.h>
#include <asm/memtype.h>
+#include <asm/paravirt.h>
+#include <asm/mmu_context.h>
/*
* We need to define the tracepoints somewhere, and tlb.c
@@ -78,10 +81,20 @@ static uint8_t __pte2cachemode_tbl[8] = {
[__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC,
};
-/* Check that the write-protect PAT entry is set for write-protect */
+/*
+ * Check that the write-protect PAT entry is set for write-protect.
+ * To do this without making assumptions how PAT has been set up (Xen has
+ * another layout than the kernel), translate the _PAGE_CACHE_MODE_WP cache
+ * mode via the __cachemode2pte_tbl[] into protection bits (those protection
+ * bits will select a cache mode of WP or better), and then translate the
+ * protection bits back into the cache mode using __pte2cm_idx() and the
+ * __pte2cachemode_tbl[] array. This will return the really used cache mode.
+ */
bool x86_has_pat_wp(void)
{
- return __pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] == _PAGE_CACHE_MODE_WP;
+ uint16_t prot = __cachemode2pte_tbl[_PAGE_CACHE_MODE_WP];
+
+ return __pte2cachemode_tbl[__pte2cm_idx(prot)] == _PAGE_CACHE_MODE_WP;
}
enum page_cache_mode pgprot2cachemode(pgprot_t pgprot)
@@ -127,14 +140,12 @@ __ref void *alloc_low_pages(unsigned int num)
unsigned long ret = 0;
if (min_pfn_mapped < max_pfn_mapped) {
- ret = memblock_find_in_range(
+ ret = memblock_phys_alloc_range(
+ PAGE_SIZE * num, PAGE_SIZE,
min_pfn_mapped << PAGE_SHIFT,
- max_pfn_mapped << PAGE_SHIFT,
- PAGE_SIZE * num , PAGE_SIZE);
+ max_pfn_mapped << PAGE_SHIFT);
}
- if (ret)
- memblock_reserve(ret, PAGE_SIZE * num);
- else if (can_use_brk_pgt)
+ if (!ret && can_use_brk_pgt)
ret = __pa(extend_brk(PAGE_SIZE * num, PAGE_SIZE));
if (!ret)
@@ -164,11 +175,7 @@ __ref void *alloc_low_pages(unsigned int num)
* randomization is enabled.
*/
-#ifndef CONFIG_X86_5LEVEL
-#define INIT_PGD_PAGE_TABLES 3
-#else
#define INIT_PGD_PAGE_TABLES 4
-#endif
#ifndef CONFIG_RANDOMIZE_MEMORY
#define INIT_PGD_PAGE_COUNT (2 * INIT_PGD_PAGE_TABLES)
@@ -253,14 +260,39 @@ static void __init probe_page_size_mask(void)
}
}
+/*
+ * INVLPG may not properly flush Global entries on
+ * these CPUs. New microcode fixes the issue.
+ */
+static const struct x86_cpu_id invlpg_miss_ids[] = {
+ X86_MATCH_VFM(INTEL_ALDERLAKE, 0x2e),
+ X86_MATCH_VFM(INTEL_ALDERLAKE_L, 0x42c),
+ X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, 0x11),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE, 0x118),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE_P, 0x4117),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE_S, 0x2e),
+ {}
+};
+
static void setup_pcid(void)
{
+ const struct x86_cpu_id *invlpg_miss_match;
+
if (!IS_ENABLED(CONFIG_X86_64))
return;
if (!boot_cpu_has(X86_FEATURE_PCID))
return;
+ invlpg_miss_match = x86_match_cpu(invlpg_miss_ids);
+
+ if (invlpg_miss_match &&
+ boot_cpu_data.microcode < invlpg_miss_match->driver_data) {
+ pr_info("Incomplete global flushes, disabling PCID");
+ setup_clear_cpu_cap(X86_FEATURE_PCID);
+ return;
+ }
+
if (boot_cpu_has(X86_FEATURE_PGE)) {
/*
* This can't be cr4_set_bits_and_update_boot() -- the
@@ -274,15 +306,6 @@ static void setup_pcid(void)
* start_secondary().
*/
cr4_set_bits(X86_CR4_PCIDE);
-
- /*
- * INVPCID's single-context modes (2/3) only work if we set
- * X86_CR4_PCIDE, *and* we INVPCID support. It's unusable
- * on systems that have X86_CR4_PCIDE clear, or that have
- * no INVPCID support at all.
- */
- if (boot_cpu_has(X86_FEATURE_INVPCID))
- setup_force_cpu_cap(X86_FEATURE_INVPCID_SINGLE);
} else {
/*
* flush_tlb_all(), as currently implemented, won't work if
@@ -610,9 +633,23 @@ static void __init memory_map_top_down(unsigned long map_start,
unsigned long addr;
unsigned long mapped_ram_size = 0;
- /* xen has big range in reserved near end of ram, skip it at first.*/
- addr = memblock_find_in_range(map_start, map_end, PMD_SIZE, PMD_SIZE);
- real_end = addr + PMD_SIZE;
+ /*
+ * Systems that have many reserved areas near top of the memory,
+ * e.g. QEMU with less than 1G RAM and EFI enabled, or Xen, will
+ * require lots of 4K mappings which may exhaust pgt_buf.
+ * Start with top-most PMD_SIZE range aligned at PMD_SIZE to ensure
+ * there is enough mapped memory that can be allocated from
+ * memblock.
+ */
+ addr = memblock_phys_alloc_range(PMD_SIZE, PMD_SIZE, map_start,
+ map_end);
+ if (!addr) {
+ pr_warn("Failed to release memory for alloc_low_pages()");
+ real_end = max(map_start, ALIGN_DOWN(map_end, PMD_SIZE));
+ } else {
+ memblock_phys_free(addr, PMD_SIZE);
+ real_end = addr + PMD_SIZE;
+ }
/* step_size need to be small so pgt_buf from BRK could cover it */
step_size = PMD_SIZE;
@@ -707,6 +744,11 @@ static void __init memory_map_bottom_up(unsigned long map_start,
static void __init init_trampoline(void)
{
#ifdef CONFIG_X86_64
+ /*
+ * The code below will alias kernel page-tables in the user-range of the
+ * address space, including the Global bit. So global TLB entries will
+ * be created when using the trampoline page-table.
+ */
if (!kaslr_memory_enabled())
trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)];
else
@@ -780,28 +822,33 @@ void __init poking_init(void)
spinlock_t *ptl;
pte_t *ptep;
- poking_mm = copy_init_mm();
- BUG_ON(!poking_mm);
+ text_poke_mm = mm_alloc();
+ BUG_ON(!text_poke_mm);
+
+ /* Xen PV guests need the PGD to be pinned. */
+ paravirt_enter_mmap(text_poke_mm);
+
+ set_notrack_mm(text_poke_mm);
/*
* Randomize the poking address, but make sure that the following page
* will be mapped at the same PMD. We need 2 pages, so find space for 3,
* and adjust the address if the PMD ends after the first one.
*/
- poking_addr = TASK_UNMAPPED_BASE;
+ text_poke_mm_addr = TASK_UNMAPPED_BASE;
if (IS_ENABLED(CONFIG_RANDOMIZE_BASE))
- poking_addr += (kaslr_get_random_long("Poking") & PAGE_MASK) %
+ text_poke_mm_addr += (kaslr_get_random_long("Poking") & PAGE_MASK) %
(TASK_SIZE - TASK_UNMAPPED_BASE - 3 * PAGE_SIZE);
- if (((poking_addr + PAGE_SIZE) & ~PMD_MASK) == 0)
- poking_addr += PAGE_SIZE;
+ if (((text_poke_mm_addr + PAGE_SIZE) & ~PMD_MASK) == 0)
+ text_poke_mm_addr += PAGE_SIZE;
/*
* We need to trigger the allocation of the page-tables that will be
* needed for poking now. Later, poking may be performed in an atomic
* section, which might cause allocation to fail.
*/
- ptep = get_locked_pte(poking_mm, poking_addr, &ptl);
+ ptep = get_locked_pte(text_poke_mm, text_poke_mm_addr, &ptl);
BUG_ON(!ptep);
pte_unmap_unlock(ptep, ptl);
}
@@ -835,7 +882,7 @@ int devmem_is_allowed(unsigned long pagenr)
/*
* This must follow RAM test, since System RAM is considered a
- * restricted resource under CONFIG_STRICT_IOMEM.
+ * restricted resource under CONFIG_STRICT_DEVMEM.
*/
if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) {
/* Low 1MB bypasses iomem restrictions. */
@@ -950,53 +997,6 @@ void __init free_initrd_mem(unsigned long start, unsigned long end)
}
#endif
-/*
- * Calculate the precise size of the DMA zone (first 16 MB of RAM),
- * and pass it to the MM layer - to help it set zone watermarks more
- * accurately.
- *
- * Done on 64-bit systems only for the time being, although 32-bit systems
- * might benefit from this as well.
- */
-void __init memblock_find_dma_reserve(void)
-{
-#ifdef CONFIG_X86_64
- u64 nr_pages = 0, nr_free_pages = 0;
- unsigned long start_pfn, end_pfn;
- phys_addr_t start_addr, end_addr;
- int i;
- u64 u;
-
- /*
- * Iterate over all memory ranges (free and reserved ones alike),
- * to calculate the total number of pages in the first 16 MB of RAM:
- */
- nr_pages = 0;
- for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
- start_pfn = min(start_pfn, MAX_DMA_PFN);
- end_pfn = min(end_pfn, MAX_DMA_PFN);
-
- nr_pages += end_pfn - start_pfn;
- }
-
- /*
- * Iterate over free memory ranges to calculate the number of free
- * pages in the DMA zone, while not counting potential partial
- * pages at the beginning or the end of the range:
- */
- nr_free_pages = 0;
- for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, &start_addr, &end_addr, NULL) {
- start_pfn = min_t(unsigned long, PFN_UP(start_addr), MAX_DMA_PFN);
- end_pfn = min_t(unsigned long, PFN_DOWN(end_addr), MAX_DMA_PFN);
-
- if (start_pfn < end_pfn)
- nr_free_pages += end_pfn - start_pfn;
- }
-
- set_dma_reserve(nr_pages - nr_free_pages);
-#endif
-}
-
void __init zone_sizes_init(void)
{
unsigned long max_zone_pfns[MAX_NR_ZONES];
@@ -1023,6 +1023,11 @@ __visible DEFINE_PER_CPU_ALIGNED(struct tlb_state, cpu_tlbstate) = {
.cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */
};
+#ifdef CONFIG_ADDRESS_MASKING
+DEFINE_PER_CPU(u64, tlbstate_untag_mask);
+EXPORT_PER_CPU_SYMBOL(tlbstate_untag_mask);
+#endif
+
void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache)
{
/* entry 0 MUST be WB (hardwired to speed up translations) */
@@ -1033,7 +1038,7 @@ void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache)
}
#ifdef CONFIG_SWAP
-unsigned long max_swapfile_size(void)
+unsigned long arch_max_swapfile_size(void)
{
unsigned long pages;
@@ -1054,3 +1059,77 @@ unsigned long max_swapfile_size(void)
return pages;
}
#endif
+
+#ifdef CONFIG_EXECMEM
+static struct execmem_info execmem_info __ro_after_init;
+
+#ifdef CONFIG_ARCH_HAS_EXECMEM_ROX
+void execmem_fill_trapping_insns(void *ptr, size_t size)
+{
+ memset(ptr, INT3_INSN_OPCODE, size);
+}
+#endif
+
+struct execmem_info __init *execmem_arch_setup(void)
+{
+ unsigned long start, offset = 0;
+ enum execmem_range_flags flags;
+ pgprot_t pgprot;
+
+ if (kaslr_enabled())
+ offset = get_random_u32_inclusive(1, 1024) * PAGE_SIZE;
+
+ start = MODULES_VADDR + offset;
+
+ if (IS_ENABLED(CONFIG_ARCH_HAS_EXECMEM_ROX) &&
+ cpu_feature_enabled(X86_FEATURE_PSE)) {
+ pgprot = PAGE_KERNEL_ROX;
+ flags = EXECMEM_KASAN_SHADOW | EXECMEM_ROX_CACHE;
+ } else {
+ pgprot = PAGE_KERNEL;
+ flags = EXECMEM_KASAN_SHADOW;
+ }
+
+ execmem_info = (struct execmem_info){
+ .ranges = {
+ [EXECMEM_MODULE_TEXT] = {
+ .flags = flags,
+ .start = start,
+ .end = MODULES_END,
+ .pgprot = pgprot,
+ .alignment = MODULE_ALIGN,
+ },
+ [EXECMEM_KPROBES] = {
+ .flags = flags,
+ .start = start,
+ .end = MODULES_END,
+ .pgprot = PAGE_KERNEL_ROX,
+ .alignment = MODULE_ALIGN,
+ },
+ [EXECMEM_FTRACE] = {
+ .flags = flags,
+ .start = start,
+ .end = MODULES_END,
+ .pgprot = pgprot,
+ .alignment = MODULE_ALIGN,
+ },
+ [EXECMEM_BPF] = {
+ .flags = EXECMEM_KASAN_SHADOW,
+ .start = start,
+ .end = MODULES_END,
+ .pgprot = PAGE_KERNEL,
+ .alignment = MODULE_ALIGN,
+ },
+ [EXECMEM_MODULE_DATA] = {
+ .flags = EXECMEM_KASAN_SHADOW,
+ .start = start,
+ .end = MODULES_END,
+ .pgprot = PAGE_KERNEL,
+ .alignment = MODULE_ALIGN,
+ },
+ },
+ };
+
+ return &execmem_info;
+}
+#endif /* CONFIG_EXECMEM */