diff options
author | Paolo Bonzini <pbonzini@redhat.com> | 2020-09-22 06:43:17 -0400 |
---|---|---|
committer | Paolo Bonzini <pbonzini@redhat.com> | 2020-09-22 06:43:17 -0400 |
commit | bf3c0e5e7102f5787d693379d9d384b813fe890b (patch) | |
tree | aa48fc57dee9ecdf99918affa38d1b06be41419a /arch/x86 | |
parent | 32251b07d532174d66941488c112ec046f646157 (diff) | |
parent | 976bc5e2aceedef13e0ba1f0e6e372a22164aa0c (diff) |
Merge branch 'x86-seves-for-paolo' of https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip into HEAD
Diffstat (limited to 'arch/x86')
62 files changed, 623 insertions, 428 deletions
diff --git a/arch/x86/boot/cmdline.c b/arch/x86/boot/cmdline.c index 4ff01176c1cc..21d56ae83cdf 100644 --- a/arch/x86/boot/cmdline.c +++ b/arch/x86/boot/cmdline.c @@ -54,7 +54,7 @@ int __cmdline_find_option(unsigned long cmdline_ptr, const char *option, char *b /* else */ state = st_wordcmp; opptr = option; - /* fall through */ + fallthrough; case st_wordcmp: if (c == '=' && !*opptr) { @@ -129,7 +129,7 @@ int __cmdline_find_option_bool(unsigned long cmdline_ptr, const char *option) state = st_wordcmp; opptr = option; wstart = pos; - /* fall through */ + fallthrough; case st_wordcmp: if (!*opptr) diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 0048269180d5..877970d76249 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -36,6 +36,10 @@ #define STATIC #include <linux/decompress/mm.h> +#define _SETUP +#include <asm/setup.h> /* For COMMAND_LINE_SIZE */ +#undef _SETUP + #ifdef CONFIG_X86_5LEVEL unsigned int __pgtable_l5_enabled; unsigned int pgdir_shift __ro_after_init = 39; @@ -87,8 +91,11 @@ static unsigned long get_boot_seed(void) static bool memmap_too_large; -/* Store memory limit specified by "mem=nn[KMG]" or "memmap=nn[KMG]" */ -static unsigned long long mem_limit = ULLONG_MAX; +/* + * Store memory limit: MAXMEM on 64-bit and KERNEL_IMAGE_SIZE on 32-bit. + * It may be reduced by "mem=nn[KMG]" or "memmap=nn[KMG]" command line options. + */ +static u64 mem_limit; /* Number of immovable memory regions */ static int num_immovable_mem; @@ -131,8 +138,7 @@ enum parse_mode { }; static int -parse_memmap(char *p, unsigned long long *start, unsigned long long *size, - enum parse_mode mode) +parse_memmap(char *p, u64 *start, u64 *size, enum parse_mode mode) { char *oldp; @@ -162,7 +168,7 @@ parse_memmap(char *p, unsigned long long *start, unsigned long long *size, */ *size = 0; } else { - unsigned long long flags; + u64 flags; /* * efi_fake_mem=nn@ss:attr the attr specifies @@ -178,7 +184,7 @@ parse_memmap(char *p, unsigned long long *start, unsigned long long *size, } *size = 0; } - /* Fall through */ + fallthrough; default: /* * If w/o offset, only size specified, memmap=nn[KMG] has the @@ -201,7 +207,7 @@ static void mem_avoid_memmap(enum parse_mode mode, char *str) while (str && (i < MAX_MEMMAP_REGIONS)) { int rc; - unsigned long long start, size; + u64 start, size; char *k = strchr(str, ','); if (k) @@ -214,7 +220,7 @@ static void mem_avoid_memmap(enum parse_mode mode, char *str) if (start == 0) { /* Store the specified memory limit if size > 0 */ - if (size > 0) + if (size > 0 && size < mem_limit) mem_limit = size; continue; @@ -261,15 +267,15 @@ static void parse_gb_huge_pages(char *param, char *val) static void handle_mem_options(void) { char *args = (char *)get_cmd_line_ptr(); - size_t len = strlen((char *)args); + size_t len; char *tmp_cmdline; char *param, *val; u64 mem_size; - if (!strstr(args, "memmap=") && !strstr(args, "mem=") && - !strstr(args, "hugepages")) + if (!args) return; + len = strnlen(args, COMMAND_LINE_SIZE-1); tmp_cmdline = malloc(len + 1); if (!tmp_cmdline) error("Failed to allocate space for tmp_cmdline"); @@ -284,14 +290,12 @@ static void handle_mem_options(void) while (*args) { args = next_arg(args, ¶m, &val); /* Stop at -- */ - if (!val && strcmp(param, "--") == 0) { - warn("Only '--' specified in cmdline"); - goto out; - } + if (!val && strcmp(param, "--") == 0) + break; if (!strcmp(param, "memmap")) { mem_avoid_memmap(PARSE_MEMMAP, val); - } else if (strstr(param, "hugepages")) { + } else if (IS_ENABLED(CONFIG_X86_64) && strstr(param, "hugepages")) { parse_gb_huge_pages(param, val); } else if (!strcmp(param, "mem")) { char *p = val; @@ -300,21 +304,23 @@ static void handle_mem_options(void) continue; mem_size = memparse(p, &p); if (mem_size == 0) - goto out; + break; - mem_limit = mem_size; + if (mem_size < mem_limit) + mem_limit = mem_size; } else if (!strcmp(param, "efi_fake_mem")) { mem_avoid_memmap(PARSE_EFI, val); } } -out: free(tmp_cmdline); return; } /* - * In theory, KASLR can put the kernel anywhere in the range of [16M, 64T). + * In theory, KASLR can put the kernel anywhere in the range of [16M, MAXMEM) + * on 64-bit, and [16M, KERNEL_IMAGE_SIZE) on 32-bit. + * * The mem_avoid array is used to store the ranges that need to be avoided * when KASLR searches for an appropriate random address. We must avoid any * regions that are unsafe to overlap with during decompression, and other @@ -392,8 +398,7 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size, { unsigned long init_size = boot_params->hdr.init_size; u64 initrd_start, initrd_size; - u64 cmd_line, cmd_line_size; - char *ptr; + unsigned long cmd_line, cmd_line_size; /* * Avoid the region that is unsafe to overlap during @@ -414,16 +419,15 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size, /* No need to set mapping for initrd, it will be handled in VO. */ /* Avoid kernel command line. */ - cmd_line = (u64)boot_params->ext_cmd_line_ptr << 32; - cmd_line |= boot_params->hdr.cmd_line_ptr; + cmd_line = get_cmd_line_ptr(); /* Calculate size of cmd_line. */ - ptr = (char *)(unsigned long)cmd_line; - for (cmd_line_size = 0; ptr[cmd_line_size++];) - ; - mem_avoid[MEM_AVOID_CMDLINE].start = cmd_line; - mem_avoid[MEM_AVOID_CMDLINE].size = cmd_line_size; - add_identity_map(mem_avoid[MEM_AVOID_CMDLINE].start, - mem_avoid[MEM_AVOID_CMDLINE].size); + if (cmd_line) { + cmd_line_size = strnlen((char *)cmd_line, COMMAND_LINE_SIZE-1) + 1; + mem_avoid[MEM_AVOID_CMDLINE].start = cmd_line; + mem_avoid[MEM_AVOID_CMDLINE].size = cmd_line_size; + add_identity_map(mem_avoid[MEM_AVOID_CMDLINE].start, + mem_avoid[MEM_AVOID_CMDLINE].size); + } /* Avoid boot parameters. */ mem_avoid[MEM_AVOID_BOOTPARAMS].start = (unsigned long)boot_params; @@ -454,7 +458,7 @@ static bool mem_avoid_overlap(struct mem_vector *img, { int i; struct setup_data *ptr; - unsigned long earliest = img->start + img->size; + u64 earliest = img->start + img->size; bool is_overlapping = false; for (i = 0; i < MEM_AVOID_MAX; i++) { @@ -499,18 +503,16 @@ static bool mem_avoid_overlap(struct mem_vector *img, } struct slot_area { - unsigned long addr; - int num; + u64 addr; + unsigned long num; }; #define MAX_SLOT_AREA 100 static struct slot_area slot_areas[MAX_SLOT_AREA]; - +static unsigned int slot_area_index; static unsigned long slot_max; -static unsigned long slot_area_index; - static void store_slot_info(struct mem_vector *region, unsigned long image_size) { struct slot_area slot_area; @@ -519,13 +521,10 @@ static void store_slot_info(struct mem_vector *region, unsigned long image_size) return; slot_area.addr = region->start; - slot_area.num = (region->size - image_size) / - CONFIG_PHYSICAL_ALIGN + 1; + slot_area.num = 1 + (region->size - image_size) / CONFIG_PHYSICAL_ALIGN; - if (slot_area.num > 0) { - slot_areas[slot_area_index++] = slot_area; - slot_max += slot_area.num; - } + slot_areas[slot_area_index++] = slot_area; + slot_max += slot_area.num; } /* @@ -535,57 +534,53 @@ static void store_slot_info(struct mem_vector *region, unsigned long image_size) static void process_gb_huge_pages(struct mem_vector *region, unsigned long image_size) { - unsigned long addr, size = 0; + u64 pud_start, pud_end; + unsigned long gb_huge_pages; struct mem_vector tmp; - int i = 0; - if (!max_gb_huge_pages) { + if (!IS_ENABLED(CONFIG_X86_64) || !max_gb_huge_pages) { store_slot_info(region, image_size); return; } - addr = ALIGN(region->start, PUD_SIZE); - /* Did we raise the address above the passed in memory entry? */ - if (addr < region->start + region->size) - size = region->size - (addr - region->start); - - /* Check how many 1GB huge pages can be filtered out: */ - while (size > PUD_SIZE && max_gb_huge_pages) { - size -= PUD_SIZE; - max_gb_huge_pages--; - i++; - } + /* Are there any 1GB pages in the region? */ + pud_start = ALIGN(region->start, PUD_SIZE); + pud_end = ALIGN_DOWN(region->start + region->size, PUD_SIZE); /* No good 1GB huge pages found: */ - if (!i) { + if (pud_start >= pud_end) { store_slot_info(region, image_size); return; } - /* - * Skip those 'i'*1GB good huge pages, and continue checking and - * processing the remaining head or tail part of the passed region - * if available. - */ - - if (addr >= region->start + image_size) { + /* Check if the head part of the region is usable. */ + if (pud_start >= region->start + image_size) { tmp.start = region->start; - tmp.size = addr - region->start; + tmp.size = pud_start - region->start; store_slot_info(&tmp, image_size); } - size = region->size - (addr - region->start) - i * PUD_SIZE; - if (size >= image_size) { - tmp.start = addr + i * PUD_SIZE; - tmp.size = size; + /* Skip the good 1GB pages. */ + gb_huge_pages = (pud_end - pud_start) >> PUD_SHIFT; + if (gb_huge_pages > max_gb_huge_pages) { + pud_end = pud_start + (max_gb_huge_pages << PUD_SHIFT); + max_gb_huge_pages = 0; + } else { + max_gb_huge_pages -= gb_huge_pages; + } + + /* Check if the tail part of the region is usable. */ + if (region->start + region->size >= pud_end + image_size) { + tmp.start = pud_end; + tmp.size = region->start + region->size - pud_end; store_slot_info(&tmp, image_size); } } -static unsigned long slots_fetch_random(void) +static u64 slots_fetch_random(void) { unsigned long slot; - int i; + unsigned int i; /* Handle case of no slots stored. */ if (slot_max == 0) @@ -598,7 +593,7 @@ static unsigned long slots_fetch_random(void) slot -= slot_areas[i].num; continue; } - return slot_areas[i].addr + slot * CONFIG_PHYSICAL_ALIGN; + return slot_areas[i].addr + ((u64)slot * CONFIG_PHYSICAL_ALIGN); } if (i == slot_area_index) @@ -611,49 +606,23 @@ static void __process_mem_region(struct mem_vector *entry, unsigned long image_size) { struct mem_vector region, overlap; - unsigned long start_orig, end; - struct mem_vector cur_entry; - - /* On 32-bit, ignore entries entirely above our maximum. */ - if (IS_ENABLED(CONFIG_X86_32) && entry->start >= KERNEL_IMAGE_SIZE) - return; + u64 region_end; - /* Ignore entries entirely below our minimum. */ - if (entry->start + entry->size < minimum) - return; - - /* Ignore entries above memory limit */ - end = min(entry->size + entry->start, mem_limit); - if (entry->start >= end) - return; - cur_entry.start = entry->start; - cur_entry.size = end - entry->start; - - region.start = cur_entry.start; - region.size = cur_entry.size; + /* Enforce minimum and memory limit. */ + region.start = max_t(u64, entry->start, minimum); + region_end = min(entry->start + entry->size, mem_limit); /* Give up if slot area array is full. */ while (slot_area_index < MAX_SLOT_AREA) { - start_orig = region.start; - - /* Potentially raise address to minimum location. */ - if (region.start < minimum) - region.start = minimum; - /* Potentially raise address to meet alignment needs. */ region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN); /* Did we raise the address above the passed in memory entry? */ - if (region.start > cur_entry.start + cur_entry.size) + if (region.start > region_end) return; /* Reduce size by any delta from the original address. */ - region.size -= region.start - start_orig; - - /* On 32-bit, reduce region size to fit within max size. */ - if (IS_ENABLED(CONFIG_X86_32) && - region.start + region.size > KERNEL_IMAGE_SIZE) - region.size = KERNEL_IMAGE_SIZE - region.start; + region.size = region_end - region.start; /* Return if region can't contain decompressed kernel */ if (region.size < image_size) @@ -666,27 +635,19 @@ static void __process_mem_region(struct mem_vector *entry, } /* Store beginning of region if holds at least image_size. */ - if (overlap.start > region.start + image_size) { - struct mem_vector beginning; - - beginning.start = region.start; - beginning.size = overlap.start - region.start; - process_gb_huge_pages(&beginning, image_size); + if (overlap.start >= region.start + image_size) { + region.size = overlap.start - region.start; + process_gb_huge_pages(®ion, image_size); } - /* Return if overlap extends to or past end of region. */ - if (overlap.start + overlap.size >= region.start + region.size) - return; - /* Clip off the overlapping region and start over. */ - region.size -= overlap.start - region.start + overlap.size; region.start = overlap.start + overlap.size; } } static bool process_mem_region(struct mem_vector *region, - unsigned long long minimum, - unsigned long long image_size) + unsigned long minimum, + unsigned long image_size) { int i; /* @@ -709,7 +670,7 @@ static bool process_mem_region(struct mem_vector *region, * immovable memory and @region. */ for (i = 0; i < num_immovable_mem; i++) { - unsigned long long start, end, entry_end, region_end; + u64 start, end, entry_end, region_end; struct mem_vector entry; if (!mem_overlaps(region, &immovable_mem[i])) @@ -736,8 +697,8 @@ static bool process_mem_region(struct mem_vector *region, #ifdef CONFIG_EFI /* - * Returns true if mirror region found (and must have been processed - * for slots adding) + * Returns true if we processed the EFI memmap, which we prefer over the E820 + * table if it is available. */ static bool process_efi_entries(unsigned long minimum, unsigned long image_size) @@ -839,20 +800,30 @@ static void process_e820_entries(unsigned long minimum, static unsigned long find_random_phys_addr(unsigned long minimum, unsigned long image_size) { + u64 phys_addr; + + /* Bail out early if it's impossible to succeed. */ + if (minimum + image_size > mem_limit) + return 0; + /* Check if we had too many memmaps. */ if (memmap_too_large) { debug_putstr("Aborted memory entries scan (more than 4 memmap= args)!\n"); return 0; } - /* Make sure minimum is aligned. */ - minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN); + if (!process_efi_entries(minimum, image_size)) + process_e820_entries(minimum, image_size); - if (process_efi_entries(minimum, image_size)) - return slots_fetch_random(); + phys_addr = slots_fetch_random(); - process_e820_entries(minimum, image_size); - return slots_fetch_random(); + /* Perform a final check to make sure the address is in range. */ + if (phys_addr < minimum || phys_addr + image_size > mem_limit) { + warn("Invalid physical address chosen!\n"); + return 0; + } + + return (unsigned long)phys_addr; } static unsigned long find_random_virt_addr(unsigned long minimum, @@ -860,18 +831,12 @@ static unsigned long find_random_virt_addr(unsigned long minimum, { unsigned long slots, random_addr; - /* Make sure minimum is aligned. */ - minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN); - /* Align image_size for easy slot calculations. */ - image_size = ALIGN(image_size, CONFIG_PHYSICAL_ALIGN); - /* * There are how many CONFIG_PHYSICAL_ALIGN-sized slots * that can hold image_size within the range of minimum to * KERNEL_IMAGE_SIZE? */ - slots = (KERNEL_IMAGE_SIZE - minimum - image_size) / - CONFIG_PHYSICAL_ALIGN + 1; + slots = 1 + (KERNEL_IMAGE_SIZE - minimum - image_size) / CONFIG_PHYSICAL_ALIGN; random_addr = kaslr_get_random_long("Virtual") % slots; @@ -908,6 +873,11 @@ void choose_random_location(unsigned long input, /* Prepare to add new identity pagetables on demand. */ initialize_identity_maps(); + if (IS_ENABLED(CONFIG_X86_32)) + mem_limit = KERNEL_IMAGE_SIZE; + else + mem_limit = MAXMEM; + /* Record the various known unsafe memory ranges. */ mem_avoid_init(input, input_size, *output); @@ -917,6 +887,8 @@ void choose_random_location(unsigned long input, * location: */ min_addr = min(*output, 512UL << 20); + /* Make sure minimum is aligned. */ + min_addr = ALIGN(min_addr, CONFIG_PHYSICAL_ALIGN); /* Walk available memory entries to find a random address. */ random_addr = find_random_phys_addr(min_addr, output_size); diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 39e592d0e0b4..e478e40fbe5a 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -30,12 +30,9 @@ #define STATIC static /* - * Use normal definitions of mem*() from string.c. There are already - * included header files which expect a definition of memset() and by - * the time we define memset macro, it is too late. + * Provide definitions of memzero and memmove as some of the decompressors will + * try to define their own functions if these are not defined as macros. */ -#undef memcpy -#undef memset #define memzero(s, n) memset((s), 0, (n)) #define memmove memmove diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 726e264410ff..3efce27ba35c 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -70,8 +70,8 @@ int cmdline_find_option(const char *option, char *buffer, int bufsize); int cmdline_find_option_bool(const char *option); struct mem_vector { - unsigned long long start; - unsigned long long size; + u64 start; + u64 size; }; #if CONFIG_RANDOMIZE_BASE diff --git a/arch/x86/boot/string.h b/arch/x86/boot/string.h index 995f7b7ad512..a232da487cd2 100644 --- a/arch/x86/boot/string.h +++ b/arch/x86/boot/string.h @@ -11,10 +11,7 @@ void *memcpy(void *dst, const void *src, size_t len); void *memset(void *dst, int c, size_t len); int memcmp(const void *s1, const void *s2, size_t len); -/* - * Access builtin version by default. If one needs to use optimized version, - * do "undef memcpy" in .c file and link against right string.c - */ +/* Access builtin version by default. */ #define memcpy(d,s,l) __builtin_memcpy(d,s,l) #define memset(d,c,l) __builtin_memset(d,c,l) #define memcmp __builtin_memcmp diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 98e4d8886f11..ae9b0d4615b3 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -374,12 +374,14 @@ For 32-bit we have the following conventions - kernel is built with * Fetch the per-CPU GSBASE value for this processor and put it in @reg. * We normally use %gs for accessing per-CPU data, but we are setting up * %gs here and obviously can not use %gs itself to access per-CPU data. + * + * Do not use RDPID, because KVM loads guest's TSC_AUX on vm-entry and + * may not restore the host's value until the CPU returns to userspace. + * Thus the kernel would consume a guest's TSC_AUX if an NMI arrives + * while running KVM's run loop. */ .macro GET_PERCPU_BASE reg:req - ALTERNATIVE \ - "LOAD_CPU_AND_NODE_SEG_LIMIT \reg", \ - "RDPID \reg", \ - X86_FEATURE_RDPID + LOAD_CPU_AND_NODE_SEG_LIMIT \reg andq $VDSO_CPUNODE_MASK, \reg movq __per_cpu_offset(, \reg, 8), \reg .endm diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 48512c7944e7..2f84c7ca74ea 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -60,16 +60,10 @@ __visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs) #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) static __always_inline unsigned int syscall_32_enter(struct pt_regs *regs) { - unsigned int nr = (unsigned int)regs->orig_ax; - if (IS_ENABLED(CONFIG_IA32_EMULATION)) current_thread_info()->status |= TS_COMPAT; - /* - * Subtlety here: if ptrace pokes something larger than 2^32-1 into - * orig_ax, the unsigned int return value truncates it. This may - * or may not be necessary, but it matches the old asm behavior. - */ - return (unsigned int)syscall_enter_from_user_mode(regs, nr); + + return (unsigned int)regs->orig_ax; } /* @@ -91,15 +85,29 @@ __visible noinstr void do_int80_syscall_32(struct pt_regs *regs) { unsigned int nr = syscall_32_enter(regs); + /* + * Subtlety here: if ptrace pokes something larger than 2^32-1 into + * orig_ax, the unsigned int return value truncates it. This may + * or may not be necessary, but it matches the old asm behavior. + */ + nr = (unsigned int)syscall_enter_from_user_mode(regs, nr); + do_syscall_32_irqs_on(regs, nr); syscall_exit_to_user_mode(regs); } static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) { - unsigned int nr = syscall_32_enter(regs); + unsigned int nr = syscall_32_enter(regs); int res; + /* + * This cannot use syscall_enter_from_user_mode() as it has to + * fetch EBP before invoking any of the syscall entry work + * functions. + */ + syscall_enter_from_user_mode_prepare(regs); + instrumentation_begin(); /* Fetch EBP from where the vDSO stashed it. */ if (IS_ENABLED(CONFIG_X86_64)) { @@ -122,6 +130,9 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) return false; } + /* The case truncates any ptrace induced syscall nr > 2^32 -1 */ + nr = (unsigned int)syscall_enter_from_user_mode_work(regs, nr); + /* Now this is just like a normal syscall. */ do_syscall_32_irqs_on(regs, nr); syscall_exit_to_user_mode(regs); diff --git a/arch/x86/entry/thunk_32.S b/arch/x86/entry/thunk_32.S index 3a07ce3ec70b..f1f96d4d8cd6 100644 --- a/arch/x86/entry/thunk_32.S +++ b/arch/x86/entry/thunk_32.S @@ -29,11 +29,6 @@ SYM_CODE_START_NOALIGN(\name) SYM_CODE_END(\name) .endm -#ifdef CONFIG_TRACE_IRQFLAGS - THUNK trace_hardirqs_on_thunk,trace_hardirqs_on_caller,1 - THUNK trace_hardirqs_off_thunk,trace_hardirqs_off_caller,1 -#endif - #ifdef CONFIG_PREEMPTION THUNK preempt_schedule_thunk, preempt_schedule THUNK preempt_schedule_notrace_thunk, preempt_schedule_notrace diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 50963472ee85..31e6887d24f1 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4682,7 +4682,7 @@ __init int intel_pmu_init(void) case INTEL_FAM6_CORE2_MEROM: x86_add_quirk(intel_clovertown_quirk); - /* fall through */ + fallthrough; case INTEL_FAM6_CORE2_MEROM_L: case INTEL_FAM6_CORE2_PENRYN: @@ -5062,7 +5062,7 @@ __init int intel_pmu_init(void) case INTEL_FAM6_SKYLAKE_X: pmem = true; - /* fall through */ + fallthrough; case INTEL_FAM6_SKYLAKE_L: case INTEL_FAM6_SKYLAKE: case INTEL_FAM6_KABYLAKE_L: @@ -5114,7 +5114,7 @@ __init int intel_pmu_init(void) case INTEL_FAM6_ICELAKE_X: case INTEL_FAM6_ICELAKE_D: pmem = true; - /* fall through */ + fallthrough; case INTEL_FAM6_ICELAKE_L: case INTEL_FAM6_ICELAKE: case INTEL_FAM6_TIGERLAKE_L: diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 63f58bdf556c..8961653c5dd2 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -1268,7 +1268,7 @@ static int branch_type(unsigned long from, unsigned long to, int abort) ret = X86_BR_ZERO_CALL; break; } - /* fall through */ + fallthrough; case 0x9a: /* call far absolute */ ret = X86_BR_CALL; break; diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index cb94ba86efd2..6a4ca27b2c9e 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -390,6 +390,18 @@ static struct uncore_event_desc snb_uncore_imc_events[] = { INTEL_UNCORE_EVENT_DESC(data_writes.scale, "6.103515625e-5"), INTEL_UNCORE_EVENT_DESC(data_writes.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(gt_requests, "event=0x03"), + INTEL_UNCORE_EVENT_DESC(gt_requests.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(gt_requests.unit, "MiB"), + + INTEL_UNCORE_EVENT_DESC(ia_requests, "event=0x04"), + INTEL_UNCORE_EVENT_DESC(ia_requests.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(ia_requests.unit, "MiB"), + + INTEL_UNCORE_EVENT_DESC(io_requests, "event=0x05"), + INTEL_UNCORE_EVENT_DESC(io_requests.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(io_requests.unit, "MiB"), + { /* end: all zeroes */ }, }; @@ -405,13 +417,35 @@ static struct uncore_event_desc snb_uncore_imc_events[] = { #define SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE 0x5054 #define SNB_UNCORE_PCI_IMC_CTR_BASE SNB_UNCORE_PCI_IMC_DATA_READS_BASE +/* BW break down- legacy counters */ +#define SNB_UNCORE_PCI_IMC_GT_REQUESTS 0x3 +#define SNB_UNCORE_PCI_IMC_GT_REQUESTS_BASE 0x5040 +#define SNB_UNCORE_PCI_IMC_IA_REQUESTS 0x4 +#define SNB_UNCORE_PCI_IMC_IA_REQUESTS_BASE 0x5044 +#define SNB_UNCORE_PCI_IMC_IO_REQUESTS 0x5 +#define SNB_UNCORE_PCI_IMC_IO_REQUESTS_BASE 0x5048 + enum perf_snb_uncore_imc_freerunning_types { - SNB_PCI_UNCORE_IMC_DATA = 0, + SNB_PCI_UNCORE_IMC_DATA_READS = 0, + SNB_PCI_UNCORE_IMC_DATA_WRITES, + SNB_PCI_UNCORE_IMC_GT_REQUESTS, + SNB_PCI_UNCORE_IMC_IA_REQUESTS, + SNB_PCI_UNCORE_IMC_IO_REQUESTS, + SNB_PCI_UNCORE_IMC_FREERUNNING_TYPE_MAX, }; static struct freerunning_counters snb_uncore_imc_freerunning[] = { - [SNB_PCI_UNCORE_IMC_DATA] = { SNB_UNCORE_PCI_IMC_DATA_READS_BASE, 0x4, 0x0, 2, 32 }, + [SNB_PCI_UNCORE_IMC_DATA_READS] = { SNB_UNCORE_PCI_IMC_DATA_READS_BASE, + 0x0, 0x0, 1, 32 }, + [SNB_PCI_UNCORE_IMC_DATA_READS] = { SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE, + 0x0, 0x0, 1, 32 }, + [SNB_PCI_UNCORE_IMC_GT_REQUESTS] = { SNB_UNCORE_PCI_IMC_GT_REQUESTS_BASE, + 0x0, 0x0, 1, 32 }, + [SNB_PCI_UNCORE_IMC_IA_REQUESTS] = { SNB_UNCORE_PCI_IMC_IA_REQUESTS_BASE, + 0x0, 0x0, 1, 32 }, + [SNB_PCI_UNCORE_IMC_IO_REQUESTS] = { SNB_UNCORE_PCI_IMC_IO_REQUESTS_BASE, + 0x0, 0x0, 1, 32 }, }; static struct attribute *snb_uncore_imc_formats_attr[] = { @@ -525,6 +559,18 @@ static int snb_uncore_imc_event_init(struct perf_event *event) base = SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE; idx = UNCORE_PMC_IDX_FREERUNNING; break; + case SNB_UNCORE_PCI_IMC_GT_REQUESTS: + base = SNB_UNCORE_PCI_IMC_GT_REQUESTS_BASE; + idx = UNCORE_PMC_IDX_FREERUNNING; + break; + case SNB_UNCORE_PCI_IMC_IA_REQUESTS: + base = SNB_UNCORE_PCI_IMC_IA_REQUESTS_BASE; + idx = UNCORE_PMC_IDX_FREERUNNING; + break; + case SNB_UNCORE_PCI_IMC_IO_REQUESTS: + base = SNB_UNCORE_PCI_IMC_IO_REQUESTS_BASE; + idx = UNCORE_PMC_IDX_FREERUNNING; + break; default: return -EINVAL; } @@ -598,7 +644,7 @@ static struct intel_uncore_ops snb_uncore_imc_ops = { static struct intel_uncore_type snb_uncore_imc = { .name = "imc", - .num_counters = 2, + .num_counters = 5, .num_boxes = 1, .num_freerunning_types = SNB_PCI_UNCORE_IMC_FREERUNNING_TYPE_MAX, .mmio_map_size = SNB_UNCORE_PCI_IMC_MAP_SIZE, diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 2901d5df4366..83fc9d38eb1f 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -368,6 +368,7 @@ #define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */ #define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */ #define X86_FEATURE_SERIALIZE (18*32+14) /* SERIALIZE instruction */ +#define X86_FEATURE_TSXLDTRK (18*32+16) /* TSX Suspend Load Address Tracking */ #define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */ #define X86_FEATURE_ARCH_LBR (18*32+19) /* Intel ARCH LBR */ #define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */ diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index b9c2667ac46c..bc9758ef292e 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -81,11 +81,8 @@ extern unsigned long efi_fw_vendor, efi_config_table; kernel_fpu_end(); \ }) - #define arch_efi_call_virt(p, f, args...) p->f(args) -#define efi_ioremap(addr, size, type, attr) ioremap_cache(addr, size) - #else /* !CONFIG_X86_32 */ #define EFI_LOADER_SIGNATURE "EL64" @@ -125,9 +122,6 @@ struct efi_scratch { kernel_fpu_end(); \ }) -extern void __iomem *__init efi_ioremap(unsigned long addr, unsigned long size, - u32 type, u64 attribute); - #ifdef CONFIG_KASAN /* * CONFIG_KASAN may redefine memset to __memset. __memset function is present @@ -143,17 +137,13 @@ extern void __iomem *__init efi_ioremap(unsigned long addr, unsigned long size, #endif /* CONFIG_X86_32 */ extern struct efi_scratch efi_scratch; -extern void __init efi_set_executable(efi_memory_desc_t *md, bool executable); extern int __init efi_memblock_x86_reserve_range(void); extern void __init efi_print_memmap(void); -extern void __init efi_memory_uc(u64 addr, unsigned long size); extern void __init efi_map_region(efi_memory_desc_t *md); extern void __init efi_map_region_fixed(efi_memory_desc_t *md); extern void efi_sync_low_kernel_mappings(void); extern int __init efi_alloc_page_tables(void); extern int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages); -extern void __init old_map_region(efi_memory_desc_t *md); -extern void __init runtime_code_page_mkexec(void); extern void __init efi_runtime_update_mappings(void); extern void __init efi_dump_pagetable(void); extern void __init efi_apply_memmap_quirks(void); diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h index a8f9315b9eae..6fe54b2813c1 100644 --- a/arch/x86/include/asm/entry-common.h +++ b/arch/x86/include/asm/entry-common.h @@ -18,8 +18,16 @@ static __always_inline void arch_check_user_regs(struct pt_regs *regs) * state, not the interrupt state as imagined by Xen. */ unsigned long flags = native_save_fl(); - WARN_ON_ONCE(flags & (X86_EFLAGS_AC | X86_EFLAGS_DF | - X86_EFLAGS_NT)); + unsigned long mask = X86_EFLAGS_DF | X86_EFLAGS_NT; + + /* + * For !SMAP hardware we patch out CLAC on entry. + */ + if (boot_cpu_has(X86_FEATURE_SMAP) || + (IS_ENABLED(CONFIG_64_BIT) && boot_cpu_has(X86_FEATURE_XENPV))) + mask |= X86_EFLAGS_AC; + + WARN_ON_ONCE(flags & mask); /* We think we came from user mode. Make sure pt_regs agrees. */ WARN_ON_ONCE(!user_mode(regs)); diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 0a460f2a3f90..21a8b5259477 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -602,9 +602,7 @@ static inline u64 xgetbv(u32 index) { u32 eax, edx; - asm volatile(".byte 0x0f,0x01,0xd0" /* xgetbv */ - : "=a" (eax), "=d" (edx) - : "c" (index)); + asm volatile("xgetbv" : "=a" (eax), "=d" (edx) : "c" (index)); return eax + ((u64)edx << 32); } @@ -613,8 +611,7 @@ static inline void xsetbv(u32 index, u64 value) u32 eax = value; u32 edx = value >> 32; - asm volatile(".byte 0x0f,0x01,0xd1" /* xsetbv */ - : : "a" (eax), "d" (edx), "c" (index)); + asm volatile("xsetbv" :: "a" (eax), "d" (edx), "c" (index)); } #endif /* _ASM_X86_FPU_INTERNAL_H */ diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 0a301ad0b02f..9257667d13c5 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -59,5 +59,6 @@ typedef struct { } void leave_mm(int cpu); +#define leave_mm leave_mm #endif /* _ASM_X86_MMU_H */ diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 40aa69d04862..d8324a236696 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -327,8 +327,8 @@ static inline unsigned long regs_get_kernel_argument(struct pt_regs *regs, static const unsigned int argument_offs[] = { #ifdef __i386__ offsetof(struct pt_regs, ax), - offsetof(struct pt_regs, cx), offsetof(struct pt_regs, dx), + offsetof(struct pt_regs, cx), #define NR_REG_ARGUMENTS 3 #else offsetof(struct pt_regs, di), diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 59a3e13204c3..5999b0b3dd4a 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -234,6 +234,12 @@ static inline void clwb(volatile void *__p) #define nop() asm volatile ("nop") +static inline void serialize(void) +{ + /* Instruction opcode for SERIALIZE; supported in binutils >= 2.35. */ + asm volatile(".byte 0xf, 0x1, 0xe8" ::: "memory"); +} + #endif /* __KERNEL__ */ #endif /* _ASM_X86_SPECIAL_INSNS_H */ diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 8a1f5382a4ea..cf13f9e78585 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -150,14 +150,14 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define SVM_NESTED_CTL_NP_ENABLE BIT(0) #define SVM_NESTED_CTL_SEV_ENABLE BIT(1) -struct __attribute__ ((__packed__)) vmcb_seg { +struct vmcb_seg { u16 selector; u16 attrib; u32 limit; u64 base; -}; +} __packed; -struct __attribute__ ((__packed__)) vmcb_save_area { +struct vmcb_save_area { struct vmcb_seg es; struct vmcb_seg cs; struct vmcb_seg ss; @@ -200,20 +200,67 @@ struct __attribute__ ((__packed__)) vmcb_save_area { u64 br_to; u64 last_excp_from; u64 last_excp_to; -}; + /* + * The following part of the save area is valid only for + * SEV-ES guests when referenced through the GHCB. + */ + u8 reserved_7[104]; + u64 reserved_8; /* rax already available at 0x01f8 */ + u64 rcx; + u64 rdx; + u64 rbx; + u64 reserved_9; /* rsp already available at 0x01d8 */ + u64 rbp; + u64 rsi; + u64 rdi; + u64 r8; + u64 r9; + u64 r10; + u64 r11; + u64 r12; + u64 r13; + u64 r14; + u64 r15; + u8 reserved_10[16]; + u64 sw_exit_code; + u64 sw_exit_info_1; + u64 sw_exit_info_2; + u64 sw_scratch; + u8 reserved_11[56]; + u64 xcr0; + u8 valid_bitmap[16]; + u64 x87_state_gpa; +} __packed; + +struct ghcb { + struct vmcb_save_area save; + u8 reserved_save[2048 - sizeof(struct vmcb_save_area)]; + + u8 shared_buffer[2032]; + + u8 reserved_1[10]; + u16 protocol_version; /* negotiated SEV-ES/GHCB protocol version */ + u32 ghcb_usage; +} __packed; + + +#define EXPECTED_VMCB_SAVE_AREA_SIZE 1032 +#define EXPECTED_VMCB_CONTROL_AREA_SIZE 256 +#define EXPECTED_GHCB_SIZE PAGE_SIZE static inline void __unused_size_checks(void) { - BUILD_BUG_ON(sizeof(struct vmcb_save_area) != 0x298); - BUILD_BUG_ON(sizeof(struct vmcb_control_area) != 256); + BUILD_BUG_ON(sizeof(struct vmcb_save_area) != EXPECTED_VMCB_SAVE_AREA_SIZE); + BUILD_BUG_ON(sizeof(struct vmcb_control_area) != EXPECTED_VMCB_CONTROL_AREA_SIZE); + BUILD_BUG_ON(sizeof(struct ghcb) != EXPECTED_GHCB_SIZE); } -struct __attribute__ ((__packed__)) vmcb { +struct vmcb { struct vmcb_control_area control; u8 reserved_control[1024 - sizeof(struct vmcb_control_area)]; struct vmcb_save_area save; -}; +} __packed; #define SVM_CPUID_FUNC 0x8000000a @@ -298,4 +345,47 @@ struct __attribute__ ((__packed__)) vmcb { #define SVM_CR0_SELECTIVE_MASK (X86_CR0_TS | X86_CR0_MP) +/* GHCB Accessor functions */ + +#define GHCB_BITMAP_IDX(field) \ + (offsetof(struct vmcb_save_area, field) / sizeof(u64)) + +#define DEFINE_GHCB_ACCESSORS(field) \ + static inline bool ghcb_##field##_is_valid(const struct ghcb *ghcb) \ + { \ + return test_bit(GHCB_BITMAP_IDX(field), \ + (unsigned long *)&ghcb->save.valid_bitmap); \ + } \ + \ + static inline void ghcb_set_##field(struct ghcb *ghcb, u64 value) \ + { \ + __set_bit(GHCB_BITMAP_IDX(field), \ + (unsigned long *)&ghcb->save.valid_bitmap); \ + ghcb->save.field = value; \ + } + +DEFINE_GHCB_ACCESSORS(cpl) +DEFINE_GHCB_ACCESSORS(rip) +DEFINE_GHCB_ACCESSORS(rsp) +DEFINE_GHCB_ACCESSORS(rax) +DEFINE_GHCB_ACCESSORS(rcx) +DEFINE_GHCB_ACCESSORS(rdx) +DEFINE_GHCB_ACCESSORS(rbx) +DEFINE_GHCB_ACCESSORS(rbp) +DEFINE_GHCB_ACCESSORS(rsi) +DEFINE_GHCB_ACCESSORS(rdi) +DEFINE_GHCB_ACCESSORS(r8) +DEFINE_GHCB_ACCESSORS(r9) +DEFINE_GHCB_ACCESSORS(r10) +DEFINE_GHCB_ACCESSORS(r11) +DEFINE_GHCB_ACCESSORS(r12) +DEFINE_GHCB_ACCESSORS(r13) +DEFINE_GHCB_ACCESSORS(r14) +DEFINE_GHCB_ACCESSORS(r15) +DEFINE_GHCB_ACCESSORS(sw_exit_code) +DEFINE_GHCB_ACCESSORS(sw_exit_info_1) +DEFINE_GHCB_ACCESSORS(sw_exit_info_2) +DEFINE_GHCB_ACCESSORS(sw_scratch) +DEFINE_GHCB_ACCESSORS(xcr0) + #endif diff --git a/arch/x86/include/asm/sync_core.h b/arch/x86/include/asm/sync_core.h index fdb5b356e59b..0fd4a9dfb29c 100644 --- a/arch/x86/include/asm/sync_core.h +++ b/arch/x86/include/asm/sync_core.h @@ -5,6 +5,7 @@ #include <linux/preempt.h> #include <asm/processor.h> #include <asm/cpufeature.h> +#include <asm/special_insns.h> #ifdef CONFIG_X86_32 static inline void iret_to_self(void) @@ -46,22 +47,34 @@ static inline void iret_to_self(void) * * b) Text was modified on a different CPU, may subsequently be * executed on this CPU, and you want to make sure the new version - * gets executed. This generally means you're calling this in a IPI. + * gets executed. This generally means you're calling this in an IPI. * * If you're calling this for a different reason, you're probably doing * it wrong. + * + * Like all of Linux's memory ordering operations, this is a + * compiler barrier as well. */ static inline void sync_core(void) { /* - * There are quite a few ways to do this. IRET-to-self is nice - * because it works on every CPU, at any CPL (so it's compatible - * with paravirtualization), and it never exits to a hypervisor. - * The only down sides are that it's a bit slow (it seems to be - * a bit more than 2x slower than the fastest options) and that - * it unmasks NMIs. The "push %cs" is needed because, in - * paravirtual environments, __KERNEL_CS may not be a valid CS - * value when we do IRET directly. + * The SERIALIZE instruction is the most straightforward way to + * do this, but it is not universally available. + */ + if (static_cpu_has(X86_FEATURE_SERIALIZE)) { + serialize(); + return; + } + + /* + * For all other processors, there are quite a few ways to do this. + * IRET-to-self is nice because it works on every CPU, at any CPL + * (so it's compatible with paravirtualization), and it never exits + * to a hypervisor. The only downsides are that it's a bit slow + * (it seems to be a bit more than 2x slower than the fastest + * options) and that it unmasks NMIs. The "push %cs" is needed, + * because in paravirtual environments __KERNEL_CS may not be a + * valid CS value when we do IRET directly. * * In case NMI unmasking or performance ever becomes a problem, * the next best option appears to be MOV-to-CR2 and an @@ -71,9 +84,6 @@ static inline void sync_core(void) * CPUID is the conventional way, but it's nasty: it doesn't * exist on some 486-like CPUs, and it usually exits to a * hypervisor. - * - * Like all of Linux's memory ordering operations, this is a - * compiler barrier as well. */ iret_to_self(); } diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index c3daf0aaa0ee..cdaab30880b9 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -239,7 +239,7 @@ void __init arch_init_ideal_nops(void) return; } - /* fall through */ + fallthrough; default: #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 21325a4a78b9..779a89e31c4c 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -800,7 +800,7 @@ static int irq_polarity(int idx) return IOAPIC_POL_HIGH; case MP_IRQPOL_RESERVED: pr_warn("IOAPIC: Invalid polarity: 2, defaulting to low\n"); - /* fall through */ + fallthrough; case MP_IRQPOL_ACTIVE_LOW: default: /* Pointless default required due to do gcc stupidity */ return IOAPIC_POL_LOW; @@ -848,7 +848,7 @@ static int irq_trigger(int idx) return IOAPIC_EDGE; case MP_IRQTRIG_RESERVED: pr_warn("IOAPIC: Invalid trigger mode 2 defaulting to level\n"); - /* fall through */ + fallthrough; case MP_IRQTRIG_LEVEL: default: /* Pointless default required due to do gcc stupidity */ return IOAPIC_LEVEL; diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 7bda71def557..99ee61c9ba54 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -149,7 +149,7 @@ void __init default_setup_apic_routing(void) break; } /* P4 and above */ - /* fall through */ + fallthrough; case X86_VENDOR_HYGON: case X86_VENDOR_AMD: def_to_bigsmp = 1; diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index dae32d948bf2..f8a56b5dc29f 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -161,6 +161,7 @@ static void apic_update_vector(struct irq_data *irqd, unsigned int newvec, apicd->move_in_progress = true; apicd->prev_vector = apicd->vector; apicd->prev_cpu = apicd->cpu; + WARN_ON_ONCE(apicd->cpu == newcpu); } else { irq_matrix_free(vector_matrix, apicd->cpu, apicd->vector, managed); @@ -910,7 +911,7 @@ void send_cleanup_vector(struct irq_cfg *cfg) __send_cleanup_vector(apicd); } -static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector) +void irq_complete_move(struct irq_cfg *cfg) { struct apic_chip_data *apicd; @@ -918,15 +919,16 @@ static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector) if (likely(!apicd->move_in_progress)) return; - if (vector == apicd->vector && apicd->cpu == smp_processor_id()) + /* + * If the interrupt arrived on the new target CPU, cleanup the + * vector on the old target CPU. A vector check is not required + * because an interrupt can never move from one vector to another + * on the same CPU. + */ + if (apicd->cpu == smp_processor_id()) __send_cleanup_vector(apicd); } -void irq_complete_move(struct irq_cfg *cfg) -{ - __irq_complete_move(cfg, ~get_irq_regs()->orig_ax); -} - /* * Called from fixup_irqs() with @desc->lock held and interrupts disabled. */ diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index c7503be92f35..57074cf3ad7c 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -248,7 +248,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, switch (leaf) { case 1: l1 = &l1i; - /* fall through */ + fallthrough; case 0: if (!l1->val) return; diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c index 7843ab3fde09..3a44346f2276 100644 --- a/arch/x86/kernel/cpu/mce/inject.c +++ b/arch/x86/kernel/cpu/mce/inject.c @@ -199,7 +199,7 @@ static int raise_local(void) * calling irq_enter, but the necessary * machinery isn't exported currently. */ - /*FALL THROUGH*/ + fallthrough; case MCJ_CTX_PROCESS: raise_exception(m, NULL); break; diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index d8f9230d2034..abe9fe0fb851 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -193,7 +193,7 @@ unsigned long cmci_intel_adjust_timer(unsigned long interval) if (!atomic_sub_return(1, &cmci_storm_on_cpus)) pr_notice("CMCI storm subsided: switching to interrupt mode\n"); - /* FALLTHROUGH */ + fallthrough; case CMCI_STORM_SUBSIDED: /* diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c index 72182809b333..ca670919b561 100644 --- a/arch/x86/kernel/cpu/mtrr/cyrix.c +++ b/arch/x86/kernel/cpu/mtrr/cyrix.c @@ -98,7 +98,7 @@ cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg) case 7: if (size < 0x40) break; - /* Else, fall through */ + fallthrough; case 6: case 5: case 4: diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 8cdf29ffd95f..b98ff620ba77 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -349,7 +349,7 @@ static int arch_build_bp_info(struct perf_event *bp, hw->len = X86_BREAKPOINT_LEN_X; return 0; } - /* fall through */ + fallthrough; default: return -EINVAL; } diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 68acd30c6b87..c2f02f308ecf 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -450,7 +450,7 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, ptr = &remcomInBuffer[1]; if (kgdb_hex2long(&ptr, &addr)) linux_regs->ip = addr; - /* fall through */ + fallthrough; case 'D': case 'k': /* clear the trace bit */ @@ -539,7 +539,7 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd) * a system call which should be ignored */ return NOTIFY_DONE; - /* fall through */ + fallthrough; default: if (user_mode(regs)) return NOTIFY_DONE; diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 411af4aa7b51..baa21090c9be 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -312,7 +312,7 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type) case 2: if (i == 0 || i == 13) continue; /* IRQ0 & IRQ13 not connected */ - /* fall through */ + fallthrough; default: if (i == 2) continue; /* IRQ2 is never connected */ @@ -356,7 +356,7 @@ static void __init construct_ioapic_table(int mpc_default_type) default: pr_err("???\nUnknown standard configuration %d\n", mpc_default_type); - /* fall through */ + fallthrough; case 1: case 5: memcpy(bus.bustype, "ISA ", 6); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 994d8393f2f7..13ce616cc7af 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -684,9 +684,7 @@ void arch_cpu_idle(void) */ void __cpuidle default_idle(void) { - trace_cpu_idle_rcuidle(1, smp_processor_id()); safe_halt(); - trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); } #if defined(CONFIG_APM_MODULE) || defined(CONFIG_HALTPOLL_CPUIDLE_MODULE) EXPORT_SYMBOL(default_idle); @@ -792,7 +790,6 @@ static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c) static __cpuidle void mwait_idle(void) { if (!current_set_polling_and_test()) { - trace_cpu_idle_rcuidle(1, smp_processor_id()); if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) { mb(); /* quirk */ clflush((void *)¤t_thread_info()->flags); @@ -804,7 +801,6 @@ static __cpuidle void mwait_idle(void) __sti_mwait(0, 0); else local_irq_enable(); - trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); } else { local_irq_enable(); } diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 5679aa3fdcb8..e7537c5440bb 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -204,7 +204,7 @@ static int set_segment_reg(struct task_struct *task, case offsetof(struct user_regs_struct, ss): if (unlikely(value == 0)) return -EIO; - /* Else, fall through */ + fallthrough; default: *pt_regs_access(task_pt_regs(task), offset) = value; diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 0ec7ced727fe..a515e2d230b7 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -654,7 +654,7 @@ static void native_machine_emergency_restart(void) case BOOT_CF9_FORCE: port_cf9_safe = true; - /* Fall through */ + fallthrough; case BOOT_CF9_SAFE: if (port_cf9_safe) { diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index d5fa494c2304..be0d7d4152ec 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -726,7 +726,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) regs->ax = -EINTR; break; } - /* fallthrough */ + fallthrough; case -ERESTARTNOINTR: regs->ax = regs->orig_ax; regs->ip -= 2; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 27aa04a95702..f5ef689dd62a 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1594,14 +1594,28 @@ int native_cpu_disable(void) if (ret) return ret; - /* - * Disable the local APIC. Otherwise IPI broadcasts will reach - * it. It still responds normally to INIT, NMI, SMI, and SIPI - * messages. - */ - apic_soft_disable(); cpu_disable_common(); + /* + * Disable the local APIC. Otherwise IPI broadcasts will reach + * it. It still responds normally to INIT, NMI, SMI, and SIPI + * messages. + * + * Disabling the APIC must happen after cpu_disable_common() + * which invokes fixup_irqs(). + * + * Disabling the APIC preserves already set bits in IRR, but + * an interrupt arriving after disabling the local APIC does not + * set the corresponding IRR bit. + * + * fixup_irqs() scans IRR for set bits so it can raise a not + * yet handled interrupt on the new destination CPU via an IPI + * but obviously it can't do so for IRR bits which are not set. + * IOW, interrupts arriving after disabling the local APIC will + * be lost. + */ + apic_soft_disable(); + return 0; } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 1f66d2d1e998..81a2fb711091 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -729,20 +729,9 @@ static bool is_sysenter_singlestep(struct pt_regs *regs) #endif } -static __always_inline void debug_enter(unsigned long *dr6, unsigned long *dr7) +static __always_inline unsigned long debug_read_clear_dr6(void) { - /* - * Disable breakpoints during exception handling; recursive exceptions - * are exceedingly 'fun'. - * - * Since this function is NOKPROBE, and that also applies to - * HW_BREAKPOINT_X, we can't hit a breakpoint before this (XXX except a - * HW_BREAKPOINT_W on our stack) - * - * Entry text is excluded for HW_BP_X and cpu_entry_area, which - * includes the entry stack is excluded for everything. - */ - *dr7 = local_db_save(); + unsigned long dr6; /* * The Intel SDM says: @@ -755,15 +744,12 @@ static __always_inline void debug_enter(unsigned long *dr6, unsigned long *dr7) * * Keep it simple: clear DR6 immediately. */ - get_debugreg(*dr6, 6); + get_debugreg(dr6, 6); set_debugreg(0, 6); /* Filter out all the reserved bits which are preset to 1 */ - *dr6 &= ~DR6_RESERVED; -} + dr6 &= ~DR6_RESERVED; -static __always_inline void debug_exit(unsigned long dr7) -{ - local_db_restore(dr7); + return dr6; } /* @@ -863,6 +849,18 @@ out: static __always_inline void exc_debug_kernel(struct pt_regs *regs, unsigned long dr6) { + /* + * Disable breakpoints during exception handling; recursive exceptions + * are exceedingly 'fun'. + * + * Since this function is NOKPROBE, and that also applies to + * HW_BREAKPOINT_X, we can't hit a breakpoint before this (XXX except a + * HW_BREAKPOINT_W on our stack) + * + * Entry text is excluded for HW_BP_X and cpu_entry_area, which + * includes the entry stack is excluded for everything. + */ + unsigned long dr7 = local_db_save(); bool irq_state = idtentry_enter_nmi(regs); instrumentation_begin(); @@ -883,6 +881,8 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, instrumentation_end(); idtentry_exit_nmi(regs, irq_state); + + local_db_restore(dr7); } static __always_inline void exc_debug_user(struct pt_regs *regs, @@ -894,6 +894,15 @@ static __always_inline void exc_debug_user(struct pt_regs *regs, */ WARN_ON_ONCE(!user_mode(regs)); + /* + * NB: We can't easily clear DR7 here because + * idtentry_exit_to_usermode() can invoke ptrace, schedule, access + * user memory, etc. This means that a recursive #DB is possible. If + * this happens, that #DB will hit exc_debug_kernel() and clear DR7. + * Since we're not on the IST stack right now, everything will be + * fine. + */ + irqentry_enter_from_user_mode(regs); instrumentation_begin(); @@ -907,36 +916,24 @@ static __always_inline void exc_debug_user(struct pt_regs *regs, /* IST stack entry */ DEFINE_IDTENTRY_DEBUG(exc_debug) { - unsigned long dr6, dr7; - - debug_enter(&dr6, &dr7); - exc_debug_kernel(regs, dr6); - debug_exit(dr7); + exc_debug_kernel(regs, debug_read_clear_dr6()); } /* User entry, runs on regular task stack */ DEFINE_IDTENTRY_DEBUG_USER(exc_debug) { - unsigned long dr6, dr7; - - debug_enter(&dr6, &dr7); - exc_debug_user(regs, dr6); - debug_exit(dr7); + exc_debug_user(regs, debug_read_clear_dr6()); } #else /* 32 bit does not have separate entry points. */ DEFINE_IDTENTRY_RAW(exc_debug) { - unsigned long dr6, dr7; - - debug_enter(&dr6, &dr7); + unsigned long dr6 = debug_read_clear_dr6(); if (user_mode(regs)) exc_debug_user(regs, dr6); else exc_debug_kernel(regs, dr6); - - debug_exit(dr7); } #endif diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c index 8d5cbe1bbb3b..2c304fd0bb1a 100644 --- a/arch/x86/kernel/umip.c +++ b/arch/x86/kernel/umip.c @@ -45,11 +45,12 @@ * value that, lies close to the top of the kernel memory. The limit for the GDT * and the IDT are set to zero. * - * Given that SLDT and STR are not commonly used in programs that run on WineHQ - * or DOSEMU2, they are not emulated. - * - * The instruction smsw is emulated to return the value that the register CR0 + * The instruction SMSW is emulated to return the value that the register CR0 * has at boot time as set in the head_32. + * SLDT and STR are emulated to return the values that the kernel programmatically + * assigns: + * - SLDT returns (GDT_ENTRY_LDT * 8) if an LDT has been set, 0 if not. + * - STR returns (GDT_ENTRY_TSS * 8). * * Emulation is provided for both 32-bit and 64-bit processes. * @@ -244,16 +245,34 @@ static int emulate_umip_insn(struct insn *insn, int umip_inst, *data_size += UMIP_GDT_IDT_LIMIT_SIZE; memcpy(data, &dummy_limit, UMIP_GDT_IDT_LIMIT_SIZE); - } else if (umip_inst == UMIP_INST_SMSW) { - unsigned long dummy_value = CR0_STATE; + } else if (umip_inst == UMIP_INST_SMSW || umip_inst == UMIP_INST_SLDT || + umip_inst == UMIP_INST_STR) { + unsigned long dummy_value; + + if (umip_inst == UMIP_INST_SMSW) { + dummy_value = CR0_STATE; + } else if (umip_inst == UMIP_INST_STR) { + dummy_value = GDT_ENTRY_TSS * 8; + } else if (umip_inst == UMIP_INST_SLDT) { +#ifdef CONFIG_MODIFY_LDT_SYSCALL + down_read(¤t->mm->context.ldt_usr_sem); + if (current->mm->context.ldt) + dummy_value = GDT_ENTRY_LDT * 8; + else + dummy_value = 0; + up_read(¤t->mm->context.ldt_usr_sem); +#else + dummy_value = 0; +#endif + } /* - * Even though the CR0 register has 4 bytes, the number + * For these 3 instructions, the number * of bytes to be copied in the result buffer is determined * by whether the operand is a register or a memory location. * If operand is a register, return as many bytes as the operand * size. If operand is memory, return only the two least - * siginificant bytes of CR0. + * siginificant bytes. */ if (X86_MODRM_MOD(insn->modrm.value) == 3) *data_size = insn->opnd_bytes; @@ -261,7 +280,6 @@ static int emulate_umip_insn(struct insn *insn, int umip_inst, *data_size = 2; memcpy(data, &dummy_value, *data_size); - /* STR and SLDT are not emulated */ } else { return -EINVAL; } @@ -383,10 +401,6 @@ bool fixup_umip_exception(struct pt_regs *regs) umip_pr_warn(regs, "%s instruction cannot be used by applications.\n", umip_insns[umip_inst]); - /* Do not emulate (spoof) SLDT or STR. */ - if (umip_inst == UMIP_INST_STR || umip_inst == UMIP_INST_SLDT) - return false; - umip_pr_warn(regs, "For now, expensive software emulation returns the result.\n"); if (emulate_umip_insn(&insn, umip_inst, dummy_data, &dummy_data_size, diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 15e5aad8ac2c..3fdaa042823d 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -735,7 +735,7 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) * OPCODE1() of the "short" jmp which checks the same condition. */ opc1 = OPCODE2(insn) - 0x10; - /* fall through */ + fallthrough; default: if (!is_cond_jmp_opcode(opc1)) return -ENOSYS; @@ -892,7 +892,7 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, fix_ip_or_call = 0; break; } - /* fall through */ + fallthrough; default: riprel_analyze(auprobe, &insn); } diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 3fd6eec202d7..7456f9ad424b 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -371,7 +371,7 @@ void kvm_set_cpu_caps(void) F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) | F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) | F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM) | - F(SERIALIZE) + F(SERIALIZE) | F(TSXLDTRK) ); /* TSC_ADJUST and ARCH_CAPABILITIES are emulated in software. */ diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 1d450d7710d6..2f6510de6b0c 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3028,7 +3028,7 @@ static void string_registers_quirk(struct x86_emulate_ctxt *ctxt) case 0xa4: /* movsb */ case 0xa5: /* movsd/w */ *reg_rmw(ctxt, VCPU_REGS_RSI) &= (u32)-1; - /* fall through */ + fallthrough; case 0xaa: /* stosb */ case 0xab: /* stosd/w */ *reg_rmw(ctxt, VCPU_REGS_RDI) &= (u32)-1; diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 814d3aee5cef..1d330564eed8 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1779,7 +1779,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) ret = kvm_hvcall_signal_event(vcpu, fast, ingpa); if (ret != HV_STATUS_INVALID_PORT_ID) break; - /* fall through - maybe userspace knows this conn_id. */ + fallthrough; /* maybe userspace knows this conn_id */ case HVCALL_POST_MESSAGE: /* don't bother userspace if it has no way to handle it */ if (unlikely(rep || !vcpu_to_synic(vcpu)->active)) { diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index c47d2acec529..4aa1c2e00e2a 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -285,7 +285,7 @@ int kvm_set_routing_entry(struct kvm *kvm, switch (ue->u.irqchip.irqchip) { case KVM_IRQCHIP_PIC_SLAVE: e->irqchip.pin += PIC_NUM_PINS / 2; - /* fall through */ + fallthrough; case KVM_IRQCHIP_PIC_MASTER: if (ue->u.irqchip.pin >= PIC_NUM_PINS / 2) return -EINVAL; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 5ccbee7165a2..35cca2e0c802 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1053,7 +1053,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, switch (delivery_mode) { case APIC_DM_LOWEST: vcpu->arch.apic_arb_prio++; - /* fall through */ + fallthrough; case APIC_DM_FIXED: if (unlikely(trig_mode && !level)) break; @@ -1341,7 +1341,7 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) break; case APIC_TASKPRI: report_tpr_access(apic, false); - /* fall thru */ + fallthrough; default: val = kvm_lapic_get_reg(apic, offset); break; @@ -2027,7 +2027,7 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) case APIC_LVT0: apic_manage_nmi_watchdog(apic, val); - /* fall through */ + fallthrough; case APIC_LVTTHMR: case APIC_LVTPC: case APIC_LVT1: diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 76c5826e29a2..71aa3da2a0b7 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4422,7 +4422,7 @@ __reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, rsvd_bits(maxphyaddr, 51); rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4]; - /* fall through */ + fallthrough; case PT64_ROOT_4LEVEL: rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd | nonleaf_bit8_rsvd | rsvd_bits(7, 7) | diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index e90bc436f584..598a769f1961 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -1062,10 +1062,14 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, struct vmcb *hsave = svm->nested.hsave; struct vmcb __user *user_vmcb = (struct vmcb __user *) &user_kvm_nested_state->data.svm[0]; - struct vmcb_control_area ctl; - struct vmcb_save_area save; + struct vmcb_control_area *ctl; + struct vmcb_save_area *save; + int ret; u32 cr0; + BUILD_BUG_ON(sizeof(struct vmcb_control_area) + sizeof(struct vmcb_save_area) > + KVM_STATE_NESTED_SVM_VMCB_SIZE); + if (kvm_state->format != KVM_STATE_NESTED_FORMAT_SVM) return -EINVAL; @@ -1097,13 +1101,22 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, return -EINVAL; if (kvm_state->size < sizeof(*kvm_state) + KVM_STATE_NESTED_SVM_VMCB_SIZE) return -EINVAL; - if (copy_from_user(&ctl, &user_vmcb->control, sizeof(ctl))) - return -EFAULT; - if (copy_from_user(&save, &user_vmcb->save, sizeof(save))) - return -EFAULT; - if (!nested_vmcb_check_controls(&ctl)) - return -EINVAL; + ret = -ENOMEM; + ctl = kzalloc(sizeof(*ctl), GFP_KERNEL); + save = kzalloc(sizeof(*save), GFP_KERNEL); + if (!ctl || !save) + goto out_free; + + ret = -EFAULT; + if (copy_from_user(ctl, &user_vmcb->control, sizeof(*ctl))) + goto out_free; + if (copy_from_user(save, &user_vmcb->save, sizeof(*save))) + goto out_free; + + ret = -EINVAL; + if (!nested_vmcb_check_controls(ctl)) + goto out_free; /* * Processor state contains L2 state. Check that it is @@ -1111,15 +1124,15 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, */ cr0 = kvm_read_cr0(vcpu); if (((cr0 & X86_CR0_CD) == 0) && (cr0 & X86_CR0_NW)) - return -EINVAL; + goto out_free; /* * Validate host state saved from before VMRUN (see * nested_svm_check_permissions). * TODO: validate reserved bits for all saved state. */ - if (!(save.cr0 & X86_CR0_PG)) - return -EINVAL; + if (!(save->cr0 & X86_CR0_PG)) + goto out_free; /* * All checks done, we can enter guest mode. L1 control fields @@ -1128,10 +1141,10 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, * contains saved L1 state. */ copy_vmcb_control_area(&hsave->control, &svm->vmcb->control); - hsave->save = save; + hsave->save = *save; svm->nested.vmcb = kvm_state->hdr.svm.vmcb_pa; - load_nested_vmcb_control(svm, &ctl); + load_nested_vmcb_control(svm, ctl); nested_prepare_vmcb_control(svm); if (!nested_svm_vmrun_msrpm(svm)) @@ -1139,7 +1152,13 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, out_set_gif: svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET)); - return 0; + + ret = 0; +out_free: + kfree(save); + kfree(ctl); + + return ret; } struct kvm_x86_nested_ops svm_nested_ops = { diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 5764b87379cf..c91acabf18d0 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2668,7 +2668,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) case MSR_IA32_APICBASE: if (kvm_vcpu_apicv_active(vcpu)) avic_update_vapic_bar(to_svm(vcpu), data); - /* Fall through */ + fallthrough; default: return kvm_set_msr_common(vcpu, msr); } @@ -4170,6 +4170,8 @@ static struct kvm_x86_init_ops svm_init_ops __initdata = { static int __init svm_init(void) { + __unused_size_checks(); + return kvm_init(&svm_init_ops, sizeof(struct vcpu_svm), __alignof__(struct vcpu_svm), THIS_MODULE); } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 75cd720c9e8c..8646a797b7a8 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4654,7 +4654,7 @@ static bool rmode_exception(struct kvm_vcpu *vcpu, int vec) vmcs_read32(VM_EXIT_INSTRUCTION_LEN); if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) return false; - /* fall through */ + fallthrough; case DB_VECTOR: return !(vcpu->guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)); @@ -4827,7 +4827,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) } kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1 | DR6_RTM; kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7); - /* fall through */ + fallthrough; case BP_VECTOR: /* * Update instruction length as we may reinject #BP from @@ -5257,7 +5257,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) error_code = vmcs_read32(IDT_VECTORING_ERROR_CODE); } - /* fall through */ + fallthrough; case INTR_TYPE_SOFT_EXCEPTION: kvm_clear_exception_queue(vcpu); break; @@ -5610,7 +5610,7 @@ static int handle_invpcid(struct kvm_vcpu *vcpu) * keeping track of global entries in shadow page tables. */ - /* fall-through */ + fallthrough; case INVPCID_TYPE_ALL_INCL_GLOBAL: kvm_mmu_unload(vcpu); return kvm_skip_emulated_instruction(vcpu); @@ -6579,7 +6579,7 @@ static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, break; case INTR_TYPE_SOFT_EXCEPTION: vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); - /* fall through */ + fallthrough; case INTR_TYPE_HARD_EXCEPTION: if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { u32 err = vmcs_read32(error_code_field); @@ -6589,7 +6589,7 @@ static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, break; case INTR_TYPE_SOFT_INTR: vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); - /* fall through */ + fallthrough; case INTR_TYPE_EXT_INTR: kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR); break; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e3de0fe5af37..1994602a0851 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1116,14 +1116,12 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) vcpu->arch.eff_db[dr] = val; break; case 4: - /* fall through */ case 6: if (!kvm_dr6_valid(val)) return -1; /* #GP */ vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu); break; case 5: - /* fall through */ default: /* 7 */ if (!kvm_dr7_valid(val)) return -1; /* #GP */ @@ -1154,12 +1152,10 @@ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) *val = vcpu->arch.db[array_index_nospec(dr, size)]; break; case 4: - /* fall through */ case 6: *val = vcpu->arch.dr6; break; case 5: - /* fall through */ default: /* 7 */ *val = vcpu->arch.dr7; break; @@ -3051,7 +3047,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: - pr = true; /* fall through */ + pr = true; + fallthrough; case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1: if (kvm_pmu_is_valid_msr(vcpu, msr)) @@ -4362,7 +4359,7 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, case KVM_CAP_HYPERV_SYNIC2: if (cap->args[0]) return -EINVAL; - /* fall through */ + fallthrough; case KVM_CAP_HYPERV_SYNIC: if (!irqchip_in_kernel(vcpu->kvm)) @@ -8675,7 +8672,7 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) vcpu->arch.pv.pv_unhalted = false; vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; - /* fall through */ + fallthrough; case KVM_MP_STATE_RUNNABLE: vcpu->arch.apf.halted = false; break; diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index d46fff11f06f..aa067859a70b 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -24,7 +24,7 @@ ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_cmdline.o = -pg endif -CFLAGS_cmdline.o := -fno-stack-protector +CFLAGS_cmdline.o := -fno-stack-protector -fno-jump-tables endif inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c index 4f1719e22d3c..b6da09339308 100644 --- a/arch/x86/lib/cmdline.c +++ b/arch/x86/lib/cmdline.c @@ -58,7 +58,7 @@ __cmdline_find_option_bool(const char *cmdline, int max_cmdline_size, state = st_wordcmp; opptr = option; wstart = pos; - /* fall through */ + fallthrough; case st_wordcmp: if (!*opptr) { @@ -89,7 +89,7 @@ __cmdline_find_option_bool(const char *cmdline, int max_cmdline_size, break; } state = st_wordskip; - /* fall through */ + fallthrough; case st_wordskip: if (!c) @@ -151,7 +151,7 @@ __cmdline_find_option(const char *cmdline, int max_cmdline_size, state = st_wordcmp; opptr = option; - /* fall through */ + fallthrough; case st_wordcmp: if ((c == '=') && !*opptr) { @@ -172,7 +172,7 @@ __cmdline_find_option(const char *cmdline, int max_cmdline_size, break; } state = st_wordskip; - /* fall through */ + fallthrough; case st_wordskip: if (myisspace(c)) diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c index 31600d851fd8..5e69603ff63f 100644 --- a/arch/x86/lib/insn-eval.c +++ b/arch/x86/lib/insn-eval.c @@ -179,7 +179,7 @@ static int resolve_default_seg(struct insn *insn, struct pt_regs *regs, int off) if (insn->addr_bytes == 2) return -EINVAL; - /* fall through */ + fallthrough; case -EDOM: case offsetof(struct pt_regs, bx): @@ -362,7 +362,6 @@ static short get_segment_selector(struct pt_regs *regs, int seg_reg_idx) case INAT_SEG_REG_GS: return vm86regs->gs; case INAT_SEG_REG_IGNORE: - /* fall through */ default: return -EINVAL; } @@ -386,7 +385,6 @@ static short get_segment_selector(struct pt_regs *regs, int seg_reg_idx) */ return get_user_gs(regs); case INAT_SEG_REG_IGNORE: - /* fall through */ default: return -EINVAL; } @@ -786,7 +784,7 @@ int insn_get_code_seg_params(struct pt_regs *regs) */ return INSN_CODE_SEG_PARAMS(4, 8); case 3: /* Invalid setting. CS.L=1, CS.D=1 */ - /* fall through */ + fallthrough; default: return -EINVAL; } diff --git a/arch/x86/math-emu/errors.c b/arch/x86/math-emu/errors.c index 73dc66d887f3..ec071cbb0804 100644 --- a/arch/x86/math-emu/errors.c +++ b/arch/x86/math-emu/errors.c @@ -186,7 +186,7 @@ void FPU_printall(void) case TAG_Special: /* Update tagi for the printk below */ tagi = FPU_Special(r); - /* fall through */ + fallthrough; case TAG_Valid: printk("st(%d) %c .%04lx %04lx %04lx %04lx e%+-6d ", i, getsign(r) ? '-' : '+', diff --git a/arch/x86/math-emu/fpu_trig.c b/arch/x86/math-emu/fpu_trig.c index 127ea54122d7..4a9887851ad8 100644 --- a/arch/x86/math-emu/fpu_trig.c +++ b/arch/x86/math-emu/fpu_trig.c @@ -1352,7 +1352,7 @@ static void fyl2xp1(FPU_REG *st0_ptr, u_char st0_tag) case TW_Denormal: if (denormal_operand() < 0) return; - /* fall through */ + fallthrough; case TAG_Zero: case TAG_Valid: setsign(st0_ptr, getsign(st0_ptr) ^ getsign(st1_ptr)); diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 35f1498e9832..6e3e8a124903 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -190,6 +190,53 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) return pmd_k; } +/* + * Handle a fault on the vmalloc or module mapping area + * + * This is needed because there is a race condition between the time + * when the vmalloc mapping code updates the PMD to the point in time + * where it synchronizes this update with the other page-tables in the + * system. + * + * In this race window another thread/CPU can map an area on the same + * PMD, finds it already present and does not synchronize it with the + * rest of the system yet. As a result v[mz]alloc might return areas + * which are not mapped in every page-table in the system, causing an + * unhandled page-fault when they are accessed. + */ +static noinline int vmalloc_fault(unsigned long address) +{ + unsigned long pgd_paddr; + pmd_t *pmd_k; + pte_t *pte_k; + + /* Make sure we are in vmalloc area: */ + if (!(address >= VMALLOC_START && address < VMALLOC_END)) + return -1; + + /* + * Synchronize this task's top level page-table + * with the 'reference' page table. + * + * Do _not_ use "current" here. We might be inside + * an interrupt in the middle of a task switch.. + */ + pgd_paddr = read_cr3_pa(); + pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); + if (!pmd_k) + return -1; + + if (pmd_large(*pmd_k)) + return 0; + + pte_k = pte_offset_kernel(pmd_k, address); + if (!pte_present(*pte_k)) + return -1; + + return 0; +} +NOKPROBE_SYMBOL(vmalloc_fault); + void arch_sync_kernel_mappings(unsigned long start, unsigned long end) { unsigned long addr; @@ -1110,6 +1157,37 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code, */ WARN_ON_ONCE(hw_error_code & X86_PF_PK); +#ifdef CONFIG_X86_32 + /* + * We can fault-in kernel-space virtual memory on-demand. The + * 'reference' page table is init_mm.pgd. + * + * NOTE! We MUST NOT take any locks for this case. We may + * be in an interrupt or a critical region, and should + * only copy the information from the master page table, + * nothing more. + * + * Before doing this on-demand faulting, ensure that the + * fault is not any of the following: + * 1. A fault on a PTE with a reserved bit set. + * 2. A fault caused by a user-mode access. (Do not demand- + * fault kernel memory due to user-mode accesses). + * 3. A fault caused by a page-level protection violation. + * (A demand fault would be on a non-present page which + * would have X86_PF_PROT==0). + * + * This is only needed to close a race condition on x86-32 in + * the vmalloc mapping/unmapping code. See the comment above + * vmalloc_fault() for details. On x86-64 the race does not + * exist as the vmalloc mappings don't need to be synchronized + * there. + */ + if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) { + if (vmalloc_fault(address) >= 0) + return; + } +#endif + /* Was the fault spurious, caused by lazy TLB invalidation? */ if (spurious_kernel_fault(hw_error_code, address)) return; diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 84d85dbd1dad..9e5ccc56f8e0 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -574,7 +574,7 @@ static bool memremap_should_map_decrypted(resource_size_t phys_addr, /* For SEV, these areas are encrypted */ if (sev_active()) break; - /* Fallthrough */ + fallthrough; case E820_TYPE_PRAM: return true; diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index c5174b4e318b..683cd12f4793 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -321,7 +321,7 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei, u64 addr, u64 max_addr, u64 size) { return split_nodes_size_interleave_uniform(ei, pi, addr, max_addr, size, - 0, NULL, NUMA_NO_NODE); + 0, NULL, 0); } static int __init setup_emu2phys_nid(int *dfl_phys_nid) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 1a3569b43aa5..0951b47e64c1 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -555,21 +555,12 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); load_new_mm_cr3(next->pgd, new_asid, true); - /* - * NB: This gets called via leave_mm() in the idle path - * where RCU functions differently. Tracing normally - * uses RCU, so we need to use the _rcuidle variant. - * - * (There is no good reason for this. The idle code should - * be rearranged to call this before rcu_idle_enter().) - */ - trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); } else { /* The new ASID is already up to date. */ load_new_mm_cr3(next->pgd, new_asid, false); - /* See above wrt _rcuidle. */ - trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0); } /* Make sure we write CR3 before loaded_mm. */ diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 9f9aad42ccff..89395a5049bb 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -26,6 +26,7 @@ #include <asm/xen/pci.h> #include <asm/xen/cpuid.h> #include <asm/apic.h> +#include <asm/acpi.h> #include <asm/i8259.h> static int xen_pcifront_enable_irq(struct pci_dev *dev) diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index f6ea8f1a9d57..d37ebe6e70d7 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -49,7 +49,6 @@ #include <asm/efi.h> #include <asm/e820/api.h> #include <asm/time.h> -#include <asm/set_memory.h> #include <asm/tlbflush.h> #include <asm/x86_init.h> #include <asm/uv/uv.h> @@ -496,74 +495,6 @@ void __init efi_init(void) efi_print_memmap(); } -#if defined(CONFIG_X86_32) - -void __init efi_set_executable(efi_memory_desc_t *md, bool executable) -{ - u64 addr, npages; - - addr = md->virt_addr; - npages = md->num_pages; - - memrange_efi_to_native(&addr, &npages); - - if (executable) - set_memory_x(addr, npages); - else - set_memory_nx(addr, npages); -} - -void __init runtime_code_page_mkexec(void) -{ - efi_memory_desc_t *md; - - /* Make EFI runtime service code area executable */ - for_each_efi_memory_desc(md) { - if (md->type != EFI_RUNTIME_SERVICES_CODE) - continue; - - efi_set_executable(md, true); - } -} - -void __init efi_memory_uc(u64 addr, unsigned long size) -{ - unsigned long page_shift = 1UL << EFI_PAGE_SHIFT; - u64 npages; - - npages = round_up(size, page_shift) / page_shift; - memrange_efi_to_native(&addr, &npages); - set_memory_uc(addr, npages); -} - -void __init old_map_region(efi_memory_desc_t *md) -{ - u64 start_pfn, end_pfn, end; - unsigned long size; - void *va; - - start_pfn = PFN_DOWN(md->phys_addr); - size = md->num_pages << PAGE_SHIFT; - end = md->phys_addr + size; - end_pfn = PFN_UP(end); - - if (pfn_range_is_mapped(start_pfn, end_pfn)) { - va = __va(md->phys_addr); - - if (!(md->attribute & EFI_MEMORY_WB)) - efi_memory_uc((u64)(unsigned long)va, size); - } else - va = efi_ioremap(md->phys_addr, size, - md->type, md->attribute); - - md->virt_addr = (u64) (unsigned long) va; - if (!va) - pr_err("ioremap of 0x%llX failed!\n", - (unsigned long long)md->phys_addr); -} - -#endif - /* Merge contiguous regions of the same type and attribute */ static void __init efi_merge_regions(void) { diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c index 826ead67753d..e06a199423c0 100644 --- a/arch/x86/platform/efi/efi_32.c +++ b/arch/x86/platform/efi/efi_32.c @@ -29,9 +29,35 @@ #include <asm/io.h> #include <asm/desc.h> #include <asm/page.h> +#include <asm/set_memory.h> #include <asm/tlbflush.h> #include <asm/efi.h> +void __init efi_map_region(efi_memory_desc_t *md) +{ + u64 start_pfn, end_pfn, end; + unsigned long size; + void *va; + + start_pfn = PFN_DOWN(md->phys_addr); + size = md->num_pages << PAGE_SHIFT; + end = md->phys_addr + size; + end_pfn = PFN_UP(end); + + if (pfn_range_is_mapped(start_pfn, end_pfn)) { + va = __va(md->phys_addr); + + if (!(md->attribute & EFI_MEMORY_WB)) + set_memory_uc((unsigned long)va, md->num_pages); + } else { + va = ioremap_cache(md->phys_addr, size); + } + + md->virt_addr = (unsigned long)va; + if (!va) + pr_err("ioremap of 0x%llX failed!\n", md->phys_addr); +} + /* * To make EFI call EFI runtime service in physical addressing mode we need * prolog/epilog before/after the invocation to claim the EFI runtime service @@ -58,11 +84,6 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) return 0; } -void __init efi_map_region(efi_memory_desc_t *md) -{ - old_map_region(md); -} - void __init efi_map_region_fixed(efi_memory_desc_t *md) {} void __init parse_efi_setup(u64 phys_addr, u32 data_len) {} @@ -107,6 +128,15 @@ efi_status_t __init efi_set_virtual_address_map(unsigned long memory_map_size, void __init efi_runtime_update_mappings(void) { - if (__supported_pte_mask & _PAGE_NX) - runtime_code_page_mkexec(); + if (__supported_pte_mask & _PAGE_NX) { + efi_memory_desc_t *md; + + /* Make EFI runtime service code area executable */ + for_each_efi_memory_desc(md) { + if (md->type != EFI_RUNTIME_SERVICES_CODE) + continue; + + set_memory_x(md->virt_addr, md->num_pages); + } + } } diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 413583f904a6..6af4da1149ba 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -259,6 +259,8 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) npages = (__end_rodata - __start_rodata) >> PAGE_SHIFT; rodata = __pa(__start_rodata); pfn = rodata >> PAGE_SHIFT; + + pf = _PAGE_NX | _PAGE_ENC; if (kernel_map_pages_in_pgd(pgd, pfn, rodata, npages, pf)) { pr_err("Failed to map kernel rodata 1:1\n"); return 1; |