diff options
Diffstat (limited to 'arch/x86')
291 files changed, 11824 insertions, 2699 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ace79d2da2c3..2fa55851d2a9 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -22,6 +22,7 @@ config X86 select ANON_INODES select ARCH_CLOCKSOURCE_DATA select ARCH_DISCARD_MEMBLOCK + select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS select ARCH_HAS_DEVMEM_IS_ALLOWED @@ -49,7 +50,6 @@ config X86 select ARCH_WANTS_DYNAMIC_TASK_STRUCT select ARCH_WANT_FRAME_POINTERS select ARCH_WANT_IPC_PARSE_VERSION if X86_32 - select ARCH_WANT_OPTIONAL_GPIOLIB select BUILDTIME_EXTABLE_SORT select CLKEVT_I8253 select CLKSRC_I8253 if X86_32 @@ -105,6 +105,7 @@ config X86 select HAVE_DYNAMIC_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_REGS select HAVE_EFFICIENT_UNALIGNED_ACCESS + select HAVE_EXIT_THREAD select HAVE_FENTRY if X86_64 select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_GRAPH_FP_TEST @@ -130,6 +131,7 @@ config X86 select HAVE_MEMBLOCK select HAVE_MEMBLOCK_NODE_MAP select HAVE_MIXED_BREAKPOINTS_REGS + select HAVE_NMI select HAVE_OPROFILE select HAVE_OPTPROBES select HAVE_PCSPKR_PLATFORM @@ -292,11 +294,6 @@ config X86_32_LAZY_GS def_bool y depends on X86_32 && !CC_STACKPROTECTOR -config ARCH_HWEIGHT_CFLAGS - string - default "-fcall-saved-ecx -fcall-saved-edx" if X86_32 - default "-fcall-saved-rdi -fcall-saved-rsi -fcall-saved-rdx -fcall-saved-rcx -fcall-saved-r8 -fcall-saved-r9 -fcall-saved-r10 -fcall-saved-r11" if X86_64 - config ARCH_SUPPORTS_UPROBES def_bool y @@ -641,7 +638,7 @@ config STA2X11 select X86_DMA_REMAP select SWIOTLB select MFD_STA2X11 - select ARCH_REQUIRE_GPIOLIB + select GPIOLIB default n ---help--- This adds support for boards based on the STA2X11 IO-Hub, @@ -1932,21 +1929,26 @@ config RANDOMIZE_BASE attempts relying on knowledge of the location of kernel code internals. - The kernel physical and virtual address can be randomized - from 16MB up to 1GB on 64-bit and 512MB on 32-bit. 
(Note that - using RANDOMIZE_BASE reduces the memory space available to - kernel modules from 1.5GB to 1GB.) + On 64-bit, the kernel physical and virtual addresses are + randomized separately. The physical address will be anywhere + between 16MB and the top of physical memory (up to 64TB). The + virtual address will be randomized from 16MB up to 1GB (9 bits + of entropy). Note that this also reduces the memory space + available to kernel modules from 1.5GB to 1GB. + + On 32-bit, the kernel physical and virtual addresses are + randomized together. They will be randomized from 16MB up to + 512MB (8 bits of entropy). Entropy is generated using the RDRAND instruction if it is supported. If RDTSC is supported, its value is mixed into the entropy pool as well. If neither RDRAND nor RDTSC are - supported, then entropy is read from the i8254 timer. - - Since the kernel is built using 2GB addressing, and - PHYSICAL_ALIGN must be at a minimum of 2MB, only 10 bits of - entropy is theoretically possible. Currently, with the - default value for PHYSICAL_ALIGN and due to page table - layouts, 64-bit uses 9 bits of entropy and 32-bit uses 8 bits. + supported, then entropy is read from the i8254 timer. The + usable entropy is limited by the kernel being built using + 2GB addressing, and that PHYSICAL_ALIGN must be at a + minimum of 2MB. As a result, only 10 bits of entropy are + theoretically possible, but the implementations are further + limited due to memory layouts. If CONFIG_HIBERNATE is also enabled, KASLR is disabled at boot time. To enable it, boot with "kaslr" on the kernel command @@ -1986,6 +1988,38 @@ config PHYSICAL_ALIGN Don't change this unless you know what you are doing. +config RANDOMIZE_MEMORY + bool "Randomize the kernel memory sections" + depends on X86_64 + depends on RANDOMIZE_BASE + default RANDOMIZE_BASE + ---help--- + Randomizes the base virtual address of kernel memory sections + (physical memory mapping, vmalloc & vmemmap). 
This security feature + makes exploits relying on predictable memory locations less reliable. + + The order of allocations remains unchanged. Entropy is generated in + the same way as RANDOMIZE_BASE. The current implementation in the optimal + configuration has on average 30,000 different possible virtual + addresses for each memory section. + + If unsure, say N. + +config RANDOMIZE_MEMORY_PHYSICAL_PADDING + hex "Physical memory mapping padding" if EXPERT + depends on RANDOMIZE_MEMORY + default "0xa" if MEMORY_HOTPLUG + default "0x0" + range 0x1 0x40 if MEMORY_HOTPLUG + range 0x0 0x40 + ---help--- + Define the padding in terabytes added to the existing physical + memory size during kernel memory randomization. It is useful + for memory hotplug support but reduces the entropy available for + address randomization. + + If unsure, leave at the default value. + config HOTPLUG_CPU bool "Support for hot-pluggable CPUs" depends on SMP @@ -2437,6 +2471,15 @@ config PCI_CNB20LE_QUIRK source "drivers/pci/Kconfig" +config ISA_BUS + bool "ISA-style bus support on modern systems" if EXPERT + select ISA_BUS_API + help + Enables ISA-style drivers on modern systems. This is necessary to + support PC/104 devices on X86_64 platforms. + + If unsure, say N. + # x86_64 have no ISA slots, but can have ISA-style DMA. config ISA_DMA_API bool "ISA-style DMA support" if (X86_64 && EXPERT) diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 6fce7f096b88..830ed391e7ef 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -126,14 +126,6 @@ else KBUILD_CFLAGS += $(call cc-option,-maccumulate-outgoing-args) endif -# Make sure compiler does not have buggy stack-protector support. 
-ifdef CONFIG_CC_STACKPROTECTOR - cc_has_sp := $(srctree)/scripts/gcc-x86_$(BITS)-has-stack-protector.sh - ifneq ($(shell $(CONFIG_SHELL) $(cc_has_sp) $(CC) $(KBUILD_CPPFLAGS) $(biarch)),y) - $(warning stack-protector enabled but compiler support broken) - endif -endif - ifdef CONFIG_X86_X32 x32_ld_ok := $(call try-run,\ /bin/echo -e '1: .quad 1b' | \ diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 700a9c6e6159..be8e688fa0d4 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -162,6 +162,9 @@ isoimage: $(obj)/bzImage for i in lib lib64 share end ; do \ if [ -f /usr/$$i/syslinux/isolinux.bin ] ; then \ cp /usr/$$i/syslinux/isolinux.bin $(obj)/isoimage ; \ + if [ -f /usr/$$i/syslinux/ldlinux.c32 ]; then \ + cp /usr/$$i/syslinux/ldlinux.c32 $(obj)/isoimage ; \ + fi ; \ break ; \ fi ; \ if [ $$i = end ] ; then exit 1 ; fi ; \ diff --git a/arch/x86/boot/bitops.h b/arch/x86/boot/bitops.h index 878e4b9940d9..0d41d68131cc 100644 --- a/arch/x86/boot/bitops.h +++ b/arch/x86/boot/bitops.h @@ -16,14 +16,16 @@ #define BOOT_BITOPS_H #define _LINUX_BITOPS_H /* Inhibit inclusion of <linux/bitops.h> */ -static inline int constant_test_bit(int nr, const void *addr) +#include <linux/types.h> + +static inline bool constant_test_bit(int nr, const void *addr) { const u32 *p = (const u32 *)addr; return ((1UL << (nr & 31)) & (p[nr >> 5])) != 0; } -static inline int variable_test_bit(int nr, const void *addr) +static inline bool variable_test_bit(int nr, const void *addr) { - u8 v; + bool v; const u32 *p = (const u32 *)addr; asm("btl %2,%1; setc %0" : "=qm" (v) : "m" (*p), "Ir" (nr)); diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index 9011a88353de..e5612f3e3b57 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -24,6 +24,7 @@ #include <linux/types.h> #include <linux/edd.h> #include <asm/setup.h> +#include <asm/asm.h> #include "bitops.h" #include "ctype.h" #include "cpuflags.h" @@ -176,18 +177,18 @@ static inline void wrgs32(u32 v, 
addr_t addr) } /* Note: these only return true/false, not a signed return value! */ -static inline int memcmp_fs(const void *s1, addr_t s2, size_t len) +static inline bool memcmp_fs(const void *s1, addr_t s2, size_t len) { - u8 diff; - asm volatile("fs; repe; cmpsb; setnz %0" - : "=qm" (diff), "+D" (s1), "+S" (s2), "+c" (len)); + bool diff; + asm volatile("fs; repe; cmpsb" CC_SET(nz) + : CC_OUT(nz) (diff), "+D" (s1), "+S" (s2), "+c" (len)); return diff; } -static inline int memcmp_gs(const void *s1, addr_t s2, size_t len) +static inline bool memcmp_gs(const void *s1, addr_t s2, size_t len) { - u8 diff; - asm volatile("gs; repe; cmpsb; setnz %0" - : "=qm" (diff), "+D" (s1), "+S" (s2), "+c" (len)); + bool diff; + asm volatile("gs; repe; cmpsb" CC_SET(nz) + : CC_OUT(nz) (diff), "+D" (s1), "+S" (s2), "+c" (len)); return diff; } @@ -294,6 +295,7 @@ static inline int cmdline_find_option_bool(const char *option) /* cpu.c, cpucheck.c */ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr); +int check_knl_erratum(void); int validate_cpu(void); /* early_serial_console.c */ diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index cfdd8c3f8af2..536ccfcc01c6 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -85,9 +85,26 @@ vmlinux-objs-$(CONFIG_EFI_STUB) += $(obj)/eboot.o $(obj)/efi_stub_$(BITS).o \ $(objtree)/drivers/firmware/efi/libstub/lib.a vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_thunk_$(BITS).o +# The compressed kernel is built with -fPIC/-fPIE so that a boot loader +# can place it anywhere in memory and it will still run. However, since +# it is executed as-is without any ELF relocation processing performed +# (and has already had all relocation sections stripped from the binary), +# none of the code can use data relocations (e.g. static assignments of +# pointer values), since they will be meaningless at runtime. 
This check +# will refuse to link the vmlinux if any of these relocations are found. +quiet_cmd_check_data_rel = DATAREL $@ +define cmd_check_data_rel + for obj in $(filter %.o,$^); do \ + readelf -S $$obj | grep -qF .rel.local && { \ + echo "error: $$obj has data relocations!" >&2; \ + exit 1; \ + } || true; \ + done +endef + $(obj)/vmlinux: $(vmlinux-objs-y) FORCE + $(call if_changed,check_data_rel) $(call if_changed,ld) - @: OBJCOPYFLAGS_vmlinux.bin := -R .comment -S $(obj)/vmlinux.bin: vmlinux FORCE diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index 52fef606bc54..ff574dad95cc 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -757,7 +757,6 @@ struct boot_params *make_boot_params(struct efi_config *c) struct boot_params *boot_params; struct apm_bios_info *bi; struct setup_header *hdr; - struct efi_info *efi; efi_loaded_image_t *image; void *options, *handle; efi_guid_t proto = LOADED_IMAGE_PROTOCOL_GUID; @@ -800,7 +799,6 @@ struct boot_params *make_boot_params(struct efi_config *c) memset(boot_params, 0x0, 0x4000); hdr = &boot_params->hdr; - efi = &boot_params->efi_info; bi = &boot_params->apm_bios_info; /* Copy the second sector to boot_params */ diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index cfeb0259ed81..a66854d99ee1 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -12,10 +12,6 @@ #include "misc.h" #include "error.h" -#include <asm/msr.h> -#include <asm/archrandom.h> -#include <asm/e820.h> - #include <generated/compile.h> #include <linux/module.h> #include <linux/uts.h> @@ -26,26 +22,6 @@ static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@" LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION; -#define I8254_PORT_CONTROL 0x43 -#define I8254_PORT_COUNTER0 0x40 -#define I8254_CMD_READBACK 0xC0 -#define I8254_SELECT_COUNTER0 0x02 -#define I8254_STATUS_NOTREADY 0x40 -static inline u16 
i8254(void) -{ - u16 status, timer; - - do { - outb(I8254_PORT_CONTROL, - I8254_CMD_READBACK | I8254_SELECT_COUNTER0); - status = inb(I8254_PORT_COUNTER0); - timer = inb(I8254_PORT_COUNTER0); - timer |= inb(I8254_PORT_COUNTER0) << 8; - } while (status & I8254_STATUS_NOTREADY); - - return timer; -} - static unsigned long rotate_xor(unsigned long hash, const void *area, size_t size) { @@ -62,7 +38,7 @@ static unsigned long rotate_xor(unsigned long hash, const void *area, } /* Attempt to create a simple but unpredictable starting entropy. */ -static unsigned long get_random_boot(void) +static unsigned long get_boot_seed(void) { unsigned long hash = 0; @@ -72,50 +48,8 @@ static unsigned long get_random_boot(void) return hash; } -static unsigned long get_random_long(const char *purpose) -{ -#ifdef CONFIG_X86_64 - const unsigned long mix_const = 0x5d6008cbf3848dd3UL; -#else - const unsigned long mix_const = 0x3f39e593UL; -#endif - unsigned long raw, random = get_random_boot(); - bool use_i8254 = true; - - debug_putstr(purpose); - debug_putstr(" KASLR using"); - - if (has_cpuflag(X86_FEATURE_RDRAND)) { - debug_putstr(" RDRAND"); - if (rdrand_long(&raw)) { - random ^= raw; - use_i8254 = false; - } - } - - if (has_cpuflag(X86_FEATURE_TSC)) { - debug_putstr(" RDTSC"); - raw = rdtsc(); - - random ^= raw; - use_i8254 = false; - } - - if (use_i8254) { - debug_putstr(" i8254"); - random ^= i8254(); - } - - /* Circular multiply for better bit diffusion */ - asm("mul %3" - : "=a" (random), "=d" (raw) - : "a" (random), "rm" (mix_const)); - random += raw; - - debug_putstr("...\n"); - - return random; -} +#define KASLR_COMPRESSED_BOOT +#include "../../lib/kaslr.c" struct mem_vector { unsigned long start; @@ -132,17 +66,6 @@ enum mem_avoid_index { static struct mem_vector mem_avoid[MEM_AVOID_MAX]; -static bool mem_contains(struct mem_vector *region, struct mem_vector *item) -{ - /* Item at least partially before region. 
*/ - if (item->start < region->start) - return false; - /* Item at least partially after region. */ - if (item->start + item->size > region->start + region->size) - return false; - return true; -} - static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two) { /* Item one is entirely before item two. */ @@ -296,6 +219,7 @@ static bool mem_avoid_overlap(struct mem_vector *img, if (mem_overlaps(img, &mem_avoid[i]) && mem_avoid[i].start < earliest) { *overlap = mem_avoid[i]; + earliest = overlap->start; is_overlapping = true; } } @@ -310,6 +234,7 @@ static bool mem_avoid_overlap(struct mem_vector *img, if (mem_overlaps(img, &avoid) && (avoid.start < earliest)) { *overlap = avoid; + earliest = overlap->start; is_overlapping = true; } @@ -319,8 +244,6 @@ static bool mem_avoid_overlap(struct mem_vector *img, return is_overlapping; } -static unsigned long slots[KERNEL_IMAGE_SIZE / CONFIG_PHYSICAL_ALIGN]; - struct slot_area { unsigned long addr; int num; @@ -351,36 +274,44 @@ static void store_slot_info(struct mem_vector *region, unsigned long image_size) } } -static void slots_append(unsigned long addr) -{ - /* Overflowing the slots list should be impossible. */ - if (slot_max >= KERNEL_IMAGE_SIZE / CONFIG_PHYSICAL_ALIGN) - return; - - slots[slot_max++] = addr; -} - static unsigned long slots_fetch_random(void) { + unsigned long slot; + int i; + /* Handle case of no slots stored. 
*/ if (slot_max == 0) return 0; - return slots[get_random_long("Physical") % slot_max]; + slot = kaslr_get_random_long("Physical") % slot_max; + + for (i = 0; i < slot_area_index; i++) { + if (slot >= slot_areas[i].num) { + slot -= slot_areas[i].num; + continue; + } + return slot_areas[i].addr + slot * CONFIG_PHYSICAL_ALIGN; + } + + if (i == slot_area_index) + debug_putstr("slots_fetch_random() failed!?\n"); + return 0; } static void process_e820_entry(struct e820entry *entry, unsigned long minimum, unsigned long image_size) { - struct mem_vector region, img, overlap; + struct mem_vector region, overlap; + struct slot_area slot_area; + unsigned long start_orig; /* Skip non-RAM entries. */ if (entry->type != E820_RAM) return; - /* Ignore entries entirely above our maximum. */ - if (entry->addr >= KERNEL_IMAGE_SIZE) + /* On 32-bit, ignore entries entirely above our maximum. */ + if (IS_ENABLED(CONFIG_X86_32) && entry->addr >= KERNEL_IMAGE_SIZE) return; /* Ignore entries entirely below our minimum. */ @@ -390,31 +321,55 @@ static void process_e820_entry(struct e820entry *entry, region.start = entry->addr; region.size = entry->size; - /* Potentially raise address to minimum location. */ - if (region.start < minimum) - region.start = minimum; + /* Give up if slot area array is full. */ + while (slot_area_index < MAX_SLOT_AREA) { + start_orig = region.start; - /* Potentially raise address to meet alignment requirements. */ - region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN); + /* Potentially raise address to minimum location. */ + if (region.start < minimum) + region.start = minimum; - /* Did we raise the address above the bounds of this e820 region? */ - if (region.start > entry->addr + entry->size) - return; + /* Potentially raise address to meet alignment needs. */ + region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN); - /* Reduce size by any delta from the original address. 
*/ - region.size -= region.start - entry->addr; + /* Did we raise the address above this e820 region? */ + if (region.start > entry->addr + entry->size) + return; - /* Reduce maximum size to fit end of image within maximum limit. */ - if (region.start + region.size > KERNEL_IMAGE_SIZE) - region.size = KERNEL_IMAGE_SIZE - region.start; + /* Reduce size by any delta from the original address. */ + region.size -= region.start - start_orig; - /* Walk each aligned slot and check for avoided areas. */ - for (img.start = region.start, img.size = image_size ; - mem_contains(&region, &img) ; - img.start += CONFIG_PHYSICAL_ALIGN) { - if (mem_avoid_overlap(&img, &overlap)) - continue; - slots_append(img.start); + /* On 32-bit, reduce region size to fit within max size. */ + if (IS_ENABLED(CONFIG_X86_32) && + region.start + region.size > KERNEL_IMAGE_SIZE) + region.size = KERNEL_IMAGE_SIZE - region.start; + + /* Return if region can't contain decompressed kernel */ + if (region.size < image_size) + return; + + /* If nothing overlaps, store the region and return. */ + if (!mem_avoid_overlap(&region, &overlap)) { + store_slot_info(&region, image_size); + return; + } + + /* Store beginning of region if holds at least image_size. */ + if (overlap.start > region.start + image_size) { + struct mem_vector beginning; + + beginning.start = region.start; + beginning.size = overlap.start - region.start; + store_slot_info(&beginning, image_size); + } + + /* Return if overlap extends to or past end of region. */ + if (overlap.start + overlap.size >= region.start + region.size) + return; + + /* Clip off the overlapping region and start over. 
*/ + region.size -= overlap.start - region.start + overlap.size; + region.start = overlap.start + overlap.size; } } @@ -431,6 +386,10 @@ static unsigned long find_random_phys_addr(unsigned long minimum, for (i = 0; i < boot_params->e820_entries; i++) { process_e820_entry(&boot_params->e820_map[i], minimum, image_size); + if (slot_area_index == MAX_SLOT_AREA) { + debug_putstr("Aborted e820 scan (slot_areas full)!\n"); + break; + } } return slots_fetch_random(); @@ -454,7 +413,7 @@ static unsigned long find_random_virt_addr(unsigned long minimum, slots = (KERNEL_IMAGE_SIZE - minimum - image_size) / CONFIG_PHYSICAL_ALIGN + 1; - random_addr = get_random_long("Virtual") % slots; + random_addr = kaslr_get_random_long("Virtual") % slots; return random_addr * CONFIG_PHYSICAL_ALIGN + minimum; } @@ -463,48 +422,54 @@ static unsigned long find_random_virt_addr(unsigned long minimum, * Since this function examines addresses much more numerically, * it takes the input and output pointers as 'unsigned long'. */ -unsigned char *choose_random_location(unsigned long input, - unsigned long input_size, - unsigned long output, - unsigned long output_size) +void choose_random_location(unsigned long input, + unsigned long input_size, + unsigned long *output, + unsigned long output_size, + unsigned long *virt_addr) { - unsigned long choice = output; - unsigned long random_addr; + unsigned long random_addr, min_addr; + + /* By default, keep output position unchanged. */ + *virt_addr = *output; -#ifdef CONFIG_HIBERNATION - if (!cmdline_find_option_bool("kaslr")) { - warn("KASLR disabled: 'kaslr' not on cmdline (hibernation selected)."); - goto out; - } -#else if (cmdline_find_option_bool("nokaslr")) { warn("KASLR disabled: 'nokaslr' on cmdline."); - goto out; + return; } -#endif boot_params->hdr.loadflags |= KASLR_FLAG; + /* Prepare to add new identity pagetables on demand. */ + initialize_identity_maps(); + /* Record the various known unsafe memory ranges. 
*/ - mem_avoid_init(input, input_size, output); + mem_avoid_init(input, input_size, *output); + + /* + * Low end of the randomization range should be the + * smaller of 512M or the initial kernel image + * location: + */ + min_addr = min(*output, 512UL << 20); /* Walk e820 and find a random address. */ - random_addr = find_random_phys_addr(output, output_size); + random_addr = find_random_phys_addr(min_addr, output_size); if (!random_addr) { warn("KASLR disabled: could not find suitable E820 region!"); - goto out; + } else { + /* Update the new physical address location. */ + if (*output != random_addr) { + add_identity_map(random_addr, output_size); + *output = random_addr; + } } - /* Always enforce the minimum. */ - if (random_addr < choice) - goto out; - - choice = random_addr; - - add_identity_map(choice, output_size); - /* This actually loads the identity pagetable on x86_64. */ finalize_identity_maps(); -out: - return (unsigned char *)choice; + + /* Pick random virtual address starting from LOAD_PHYSICAL_ADDR. */ + if (IS_ENABLED(CONFIG_X86_64)) + random_addr = find_random_virt_addr(LOAD_PHYSICAL_ADDR, output_size); + *virt_addr = random_addr; } diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index f14db4e21654..b3c5a5f030ce 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -170,7 +170,8 @@ void __puthex(unsigned long value) } #if CONFIG_X86_NEED_RELOCS -static void handle_relocations(void *output, unsigned long output_len) +static void handle_relocations(void *output, unsigned long output_len, + unsigned long virt_addr) { int *reloc; unsigned long delta, map, ptr; @@ -182,11 +183,6 @@ static void handle_relocations(void *output, unsigned long output_len) * and where it was actually loaded. */ delta = min_addr - LOAD_PHYSICAL_ADDR; - if (!delta) { - debug_putstr("No relocation needed... "); - return; - } - debug_putstr("Performing relocations... 
"); /* * The kernel contains a table of relocation addresses. Those @@ -198,6 +194,20 @@ static void handle_relocations(void *output, unsigned long output_len) map = delta - __START_KERNEL_map; /* + * 32-bit always performs relocations. 64-bit relocations are only + * needed if KASLR has chosen a different starting address offset + * from __START_KERNEL_map. + */ + if (IS_ENABLED(CONFIG_X86_64)) + delta = virt_addr - LOAD_PHYSICAL_ADDR; + + if (!delta) { + debug_putstr("No relocation needed... "); + return; + } + debug_putstr("Performing relocations... "); + + /* * Process relocations: 32 bit relocations first then 64 bit after. * Three sets of binary relocations are added to the end of the kernel * before compression. Each relocation table entry is the kernel @@ -250,7 +260,8 @@ static void handle_relocations(void *output, unsigned long output_len) #endif } #else -static inline void handle_relocations(void *output, unsigned long output_len) +static inline void handle_relocations(void *output, unsigned long output_len, + unsigned long virt_addr) { } #endif @@ -327,7 +338,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, unsigned long output_len) { const unsigned long kernel_total_size = VO__end - VO__text; - unsigned char *output_orig = output; + unsigned long virt_addr = (unsigned long)output; /* Retain x86 boot parameters pointer passed from startup_32/64. */ boot_params = rmode; @@ -366,13 +377,16 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, * the entire decompressed kernel plus relocation table, or the * entire decompressed kernel plus .bss and .brk sections. */ - output = choose_random_location((unsigned long)input_data, input_len, - (unsigned long)output, - max(output_len, kernel_total_size)); + choose_random_location((unsigned long)input_data, input_len, + (unsigned long *)&output, + max(output_len, kernel_total_size), + &virt_addr); /* Validate memory location choices. 
*/ if ((unsigned long)output & (MIN_KERNEL_ALIGN - 1)) - error("Destination address inappropriately aligned"); + error("Destination physical address inappropriately aligned"); + if (virt_addr & (MIN_KERNEL_ALIGN - 1)) + error("Destination virtual address inappropriately aligned"); #ifdef CONFIG_X86_64 if (heap > 0x3fffffffffffUL) error("Destination address too large"); @@ -382,19 +396,16 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, #endif #ifndef CONFIG_RELOCATABLE if ((unsigned long)output != LOAD_PHYSICAL_ADDR) - error("Wrong destination address"); + error("Destination address does not match LOAD_PHYSICAL_ADDR"); + if ((unsigned long)output != virt_addr) + error("Destination virtual address changed when not relocatable"); #endif debug_putstr("\nDecompressing Linux... "); __decompress(input_data, input_len, NULL, NULL, output, output_len, NULL, error); parse_elf(output); - /* - * 32-bit always performs relocations. 64-bit relocations are only - * needed if kASLR has chosen a different load address. 
- */ - if (!IS_ENABLED(CONFIG_X86_64) || output != output_orig) - handle_relocations(output, output_len); + handle_relocations(output, output_len, virt_addr); debug_putstr("done.\nBooting the kernel.\n"); return output; } diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index b6fec1ff10e4..1c8355eadbd1 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -67,28 +67,33 @@ int cmdline_find_option_bool(const char *option); #if CONFIG_RANDOMIZE_BASE /* kaslr.c */ -unsigned char *choose_random_location(unsigned long input_ptr, - unsigned long input_size, - unsigned long output_ptr, - unsigned long output_size); +void choose_random_location(unsigned long input, + unsigned long input_size, + unsigned long *output, + unsigned long output_size, + unsigned long *virt_addr); /* cpuflags.c */ bool has_cpuflag(int flag); #else -static inline -unsigned char *choose_random_location(unsigned long input_ptr, - unsigned long input_size, - unsigned long output_ptr, - unsigned long output_size) +static inline void choose_random_location(unsigned long input, + unsigned long input_size, + unsigned long *output, + unsigned long output_size, + unsigned long *virt_addr) { - return (unsigned char *)output_ptr; + /* No change from existing output location. 
*/ + *virt_addr = *output; } #endif #ifdef CONFIG_X86_64 +void initialize_identity_maps(void); void add_identity_map(unsigned long start, unsigned long size); void finalize_identity_maps(void); extern unsigned char _pgtable[]; #else +static inline void initialize_identity_maps(void) +{ } static inline void add_identity_map(unsigned long start, unsigned long size) { } static inline void finalize_identity_maps(void) diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c index 34b95df14e69..56589d0a804b 100644 --- a/arch/x86/boot/compressed/pagetable.c +++ b/arch/x86/boot/compressed/pagetable.c @@ -2,6 +2,9 @@ * This code is used on x86_64 to create page table identity mappings on * demand by building up a new set of page tables (or appending to the * existing ones), and then switching over to them when ready. + * + * Copyright (C) 2015-2016 Yinghai Lu + * Copyright (C) 2016 Kees Cook */ /* @@ -17,6 +20,9 @@ /* These actually do the work of building the kernel identity maps. */ #include <asm/init.h> #include <asm/pgtable.h> +/* Use the static base for this part of the boot process */ +#undef __PAGE_OFFSET +#define __PAGE_OFFSET __PAGE_OFFSET_BASE #include "../../mm/ident_map.c" /* Used by pgtable.h asm code to force instruction serialization. */ @@ -59,9 +65,21 @@ static struct alloc_pgt_data pgt_data; /* The top level page table entry pointer. */ static unsigned long level4p; +/* + * Mapping information structure passed to kernel_ident_mapping_init(). + * Due to relocation, pointers must be assigned at run time not build time. + */ +static struct x86_mapping_info mapping_info = { + .pmd_flag = __PAGE_KERNEL_LARGE_EXEC, +}; + /* Locates and clears a region for a new top level page table. */ -static void prepare_level4(void) +void initialize_identity_maps(void) { + /* Init mapping_info with run-time function/buffer pointers. 
*/ + mapping_info.alloc_pgt_page = alloc_pgt_page; + mapping_info.context = &pgt_data; + /* * It should be impossible for this not to already be true, * but since calling this a second time would rewind the other @@ -96,17 +114,8 @@ static void prepare_level4(void) */ void add_identity_map(unsigned long start, unsigned long size) { - struct x86_mapping_info mapping_info = { - .alloc_pgt_page = alloc_pgt_page, - .context = &pgt_data, - .pmd_flag = __PAGE_KERNEL_LARGE_EXEC, - }; unsigned long end = start + size; - /* Make sure we have a top level page table ready to use. */ - if (!level4p) - prepare_level4(); - /* Align boundary to 2M. */ start = round_down(start, PMD_SIZE); end = round_up(end, PMD_SIZE); diff --git a/arch/x86/boot/cpu.c b/arch/x86/boot/cpu.c index 29207f69ae8c..26240dde081e 100644 --- a/arch/x86/boot/cpu.c +++ b/arch/x86/boot/cpu.c @@ -93,6 +93,8 @@ int validate_cpu(void) show_cap_strs(err_flags); putchar('\n'); return -1; + } else if (check_knl_erratum()) { + return -1; } else { return 0; } diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c index 1fd7d575092e..4ad7d70e8739 100644 --- a/arch/x86/boot/cpucheck.c +++ b/arch/x86/boot/cpucheck.c @@ -24,6 +24,7 @@ # include "boot.h" #endif #include <linux/types.h> +#include <asm/intel-family.h> #include <asm/processor-flags.h> #include <asm/required-features.h> #include <asm/msr-index.h> @@ -175,6 +176,8 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr) puts("WARNING: PAE disabled. Use parameter 'forcepae' to enable at your own risk!\n"); } } + if (!err) + err = check_knl_erratum(); if (err_flags_ptr) *err_flags_ptr = err ? err_flags : NULL; @@ -185,3 +188,33 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr) return (cpu.level < req_level || err) ? 
-1 : 0; } + +int check_knl_erratum(void) +{ + /* + * First check for the affected model/family: + */ + if (!is_intel() || + cpu.family != 6 || + cpu.model != INTEL_FAM6_XEON_PHI_KNL) + return 0; + + /* + * This erratum affects the Accessed/Dirty bits, and can + * cause stray bits to be set in !Present PTEs. We have + * enough bits in our 64-bit PTEs (which we have on real + * 64-bit mode or PAE) to avoid using these troublesome + * bits. But, we do not have enough space in our 32-bit + * PTEs. So, refuse to run on 32-bit non-PAE kernels. + */ + if (IS_ENABLED(CONFIG_X86_64) || IS_ENABLED(CONFIG_X86_PAE)) + return 0; + + puts("This 32-bit kernel can not run on this Xeon Phi x200\n" + "processor due to a processor erratum. Use a 64-bit\n" + "kernel, or enable PAE in this 32-bit kernel.\n\n"); + + return -1; +} + + diff --git a/arch/x86/boot/cpuflags.c b/arch/x86/boot/cpuflags.c index 431fa5f84537..6687ab953257 100644 --- a/arch/x86/boot/cpuflags.c +++ b/arch/x86/boot/cpuflags.c @@ -102,6 +102,7 @@ void get_cpuflags(void) cpuid(0x1, &tfms, &ignored, &cpu.flags[4], &cpu.flags[0]); cpu.level = (tfms >> 8) & 15; + cpu.family = cpu.level; cpu.model = (tfms >> 4) & 15; if (cpu.level >= 6) cpu.model += ((tfms >> 16) & 0xf) << 4; diff --git a/arch/x86/boot/cpuflags.h b/arch/x86/boot/cpuflags.h index 4cb404fd45ce..15ad56a3f905 100644 --- a/arch/x86/boot/cpuflags.h +++ b/arch/x86/boot/cpuflags.h @@ -6,6 +6,7 @@ struct cpu_features { int level; /* Family, or 64 for x86-64 */ + int family; /* Family, always */ int model; u32 flags[NCAPINTS]; }; diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c index 318b8465d302..cc3bd583dce1 100644 --- a/arch/x86/boot/string.c +++ b/arch/x86/boot/string.c @@ -17,7 +17,7 @@ int memcmp(const void *s1, const void *s2, size_t len) { - u8 diff; + bool diff; asm("repe; cmpsb; setnz %0" : "=qm" (diff), "+D" (s1), "+S" (s2), "+c" (len)); return diff; diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 
265901a84f3f..5fa6ee2c2dde 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -17,7 +17,6 @@ CONFIG_CGROUPS=y CONFIG_CGROUP_FREEZER=y CONFIG_CPUSETS=y CONFIG_CGROUP_CPUACCT=y -CONFIG_RESOURCE_COUNTERS=y CONFIG_CGROUP_SCHED=y CONFIG_BLK_DEV_INITRD=y # CONFIG_COMPAT_BRK is not set diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 0c8d7963483c..d28bdabcc87e 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -16,7 +16,6 @@ CONFIG_CGROUPS=y CONFIG_CGROUP_FREEZER=y CONFIG_CPUSETS=y CONFIG_CGROUP_CPUACCT=y -CONFIG_RESOURCE_COUNTERS=y CONFIG_CGROUP_SCHED=y CONFIG_BLK_DEV_INITRD=y # CONFIG_COMPAT_BRK is not set diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index b9b912a44d61..34b3fa2889d1 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -49,7 +49,9 @@ endif ifeq ($(avx2_supported),yes) obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64) += camellia-aesni-avx2.o obj-$(CONFIG_CRYPTO_SERPENT_AVX2_X86_64) += serpent-avx2.o - obj-$(CONFIG_CRYPTO_SHA1_MB) += sha-mb/ + obj-$(CONFIG_CRYPTO_SHA1_MB) += sha1-mb/ + obj-$(CONFIG_CRYPTO_SHA256_MB) += sha256-mb/ + obj-$(CONFIG_CRYPTO_SHA512_MB) += sha512-mb/ endif aes-i586-y := aes-i586-asm_32.o aes_glue.o diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 5b7fa1471007..0ab5ee1c26af 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -59,17 +59,6 @@ struct aesni_rfc4106_gcm_ctx { u8 nonce[4]; }; -struct aesni_gcm_set_hash_subkey_result { - int err; - struct completion completion; -}; - -struct aesni_hash_subkey_req_data { - u8 iv[16]; - struct aesni_gcm_set_hash_subkey_result result; - struct scatterlist sg; -}; - struct aesni_lrw_ctx { struct lrw_table_ctx lrw_table; u8 raw_aes_ctx[sizeof(struct crypto_aes_ctx) + AESNI_ALIGN - 1]; @@ -809,71 +798,28 @@ static void rfc4106_exit(struct crypto_aead *aead) 
cryptd_free_aead(*ctx); } -static void -rfc4106_set_hash_subkey_done(struct crypto_async_request *req, int err) -{ - struct aesni_gcm_set_hash_subkey_result *result = req->data; - - if (err == -EINPROGRESS) - return; - result->err = err; - complete(&result->completion); -} - static int rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len) { - struct crypto_ablkcipher *ctr_tfm; - struct ablkcipher_request *req; - int ret = -EINVAL; - struct aesni_hash_subkey_req_data *req_data; + struct crypto_cipher *tfm; + int ret; - ctr_tfm = crypto_alloc_ablkcipher("ctr(aes)", 0, 0); - if (IS_ERR(ctr_tfm)) - return PTR_ERR(ctr_tfm); + tfm = crypto_alloc_cipher("aes", 0, 0); + if (IS_ERR(tfm)) + return PTR_ERR(tfm); - ret = crypto_ablkcipher_setkey(ctr_tfm, key, key_len); + ret = crypto_cipher_setkey(tfm, key, key_len); if (ret) - goto out_free_ablkcipher; - - ret = -ENOMEM; - req = ablkcipher_request_alloc(ctr_tfm, GFP_KERNEL); - if (!req) - goto out_free_ablkcipher; - - req_data = kmalloc(sizeof(*req_data), GFP_KERNEL); - if (!req_data) - goto out_free_request; - - memset(req_data->iv, 0, sizeof(req_data->iv)); + goto out_free_cipher; /* Clear the data in the hash sub key container to zero.*/ /* We want to cipher all zeros to create the hash sub key. 
*/ memset(hash_subkey, 0, RFC4106_HASH_SUBKEY_SIZE); - init_completion(&req_data->result.completion); - sg_init_one(&req_data->sg, hash_subkey, RFC4106_HASH_SUBKEY_SIZE); - ablkcipher_request_set_tfm(req, ctr_tfm); - ablkcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP | - CRYPTO_TFM_REQ_MAY_BACKLOG, - rfc4106_set_hash_subkey_done, - &req_data->result); - - ablkcipher_request_set_crypt(req, &req_data->sg, - &req_data->sg, RFC4106_HASH_SUBKEY_SIZE, req_data->iv); - - ret = crypto_ablkcipher_encrypt(req); - if (ret == -EINPROGRESS || ret == -EBUSY) { - ret = wait_for_completion_interruptible - (&req_data->result.completion); - if (!ret) - ret = req_data->result.err; - } - kfree(req_data); -out_free_request: - ablkcipher_request_free(req); -out_free_ablkcipher: - crypto_free_ablkcipher(ctr_tfm); + crypto_cipher_encrypt_one(tfm, hash_subkey, hash_subkey); + +out_free_cipher: + crypto_free_cipher(tfm); return ret; } @@ -1098,9 +1044,12 @@ static int rfc4106_encrypt(struct aead_request *req) struct cryptd_aead **ctx = crypto_aead_ctx(tfm); struct cryptd_aead *cryptd_tfm = *ctx; - aead_request_set_tfm(req, irq_fpu_usable() ? - cryptd_aead_child(cryptd_tfm) : - &cryptd_tfm->base); + tfm = &cryptd_tfm->base; + if (irq_fpu_usable() && (!in_atomic() || + !cryptd_aead_queued(cryptd_tfm))) + tfm = cryptd_aead_child(cryptd_tfm); + + aead_request_set_tfm(req, tfm); return crypto_aead_encrypt(req); } @@ -1111,9 +1060,12 @@ static int rfc4106_decrypt(struct aead_request *req) struct cryptd_aead **ctx = crypto_aead_ctx(tfm); struct cryptd_aead *cryptd_tfm = *ctx; - aead_request_set_tfm(req, irq_fpu_usable() ? 
- cryptd_aead_child(cryptd_tfm) : - &cryptd_tfm->base); + tfm = &cryptd_tfm->base; + if (irq_fpu_usable() && (!in_atomic() || + !cryptd_aead_queued(cryptd_tfm))) + tfm = cryptd_aead_child(cryptd_tfm); + + aead_request_set_tfm(req, tfm); return crypto_aead_decrypt(req); } diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c index 2d5c2e0bd939..f910d1d449f0 100644 --- a/arch/x86/crypto/chacha20_glue.c +++ b/arch/x86/crypto/chacha20_glue.c @@ -70,7 +70,7 @@ static int chacha20_simd(struct blkcipher_desc *desc, struct scatterlist *dst, struct blkcipher_walk walk; int err; - if (!may_use_simd()) + if (nbytes <= CHACHA20_BLOCK_SIZE || !may_use_simd()) return crypto_chacha20_crypt(desc, dst, src, nbytes); state = (u32 *)roundup((uintptr_t)state_buf, CHACHA20_STATE_ALIGN); diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c index a69321a77783..0420bab19efb 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_glue.c +++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -168,30 +168,23 @@ static int ghash_async_init(struct ahash_request *req) struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); struct ahash_request *cryptd_req = ahash_request_ctx(req); struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; + struct shash_desc *desc = cryptd_shash_desc(cryptd_req); + struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm); - if (!irq_fpu_usable()) { - memcpy(cryptd_req, req, sizeof(*req)); - ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); - return crypto_ahash_init(cryptd_req); - } else { - struct shash_desc *desc = cryptd_shash_desc(cryptd_req); - struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm); - - desc->tfm = child; - desc->flags = req->base.flags; - return crypto_shash_init(desc); - } + desc->tfm = child; + desc->flags = req->base.flags; + return crypto_shash_init(desc); } static int ghash_async_update(struct ahash_request *req) { struct ahash_request *cryptd_req = 
ahash_request_ctx(req); + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); + struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; - if (!irq_fpu_usable()) { - struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); - struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); - struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; - + if (!irq_fpu_usable() || + (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) { memcpy(cryptd_req, req, sizeof(*req)); ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); return crypto_ahash_update(cryptd_req); @@ -204,12 +197,12 @@ static int ghash_async_update(struct ahash_request *req) static int ghash_async_final(struct ahash_request *req) { struct ahash_request *cryptd_req = ahash_request_ctx(req); + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); + struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; - if (!irq_fpu_usable()) { - struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); - struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); - struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; - + if (!irq_fpu_usable() || + (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) { memcpy(cryptd_req, req, sizeof(*req)); ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); return crypto_ahash_final(cryptd_req); @@ -249,7 +242,8 @@ static int ghash_async_digest(struct ahash_request *req) struct ahash_request *cryptd_req = ahash_request_ctx(req); struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; - if (!irq_fpu_usable()) { + if (!irq_fpu_usable() || + (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) { memcpy(cryptd_req, req, sizeof(*req)); ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); return crypto_ahash_digest(cryptd_req); diff --git a/arch/x86/crypto/sha-mb/Makefile b/arch/x86/crypto/sha1-mb/Makefile index 2f8756375df5..2f8756375df5 100644 --- a/arch/x86/crypto/sha-mb/Makefile +++ b/arch/x86/crypto/sha1-mb/Makefile diff --git 
a/arch/x86/crypto/sha-mb/sha1_mb.c b/arch/x86/crypto/sha1-mb/sha1_mb.c index 9c5af331a956..9e5b67127a09 100644 --- a/arch/x86/crypto/sha-mb/sha1_mb.c +++ b/arch/x86/crypto/sha1-mb/sha1_mb.c @@ -67,7 +67,7 @@ #include <asm/byteorder.h> #include <linux/hardirq.h> #include <asm/fpu/api.h> -#include "sha_mb_ctx.h" +#include "sha1_mb_ctx.h" #define FLUSH_INTERVAL 1000 /* in usec */ @@ -77,30 +77,34 @@ struct sha1_mb_ctx { struct mcryptd_ahash *mcryptd_tfm; }; -static inline struct mcryptd_hash_request_ctx *cast_hash_to_mcryptd_ctx(struct sha1_hash_ctx *hash_ctx) +static inline struct mcryptd_hash_request_ctx + *cast_hash_to_mcryptd_ctx(struct sha1_hash_ctx *hash_ctx) { - struct shash_desc *desc; + struct ahash_request *areq; - desc = container_of((void *) hash_ctx, struct shash_desc, __ctx); - return container_of(desc, struct mcryptd_hash_request_ctx, desc); + areq = container_of((void *) hash_ctx, struct ahash_request, __ctx); + return container_of(areq, struct mcryptd_hash_request_ctx, areq); } -static inline struct ahash_request *cast_mcryptd_ctx_to_req(struct mcryptd_hash_request_ctx *ctx) +static inline struct ahash_request + *cast_mcryptd_ctx_to_req(struct mcryptd_hash_request_ctx *ctx) { return container_of((void *) ctx, struct ahash_request, __ctx); } static void req_ctx_init(struct mcryptd_hash_request_ctx *rctx, - struct shash_desc *desc) + struct ahash_request *areq) { rctx->flag = HASH_UPDATE; } static asmlinkage void (*sha1_job_mgr_init)(struct sha1_mb_mgr *state); -static asmlinkage struct job_sha1* (*sha1_job_mgr_submit)(struct sha1_mb_mgr *state, - struct job_sha1 *job); -static asmlinkage struct job_sha1* (*sha1_job_mgr_flush)(struct sha1_mb_mgr *state); -static asmlinkage struct job_sha1* (*sha1_job_mgr_get_comp_job)(struct sha1_mb_mgr *state); +static asmlinkage struct job_sha1* (*sha1_job_mgr_submit) + (struct sha1_mb_mgr *state, struct job_sha1 *job); +static asmlinkage struct job_sha1* (*sha1_job_mgr_flush) + (struct sha1_mb_mgr *state); +static 
asmlinkage struct job_sha1* (*sha1_job_mgr_get_comp_job) + (struct sha1_mb_mgr *state); static inline void sha1_init_digest(uint32_t *digest) { @@ -131,7 +135,8 @@ static inline uint32_t sha1_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], return i >> SHA1_LOG2_BLOCK_SIZE; } -static struct sha1_hash_ctx *sha1_ctx_mgr_resubmit(struct sha1_ctx_mgr *mgr, struct sha1_hash_ctx *ctx) +static struct sha1_hash_ctx *sha1_ctx_mgr_resubmit(struct sha1_ctx_mgr *mgr, + struct sha1_hash_ctx *ctx) { while (ctx) { if (ctx->status & HASH_CTX_STS_COMPLETE) { @@ -177,8 +182,8 @@ static struct sha1_hash_ctx *sha1_ctx_mgr_resubmit(struct sha1_ctx_mgr *mgr, str ctx->job.buffer = (uint8_t *) buffer; ctx->job.len = len; - ctx = (struct sha1_hash_ctx *) sha1_job_mgr_submit(&mgr->mgr, - &ctx->job); + ctx = (struct sha1_hash_ctx *)sha1_job_mgr_submit(&mgr->mgr, + &ctx->job); continue; } } @@ -191,13 +196,15 @@ static struct sha1_hash_ctx *sha1_ctx_mgr_resubmit(struct sha1_ctx_mgr *mgr, str if (ctx->status & HASH_CTX_STS_LAST) { uint8_t *buf = ctx->partial_block_buffer; - uint32_t n_extra_blocks = sha1_pad(buf, ctx->total_length); + uint32_t n_extra_blocks = + sha1_pad(buf, ctx->total_length); ctx->status = (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE); ctx->job.buffer = buf; ctx->job.len = (uint32_t) n_extra_blocks; - ctx = (struct sha1_hash_ctx *) sha1_job_mgr_submit(&mgr->mgr, &ctx->job); + ctx = (struct sha1_hash_ctx *) + sha1_job_mgr_submit(&mgr->mgr, &ctx->job); continue; } @@ -208,14 +215,17 @@ static struct sha1_hash_ctx *sha1_ctx_mgr_resubmit(struct sha1_ctx_mgr *mgr, str return NULL; } -static struct sha1_hash_ctx *sha1_ctx_mgr_get_comp_ctx(struct sha1_ctx_mgr *mgr) +static struct sha1_hash_ctx + *sha1_ctx_mgr_get_comp_ctx(struct sha1_ctx_mgr *mgr) { /* * If get_comp_job returns NULL, there are no jobs complete. - * If get_comp_job returns a job, verify that it is safe to return to the user. + * If get_comp_job returns a job, verify that it is safe to return to + * the user. 
* If it is not ready, resubmit the job to finish processing. * If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned. - * Otherwise, all jobs currently being managed by the hash_ctx_mgr still need processing. + * Otherwise, all jobs currently being managed by the hash_ctx_mgr + * still need processing. */ struct sha1_hash_ctx *ctx; @@ -235,7 +245,10 @@ static struct sha1_hash_ctx *sha1_ctx_mgr_submit(struct sha1_ctx_mgr *mgr, int flags) { if (flags & (~HASH_ENTIRE)) { - /* User should not pass anything other than FIRST, UPDATE, or LAST */ + /* + * User should not pass anything other than FIRST, UPDATE, or + * LAST + */ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; return ctx; } @@ -264,14 +277,20 @@ static struct sha1_hash_ctx *sha1_ctx_mgr_submit(struct sha1_ctx_mgr *mgr, ctx->partial_block_buffer_length = 0; } - /* If we made it here, there were no errors during this call to submit */ + /* + * If we made it here, there were no errors during this call to + * submit + */ ctx->error = HASH_CTX_ERROR_NONE; /* Store buffer ptr info from user */ ctx->incoming_buffer = buffer; ctx->incoming_buffer_length = len; - /* Store the user's request flags and mark this ctx as currently being processed. */ + /* + * Store the user's request flags and mark this ctx as currently + * being processed. + */ ctx->status = (flags & HASH_LAST) ? (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : HASH_CTX_STS_PROCESSING; @@ -285,9 +304,13 @@ static struct sha1_hash_ctx *sha1_ctx_mgr_submit(struct sha1_ctx_mgr *mgr, * Or if the user's buffer contains less than a whole block, * append as much as possible to the extra block. 
*/ - if ((ctx->partial_block_buffer_length) | (len < SHA1_BLOCK_SIZE)) { - /* Compute how many bytes to copy from user buffer into extra block */ - uint32_t copy_len = SHA1_BLOCK_SIZE - ctx->partial_block_buffer_length; + if (ctx->partial_block_buffer_length || len < SHA1_BLOCK_SIZE) { + /* + * Compute how many bytes to copy from user buffer into + * extra block + */ + uint32_t copy_len = SHA1_BLOCK_SIZE - + ctx->partial_block_buffer_length; if (len < copy_len) copy_len = len; @@ -297,20 +320,28 @@ static struct sha1_hash_ctx *sha1_ctx_mgr_submit(struct sha1_ctx_mgr *mgr, buffer, copy_len); ctx->partial_block_buffer_length += copy_len; - ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len); + ctx->incoming_buffer = (const void *) + ((const char *)buffer + copy_len); ctx->incoming_buffer_length = len - copy_len; } - /* The extra block should never contain more than 1 block here */ + /* + * The extra block should never contain more than 1 block + * here + */ assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE); - /* If the extra block buffer contains exactly 1 block, it can be hashed. */ + /* + * If the extra block buffer contains exactly 1 block, it can + * be hashed. + */ if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) { ctx->partial_block_buffer_length = 0; ctx->job.buffer = ctx->partial_block_buffer; ctx->job.len = 1; - ctx = (struct sha1_hash_ctx *) sha1_job_mgr_submit(&mgr->mgr, &ctx->job); + ctx = (struct sha1_hash_ctx *) + sha1_job_mgr_submit(&mgr->mgr, &ctx->job); } } @@ -329,23 +360,24 @@ static struct sha1_hash_ctx *sha1_ctx_mgr_flush(struct sha1_ctx_mgr *mgr) return NULL; /* - * If flush returned a job, resubmit the job to finish processing. + * If flush returned a job, resubmit the job to finish + * processing. */ ctx = sha1_ctx_mgr_resubmit(mgr, ctx); /* - * If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned. - * Otherwise, all jobs currently being managed by the sha1_ctx_mgr - * still need processing. 
Loop. + * If sha1_ctx_mgr_resubmit returned a job, it is ready to be + * returned. Otherwise, all jobs currently being managed by the + * sha1_ctx_mgr still need processing. Loop. */ if (ctx) return ctx; } } -static int sha1_mb_init(struct shash_desc *desc) +static int sha1_mb_init(struct ahash_request *areq) { - struct sha1_hash_ctx *sctx = shash_desc_ctx(desc); + struct sha1_hash_ctx *sctx = ahash_request_ctx(areq); hash_ctx_init(sctx); sctx->job.result_digest[0] = SHA1_H0; @@ -363,7 +395,7 @@ static int sha1_mb_init(struct shash_desc *desc) static int sha1_mb_set_results(struct mcryptd_hash_request_ctx *rctx) { int i; - struct sha1_hash_ctx *sctx = shash_desc_ctx(&rctx->desc); + struct sha1_hash_ctx *sctx = ahash_request_ctx(&rctx->areq); __be32 *dst = (__be32 *) rctx->out; for (i = 0; i < 5; ++i) @@ -394,9 +426,11 @@ static int sha_finish_walk(struct mcryptd_hash_request_ctx **ret_rctx, flag |= HASH_LAST; } - sha_ctx = (struct sha1_hash_ctx *) shash_desc_ctx(&rctx->desc); + sha_ctx = (struct sha1_hash_ctx *) + ahash_request_ctx(&rctx->areq); kernel_fpu_begin(); - sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data, nbytes, flag); + sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, + rctx->walk.data, nbytes, flag); if (!sha_ctx) { if (flush) sha_ctx = sha1_ctx_mgr_flush(cstate->mgr); @@ -485,11 +519,10 @@ static void sha1_mb_add_list(struct mcryptd_hash_request_ctx *rctx, mcryptd_arm_flusher(cstate, delay); } -static int sha1_mb_update(struct shash_desc *desc, const u8 *data, - unsigned int len) +static int sha1_mb_update(struct ahash_request *areq) { struct mcryptd_hash_request_ctx *rctx = - container_of(desc, struct mcryptd_hash_request_ctx, desc); + container_of(areq, struct mcryptd_hash_request_ctx, areq); struct mcryptd_alg_cstate *cstate = this_cpu_ptr(sha1_mb_alg_state.alg_cstate); @@ -505,7 +538,7 @@ static int sha1_mb_update(struct shash_desc *desc, const u8 *data, } /* need to init context */ - req_ctx_init(rctx, desc); + 
req_ctx_init(rctx, areq); nbytes = crypto_ahash_walk_first(req, &rctx->walk); @@ -518,10 +551,11 @@ static int sha1_mb_update(struct shash_desc *desc, const u8 *data, rctx->flag |= HASH_DONE; /* submit */ - sha_ctx = (struct sha1_hash_ctx *) shash_desc_ctx(desc); + sha_ctx = (struct sha1_hash_ctx *) ahash_request_ctx(areq); sha1_mb_add_list(rctx, cstate); kernel_fpu_begin(); - sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data, nbytes, HASH_UPDATE); + sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data, + nbytes, HASH_UPDATE); kernel_fpu_end(); /* check if anything is returned */ @@ -544,11 +578,10 @@ done: return ret; } -static int sha1_mb_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) +static int sha1_mb_finup(struct ahash_request *areq) { struct mcryptd_hash_request_ctx *rctx = - container_of(desc, struct mcryptd_hash_request_ctx, desc); + container_of(areq, struct mcryptd_hash_request_ctx, areq); struct mcryptd_alg_cstate *cstate = this_cpu_ptr(sha1_mb_alg_state.alg_cstate); @@ -563,7 +596,7 @@ static int sha1_mb_finup(struct shash_desc *desc, const u8 *data, } /* need to init context */ - req_ctx_init(rctx, desc); + req_ctx_init(rctx, areq); nbytes = crypto_ahash_walk_first(req, &rctx->walk); @@ -576,15 +609,15 @@ static int sha1_mb_finup(struct shash_desc *desc, const u8 *data, rctx->flag |= HASH_DONE; flag = HASH_LAST; } - rctx->out = out; /* submit */ rctx->flag |= HASH_FINAL; - sha_ctx = (struct sha1_hash_ctx *) shash_desc_ctx(desc); + sha_ctx = (struct sha1_hash_ctx *) ahash_request_ctx(areq); sha1_mb_add_list(rctx, cstate); kernel_fpu_begin(); - sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data, nbytes, flag); + sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data, + nbytes, flag); kernel_fpu_end(); /* check if anything is returned */ @@ -605,10 +638,10 @@ done: return ret; } -static int sha1_mb_final(struct shash_desc *desc, u8 *out) +static int 
sha1_mb_final(struct ahash_request *areq) { struct mcryptd_hash_request_ctx *rctx = - container_of(desc, struct mcryptd_hash_request_ctx, desc); + container_of(areq, struct mcryptd_hash_request_ctx, areq); struct mcryptd_alg_cstate *cstate = this_cpu_ptr(sha1_mb_alg_state.alg_cstate); @@ -623,16 +656,16 @@ static int sha1_mb_final(struct shash_desc *desc, u8 *out) } /* need to init context */ - req_ctx_init(rctx, desc); + req_ctx_init(rctx, areq); - rctx->out = out; rctx->flag |= HASH_DONE | HASH_FINAL; - sha_ctx = (struct sha1_hash_ctx *) shash_desc_ctx(desc); + sha_ctx = (struct sha1_hash_ctx *) ahash_request_ctx(areq); /* flag HASH_FINAL and 0 data size */ sha1_mb_add_list(rctx, cstate); kernel_fpu_begin(); - sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, &data, 0, HASH_LAST); + sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, &data, 0, + HASH_LAST); kernel_fpu_end(); /* check if anything is returned */ @@ -654,48 +687,98 @@ done: return ret; } -static int sha1_mb_export(struct shash_desc *desc, void *out) +static int sha1_mb_export(struct ahash_request *areq, void *out) { - struct sha1_hash_ctx *sctx = shash_desc_ctx(desc); + struct sha1_hash_ctx *sctx = ahash_request_ctx(areq); memcpy(out, sctx, sizeof(*sctx)); return 0; } -static int sha1_mb_import(struct shash_desc *desc, const void *in) +static int sha1_mb_import(struct ahash_request *areq, const void *in) { - struct sha1_hash_ctx *sctx = shash_desc_ctx(desc); + struct sha1_hash_ctx *sctx = ahash_request_ctx(areq); memcpy(sctx, in, sizeof(*sctx)); return 0; } +static int sha1_mb_async_init_tfm(struct crypto_tfm *tfm) +{ + struct mcryptd_ahash *mcryptd_tfm; + struct sha1_mb_ctx *ctx = crypto_tfm_ctx(tfm); + struct mcryptd_hash_ctx *mctx; -static struct shash_alg sha1_mb_shash_alg = { - .digestsize = SHA1_DIGEST_SIZE, + mcryptd_tfm = mcryptd_alloc_ahash("__intel_sha1-mb", + CRYPTO_ALG_INTERNAL, + CRYPTO_ALG_INTERNAL); + if (IS_ERR(mcryptd_tfm)) + return PTR_ERR(mcryptd_tfm); + mctx = 
crypto_ahash_ctx(&mcryptd_tfm->base); + mctx->alg_state = &sha1_mb_alg_state; + ctx->mcryptd_tfm = mcryptd_tfm; + crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm), + sizeof(struct ahash_request) + + crypto_ahash_reqsize(&mcryptd_tfm->base)); + + return 0; +} + +static void sha1_mb_async_exit_tfm(struct crypto_tfm *tfm) +{ + struct sha1_mb_ctx *ctx = crypto_tfm_ctx(tfm); + + mcryptd_free_ahash(ctx->mcryptd_tfm); +} + +static int sha1_mb_areq_init_tfm(struct crypto_tfm *tfm) +{ + crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm), + sizeof(struct ahash_request) + + sizeof(struct sha1_hash_ctx)); + + return 0; +} + +static void sha1_mb_areq_exit_tfm(struct crypto_tfm *tfm) +{ + struct sha1_mb_ctx *ctx = crypto_tfm_ctx(tfm); + + mcryptd_free_ahash(ctx->mcryptd_tfm); +} + +static struct ahash_alg sha1_mb_areq_alg = { .init = sha1_mb_init, .update = sha1_mb_update, .final = sha1_mb_final, .finup = sha1_mb_finup, .export = sha1_mb_export, .import = sha1_mb_import, - .descsize = sizeof(struct sha1_hash_ctx), - .statesize = sizeof(struct sha1_hash_ctx), - .base = { - .cra_name = "__sha1-mb", - .cra_driver_name = "__intel_sha1-mb", - .cra_priority = 100, - /* - * use ASYNC flag as some buffers in multi-buffer - * algo may not have completed before hashing thread sleep - */ - .cra_flags = CRYPTO_ALG_TYPE_SHASH | CRYPTO_ALG_ASYNC | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = SHA1_BLOCK_SIZE, - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(sha1_mb_shash_alg.base.cra_list), + .halg = { + .digestsize = SHA1_DIGEST_SIZE, + .statesize = sizeof(struct sha1_hash_ctx), + .base = { + .cra_name = "__sha1-mb", + .cra_driver_name = "__intel_sha1-mb", + .cra_priority = 100, + /* + * use ASYNC flag as some buffers in multi-buffer + * algo may not have completed before hashing thread + * sleep + */ + .cra_flags = CRYPTO_ALG_TYPE_AHASH | + CRYPTO_ALG_ASYNC | + CRYPTO_ALG_INTERNAL, + .cra_blocksize = SHA1_BLOCK_SIZE, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT + 
(sha1_mb_areq_alg.halg.base.cra_list), + .cra_init = sha1_mb_areq_init_tfm, + .cra_exit = sha1_mb_areq_exit_tfm, + .cra_ctxsize = sizeof(struct sha1_hash_ctx), + } } }; @@ -780,46 +863,20 @@ static int sha1_mb_async_import(struct ahash_request *req, const void *in) struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm); struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm; - struct crypto_shash *child = mcryptd_ahash_child(mcryptd_tfm); + struct crypto_ahash *child = mcryptd_ahash_child(mcryptd_tfm); struct mcryptd_hash_request_ctx *rctx; - struct shash_desc *desc; + struct ahash_request *areq; memcpy(mcryptd_req, req, sizeof(*req)); ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base); rctx = ahash_request_ctx(mcryptd_req); - desc = &rctx->desc; - desc->tfm = child; - desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; - - return crypto_ahash_import(mcryptd_req, in); -} - -static int sha1_mb_async_init_tfm(struct crypto_tfm *tfm) -{ - struct mcryptd_ahash *mcryptd_tfm; - struct sha1_mb_ctx *ctx = crypto_tfm_ctx(tfm); - struct mcryptd_hash_ctx *mctx; + areq = &rctx->areq; - mcryptd_tfm = mcryptd_alloc_ahash("__intel_sha1-mb", - CRYPTO_ALG_INTERNAL, - CRYPTO_ALG_INTERNAL); - if (IS_ERR(mcryptd_tfm)) - return PTR_ERR(mcryptd_tfm); - mctx = crypto_ahash_ctx(&mcryptd_tfm->base); - mctx->alg_state = &sha1_mb_alg_state; - ctx->mcryptd_tfm = mcryptd_tfm; - crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm), - sizeof(struct ahash_request) + - crypto_ahash_reqsize(&mcryptd_tfm->base)); + ahash_request_set_tfm(areq, child); + ahash_request_set_callback(areq, CRYPTO_TFM_REQ_MAY_SLEEP, + rctx->complete, req); - return 0; -} - -static void sha1_mb_async_exit_tfm(struct crypto_tfm *tfm) -{ - struct sha1_mb_ctx *ctx = crypto_tfm_ctx(tfm); - - mcryptd_free_ahash(ctx->mcryptd_tfm); + return crypto_ahash_import(mcryptd_req, in); } static struct ahash_alg sha1_mb_async_alg = { @@ -866,7 +923,8 @@ static unsigned long sha1_mb_flusher(struct 
mcryptd_alg_cstate *cstate) if (time_before(cur_time, rctx->tag.expire)) break; kernel_fpu_begin(); - sha_ctx = (struct sha1_hash_ctx *) sha1_ctx_mgr_flush(cstate->mgr); + sha_ctx = (struct sha1_hash_ctx *) + sha1_ctx_mgr_flush(cstate->mgr); kernel_fpu_end(); if (!sha_ctx) { pr_err("sha1_mb error: nothing got flushed for non-empty list\n"); @@ -927,7 +985,7 @@ static int __init sha1_mb_mod_init(void) } sha1_mb_alg_state.flusher = &sha1_mb_flusher; - err = crypto_register_shash(&sha1_mb_shash_alg); + err = crypto_register_ahash(&sha1_mb_areq_alg); if (err) goto err2; err = crypto_register_ahash(&sha1_mb_async_alg); @@ -937,7 +995,7 @@ static int __init sha1_mb_mod_init(void) return 0; err1: - crypto_unregister_shash(&sha1_mb_shash_alg); + crypto_unregister_ahash(&sha1_mb_areq_alg); err2: for_each_possible_cpu(cpu) { cpu_state = per_cpu_ptr(sha1_mb_alg_state.alg_cstate, cpu); @@ -953,7 +1011,7 @@ static void __exit sha1_mb_mod_fini(void) struct mcryptd_alg_cstate *cpu_state; crypto_unregister_ahash(&sha1_mb_async_alg); - crypto_unregister_shash(&sha1_mb_shash_alg); + crypto_unregister_ahash(&sha1_mb_areq_alg); for_each_possible_cpu(cpu) { cpu_state = per_cpu_ptr(sha1_mb_alg_state.alg_cstate, cpu); kfree(cpu_state->mgr); diff --git a/arch/x86/crypto/sha-mb/sha_mb_ctx.h b/arch/x86/crypto/sha1-mb/sha1_mb_ctx.h index e36069d0c1bd..98a35bcc6f4a 100644 --- a/arch/x86/crypto/sha-mb/sha_mb_ctx.h +++ b/arch/x86/crypto/sha1-mb/sha1_mb_ctx.h @@ -54,7 +54,7 @@ #ifndef _SHA_MB_CTX_INTERNAL_H #define _SHA_MB_CTX_INTERNAL_H -#include "sha_mb_mgr.h" +#include "sha1_mb_mgr.h" #define HASH_UPDATE 0x00 #define HASH_FIRST 0x01 diff --git a/arch/x86/crypto/sha-mb/sha_mb_mgr.h b/arch/x86/crypto/sha1-mb/sha1_mb_mgr.h index 08ad1a9acfd7..08ad1a9acfd7 100644 --- a/arch/x86/crypto/sha-mb/sha_mb_mgr.h +++ b/arch/x86/crypto/sha1-mb/sha1_mb_mgr.h diff --git a/arch/x86/crypto/sha-mb/sha1_mb_mgr_datastruct.S b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_datastruct.S index 86688c6e7a25..86688c6e7a25 100644 
--- a/arch/x86/crypto/sha-mb/sha1_mb_mgr_datastruct.S +++ b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_datastruct.S diff --git a/arch/x86/crypto/sha-mb/sha1_mb_mgr_flush_avx2.S b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_flush_avx2.S index 96df6a39d7e2..96df6a39d7e2 100644 --- a/arch/x86/crypto/sha-mb/sha1_mb_mgr_flush_avx2.S +++ b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_flush_avx2.S diff --git a/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_init_avx2.c index 822acb5b464c..d2add0d35f43 100644 --- a/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c +++ b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_init_avx2.c @@ -51,7 +51,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include "sha_mb_mgr.h" +#include "sha1_mb_mgr.h" void sha1_mb_mgr_init_avx2(struct sha1_mb_mgr *state) { diff --git a/arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_submit_avx2.S index 63a0d9c8e31f..63a0d9c8e31f 100644 --- a/arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S +++ b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_submit_avx2.S diff --git a/arch/x86/crypto/sha-mb/sha1_x8_avx2.S b/arch/x86/crypto/sha1-mb/sha1_x8_avx2.S index 8e1b47792b31..c9dae1cd2919 100644 --- a/arch/x86/crypto/sha-mb/sha1_x8_avx2.S +++ b/arch/x86/crypto/sha1-mb/sha1_x8_avx2.S @@ -296,7 +296,11 @@ W14 = TMP_ # ENTRY(sha1_x8_avx2) - push RSP_SAVE + # save callee-saved clobbered registers to comply with C function ABI + push %r12 + push %r13 + push %r14 + push %r15 #save rsp mov %rsp, RSP_SAVE @@ -446,7 +450,12 @@ lloop: ## Postamble mov RSP_SAVE, %rsp - pop RSP_SAVE + + # restore callee-saved clobbered registers + pop %r15 + pop %r14 + pop %r13 + pop %r12 ret ENDPROC(sha1_x8_avx2) diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c index 1024e378a358..fc61739150e7 100644 --- a/arch/x86/crypto/sha1_ssse3_glue.c +++ b/arch/x86/crypto/sha1_ssse3_glue.c @@ -374,3 +374,9 @@ MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("SHA1 
Secure Hash Algorithm, Supplemental SSE3 accelerated"); MODULE_ALIAS_CRYPTO("sha1"); +MODULE_ALIAS_CRYPTO("sha1-ssse3"); +MODULE_ALIAS_CRYPTO("sha1-avx"); +MODULE_ALIAS_CRYPTO("sha1-avx2"); +#ifdef CONFIG_AS_SHA1_NI +MODULE_ALIAS_CRYPTO("sha1-ni"); +#endif diff --git a/arch/x86/crypto/sha256-mb/Makefile b/arch/x86/crypto/sha256-mb/Makefile new file mode 100644 index 000000000000..41089e7c400c --- /dev/null +++ b/arch/x86/crypto/sha256-mb/Makefile @@ -0,0 +1,11 @@ +# +# Arch-specific CryptoAPI modules. +# + +avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\ + $(comma)4)$(comma)%ymm2,yes,no) +ifeq ($(avx2_supported),yes) + obj-$(CONFIG_CRYPTO_SHA256_MB) += sha256-mb.o + sha256-mb-y := sha256_mb.o sha256_mb_mgr_flush_avx2.o \ + sha256_mb_mgr_init_avx2.o sha256_mb_mgr_submit_avx2.o sha256_x8_avx2.o +endif diff --git a/arch/x86/crypto/sha256-mb/sha256_mb.c b/arch/x86/crypto/sha256-mb/sha256_mb.c new file mode 100644 index 000000000000..89fa85e8b10c --- /dev/null +++ b/arch/x86/crypto/sha256-mb/sha256_mb.c @@ -0,0 +1,1030 @@ +/* + * Multi buffer SHA256 algorithm Glue Code + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2016 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Megha Dey <megha.dey@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2016 Intel Corporation. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <crypto/internal/hash.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/cryptohash.h> +#include <linux/types.h> +#include <linux/list.h> +#include <crypto/scatterwalk.h> +#include <crypto/sha.h> +#include <crypto/mcryptd.h> +#include <crypto/crypto_wq.h> +#include <asm/byteorder.h> +#include <linux/hardirq.h> +#include <asm/fpu/api.h> +#include "sha256_mb_ctx.h" + +#define FLUSH_INTERVAL 1000 /* in usec */ + +static struct mcryptd_alg_state sha256_mb_alg_state; + +struct sha256_mb_ctx { + struct mcryptd_ahash *mcryptd_tfm; +}; + +static inline struct mcryptd_hash_request_ctx + *cast_hash_to_mcryptd_ctx(struct sha256_hash_ctx *hash_ctx) +{ + struct ahash_request *areq; + + areq = container_of((void *) hash_ctx, struct ahash_request, __ctx); + return container_of(areq, struct mcryptd_hash_request_ctx, areq); +} + +static inline struct ahash_request + *cast_mcryptd_ctx_to_req(struct mcryptd_hash_request_ctx *ctx) +{ + return container_of((void *) ctx, struct ahash_request, __ctx); +} + +static void req_ctx_init(struct mcryptd_hash_request_ctx *rctx, + struct ahash_request *areq) +{ + rctx->flag = HASH_UPDATE; +} + +static asmlinkage void (*sha256_job_mgr_init)(struct sha256_mb_mgr *state); +static asmlinkage struct job_sha256* (*sha256_job_mgr_submit) + (struct sha256_mb_mgr *state, struct job_sha256 *job); +static asmlinkage struct job_sha256* (*sha256_job_mgr_flush) + (struct sha256_mb_mgr *state); +static asmlinkage struct job_sha256* (*sha256_job_mgr_get_comp_job) + (struct sha256_mb_mgr *state); + +inline void sha256_init_digest(uint32_t *digest) +{ + static const uint32_t initial_digest[SHA256_DIGEST_LENGTH] = { + SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3, + SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7}; + memcpy(digest, initial_digest, sizeof(initial_digest)); +} + +inline uint32_t sha256_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], + uint32_t total_len) 
+{
+	/*
+	 * Write the SHA-256 padding into padblock: a 0x80 byte, zero fill,
+	 * and the message length in bits as a big-endian 64-bit value.
+	 * i starts as the offset just past the message bytes that do not
+	 * fill a whole block.
+	 */
+	uint32_t i = total_len & (SHA256_BLOCK_SIZE - 1);
+
+	memset(&padblock[i], 0, SHA256_BLOCK_SIZE);
+	padblock[i] = 0x80;
+
+	/* Advance i to the end of the padded data (a whole number of blocks) */
+	i += ((SHA256_BLOCK_SIZE - 1) &
+	      (0 - (total_len + SHA256_PADLENGTHFIELD_SIZE + 1)))
+	     + 1 + SHA256_PADLENGTHFIELD_SIZE;
+
+#if SHA256_PADLENGTHFIELD_SIZE == 16
+	*((uint64_t *) &padblock[i - 16]) = 0;
+#endif
+
+	/*
+	 * Widen before shifting: total_len is only 32 bits wide, so
+	 * "total_len << 3" would discard the top three bits of the bit
+	 * count for messages of 512MB or more and yield a wrong digest.
+	 */
+	*((uint64_t *) &padblock[i - 8]) =
+				cpu_to_be64((uint64_t) total_len << 3);
+
+	/* Number of extra blocks to hash */
+	return i >> SHA256_LOG2_BLOCK_SIZE;
+}
+
+/*
+ * Keep feeding @ctx, and then whichever context the job manager hands back
+ * in its place, until a context is either complete or idle, and return it.
+ * Returns NULL once the job manager hands back no context.
+ */
+static struct sha256_hash_ctx
+	*sha256_ctx_mgr_resubmit(struct sha256_ctx_mgr *mgr,
+					struct sha256_hash_ctx *ctx)
+{
+	while (ctx) {
+		if (ctx->status & HASH_CTX_STS_COMPLETE) {
+			/* Clear PROCESSING bit */
+			ctx->status = HASH_CTX_STS_COMPLETE;
+			return ctx;
+		}
+
+		/*
+		 * If the extra blocks are empty, begin hashing what remains
+		 * in the user's buffer.
+		 */
+		if (ctx->partial_block_buffer_length == 0 &&
+		    ctx->incoming_buffer_length) {
+
+			const void *buffer = ctx->incoming_buffer;
+			uint32_t len = ctx->incoming_buffer_length;
+			uint32_t copy_len;
+
+			/*
+			 * Only entire blocks can be hashed.
+			 * Copy remainder to extra blocks buffer.
+			 */
+			copy_len = len & (SHA256_BLOCK_SIZE-1);
+
+			if (copy_len) {
+				len -= copy_len;
+				memcpy(ctx->partial_block_buffer,
+				       ((const char *) buffer + len),
+				       copy_len);
+				ctx->partial_block_buffer_length = copy_len;
+			}
+
+			ctx->incoming_buffer_length = 0;
+
+			/* len should be a multiple of the block size now */
+			assert((len % SHA256_BLOCK_SIZE) == 0);
+
+			/* Set len to the number of blocks to be hashed */
+			len >>= SHA256_LOG2_BLOCK_SIZE;
+
+			if (len) {
+
+				ctx->job.buffer = (uint8_t *) buffer;
+				ctx->job.len = len;
+				ctx = (struct sha256_hash_ctx *)
+				sha256_job_mgr_submit(&mgr->mgr, &ctx->job);
+				continue;
+			}
+		}
+
+		/*
+		 * If the extra blocks are not empty, then we are
+		 * either on the last block(s) or we need more
+		 * user input before continuing.
+ */ + if (ctx->status & HASH_CTX_STS_LAST) { + + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = + sha256_pad(buf, ctx->total_length); + + ctx->status = (HASH_CTX_STS_PROCESSING | + HASH_CTX_STS_COMPLETE); + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + ctx = (struct sha256_hash_ctx *) + sha256_job_mgr_submit(&mgr->mgr, &ctx->job); + continue; + } + + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +static struct sha256_hash_ctx + *sha256_ctx_mgr_get_comp_ctx(struct sha256_ctx_mgr *mgr) +{ + /* + * If get_comp_job returns NULL, there are no jobs complete. + * If get_comp_job returns a job, verify that it is safe to return to + * the user. If it is not ready, resubmit the job to finish processing. + * If sha256_ctx_mgr_resubmit returned a job, it is ready to be + * returned. Otherwise, all jobs currently being managed by the + * hash_ctx_mgr still need processing. + */ + struct sha256_hash_ctx *ctx; + + ctx = (struct sha256_hash_ctx *) sha256_job_mgr_get_comp_job(&mgr->mgr); + return sha256_ctx_mgr_resubmit(mgr, ctx); +} + +static void sha256_ctx_mgr_init(struct sha256_ctx_mgr *mgr) +{ + sha256_job_mgr_init(&mgr->mgr); +} + +static struct sha256_hash_ctx *sha256_ctx_mgr_submit(struct sha256_ctx_mgr *mgr, + struct sha256_hash_ctx *ctx, + const void *buffer, + uint32_t len, + int flags) +{ + if (flags & (~HASH_ENTIRE)) { + /* User should not pass anything other than FIRST, UPDATE + * or LAST + */ + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + /* Cannot submit to a currently processing job. */ + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + /* Cannot update a finished job. 
*/ + ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags & HASH_FIRST) { + /* Init digest */ + sha256_init_digest(ctx->job.result_digest); + + /* Reset byte counter */ + ctx->total_length = 0; + + /* Clear extra blocks */ + ctx->partial_block_buffer_length = 0; + } + + /* If we made it here, there was no error during this call to submit */ + ctx->error = HASH_CTX_ERROR_NONE; + + /* Store buffer ptr info from user */ + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + /* + * Store the user's request flags and mark this ctx as currently + * being processed. + */ + ctx->status = (flags & HASH_LAST) ? + (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + /* Advance byte counter */ + ctx->total_length += len; + + /* + * If there is anything currently buffered in the extra blocks, + * append to it until it contains a whole block. + * Or if the user's buffer contains less than a whole block, + * append as much as possible to the extra block. + */ + if (ctx->partial_block_buffer_length || len < SHA256_BLOCK_SIZE) { + /* + * Compute how many bytes to copy from user buffer into + * extra block + */ + uint32_t copy_len = SHA256_BLOCK_SIZE - + ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + /* Copy and update relevant pointers and counters */ + memcpy( + &ctx->partial_block_buffer[ctx->partial_block_buffer_length], + buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *) + ((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + + /* The extra block should never contain more than 1 block */ + assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE); + + /* + * If the extra block buffer contains exactly 1 block, + * it can be hashed. 
+ */ + if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + ctx = (struct sha256_hash_ctx *) + sha256_job_mgr_submit(&mgr->mgr, &ctx->job); + } + } + + return sha256_ctx_mgr_resubmit(mgr, ctx); +} + +static struct sha256_hash_ctx *sha256_ctx_mgr_flush(struct sha256_ctx_mgr *mgr) +{ + struct sha256_hash_ctx *ctx; + + while (1) { + ctx = (struct sha256_hash_ctx *) + sha256_job_mgr_flush(&mgr->mgr); + + /* If flush returned 0, there are no more jobs in flight. */ + if (!ctx) + return NULL; + + /* + * If flush returned a job, resubmit the job to finish + * processing. + */ + ctx = sha256_ctx_mgr_resubmit(mgr, ctx); + + /* + * If sha256_ctx_mgr_resubmit returned a job, it is ready to + * be returned. Otherwise, all jobs currently being managed by + * the sha256_ctx_mgr still need processing. Loop. + */ + if (ctx) + return ctx; + } +} + +static int sha256_mb_init(struct ahash_request *areq) +{ + struct sha256_hash_ctx *sctx = ahash_request_ctx(areq); + + hash_ctx_init(sctx); + sctx->job.result_digest[0] = SHA256_H0; + sctx->job.result_digest[1] = SHA256_H1; + sctx->job.result_digest[2] = SHA256_H2; + sctx->job.result_digest[3] = SHA256_H3; + sctx->job.result_digest[4] = SHA256_H4; + sctx->job.result_digest[5] = SHA256_H5; + sctx->job.result_digest[6] = SHA256_H6; + sctx->job.result_digest[7] = SHA256_H7; + sctx->total_length = 0; + sctx->partial_block_buffer_length = 0; + sctx->status = HASH_CTX_STS_IDLE; + + return 0; +} + +static int sha256_mb_set_results(struct mcryptd_hash_request_ctx *rctx) +{ + int i; + struct sha256_hash_ctx *sctx = ahash_request_ctx(&rctx->areq); + __be32 *dst = (__be32 *) rctx->out; + + for (i = 0; i < 8; ++i) + dst[i] = cpu_to_be32(sctx->job.result_digest[i]); + + return 0; +} + +static int sha_finish_walk(struct mcryptd_hash_request_ctx **ret_rctx, + struct mcryptd_alg_cstate *cstate, bool flush) +{ + int flag = HASH_UPDATE; + 
int nbytes, err = 0;
+	struct mcryptd_hash_request_ctx *rctx = *ret_rctx;
+	struct sha256_hash_ctx *sha_ctx;
+
+	/* more work ? */
+	while (!(rctx->flag & HASH_DONE)) {
+		nbytes = crypto_ahash_walk_done(&rctx->walk, 0);
+		if (nbytes < 0) {
+			err = nbytes;
+			goto out;
+		}
+		/* check if the walk is done */
+		if (crypto_ahash_walk_last(&rctx->walk)) {
+			rctx->flag |= HASH_DONE;
+			if (rctx->flag & HASH_FINAL)
+				flag |= HASH_LAST;
+
+		}
+		sha_ctx = (struct sha256_hash_ctx *)
+						ahash_request_ctx(&rctx->areq);
+		kernel_fpu_begin();
+		sha_ctx = sha256_ctx_mgr_submit(cstate->mgr, sha_ctx,
+						rctx->walk.data, nbytes, flag);
+		if (!sha_ctx) {
+			if (flush)
+				sha_ctx = sha256_ctx_mgr_flush(cstate->mgr);
+		}
+		kernel_fpu_end();
+		/* From here on work on whichever request the manager returned */
+		if (sha_ctx)
+			rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+		else {
+			rctx = NULL;
+			goto out;
+		}
+	}
+
+	/* copy the results */
+	if (rctx->flag & HASH_FINAL)
+		sha256_mb_set_results(rctx);
+
+out:
+	*ret_rctx = rctx;
+	return err;
+}
+
+/*
+ * Take @rctx off the per-cpu work list and invoke its completion callback
+ * with @err, then finish and complete every other request whose context the
+ * manager reports as done. Always returns 0.
+ */
+static int sha_complete_job(struct mcryptd_hash_request_ctx *rctx,
+			    struct mcryptd_alg_cstate *cstate,
+			    int err)
+{
+	struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx);
+	struct sha256_hash_ctx *sha_ctx;
+	struct mcryptd_hash_request_ctx *req_ctx;
+	int ret;
+
+	/* remove from work list */
+	spin_lock(&cstate->work_lock);
+	list_del(&rctx->waiter);
+	spin_unlock(&cstate->work_lock);
+
+	if (irqs_disabled())
+		rctx->complete(&req->base, err);
+	else {
+		local_bh_disable();
+		rctx->complete(&req->base, err);
+		local_bh_enable();
+	}
+
+	/* check to see if there are other jobs that are done */
+	sha_ctx = sha256_ctx_mgr_get_comp_ctx(cstate->mgr);
+	while (sha_ctx) {
+		req_ctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+		ret = sha_finish_walk(&req_ctx, cstate, false);
+		if (req_ctx) {
+			spin_lock(&cstate->work_lock);
+			list_del(&req_ctx->waiter);
+			spin_unlock(&cstate->work_lock);
+
+			req = cast_mcryptd_ctx_to_req(req_ctx);
+			/*
+			 * Complete through req_ctx, the request that just
+			 * finished. rctx has already been completed above,
+			 * so its callback belongs to a different request and
+			 * rctx itself may no longer be valid.
+			 */
+			if (irqs_disabled())
+				req_ctx->complete(&req->base, ret);
+			else {
+				local_bh_disable();
+				req_ctx->complete(&req->base, ret);
+				local_bh_enable();
+			}
+		}
+		sha_ctx = sha256_ctx_mgr_get_comp_ctx(cstate->mgr);
+	}
+
+	return 0;
+}
+
+/*
+ * Stamp @rctx with its arrival time, sequence number and expiry time, queue
+ * it at the tail of the per-cpu work list and arm the flusher.
+ */
+static void sha256_mb_add_list(struct mcryptd_hash_request_ctx *rctx,
+			       struct mcryptd_alg_cstate *cstate)
+{
+	unsigned long next_flush;
+	unsigned long delay = usecs_to_jiffies(FLUSH_INTERVAL);
+
+	/* initialize tag */
+	rctx->tag.arrival = jiffies;    /* tag the arrival time */
+	rctx->tag.seq_num = cstate->next_seq_num++;
+	next_flush = rctx->tag.arrival + delay;
+	rctx->tag.expire = next_flush;
+
+	spin_lock(&cstate->work_lock);
+	list_add_tail(&rctx->waiter, &cstate->work_list);
+	spin_unlock(&cstate->work_lock);
+
+	mcryptd_arm_flusher(cstate, delay);
+}
+
+/*
+ * Update entry point of the inner algorithm: submit the first chunk of the
+ * request walk to this cpu's context manager. Returns -EINPROGRESS while no
+ * request is ready to be completed; otherwise completes the request that
+ * came back and returns its status.
+ */
+static int sha256_mb_update(struct ahash_request *areq)
+{
+	struct mcryptd_hash_request_ctx *rctx =
+		container_of(areq, struct mcryptd_hash_request_ctx, areq);
+	struct mcryptd_alg_cstate *cstate =
+				this_cpu_ptr(sha256_mb_alg_state.alg_cstate);
+
+	struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx);
+	struct sha256_hash_ctx *sha_ctx;
+	int ret = 0, nbytes;
+
+	/* sanity check */
+	if (rctx->tag.cpu != smp_processor_id()) {
+		pr_err("mcryptd error: cpu clash\n");
+		goto done;
+	}
+
+	/* need to init context */
+	req_ctx_init(rctx, areq);
+
+	nbytes = crypto_ahash_walk_first(req, &rctx->walk);
+
+	if (nbytes < 0) {
+		ret = nbytes;
+		goto done;
+	}
+
+	if (crypto_ahash_walk_last(&rctx->walk))
+		rctx->flag |= HASH_DONE;
+
+	/* submit */
+	sha_ctx = (struct sha256_hash_ctx *) ahash_request_ctx(areq);
+	sha256_mb_add_list(rctx, cstate);
+	kernel_fpu_begin();
+	sha_ctx = sha256_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data,
+							nbytes, HASH_UPDATE);
+	kernel_fpu_end();
+
+	/* check if anything is returned */
+	if (!sha_ctx)
+		return -EINPROGRESS;
+
+	if (sha_ctx->error) {
+		ret = sha_ctx->error;
+		rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+		goto done;
+	}
+
+	rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+	ret = sha_finish_walk(&rctx, cstate, false);
+
+	if (!rctx)
+
return -EINPROGRESS;
+done:
+	sha_complete_job(rctx, cstate, ret);
+	return ret;
+}
+
+/*
+ * Finup entry point of the inner algorithm: like update, but the request is
+ * flagged HASH_FINAL so the digest is copied out once its walk is done.
+ * Returns -EINPROGRESS while no request is ready to be completed; otherwise
+ * completes the request that came back and returns its status.
+ */
+static int sha256_mb_finup(struct ahash_request *areq)
+{
+	struct mcryptd_hash_request_ctx *rctx =
+		container_of(areq, struct mcryptd_hash_request_ctx, areq);
+	struct mcryptd_alg_cstate *cstate =
+				this_cpu_ptr(sha256_mb_alg_state.alg_cstate);
+
+	struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx);
+	struct sha256_hash_ctx *sha_ctx;
+	int ret = 0, flag = HASH_UPDATE, nbytes;
+
+	/* sanity check */
+	if (rctx->tag.cpu != smp_processor_id()) {
+		pr_err("mcryptd error: cpu clash\n");
+		goto done;
+	}
+
+	/* need to init context */
+	req_ctx_init(rctx, areq);
+
+	nbytes = crypto_ahash_walk_first(req, &rctx->walk);
+
+	if (nbytes < 0) {
+		ret = nbytes;
+		goto done;
+	}
+
+	if (crypto_ahash_walk_last(&rctx->walk)) {
+		rctx->flag |= HASH_DONE;
+		flag = HASH_LAST;
+	}
+
+	/* submit */
+	rctx->flag |= HASH_FINAL;
+	sha_ctx = (struct sha256_hash_ctx *) ahash_request_ctx(areq);
+	sha256_mb_add_list(rctx, cstate);
+
+	kernel_fpu_begin();
+	sha_ctx = sha256_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data,
+								nbytes, flag);
+	kernel_fpu_end();
+
+	/* check if anything is returned */
+	if (!sha_ctx)
+		return -EINPROGRESS;
+
+	if (sha_ctx->error) {
+		ret = sha_ctx->error;
+		/*
+		 * Report the error against the request that owns the
+		 * returned context, as the update and final paths do;
+		 * submit may hand back a context other than the one
+		 * passed in.
+		 */
+		rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+		goto done;
+	}
+
+	rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+	ret = sha_finish_walk(&rctx, cstate, false);
+	if (!rctx)
+		return -EINPROGRESS;
+done:
+	sha_complete_job(rctx, cstate, ret);
+	return ret;
+}
+
+/*
+ * Final entry point of the inner algorithm: submits a zero-length chunk
+ * flagged HASH_LAST so that the padding is hashed and the digest produced.
+ */
+static int sha256_mb_final(struct ahash_request *areq)
+{
+	struct mcryptd_hash_request_ctx *rctx =
+			container_of(areq, struct mcryptd_hash_request_ctx,
+			areq);
+	struct mcryptd_alg_cstate *cstate =
+				this_cpu_ptr(sha256_mb_alg_state.alg_cstate);
+
+	struct sha256_hash_ctx *sha_ctx;
+	int ret = 0;
+	u8 data;
+
+	/* sanity check */
+	if (rctx->tag.cpu != smp_processor_id()) {
+		pr_err("mcryptd error: cpu clash\n");
+		goto done;
+	}
+
+	/* need to init context */
+	req_ctx_init(rctx, areq);
+
+	rctx->flag
|= HASH_DONE | HASH_FINAL; + + sha_ctx = (struct sha256_hash_ctx *) ahash_request_ctx(areq); + /* flag HASH_FINAL and 0 data size */ + sha256_mb_add_list(rctx, cstate); + kernel_fpu_begin(); + sha_ctx = sha256_ctx_mgr_submit(cstate->mgr, sha_ctx, &data, 0, + HASH_LAST); + kernel_fpu_end(); + + /* check if anything is returned */ + if (!sha_ctx) + return -EINPROGRESS; + + if (sha_ctx->error) { + ret = sha_ctx->error; + rctx = cast_hash_to_mcryptd_ctx(sha_ctx); + goto done; + } + + rctx = cast_hash_to_mcryptd_ctx(sha_ctx); + ret = sha_finish_walk(&rctx, cstate, false); + if (!rctx) + return -EINPROGRESS; +done: + sha_complete_job(rctx, cstate, ret); + return ret; +} + +static int sha256_mb_export(struct ahash_request *areq, void *out) +{ + struct sha256_hash_ctx *sctx = ahash_request_ctx(areq); + + memcpy(out, sctx, sizeof(*sctx)); + + return 0; +} + +static int sha256_mb_import(struct ahash_request *areq, const void *in) +{ + struct sha256_hash_ctx *sctx = ahash_request_ctx(areq); + + memcpy(sctx, in, sizeof(*sctx)); + + return 0; +} + +static int sha256_mb_async_init_tfm(struct crypto_tfm *tfm) +{ + struct mcryptd_ahash *mcryptd_tfm; + struct sha256_mb_ctx *ctx = crypto_tfm_ctx(tfm); + struct mcryptd_hash_ctx *mctx; + + mcryptd_tfm = mcryptd_alloc_ahash("__intel_sha256-mb", + CRYPTO_ALG_INTERNAL, + CRYPTO_ALG_INTERNAL); + if (IS_ERR(mcryptd_tfm)) + return PTR_ERR(mcryptd_tfm); + mctx = crypto_ahash_ctx(&mcryptd_tfm->base); + mctx->alg_state = &sha256_mb_alg_state; + ctx->mcryptd_tfm = mcryptd_tfm; + crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm), + sizeof(struct ahash_request) + + crypto_ahash_reqsize(&mcryptd_tfm->base)); + + return 0; +} + +static void sha256_mb_async_exit_tfm(struct crypto_tfm *tfm) +{ + struct sha256_mb_ctx *ctx = crypto_tfm_ctx(tfm); + + mcryptd_free_ahash(ctx->mcryptd_tfm); +} + +static int sha256_mb_areq_init_tfm(struct crypto_tfm *tfm) +{ + crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm), + sizeof(struct ahash_request) + + 
sizeof(struct sha256_hash_ctx)); + + return 0; +} + +static void sha256_mb_areq_exit_tfm(struct crypto_tfm *tfm) +{ + struct sha256_mb_ctx *ctx = crypto_tfm_ctx(tfm); + + mcryptd_free_ahash(ctx->mcryptd_tfm); +} + +static struct ahash_alg sha256_mb_areq_alg = { + .init = sha256_mb_init, + .update = sha256_mb_update, + .final = sha256_mb_final, + .finup = sha256_mb_finup, + .export = sha256_mb_export, + .import = sha256_mb_import, + .halg = { + .digestsize = SHA256_DIGEST_SIZE, + .statesize = sizeof(struct sha256_hash_ctx), + .base = { + .cra_name = "__sha256-mb", + .cra_driver_name = "__intel_sha256-mb", + .cra_priority = 100, + /* + * use ASYNC flag as some buffers in multi-buffer + * algo may not have completed before hashing thread + * sleep + */ + .cra_flags = CRYPTO_ALG_TYPE_AHASH | + CRYPTO_ALG_ASYNC | + CRYPTO_ALG_INTERNAL, + .cra_blocksize = SHA256_BLOCK_SIZE, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT + (sha256_mb_areq_alg.halg.base.cra_list), + .cra_init = sha256_mb_areq_init_tfm, + .cra_exit = sha256_mb_areq_exit_tfm, + .cra_ctxsize = sizeof(struct sha256_hash_ctx), + } + } +}; + +static int sha256_mb_async_init(struct ahash_request *req) +{ + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct sha256_mb_ctx *ctx = crypto_ahash_ctx(tfm); + struct ahash_request *mcryptd_req = ahash_request_ctx(req); + struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm; + + memcpy(mcryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base); + return crypto_ahash_init(mcryptd_req); +} + +static int sha256_mb_async_update(struct ahash_request *req) +{ + struct ahash_request *mcryptd_req = ahash_request_ctx(req); + + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct sha256_mb_ctx *ctx = crypto_ahash_ctx(tfm); + struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm; + + memcpy(mcryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base); + return crypto_ahash_update(mcryptd_req); 
+} + +static int sha256_mb_async_finup(struct ahash_request *req) +{ + struct ahash_request *mcryptd_req = ahash_request_ctx(req); + + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct sha256_mb_ctx *ctx = crypto_ahash_ctx(tfm); + struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm; + + memcpy(mcryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base); + return crypto_ahash_finup(mcryptd_req); +} + +static int sha256_mb_async_final(struct ahash_request *req) +{ + struct ahash_request *mcryptd_req = ahash_request_ctx(req); + + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct sha256_mb_ctx *ctx = crypto_ahash_ctx(tfm); + struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm; + + memcpy(mcryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base); + return crypto_ahash_final(mcryptd_req); +} + +static int sha256_mb_async_digest(struct ahash_request *req) +{ + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct sha256_mb_ctx *ctx = crypto_ahash_ctx(tfm); + struct ahash_request *mcryptd_req = ahash_request_ctx(req); + struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm; + + memcpy(mcryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base); + return crypto_ahash_digest(mcryptd_req); +} + +static int sha256_mb_async_export(struct ahash_request *req, void *out) +{ + struct ahash_request *mcryptd_req = ahash_request_ctx(req); + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct sha256_mb_ctx *ctx = crypto_ahash_ctx(tfm); + struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm; + + memcpy(mcryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base); + return crypto_ahash_export(mcryptd_req, out); +} + +static int sha256_mb_async_import(struct ahash_request *req, const void *in) +{ + struct ahash_request *mcryptd_req = ahash_request_ctx(req); + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct 
sha256_mb_ctx *ctx = crypto_ahash_ctx(tfm); + struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm; + struct crypto_ahash *child = mcryptd_ahash_child(mcryptd_tfm); + struct mcryptd_hash_request_ctx *rctx; + struct ahash_request *areq; + + memcpy(mcryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base); + rctx = ahash_request_ctx(mcryptd_req); + areq = &rctx->areq; + + ahash_request_set_tfm(areq, child); + ahash_request_set_callback(areq, CRYPTO_TFM_REQ_MAY_SLEEP, + rctx->complete, req); + + return crypto_ahash_import(mcryptd_req, in); +} + +static struct ahash_alg sha256_mb_async_alg = { + .init = sha256_mb_async_init, + .update = sha256_mb_async_update, + .final = sha256_mb_async_final, + .finup = sha256_mb_async_finup, + .export = sha256_mb_async_export, + .import = sha256_mb_async_import, + .digest = sha256_mb_async_digest, + .halg = { + .digestsize = SHA256_DIGEST_SIZE, + .statesize = sizeof(struct sha256_hash_ctx), + .base = { + .cra_name = "sha256", + .cra_driver_name = "sha256_mb", + .cra_priority = 200, + .cra_flags = CRYPTO_ALG_TYPE_AHASH | + CRYPTO_ALG_ASYNC, + .cra_blocksize = SHA256_BLOCK_SIZE, + .cra_type = &crypto_ahash_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT + (sha256_mb_async_alg.halg.base.cra_list), + .cra_init = sha256_mb_async_init_tfm, + .cra_exit = sha256_mb_async_exit_tfm, + .cra_ctxsize = sizeof(struct sha256_mb_ctx), + .cra_alignmask = 0, + }, + }, +}; + +static unsigned long sha256_mb_flusher(struct mcryptd_alg_cstate *cstate) +{ + struct mcryptd_hash_request_ctx *rctx; + unsigned long cur_time; + unsigned long next_flush = 0; + struct sha256_hash_ctx *sha_ctx; + + + cur_time = jiffies; + + while (!list_empty(&cstate->work_list)) { + rctx = list_entry(cstate->work_list.next, + struct mcryptd_hash_request_ctx, waiter); + if (time_before(cur_time, rctx->tag.expire)) + break; + kernel_fpu_begin(); + sha_ctx = (struct sha256_hash_ctx *) + sha256_ctx_mgr_flush(cstate->mgr); + 
kernel_fpu_end();
+		if (!sha_ctx) {
+			pr_err("sha256_mb error: nothing got"
+					" flushed for non-empty list\n");
+			break;
+		}
+		rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+		sha_finish_walk(&rctx, cstate, true);
+		sha_complete_job(rctx, cstate, 0);
+	}
+
+	/* Re-arm the flusher for the oldest request still waiting */
+	if (!list_empty(&cstate->work_list)) {
+		rctx = list_entry(cstate->work_list.next,
+				struct mcryptd_hash_request_ctx, waiter);
+		/* get the hash context and then flush time */
+		next_flush = rctx->tag.expire;
+		mcryptd_arm_flusher(cstate, get_delay(next_flush));
+	}
+	return next_flush;
+}
+
+/*
+ * Module init: requires AVX2 and BMI2, sets up one context manager and work
+ * list per possible cpu, then registers the inner and the outer algorithm.
+ * Returns 0 on success, -ENODEV when the cpu features are missing, -ENOMEM
+ * on allocation failure, or the error from crypto_register_ahash().
+ */
+static int __init sha256_mb_mod_init(void)
+{
+	int cpu;
+	int err;
+	struct mcryptd_alg_cstate *cpu_state;
+
+	/* check for dependent cpu features */
+	if (!boot_cpu_has(X86_FEATURE_AVX2) ||
+	    !boot_cpu_has(X86_FEATURE_BMI2))
+		return -ENODEV;
+
+	/* initialize multibuffer structures */
+	sha256_mb_alg_state.alg_cstate = alloc_percpu
+						(struct mcryptd_alg_cstate);
+	if (!sha256_mb_alg_state.alg_cstate)
+		return -ENOMEM;
+
+	sha256_job_mgr_init = sha256_mb_mgr_init_avx2;
+	sha256_job_mgr_submit = sha256_mb_mgr_submit_avx2;
+	sha256_job_mgr_flush = sha256_mb_mgr_flush_avx2;
+	sha256_job_mgr_get_comp_job = sha256_mb_mgr_get_comp_job_avx2;
+
+	for_each_possible_cpu(cpu) {
+		cpu_state = per_cpu_ptr(sha256_mb_alg_state.alg_cstate, cpu);
+		cpu_state->next_flush = 0;
+		cpu_state->next_seq_num = 0;
+		cpu_state->flusher_engaged = false;
+		INIT_DELAYED_WORK(&cpu_state->flush, mcryptd_flusher);
+		cpu_state->cpu = cpu;
+		cpu_state->alg_state = &sha256_mb_alg_state;
+		cpu_state->mgr = kzalloc(sizeof(struct sha256_ctx_mgr),
+					GFP_KERNEL);
+		if (!cpu_state->mgr) {
+			err = -ENOMEM;
+			goto err2;
+		}
+		sha256_ctx_mgr_init(cpu_state->mgr);
+		INIT_LIST_HEAD(&cpu_state->work_list);
+		spin_lock_init(&cpu_state->work_lock);
+	}
+	sha256_mb_alg_state.flusher = &sha256_mb_flusher;
+
+	err = crypto_register_ahash(&sha256_mb_areq_alg);
+	if (err)
+		goto err2;
+	err = crypto_register_ahash(&sha256_mb_async_alg);
+	if (err)
+		goto err1;
+
+	return 0;
+err1:
+	crypto_unregister_ahash(&sha256_mb_areq_alg);
+err2:
+	/*
+	 * NOTE(review): cpus the setup loop never reached are assumed to
+	 * have mgr == NULL (alloc_percpu() zero-filling), making kfree() a
+	 * no-op for them -- confirm.
+	 */
+	for_each_possible_cpu(cpu) {
+		cpu_state = per_cpu_ptr(sha256_mb_alg_state.alg_cstate, cpu);
+		kfree(cpu_state->mgr);
+	}
+	free_percpu(sha256_mb_alg_state.alg_cstate);
+	/* Report the real cause instead of a blanket -ENODEV */
+	return err;
+}
+
+/* Module exit: unregister both algorithms and free the per-cpu state. */
+static void __exit sha256_mb_mod_fini(void)
+{
+	int cpu;
+	struct mcryptd_alg_cstate *cpu_state;
+
+	crypto_unregister_ahash(&sha256_mb_async_alg);
+	crypto_unregister_ahash(&sha256_mb_areq_alg);
+	for_each_possible_cpu(cpu) {
+		cpu_state = per_cpu_ptr(sha256_mb_alg_state.alg_cstate, cpu);
+		kfree(cpu_state->mgr);
+	}
+	free_percpu(sha256_mb_alg_state.alg_cstate);
+}
+
+module_init(sha256_mb_mod_init);
+module_exit(sha256_mb_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, multi buffer accelerated");
+
+MODULE_ALIAS_CRYPTO("sha256");
diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_ctx.h b/arch/x86/crypto/sha256-mb/sha256_mb_ctx.h
new file mode 100644
index 000000000000..edd252b73206
--- /dev/null
+++ b/arch/x86/crypto/sha256-mb/sha256_mb_ctx.h
@@ -0,0 +1,136 @@
+/*
+ * Header file for multi buffer SHA256 context
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Megha Dey <megha.dey@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _SHA_MB_CTX_INTERNAL_H +#define _SHA_MB_CTX_INTERNAL_H + +#include "sha256_mb_mgr.h" + +#define HASH_UPDATE 0x00 +#define HASH_FIRST 0x01 +#define HASH_LAST 0x02 +#define HASH_ENTIRE 0x03 +#define HASH_DONE 0x04 +#define HASH_FINAL 0x08 + +#define HASH_CTX_STS_IDLE 0x00 +#define HASH_CTX_STS_PROCESSING 0x01 +#define HASH_CTX_STS_LAST 0x02 +#define HASH_CTX_STS_COMPLETE 0x04 + +enum hash_ctx_error { + HASH_CTX_ERROR_NONE = 0, + HASH_CTX_ERROR_INVALID_FLAGS = -1, + HASH_CTX_ERROR_ALREADY_PROCESSING = -2, + HASH_CTX_ERROR_ALREADY_COMPLETED = -3, + +#ifdef HASH_CTX_DEBUG + HASH_CTX_ERROR_DEBUG_DIGEST_MISMATCH = -4, +#endif +}; + + +#define hash_ctx_user_data(ctx) ((ctx)->user_data) +#define hash_ctx_digest(ctx) ((ctx)->job.result_digest) +#define hash_ctx_processing(ctx) ((ctx)->status & HASH_CTX_STS_PROCESSING) +#define hash_ctx_complete(ctx) ((ctx)->status == HASH_CTX_STS_COMPLETE) +#define hash_ctx_status(ctx) ((ctx)->status) +#define hash_ctx_error(ctx) ((ctx)->error) +#define hash_ctx_init(ctx) \ + do { \ + (ctx)->error = HASH_CTX_ERROR_NONE; \ + (ctx)->status = HASH_CTX_STS_COMPLETE; \ + } while (0) + + +/* Hash Constants and Typedefs */ +#define SHA256_DIGEST_LENGTH 8 +#define SHA256_LOG2_BLOCK_SIZE 6 + +#define SHA256_PADLENGTHFIELD_SIZE 8 + +#ifdef SHA_MB_DEBUG +#define assert(expr) \ +do { \ + if (unlikely(!(expr))) { \ + printk(KERN_ERR "Assertion failed! 
%s,%s,%s,line=%d\n", \ + #expr, __FILE__, __func__, __LINE__); \ + } \ +} while (0) +#else +#define assert(expr) do {} while (0) +#endif + +struct sha256_ctx_mgr { + struct sha256_mb_mgr mgr; +}; + +/* typedef struct sha256_ctx_mgr sha256_ctx_mgr; */ + +struct sha256_hash_ctx { + /* Must be at struct offset 0 */ + struct job_sha256 job; + /* status flag */ + int status; + /* error flag */ + int error; + + uint32_t total_length; + const void *incoming_buffer; + uint32_t incoming_buffer_length; + uint8_t partial_block_buffer[SHA256_BLOCK_SIZE * 2]; + uint32_t partial_block_buffer_length; + void *user_data; +}; + +#endif diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_mgr.h b/arch/x86/crypto/sha256-mb/sha256_mb_mgr.h new file mode 100644 index 000000000000..b01ae408c56d --- /dev/null +++ b/arch/x86/crypto/sha256-mb/sha256_mb_mgr.h @@ -0,0 +1,108 @@ +/* + * Header file for multi buffer SHA256 algorithm manager + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2016 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Megha Dey <megha.dey@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2016 Intel Corporation. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+#ifndef __SHA_MB_MGR_H
+#define __SHA_MB_MGR_H
+
+#include <linux/types.h>
+
+#define NUM_SHA256_DIGEST_WORDS 8
+
+/*
+ * Job status.  Values 0..2 are mirrored as #defines in
+ * sha256_mb_mgr_datastruct.S for use by the assembly routines.
+ */
+enum job_sts {	STS_UNKNOWN = 0,
+		STS_BEING_PROCESSED = 1,
+		STS_COMPLETED = 2,
+		STS_INTERNAL_ERROR = 3,
+		STS_ERROR = 4
+};
+
+/*
+ * One hash job.  The field offsets are mirrored by the JOB_SHA256 FIELD
+ * table in sha256_mb_mgr_datastruct.S; keep both descriptions in sync.
+ */
+struct job_sha256 {
+	u8	*buffer;
+	u32	len;
+	u32	result_digest[NUM_SHA256_DIGEST_WORDS] __aligned(32);
+	enum	job_sts status;
+	void	*user_data;
+};
+
+/* SHA256 out-of-order scheduler */
+
+/* typedef uint32_t sha8_digest_array[8][8]; */
+
+struct sha256_args_x8 {
+	/* transposed: digest[word][lane], as indexed by the assembly */
+	uint32_t	digest[8][8];
+	uint8_t		*data_ptr[8];
+};
+
+struct sha256_lane_data {
+	struct job_sha256 *job_in_lane;
+};
+
+/*
+ * Scheduler state.  The field offsets are mirrored by the MB_MGR FIELD
+ * table in sha256_mb_mgr_datastruct.S.
+ */
+struct sha256_mb_mgr {
+	struct sha256_args_x8 args;
+
+	/* per lane: (job len << 4) | lane index; 0xFFFFFFFF when the lane is empty */
+	uint32_t lens[8];
+
+	/* each nibble is the index (0...7) of an unused lane */
+	uint64_t unused_lanes;
+	/* with all lanes free, bits 32..35 hold 0xF (init value 0xF76543210) */
+	struct sha256_lane_data ldata[8];
+};
+
+
+#define SHA256_MB_MGR_NUM_LANES_AVX2 8
+
+void sha256_mb_mgr_init_avx2(struct sha256_mb_mgr *state);
+struct job_sha256 *sha256_mb_mgr_submit_avx2(struct sha256_mb_mgr *state,
+					 struct job_sha256 *job);
+struct job_sha256 *sha256_mb_mgr_flush_avx2(struct sha256_mb_mgr *state);
+struct job_sha256 *sha256_mb_mgr_get_comp_job_avx2(struct sha256_mb_mgr *state);
+
+#endif
diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_datastruct.S b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_datastruct.S
new file mode 100644
index 000000000000..5c377bac21d0
--- /dev/null
+++ b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_datastruct.S
@@ -0,0 +1,304 @@
+/*
+ * Header file for multi buffer SHA256 algorithm data structure
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Megha Dey <megha.dey@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2016 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+# Macros for defining data structures
+
+# Usage example
+
+#START_FIELDS	# JOB_AES
+###	name		size	align
+#FIELD	_plaintext,	8,	8	# pointer to plaintext
+#FIELD	_ciphertext,	8,	8	# pointer to ciphertext
+#FIELD	_IV,		16,	8	# IV
+#FIELD	_keys,		8,	8	# pointer to keys
+#FIELD	_len,		4,	4	# length in bytes
+#FIELD	_status,	4,	4	# status enumeration
+#FIELD	_user_data,	8,	8	# pointer to user data
+#UNION  _union,         size1,  align1, \
+#			size2,  align2, \
+#			size3,  align3, \
+#			...
+#END_FIELDS
+#%assign _JOB_AES_size	_FIELD_OFFSET
+#%assign _JOB_AES_align	_STRUCT_ALIGN
+
+#########################################################################
+
+# Alternate "struc-like" syntax:
+#	STRUCT job_aes2
+#	RES_Q	.plaintext,	1
+#	RES_Q	.ciphertext,	1
+#	RES_DQ	.IV,		1
+#	RES_B	.nested,	_JOB_AES_SIZE, _JOB_AES_ALIGN
+#	RES_U	.union,		size1, align1, \
+#				size2, align2, \
+#				...
+#	ENDSTRUCT
+#	# Following only needed if nesting
+#	%assign job_aes2_size	_FIELD_OFFSET
+#	%assign job_aes2_align	_STRUCT_ALIGN
+#
+# RES_* macros take a name, a count and an optional alignment.
+# The count is in terms of the base size of the macro, and the
+# default alignment is the base size.
+# The macros are:
+#	Macro    Base size
+#	RES_B	    1
+#	RES_W	    2
+#	RES_D	    4
+#	RES_Q	    8
+#	RES_DQ	   16
+#	RES_Y	   32
+#	RES_Z	   64
+#
+# RES_U defines a union. Its arguments are a name and two or more
+# pairs of "size, alignment"
+#
+# The two assigns are only needed if this structure is being nested
+# within another. Even if the assigns are not done, one can still use
+# STRUCT_NAME_size as the size of the structure.
+#
+# Note that for nesting, you still need to assign to STRUCT_NAME_size.
+#
+# The differences between this and using "struc" directly are that each
+# type is implicitly aligned to its natural length (although this can be
+# over-ridden with an explicit third parameter), and that the structure
+# is padded at the end to its overall alignment.
+#
+
+#########################################################################
+
+#ifndef _DATASTRUCT_ASM_
+#define _DATASTRUCT_ASM_
+
+#define SZ8			8*SHA256_DIGEST_WORD_SIZE
+#define ROUNDS			64*SZ8
+#define PTR_SZ			8
+#define SHA256_DIGEST_WORD_SIZE	4
+#define MAX_SHA256_LANES	8
+#define SHA256_DIGEST_WORDS 8
+#define SHA256_DIGEST_ROW_SIZE  (MAX_SHA256_LANES * SHA256_DIGEST_WORD_SIZE)
+#define SHA256_DIGEST_SIZE	(SHA256_DIGEST_ROW_SIZE * SHA256_DIGEST_WORDS)
+#define SHA256_BLK_SZ		64
+
+# START_FIELDS
+# Resets the running offset and alignment before a new FIELD table.
+.macro START_FIELDS
+ _FIELD_OFFSET = 0
+ _STRUCT_ALIGN = 0
+.endm
+
+# FIELD name size align
+# Rounds the running offset up to align (assumed to be a power of two),
+# assigns that offset to the symbol name, then advances by size and tracks
+# the largest alignment seen so far.
+.macro FIELD name size align
+ _FIELD_OFFSET = (_FIELD_OFFSET + (\align) - 1) & (~ ((\align)-1))
+ \name	= _FIELD_OFFSET
+ _FIELD_OFFSET = _FIELD_OFFSET + (\size)
+.if (\align > _STRUCT_ALIGN)
+ _STRUCT_ALIGN = \align
+.endif
+.endm
+
+# END_FIELDS
+# Pads the running offset up to the structure alignment, so _FIELD_OFFSET
+# then holds the padded structure size.
+.macro END_FIELDS
+ _FIELD_OFFSET = (_FIELD_OFFSET + _STRUCT_ALIGN-1) & (~ (_STRUCT_ALIGN-1))
+.endm
+
+########################################################################
+
+# NOTE(review): STRUCT, ENDSTRUCT and the RES_* macros below are not
+# invoked by any code shown in this file. ENDSTRUCT contains %%tmp, which
+# looks like NASM syntax, and the RES_* wrappers pass _name/_size/_align
+# without a backslash. Confirm they assemble before relying on them.
+.macro STRUCT p1
+START_FIELDS
+.struc \p1
+.endm
+
+.macro ENDSTRUCT
+ tmp = _FIELD_OFFSET
+ END_FIELDS
+ tmp = (_FIELD_OFFSET - %%tmp)
+.if (tmp > 0)
+	.lcomm	tmp
+.endif
+.endstruc
+.endm
+
+## RES_int name size align
+.macro RES_int p1 p2 p3
+ name = \p1
+ size = \p2
+ align = .\p3
+
+ _FIELD_OFFSET = (_FIELD_OFFSET + (align) - 1) & (~ ((align)-1))
+.align align
+.lcomm name size
+ _FIELD_OFFSET = _FIELD_OFFSET + (size)
+.if (align > _STRUCT_ALIGN)
+ _STRUCT_ALIGN = align
+.endif
+.endm
+
+# macro RES_B name, size [, align]
+.macro RES_B _name, _size, _align=1
+RES_int _name _size _align
+.endm
+
+# macro RES_W name, size [, align]
+.macro RES_W _name, _size, _align=2
+RES_int _name 2*(_size) _align
+.endm
+
+# macro RES_D name, size [, align]
+.macro RES_D _name, _size, _align=4
+RES_int _name 4*(_size) _align
+.endm
+
+# macro RES_Q name, size [, align]
+.macro RES_Q _name, _size, _align=8
+RES_int _name 8*(_size) _align
+.endm
+
+# macro RES_DQ name, size [, align]
+.macro RES_DQ _name, _size, _align=16
+RES_int _name 16*(_size) _align
+.endm
+
+# macro RES_Y name, size [, align]
+.macro RES_Y _name, _size, _align=32
+RES_int _name 32*(_size) _align
+.endm
+
+# macro RES_Z name, size [, align]
+.macro RES_Z _name, _size, _align=64
+RES_int _name 64*(_size) _align
+.endm
+
+#endif
+
+
+########################################################################
+#### Define SHA256 Out Of Order Data Structures
+########################################################################
+
+START_FIELDS	# LANE_DATA
+###	name		size	align
+FIELD	_job_in_lane,	8,	8	# pointer to job object
+END_FIELDS
+
+ _LANE_DATA_size = _FIELD_OFFSET
+ _LANE_DATA_align = _STRUCT_ALIGN
+
+########################################################################
+
+START_FIELDS	# SHA256_ARGS_X4
+###	name		size	align
+FIELD	_digest,	4*8*8,	4	# transposed digest
+FIELD	_data_ptr,	8*8,	8	# array of pointers to data
+END_FIELDS
+
+# The same 8-lane layout is exported under both the X4 and X8 names.
+ _SHA256_ARGS_X4_size	=	_FIELD_OFFSET
+ _SHA256_ARGS_X4_align	=	_STRUCT_ALIGN
+ _SHA256_ARGS_X8_size	=	_FIELD_OFFSET
+ _SHA256_ARGS_X8_align	=	_STRUCT_ALIGN
+
+#######################################################################
+
+START_FIELDS	# MB_MGR
+###	name		size	align
+FIELD	_args,		_SHA256_ARGS_X4_size, _SHA256_ARGS_X4_align
+FIELD	_lens,		4*8,	8
+FIELD	_unused_lanes,	8,	8
+FIELD	_ldata,		_LANE_DATA_size*8, _LANE_DATA_align
+END_FIELDS
+
+ _MB_MGR_size	=	_FIELD_OFFSET
+ _MB_MGR_align	=	_STRUCT_ALIGN
+
+# Captured here because STACK_FRAME below assigns _digest a new value.
+_args_digest	=	_args + _digest
+_args_data_ptr	=	_args + _data_ptr
+
+#######################################################################
+
+START_FIELDS	#STACK_FRAME
+###	name		size	align
+FIELD	_data,		16*SZ8,	1	# 16 rows of SZ8 bytes (old note "transposed digest" looks copied from SHA256_ARGS - TODO confirm use)
+FIELD	_digest,	8*SZ8,	1	# 8 rows of SZ8 bytes (old note "array of pointers to data" looks copied - TODO confirm use)
+FIELD	_ytmp,		4*SZ8,	1
+FIELD	_rsp,		8,	1
+END_FIELDS
+
+ _STACK_FRAME_size	=	_FIELD_OFFSET
+ _STACK_FRAME_align	=	_STRUCT_ALIGN
+
+#######################################################################
+
+########################################################################
+#### Define constants
+########################################################################
+
+# These mirror the first three values of enum job_sts in sha256_mb_mgr.h.
+#define STS_UNKNOWN		0
+#define STS_BEING_PROCESSED	1
+#define STS_COMPLETED		2
+
+########################################################################
+#### Define JOB_SHA256 structure
+########################################################################
+
+START_FIELDS	# JOB_SHA256
+
+###	name				size	align
+FIELD	_buffer,			8,	8	# pointer to buffer
+FIELD	_len,				8,	8	# length (NOTE(review): u32 in the C struct and loaded with movl, offsets still agree since _result_digest is 32-byte aligned - unit unverified here)
+FIELD	_result_digest,		8*4,	32	# Digest (output)
+FIELD	_status,			4,	4
+FIELD	_user_data,			8,	8
+END_FIELDS
+
+ _JOB_SHA256_size = _FIELD_OFFSET
+ _JOB_SHA256_align = _STRUCT_ALIGN
diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S
new file mode 100644
index 000000000000..b691da981cd9
--- /dev/null
+++ b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S
@@ -0,0 +1,304 @@
+/*
+ * Flush routine for SHA256 multibuffer
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ *  Copyright(c) 2016 Intel Corporation.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of version 2 of the GNU General Public License as
+ *  published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  Contact Information:
+ *      Megha Dey <megha.dey@linux.intel.com>
+ *
+ *  BSD LICENSE
+ *
+ *  Copyright(c) 2016 Intel Corporation.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+#include <linux/linkage.h>
+#include <asm/frame.h>
+#include "sha256_mb_mgr_datastruct.S"
+
+.extern sha256_x8_avx2
+
+#LINUX register definitions
+#define arg1	%rdi
+#define arg2	%rsi
+
+# Common register definitions
+#define state	arg1
+#define job	arg2
+#define len2	arg2
+
+# idx must be a register not clobbered by sha256_x8_avx2
+#define idx		%r8
+#define DWORD_idx	%r8d
+
+#define unused_lanes	%rbx
+#define lane_data	%rbx
+#define tmp2		%rbx
+#define tmp2_w		%ebx
+
+#define job_rax		%rax
+#define tmp1		%rax
+#define size_offset	%rax
+#define tmp		%rax
+#define start_offset	%rax
+
+#define tmp3		arg1
+
+#define extra_blocks	arg2
+#define p		arg2
+
+.macro LABEL prefix n
+\prefix\n\():
+.endm
+
+.macro JNE_SKIP i
+jne     skip_\i
+.endm
+
+.altmacro
+.macro SET_OFFSET _offset
+offset = \_offset
+.endm
+.noaltmacro
+
+# JOB_SHA256* sha256_mb_mgr_flush_avx2(MB_MGR *state)
+# arg 1 : rdi : state
+# Forces progress when not every lane is occupied: empty lanes borrow the
+# data pointer of a busy lane and get length 0xFFFFFFFF, then the lane with
+# the minimum length is run to completion. Returns that job in rax with its
+# digest copied out, or NULL when all lanes are empty.
+ENTRY(sha256_mb_mgr_flush_avx2)
+	FRAME_BEGIN
+	push	%rbx
+
+	# If bit (32+3) is set, then all lanes are empty
+	mov	_unused_lanes(state), unused_lanes
+	bt	$32+3, unused_lanes
+	jc	return_null
+
+	# find a lane with a non-null job
+	xor	idx, idx
+	offset = (_ldata + 1 * _LANE_DATA_size + _job_in_lane)
+	cmpq	$0, offset(state)
+	cmovne	one(%rip), idx
+	offset = (_ldata + 2 * _LANE_DATA_size + _job_in_lane)
+	cmpq	$0, offset(state)
+	cmovne	two(%rip), idx
+	offset = (_ldata + 3 * _LANE_DATA_size + _job_in_lane)
+	cmpq	$0, offset(state)
+	cmovne	three(%rip), idx
+	offset = (_ldata + 4 * _LANE_DATA_size + _job_in_lane)
+	cmpq	$0, offset(state)
+	cmovne	four(%rip), idx
+	offset = (_ldata + 5 * _LANE_DATA_size + _job_in_lane)
+	cmpq	$0, offset(state)
+	cmovne	five(%rip), idx
+	offset = (_ldata + 6 * _LANE_DATA_size + _job_in_lane)
+	cmpq	$0, offset(state)
+	cmovne	six(%rip), idx
+	offset = (_ldata + 7 * _LANE_DATA_size + _job_in_lane)
+	cmpq	$0, offset(state)
+	cmovne	seven(%rip), idx
+
+	# copy the data pointer of lane idx into every empty lane and set
+	# the length of those lanes to 0xFFFFFFFF so they never win the min
+copy_lane_data:
+	offset =  (_args + _data_ptr)
+	mov	offset(state,idx,8), tmp
+
+	I = 0
+.rep 8
+	offset = (_ldata + I * _LANE_DATA_size + _job_in_lane)
+	cmpq	$0, offset(state)
+.altmacro
+	JNE_SKIP %I
+	offset =  (_args + _data_ptr + 8*I)
+	mov	tmp, offset(state)
+	offset =  (_lens + 4*I)
+	movl	$0xFFFFFFFF, offset(state)
+LABEL skip_ %I
+	I = (I+1)
+.noaltmacro
+.endr
+
+	# Find min length
+	vmovdqa	_lens+0*16(state), %xmm0
+	vmovdqa	_lens+1*16(state), %xmm1
+
+	vpminud	%xmm1, %xmm0, %xmm2		# xmm2 has {D,C,B,A}
+	vpalignr	$8, %xmm2, %xmm3, %xmm3	# xmm3 has {x,x,D,C}
+	vpminud	%xmm3, %xmm2, %xmm2		# xmm2 has {x,x,E,F}
+	vpalignr	$4, %xmm2, %xmm3, %xmm3	# xmm3 has {x,x,x,E}
+	vpminud	%xmm3, %xmm2, %xmm2		# xmm2 has min val in low dword
+
+	# low nibble of the minimum is the lane index, the rest is the length
+	vmovd	%xmm2, DWORD_idx
+	mov	idx, len2
+	and	$0xF, idx
+	shr	$4, len2
+	jz	len_is_0
+
+	vpand	clear_low_nibble(%rip), %xmm2, %xmm2
+	vpshufd	$0, %xmm2, %xmm2
+
+	vpsubd	%xmm2, %xmm0, %xmm0
+	vpsubd	%xmm2, %xmm1, %xmm1
+
+	vmovdqa	%xmm0, _lens+0*16(state)
+	vmovdqa	%xmm1, _lens+1*16(state)
+
+	# "state" and "args" are the same address, arg1
+	# len is arg2
+	# NOTE(review): the header of sha256_x8_avx2.S lists r12-r15 as
+	# clobbered and only rbx is saved here - confirm the callee saves them
+	call	sha256_x8_avx2
+	# state and idx are intact
+
+len_is_0:
+	# process completed job "idx"
+	imul	$_LANE_DATA_size, idx, lane_data
+	lea	_ldata(state, lane_data), lane_data
+
+	mov	_job_in_lane(lane_data), job_rax
+	movq	$0, _job_in_lane(lane_data)
+	movl	$STS_COMPLETED, _status(job_rax)
+	# push lane idx back onto the free-lane nibble stack
+	mov	_unused_lanes(state), unused_lanes
+	shl	$4, unused_lanes
+	or	idx, unused_lanes
+
+	mov	unused_lanes, _unused_lanes(state)
+	movl	$0xFFFFFFFF, _lens(state,idx,4)
+
+	# gather the 8 digest words of lane idx from the transposed array
+	vmovd	_args_digest(state , idx, 4) , %xmm0
+	vpinsrd	$1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
+	vpinsrd	$2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
+	vpinsrd	$3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
+	vmovd	_args_digest+4*32(state, idx, 4), %xmm1
+	vpinsrd	$1, _args_digest+5*32(state, idx, 4), %xmm1, %xmm1
+	vpinsrd	$2, _args_digest+6*32(state, idx, 4), %xmm1, %xmm1
+	vpinsrd	$3, _args_digest+7*32(state, idx, 4), %xmm1, %xmm1
+
+	vmovdqu	%xmm0, _result_digest(job_rax)
+	offset =  (_result_digest + 1*16)
+	vmovdqu	%xmm1, offset(job_rax)
+
+return:
+	pop	%rbx
+	FRAME_END
+	ret
+
+return_null:
+	xor	job_rax, job_rax
+	jmp	return
+ENDPROC(sha256_mb_mgr_flush_avx2)
+
+##############################################################################
+
+# JOB_SHA256* sha256_mb_mgr_get_comp_job_avx2(MB_MGR *state)
+# arg 1 : rdi : state
+# Returns in rax a job whose lane length has already reached zero, after
+# freeing the lane and copying all 8 digest words into the job. Returns
+# NULL when every lane is empty or no lane has finished. Does not call
+# sha256_x8_avx2.
+.align 16
+ENTRY(sha256_mb_mgr_get_comp_job_avx2)
+	push	%rbx
+
+	## if bit 32+3 is set, then all lanes are empty
+	mov	_unused_lanes(state), unused_lanes
+	bt	$(32+3), unused_lanes
+	jc	.return_null
+
+	# Find min length
+	vmovdqa	_lens(state), %xmm0
+	vmovdqa	_lens+1*16(state), %xmm1
+
+	vpminud	%xmm1, %xmm0, %xmm2		# xmm2 has {D,C,B,A}
+	vpalignr	$8, %xmm2, %xmm3, %xmm3	# xmm3 has {x,x,D,C}
+	vpminud	%xmm3, %xmm2, %xmm2		# xmm2 has {x,x,E,F}
+	vpalignr	$4, %xmm2, %xmm3, %xmm3	# xmm3 has {x,x,x,E}
+	vpminud	%xmm3, %xmm2, %xmm2		# xmm2 has min val in low dword
+
+	# any bit above the lane nibble means the minimum length is non-zero
+	vmovd	%xmm2, DWORD_idx
+	test	$~0xF, idx
+	jnz	.return_null
+
+	# process completed job "idx"
+	imul	$_LANE_DATA_size, idx, lane_data
+	lea	_ldata(state, lane_data), lane_data
+
+	mov	_job_in_lane(lane_data), job_rax
+	movq	$0, _job_in_lane(lane_data)
+	movl	$STS_COMPLETED, _status(job_rax)
+	mov	_unused_lanes(state), unused_lanes
+	shl	$4, unused_lanes
+	or	idx, unused_lanes
+	mov	unused_lanes, _unused_lanes(state)
+
+	movl	$0xFFFFFFFF, _lens(state,  idx, 4)
+
+	# Words 4..7 must start from a vmovd into xmm1 and be stored as a
+	# full 16 bytes, exactly like the flush path above. The earlier
+	# 32-bit register copy wrote back only word 4 and lost words 5..7.
+	vmovd	_args_digest(state, idx, 4), %xmm0
+	vpinsrd	$1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
+	vpinsrd	$2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
+	vpinsrd	$3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
+	vmovd	_args_digest+4*32(state, idx, 4), %xmm1
+	vpinsrd	$1, _args_digest+5*32(state, idx, 4), %xmm1, %xmm1
+	vpinsrd	$2, _args_digest+6*32(state, idx, 4), %xmm1, %xmm1
+	vpinsrd	$3, _args_digest+7*32(state, idx, 4), %xmm1, %xmm1
+
+	vmovdqu	%xmm0, _result_digest(job_rax)
+	vmovdqu	%xmm1, _result_digest+1*16(job_rax)
+
+	pop	%rbx
+
+	ret
+
+.return_null:
+	xor	job_rax, job_rax
+	pop	%rbx
+	ret
+ENDPROC(sha256_mb_mgr_get_comp_job_avx2)
+
+.data
+
+# clear_low_nibble masks the lane index out of the packed minimum value.
+# one..seven are 64-bit lane numbers used as memory operands of cmovne.
+.align 16
+clear_low_nibble:
+.octa	0x000000000000000000000000FFFFFFF0
+one:
+.quad	1
+two:
+.quad	2
+three:
+.quad	3
+four:
+.quad	4
+five:
+.quad	5
+six:
+.quad	6
+seven:
+.quad	7
diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_init_avx2.c b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_init_avx2.c
new file mode 100644
index 000000000000..b0c498371e67
--- /dev/null
+++ b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_init_avx2.c
@@ -0,0 +1,65 @@
+/*
+ * Initialization code for multi buffer SHA256 algorithm for AVX2
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ *  Copyright(c) 2016 Intel Corporation.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of version 2 of the GNU General Public License as
+ *  published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  Contact Information:
+ *	Megha Dey <megha.dey@linux.intel.com>
+ *
+ *  BSD LICENSE
+ *
+ *  Copyright(c) 2016 Intel Corporation.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in
+ *      the documentation and/or other materials provided with the
+ *      distribution.
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "sha256_mb_mgr.h" + +void sha256_mb_mgr_init_avx2(struct sha256_mb_mgr *state) +{ + unsigned int j; + + state->unused_lanes = 0xF76543210ULL; + for (j = 0; j < 8; j++) { + state->lens[j] = 0xFFFFFFFF; + state->ldata[j].job_in_lane = NULL; + } +} diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_submit_avx2.S b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_submit_avx2.S new file mode 100644 index 000000000000..7ea670e25acc --- /dev/null +++ b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_submit_avx2.S @@ -0,0 +1,215 @@ +/* + * Buffer submit code for multi buffer SHA256 algorithm + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2016 Intel Corporation. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Megha Dey <megha.dey@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2016 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+ *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+#include "sha256_mb_mgr_datastruct.S"
+
+.extern sha256_x8_avx2
+
+# LINUX register definitions
+arg1		= %rdi
+arg2		= %rsi
+size_offset	= %rcx
+tmp2		= %rcx
+extra_blocks	= %rdx
+
+# Common definitions
+#define state	arg1
+#define job	%rsi
+#define len2	arg2
+#define p2	arg2
+
+# idx must be a register not clobbered by sha256_x8_avx2
+idx		= %r8
+DWORD_idx	= %r8d
+last_len	= %r8
+
+p		= %r11
+start_offset	= %r11
+
+unused_lanes	= %rbx
+BYTE_unused_lanes = %bl
+
+job_rax		= %rax
+len		= %rax
+DWORD_len	= %eax
+
+lane		= %r12
+tmp3		= %r12
+
+tmp		= %r9
+DWORD_tmp	= %r9d
+
+lane_data	= %r10
+
+# JOB* sha256_mb_mgr_submit_avx2(MB_MGR *state, JOB_SHA256 *job)
+# arg 1 : rdi : state
+# arg 2 : rsi : job
+# Places the job in a free lane whose id is popped from the low nibble of
+# unused_lanes and seeds the lane digest from the job result_digest field.
+# Returns NULL while free lanes remain. Once every lane is busy it runs
+# sha256_x8_avx2 for the minimum lane length and returns the job that
+# finished, with its digest copied back.
+ENTRY(sha256_mb_mgr_submit_avx2)
+	FRAME_BEGIN
+	push	%rbx
+	push	%r12
+
+	mov	_unused_lanes(state), unused_lanes
+	mov	unused_lanes, lane
+	and	$0xF, lane
+	shr	$4, unused_lanes
+	imul	$_LANE_DATA_size, lane, lane_data
+	movl	$STS_BEING_PROCESSED, _status(job)
+	lea	_ldata(state, lane_data), lane_data
+	mov	unused_lanes, _unused_lanes(state)
+	movl	_len(job),  DWORD_len
+
+	# lens entry is (len << 4) | lane so the minimum search also yields the lane
+	mov	job, _job_in_lane(lane_data)
+	shl	$4, len
+	or	lane, len
+
+	movl	DWORD_len,  _lens(state , lane, 4)
+
+	# Load digest words from result_digest
+	vmovdqu	_result_digest(job), %xmm0
+	vmovdqu	_result_digest+1*16(job), %xmm1
+	vmovd	%xmm0, _args_digest(state, lane, 4)
+	vpextrd	$1, %xmm0, _args_digest+1*32(state , lane, 4)
+	vpextrd	$2, %xmm0, _args_digest+2*32(state , lane, 4)
+	vpextrd	$3, %xmm0, _args_digest+3*32(state , lane, 4)
+	vmovd	%xmm1, _args_digest+4*32(state , lane, 4)
+
+	vpextrd	$1, %xmm1, _args_digest+5*32(state , lane, 4)
+	vpextrd	$2, %xmm1, _args_digest+6*32(state , lane, 4)
+	vpextrd	$3, %xmm1, _args_digest+7*32(state , lane, 4)
+
+	mov	_buffer(job), p
+	mov	p, _args_data_ptr(state, lane, 8)
+
+	# only the 0xF flag nibble is left when no free lane remains
+	cmp	$0xF, unused_lanes
+	jne	return_null
+
+start_loop:
+	# Find min length
+	vmovdqa	_lens(state), %xmm0
+	vmovdqa	_lens+1*16(state), %xmm1
+
+	vpminud	%xmm1, %xmm0, %xmm2		# xmm2 has {D,C,B,A}
+	vpalignr	$8, %xmm2, %xmm3, %xmm3	# xmm3 has {x,x,D,C}
+	vpminud	%xmm3, %xmm2, %xmm2		# xmm2 has {x,x,E,F}
+	vpalignr	$4, %xmm2, %xmm3, %xmm3	# xmm3 has {x,x,x,E}
+	vpminud	%xmm3, %xmm2, %xmm2		# xmm2 has min val in low dword
+
+	vmovd	%xmm2, DWORD_idx
+	mov	idx, len2
+	and	$0xF, idx
+	shr	$4, len2
+	jz	len_is_0
+
+	vpand	clear_low_nibble(%rip), %xmm2, %xmm2
+	vpshufd	$0, %xmm2, %xmm2
+
+	vpsubd	%xmm2, %xmm0, %xmm0
+	vpsubd	%xmm2, %xmm1, %xmm1
+
+	vmovdqa	%xmm0, _lens + 0*16(state)
+	vmovdqa	%xmm1, _lens + 1*16(state)
+
+	# "state" and "args" are the same address, arg1
+	# len is arg2
+	# NOTE(review): the header of sha256_x8_avx2.S lists r12-r15 as
+	# clobbered and only rbx and r12 are saved here - confirm the callee
+	# saves r13-r15 itself
+	call	sha256_x8_avx2
+
+	# state and idx are intact
+
+len_is_0:
+	# process completed job "idx"
+	imul	$_LANE_DATA_size, idx, lane_data
+	lea	_ldata(state, lane_data), lane_data
+
+	mov	_job_in_lane(lane_data), job_rax
+	mov	_unused_lanes(state), unused_lanes
+	movq	$0, _job_in_lane(lane_data)
+	movl	$STS_COMPLETED, _status(job_rax)
+	shl	$4, unused_lanes
+	or	idx, unused_lanes
+	mov	unused_lanes, _unused_lanes(state)
+
+	movl	$0xFFFFFFFF, _lens(state,idx,4)
+
+	# gather the 8 digest words of lane idx from the transposed array
+	vmovd	_args_digest(state, idx, 4), %xmm0
+	vpinsrd	$1, _args_digest+1*32(state , idx, 4), %xmm0, %xmm0
+	vpinsrd	$2, _args_digest+2*32(state , idx, 4), %xmm0, %xmm0
+	vpinsrd	$3, _args_digest+3*32(state , idx, 4), %xmm0, %xmm0
+	vmovd	_args_digest+4*32(state, idx, 4), %xmm1
+
+	vpinsrd	$1, _args_digest+5*32(state , idx, 4), %xmm1, %xmm1
+	vpinsrd	$2, _args_digest+6*32(state , idx, 4), %xmm1, %xmm1
+	vpinsrd	$3, _args_digest+7*32(state , idx, 4), %xmm1, %xmm1
+
+	vmovdqu	%xmm0, _result_digest(job_rax)
+	vmovdqu	%xmm1, _result_digest+1*16(job_rax)
+
+return:
+	pop	%r12
+	pop	%rbx
+	FRAME_END
+	ret
+
+return_null:
+	xor	job_rax, job_rax
+	jmp	return
+
+ENDPROC(sha256_mb_mgr_submit_avx2)
+
+.data
+
+.align 16
+clear_low_nibble:
+	.octa	0x000000000000000000000000FFFFFFF0
diff --git a/arch/x86/crypto/sha256-mb/sha256_x8_avx2.S b/arch/x86/crypto/sha256-mb/sha256_x8_avx2.S
new file mode 100644
index 000000000000..aa21aea4c722
--- /dev/null
+++ b/arch/x86/crypto/sha256-mb/sha256_x8_avx2.S
@@ -0,0 +1,593 @@
+/*
+ * Multi-buffer SHA256 algorithm hash compute routine
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ *  Copyright(c) 2016 Intel Corporation.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of version 2 of the GNU General Public License as
+ *  published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  Contact Information:
+ *	Megha Dey <megha.dey@linux.intel.com>
+ *
+ *  BSD LICENSE
+ *
+ *  Copyright(c) 2016 Intel Corporation.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 */

#include <linux/linkage.h>
#include "sha256_mb_mgr_datastruct.S"

## code to compute oct (eight-lane) SHA256 using 256-bit ymm registers
## outer calling routine takes care of save and restore of XMM registers
## Logic designed/laid out by JDG

## Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; %ymm0-15
## Linux clobbers: rax rbx rcx rdx rsi r9 r10 r11 r12 r13 r14 r15
## Linux preserves: rdi rbp r8
##
## clobbers %ymm0-15

# Symbolic names for the two incoming arguments and two scratch registers
arg1 = %rdi
arg2 = %rsi
reg3 = %rcx
reg4 = %rdx

# Common definitions
STATE = arg1
INP_SIZE = arg2

# IDX: byte offset into every input buffer; ROUND: byte offset into TBL;
# TBL: base address of the round-constant table
IDX = %rax
ROUND = %rbx
TBL = reg3

# One input data pointer per lane
inp0 = %r9
inp1 = %r10
inp2 = %r11
inp3 = %r12
inp4 = %r13
inp5 = %r14
inp6 = %r15
inp7 = reg4

# SHA256 working variables. These are assembler symbols, not fixed registers:
# ROTATE_ARGS re-binds them after every round.
a = %ymm0
b = %ymm1
c = %ymm2
d = %ymm3
e = %ymm4
f = %ymm5
g = %ymm6
h = %ymm7

# Message-word register handed to ROUND_16_XX (same register as TT0)
T1 = %ymm8

# Round scratch registers. a0/a1/a2/TMP are the same registers as TT4..TT7,
# and TMP0/TMP1 are the registers initially bound to g and h.
a0 = %ymm12
a1 = %ymm13
a2 = %ymm14
TMP = %ymm15
TMP0 = %ymm6
TMP1 = %ymm7

# Rows of message data, one per input buffer before TRANSPOSE8
TT0 = %ymm8
TT1 = %ymm9
TT2 = %ymm10
TT3 = %ymm11
TT4 = %ymm12
TT5 = %ymm13
TT6 = %ymm14
TT7 = %ymm15

# Define stack usage

# Assume stack aligned to 32 bytes before call
# Therefore FRAMESZ mod 32 must be 32-8 = 24
# NOTE(review): the value below, 0x388, leaves a remainder of 8 mod 32, not 24.
# The entry code of sha256_x8_avx2 masks rsp down to a 32-byte boundary after
# subtracting it, so the remainder may not matter in practice - confirm.

#define FRAMESZ 0x388

# Unaligned vector load used when fetching message data
#define VMOVPS vmovups

# TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
# "transpose" data in {r0...r7} using temps {t0...t1}
# Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
# r0 = {a7 a6 a5 a4 a3 a2 a1 a0}
# r1 = {b7 b6 b5 b4 b3 b2 b1 b0}
# r2 = {c7 c6 c5 c4 c3 c2 c1 c0}
# r3 = {d7 d6 d5 d4 d3 d2 d1 d0}
# r4 = {e7 e6 e5 e4 e3 e2 e1 e0}
# r5 = {f7 f6 f5 f4 f3 f2 f1 f0}
# r6 = {g7 g6 g5 g4 g3 g2 g1 g0}
# r7 = {h7 h6 h5 h4 h3 h2 h1 h0}
#
# Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
# r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
# r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
# r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
# r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
# r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
# r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
# r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
# r7 = {h7 g7 f7 e7 d7 c7 b7 a7}
#
# Both temporaries are overwritten. The first two stages shuffle 32-bit
# elements inside each 128-bit half; the final stage recombines the halves.

.macro TRANSPOSE8 r0 r1 r2 r3 r4 r5 r6 r7 t0 t1
	# process top half (r0..r3) {a...d}
	vshufps $0x44, \r1, \r0, \t0 # t0 = {b5 b4 a5 a4 b1 b0 a1 a0}
	vshufps $0xEE, \r1, \r0, \r0 # r0 = {b7 b6 a7 a6 b3 b2 a3 a2}
	vshufps $0x44, \r3, \r2, \t1 # t1 = {d5 d4 c5 c4 d1 d0 c1 c0}
	vshufps $0xEE, \r3, \r2, \r2 # r2 = {d7 d6 c7 c6 d3 d2 c3 c2}
	vshufps $0xDD, \t1, \t0, \r3 # r3 = {d5 c5 b5 a5 d1 c1 b1 a1}
	vshufps $0x88, \r2, \r0, \r1 # r1 = {d6 c6 b6 a6 d2 c2 b2 a2}
	vshufps $0xDD, \r2, \r0, \r0 # r0 = {d7 c7 b7 a7 d3 c3 b3 a3}
	vshufps $0x88, \t1, \t0, \t0 # t0 = {d4 c4 b4 a4 d0 c0 b0 a0}

	# use r2 in place of t0
	# process bottom half (r4..r7) {e...h}
	vshufps $0x44, \r5, \r4, \r2 # r2 = {f5 f4 e5 e4 f1 f0 e1 e0}
	vshufps $0xEE, \r5, \r4, \r4 # r4 = {f7 f6 e7 e6 f3 f2 e3 e2}
	vshufps $0x44, \r7, \r6, \t1 # t1 = {h5 h4 g5 g4 h1 h0 g1 g0}
	vshufps $0xEE, \r7, \r6, \r6 # r6 = {h7 h6 g7 g6 h3 h2 g3 g2}
	vshufps $0xDD, \t1, \r2, \r7 # r7 = {h5 g5 f5 e5 h1 g1 f1 e1}
	vshufps $0x88, \r6, \r4, \r5 # r5 = {h6 g6 f6 e6 h2 g2 f2 e2}
	vshufps $0xDD, \r6, \r4, \r4 # r4 = {h7 g7 f7 e7 h3 g3 f3 e3}
	vshufps $0x88, \t1, \r2, \t1 # t1 = {h4 g4 f4 e4 h0 g0 f0 e0}

	# merge the 128-bit halves of the top and bottom results
	vperm2f128 $0x13, \r1, \r5, \r6 # h6...a6
	vperm2f128 $0x02, \r1, \r5, \r2 # h2...a2
	vperm2f128 $0x13, \r3, \r7, \r5 # h5...a5
	vperm2f128 $0x02, \r3, \r7, \r1 # h1...a1
	vperm2f128 $0x13, \r0, \r4, \r7 # h7...a7
	vperm2f128 $0x02, \r0, \r4, \r3 # h3...a3
	vperm2f128 $0x13, \t0, \t1, \r4 # h4...a4
	vperm2f128 $0x02, \t0, \t1, \r0 # h0...a0

.endm

# ROTATE_ARGS: re-bind the symbols a..h instead of moving data. Afterwards
# a names the register that was h, b the one that was a, and so on up to
# h naming the one that was g.
.macro ROTATE_ARGS
TMP_ = h
h = g
g = f
f = e
e = d
d = c
c = b
b = a
a = TMP_
.endm

# _PRORD reg, imm, tmp
# Rotate every 32-bit element of reg right by imm bits, in place.
# tmp is overwritten.
.macro _PRORD reg imm tmp
	vpslld $(32-\imm),\reg,\tmp
	vpsrld $\imm,\reg, \reg
	vpor \tmp,\reg, \reg
.endm

# PRORD_nd reg, imm, tmp, src
# Non-destructive form: reg = src rotated right by imm bits; src is not
# written. tmp is overwritten.
.macro _PRORD_nd reg imm tmp src
	vpslld $(32-\imm), \src, \tmp
	vpsrld $\imm, \src, \reg
	vpor \tmp, \reg, \reg
.endm

# PRORD dst/src, amt
# In-place rotate that uses TMP as the scratch register.
.macro PRORD reg imm
	_PRORD \reg,\imm,TMP
.endm

# PRORD_nd dst, src, amt
# Non-destructive rotate that uses TMP as the scratch register. The second
# macro parameter is named tmp but carries the source register.
.macro PRORD_nd reg tmp imm
	_PRORD_nd \reg, \imm, TMP, \tmp
.endm

# ROUND_00_15 _T1 i
# One SHA256 round for all lanes at once.
#   _T1 : register holding the message word W for this round; it is also
#         stored into the 16-entry schedule ring on the stack at slot (i & 0xf)
#   i   : round index
# The working variables are passed implicitly through the symbols a...h,
# which ROTATE_ARGS re-binds at the end of the round.
# In the per-line comments below, >> denotes the rotate right performed by
# PRORD/PRORD_nd, not a logical shift.
.macro ROUND_00_15 _T1 i
	PRORD_nd a0,e,5 # sig1: a0 = (e >> 5)

	vpxor g, f, a2 # ch: a2 = f^g
	vpand e,a2, a2 # ch: a2 = (f^g)&e
	vpxor g, a2, a2 # a2 = ch

	PRORD_nd a1,e,25 # sig1: a1 = (e >> 25)

	vmovdqu \_T1,(SZ8*(\i & 0xf))(%rsp)
	vpaddd (TBL,ROUND,1), \_T1, \_T1 # T1 = W + K
	vpxor e,a0, a0 # sig1: a0 = e ^ (e >> 5)
	PRORD a0, 6 # sig1: a0 = (e >> 6) ^ (e >> 11)
	vpaddd a2, h, h # h = h + ch
	PRORD_nd a2,a,11 # sig0: a2 = (a >> 11)
	vpaddd \_T1,h, h # h = h + ch + W + K
	vpxor a1, a0, a0 # a0 = sigma1
	PRORD_nd a1,a,22 # sig0: a1 = (a >> 22)
	vpxor c, a, \_T1 # maj: T1 = a^c
	add $SZ8, ROUND # ROUND++ (advances the TBL byte offset by SZ8)
	vpand b, \_T1, \_T1 # maj: T1 = (a^c)&b
	vpaddd a0, h, h # h = h + ch + W + K + sigma1
	vpaddd h, d, d # d = d + h, taken before maj and sigma0 are added
	vpxor a, a2, a2 # sig0: a2 = a ^ (a >> 11)
	PRORD a2,2 # sig0: a2 = (a >> 2) ^ (a >> 13)
	vpxor a1, a2, a2 # a2 = sig0
	vpand c, a, a1 # maj: a1 = a&c
	vpor \_T1, a1, a1 # a1 = maj
	vpaddd a1, h, h # h = h + ch + W + K + maj
	vpaddd a2, h, h # h = h + ch + W + K + maj + sigma0
	ROTATE_ARGS
.endm

# ROUND_16_XX _T1 i
# Message-schedule expansion followed by one round. Builds W[i] from the ring
# of the previous 16 words kept on the stack (slots are taken mod 16), then
# hands it to ROUND_00_15, which stores it back into slot (i & 0xf).
# The working variables are passed implicitly through the symbols a...h.
.macro ROUND_16_XX _T1 i
	vmovdqu (SZ8*((\i-15)&0xf))(%rsp), \_T1 # T1 = W[i-15]
	vmovdqu (SZ8*((\i-2)&0xf))(%rsp), a1 # a1 = W[i-2]
	vmovdqu \_T1, a0 # a0 = copy of W[i-15]
	PRORD \_T1,11
	vmovdqu a1, a2 # a2 = copy of W[i-2]
	PRORD a1,2
	vpxor a0, \_T1, \_T1
	PRORD \_T1, 7 # T1 = ror(W[i-15],7) ^ ror(W[i-15],18)
	vpxor a2, a1, a1
	PRORD a1, 17 # a1 = ror(W[i-2],17) ^ ror(W[i-2],19)
	vpsrld $3, a0, a0 # logical shift, not a rotate
	vpxor a0, \_T1, \_T1 # T1 = small sigma0 of W[i-15]
	vpsrld $10, a2, a2 # logical shift, not a rotate
	vpxor a2, a1, a1 # a1 = small sigma1 of W[i-2]
	vpaddd (SZ8*((\i-16)&0xf))(%rsp), \_T1, \_T1 # T1 += W[i-16]
	vpaddd (SZ8*((\i-7)&0xf))(%rsp), a1, a1 # a1 += W[i-7]
	vpaddd a1, \_T1, \_T1 # T1 = W[i]

	ROUND_00_15 \_T1,\i
.endm

# SHA256_ARGS, as accessed by sha256_x8_avx2:
#	eight digest rows at SHA256_DIGEST_ROW_SIZE stride, each moved as one
#	256-bit ymm register	// transposed digests
#	UINT8 *data_ptr[8];	// eight pointers are loaded and written back

# void sha256_x8_avx2(SHA256_ARGS *args, UINT64 num_blocks);
# arg 1 : STATE : pointer to the args structure (digest rows + data pointers)
# arg 2 : INP_SIZE : size of input in blocks (64 bytes are consumed per block)
	# general registers preserved in outer calling routine
	# outer calling routine saves all the XMM registers
	# save
# rsp, allocate 32-byte aligned for local variables
ENTRY(sha256_x8_avx2)

	# save callee-saved clobbered registers to comply with C function ABI
	# NOTE(review): ROUND is bound to rbx, which is written below but is not
	# pushed here; per the comment ahead of this routine the outer calling
	# routine is expected to preserve it - confirm for every caller.
	push %r12
	push %r13
	push %r14
	push %r15

	# Keep the incoming rsp in IDX, reserve FRAMESZ bytes, round rsp down
	# to a 32-byte boundary and store the old rsp inside the new frame.
	mov %rsp, IDX
	sub $FRAMESZ, %rsp
	and $~0x1F, %rsp
	mov IDX, _rsp(%rsp)

	# Load the pre-transposed incoming digest.
	vmovdqu 0*SHA256_DIGEST_ROW_SIZE(STATE),a
	vmovdqu 1*SHA256_DIGEST_ROW_SIZE(STATE),b
	vmovdqu 2*SHA256_DIGEST_ROW_SIZE(STATE),c
	vmovdqu 3*SHA256_DIGEST_ROW_SIZE(STATE),d
	vmovdqu 4*SHA256_DIGEST_ROW_SIZE(STATE),e
	vmovdqu 5*SHA256_DIGEST_ROW_SIZE(STATE),f
	vmovdqu 6*SHA256_DIGEST_ROW_SIZE(STATE),g
	vmovdqu 7*SHA256_DIGEST_ROW_SIZE(STATE),h

	lea K256_8(%rip),TBL

	# load the address of each of the 8 message lanes
	# getting ready to transpose input onto stack
	mov _args_data_ptr+0*PTR_SZ(STATE),inp0
	mov _args_data_ptr+1*PTR_SZ(STATE),inp1
	mov _args_data_ptr+2*PTR_SZ(STATE),inp2
	mov _args_data_ptr+3*PTR_SZ(STATE),inp3
	mov _args_data_ptr+4*PTR_SZ(STATE),inp4
	mov _args_data_ptr+5*PTR_SZ(STATE),inp5
	mov _args_data_ptr+6*PTR_SZ(STATE),inp6
	mov _args_data_ptr+7*PTR_SZ(STATE),inp7

	# IDX now counts bytes consumed from every lane
	xor IDX, IDX
lloop:
	# One iteration per 64-byte block. ROUND restarts at the first table
	# entry. NOTE(review): the body runs before INP_SIZE is tested, so a
	# block count of zero is not handled here - confirm callers never pass it.
	xor ROUND, ROUND

	# save old digest
	vmovdqu a, _digest(%rsp)
	vmovdqu b, _digest+1*SZ8(%rsp)
	vmovdqu c, _digest+2*SZ8(%rsp)
	vmovdqu d, _digest+3*SZ8(%rsp)
	vmovdqu e, _digest+4*SZ8(%rsp)
	vmovdqu f, _digest+5*SZ8(%rsp)
	vmovdqu g, _digest+6*SZ8(%rsp)
	vmovdqu h, _digest+7*SZ8(%rsp)
	i = 0
	# Rounds 0..15: each of the two passes loads 32 bytes from every lane,
	# transposes them into eight message words and runs eight rounds.
.rep 2
	VMOVPS i*32(inp0, IDX), TT0
	VMOVPS i*32(inp1, IDX), TT1
	VMOVPS i*32(inp2, IDX), TT2
	VMOVPS i*32(inp3, IDX), TT3
	VMOVPS i*32(inp4, IDX), TT4
	VMOVPS i*32(inp5, IDX), TT5
	VMOVPS i*32(inp6, IDX), TT6
	VMOVPS i*32(inp7, IDX), TT7
	# TMP0/TMP1 are the registers currently holding g and h, so park g and
	# h on the stack while they serve as transpose and mask temporaries.
	vmovdqu g, _ytmp(%rsp)
	vmovdqu h, _ytmp+1*SZ8(%rsp)
	TRANSPOSE8 TT0, TT1, TT2, TT3, TT4, TT5, TT6, TT7, TMP0, TMP1
	vmovdqu PSHUFFLE_BYTE_FLIP_MASK(%rip), TMP1
	vmovdqu _ytmp(%rsp), g
	# reverse the byte order inside every 32-bit word
	vpshufb TMP1, TT0, TT0
	vpshufb TMP1, TT1, TT1
	vpshufb TMP1, TT2, TT2
	vpshufb TMP1, TT3, TT3
	vpshufb TMP1, TT4, TT4
	vpshufb TMP1, TT5, TT5
	vpshufb TMP1, TT6, TT6
	vpshufb TMP1, TT7, TT7
	vmovdqu _ytmp+1*SZ8(%rsp), h
	# TT4..TT7 occupy the registers the round macro uses as a0/a1/a2/TMP,
	# so spill them and reload each into TT0..TT3 once that word is used.
	vmovdqu TT4, _ytmp(%rsp)
	vmovdqu TT5, _ytmp+1*SZ8(%rsp)
	vmovdqu TT6, _ytmp+2*SZ8(%rsp)
	vmovdqu TT7, _ytmp+3*SZ8(%rsp)
	ROUND_00_15 TT0,(i*8+0)
	vmovdqu _ytmp(%rsp), TT0
	ROUND_00_15 TT1,(i*8+1)
	vmovdqu _ytmp+1*SZ8(%rsp), TT1
	ROUND_00_15 TT2,(i*8+2)
	vmovdqu _ytmp+2*SZ8(%rsp), TT2
	ROUND_00_15 TT3,(i*8+3)
	vmovdqu _ytmp+3*SZ8(%rsp), TT3
	ROUND_00_15 TT0,(i*8+4)
	ROUND_00_15 TT1,(i*8+5)
	ROUND_00_15 TT2,(i*8+6)
	ROUND_00_15 TT3,(i*8+7)
	i = (i+1)
.endr
	# one 64-byte block consumed from every lane
	add $64, IDX
	# i was 2 after the repeat above, so the schedule index resumes at 16
	i = (i*8)

	jmp Lrounds_16_xx
.align 16
Lrounds_16_xx:
	# Sixteen rounds are emitted once and re-executed until ROUND reaches
	# ROUNDS; the schedule slots are taken mod 16, so re-running the same
	# code with the same i values is valid.
	# NOTE(review): ROUNDS is defined outside this file; presumably the
	# size in bytes of the K256_8 table - confirm.
.rep 16
	ROUND_16_XX T1, i
	i = (i+1)
.endr

	cmp $ROUNDS,ROUND
	jb Lrounds_16_xx

	# add old digest
	vpaddd _digest+0*SZ8(%rsp), a, a
	vpaddd _digest+1*SZ8(%rsp), b, b
	vpaddd _digest+2*SZ8(%rsp), c, c
	vpaddd _digest+3*SZ8(%rsp), d, d
	vpaddd _digest+4*SZ8(%rsp), e, e
	vpaddd _digest+5*SZ8(%rsp), f, f
	vpaddd _digest+6*SZ8(%rsp), g, g
	vpaddd _digest+7*SZ8(%rsp), h, h

	sub $1, INP_SIZE # unit is blocks
	jne lloop

	# write back to memory (state object) the transposed digest
	vmovdqu a, 0*SHA256_DIGEST_ROW_SIZE(STATE)
	vmovdqu b, 1*SHA256_DIGEST_ROW_SIZE(STATE)
	vmovdqu c, 2*SHA256_DIGEST_ROW_SIZE(STATE)
	vmovdqu d, 3*SHA256_DIGEST_ROW_SIZE(STATE)
	vmovdqu e, 4*SHA256_DIGEST_ROW_SIZE(STATE)
	vmovdqu f, 5*SHA256_DIGEST_ROW_SIZE(STATE)
	vmovdqu g, 6*SHA256_DIGEST_ROW_SIZE(STATE)
	vmovdqu h, 7*SHA256_DIGEST_ROW_SIZE(STATE)

	# update input pointers
	# (each lane pointer is advanced by IDX, the total bytes consumed, and
	# stored back; the stores use a literal 8 where the loads used PTR_SZ)
	add IDX, inp0
	mov inp0, _args_data_ptr+0*8(STATE)
	add IDX, inp1
	mov inp1, _args_data_ptr+1*8(STATE)
	add IDX, inp2
	mov inp2, _args_data_ptr+2*8(STATE)
	add IDX, inp3
	mov inp3, _args_data_ptr+3*8(STATE)
	add IDX, inp4
	mov inp4, _args_data_ptr+4*8(STATE)
	add IDX, inp5
	mov inp5, _args_data_ptr+5*8(STATE)
	add IDX, inp6
	mov inp6, _args_data_ptr+6*8(STATE)
	add IDX, inp7
	mov inp7, _args_data_ptr+7*8(STATE)

	# Postamble
	# (reload the rsp value stored in the frame at entry)
	mov _rsp(%rsp), %rsp

	# restore callee-saved clobbered registers
	pop %r15
	pop %r14
	pop %r13
	pop %r12

	ret
ENDPROC(sha256_x8_avx2)
.data
.align 64
# K256_8: every 32-bit round constant is repeated in all eight lanes,
# i.e. two .octa values per constant.
K256_8:
	.octa 0x428a2f98428a2f98428a2f98428a2f98
	.octa 0x428a2f98428a2f98428a2f98428a2f98
	.octa 0x71374491713744917137449171374491
	.octa 0x71374491713744917137449171374491
	.octa 0xb5c0fbcfb5c0fbcfb5c0fbcfb5c0fbcf
	.octa 0xb5c0fbcfb5c0fbcfb5c0fbcfb5c0fbcf
	.octa 0xe9b5dba5e9b5dba5e9b5dba5e9b5dba5
	.octa 0xe9b5dba5e9b5dba5e9b5dba5e9b5dba5
	.octa 0x3956c25b3956c25b3956c25b3956c25b
	.octa 0x3956c25b3956c25b3956c25b3956c25b
	.octa 0x59f111f159f111f159f111f159f111f1
	.octa 0x59f111f159f111f159f111f159f111f1
	.octa 0x923f82a4923f82a4923f82a4923f82a4
	.octa 0x923f82a4923f82a4923f82a4923f82a4
	.octa 0xab1c5ed5ab1c5ed5ab1c5ed5ab1c5ed5
	.octa 0xab1c5ed5ab1c5ed5ab1c5ed5ab1c5ed5
	.octa 0xd807aa98d807aa98d807aa98d807aa98
	.octa 0xd807aa98d807aa98d807aa98d807aa98
	.octa 0x12835b0112835b0112835b0112835b01
	.octa 0x12835b0112835b0112835b0112835b01
	.octa 0x243185be243185be243185be243185be
	.octa 0x243185be243185be243185be243185be
	.octa 0x550c7dc3550c7dc3550c7dc3550c7dc3
	.octa 0x550c7dc3550c7dc3550c7dc3550c7dc3
	.octa 0x72be5d7472be5d7472be5d7472be5d74
	.octa 0x72be5d7472be5d7472be5d7472be5d74
	.octa 0x80deb1fe80deb1fe80deb1fe80deb1fe
	.octa 0x80deb1fe80deb1fe80deb1fe80deb1fe
	.octa 0x9bdc06a79bdc06a79bdc06a79bdc06a7
	.octa 0x9bdc06a79bdc06a79bdc06a79bdc06a7
	.octa 0xc19bf174c19bf174c19bf174c19bf174
	.octa 0xc19bf174c19bf174c19bf174c19bf174
	.octa 0xe49b69c1e49b69c1e49b69c1e49b69c1
	.octa 0xe49b69c1e49b69c1e49b69c1e49b69c1
	.octa 0xefbe4786efbe4786efbe4786efbe4786
	.octa 0xefbe4786efbe4786efbe4786efbe4786
	.octa 0x0fc19dc60fc19dc60fc19dc60fc19dc6
	.octa 0x0fc19dc60fc19dc60fc19dc60fc19dc6
	.octa 0x240ca1cc240ca1cc240ca1cc240ca1cc
	.octa 0x240ca1cc240ca1cc240ca1cc240ca1cc
	.octa 0x2de92c6f2de92c6f2de92c6f2de92c6f
	.octa
0x2de92c6f2de92c6f2de92c6f2de92c6f + .octa 0x4a7484aa4a7484aa4a7484aa4a7484aa + .octa 0x4a7484aa4a7484aa4a7484aa4a7484aa + .octa 0x5cb0a9dc5cb0a9dc5cb0a9dc5cb0a9dc + .octa 0x5cb0a9dc5cb0a9dc5cb0a9dc5cb0a9dc + .octa 0x76f988da76f988da76f988da76f988da + .octa 0x76f988da76f988da76f988da76f988da + .octa 0x983e5152983e5152983e5152983e5152 + .octa 0x983e5152983e5152983e5152983e5152 + .octa 0xa831c66da831c66da831c66da831c66d + .octa 0xa831c66da831c66da831c66da831c66d + .octa 0xb00327c8b00327c8b00327c8b00327c8 + .octa 0xb00327c8b00327c8b00327c8b00327c8 + .octa 0xbf597fc7bf597fc7bf597fc7bf597fc7 + .octa 0xbf597fc7bf597fc7bf597fc7bf597fc7 + .octa 0xc6e00bf3c6e00bf3c6e00bf3c6e00bf3 + .octa 0xc6e00bf3c6e00bf3c6e00bf3c6e00bf3 + .octa 0xd5a79147d5a79147d5a79147d5a79147 + .octa 0xd5a79147d5a79147d5a79147d5a79147 + .octa 0x06ca635106ca635106ca635106ca6351 + .octa 0x06ca635106ca635106ca635106ca6351 + .octa 0x14292967142929671429296714292967 + .octa 0x14292967142929671429296714292967 + .octa 0x27b70a8527b70a8527b70a8527b70a85 + .octa 0x27b70a8527b70a8527b70a8527b70a85 + .octa 0x2e1b21382e1b21382e1b21382e1b2138 + .octa 0x2e1b21382e1b21382e1b21382e1b2138 + .octa 0x4d2c6dfc4d2c6dfc4d2c6dfc4d2c6dfc + .octa 0x4d2c6dfc4d2c6dfc4d2c6dfc4d2c6dfc + .octa 0x53380d1353380d1353380d1353380d13 + .octa 0x53380d1353380d1353380d1353380d13 + .octa 0x650a7354650a7354650a7354650a7354 + .octa 0x650a7354650a7354650a7354650a7354 + .octa 0x766a0abb766a0abb766a0abb766a0abb + .octa 0x766a0abb766a0abb766a0abb766a0abb + .octa 0x81c2c92e81c2c92e81c2c92e81c2c92e + .octa 0x81c2c92e81c2c92e81c2c92e81c2c92e + .octa 0x92722c8592722c8592722c8592722c85 + .octa 0x92722c8592722c8592722c8592722c85 + .octa 0xa2bfe8a1a2bfe8a1a2bfe8a1a2bfe8a1 + .octa 0xa2bfe8a1a2bfe8a1a2bfe8a1a2bfe8a1 + .octa 0xa81a664ba81a664ba81a664ba81a664b + .octa 0xa81a664ba81a664ba81a664ba81a664b + .octa 0xc24b8b70c24b8b70c24b8b70c24b8b70 + .octa 0xc24b8b70c24b8b70c24b8b70c24b8b70 + .octa 0xc76c51a3c76c51a3c76c51a3c76c51a3 + .octa 
0xc76c51a3c76c51a3c76c51a3c76c51a3 + .octa 0xd192e819d192e819d192e819d192e819 + .octa 0xd192e819d192e819d192e819d192e819 + .octa 0xd6990624d6990624d6990624d6990624 + .octa 0xd6990624d6990624d6990624d6990624 + .octa 0xf40e3585f40e3585f40e3585f40e3585 + .octa 0xf40e3585f40e3585f40e3585f40e3585 + .octa 0x106aa070106aa070106aa070106aa070 + .octa 0x106aa070106aa070106aa070106aa070 + .octa 0x19a4c11619a4c11619a4c11619a4c116 + .octa 0x19a4c11619a4c11619a4c11619a4c116 + .octa 0x1e376c081e376c081e376c081e376c08 + .octa 0x1e376c081e376c081e376c081e376c08 + .octa 0x2748774c2748774c2748774c2748774c + .octa 0x2748774c2748774c2748774c2748774c + .octa 0x34b0bcb534b0bcb534b0bcb534b0bcb5 + .octa 0x34b0bcb534b0bcb534b0bcb534b0bcb5 + .octa 0x391c0cb3391c0cb3391c0cb3391c0cb3 + .octa 0x391c0cb3391c0cb3391c0cb3391c0cb3 + .octa 0x4ed8aa4a4ed8aa4a4ed8aa4a4ed8aa4a + .octa 0x4ed8aa4a4ed8aa4a4ed8aa4a4ed8aa4a + .octa 0x5b9cca4f5b9cca4f5b9cca4f5b9cca4f + .octa 0x5b9cca4f5b9cca4f5b9cca4f5b9cca4f + .octa 0x682e6ff3682e6ff3682e6ff3682e6ff3 + .octa 0x682e6ff3682e6ff3682e6ff3682e6ff3 + .octa 0x748f82ee748f82ee748f82ee748f82ee + .octa 0x748f82ee748f82ee748f82ee748f82ee + .octa 0x78a5636f78a5636f78a5636f78a5636f + .octa 0x78a5636f78a5636f78a5636f78a5636f + .octa 0x84c8781484c8781484c8781484c87814 + .octa 0x84c8781484c8781484c8781484c87814 + .octa 0x8cc702088cc702088cc702088cc70208 + .octa 0x8cc702088cc702088cc702088cc70208 + .octa 0x90befffa90befffa90befffa90befffa + .octa 0x90befffa90befffa90befffa90befffa + .octa 0xa4506ceba4506ceba4506ceba4506ceb + .octa 0xa4506ceba4506ceba4506ceba4506ceb + .octa 0xbef9a3f7bef9a3f7bef9a3f7bef9a3f7 + .octa 0xbef9a3f7bef9a3f7bef9a3f7bef9a3f7 + .octa 0xc67178f2c67178f2c67178f2c67178f2 + .octa 0xc67178f2c67178f2c67178f2c67178f2 +PSHUFFLE_BYTE_FLIP_MASK: +.octa 0x0c0d0e0f08090a0b0405060700010203 +.octa 0x0c0d0e0f08090a0b0405060700010203 + +.align 64 +.global K256 +K256: + .int 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .int 
0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .int 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .int 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .int 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .int 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .int 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .int 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .int 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .int 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .int 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .int 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .int 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .int 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .int 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .int 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c index 3ae0f43ebd37..9e79baf03a4b 100644 --- a/arch/x86/crypto/sha256_ssse3_glue.c +++ b/arch/x86/crypto/sha256_ssse3_glue.c @@ -427,4 +427,14 @@ MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, Supplemental SSE3 accelerated"); MODULE_ALIAS_CRYPTO("sha256"); +MODULE_ALIAS_CRYPTO("sha256-ssse3"); +MODULE_ALIAS_CRYPTO("sha256-avx"); +MODULE_ALIAS_CRYPTO("sha256-avx2"); MODULE_ALIAS_CRYPTO("sha224"); +MODULE_ALIAS_CRYPTO("sha224-ssse3"); +MODULE_ALIAS_CRYPTO("sha224-avx"); +MODULE_ALIAS_CRYPTO("sha224-avx2"); +#ifdef CONFIG_AS_SHA256_NI +MODULE_ALIAS_CRYPTO("sha256-ni"); +MODULE_ALIAS_CRYPTO("sha224-ni"); +#endif diff --git a/arch/x86/crypto/sha512-mb/Makefile b/arch/x86/crypto/sha512-mb/Makefile new file mode 100644 index 000000000000..0a57e2103980 --- /dev/null +++ b/arch/x86/crypto/sha512-mb/Makefile @@ -0,0 +1,11 @@ +# +# Arch-specific CryptoAPI modules. 
+# + +avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\ + $(comma)4)$(comma)%ymm2,yes,no) +ifeq ($(avx2_supported),yes) + obj-$(CONFIG_CRYPTO_SHA512_MB) += sha512-mb.o + sha512-mb-y := sha512_mb.o sha512_mb_mgr_flush_avx2.o \ + sha512_mb_mgr_init_avx2.o sha512_mb_mgr_submit_avx2.o sha512_x4_avx2.o +endif diff --git a/arch/x86/crypto/sha512-mb/sha512_mb.c b/arch/x86/crypto/sha512-mb/sha512_mb.c new file mode 100644 index 000000000000..f4cf5b78fd36 --- /dev/null +++ b/arch/x86/crypto/sha512-mb/sha512_mb.c @@ -0,0 +1,1046 @@ +/* + * Multi buffer SHA512 algorithm Glue Code + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2016 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Megha Dey <megha.dey@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2016 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <crypto/internal/hash.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/cryptohash.h> +#include <linux/types.h> +#include <linux/list.h> +#include <crypto/scatterwalk.h> +#include <crypto/sha.h> +#include <crypto/mcryptd.h> +#include <crypto/crypto_wq.h> +#include <asm/byteorder.h> +#include <linux/hardirq.h> +#include <asm/fpu/api.h> +#include "sha512_mb_ctx.h" + +#define FLUSH_INTERVAL 1000 /* in usec */ + +static struct mcryptd_alg_state sha512_mb_alg_state; + +struct sha512_mb_ctx { + struct mcryptd_ahash *mcryptd_tfm; +}; + +static inline struct mcryptd_hash_request_ctx + *cast_hash_to_mcryptd_ctx(struct sha512_hash_ctx *hash_ctx) +{ + struct ahash_request *areq; + + areq = container_of((void *) hash_ctx, struct ahash_request, __ctx); + return container_of(areq, struct mcryptd_hash_request_ctx, areq); +} + +static inline struct ahash_request + *cast_mcryptd_ctx_to_req(struct mcryptd_hash_request_ctx *ctx) +{ + return container_of((void *) ctx, struct ahash_request, __ctx); +} + +static void req_ctx_init(struct mcryptd_hash_request_ctx *rctx, + struct ahash_request *areq) +{ + rctx->flag = HASH_UPDATE; +} + +static asmlinkage void (*sha512_job_mgr_init)(struct sha512_mb_mgr *state); +static asmlinkage struct job_sha512* (*sha512_job_mgr_submit) + (struct sha512_mb_mgr *state, + struct job_sha512 *job); +static asmlinkage struct job_sha512* (*sha512_job_mgr_flush) + (struct sha512_mb_mgr *state); +static asmlinkage struct job_sha512* (*sha512_job_mgr_get_comp_job) + (struct sha512_mb_mgr *state); + +inline void sha512_init_digest(uint64_t *digest) +{ + static const uint64_t initial_digest[SHA512_DIGEST_LENGTH] = { + SHA512_H0, SHA512_H1, SHA512_H2, + SHA512_H3, SHA512_H4, SHA512_H5, + SHA512_H6, SHA512_H7 }; + memcpy(digest, initial_digest, sizeof(initial_digest)); +} + +inline uint32_t sha512_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], + uint32_t 
total_len) +{ + uint32_t i = total_len & (SHA512_BLOCK_SIZE - 1); + + memset(&padblock[i], 0, SHA512_BLOCK_SIZE); + padblock[i] = 0x80; + + i += ((SHA512_BLOCK_SIZE - 1) & + (0 - (total_len + SHA512_PADLENGTHFIELD_SIZE + 1))) + + 1 + SHA512_PADLENGTHFIELD_SIZE; + +#if SHA512_PADLENGTHFIELD_SIZE == 16 + *((uint64_t *) &padblock[i - 16]) = 0; +#endif + + *((uint64_t *) &padblock[i - 8]) = cpu_to_be64(total_len << 3); + + /* Number of extra blocks to hash */ + return i >> SHA512_LOG2_BLOCK_SIZE; +} + +static struct sha512_hash_ctx *sha512_ctx_mgr_resubmit + (struct sha512_ctx_mgr *mgr, struct sha512_hash_ctx *ctx) +{ + while (ctx) { + if (ctx->status & HASH_CTX_STS_COMPLETE) { + /* Clear PROCESSING bit */ + ctx->status = HASH_CTX_STS_COMPLETE; + return ctx; + } + + /* + * If the extra blocks are empty, begin hashing what remains + * in the user's buffer. + */ + if (ctx->partial_block_buffer_length == 0 && + ctx->incoming_buffer_length) { + + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + uint32_t copy_len; + + /* + * Only entire blocks can be hashed. + * Copy remainder to extra blocks buffer. + */ + copy_len = len & (SHA512_BLOCK_SIZE-1); + + if (copy_len) { + len -= copy_len; + memcpy(ctx->partial_block_buffer, + ((const char *) buffer + len), + copy_len); + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + + /* len should be a multiple of the block size now */ + assert((len % SHA512_BLOCK_SIZE) == 0); + + /* Set len to the number of blocks to be hashed */ + len >>= SHA512_LOG2_BLOCK_SIZE; + + if (len) { + + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = (struct sha512_hash_ctx *) + sha512_job_mgr_submit(&mgr->mgr, + &ctx->job); + continue; + } + } + + /* + * If the extra blocks are not empty, then we are + * either on the last block(s) or we need more + * user input before continuing. 
+ */ + if (ctx->status & HASH_CTX_STS_LAST) { + + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = + sha512_pad(buf, ctx->total_length); + + ctx->status = (HASH_CTX_STS_PROCESSING | + HASH_CTX_STS_COMPLETE); + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + ctx = (struct sha512_hash_ctx *) + sha512_job_mgr_submit(&mgr->mgr, &ctx->job); + continue; + } + + if (ctx) + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +static struct sha512_hash_ctx + *sha512_ctx_mgr_get_comp_ctx(struct sha512_ctx_mgr *mgr) +{ + /* + * If get_comp_job returns NULL, there are no jobs complete. + * If get_comp_job returns a job, verify that it is safe to return to + * the user. + * If it is not ready, resubmit the job to finish processing. + * If sha512_ctx_mgr_resubmit returned a job, it is ready to be + * returned. + * Otherwise, all jobs currently being managed by the hash_ctx_mgr + * still need processing. + */ + struct sha512_hash_ctx *ctx; + + ctx = (struct sha512_hash_ctx *) + sha512_job_mgr_get_comp_job(&mgr->mgr); + return sha512_ctx_mgr_resubmit(mgr, ctx); +} + +static void sha512_ctx_mgr_init(struct sha512_ctx_mgr *mgr) +{ + sha512_job_mgr_init(&mgr->mgr); +} + +static struct sha512_hash_ctx + *sha512_ctx_mgr_submit(struct sha512_ctx_mgr *mgr, + struct sha512_hash_ctx *ctx, + const void *buffer, + uint32_t len, + int flags) +{ + if (flags & (~HASH_ENTIRE)) { + /* + * User should not pass anything other than FIRST, UPDATE, or + * LAST + */ + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + /* Cannot submit to a currently processing job. */ + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + /* Cannot update a finished job. 
*/ + ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + + if (flags & HASH_FIRST) { + /* Init digest */ + sha512_init_digest(ctx->job.result_digest); + + /* Reset byte counter */ + ctx->total_length = 0; + + /* Clear extra blocks */ + ctx->partial_block_buffer_length = 0; + } + + /* + * If we made it here, there were no errors during this call to + * submit + */ + ctx->error = HASH_CTX_ERROR_NONE; + + /* Store buffer ptr info from user */ + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + /* + * Store the user's request flags and mark this ctx as currently being + * processed. + */ + ctx->status = (flags & HASH_LAST) ? + (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + /* Advance byte counter */ + ctx->total_length += len; + + /* + * If there is anything currently buffered in the extra blocks, + * append to it until it contains a whole block. + * Or if the user's buffer contains less than a whole block, + * append as much as possible to the extra block. + */ + if (ctx->partial_block_buffer_length || len < SHA512_BLOCK_SIZE) { + /* Compute how many bytes to copy from user buffer into extra + * block + */ + uint32_t copy_len = SHA512_BLOCK_SIZE - + ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + /* Copy and update relevant pointers and counters */ + memcpy + (&ctx->partial_block_buffer[ctx->partial_block_buffer_length], + buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *) + ((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + + /* The extra block should never contain more than 1 block + * here + */ + assert(ctx->partial_block_buffer_length <= SHA512_BLOCK_SIZE); + + /* If the extra block buffer contains exactly 1 block, it can + * be hashed. 
+ */ + if (ctx->partial_block_buffer_length >= SHA512_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + ctx = (struct sha512_hash_ctx *) + sha512_job_mgr_submit(&mgr->mgr, &ctx->job); + } + } + + return sha512_ctx_mgr_resubmit(mgr, ctx); +} + +static struct sha512_hash_ctx *sha512_ctx_mgr_flush(struct sha512_ctx_mgr *mgr) +{ + struct sha512_hash_ctx *ctx; + + while (1) { + ctx = (struct sha512_hash_ctx *) + sha512_job_mgr_flush(&mgr->mgr); + + /* If flush returned 0, there are no more jobs in flight. */ + if (!ctx) + return NULL; + + /* + * If flush returned a job, resubmit the job to finish + * processing. + */ + ctx = sha512_ctx_mgr_resubmit(mgr, ctx); + + /* + * If sha512_ctx_mgr_resubmit returned a job, it is ready to + * be returned. Otherwise, all jobs currently being managed by + * the sha512_ctx_mgr still need processing. Loop. + */ + if (ctx) + return ctx; + } +} + +static int sha512_mb_init(struct ahash_request *areq) +{ + struct sha512_hash_ctx *sctx = ahash_request_ctx(areq); + + hash_ctx_init(sctx); + sctx->job.result_digest[0] = SHA512_H0; + sctx->job.result_digest[1] = SHA512_H1; + sctx->job.result_digest[2] = SHA512_H2; + sctx->job.result_digest[3] = SHA512_H3; + sctx->job.result_digest[4] = SHA512_H4; + sctx->job.result_digest[5] = SHA512_H5; + sctx->job.result_digest[6] = SHA512_H6; + sctx->job.result_digest[7] = SHA512_H7; + sctx->total_length = 0; + sctx->partial_block_buffer_length = 0; + sctx->status = HASH_CTX_STS_IDLE; + + return 0; +} + +static int sha512_mb_set_results(struct mcryptd_hash_request_ctx *rctx) +{ + int i; + struct sha512_hash_ctx *sctx = ahash_request_ctx(&rctx->areq); + __be64 *dst = (__be64 *) rctx->out; + + for (i = 0; i < 8; ++i) + dst[i] = cpu_to_be64(sctx->job.result_digest[i]); + + return 0; +} + +static int sha_finish_walk(struct mcryptd_hash_request_ctx **ret_rctx, + struct mcryptd_alg_cstate *cstate, bool flush) +{ + int flag = HASH_UPDATE; + 
int nbytes, err = 0;
+	struct mcryptd_hash_request_ctx *rctx = *ret_rctx;
+	struct sha512_hash_ctx *sha_ctx;
+
+	/* more work ? */
+	while (!(rctx->flag & HASH_DONE)) {
+		nbytes = crypto_ahash_walk_done(&rctx->walk, 0);
+		if (nbytes < 0) {
+			err = nbytes;
+			goto out;
+		}
+		/* check if the walk is done */
+		if (crypto_ahash_walk_last(&rctx->walk)) {
+			rctx->flag |= HASH_DONE;
+			if (rctx->flag & HASH_FINAL)
+				flag |= HASH_LAST;
+
+		}
+		sha_ctx = (struct sha512_hash_ctx *)
+						ahash_request_ctx(&rctx->areq);
+		kernel_fpu_begin();
+		sha_ctx = sha512_ctx_mgr_submit(cstate->mgr, sha_ctx,
+						rctx->walk.data, nbytes, flag);
+		if (!sha_ctx) {
+			if (flush)
+				sha_ctx = sha512_ctx_mgr_flush(cstate->mgr);
+		}
+		kernel_fpu_end();
+		if (sha_ctx)
+			rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+		else {
+			rctx = NULL;
+			goto out;
+		}
+	}
+
+	/* copy the results */
+	if (rctx->flag & HASH_FINAL)
+		sha512_mb_set_results(rctx);
+
+out:
+	*ret_rctx = rctx;
+	return err;
+}
+
+/*
+ * sha_complete_job - complete @rctx and drain any other finished jobs
+ * @rctx:   request to unlink from the per-CPU work list and complete
+ * @cstate: per-CPU algorithm state owning the work list and the ctx manager
+ * @err:    status handed to @rctx's completion callback
+ *
+ * After completing @rctx, every further context the manager reports via
+ * sha512_ctx_mgr_get_comp_ctx() is walked with sha_finish_walk(); a job
+ * that comes back finished is unlinked and completed with that walk's
+ * return code.  Callbacks run with bottom halves disabled unless IRQs are
+ * already disabled.
+ *
+ * Always returns 0.
+ */
+static int sha_complete_job(struct mcryptd_hash_request_ctx *rctx,
+			    struct mcryptd_alg_cstate *cstate,
+			    int err)
+{
+	struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx);
+	struct sha512_hash_ctx *sha_ctx;
+	struct mcryptd_hash_request_ctx *req_ctx;
+	int ret;
+
+	/* remove from work list */
+	spin_lock(&cstate->work_lock);
+	list_del(&rctx->waiter);
+	spin_unlock(&cstate->work_lock);
+
+	if (irqs_disabled())
+		rctx->complete(&req->base, err);
+	else {
+		local_bh_disable();
+		rctx->complete(&req->base, err);
+		local_bh_enable();
+	}
+
+	/* check to see if there are other jobs that are done */
+	sha_ctx = sha512_ctx_mgr_get_comp_ctx(cstate->mgr);
+	while (sha_ctx) {
+		req_ctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+		ret = sha_finish_walk(&req_ctx, cstate, false);
+		if (req_ctx) {
+			spin_lock(&cstate->work_lock);
+			list_del(&req_ctx->waiter);
+			spin_unlock(&cstate->work_lock);
+
+			req = cast_mcryptd_ctx_to_req(req_ctx);
+			/*
+			 * Use the drained job's own callback: rctx was
+			 * already completed above and must not be touched
+			 * again.
+			 */
+			if (irqs_disabled())
+				req_ctx->complete(&req->base, ret);
+			else {
+				local_bh_disable();
+				req_ctx->complete(&req->base, ret);
+				local_bh_enable();
+			}
+		}
+		sha_ctx = sha512_ctx_mgr_get_comp_ctx(cstate->mgr);
+	}
+
+	return 0;
+}
+
+/*
+ * Tag @rctx with its arrival time, sequence number and flush deadline,
+ * append it to the per-CPU work list and arm the flusher.
+ */
+static void sha512_mb_add_list(struct mcryptd_hash_request_ctx *rctx,
+				struct mcryptd_alg_cstate *cstate)
+{
+	unsigned long next_flush;
+	unsigned long delay = usecs_to_jiffies(FLUSH_INTERVAL);
+
+	/* initialize tag */
+	rctx->tag.arrival = jiffies;    /* tag the arrival time */
+	rctx->tag.seq_num = cstate->next_seq_num++;
+	next_flush = rctx->tag.arrival + delay;
+	rctx->tag.expire = next_flush;
+
+	spin_lock(&cstate->work_lock);
+	list_add_tail(&rctx->waiter, &cstate->work_list);
+	spin_unlock(&cstate->work_lock);
+
+	mcryptd_arm_flusher(cstate, delay);
+}
+
+static int sha512_mb_update(struct ahash_request *areq)
+{
+	struct mcryptd_hash_request_ctx *rctx =
+			container_of(areq, struct mcryptd_hash_request_ctx,
+									areq);
+	struct mcryptd_alg_cstate *cstate =
+				this_cpu_ptr(sha512_mb_alg_state.alg_cstate);
+
+	struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx);
+	struct sha512_hash_ctx *sha_ctx;
+	int ret = 0, nbytes;
+
+
+	/* sanity check */
+	if (rctx->tag.cpu != smp_processor_id()) {
+		pr_err("mcryptd error: cpu clash\n");
+		goto done;
+	}
+
+	/* need to init context */
+	req_ctx_init(rctx, areq);
+
+	nbytes = crypto_ahash_walk_first(req, &rctx->walk);
+
+	if (nbytes < 0) {
+		ret = nbytes;
+		goto done;
+	}
+
+	if (crypto_ahash_walk_last(&rctx->walk))
+		rctx->flag |= HASH_DONE;
+
+	/* submit */
+	sha_ctx = (struct sha512_hash_ctx *) ahash_request_ctx(areq);
+	sha512_mb_add_list(rctx, cstate);
+	kernel_fpu_begin();
+	sha_ctx = sha512_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data,
+							nbytes, HASH_UPDATE);
+	kernel_fpu_end();
+
+	/* check if anything is returned */
+	if (!sha_ctx)
+		return -EINPROGRESS;
+
+	if (sha_ctx->error) {
+		ret = sha_ctx->error;
+		rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+		goto done;
+	}
+
+	rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+	ret = sha_finish_walk(&rctx, cstate, false);
+
+	if
(!rctx)
+		return -EINPROGRESS;
+done:
+	sha_complete_job(rctx, cstate, ret);
+	return ret;
+}
+
+/*
+ * sha512_mb_finup - hash the request's remaining data and finalize
+ * @areq: inner ahash request embedded in a mcryptd_hash_request_ctx
+ *
+ * Starts the scatterlist walk, marks the request HASH_FINAL, queues it on
+ * the per-CPU work list and submits the first chunk (as HASH_LAST when the
+ * walk is already at its last chunk).
+ *
+ * Returns -EINPROGRESS while the job is still held by the manager;
+ * otherwise the returned job is completed via sha_complete_job() and its
+ * status is returned.
+ */
+static int sha512_mb_finup(struct ahash_request *areq)
+{
+	struct mcryptd_hash_request_ctx *rctx =
+			container_of(areq, struct mcryptd_hash_request_ctx,
+									areq);
+	struct mcryptd_alg_cstate *cstate =
+				this_cpu_ptr(sha512_mb_alg_state.alg_cstate);
+
+	struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx);
+	struct sha512_hash_ctx *sha_ctx;
+	int ret = 0, flag = HASH_UPDATE, nbytes;
+
+	/* sanity check */
+	if (rctx->tag.cpu != smp_processor_id()) {
+		pr_err("mcryptd error: cpu clash\n");
+		goto done;
+	}
+
+	/* need to init context */
+	req_ctx_init(rctx, areq);
+
+	nbytes = crypto_ahash_walk_first(req, &rctx->walk);
+
+	if (nbytes < 0) {
+		ret = nbytes;
+		goto done;
+	}
+
+	if (crypto_ahash_walk_last(&rctx->walk)) {
+		rctx->flag |= HASH_DONE;
+		flag = HASH_LAST;
+	}
+
+	/* submit */
+	rctx->flag |= HASH_FINAL;
+	sha_ctx = (struct sha512_hash_ctx *) ahash_request_ctx(areq);
+	sha512_mb_add_list(rctx, cstate);
+
+	kernel_fpu_begin();
+	sha_ctx = sha512_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data,
+								nbytes, flag);
+	kernel_fpu_end();
+
+	/* check if anything is returned */
+	if (!sha_ctx)
+		return -EINPROGRESS;
+
+	if (sha_ctx->error) {
+		ret = sha_ctx->error;
+		/*
+		 * Complete the request that owns the returned context, as
+		 * sha512_mb_update() and sha512_mb_final() do.
+		 */
+		rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+		goto done;
+	}
+
+	rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+	ret = sha_finish_walk(&rctx, cstate, false);
+	if (!rctx)
+		return -EINPROGRESS;
+done:
+	sha_complete_job(rctx, cstate, ret);
+	return ret;
+}
+
+static int sha512_mb_final(struct ahash_request *areq)
+{
+	struct mcryptd_hash_request_ctx *rctx =
+			container_of(areq, struct mcryptd_hash_request_ctx,
+									areq);
+	struct mcryptd_alg_cstate *cstate =
+				this_cpu_ptr(sha512_mb_alg_state.alg_cstate);
+
+	struct sha512_hash_ctx *sha_ctx;
+	int ret = 0;
+	u8 data;
+
+	/* sanity check */
+	if (rctx->tag.cpu != smp_processor_id()) {
+		pr_err("mcryptd error: cpu clash\n");
+		goto done;
+	}
+
+	/* need to init context */
+	req_ctx_init(rctx, areq);
+
+
rctx->flag |= HASH_DONE | HASH_FINAL; + + sha_ctx = (struct sha512_hash_ctx *) ahash_request_ctx(areq); + /* flag HASH_FINAL and 0 data size */ + sha512_mb_add_list(rctx, cstate); + kernel_fpu_begin(); + sha_ctx = sha512_ctx_mgr_submit(cstate->mgr, sha_ctx, &data, 0, + HASH_LAST); + kernel_fpu_end(); + + /* check if anything is returned */ + if (!sha_ctx) + return -EINPROGRESS; + + if (sha_ctx->error) { + ret = sha_ctx->error; + rctx = cast_hash_to_mcryptd_ctx(sha_ctx); + goto done; + } + + rctx = cast_hash_to_mcryptd_ctx(sha_ctx); + ret = sha_finish_walk(&rctx, cstate, false); + if (!rctx) + return -EINPROGRESS; +done: + sha_complete_job(rctx, cstate, ret); + return ret; +} + +static int sha512_mb_export(struct ahash_request *areq, void *out) +{ + struct sha512_hash_ctx *sctx = ahash_request_ctx(areq); + + memcpy(out, sctx, sizeof(*sctx)); + + return 0; +} + +static int sha512_mb_import(struct ahash_request *areq, const void *in) +{ + struct sha512_hash_ctx *sctx = ahash_request_ctx(areq); + + memcpy(sctx, in, sizeof(*sctx)); + + return 0; +} + +static int sha512_mb_async_init_tfm(struct crypto_tfm *tfm) +{ + struct mcryptd_ahash *mcryptd_tfm; + struct sha512_mb_ctx *ctx = crypto_tfm_ctx(tfm); + struct mcryptd_hash_ctx *mctx; + + mcryptd_tfm = mcryptd_alloc_ahash("__intel_sha512-mb", + CRYPTO_ALG_INTERNAL, + CRYPTO_ALG_INTERNAL); + if (IS_ERR(mcryptd_tfm)) + return PTR_ERR(mcryptd_tfm); + mctx = crypto_ahash_ctx(&mcryptd_tfm->base); + mctx->alg_state = &sha512_mb_alg_state; + ctx->mcryptd_tfm = mcryptd_tfm; + crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm), + sizeof(struct ahash_request) + + crypto_ahash_reqsize(&mcryptd_tfm->base)); + + return 0; +} + +static void sha512_mb_async_exit_tfm(struct crypto_tfm *tfm) +{ + struct sha512_mb_ctx *ctx = crypto_tfm_ctx(tfm); + + mcryptd_free_ahash(ctx->mcryptd_tfm); +} + +static int sha512_mb_areq_init_tfm(struct crypto_tfm *tfm) +{ + crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm), + sizeof(struct ahash_request) + + 
sizeof(struct sha512_hash_ctx)); + + return 0; +} + +static void sha512_mb_areq_exit_tfm(struct crypto_tfm *tfm) +{ + struct sha512_mb_ctx *ctx = crypto_tfm_ctx(tfm); + + mcryptd_free_ahash(ctx->mcryptd_tfm); +} + +static struct ahash_alg sha512_mb_areq_alg = { + .init = sha512_mb_init, + .update = sha512_mb_update, + .final = sha512_mb_final, + .finup = sha512_mb_finup, + .export = sha512_mb_export, + .import = sha512_mb_import, + .halg = { + .digestsize = SHA512_DIGEST_SIZE, + .statesize = sizeof(struct sha512_hash_ctx), + .base = { + .cra_name = "__sha512-mb", + .cra_driver_name = "__intel_sha512-mb", + .cra_priority = 100, + /* + * use ASYNC flag as some buffers in multi-buffer + * algo may not have completed before hashing thread + * sleep + */ + .cra_flags = CRYPTO_ALG_TYPE_AHASH | + CRYPTO_ALG_ASYNC | + CRYPTO_ALG_INTERNAL, + .cra_blocksize = SHA512_BLOCK_SIZE, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT + (sha512_mb_areq_alg.halg.base.cra_list), + .cra_init = sha512_mb_areq_init_tfm, + .cra_exit = sha512_mb_areq_exit_tfm, + .cra_ctxsize = sizeof(struct sha512_hash_ctx), + } + } +}; + +static int sha512_mb_async_init(struct ahash_request *req) +{ + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct sha512_mb_ctx *ctx = crypto_ahash_ctx(tfm); + struct ahash_request *mcryptd_req = ahash_request_ctx(req); + struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm; + + memcpy(mcryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base); + return crypto_ahash_init(mcryptd_req); +} + +static int sha512_mb_async_update(struct ahash_request *req) +{ + struct ahash_request *mcryptd_req = ahash_request_ctx(req); + + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct sha512_mb_ctx *ctx = crypto_ahash_ctx(tfm); + struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm; + + memcpy(mcryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base); + return crypto_ahash_update(mcryptd_req); 
+} + +static int sha512_mb_async_finup(struct ahash_request *req) +{ + struct ahash_request *mcryptd_req = ahash_request_ctx(req); + + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct sha512_mb_ctx *ctx = crypto_ahash_ctx(tfm); + struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm; + + memcpy(mcryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base); + return crypto_ahash_finup(mcryptd_req); +} + +static int sha512_mb_async_final(struct ahash_request *req) +{ + struct ahash_request *mcryptd_req = ahash_request_ctx(req); + + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct sha512_mb_ctx *ctx = crypto_ahash_ctx(tfm); + struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm; + + memcpy(mcryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base); + return crypto_ahash_final(mcryptd_req); +} + +static int sha512_mb_async_digest(struct ahash_request *req) +{ + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct sha512_mb_ctx *ctx = crypto_ahash_ctx(tfm); + struct ahash_request *mcryptd_req = ahash_request_ctx(req); + struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm; + + memcpy(mcryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base); + return crypto_ahash_digest(mcryptd_req); +} + +static int sha512_mb_async_export(struct ahash_request *req, void *out) +{ + struct ahash_request *mcryptd_req = ahash_request_ctx(req); + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct sha512_mb_ctx *ctx = crypto_ahash_ctx(tfm); + struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm; + + memcpy(mcryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base); + return crypto_ahash_export(mcryptd_req, out); +} + +static int sha512_mb_async_import(struct ahash_request *req, const void *in) +{ + struct ahash_request *mcryptd_req = ahash_request_ctx(req); + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct 
sha512_mb_ctx *ctx = crypto_ahash_ctx(tfm); + struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm; + struct crypto_ahash *child = mcryptd_ahash_child(mcryptd_tfm); + struct mcryptd_hash_request_ctx *rctx; + struct ahash_request *areq; + + memcpy(mcryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base); + rctx = ahash_request_ctx(mcryptd_req); + + areq = &rctx->areq; + + ahash_request_set_tfm(areq, child); + ahash_request_set_callback(areq, CRYPTO_TFM_REQ_MAY_SLEEP, + rctx->complete, req); + + return crypto_ahash_import(mcryptd_req, in); +} + +static struct ahash_alg sha512_mb_async_alg = { + .init = sha512_mb_async_init, + .update = sha512_mb_async_update, + .final = sha512_mb_async_final, + .finup = sha512_mb_async_finup, + .digest = sha512_mb_async_digest, + .export = sha512_mb_async_export, + .import = sha512_mb_async_import, + .halg = { + .digestsize = SHA512_DIGEST_SIZE, + .statesize = sizeof(struct sha512_hash_ctx), + .base = { + .cra_name = "sha512", + .cra_driver_name = "sha512_mb", + .cra_priority = 200, + .cra_flags = CRYPTO_ALG_TYPE_AHASH | + CRYPTO_ALG_ASYNC, + .cra_blocksize = SHA512_BLOCK_SIZE, + .cra_type = &crypto_ahash_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT + (sha512_mb_async_alg.halg.base.cra_list), + .cra_init = sha512_mb_async_init_tfm, + .cra_exit = sha512_mb_async_exit_tfm, + .cra_ctxsize = sizeof(struct sha512_mb_ctx), + .cra_alignmask = 0, + }, + }, +}; + +static unsigned long sha512_mb_flusher(struct mcryptd_alg_cstate *cstate) +{ + struct mcryptd_hash_request_ctx *rctx; + unsigned long cur_time; + unsigned long next_flush = 0; + struct sha512_hash_ctx *sha_ctx; + + + cur_time = jiffies; + + while (!list_empty(&cstate->work_list)) { + rctx = list_entry(cstate->work_list.next, + struct mcryptd_hash_request_ctx, waiter); + if time_before(cur_time, rctx->tag.expire) + break; + kernel_fpu_begin(); + sha_ctx = (struct sha512_hash_ctx *) + sha512_ctx_mgr_flush(cstate->mgr); + 
kernel_fpu_end(); + if (!sha_ctx) { + pr_err("sha512_mb error: nothing got flushed for" + " non-empty list\n"); + break; + } + rctx = cast_hash_to_mcryptd_ctx(sha_ctx); + sha_finish_walk(&rctx, cstate, true); + sha_complete_job(rctx, cstate, 0); + } + + if (!list_empty(&cstate->work_list)) { + rctx = list_entry(cstate->work_list.next, + struct mcryptd_hash_request_ctx, waiter); + /* get the hash context and then flush time */ + next_flush = rctx->tag.expire; + mcryptd_arm_flusher(cstate, get_delay(next_flush)); + } + return next_flush; +} + +static int __init sha512_mb_mod_init(void) +{ + + int cpu; + int err; + struct mcryptd_alg_cstate *cpu_state; + + /* check for dependent cpu features */ + if (!boot_cpu_has(X86_FEATURE_AVX2) || + !boot_cpu_has(X86_FEATURE_BMI2)) + return -ENODEV; + + /* initialize multibuffer structures */ + sha512_mb_alg_state.alg_cstate = + alloc_percpu(struct mcryptd_alg_cstate); + + sha512_job_mgr_init = sha512_mb_mgr_init_avx2; + sha512_job_mgr_submit = sha512_mb_mgr_submit_avx2; + sha512_job_mgr_flush = sha512_mb_mgr_flush_avx2; + sha512_job_mgr_get_comp_job = sha512_mb_mgr_get_comp_job_avx2; + + if (!sha512_mb_alg_state.alg_cstate) + return -ENOMEM; + for_each_possible_cpu(cpu) { + cpu_state = per_cpu_ptr(sha512_mb_alg_state.alg_cstate, cpu); + cpu_state->next_flush = 0; + cpu_state->next_seq_num = 0; + cpu_state->flusher_engaged = false; + INIT_DELAYED_WORK(&cpu_state->flush, mcryptd_flusher); + cpu_state->cpu = cpu; + cpu_state->alg_state = &sha512_mb_alg_state; + cpu_state->mgr = kzalloc(sizeof(struct sha512_ctx_mgr), + GFP_KERNEL); + if (!cpu_state->mgr) + goto err2; + sha512_ctx_mgr_init(cpu_state->mgr); + INIT_LIST_HEAD(&cpu_state->work_list); + spin_lock_init(&cpu_state->work_lock); + } + sha512_mb_alg_state.flusher = &sha512_mb_flusher; + + err = crypto_register_ahash(&sha512_mb_areq_alg); + if (err) + goto err2; + err = crypto_register_ahash(&sha512_mb_async_alg); + if (err) + goto err1; + + + return 0; +err1: + 
crypto_unregister_ahash(&sha512_mb_areq_alg); +err2: + for_each_possible_cpu(cpu) { + cpu_state = per_cpu_ptr(sha512_mb_alg_state.alg_cstate, cpu); + kfree(cpu_state->mgr); + } + free_percpu(sha512_mb_alg_state.alg_cstate); + return -ENODEV; +} + +static void __exit sha512_mb_mod_fini(void) +{ + int cpu; + struct mcryptd_alg_cstate *cpu_state; + + crypto_unregister_ahash(&sha512_mb_async_alg); + crypto_unregister_ahash(&sha512_mb_areq_alg); + for_each_possible_cpu(cpu) { + cpu_state = per_cpu_ptr(sha512_mb_alg_state.alg_cstate, cpu); + kfree(cpu_state->mgr); + } + free_percpu(sha512_mb_alg_state.alg_cstate); +} + +module_init(sha512_mb_mod_init); +module_exit(sha512_mb_mod_fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, multi buffer accelerated"); + +MODULE_ALIAS("sha512"); diff --git a/arch/x86/crypto/sha512-mb/sha512_mb_ctx.h b/arch/x86/crypto/sha512-mb/sha512_mb_ctx.h new file mode 100644 index 000000000000..9d4b2c8208d5 --- /dev/null +++ b/arch/x86/crypto/sha512-mb/sha512_mb_ctx.h @@ -0,0 +1,130 @@ +/* + * Header file for multi buffer SHA512 context + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2016 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Megha Dey <megha.dey@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2016 Intel Corporation. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _SHA_MB_CTX_INTERNAL_H +#define _SHA_MB_CTX_INTERNAL_H + +#include "sha512_mb_mgr.h" + +#define HASH_UPDATE 0x00 +#define HASH_FIRST 0x01 +#define HASH_LAST 0x02 +#define HASH_ENTIRE 0x03 +#define HASH_DONE 0x04 +#define HASH_FINAL 0x08 + +#define HASH_CTX_STS_IDLE 0x00 +#define HASH_CTX_STS_PROCESSING 0x01 +#define HASH_CTX_STS_LAST 0x02 +#define HASH_CTX_STS_COMPLETE 0x04 + +enum hash_ctx_error { + HASH_CTX_ERROR_NONE = 0, + HASH_CTX_ERROR_INVALID_FLAGS = -1, + HASH_CTX_ERROR_ALREADY_PROCESSING = -2, + HASH_CTX_ERROR_ALREADY_COMPLETED = -3, +}; + +#define hash_ctx_user_data(ctx) ((ctx)->user_data) +#define hash_ctx_digest(ctx) ((ctx)->job.result_digest) +#define hash_ctx_processing(ctx) ((ctx)->status & HASH_CTX_STS_PROCESSING) +#define hash_ctx_complete(ctx) ((ctx)->status == HASH_CTX_STS_COMPLETE) +#define hash_ctx_status(ctx) ((ctx)->status) +#define hash_ctx_error(ctx) ((ctx)->error) +#define hash_ctx_init(ctx) \ + do { \ + (ctx)->error = HASH_CTX_ERROR_NONE; \ + (ctx)->status = HASH_CTX_STS_COMPLETE; \ + } while (0) + +/* Hash Constants and Typedefs */ +#define SHA512_DIGEST_LENGTH 8 +#define SHA512_LOG2_BLOCK_SIZE 7 + +#define SHA512_PADLENGTHFIELD_SIZE 16 + +#ifdef SHA_MB_DEBUG +#define assert(expr) \ +do { \ + if (unlikely(!(expr))) { \ + printk(KERN_ERR "Assertion failed! 
%s,%s,%s,line=%d\n", \ + #expr, __FILE__, __func__, __LINE__); \ + } \ +} while (0) +#else +#define assert(expr) do {} while (0) +#endif + +struct sha512_ctx_mgr { + struct sha512_mb_mgr mgr; +}; + +/* typedef struct sha512_ctx_mgr sha512_ctx_mgr; */ + +struct sha512_hash_ctx { + /* Must be at struct offset 0 */ + struct job_sha512 job; + /* status flag */ + int status; + /* error flag */ + int error; + + uint32_t total_length; + const void *incoming_buffer; + uint32_t incoming_buffer_length; + uint8_t partial_block_buffer[SHA512_BLOCK_SIZE * 2]; + uint32_t partial_block_buffer_length; + void *user_data; +}; + +#endif diff --git a/arch/x86/crypto/sha512-mb/sha512_mb_mgr.h b/arch/x86/crypto/sha512-mb/sha512_mb_mgr.h new file mode 100644 index 000000000000..178f17eef382 --- /dev/null +++ b/arch/x86/crypto/sha512-mb/sha512_mb_mgr.h @@ -0,0 +1,104 @@ +/* + * Header file for multi buffer SHA512 algorithm manager + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2016 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Megha Dey <megha.dey@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2016 Intel Corporation. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#ifndef __SHA_MB_MGR_H
+#define __SHA_MB_MGR_H
+
+#include <linux/types.h>
+
+#define NUM_SHA512_DIGEST_WORDS 8	/* u64 words in result_digest */
+
+enum job_sts {STS_UNKNOWN = 0,	/* values stored in job_sha512.status */
+	STS_BEING_PROCESSED = 1,
+	STS_COMPLETED = 2,
+	STS_INTERNAL_ERROR = 3,
+	STS_ERROR = 4
+};
+
+struct job_sha512 {
+	u8 *buffer;	/* pointer to buffer */
+	u64 len;	/* NOTE(review): datastruct.S says bytes, but the C side submits len = 1 for one full block - presumably a block count; confirm */
+	u64 result_digest[NUM_SHA512_DIGEST_WORDS] __aligned(32);	/* digest (output) */
+	enum job_sts status;
+	void *user_data;
+};
+
+struct sha512_args_x4 {
+	uint64_t digest[8][4];	/* transposed digest (presumably [digest word][lane]) */
+	uint8_t *data_ptr[4];	/* array of pointers to data */
+};
+
+struct sha512_lane_data {
+	struct job_sha512 *job_in_lane;	/* pointer to job object */
+};
+
+struct sha512_mb_mgr {
+	struct sha512_args_x4 args;
+
+	uint64_t lens[4];
+
+	/* each byte is the index of an unused lane; byte 4 is set to FF as a flag. NOTE(review): originally said indices 0...7, but only 4 lanes are declared here */
+	uint64_t unused_lanes;
+	/* one entry per lane, each holding that lane's job pointer */
+	struct sha512_lane_data ldata[4];
+};
+
+#define SHA512_MB_MGR_NUM_LANES_AVX2 4
+
+void sha512_mb_mgr_init_avx2(struct sha512_mb_mgr *state);
+struct job_sha512 *sha512_mb_mgr_submit_avx2(struct sha512_mb_mgr *state,
+						struct job_sha512 *job);
+struct job_sha512 *sha512_mb_mgr_flush_avx2(struct sha512_mb_mgr *state);
+struct job_sha512 *sha512_mb_mgr_get_comp_job_avx2(struct sha512_mb_mgr *state);
+
+#endif
diff --git a/arch/x86/crypto/sha512-mb/sha512_mb_mgr_datastruct.S b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_datastruct.S
new file mode 100644
index 000000000000..cf2636d4c9ba
--- /dev/null
+++ b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_datastruct.S
@@ -0,0 +1,281 @@
+/*
+ * Header file for multi buffer SHA512 algorithm data structure
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Megha Dey <megha.dey@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2016 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +# Macros for defining data structures + +# Usage example + +#START_FIELDS # JOB_AES +### name size align +#FIELD _plaintext, 8, 8 # pointer to plaintext +#FIELD _ciphertext, 8, 8 # pointer to ciphertext +#FIELD _IV, 16, 8 # IV +#FIELD _keys, 8, 8 # pointer to keys +#FIELD _len, 4, 4 # length in bytes +#FIELD _status, 4, 4 # status enumeration +#FIELD _user_data, 8, 8 # pointer to user data +#UNION _union, size1, align1, \ +# size2, align2, \ +# size3, align3, \ +# ... +#END_FIELDS +#%assign _JOB_AES_size _FIELD_OFFSET +#%assign _JOB_AES_align _STRUCT_ALIGN + +######################################################################### + +# Alternate "struc-like" syntax: +# STRUCT job_aes2 +# RES_Q .plaintext, 1 +# RES_Q .ciphertext, 1 +# RES_DQ .IV, 1 +# RES_B .nested, _JOB_AES_SIZE, _JOB_AES_ALIGN +# RES_U .union, size1, align1, \ +# size2, align2, \ +# ... +# ENDSTRUCT +# # Following only needed if nesting +# %assign job_aes2_size _FIELD_OFFSET +# %assign job_aes2_align _STRUCT_ALIGN +# +# RES_* macros take a name, a count and an optional alignment. +# The count is in terms of the base size of the macro, and the +# default alignment is the base size. +# The macros are: +# Macro Base size +# RES_B 1 +# RES_W 2 +# RES_D 4 +# RES_Q 8 +# RES_DQ 16 +# RES_Y 32 +# RES_Z 64 +# +# RES_U defines a union. Its arguments are a name and two or more +# pairs of "size, alignment" +# +# The two assigns are only needed if this structure is being nested +# within another. Even if the assigns are not done, one can still use +# STRUCT_NAME_size as the size of the structure. +# +# Note that for nesting, you still need to assign to STRUCT_NAME_size. +# +# The differences between this and using "struc" directly are that each +# type is implicitly aligned to its natural length (although this can be +# over-ridden with an explicit third parameter), and that the structure +# is padded at the end to its overall alignment. 
+# + +######################################################################### + +#ifndef _DATASTRUCT_ASM_ +#define _DATASTRUCT_ASM_ + +#define PTR_SZ 8 +#define SHA512_DIGEST_WORD_SIZE 8 +#define SHA512_MB_MGR_NUM_LANES_AVX2 4 +#define NUM_SHA512_DIGEST_WORDS 8 +#define SZ4 4*SHA512_DIGEST_WORD_SIZE +#define ROUNDS 80*SZ4 +#define SHA512_DIGEST_ROW_SIZE (SHA512_MB_MGR_NUM_LANES_AVX2 * 8) + +# START_FIELDS +.macro START_FIELDS + _FIELD_OFFSET = 0 + _STRUCT_ALIGN = 0 +.endm + +# FIELD name size align +.macro FIELD name size align + _FIELD_OFFSET = (_FIELD_OFFSET + (\align) - 1) & (~ ((\align)-1)) + \name = _FIELD_OFFSET + _FIELD_OFFSET = _FIELD_OFFSET + (\size) +.if (\align > _STRUCT_ALIGN) + _STRUCT_ALIGN = \align +.endif +.endm + +# END_FIELDS +.macro END_FIELDS + _FIELD_OFFSET = (_FIELD_OFFSET + _STRUCT_ALIGN-1) & (~ (_STRUCT_ALIGN-1)) +.endm + +.macro STRUCT p1 +START_FIELDS +.struc \p1 +.endm + +.macro ENDSTRUCT + tmp = _FIELD_OFFSET + END_FIELDS + tmp = (_FIELD_OFFSET - ##tmp) +.if (tmp > 0) + .lcomm tmp +.endm + +## RES_int name size align +.macro RES_int p1 p2 p3 + name = \p1 + size = \p2 + align = .\p3 + + _FIELD_OFFSET = (_FIELD_OFFSET + (align) - 1) & (~ ((align)-1)) +.align align +.lcomm name size + _FIELD_OFFSET = _FIELD_OFFSET + (size) +.if (align > _STRUCT_ALIGN) + _STRUCT_ALIGN = align +.endif +.endm + +# macro RES_B name, size [, align] +.macro RES_B _name, _size, _align=1 +RES_int _name _size _align +.endm + +# macro RES_W name, size [, align] +.macro RES_W _name, _size, _align=2 +RES_int _name 2*(_size) _align +.endm + +# macro RES_D name, size [, align] +.macro RES_D _name, _size, _align=4 +RES_int _name 4*(_size) _align +.endm + +# macro RES_Q name, size [, align] +.macro RES_Q _name, _size, _align=8 +RES_int _name 8*(_size) _align +.endm + +# macro RES_DQ name, size [, align] +.macro RES_DQ _name, _size, _align=16 +RES_int _name 16*(_size) _align +.endm + +# macro RES_Y name, size [, align] +.macro RES_Y _name, _size, _align=32 +RES_int _name 
32*(_size) _align +.endm + +# macro RES_Z name, size [, align] +.macro RES_Z _name, _size, _align=64 +RES_int _name 64*(_size) _align +.endm + +#endif + +################################################################### +### Define SHA512 Out Of Order Data Structures +################################################################### + +START_FIELDS # LANE_DATA +### name size align +FIELD _job_in_lane, 8, 8 # pointer to job object +END_FIELDS + + _LANE_DATA_size = _FIELD_OFFSET + _LANE_DATA_align = _STRUCT_ALIGN + +#################################################################### + +START_FIELDS # SHA512_ARGS_X4 +### name size align +FIELD _digest, 8*8*4, 4 # transposed digest +FIELD _data_ptr, 8*4, 8 # array of pointers to data +END_FIELDS + + _SHA512_ARGS_X4_size = _FIELD_OFFSET + _SHA512_ARGS_X4_align = _STRUCT_ALIGN + +##################################################################### + +START_FIELDS # MB_MGR +### name size align +FIELD _args, _SHA512_ARGS_X4_size, _SHA512_ARGS_X4_align +FIELD _lens, 8*4, 8 +FIELD _unused_lanes, 8, 8 +FIELD _ldata, _LANE_DATA_size*4, _LANE_DATA_align +END_FIELDS + + _MB_MGR_size = _FIELD_OFFSET + _MB_MGR_align = _STRUCT_ALIGN + +_args_digest = _args + _digest +_args_data_ptr = _args + _data_ptr + +####################################################################### + +####################################################################### +#### Define constants +####################################################################### + +#define STS_UNKNOWN 0 +#define STS_BEING_PROCESSED 1 +#define STS_COMPLETED 2 + +####################################################################### +#### Define JOB_SHA512 structure +####################################################################### + +START_FIELDS # JOB_SHA512 +### name size align +FIELD _buffer, 8, 8 # pointer to buffer +FIELD _len, 8, 8 # length in bytes +FIELD _result_digest, 8*8, 32 # Digest (output) +FIELD _status, 4, 4 +FIELD _user_data, 8, 8 
+END_FIELDS + + _JOB_SHA512_size = _FIELD_OFFSET + _JOB_SHA512_align = _STRUCT_ALIGN diff --git a/arch/x86/crypto/sha512-mb/sha512_mb_mgr_flush_avx2.S b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_flush_avx2.S new file mode 100644 index 000000000000..3ddba19a0db6 --- /dev/null +++ b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_flush_avx2.S @@ -0,0 +1,291 @@ +/* + * Flush routine for SHA512 multibuffer + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2016 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Megha Dey <megha.dey@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2016 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+#include "sha512_mb_mgr_datastruct.S"
+
+.extern sha512_x4_avx2
+
+# LINUX register definitions
+#define arg1 %rdi
+#define arg2 %rsi
+
+# idx needs to be other than arg1, arg2, rbx, r12
+#define idx %rdx
+
+# Common definitions
+#define state arg1
+#define job arg2
+#define len2 arg2
+
+#define unused_lanes %rbx
+#define lane_data %rbx
+#define tmp2 %rbx
+
+#define job_rax %rax
+#define tmp1 %rax
+#define size_offset %rax
+#define tmp %rax
+#define start_offset %rax
+
+#define tmp3 arg1
+
+#define extra_blocks arg2
+#define p arg2
+
+#define tmp4 %r8
+#define lens0 %r8
+
+#define lens1 %r9
+#define lens2 %r10
+#define lens3 %r11
+
+# LABEL prefix n : emits the label <prefix><n>:
+.macro LABEL prefix n
+\prefix\n\():
+.endm
+
+# JNE_SKIP i : jumps to skip_<i> when ZF is clear
+.macro JNE_SKIP i
+jne skip_\i
+.endm
+
+.altmacro
+.macro SET_OFFSET _offset
+offset = \_offset
+.endm
+.noaltmacro
+
+# JOB* sha512_mb_mgr_flush_avx2(MB_MGR *state)
+# arg 1 : rdi : state
+#
+# Makes progress when not every lane holds a job: each idle lane is pointed
+# at the data of a busy lane and given the 0xFFFFFFFF length sentinel, the
+# smallest pending length is passed to sha512_x4_avx2, and the job of the
+# lane that reached zero is returned in rax with its digest copied out.
+# Returns 0 in rax when all lanes are empty.
+ENTRY(sha512_mb_mgr_flush_avx2)
+	FRAME_BEGIN
+	push %rbx
+
+	# If bit (32+7) is set, then all lanes are empty
+	mov _unused_lanes(state), unused_lanes
+	bt $32+7, unused_lanes
+	jc return_null
+
+	# find a lane with a non-null job
+	# (idx stays 0 unless lane 1, 2 or 3 holds a job; the last hit wins)
+	xor idx, idx
+	offset = (_ldata + 1*_LANE_DATA_size + _job_in_lane)
+	cmpq $0, offset(state)
+	cmovne one(%rip), idx
+	offset = (_ldata + 2*_LANE_DATA_size + _job_in_lane)
+	cmpq $0, offset(state)
+	cmovne two(%rip), idx
+	offset = (_ldata + 3*_LANE_DATA_size + _job_in_lane)
+	cmpq $0, offset(state)
+	cmovne three(%rip), idx
+
+	# copy idx to empty lanes
+copy_lane_data:
+	offset = (_args + _data_ptr)
+	mov offset(state,idx,8), tmp
+
+	# unrolled over the four lanes: a lane with a null job gets the data
+	# pointer of lane idx and 0xFFFFFFFF in the upper half of its lens entry
+	I = 0
+.rep 4
+	offset = (_ldata + I * _LANE_DATA_size + _job_in_lane)
+	cmpq $0, offset(state)
+.altmacro
+	JNE_SKIP %I
+	offset = (_args + _data_ptr + 8*I)
+	mov tmp, offset(state)
+	offset = (_lens + 8*I +4)
+	movl $0xFFFFFFFF, offset(state)
+LABEL skip_ %I
+	I = (I+1)
+.noaltmacro
+.endr
+
+	# Find min length
+	# (the minimum entry is split below: low nibble -> lane index in idx,
+	#  bits above the low byte -> length in len2)
+	mov _lens + 0*8(state),lens0
+	mov lens0,idx
+	mov _lens + 1*8(state),lens1
+	cmp idx,lens1
+	cmovb lens1,idx
+	mov _lens + 2*8(state),lens2
+	cmp idx,lens2
+	cmovb lens2,idx
+	mov _lens + 3*8(state),lens3
+	cmp idx,lens3
+	cmovb lens3,idx
+	mov idx,len2
+	and $0xF,idx
+	and $~0xFF,len2
+	jz len_is_0
+
+	# subtract the minimum from every lane and store the new lengths
+	sub len2, lens0
+	sub len2, lens1
+	sub len2, lens2
+	sub len2, lens3
+	shr $32,len2
+	mov lens0, _lens + 0*8(state)
+	mov lens1, _lens + 1*8(state)
+	mov lens2, _lens + 2*8(state)
+	mov lens3, _lens + 3*8(state)
+
+	# "state" and "args" are the same address, arg1
+	# len is arg2
+	call sha512_x4_avx2
+	# state and idx are intact
+
+len_is_0:
+	# process completed job "idx"
+	imul $_LANE_DATA_size, idx, lane_data
+	lea _ldata(state, lane_data), lane_data
+
+	mov _job_in_lane(lane_data), job_rax
+	movq $0, _job_in_lane(lane_data)
+	movl $STS_COMPLETED, _status(job_rax)
+	# push the freed lane index back onto the byte-packed unused_lanes list
+	mov _unused_lanes(state), unused_lanes
+	shl $8, unused_lanes
+	or idx, unused_lanes
+	mov unused_lanes, _unused_lanes(state)
+
+	# mark the lane idle again
+	movl $0xFFFFFFFF, _lens+4(state, idx, 8)
+
+	# gather the eight 64-bit digest words of lane idx (rows are 32 bytes
+	# apart) and store them contiguously in the job
+	vmovq _args_digest+0*32(state, idx, 8), %xmm0
+	vpinsrq $1, _args_digest+1*32(state, idx, 8), %xmm0, %xmm0
+	vmovq _args_digest+2*32(state, idx, 8), %xmm1
+	vpinsrq $1, _args_digest+3*32(state, idx, 8), %xmm1, %xmm1
+	vmovq _args_digest+4*32(state, idx, 8), %xmm2
+	vpinsrq $1, _args_digest+5*32(state, idx, 8), %xmm2, %xmm2
+	vmovq _args_digest+6*32(state, idx, 8), %xmm3
+	vpinsrq $1, _args_digest+7*32(state, idx, 8), %xmm3, %xmm3
+
+	vmovdqu %xmm0, _result_digest(job_rax)
+	vmovdqu %xmm1, _result_digest+1*16(job_rax)
+	vmovdqu %xmm2, _result_digest+2*16(job_rax)
+	vmovdqu %xmm3, _result_digest+3*16(job_rax)
+
+return:
+	pop %rbx
+	FRAME_END
+	ret
+
+return_null:
+	xor job_rax, job_rax
+	jmp return
+ENDPROC(sha512_mb_mgr_flush_avx2)
+.align 16
+
+# JOB* sha512_mb_mgr_get_comp_job_avx2(MB_MGR *state)
+# arg 1 : rdi : state
+#
+# Returns in rax the job of a lane whose pending length is already zero,
+# without calling the hash routine.  Returns 0 when all lanes are empty or
+# when the smallest pending length is still non-zero.
+#
+# NOTE(review): this path trusts that every idle lane carries 0xFFFFFFFF in
+# the upper half of its lens entry.  An idle lane whose upper half is 0
+# would be selected below and its null job pointer written through.
+ENTRY(sha512_mb_mgr_get_comp_job_avx2)
+	push %rbx
+
+	mov _unused_lanes(state), unused_lanes
+	bt $(32+7), unused_lanes
+	jc .return_null
+
+	# Find min length
+	mov _lens(state),lens0
+	mov lens0,idx
+	mov _lens+1*8(state),lens1
+	cmp idx,lens1
+	cmovb lens1,idx
+	mov _lens+2*8(state),lens2
+	cmp idx,lens2
+	cmovb lens2,idx
+	mov _lens+3*8(state),lens3
+	cmp idx,lens3
+	cmovb lens3,idx
+	# any bit above the lane nibble set means no lane has reached zero
+	test $~0xF,idx
+	jnz .return_null
+	and $0xF,idx
+
+	#process completed job "idx"
+	imul $_LANE_DATA_size, idx, lane_data
+	lea _ldata(state, lane_data), lane_data
+
+	mov _job_in_lane(lane_data), job_rax
+	movq $0, _job_in_lane(lane_data)
+	movl $STS_COMPLETED, _status(job_rax)
+	mov _unused_lanes(state), unused_lanes
+	shl $8, unused_lanes
+	or idx, unused_lanes
+	mov unused_lanes, _unused_lanes(state)
+
+	movl $0xFFFFFFFF, _lens+4(state, idx, 8)
+
+	vmovq _args_digest(state, idx, 8), %xmm0
+	vpinsrq $1, _args_digest+1*32(state, idx, 8), %xmm0, %xmm0
+	vmovq _args_digest+2*32(state, idx, 8), %xmm1
+	vpinsrq $1, _args_digest+3*32(state, idx, 8), %xmm1, %xmm1
+	vmovq _args_digest+4*32(state, idx, 8), %xmm2
+	vpinsrq $1, _args_digest+5*32(state, idx, 8), %xmm2, %xmm2
+	vmovq _args_digest+6*32(state, idx, 8), %xmm3
+	vpinsrq $1, _args_digest+7*32(state, idx, 8), %xmm3, %xmm3
+
+	vmovdqu %xmm0, _result_digest+0*16(job_rax)
+	vmovdqu %xmm1, _result_digest+1*16(job_rax)
+	vmovdqu %xmm2, _result_digest+2*16(job_rax)
+	vmovdqu %xmm3, _result_digest+3*16(job_rax)
+
+	pop %rbx
+
+	ret
+
+.return_null:
+	xor job_rax, job_rax
+	pop %rbx
+	ret
+ENDPROC(sha512_mb_mgr_get_comp_job_avx2)
+.data
+
+# 64-bit constants used as cmovne memory operands in the flush routine
+.align 16
+one:
+.quad 1
+two:
+.quad 2
+three:
+.quad 3
diff --git a/arch/x86/crypto/sha512-mb/sha512_mb_mgr_init_avx2.c b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_init_avx2.c
new file mode 100644
index 000000000000..36870b26067a
--- /dev/null
+++ b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_init_avx2.c
@@ -0,0 +1,67 @@
+/*
+ * Initialization code for multi buffer SHA512 algorithm for AVX2
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ *	Megha Dey <megha.dey@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in
+ *     the documentation and/or other materials provided with the
+ *     distribution.
+ *   * Neither the name of Intel Corporation nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "sha512_mb_mgr.h"
+
+/**
+ * sha512_mb_mgr_init_avx2() - put a multi-buffer manager into the all-idle state
+ * @state: manager to initialise
+ *
+ * Each lens[] entry carries a lane's pending length in its upper 32 bits
+ * and the lane index in its low bits.  The assembly routines take the
+ * smallest entry as the next lane to complete, and they park a released
+ * lane by storing 0xFFFFFFFF into the upper half.  An idle lane therefore
+ * has to start out with that same sentinel: with a zero upper half a lane
+ * that was never used looks like a finished job, and the completion path
+ * writes the job status through its NULL job_in_lane pointer.
+ */
+void sha512_mb_mgr_init_avx2(struct sha512_mb_mgr *state)
+{
+	unsigned int j;
+
+	/* initially all lanes are unused */
+	state->lens[0] = 0xFFFFFFFF00000000ULL;
+	state->lens[1] = 0xFFFFFFFF00000001ULL;
+	state->lens[2] = 0xFFFFFFFF00000002ULL;
+	state->lens[3] = 0xFFFFFFFF00000003ULL;
+
+	/* free-lane list: indices 0..3, one per byte, ended by the 0xFF marker */
+	state->unused_lanes = 0xFF03020100;
+	for (j = 0; j < 4; j++)
+		state->ldata[j].job_in_lane = NULL;
+}
diff --git a/arch/x86/crypto/sha512-mb/sha512_mb_mgr_submit_avx2.S b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_submit_avx2.S
new file mode 100644
index 000000000000..815f07bdd1f8
--- /dev/null
+++ b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_submit_avx2.S
@@ -0,0 +1,222 @@
+/*
+ * Buffer submit code for multi buffer SHA512 algorithm
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Megha Dey <megha.dey@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2016 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+#include "sha512_mb_mgr_datastruct.S"
+
+.extern sha512_x4_avx2
+
+#define arg1 %rdi
+#define arg2 %rsi
+
+#define idx %rdx
+#define last_len %rdx
+
+#define size_offset %rcx
+#define tmp2 %rcx
+
+# Common definitions
+#define state arg1
+#define job arg2
+#define len2 arg2
+#define p2 arg2
+
+#define p %r11
+#define start_offset %r11
+
+#define unused_lanes %rbx
+
+#define job_rax %rax
+#define len %rax
+
+#define lane %r12
+#define tmp3 %r12
+#define lens3 %r12
+
+#define extra_blocks %r8
+#define lens0 %r8
+
+#define tmp %r9
+#define lens1 %r9
+
+#define lane_data %r10
+#define lens2 %r10
+
+#define DWORD_len %eax
+
+# JOB* sha512_mb_mgr_submit_avx2(MB_MGR *state, JOB *job)
+# arg 1 : rdi : state
+# arg 2 : rsi : job
+#
+# Places the job in the lane taken from the low byte of unused_lanes and
+# copies its digest and buffer pointer into the lane arguments.  While free
+# lanes remain it returns 0 in rax.  Once the list is down to the 0xFF
+# marker it passes the smallest pending length to sha512_x4_avx2 and
+# returns the job of the lane that reached zero.
+ENTRY(sha512_mb_mgr_submit_avx2)
+	FRAME_BEGIN
+	push %rbx
+	push %r12
+
+	# pop the next free lane index off the byte-packed unused_lanes list
+	mov _unused_lanes(state), unused_lanes
+	movzb %bl,lane
+	shr $8, unused_lanes
+	imul $_LANE_DATA_size, lane,lane_data
+	movl $STS_BEING_PROCESSED, _status(job)
+	lea _ldata(state, lane_data), lane_data
+	mov unused_lanes, _unused_lanes(state)
+	movl _len(job), DWORD_len
+
+	# the job length goes into the upper 32 bits of the lane lens entry
+	mov job, _job_in_lane(lane_data)
+	movl DWORD_len,_lens+4(state , lane, 8)
+
+	# Load digest words from result_digest
+	vmovdqu _result_digest+0*16(job), %xmm0
+	vmovdqu _result_digest+1*16(job), %xmm1
+	vmovdqu _result_digest+2*16(job), %xmm2
+	vmovdqu _result_digest+3*16(job), %xmm3
+
+	# scatter the eight 64-bit words into column "lane" of the digest rows
+	# (rows are 32 bytes apart)
+	vmovq %xmm0, _args_digest(state, lane, 8)
+	vpextrq $1, %xmm0, _args_digest+1*32(state , lane, 8)
+	vmovq %xmm1, _args_digest+2*32(state , lane, 8)
+	vpextrq $1, %xmm1, _args_digest+3*32(state , lane, 8)
+	vmovq %xmm2, _args_digest+4*32(state , lane, 8)
+	vpextrq $1, %xmm2, _args_digest+5*32(state , lane, 8)
+	vmovq %xmm3, _args_digest+6*32(state , lane, 8)
+	vpextrq $1, %xmm3, _args_digest+7*32(state , lane, 8)
+
+	mov _buffer(job), p
+	mov p, _args_data_ptr(state, lane, 8)
+
+	# only the 0xFF end marker left means every lane is now occupied
+	cmp $0xFF, unused_lanes
+	jne return_null
+
+start_loop:
+
+	# Find min length
+	# (the minimum entry is split below: low nibble -> lane index in idx,
+	#  bits above the low byte -> length in len2)
+	mov _lens+0*8(state),lens0
+	mov lens0,idx
+	mov _lens+1*8(state),lens1
+	cmp idx,lens1
+	cmovb lens1, idx
+	mov _lens+2*8(state),lens2
+	cmp idx,lens2
+	cmovb lens2,idx
+	mov _lens+3*8(state),lens3
+	cmp idx,lens3
+	cmovb lens3,idx
+	mov idx,len2
+	and $0xF,idx
+	and $~0xFF,len2
+	jz len_is_0
+
+	# subtract the minimum from every lane and store the new lengths
+	sub len2,lens0
+	sub len2,lens1
+	sub len2,lens2
+	sub len2,lens3
+	shr $32,len2
+	mov lens0, _lens + 0*8(state)
+	mov lens1, _lens + 1*8(state)
+	mov lens2, _lens + 2*8(state)
+	mov lens3, _lens + 3*8(state)
+
+	# "state" and "args" are the same address, arg1
+	# len is arg2
+	call sha512_x4_avx2
+	# state and idx are intact
+
+len_is_0:
+
+	# process completed job "idx"
+	imul $_LANE_DATA_size, idx, lane_data
+	lea _ldata(state, lane_data), lane_data
+
+	mov _job_in_lane(lane_data), job_rax
+	mov _unused_lanes(state), unused_lanes
+	movq $0, _job_in_lane(lane_data)
+	movl $STS_COMPLETED, _status(job_rax)
+	# push the freed lane index back onto the unused_lanes list
+	shl $8, unused_lanes
+	or idx, unused_lanes
+	mov unused_lanes, _unused_lanes(state)
+
+	# mark the lane idle again
+	movl $0xFFFFFFFF,_lens+4(state,idx,8)
+	vmovq _args_digest+0*32(state , idx, 8), %xmm0
+	vpinsrq $1, _args_digest+1*32(state , idx, 8), %xmm0, %xmm0
+	vmovq _args_digest+2*32(state , idx, 8), %xmm1
+	vpinsrq $1, _args_digest+3*32(state , idx, 8), %xmm1, %xmm1
+	vmovq _args_digest+4*32(state , idx, 8), %xmm2
+	vpinsrq $1, _args_digest+5*32(state , idx, 8), %xmm2, %xmm2
+	vmovq _args_digest+6*32(state , idx, 8), %xmm3
+	vpinsrq $1, _args_digest+7*32(state , idx, 8), %xmm3, %xmm3
+
+	vmovdqu %xmm0, _result_digest + 0*16(job_rax)
+	vmovdqu %xmm1, _result_digest + 1*16(job_rax)
+	vmovdqu %xmm2, _result_digest + 2*16(job_rax)
+	vmovdqu %xmm3, _result_digest + 3*16(job_rax)
+
+return:
+	pop %r12
+	pop %rbx
+	FRAME_END
+	ret
+
+return_null:
+	xor job_rax, job_rax
+	jmp return
+ENDPROC(sha512_mb_mgr_submit_avx2)
+.data
+
+# NOTE(review): H0..H7 are not referenced anywhere in this file, and they
+# are 32-bit words while the digest words handled above are 64-bit.
+# Presumably left over from a 32-bit-word variant - confirm before removing.
+.align 16
+H0: .int 0x6a09e667
+H1: .int 0xbb67ae85
+H2: .int 0x3c6ef372
+H3: .int 0xa54ff53a
+H4: .int 0x510e527f
+H5: .int 0x9b05688c
+H6: .int 0x1f83d9ab
+H7: .int 0x5be0cd19
diff --git a/arch/x86/crypto/sha512-mb/sha512_x4_avx2.S b/arch/x86/crypto/sha512-mb/sha512_x4_avx2.S
new file mode 100644
index 000000000000..31ab1eff6413
--- /dev/null
+++ b/arch/x86/crypto/sha512-mb/sha512_x4_avx2.S
@@ -0,0 +1,529 @@
+/*
+ * Multi-buffer SHA512 algorithm hash compute routine
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ *	Megha Dey <megha.dey@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +# code to compute quad SHA512 using AVX2 +# use YMMs to tackle the larger digest size +# outer calling routine takes care of save and restore of XMM registers +# Logic designed/laid out by JDG + +# Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15 +# Stack must be aligned to 32 bytes before call +# Linux clobbers: rax rbx rcx rsi r8 r9 r10 r11 r12 +# Linux preserves: rcx rdx rdi rbp r13 r14 r15 +# clobbers ymm0-15 + +#include <linux/linkage.h> +#include "sha512_mb_mgr_datastruct.S" + +arg1 = %rdi +arg2 = %rsi + +# Common definitions +STATE = arg1 +INP_SIZE = arg2 + +IDX = %rax +ROUND = %rbx +TBL = %r8 + +inp0 = %r9 +inp1 = %r10 +inp2 = %r11 +inp3 = %r12 + +a = %ymm0 +b = %ymm1 +c = %ymm2 +d = %ymm3 +e = %ymm4 +f = %ymm5 +g = %ymm6 +h = %ymm7 + +a0 = %ymm8 +a1 = %ymm9 +a2 = %ymm10 + +TT0 = %ymm14 +TT1 = %ymm13 +TT2 = %ymm12 +TT3 = %ymm11 +TT4 = %ymm10 +TT5 = %ymm9 + +T1 = %ymm14 +TMP = %ymm15 + +# Define stack usage +STACK_SPACE1 = SZ4*16 + NUM_SHA512_DIGEST_WORDS*SZ4 + 24 + +#define VMOVPD vmovupd +_digest = SZ4*16 + +# transpose r0, r1, r2, r3, t0, t1 +# "transpose" data in {r0..r3} using temps {t0..t3} +# Input looks like: {r0 r1 r2 r3} +# r0 = {a7 a6 a5 a4 a3 a2 a1 a0} +# r1 = {b7 b6 b5 b4 b3 b2 b1 b0} +# r2 = {c7 c6 c5 c4 c3 c2 c1 c0} +# r3 = {d7 d6 d5 d4 d3 d2 d1 d0} +# +# output looks like: {t0 r1 r0 r3} +# t0 = {d1 d0 c1 c0 b1 b0 a1 a0} +# r1 = {d3 d2 c3 c2 b3 b2 a3 a2} +# r0 = {d5 d4 c5 c4 b5 b4 a5 a4} +# r3 = {d7 d6 c7 c6 b7 b6 a7 a6} + +.macro TRANSPOSE r0 r1 r2 r3 t0 t1 + vshufps $0x44, \r1, \r0, \t0 # t0 = {b5 b4 a5 a4 b1 b0 a1 a0} + vshufps $0xEE, \r1, \r0, \r0 # r0 = {b7 b6 a7 a6 b3 b2 a3 a2} + vshufps $0x44, \r3, \r2, \t1 # t1 = {d5 d4 c5 c4 d1 d0 c1 c0} + vshufps $0xEE, \r3, \r2, \r2 # r2 = {d7 d6 c7 c6 d3 d2 c3 c2} + + vperm2f128 $0x20, \r2, \r0, \r1 # h6...a6 + vperm2f128 $0x31, \r2, \r0, \r3 # h2...a2 + vperm2f128 $0x31, \t1, \t0, \r0 # h5...a5 + vperm2f128 $0x20, \t1, \t0, \t0 # h1...a1 +.endm + +.macro ROTATE_ARGS +TMP_ = 
h +h = g +g = f +f = e +e = d +d = c +c = b +b = a +a = TMP_ +.endm + +# PRORQ reg, imm, tmp +# packed-rotate-right-double +# does a rotate by doing two shifts and an or +.macro _PRORQ reg imm tmp + vpsllq $(64-\imm),\reg,\tmp + vpsrlq $\imm,\reg, \reg + vpor \tmp,\reg, \reg +.endm + +# non-destructive +# PRORQ_nd reg, imm, tmp, src +.macro _PRORQ_nd reg imm tmp src + vpsllq $(64-\imm), \src, \tmp + vpsrlq $\imm, \src, \reg + vpor \tmp, \reg, \reg +.endm + +# PRORQ dst/src, amt +.macro PRORQ reg imm + _PRORQ \reg, \imm, TMP +.endm + +# PRORQ_nd dst, src, amt +.macro PRORQ_nd reg tmp imm + _PRORQ_nd \reg, \imm, TMP, \tmp +.endm + +#; arguments passed implicitly in preprocessor symbols i, a...h +.macro ROUND_00_15 _T1 i + PRORQ_nd a0, e, (18-14) # sig1: a0 = (e >> 4) + + vpxor g, f, a2 # ch: a2 = f^g + vpand e,a2, a2 # ch: a2 = (f^g)&e + vpxor g, a2, a2 # a2 = ch + + PRORQ_nd a1,e,41 # sig1: a1 = (e >> 25) + + offset = SZ4*(\i & 0xf) + vmovdqu \_T1,offset(%rsp) + vpaddq (TBL,ROUND,1), \_T1, \_T1 # T1 = W + K + vpxor e,a0, a0 # sig1: a0 = e ^ (e >> 5) + PRORQ a0, 14 # sig1: a0 = (e >> 6) ^ (e >> 11) + vpaddq a2, h, h # h = h + ch + PRORQ_nd a2,a,6 # sig0: a2 = (a >> 11) + vpaddq \_T1,h, h # h = h + ch + W + K + vpxor a1, a0, a0 # a0 = sigma1 + vmovdqu a,\_T1 + PRORQ_nd a1,a,39 # sig0: a1 = (a >> 22) + vpxor c, \_T1, \_T1 # maj: T1 = a^c + add $SZ4, ROUND # ROUND++ + vpand b, \_T1, \_T1 # maj: T1 = (a^c)&b + vpaddq a0, h, h + vpaddq h, d, d + vpxor a, a2, a2 # sig0: a2 = a ^ (a >> 11) + PRORQ a2,28 # sig0: a2 = (a >> 2) ^ (a >> 13) + vpxor a1, a2, a2 # a2 = sig0 + vpand c, a, a1 # maj: a1 = a&c + vpor \_T1, a1, a1 # a1 = maj + vpaddq a1, h, h # h = h + ch + W + K + maj + vpaddq a2, h, h # h = h + ch + W + K + maj + sigma0 + ROTATE_ARGS +.endm + + +#; arguments passed implicitly in preprocessor symbols i, a...h +.macro ROUND_16_XX _T1 i + vmovdqu SZ4*((\i-15)&0xf)(%rsp), \_T1 + vmovdqu SZ4*((\i-2)&0xf)(%rsp), a1 + vmovdqu \_T1, a0 + PRORQ \_T1,7 + vmovdqu a1, a2 + PRORQ 
a1,42 + vpxor a0, \_T1, \_T1 + PRORQ \_T1, 1 + vpxor a2, a1, a1 + PRORQ a1, 19 + vpsrlq $7, a0, a0 + vpxor a0, \_T1, \_T1 + vpsrlq $6, a2, a2 + vpxor a2, a1, a1 + vpaddq SZ4*((\i-16)&0xf)(%rsp), \_T1, \_T1 + vpaddq SZ4*((\i-7)&0xf)(%rsp), a1, a1 + vpaddq a1, \_T1, \_T1 + + ROUND_00_15 \_T1,\i +.endm + + +# void sha512_x4_avx2(void *STATE, const int INP_SIZE) +# arg 1 : STATE : pointer to input data +# arg 2 : INP_SIZE : size of data in blocks (assumed >= 1) +ENTRY(sha512_x4_avx2) + # general registers preserved in outer calling routine + # outer calling routine saves all the XMM registers + # save callee-saved clobbered registers to comply with C function ABI + push %r12 + push %r13 + push %r14 + push %r15 + + sub $STACK_SPACE1, %rsp + + # Load the pre-transposed incoming digest. + vmovdqu 0*SHA512_DIGEST_ROW_SIZE(STATE),a + vmovdqu 1*SHA512_DIGEST_ROW_SIZE(STATE),b + vmovdqu 2*SHA512_DIGEST_ROW_SIZE(STATE),c + vmovdqu 3*SHA512_DIGEST_ROW_SIZE(STATE),d + vmovdqu 4*SHA512_DIGEST_ROW_SIZE(STATE),e + vmovdqu 5*SHA512_DIGEST_ROW_SIZE(STATE),f + vmovdqu 6*SHA512_DIGEST_ROW_SIZE(STATE),g + vmovdqu 7*SHA512_DIGEST_ROW_SIZE(STATE),h + + lea K512_4(%rip),TBL + + # load the address of each of the 4 message lanes + # getting ready to transpose input onto stack + mov _data_ptr+0*PTR_SZ(STATE),inp0 + mov _data_ptr+1*PTR_SZ(STATE),inp1 + mov _data_ptr+2*PTR_SZ(STATE),inp2 + mov _data_ptr+3*PTR_SZ(STATE),inp3 + + xor IDX, IDX +lloop: + xor ROUND, ROUND + + # save old digest + vmovdqu a, _digest(%rsp) + vmovdqu b, _digest+1*SZ4(%rsp) + vmovdqu c, _digest+2*SZ4(%rsp) + vmovdqu d, _digest+3*SZ4(%rsp) + vmovdqu e, _digest+4*SZ4(%rsp) + vmovdqu f, _digest+5*SZ4(%rsp) + vmovdqu g, _digest+6*SZ4(%rsp) + vmovdqu h, _digest+7*SZ4(%rsp) + i = 0 +.rep 4 + vmovdqu PSHUFFLE_BYTE_FLIP_MASK(%rip), TMP + VMOVPD i*32(inp0, IDX), TT2 + VMOVPD i*32(inp1, IDX), TT1 + VMOVPD i*32(inp2, IDX), TT4 + VMOVPD i*32(inp3, IDX), TT3 + TRANSPOSE TT2, TT1, TT4, TT3, TT0, TT5 + vpshufb TMP, TT0, TT0 + vpshufb 
TMP, TT1, TT1 + vpshufb TMP, TT2, TT2 + vpshufb TMP, TT3, TT3 + ROUND_00_15 TT0,(i*4+0) + ROUND_00_15 TT1,(i*4+1) + ROUND_00_15 TT2,(i*4+2) + ROUND_00_15 TT3,(i*4+3) + i = (i+1) +.endr + add $128, IDX + + i = (i*4) + + jmp Lrounds_16_xx +.align 16 +Lrounds_16_xx: +.rep 16 + ROUND_16_XX T1, i + i = (i+1) +.endr + cmp $0xa00,ROUND + jb Lrounds_16_xx + + # add old digest + vpaddq _digest(%rsp), a, a + vpaddq _digest+1*SZ4(%rsp), b, b + vpaddq _digest+2*SZ4(%rsp), c, c + vpaddq _digest+3*SZ4(%rsp), d, d + vpaddq _digest+4*SZ4(%rsp), e, e + vpaddq _digest+5*SZ4(%rsp), f, f + vpaddq _digest+6*SZ4(%rsp), g, g + vpaddq _digest+7*SZ4(%rsp), h, h + + sub $1, INP_SIZE # unit is blocks + jne lloop + + # write back to memory (state object) the transposed digest + vmovdqu a, 0*SHA512_DIGEST_ROW_SIZE(STATE) + vmovdqu b, 1*SHA512_DIGEST_ROW_SIZE(STATE) + vmovdqu c, 2*SHA512_DIGEST_ROW_SIZE(STATE) + vmovdqu d, 3*SHA512_DIGEST_ROW_SIZE(STATE) + vmovdqu e, 4*SHA512_DIGEST_ROW_SIZE(STATE) + vmovdqu f, 5*SHA512_DIGEST_ROW_SIZE(STATE) + vmovdqu g, 6*SHA512_DIGEST_ROW_SIZE(STATE) + vmovdqu h, 7*SHA512_DIGEST_ROW_SIZE(STATE) + + # update input data pointers + add IDX, inp0 + mov inp0, _data_ptr+0*PTR_SZ(STATE) + add IDX, inp1 + mov inp1, _data_ptr+1*PTR_SZ(STATE) + add IDX, inp2 + mov inp2, _data_ptr+2*PTR_SZ(STATE) + add IDX, inp3 + mov inp3, _data_ptr+3*PTR_SZ(STATE) + + #;;;;;;;;;;;;;;; + #; Postamble + add $STACK_SPACE1, %rsp + # restore callee-saved clobbered registers + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + + # outer calling routine restores XMM and other GP registers + ret +ENDPROC(sha512_x4_avx2) + +.data +.align 64 +K512_4: + .octa 0x428a2f98d728ae22428a2f98d728ae22,\ + 0x428a2f98d728ae22428a2f98d728ae22 + .octa 0x7137449123ef65cd7137449123ef65cd,\ + 0x7137449123ef65cd7137449123ef65cd + .octa 0xb5c0fbcfec4d3b2fb5c0fbcfec4d3b2f,\ + 0xb5c0fbcfec4d3b2fb5c0fbcfec4d3b2f + .octa 0xe9b5dba58189dbbce9b5dba58189dbbc,\ + 0xe9b5dba58189dbbce9b5dba58189dbbc + .octa 
0x3956c25bf348b5383956c25bf348b538,\ + 0x3956c25bf348b5383956c25bf348b538 + .octa 0x59f111f1b605d01959f111f1b605d019,\ + 0x59f111f1b605d01959f111f1b605d019 + .octa 0x923f82a4af194f9b923f82a4af194f9b,\ + 0x923f82a4af194f9b923f82a4af194f9b + .octa 0xab1c5ed5da6d8118ab1c5ed5da6d8118,\ + 0xab1c5ed5da6d8118ab1c5ed5da6d8118 + .octa 0xd807aa98a3030242d807aa98a3030242,\ + 0xd807aa98a3030242d807aa98a3030242 + .octa 0x12835b0145706fbe12835b0145706fbe,\ + 0x12835b0145706fbe12835b0145706fbe + .octa 0x243185be4ee4b28c243185be4ee4b28c,\ + 0x243185be4ee4b28c243185be4ee4b28c + .octa 0x550c7dc3d5ffb4e2550c7dc3d5ffb4e2,\ + 0x550c7dc3d5ffb4e2550c7dc3d5ffb4e2 + .octa 0x72be5d74f27b896f72be5d74f27b896f,\ + 0x72be5d74f27b896f72be5d74f27b896f + .octa 0x80deb1fe3b1696b180deb1fe3b1696b1,\ + 0x80deb1fe3b1696b180deb1fe3b1696b1 + .octa 0x9bdc06a725c712359bdc06a725c71235,\ + 0x9bdc06a725c712359bdc06a725c71235 + .octa 0xc19bf174cf692694c19bf174cf692694,\ + 0xc19bf174cf692694c19bf174cf692694 + .octa 0xe49b69c19ef14ad2e49b69c19ef14ad2,\ + 0xe49b69c19ef14ad2e49b69c19ef14ad2 + .octa 0xefbe4786384f25e3efbe4786384f25e3,\ + 0xefbe4786384f25e3efbe4786384f25e3 + .octa 0x0fc19dc68b8cd5b50fc19dc68b8cd5b5,\ + 0x0fc19dc68b8cd5b50fc19dc68b8cd5b5 + .octa 0x240ca1cc77ac9c65240ca1cc77ac9c65,\ + 0x240ca1cc77ac9c65240ca1cc77ac9c65 + .octa 0x2de92c6f592b02752de92c6f592b0275,\ + 0x2de92c6f592b02752de92c6f592b0275 + .octa 0x4a7484aa6ea6e4834a7484aa6ea6e483,\ + 0x4a7484aa6ea6e4834a7484aa6ea6e483 + .octa 0x5cb0a9dcbd41fbd45cb0a9dcbd41fbd4,\ + 0x5cb0a9dcbd41fbd45cb0a9dcbd41fbd4 + .octa 0x76f988da831153b576f988da831153b5,\ + 0x76f988da831153b576f988da831153b5 + .octa 0x983e5152ee66dfab983e5152ee66dfab,\ + 0x983e5152ee66dfab983e5152ee66dfab + .octa 0xa831c66d2db43210a831c66d2db43210,\ + 0xa831c66d2db43210a831c66d2db43210 + .octa 0xb00327c898fb213fb00327c898fb213f,\ + 0xb00327c898fb213fb00327c898fb213f + .octa 0xbf597fc7beef0ee4bf597fc7beef0ee4,\ + 0xbf597fc7beef0ee4bf597fc7beef0ee4 + .octa 
0xc6e00bf33da88fc2c6e00bf33da88fc2,\ + 0xc6e00bf33da88fc2c6e00bf33da88fc2 + .octa 0xd5a79147930aa725d5a79147930aa725,\ + 0xd5a79147930aa725d5a79147930aa725 + .octa 0x06ca6351e003826f06ca6351e003826f,\ + 0x06ca6351e003826f06ca6351e003826f + .octa 0x142929670a0e6e70142929670a0e6e70,\ + 0x142929670a0e6e70142929670a0e6e70 + .octa 0x27b70a8546d22ffc27b70a8546d22ffc,\ + 0x27b70a8546d22ffc27b70a8546d22ffc + .octa 0x2e1b21385c26c9262e1b21385c26c926,\ + 0x2e1b21385c26c9262e1b21385c26c926 + .octa 0x4d2c6dfc5ac42aed4d2c6dfc5ac42aed,\ + 0x4d2c6dfc5ac42aed4d2c6dfc5ac42aed + .octa 0x53380d139d95b3df53380d139d95b3df,\ + 0x53380d139d95b3df53380d139d95b3df + .octa 0x650a73548baf63de650a73548baf63de,\ + 0x650a73548baf63de650a73548baf63de + .octa 0x766a0abb3c77b2a8766a0abb3c77b2a8,\ + 0x766a0abb3c77b2a8766a0abb3c77b2a8 + .octa 0x81c2c92e47edaee681c2c92e47edaee6,\ + 0x81c2c92e47edaee681c2c92e47edaee6 + .octa 0x92722c851482353b92722c851482353b,\ + 0x92722c851482353b92722c851482353b + .octa 0xa2bfe8a14cf10364a2bfe8a14cf10364,\ + 0xa2bfe8a14cf10364a2bfe8a14cf10364 + .octa 0xa81a664bbc423001a81a664bbc423001,\ + 0xa81a664bbc423001a81a664bbc423001 + .octa 0xc24b8b70d0f89791c24b8b70d0f89791,\ + 0xc24b8b70d0f89791c24b8b70d0f89791 + .octa 0xc76c51a30654be30c76c51a30654be30,\ + 0xc76c51a30654be30c76c51a30654be30 + .octa 0xd192e819d6ef5218d192e819d6ef5218,\ + 0xd192e819d6ef5218d192e819d6ef5218 + .octa 0xd69906245565a910d69906245565a910,\ + 0xd69906245565a910d69906245565a910 + .octa 0xf40e35855771202af40e35855771202a,\ + 0xf40e35855771202af40e35855771202a + .octa 0x106aa07032bbd1b8106aa07032bbd1b8,\ + 0x106aa07032bbd1b8106aa07032bbd1b8 + .octa 0x19a4c116b8d2d0c819a4c116b8d2d0c8,\ + 0x19a4c116b8d2d0c819a4c116b8d2d0c8 + .octa 0x1e376c085141ab531e376c085141ab53,\ + 0x1e376c085141ab531e376c085141ab53 + .octa 0x2748774cdf8eeb992748774cdf8eeb99,\ + 0x2748774cdf8eeb992748774cdf8eeb99 + .octa 0x34b0bcb5e19b48a834b0bcb5e19b48a8,\ + 0x34b0bcb5e19b48a834b0bcb5e19b48a8 + .octa 
0x391c0cb3c5c95a63391c0cb3c5c95a63,\ + 0x391c0cb3c5c95a63391c0cb3c5c95a63 + .octa 0x4ed8aa4ae3418acb4ed8aa4ae3418acb,\ + 0x4ed8aa4ae3418acb4ed8aa4ae3418acb + .octa 0x5b9cca4f7763e3735b9cca4f7763e373,\ + 0x5b9cca4f7763e3735b9cca4f7763e373 + .octa 0x682e6ff3d6b2b8a3682e6ff3d6b2b8a3,\ + 0x682e6ff3d6b2b8a3682e6ff3d6b2b8a3 + .octa 0x748f82ee5defb2fc748f82ee5defb2fc,\ + 0x748f82ee5defb2fc748f82ee5defb2fc + .octa 0x78a5636f43172f6078a5636f43172f60,\ + 0x78a5636f43172f6078a5636f43172f60 + .octa 0x84c87814a1f0ab7284c87814a1f0ab72,\ + 0x84c87814a1f0ab7284c87814a1f0ab72 + .octa 0x8cc702081a6439ec8cc702081a6439ec,\ + 0x8cc702081a6439ec8cc702081a6439ec + .octa 0x90befffa23631e2890befffa23631e28,\ + 0x90befffa23631e2890befffa23631e28 + .octa 0xa4506cebde82bde9a4506cebde82bde9,\ + 0xa4506cebde82bde9a4506cebde82bde9 + .octa 0xbef9a3f7b2c67915bef9a3f7b2c67915,\ + 0xbef9a3f7b2c67915bef9a3f7b2c67915 + .octa 0xc67178f2e372532bc67178f2e372532b,\ + 0xc67178f2e372532bc67178f2e372532b + .octa 0xca273eceea26619cca273eceea26619c,\ + 0xca273eceea26619cca273eceea26619c + .octa 0xd186b8c721c0c207d186b8c721c0c207,\ + 0xd186b8c721c0c207d186b8c721c0c207 + .octa 0xeada7dd6cde0eb1eeada7dd6cde0eb1e,\ + 0xeada7dd6cde0eb1eeada7dd6cde0eb1e + .octa 0xf57d4f7fee6ed178f57d4f7fee6ed178,\ + 0xf57d4f7fee6ed178f57d4f7fee6ed178 + .octa 0x06f067aa72176fba06f067aa72176fba,\ + 0x06f067aa72176fba06f067aa72176fba + .octa 0x0a637dc5a2c898a60a637dc5a2c898a6,\ + 0x0a637dc5a2c898a60a637dc5a2c898a6 + .octa 0x113f9804bef90dae113f9804bef90dae,\ + 0x113f9804bef90dae113f9804bef90dae + .octa 0x1b710b35131c471b1b710b35131c471b,\ + 0x1b710b35131c471b1b710b35131c471b + .octa 0x28db77f523047d8428db77f523047d84,\ + 0x28db77f523047d8428db77f523047d84 + .octa 0x32caab7b40c7249332caab7b40c72493,\ + 0x32caab7b40c7249332caab7b40c72493 + .octa 0x3c9ebe0a15c9bebc3c9ebe0a15c9bebc,\ + 0x3c9ebe0a15c9bebc3c9ebe0a15c9bebc + .octa 0x431d67c49c100d4c431d67c49c100d4c,\ + 0x431d67c49c100d4c431d67c49c100d4c + .octa 
0x4cc5d4becb3e42b64cc5d4becb3e42b6,\ + 0x4cc5d4becb3e42b64cc5d4becb3e42b6 + .octa 0x597f299cfc657e2a597f299cfc657e2a,\ + 0x597f299cfc657e2a597f299cfc657e2a + .octa 0x5fcb6fab3ad6faec5fcb6fab3ad6faec,\ + 0x5fcb6fab3ad6faec5fcb6fab3ad6faec + .octa 0x6c44198c4a4758176c44198c4a475817,\ + 0x6c44198c4a4758176c44198c4a475817 + +PSHUFFLE_BYTE_FLIP_MASK: .octa 0x08090a0b0c0d0e0f0001020304050607 + .octa 0x18191a1b1c1d1e1f1011121314151617 diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c index 0b17c83d027d..2b0e2a6825f3 100644 --- a/arch/x86/crypto/sha512_ssse3_glue.c +++ b/arch/x86/crypto/sha512_ssse3_glue.c @@ -346,4 +346,10 @@ MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, Supplemental SSE3 accelerated"); MODULE_ALIAS_CRYPTO("sha512"); +MODULE_ALIAS_CRYPTO("sha512-ssse3"); +MODULE_ALIAS_CRYPTO("sha512-avx"); +MODULE_ALIAS_CRYPTO("sha512-avx2"); MODULE_ALIAS_CRYPTO("sha384"); +MODULE_ALIAS_CRYPTO("sha384-ssse3"); +MODULE_ALIAS_CRYPTO("sha384-avx"); +MODULE_ALIAS_CRYPTO("sha384-avx2"); diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index ec138e538c44..9e1e27d31c6d 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -40,10 +40,10 @@ static struct thread_info *pt_regs_to_thread_info(struct pt_regs *regs) #ifdef CONFIG_CONTEXT_TRACKING /* Called on entry from user mode with IRQs off. 
*/ -__visible void enter_from_user_mode(void) +__visible inline void enter_from_user_mode(void) { CT_WARN_ON(ct_state() != CONTEXT_USER); - user_exit(); + user_exit_irqoff(); } #else static inline void enter_from_user_mode(void) {} @@ -274,7 +274,7 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs) ti->status &= ~TS_COMPAT; #endif - user_enter(); + user_enter_irqoff(); } #define SYSCALL_EXIT_WORK_FLAGS \ diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 983e5d3a0d27..0b56666e6039 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1153,3 +1153,14 @@ ENTRY(async_page_fault) jmp error_code END(async_page_fault) #endif + +ENTRY(rewind_stack_do_exit) + /* Prevent any naive code from trying to unwind to our caller. */ + xorl %ebp, %ebp + + movl PER_CPU_VAR(cpu_current_top_of_stack), %esi + leal -TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%esi), %esp + + call do_exit +1: jmp 1b +END(rewind_stack_do_exit) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 9ee0da1807ed..b846875aeea6 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1423,3 +1423,14 @@ ENTRY(ignore_sysret) mov $-ENOSYS, %eax sysret END(ignore_sysret) + +ENTRY(rewind_stack_do_exit) + /* Prevent any naive code from trying to unwind to our caller. 
*/ + xorl %ebp, %ebp + + movq PER_CPU_VAR(cpu_current_top_of_stack), %rax + leaq -TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%rax), %rsp + + call do_exit +1: jmp 1b +END(rewind_stack_do_exit) diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 555263e385c9..e9ce9c7c39b4 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -374,5 +374,5 @@ 543 x32 io_setup compat_sys_io_setup 544 x32 io_submit compat_sys_io_submit 545 x32 execveat compat_sys_execveat/ptregs -534 x32 preadv2 compat_sys_preadv2 -535 x32 pwritev2 compat_sys_pwritev2 +546 x32 preadv2 compat_sys_preadv64v2 +547 x32 pwritev2 compat_sys_pwritev64v2 diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S index 98df1fa8825c..627ecbcb2e62 100644 --- a/arch/x86/entry/thunk_64.S +++ b/arch/x86/entry/thunk_64.S @@ -8,16 +8,15 @@ #include <linux/linkage.h> #include "calling.h" #include <asm/asm.h> -#include <asm/frame.h> /* rdi: arg1 ... normal C conventions. rax is saved/restored. 
*/ .macro THUNK name, func, put_ret_addr_in_rdi=0 .globl \name .type \name, @function \name: - FRAME_BEGIN + pushq %rbp + movq %rsp, %rbp - /* this one pushes 9 elems, the next one would be %rIP */ pushq %rdi pushq %rsi pushq %rdx @@ -29,12 +28,12 @@ pushq %r11 .if \put_ret_addr_in_rdi - /* 9*8(%rsp) is return addr on stack */ - movq 9*8(%rsp), %rdi + /* 8(%rbp) is return addr on stack */ + movq 8(%rbp), %rdi .endif call \func - jmp restore + jmp .L_restore _ASM_NOKPROBE(\name) .endm @@ -55,7 +54,7 @@ #if defined(CONFIG_TRACE_IRQFLAGS) \ || defined(CONFIG_DEBUG_LOCK_ALLOC) \ || defined(CONFIG_PREEMPT) -restore: +.L_restore: popq %r11 popq %r10 popq %r9 @@ -65,7 +64,7 @@ restore: popq %rdx popq %rsi popq %rdi - FRAME_END + popq %rbp ret - _ASM_NOKPROBE(restore) + _ASM_NOKPROBE(.L_restore) #endif diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 6874da5f67fc..6ba89a1ab0e5 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -55,7 +55,7 @@ VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \ $(obj)/vdso64.so.dbg: $(src)/vdso.lds $(vobjs) FORCE $(call if_changed,vdso) -HOST_EXTRACFLAGS += -I$(srctree)/tools/include -I$(srctree)/include/uapi -I$(srctree)/arch/x86/include/uapi +HOST_EXTRACFLAGS += -I$(srctree)/tools/include -I$(srctree)/include/uapi -I$(srctree)/arch/$(SUBARCH)/include/uapi hostprogs-y += vdso2c quiet_cmd_vdso2c = VDSO2C $@ @@ -134,7 +134,7 @@ VDSO_LDFLAGS_vdso32.lds = -m32 -Wl,-m,elf_i386 -Wl,-soname=linux-gate.so.1 override obj-dirs = $(dir $(obj)) $(obj)/vdso32/ targets += vdso32/vdso32.lds -targets += vdso32/note.o vdso32/vclock_gettime.o vdso32/system_call.o +targets += vdso32/note.o vdso32/system_call.o vdso32/sigreturn.o targets += vdso32/vclock_gettime.o KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) -DBUILD_VDSO @@ -156,7 +156,8 @@ $(obj)/vdso32.so.dbg: FORCE \ $(obj)/vdso32/vdso32.lds \ $(obj)/vdso32/vclock_gettime.o \ $(obj)/vdso32/note.o \ - $(obj)/vdso32/system_call.o + 
$(obj)/vdso32/system_call.o \ + $(obj)/vdso32/sigreturn.o $(call if_changed,vdso) # @@ -193,10 +194,10 @@ vdso_img_insttargets := $(vdso_img_sodbg:%.dbg=install_%) $(MODLIB)/vdso: FORCE @mkdir -p $(MODLIB)/vdso -$(vdso_img_insttargets): install_%: $(obj)/%.dbg $(MODLIB)/vdso FORCE +$(vdso_img_insttargets): install_%: $(obj)/%.dbg $(MODLIB)/vdso $(call cmd,vdso_install) PHONY += vdso_install $(vdso_img_insttargets) -vdso_install: $(vdso_img_insttargets) FORCE +vdso_install: $(vdso_img_insttargets) clean-files := vdso32.so vdso32.so.dbg vdso64* vdso-image-*.c vdsox32.so* diff --git a/arch/x86/entry/vdso/vdso32/sigreturn.S b/arch/x86/entry/vdso/vdso32/sigreturn.S index d7ec4e251c0a..20633e026e82 100644 --- a/arch/x86/entry/vdso/vdso32/sigreturn.S +++ b/arch/x86/entry/vdso/vdso32/sigreturn.S @@ -1,11 +1,3 @@ -/* - * Common code for the sigreturn entry points in vDSO images. - * So far this code is the same for both int80 and sysenter versions. - * This file is #include'd by int80.S et al to define them first thing. - * The kernel assumes that the addresses of these routines are constant - * for all vDSO implementations. - */ - #include <linux/linkage.h> #include <asm/unistd_32.h> #include <asm/asm-offsets.h> diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S index 0109ac6cb79c..ed4bc9731cbb 100644 --- a/arch/x86/entry/vdso/vdso32/system_call.S +++ b/arch/x86/entry/vdso/vdso32/system_call.S @@ -2,16 +2,11 @@ * AT_SYSINFO entry point */ +#include <linux/linkage.h> #include <asm/dwarf2.h> #include <asm/cpufeatures.h> #include <asm/alternative-asm.h> -/* - * First get the common code for the sigreturn entry points. - * This must come first. 
- */ -#include "sigreturn.S" - .text .globl __kernel_vsyscall .type __kernel_vsyscall,@function diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index b3cf81333a54..3329844e3c43 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -12,6 +12,7 @@ #include <linux/random.h> #include <linux/elf.h> #include <linux/cpu.h> +#include <linux/ptrace.h> #include <asm/pvclock.h> #include <asm/vgtod.h> #include <asm/proto.h> @@ -97,10 +98,40 @@ static int vdso_fault(const struct vm_special_mapping *sm, return 0; } -static const struct vm_special_mapping text_mapping = { - .name = "[vdso]", - .fault = vdso_fault, -}; +static void vdso_fix_landing(const struct vdso_image *image, + struct vm_area_struct *new_vma) +{ +#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION + if (in_ia32_syscall() && image == &vdso_image_32) { + struct pt_regs *regs = current_pt_regs(); + unsigned long vdso_land = image->sym_int80_landing_pad; + unsigned long old_land_addr = vdso_land + + (unsigned long)current->mm->context.vdso; + + /* Fixing userspace landing - look at do_fast_syscall_32 */ + if (regs->ip == old_land_addr) + regs->ip = new_vma->vm_start + vdso_land; + } +#endif +} + +static int vdso_mremap(const struct vm_special_mapping *sm, + struct vm_area_struct *new_vma) +{ + unsigned long new_size = new_vma->vm_end - new_vma->vm_start; + const struct vdso_image *image = current->mm->context.vdso_image; + + if (image->size != new_size) + return -EINVAL; + + if (WARN_ON_ONCE(current->mm != new_vma->vm_mm)) + return -EFAULT; + + vdso_fix_landing(image, new_vma); + current->mm->context.vdso = (void __user *)new_vma->vm_start; + + return 0; +} static int vvar_fault(const struct vm_special_mapping *sm, struct vm_area_struct *vma, struct vm_fault *vmf) @@ -151,6 +182,12 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) struct vm_area_struct *vma; unsigned long addr, text_start; int ret = 0; + + static const struct 
vm_special_mapping vdso_mapping = { + .name = "[vdso]", + .fault = vdso_fault, + .mremap = vdso_mremap, + }; static const struct vm_special_mapping vvar_mapping = { .name = "[vvar]", .fault = vvar_fault, @@ -163,7 +200,8 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) addr = 0; } - down_write(&mm->mmap_sem); + if (down_write_killable(&mm->mmap_sem)) + return -EINTR; addr = get_unmapped_area(NULL, addr, image->size - image->sym_vvar_start, 0, 0); @@ -184,7 +222,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) image->size, VM_READ|VM_EXEC| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, - &text_mapping); + &vdso_mapping); if (IS_ERR(vma)) { ret = PTR_ERR(vma); diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index 174c2549939d..75fc719b7f31 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -96,7 +96,7 @@ static bool write_ok_or_segv(unsigned long ptr, size_t size) { /* * XXX: if access_ok, get_user, and put_user handled - * sig_on_uaccess_error, this could go away. + * sig_on_uaccess_err, this could go away. */ if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) { @@ -125,7 +125,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) struct task_struct *tsk; unsigned long caller; int vsyscall_nr, syscall_nr, tmp; - int prev_sig_on_uaccess_error; + int prev_sig_on_uaccess_err; long ret; /* @@ -221,8 +221,8 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) * With a real vsyscall, page faults cause SIGSEGV. We want to * preserve that behavior to make writing exploits harder. 
*/ - prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error; - current_thread_info()->sig_on_uaccess_error = 1; + prev_sig_on_uaccess_err = current->thread.sig_on_uaccess_err; + current->thread.sig_on_uaccess_err = 1; ret = -EFAULT; switch (vsyscall_nr) { @@ -243,7 +243,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) break; } - current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error; + current->thread.sig_on_uaccess_err = prev_sig_on_uaccess_err; check_fault: if (ret == -EFAULT) { diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index feb90f6730e8..72dea2f40fc4 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -655,8 +655,12 @@ fail: } if (event->attr.sample_type & PERF_SAMPLE_RAW) { - raw.size = sizeof(u32) + ibs_data.size; - raw.data = ibs_data.data; + raw = (struct perf_raw_record){ + .frag = { + .size = sizeof(u32) + ibs_data.size, + .data = ibs_data.data, + }, + }; data.raw = &raw; } diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 73a75aa5a66d..dfebbde2a4cc 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -263,7 +263,7 @@ static bool check_hw_exists(void) msr_fail: pr_cont("Broken PMU hardware detected, using software events only.\n"); - pr_info("%sFailed to access perfctr msr (MSR %x is %Lx)\n", + printk("%sFailed to access perfctr msr (MSR %x is %Lx)\n", boot_cpu_has(X86_FEATURE_HYPERVISOR) ? KERN_INFO : KERN_ERR, reg, val_new); @@ -1622,6 +1622,29 @@ ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, cha } EXPORT_SYMBOL_GPL(events_sysfs_show); +ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr, + char *page) +{ + struct perf_pmu_events_ht_attr *pmu_attr = + container_of(attr, struct perf_pmu_events_ht_attr, attr); + + /* + * Report conditional events depending on Hyper-Threading. 
+ * + * This is overly conservative as usually the HT special + * handling is not needed if the other CPU thread is idle. + * + * Note this does not (and cannot) handle the case when thread + * siblings are invisible, for example with virtualization + * if they are owned by some other guest. The user tool + * has to re-read when a thread sibling gets onlined later. + */ + return sprintf(page, "%s", + topology_max_smt_threads() > 1 ? + pmu_attr->event_str_ht : + pmu_attr->event_str_noht); +} + EVENT_ATTR(cpu-cycles, CPU_CYCLES ); EVENT_ATTR(instructions, INSTRUCTIONS ); EVENT_ATTR(cache-references, CACHE_REFERENCES ); @@ -2202,7 +2225,7 @@ static int backtrace_stack(void *data, char *name) static int backtrace_address(void *data, unsigned long addr, int reliable) { - struct perf_callchain_entry *entry = data; + struct perf_callchain_entry_ctx *entry = data; return perf_callchain_store(entry, addr); } @@ -2214,7 +2237,7 @@ static const struct stacktrace_ops backtrace_ops = { }; void -perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) +perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { /* TODO: We don't support guest os callchain now */ @@ -2268,7 +2291,7 @@ static unsigned long get_segment_base(unsigned int segment) #include <asm/compat.h> static inline int -perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) +perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry) { /* 32-bit process in 64-bit kernel. 
*/ unsigned long ss_base, cs_base; @@ -2283,7 +2306,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) fp = compat_ptr(ss_base + regs->bp); pagefault_disable(); - while (entry->nr < sysctl_perf_event_max_stack) { + while (entry->nr < entry->max_stack) { unsigned long bytes; frame.next_frame = 0; frame.return_address = 0; @@ -2309,17 +2332,17 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) } #else static inline int -perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) +perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry) { return 0; } #endif void -perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) +perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { struct stack_frame frame; - const void __user *fp; + const unsigned long __user *fp; if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { /* TODO: We don't support guest os callchain now */ @@ -2332,7 +2355,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) if (regs->flags & (X86_VM_MASK | PERF_EFLAGS_VM)) return; - fp = (void __user *)regs->bp; + fp = (unsigned long __user *)regs->bp; perf_callchain_store(entry, regs->ip); @@ -2343,18 +2366,19 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) return; pagefault_disable(); - while (entry->nr < sysctl_perf_event_max_stack) { + while (entry->nr < entry->max_stack) { unsigned long bytes; + frame.next_frame = NULL; frame.return_address = 0; - if (!access_ok(VERIFY_READ, fp, 16)) + if (!access_ok(VERIFY_READ, fp, sizeof(*fp) * 2)) break; - bytes = __copy_from_user_nmi(&frame.next_frame, fp, 8); + bytes = __copy_from_user_nmi(&frame.next_frame, fp, sizeof(*fp)); if (bytes != 0) break; - bytes = __copy_from_user_nmi(&frame.return_address, fp+8, 8); + bytes = __copy_from_user_nmi(&frame.return_address, fp + 1, sizeof(*fp)); if (bytes 
!= 0) break; diff --git a/arch/x86/events/intel/Makefile b/arch/x86/events/intel/Makefile index 3660b2cf245a..06c2baa51814 100644 --- a/arch/x86/events/intel/Makefile +++ b/arch/x86/events/intel/Makefile @@ -1,8 +1,8 @@ obj-$(CONFIG_CPU_SUP_INTEL) += core.o bts.o cqm.o obj-$(CONFIG_CPU_SUP_INTEL) += ds.o knc.o obj-$(CONFIG_CPU_SUP_INTEL) += lbr.o p4.o p6.o pt.o -obj-$(CONFIG_PERF_EVENTS_INTEL_RAPL) += intel-rapl.o -intel-rapl-objs := rapl.o +obj-$(CONFIG_PERF_EVENTS_INTEL_RAPL) += intel-rapl-perf.o +intel-rapl-perf-objs := rapl.o obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel-uncore.o intel-uncore-objs := uncore.o uncore_nhmex.o uncore_snb.o uncore_snbep.o obj-$(CONFIG_PERF_EVENTS_INTEL_CSTATE) += intel-cstate.o diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 7c666958a625..0974ba11e954 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -16,6 +16,7 @@ #include <asm/cpufeature.h> #include <asm/hardirq.h> +#include <asm/intel-family.h> #include <asm/apic.h> #include "../perf_event.h" @@ -115,6 +116,10 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly = INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */ INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */ + /* + * When HT is off these events can only run on the bottom 4 counters + * When HT is on, they are impacted by the HT bug and require EXCL access + */ INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */ INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ @@ -139,6 +144,10 @@ static struct event_constraint intel_ivb_event_constraints[] __read_mostly = INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */ INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ + /* + * When HT is off these events can only run on the bottom 4 counters + * When HT 
is on, they are impacted by the HT bug and require EXCL access + */ INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */ INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ @@ -177,19 +186,27 @@ static struct event_constraint intel_slm_event_constraints[] __read_mostly = EVENT_CONSTRAINT_END }; -struct event_constraint intel_skl_event_constraints[] = { +static struct event_constraint intel_skl_event_constraints[] = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ INTEL_UEVENT_CONSTRAINT(0x1c0, 0x2), /* INST_RETIRED.PREC_DIST */ + + /* + * when HT is off, these can only run on the bottom 4 counters + */ + INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_INST_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_L3_HIT_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xcd, 0xf), /* MEM_TRANS_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xc6, 0xf), /* FRONTEND_RETIRED.* */ + EVENT_CONSTRAINT_END }; static struct extra_reg intel_knl_extra_regs[] __read_mostly = { - INTEL_UEVENT_EXTRA_REG(0x01b7, - MSR_OFFCORE_RSP_0, 0x7f9ffbffffull, RSP_0), - INTEL_UEVENT_EXTRA_REG(0x02b7, - MSR_OFFCORE_RSP_1, 0x3f9ffbffffull, RSP_1), + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x799ffbb6e7ull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x399ffbffe7ull, RSP_1), EVENT_EXTRA_END }; @@ -225,14 +242,51 @@ EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3"); EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3"); EVENT_ATTR_STR(mem-stores, mem_st_snb, "event=0xcd,umask=0x2"); -struct attribute *nhm_events_attrs[] = { +static struct attribute *nhm_events_attrs[] = { EVENT_PTR(mem_ld_nhm), NULL, }; -struct attribute *snb_events_attrs[] = { +/* + * topdown 
events for Intel Core CPUs. + * + * The events are all in slots, which is a free slot in a 4 wide + * pipeline. Some events are already reported in slots, for cycle + * events we multiply by the pipeline width (4). + * + * With Hyper Threading on, topdown metrics are either summed or averaged + * between the threads of a core: (count_t0 + count_t1). + * + * For the average case the metric is always scaled to pipeline width, + * so we use factor 2 ((count_t0 + count_t1) / 2 * 4) + */ + +EVENT_ATTR_STR_HT(topdown-total-slots, td_total_slots, + "event=0x3c,umask=0x0", /* cpu_clk_unhalted.thread */ + "event=0x3c,umask=0x0,any=1"); /* cpu_clk_unhalted.thread_any */ +EVENT_ATTR_STR_HT(topdown-total-slots.scale, td_total_slots_scale, "4", "2"); +EVENT_ATTR_STR(topdown-slots-issued, td_slots_issued, + "event=0xe,umask=0x1"); /* uops_issued.any */ +EVENT_ATTR_STR(topdown-slots-retired, td_slots_retired, + "event=0xc2,umask=0x2"); /* uops_retired.retire_slots */ +EVENT_ATTR_STR(topdown-fetch-bubbles, td_fetch_bubbles, + "event=0x9c,umask=0x1"); /* idq_uops_not_delivered_core */ +EVENT_ATTR_STR_HT(topdown-recovery-bubbles, td_recovery_bubbles, + "event=0xd,umask=0x3,cmask=1", /* int_misc.recovery_cycles */ + "event=0xd,umask=0x3,cmask=1,any=1"); /* int_misc.recovery_cycles_any */ +EVENT_ATTR_STR_HT(topdown-recovery-bubbles.scale, td_recovery_bubbles_scale, + "4", "2"); + +static struct attribute *snb_events_attrs[] = { EVENT_PTR(mem_ld_snb), EVENT_PTR(mem_st_snb), + EVENT_PTR(td_slots_issued), + EVENT_PTR(td_slots_retired), + EVENT_PTR(td_fetch_bubbles), + EVENT_PTR(td_total_slots), + EVENT_PTR(td_total_slots_scale), + EVENT_PTR(td_recovery_bubbles), + EVENT_PTR(td_recovery_bubbles_scale), NULL, }; @@ -250,6 +304,10 @@ static struct event_constraint intel_hsw_event_constraints[] = { /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */ INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), + /* + * When HT is off these events can only run on the bottom 4 counters + * When HT is on, they are impacted by the 
HT bug and require EXCL access + */ INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */ INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ @@ -258,12 +316,19 @@ static struct event_constraint intel_hsw_event_constraints[] = { EVENT_CONSTRAINT_END }; -struct event_constraint intel_bdw_event_constraints[] = { +static struct event_constraint intel_bdw_event_constraints[] = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */ INTEL_UBIT_EVENT_CONSTRAINT(0x8a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_MISS */ + /* + * when HT is off, these can only run on the bottom 4 counters + */ + INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_INST_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_L3_HIT_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xcd, 0xf), /* MEM_TRANS_RETIRED.* */ EVENT_CONSTRAINT_END }; @@ -1332,6 +1397,29 @@ static __initconst const u64 atom_hw_cache_event_ids }, }; +EVENT_ATTR_STR(topdown-total-slots, td_total_slots_slm, "event=0x3c"); +EVENT_ATTR_STR(topdown-total-slots.scale, td_total_slots_scale_slm, "2"); +/* no_alloc_cycles.not_delivered */ +EVENT_ATTR_STR(topdown-fetch-bubbles, td_fetch_bubbles_slm, + "event=0xca,umask=0x50"); +EVENT_ATTR_STR(topdown-fetch-bubbles.scale, td_fetch_bubbles_scale_slm, "2"); +/* uops_retired.all */ +EVENT_ATTR_STR(topdown-slots-issued, td_slots_issued_slm, + "event=0xc2,umask=0x10"); +/* uops_retired.all */ +EVENT_ATTR_STR(topdown-slots-retired, td_slots_retired_slm, + "event=0xc2,umask=0x10"); + +static struct attribute *slm_events_attrs[] = { + EVENT_PTR(td_total_slots_slm), + EVENT_PTR(td_total_slots_scale_slm), + EVENT_PTR(td_fetch_bubbles_slm), + 
EVENT_PTR(td_fetch_bubbles_scale_slm), + EVENT_PTR(td_slots_issued_slm), + EVENT_PTR(td_slots_retired_slm), + NULL +}; + static struct extra_reg intel_slm_extra_regs[] __read_mostly = { /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ @@ -3261,11 +3349,11 @@ static int intel_snb_pebs_broken(int cpu) u32 rev = UINT_MAX; /* default to broken for unknown models */ switch (cpu_data(cpu).x86_model) { - case 42: /* SNB */ + case INTEL_FAM6_SANDYBRIDGE: rev = 0x28; break; - case 45: /* SNB-EP */ + case INTEL_FAM6_SANDYBRIDGE_X: switch (cpu_data(cpu).x86_mask) { case 6: rev = 0x618; break; case 7: rev = 0x70c; break; @@ -3302,6 +3390,13 @@ static void intel_snb_check_microcode(void) } } +static bool is_lbr_from(unsigned long msr) +{ + unsigned long lbr_from_nr = x86_pmu.lbr_from + x86_pmu.lbr_nr; + + return x86_pmu.lbr_from <= msr && msr < lbr_from_nr; +} + /* * Under certain circumstances, access certain MSR may cause #GP. * The function tests if the input MSR can be safely accessed. @@ -3322,13 +3417,24 @@ static bool check_msr(unsigned long msr, u64 mask) * Only change the bits which can be updated by wrmsrl. */ val_tmp = val_old ^ mask; + + if (is_lbr_from(msr)) + val_tmp = lbr_from_signext_quirk_wr(val_tmp); + if (wrmsrl_safe(msr, val_tmp) || rdmsrl_safe(msr, &val_new)) return false; + /* + * Quirk only affects validation in wrmsr(), so wrmsrl()'s value + * should equal rdmsrl()'s even with the quirk. + */ if (val_new != val_tmp) return false; + if (is_lbr_from(msr)) + val_old = lbr_from_signext_quirk_wr(val_old); + /* Here it's sure that the MSR can be safely accessed. * Restore the old value and return. 
*/ @@ -3437,6 +3543,13 @@ static struct attribute *hsw_events_attrs[] = { EVENT_PTR(cycles_ct), EVENT_PTR(mem_ld_hsw), EVENT_PTR(mem_st_hsw), + EVENT_PTR(td_slots_issued), + EVENT_PTR(td_slots_retired), + EVENT_PTR(td_fetch_bubbles), + EVENT_PTR(td_total_slots), + EVENT_PTR(td_total_slots_scale), + EVENT_PTR(td_recovery_bubbles), + EVENT_PTR(td_recovery_bubbles_scale), NULL }; @@ -3508,15 +3621,15 @@ __init int intel_pmu_init(void) * Install the hw-cache-events table: */ switch (boot_cpu_data.x86_model) { - case 14: /* 65nm Core "Yonah" */ + case INTEL_FAM6_CORE_YONAH: pr_cont("Core events, "); break; - case 15: /* 65nm Core2 "Merom" */ + case INTEL_FAM6_CORE2_MEROM: x86_add_quirk(intel_clovertown_quirk); - case 22: /* 65nm Core2 "Merom-L" */ - case 23: /* 45nm Core2 "Penryn" */ - case 29: /* 45nm Core2 "Dunnington (MP) */ + case INTEL_FAM6_CORE2_MEROM_L: + case INTEL_FAM6_CORE2_PENRYN: + case INTEL_FAM6_CORE2_DUNNINGTON: memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, sizeof(hw_cache_event_ids)); @@ -3527,9 +3640,9 @@ __init int intel_pmu_init(void) pr_cont("Core2 events, "); break; - case 30: /* 45nm Nehalem */ - case 26: /* 45nm Nehalem-EP */ - case 46: /* 45nm Nehalem-EX */ + case INTEL_FAM6_NEHALEM: + case INTEL_FAM6_NEHALEM_EP: + case INTEL_FAM6_NEHALEM_EX: memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs, @@ -3557,11 +3670,11 @@ __init int intel_pmu_init(void) pr_cont("Nehalem events, "); break; - case 28: /* 45nm Atom "Pineview" */ - case 38: /* 45nm Atom "Lincroft" */ - case 39: /* 32nm Atom "Penwell" */ - case 53: /* 32nm Atom "Cloverview" */ - case 54: /* 32nm Atom "Cedarview" */ + case INTEL_FAM6_ATOM_PINEVIEW: + case INTEL_FAM6_ATOM_LINCROFT: + case INTEL_FAM6_ATOM_PENWELL: + case INTEL_FAM6_ATOM_CLOVERVIEW: + case INTEL_FAM6_ATOM_CEDARVIEW: memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, sizeof(hw_cache_event_ids)); @@ -3573,9 +3686,9 @@ __init int 
intel_pmu_init(void) pr_cont("Atom events, "); break; - case 55: /* 22nm Atom "Silvermont" */ - case 76: /* 14nm Atom "Airmont" */ - case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */ + case INTEL_FAM6_ATOM_SILVERMONT1: + case INTEL_FAM6_ATOM_SILVERMONT2: + case INTEL_FAM6_ATOM_AIRMONT: memcpy(hw_cache_event_ids, slm_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs, @@ -3587,11 +3700,12 @@ __init int intel_pmu_init(void) x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints; x86_pmu.extra_regs = intel_slm_extra_regs; x86_pmu.flags |= PMU_FL_HAS_RSP_1; + x86_pmu.cpu_events = slm_events_attrs; pr_cont("Silvermont events, "); break; - case 92: /* 14nm Atom "Goldmont" */ - case 95: /* 14nm Atom "Goldmont Denverton" */ + case INTEL_FAM6_ATOM_GOLDMONT: + case INTEL_FAM6_ATOM_DENVERTON: memcpy(hw_cache_event_ids, glm_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, glm_hw_cache_extra_regs, @@ -3614,9 +3728,9 @@ __init int intel_pmu_init(void) pr_cont("Goldmont events, "); break; - case 37: /* 32nm Westmere */ - case 44: /* 32nm Westmere-EP */ - case 47: /* 32nm Westmere-EX */ + case INTEL_FAM6_WESTMERE: + case INTEL_FAM6_WESTMERE_EP: + case INTEL_FAM6_WESTMERE_EX: memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs, @@ -3643,8 +3757,8 @@ __init int intel_pmu_init(void) pr_cont("Westmere events, "); break; - case 42: /* 32nm SandyBridge */ - case 45: /* 32nm SandyBridge-E/EN/EP */ + case INTEL_FAM6_SANDYBRIDGE: + case INTEL_FAM6_SANDYBRIDGE_X: x86_add_quirk(intel_sandybridge_quirk); x86_add_quirk(intel_ht_bug); memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, @@ -3657,7 +3771,7 @@ __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_snb_event_constraints; x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints; x86_pmu.pebs_aliases = intel_pebs_aliases_snb; - if 
(boot_cpu_data.x86_model == 45) + if (boot_cpu_data.x86_model == INTEL_FAM6_SANDYBRIDGE_X) x86_pmu.extra_regs = intel_snbep_extra_regs; else x86_pmu.extra_regs = intel_snb_extra_regs; @@ -3679,8 +3793,8 @@ __init int intel_pmu_init(void) pr_cont("SandyBridge events, "); break; - case 58: /* 22nm IvyBridge */ - case 62: /* 22nm IvyBridge-EP/EX */ + case INTEL_FAM6_IVYBRIDGE: + case INTEL_FAM6_IVYBRIDGE_X: x86_add_quirk(intel_ht_bug); memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids)); @@ -3696,7 +3810,7 @@ __init int intel_pmu_init(void) x86_pmu.pebs_constraints = intel_ivb_pebs_event_constraints; x86_pmu.pebs_aliases = intel_pebs_aliases_ivb; x86_pmu.pebs_prec_dist = true; - if (boot_cpu_data.x86_model == 62) + if (boot_cpu_data.x86_model == INTEL_FAM6_IVYBRIDGE_X) x86_pmu.extra_regs = intel_snbep_extra_regs; else x86_pmu.extra_regs = intel_snb_extra_regs; @@ -3714,10 +3828,10 @@ __init int intel_pmu_init(void) break; - case 60: /* 22nm Haswell Core */ - case 63: /* 22nm Haswell Server */ - case 69: /* 22nm Haswell ULT */ - case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */ + case INTEL_FAM6_HASWELL_CORE: + case INTEL_FAM6_HASWELL_X: + case INTEL_FAM6_HASWELL_ULT: + case INTEL_FAM6_HASWELL_GT3E: x86_add_quirk(intel_ht_bug); x86_pmu.late_ack = true; memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids)); @@ -3741,10 +3855,10 @@ __init int intel_pmu_init(void) pr_cont("Haswell events, "); break; - case 61: /* 14nm Broadwell Core-M */ - case 86: /* 14nm Broadwell Xeon D */ - case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */ - case 79: /* 14nm Broadwell Server */ + case INTEL_FAM6_BROADWELL_CORE: + case INTEL_FAM6_BROADWELL_XEON_D: + case INTEL_FAM6_BROADWELL_GT3E: + case INTEL_FAM6_BROADWELL_X: x86_pmu.late_ack = true; memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); @@ -3777,7 
+3891,7 @@ __init int intel_pmu_init(void) pr_cont("Broadwell events, "); break; - case 87: /* Knights Landing Xeon Phi */ + case INTEL_FAM6_XEON_PHI_KNL: memcpy(hw_cache_event_ids, slm_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, @@ -3795,16 +3909,22 @@ __init int intel_pmu_init(void) pr_cont("Knights Landing events, "); break; - case 142: /* 14nm Kabylake Mobile */ - case 158: /* 14nm Kabylake Desktop */ - case 78: /* 14nm Skylake Mobile */ - case 94: /* 14nm Skylake Desktop */ - case 85: /* 14nm Skylake Server */ + case INTEL_FAM6_SKYLAKE_MOBILE: + case INTEL_FAM6_SKYLAKE_DESKTOP: + case INTEL_FAM6_SKYLAKE_X: + case INTEL_FAM6_KABYLAKE_MOBILE: + case INTEL_FAM6_KABYLAKE_DESKTOP: x86_pmu.late_ack = true; memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); intel_pmu_lbr_init_skl(); + /* INT_MISC.RECOVERY_CYCLES has umask 1 in Skylake */ + event_attr_td_recovery_bubbles.event_str_noht = + "event=0xd,umask=0x1,cmask=1"; + event_attr_td_recovery_bubbles.event_str_ht = + "event=0xd,umask=0x1,cmask=1,any=1"; + x86_pmu.event_constraints = intel_skl_event_constraints; x86_pmu.pebs_constraints = intel_skl_pebs_event_constraints; x86_pmu.extra_regs = intel_skl_extra_regs; @@ -3885,6 +4005,8 @@ __init int intel_pmu_init(void) x86_pmu.lbr_nr = 0; } + if (x86_pmu.lbr_nr) + pr_cont("%d-deep LBR, ", x86_pmu.lbr_nr); /* * Access extra MSR may cause #GP under certain circumstances. * E.g. 
KVM doesn't support offcore event @@ -3917,16 +4039,14 @@ __init int intel_pmu_init(void) */ static __init int fixup_ht_bug(void) { - int cpu = smp_processor_id(); - int w, c; + int c; /* * problem not present on this CPU model, nothing to do */ if (!(x86_pmu.flags & PMU_FL_EXCL_ENABLED)) return 0; - w = cpumask_weight(topology_sibling_cpumask(cpu)); - if (w > 1) { + if (topology_max_smt_threads() > 1) { pr_info("PMU erratum BJ122, BV98, HSD29 worked around, HT is on\n"); return 0; } diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 9ba4e4136a15..4c7638b91fa5 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -89,6 +89,7 @@ #include <linux/slab.h> #include <linux/perf_event.h> #include <asm/cpu_device_id.h> +#include <asm/intel-family.h> #include "../perf_event.h" MODULE_LICENSE("GPL"); @@ -511,37 +512,37 @@ static const struct cstate_model slm_cstates __initconst = { { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long) &(states) } static const struct x86_cpu_id intel_cstates_match[] __initconst = { - X86_CSTATES_MODEL(30, nhm_cstates), /* 45nm Nehalem */ - X86_CSTATES_MODEL(26, nhm_cstates), /* 45nm Nehalem-EP */ - X86_CSTATES_MODEL(46, nhm_cstates), /* 45nm Nehalem-EX */ + X86_CSTATES_MODEL(INTEL_FAM6_NEHALEM, nhm_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_NEHALEM_EP, nhm_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_NEHALEM_EX, nhm_cstates), - X86_CSTATES_MODEL(37, nhm_cstates), /* 32nm Westmere */ - X86_CSTATES_MODEL(44, nhm_cstates), /* 32nm Westmere-EP */ - X86_CSTATES_MODEL(47, nhm_cstates), /* 32nm Westmere-EX */ + X86_CSTATES_MODEL(INTEL_FAM6_WESTMERE, nhm_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_WESTMERE_EP, nhm_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_WESTMERE_EX, nhm_cstates), - X86_CSTATES_MODEL(42, snb_cstates), /* 32nm SandyBridge */ - X86_CSTATES_MODEL(45, snb_cstates), /* 32nm SandyBridge-E/EN/EP */ + X86_CSTATES_MODEL(INTEL_FAM6_SANDYBRIDGE, snb_cstates), + 
X86_CSTATES_MODEL(INTEL_FAM6_SANDYBRIDGE_X, snb_cstates), - X86_CSTATES_MODEL(58, snb_cstates), /* 22nm IvyBridge */ - X86_CSTATES_MODEL(62, snb_cstates), /* 22nm IvyBridge-EP/EX */ + X86_CSTATES_MODEL(INTEL_FAM6_IVYBRIDGE, snb_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_IVYBRIDGE_X, snb_cstates), - X86_CSTATES_MODEL(60, snb_cstates), /* 22nm Haswell Core */ - X86_CSTATES_MODEL(63, snb_cstates), /* 22nm Haswell Server */ - X86_CSTATES_MODEL(70, snb_cstates), /* 22nm Haswell + GT3e */ + X86_CSTATES_MODEL(INTEL_FAM6_HASWELL_CORE, snb_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_HASWELL_X, snb_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_HASWELL_GT3E, snb_cstates), - X86_CSTATES_MODEL(69, hswult_cstates), /* 22nm Haswell ULT */ + X86_CSTATES_MODEL(INTEL_FAM6_HASWELL_ULT, hswult_cstates), - X86_CSTATES_MODEL(55, slm_cstates), /* 22nm Atom Silvermont */ - X86_CSTATES_MODEL(77, slm_cstates), /* 22nm Atom Avoton/Rangely */ - X86_CSTATES_MODEL(76, slm_cstates), /* 22nm Atom Airmont */ + X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT1, slm_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT2, slm_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_ATOM_AIRMONT, slm_cstates), - X86_CSTATES_MODEL(61, snb_cstates), /* 14nm Broadwell Core-M */ - X86_CSTATES_MODEL(86, snb_cstates), /* 14nm Broadwell Xeon D */ - X86_CSTATES_MODEL(71, snb_cstates), /* 14nm Broadwell + GT3e */ - X86_CSTATES_MODEL(79, snb_cstates), /* 14nm Broadwell Server */ + X86_CSTATES_MODEL(INTEL_FAM6_BROADWELL_CORE, snb_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_BROADWELL_XEON_D, snb_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_BROADWELL_GT3E, snb_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_BROADWELL_X, snb_cstates), - X86_CSTATES_MODEL(78, snb_cstates), /* 14nm Skylake Mobile */ - X86_CSTATES_MODEL(94, snb_cstates), /* 14nm Skylake Desktop */ + X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_MOBILE, snb_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_DESKTOP, snb_cstates), { }, }; MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match); diff --git 
a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 9e2b40cdb05f..707d358e0dff 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -77,9 +77,11 @@ static enum { LBR_IND_JMP |\ LBR_FAR) -#define LBR_FROM_FLAG_MISPRED (1ULL << 63) -#define LBR_FROM_FLAG_IN_TX (1ULL << 62) -#define LBR_FROM_FLAG_ABORT (1ULL << 61) +#define LBR_FROM_FLAG_MISPRED BIT_ULL(63) +#define LBR_FROM_FLAG_IN_TX BIT_ULL(62) +#define LBR_FROM_FLAG_ABORT BIT_ULL(61) + +#define LBR_FROM_SIGNEXT_2MSB (BIT_ULL(60) | BIT_ULL(59)) /* * x86control flow change classification @@ -235,6 +237,97 @@ enum { LBR_VALID, }; +/* + * For formats with LBR_TSX flags (e.g. LBR_FORMAT_EIP_FLAGS2), bits 61:62 in + * MSR_LAST_BRANCH_FROM_x are the TSX flags when TSX is supported, but when + * TSX is not supported they have no consistent behavior: + * + * - For wrmsr(), bits 61:62 are considered part of the sign extension. + * - For HW updates (branch captures) bits 61:62 are always OFF and are not + * part of the sign extension. + * + * Therefore, if: + * + * 1) LBR has TSX format + * 2) CPU has no TSX support enabled + * + * ... then any value passed to wrmsr() must be sign extended to 63 bits and any + * value from rdmsr() must be converted to have a 61 bits sign extension, + * ignoring the TSX flags. + */ +static inline bool lbr_from_signext_quirk_needed(void) +{ + int lbr_format = x86_pmu.intel_cap.lbr_format; + bool tsx_support = boot_cpu_has(X86_FEATURE_HLE) || + boot_cpu_has(X86_FEATURE_RTM); + + return !tsx_support && (lbr_desc[lbr_format] & LBR_TSX); +} + +DEFINE_STATIC_KEY_FALSE(lbr_from_quirk_key); + +/* If quirk is enabled, ensure sign extension is 63 bits: */ +inline u64 lbr_from_signext_quirk_wr(u64 val) +{ + if (static_branch_unlikely(&lbr_from_quirk_key)) { + /* + * Sign extend into bits 61:62 while preserving bit 63. + * + * Quirk is enabled when TSX is disabled. Therefore TSX bits + * in val are always OFF and must be changed to be sign + * extension bits. 
Since bits 59:60 are guaranteed to be + * part of the sign extension bits, we can just copy them + * to 61:62. + */ + val |= (LBR_FROM_SIGNEXT_2MSB & val) << 2; + } + return val; +} + +/* + * If quirk is needed, ensure sign extension is 61 bits: + */ +u64 lbr_from_signext_quirk_rd(u64 val) +{ + if (static_branch_unlikely(&lbr_from_quirk_key)) { + /* + * Quirk is on when TSX is not enabled. Therefore TSX + * flags must be read as OFF. + */ + val &= ~(LBR_FROM_FLAG_IN_TX | LBR_FROM_FLAG_ABORT); + } + return val; +} + +static inline void wrlbr_from(unsigned int idx, u64 val) +{ + val = lbr_from_signext_quirk_wr(val); + wrmsrl(x86_pmu.lbr_from + idx, val); +} + +static inline void wrlbr_to(unsigned int idx, u64 val) +{ + wrmsrl(x86_pmu.lbr_to + idx, val); +} + +static inline u64 rdlbr_from(unsigned int idx) +{ + u64 val; + + rdmsrl(x86_pmu.lbr_from + idx, val); + + return lbr_from_signext_quirk_rd(val); +} + +static inline u64 rdlbr_to(unsigned int idx) +{ + u64 val; + + rdmsrl(x86_pmu.lbr_to + idx, val); + + return val; +} + static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) { int i; @@ -251,8 +344,9 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) tos = task_ctx->tos; for (i = 0; i < tos; i++) { lbr_idx = (tos - i) & mask; - wrmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]); - wrmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]); + wrlbr_from(lbr_idx, task_ctx->lbr_from[i]); + wrlbr_to (lbr_idx, task_ctx->lbr_to[i]); + if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); } @@ -262,9 +356,9 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) { - int i; unsigned lbr_idx, mask; u64 tos; + int i; if (task_ctx->lbr_callstack_users == 0) { task_ctx->lbr_stack_state = LBR_NONE; @@ -275,8 +369,8 @@ static void __intel_pmu_lbr_save(struct 
x86_perf_task_context *task_ctx) tos = intel_pmu_lbr_tos(); for (i = 0; i < tos; i++) { lbr_idx = (tos - i) & mask; - rdmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]); - rdmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]); + task_ctx->lbr_from[i] = rdlbr_from(lbr_idx); + task_ctx->lbr_to[i] = rdlbr_to(lbr_idx); if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); } @@ -452,8 +546,8 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) u16 cycles = 0; int lbr_flags = lbr_desc[lbr_format]; - rdmsrl(x86_pmu.lbr_from + lbr_idx, from); - rdmsrl(x86_pmu.lbr_to + lbr_idx, to); + from = rdlbr_from(lbr_idx); + to = rdlbr_to(lbr_idx); if (lbr_format == LBR_FORMAT_INFO && need_info) { u64 info; @@ -956,7 +1050,6 @@ void __init intel_pmu_lbr_init_core(void) * SW branch filter usage: * - compensate for lack of HW filter */ - pr_cont("4-deep LBR, "); } /* nehalem/westmere */ @@ -977,7 +1070,6 @@ void __init intel_pmu_lbr_init_nhm(void) * That requires LBR_FAR but that means far * jmp need to be filtered out */ - pr_cont("16-deep LBR, "); } /* sandy bridge */ @@ -997,7 +1089,6 @@ void __init intel_pmu_lbr_init_snb(void) * That requires LBR_FAR but that means far * jmp need to be filtered out */ - pr_cont("16-deep LBR, "); } /* haswell */ @@ -1011,7 +1102,8 @@ void intel_pmu_lbr_init_hsw(void) x86_pmu.lbr_sel_mask = LBR_SEL_MASK; x86_pmu.lbr_sel_map = hsw_lbr_sel_map; - pr_cont("16-deep LBR, "); + if (lbr_from_signext_quirk_needed()) + static_branch_enable(&lbr_from_quirk_key); } /* skylake */ @@ -1031,7 +1123,6 @@ __init void intel_pmu_lbr_init_skl(void) * That requires LBR_FAR but that means far * jmp need to be filtered out */ - pr_cont("32-deep LBR, "); } /* atom */ @@ -1057,7 +1148,6 @@ void __init intel_pmu_lbr_init_atom(void) * SW branch filter usage: * - compensate for lack of HW filter */ - pr_cont("8-deep LBR, "); } /* slm */ @@ -1088,6 +1178,4 @@ void intel_pmu_lbr_init_knl(void) 
x86_pmu.lbr_sel_mask = LBR_SEL_MASK; x86_pmu.lbr_sel_map = snb_lbr_sel_map; - - pr_cont("8-deep LBR, "); } diff --git a/arch/x86/events/intel/p4.c b/arch/x86/events/intel/p4.c index 0a5ede187d9c..eb0533558c2b 100644 --- a/arch/x86/events/intel/p4.c +++ b/arch/x86/events/intel/p4.c @@ -826,7 +826,7 @@ static int p4_hw_config(struct perf_event *event) * Clear bits we reserve to be managed by kernel itself * and never allowed from a user space */ - event->attr.config &= P4_CONFIG_MASK; + event->attr.config &= P4_CONFIG_MASK; rc = p4_validate_raw_event(event); if (rc) diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 99c4bab123cd..d0c58b35155f 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -55,6 +55,7 @@ #include <linux/slab.h> #include <linux/perf_event.h> #include <asm/cpu_device_id.h> +#include <asm/intel-family.h> #include "../perf_event.h" MODULE_LICENSE("GPL"); @@ -714,7 +715,7 @@ static void cleanup_rapl_pmus(void) int i; for (i = 0; i < rapl_pmus->maxpkg; i++) - kfree(rapl_pmus->pmus + i); + kfree(rapl_pmus->pmus[i]); kfree(rapl_pmus); } @@ -786,26 +787,27 @@ static const struct intel_rapl_init_fun skl_rapl_init __initconst = { }; static const struct x86_cpu_id rapl_cpu_match[] __initconst = { - X86_RAPL_MODEL_MATCH(42, snb_rapl_init), /* Sandy Bridge */ - X86_RAPL_MODEL_MATCH(45, snbep_rapl_init), /* Sandy Bridge-EP */ + X86_RAPL_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE, snb_rapl_init), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE_X, snbep_rapl_init), - X86_RAPL_MODEL_MATCH(58, snb_rapl_init), /* Ivy Bridge */ - X86_RAPL_MODEL_MATCH(62, snbep_rapl_init), /* IvyTown */ + X86_RAPL_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE, snb_rapl_init), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE_X, snbep_rapl_init), - X86_RAPL_MODEL_MATCH(60, hsw_rapl_init), /* Haswell */ - X86_RAPL_MODEL_MATCH(63, hsx_rapl_init), /* Haswell-Server */ - X86_RAPL_MODEL_MATCH(69, hsw_rapl_init), /* Haswell-Celeron */ - X86_RAPL_MODEL_MATCH(70, 
hsw_rapl_init), /* Haswell GT3e */ + X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_CORE, hsw_rapl_init), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_X, hsw_rapl_init), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_ULT, hsw_rapl_init), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_GT3E, hsw_rapl_init), - X86_RAPL_MODEL_MATCH(61, hsw_rapl_init), /* Broadwell */ - X86_RAPL_MODEL_MATCH(71, hsw_rapl_init), /* Broadwell-H */ - X86_RAPL_MODEL_MATCH(79, hsx_rapl_init), /* Broadwell-Server */ - X86_RAPL_MODEL_MATCH(86, hsx_rapl_init), /* Broadwell Xeon D */ + X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_CORE, hsw_rapl_init), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_GT3E, hsw_rapl_init), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_X, hsw_rapl_init), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_XEON_D, hsw_rapl_init), - X86_RAPL_MODEL_MATCH(87, knl_rapl_init), /* Knights Landing */ + X86_RAPL_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNL, knl_rapl_init), - X86_RAPL_MODEL_MATCH(78, skl_rapl_init), /* Skylake */ - X86_RAPL_MODEL_MATCH(94, skl_rapl_init), /* Skylake H/S */ + X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_MOBILE, skl_rapl_init), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_DESKTOP, skl_rapl_init), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_X, hsx_rapl_init), {}, }; diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 16c178916412..59b4974c697f 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1,4 +1,5 @@ #include <asm/cpu_device_id.h> +#include <asm/intel-family.h> #include "uncore.h" static struct intel_uncore_type *empty_uncore[] = { NULL, }; @@ -882,7 +883,7 @@ uncore_types_init(struct intel_uncore_type **types, bool setid) static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { struct intel_uncore_type *type; - struct intel_uncore_pmu *pmu; + struct intel_uncore_pmu *pmu = NULL; struct intel_uncore_box *box; int phys_id, pkg, ret; @@ -891,7 +892,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const 
struct pci_device_id *id return -ENODEV; pkg = topology_phys_to_logical_pkg(phys_id); - if (WARN_ON_ONCE(pkg < 0)) + if (pkg < 0) return -EINVAL; if (UNCORE_PCI_DEV_TYPE(id->driver_data) == UNCORE_EXTRA_PCI_DEV) { @@ -903,20 +904,37 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id } type = uncore_pci_uncores[UNCORE_PCI_DEV_TYPE(id->driver_data)]; + /* - * for performance monitoring unit with multiple boxes, - * each box has a different function id. - */ - pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)]; - /* Knights Landing uses a common PCI device ID for multiple instances of - * an uncore PMU device type. There is only one entry per device type in - * the knl_uncore_pci_ids table inspite of multiple devices present for - * some device types. Hence PCI device idx would be 0 for all devices. - * So increment pmu pointer to point to an unused array element. + * Some platforms, e.g. Knights Landing, use a common PCI device ID + * for multiple instances of an uncore PMU device type. We should check + * PCI slot and func to indicate the uncore box. */ - if (boot_cpu_data.x86_model == 87) { - while (pmu->func_id >= 0) - pmu++; + if (id->driver_data & ~0xffff) { + struct pci_driver *pci_drv = pdev->driver; + const struct pci_device_id *ids = pci_drv->id_table; + unsigned int devfn; + + while (ids && ids->vendor) { + if ((ids->vendor == pdev->vendor) && + (ids->device == pdev->device)) { + devfn = PCI_DEVFN(UNCORE_PCI_DEV_DEV(ids->driver_data), + UNCORE_PCI_DEV_FUNC(ids->driver_data)); + if (devfn == pdev->devfn) { + pmu = &type->pmus[UNCORE_PCI_DEV_IDX(ids->driver_data)]; + break; + } + } + ids++; + } + if (pmu == NULL) + return -ENODEV; + } else { + /* + * for performance monitoring unit with multiple boxes, + * each box has a different function id. 
+ */ + pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)]; } if (WARN_ON_ONCE(pmu->boxes[pkg] != NULL)) @@ -956,7 +974,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id static void uncore_pci_remove(struct pci_dev *pdev) { - struct intel_uncore_box *box = pci_get_drvdata(pdev); + struct intel_uncore_box *box; struct intel_uncore_pmu *pmu; int i, phys_id, pkg; @@ -1361,30 +1379,32 @@ static const struct intel_uncore_init_fun knl_uncore_init __initconst = { }; static const struct intel_uncore_init_fun skl_uncore_init __initconst = { + .cpu_init = skl_uncore_cpu_init, .pci_init = skl_uncore_pci_init, }; static const struct x86_cpu_id intel_uncore_match[] __initconst = { - X86_UNCORE_MODEL_MATCH(26, nhm_uncore_init), /* Nehalem */ - X86_UNCORE_MODEL_MATCH(30, nhm_uncore_init), - X86_UNCORE_MODEL_MATCH(37, nhm_uncore_init), /* Westmere */ - X86_UNCORE_MODEL_MATCH(44, nhm_uncore_init), - X86_UNCORE_MODEL_MATCH(42, snb_uncore_init), /* Sandy Bridge */ - X86_UNCORE_MODEL_MATCH(58, ivb_uncore_init), /* Ivy Bridge */ - X86_UNCORE_MODEL_MATCH(60, hsw_uncore_init), /* Haswell */ - X86_UNCORE_MODEL_MATCH(69, hsw_uncore_init), /* Haswell Celeron */ - X86_UNCORE_MODEL_MATCH(70, hsw_uncore_init), /* Haswell */ - X86_UNCORE_MODEL_MATCH(61, bdw_uncore_init), /* Broadwell */ - X86_UNCORE_MODEL_MATCH(71, bdw_uncore_init), /* Broadwell */ - X86_UNCORE_MODEL_MATCH(45, snbep_uncore_init), /* Sandy Bridge-EP */ - X86_UNCORE_MODEL_MATCH(46, nhmex_uncore_init), /* Nehalem-EX */ - X86_UNCORE_MODEL_MATCH(47, nhmex_uncore_init), /* Westmere-EX aka. 
Xeon E7 */ - X86_UNCORE_MODEL_MATCH(62, ivbep_uncore_init), /* Ivy Bridge-EP */ - X86_UNCORE_MODEL_MATCH(63, hswep_uncore_init), /* Haswell-EP */ - X86_UNCORE_MODEL_MATCH(79, bdx_uncore_init), /* BDX-EP */ - X86_UNCORE_MODEL_MATCH(86, bdx_uncore_init), /* BDX-DE */ - X86_UNCORE_MODEL_MATCH(87, knl_uncore_init), /* Knights Landing */ - X86_UNCORE_MODEL_MATCH(94, skl_uncore_init), /* SkyLake */ + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_NEHALEM_EP, nhm_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_NEHALEM, nhm_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_WESTMERE, nhm_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_WESTMERE_EP, nhm_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE, snb_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE, ivb_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_HASWELL_CORE, hsw_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_HASWELL_ULT, hsw_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_HASWELL_GT3E, hsw_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_BROADWELL_CORE, bdw_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_BROADWELL_GT3E, bdw_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE_X, snbep_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_NEHALEM_EX, nhmex_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_WESTMERE_EX, nhmex_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE_X, ivbep_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_HASWELL_X, hswep_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_BROADWELL_X, bdx_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_BROADWELL_XEON_D, bdx_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNL, knl_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE_DESKTOP,skl_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE_MOBILE, skl_uncore_init), {}, }; diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 79766b9a3580..d6063e438158 100644 --- a/arch/x86/events/intel/uncore.h +++ 
b/arch/x86/events/intel/uncore.h @@ -15,7 +15,11 @@ #define UNCORE_PMC_IDX_FIXED UNCORE_PMC_IDX_MAX_GENERIC #define UNCORE_PMC_IDX_MAX (UNCORE_PMC_IDX_FIXED + 1) +#define UNCORE_PCI_DEV_FULL_DATA(dev, func, type, idx) \ + ((dev << 24) | (func << 16) | (type << 8) | idx) #define UNCORE_PCI_DEV_DATA(type, idx) ((type << 8) | idx) +#define UNCORE_PCI_DEV_DEV(data) ((data >> 24) & 0xff) +#define UNCORE_PCI_DEV_FUNC(data) ((data >> 16) & 0xff) #define UNCORE_PCI_DEV_TYPE(data) ((data >> 8) & 0xff) #define UNCORE_PCI_DEV_IDX(data) (data & 0xff) #define UNCORE_EXTRA_PCI_DEV 0xff @@ -360,6 +364,7 @@ int bdw_uncore_pci_init(void); int skl_uncore_pci_init(void); void snb_uncore_cpu_init(void); void nhm_uncore_cpu_init(void); +void skl_uncore_cpu_init(void); int snb_pci2phy_map_init(int devid); /* perf_event_intel_uncore_snbep.c */ diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index 96531d2b843f..97a69dbba649 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -1,4 +1,4 @@ -/* Nehalem/SandBridge/Haswell uncore support */ +/* Nehalem/SandBridge/Haswell/Broadwell/Skylake uncore support */ #include "uncore.h" /* Uncore IMC PCI IDs */ @@ -9,6 +9,7 @@ #define PCI_DEVICE_ID_INTEL_HSW_U_IMC 0x0a04 #define PCI_DEVICE_ID_INTEL_BDW_IMC 0x1604 #define PCI_DEVICE_ID_INTEL_SKL_IMC 0x191f +#define PCI_DEVICE_ID_INTEL_SKL_U_IMC 0x190c /* SNB event control */ #define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff @@ -64,6 +65,10 @@ #define NHM_UNC_PERFEVTSEL0 0x3c0 #define NHM_UNC_UNCORE_PMC0 0x3b0 +/* SKL uncore global control */ +#define SKL_UNC_PERF_GLOBAL_CTL 0xe01 +#define SKL_UNC_GLOBAL_CTL_CORE_ALL ((1 << 5) - 1) + DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); @@ -179,6 +184,60 @@ void snb_uncore_cpu_init(void) snb_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; } +static void 
skl_uncore_msr_init_box(struct intel_uncore_box *box) +{ + if (box->pmu->pmu_idx == 0) { + wrmsrl(SKL_UNC_PERF_GLOBAL_CTL, + SNB_UNC_GLOBAL_CTL_EN | SKL_UNC_GLOBAL_CTL_CORE_ALL); + } +} + +static void skl_uncore_msr_exit_box(struct intel_uncore_box *box) +{ + if (box->pmu->pmu_idx == 0) + wrmsrl(SKL_UNC_PERF_GLOBAL_CTL, 0); +} + +static struct intel_uncore_ops skl_uncore_msr_ops = { + .init_box = skl_uncore_msr_init_box, + .exit_box = skl_uncore_msr_exit_box, + .disable_event = snb_uncore_msr_disable_event, + .enable_event = snb_uncore_msr_enable_event, + .read_counter = uncore_msr_read_counter, +}; + +static struct intel_uncore_type skl_uncore_cbox = { + .name = "cbox", + .num_counters = 4, + .num_boxes = 5, + .perf_ctr_bits = 44, + .fixed_ctr_bits = 48, + .perf_ctr = SNB_UNC_CBO_0_PER_CTR0, + .event_ctl = SNB_UNC_CBO_0_PERFEVTSEL0, + .fixed_ctr = SNB_UNC_FIXED_CTR, + .fixed_ctl = SNB_UNC_FIXED_CTR_CTRL, + .single_fixed = 1, + .event_mask = SNB_UNC_RAW_EVENT_MASK, + .msr_offset = SNB_UNC_CBO_MSR_OFFSET, + .ops = &skl_uncore_msr_ops, + .format_group = &snb_uncore_format_group, + .event_descs = snb_uncore_events, +}; + +static struct intel_uncore_type *skl_msr_uncores[] = { + &skl_uncore_cbox, + &snb_uncore_arb, + NULL, +}; + +void skl_uncore_cpu_init(void) +{ + uncore_msr_uncores = skl_msr_uncores; + if (skl_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) + skl_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; + snb_uncore_arb.ops = &skl_uncore_msr_ops; +} + enum { SNB_PCI_UNCORE_IMC, }; @@ -544,6 +603,11 @@ static const struct pci_device_id skl_uncore_pci_ids[] = { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_IMC), .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_U_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* end: all zeroes */ }, }; @@ -587,6 +651,7 @@ static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = { 
IMC_DEV(HSW_U_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core ULT Mobile Processor */ IMC_DEV(BDW_IMC, &bdw_uncore_pci_driver), /* 5th Gen Core U */ IMC_DEV(SKL_IMC, &skl_uncore_pci_driver), /* 6th Gen Core */ + IMC_DEV(SKL_U_IMC, &skl_uncore_pci_driver), /* 6th Gen Core U */ { /* end marker */ } }; diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index b2625867ebd1..824e54086e07 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -2164,21 +2164,101 @@ static struct intel_uncore_type *knl_pci_uncores[] = { */ static const struct pci_device_id knl_uncore_pci_ids[] = { - { /* MC UClk */ + { /* MC0 UClk */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7841), - .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_MC_UCLK, 0), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(10, 0, KNL_PCI_UNCORE_MC_UCLK, 0), }, - { /* MC DClk Channel */ + { /* MC1 UClk */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7841), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(11, 0, KNL_PCI_UNCORE_MC_UCLK, 1), + }, + { /* MC0 DClk CH 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7843), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(8, 2, KNL_PCI_UNCORE_MC_DCLK, 0), + }, + { /* MC0 DClk CH 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7843), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(8, 3, KNL_PCI_UNCORE_MC_DCLK, 1), + }, + { /* MC0 DClk CH 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7843), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(8, 4, KNL_PCI_UNCORE_MC_DCLK, 2), + }, + { /* MC1 DClk CH 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7843), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(9, 2, KNL_PCI_UNCORE_MC_DCLK, 3), + }, + { /* MC1 DClk CH 1 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7843), - .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_MC_DCLK, 0), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(9, 3, KNL_PCI_UNCORE_MC_DCLK, 4), + }, + { /* MC1 DClk CH 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7843), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(9, 4, KNL_PCI_UNCORE_MC_DCLK, 5), + }, 
+ { /* EDC0 UClk */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(15, 0, KNL_PCI_UNCORE_EDC_UCLK, 0), }, - { /* EDC UClk */ + { /* EDC1 UClk */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833), - .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_EDC_UCLK, 0), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(16, 0, KNL_PCI_UNCORE_EDC_UCLK, 1), + }, + { /* EDC2 UClk */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(17, 0, KNL_PCI_UNCORE_EDC_UCLK, 2), + }, + { /* EDC3 UClk */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 0, KNL_PCI_UNCORE_EDC_UCLK, 3), + }, + { /* EDC4 UClk */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(19, 0, KNL_PCI_UNCORE_EDC_UCLK, 4), + }, + { /* EDC5 UClk */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(20, 0, KNL_PCI_UNCORE_EDC_UCLK, 5), + }, + { /* EDC6 UClk */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(21, 0, KNL_PCI_UNCORE_EDC_UCLK, 6), + }, + { /* EDC7 UClk */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(22, 0, KNL_PCI_UNCORE_EDC_UCLK, 7), + }, + { /* EDC0 EClk */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(24, 2, KNL_PCI_UNCORE_EDC_ECLK, 0), }, - { /* EDC EClk */ + { /* EDC1 EClk */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835), - .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_EDC_ECLK, 0), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(25, 2, KNL_PCI_UNCORE_EDC_ECLK, 1), + }, + { /* EDC2 EClk */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(26, 2, KNL_PCI_UNCORE_EDC_ECLK, 2), + }, + { /* EDC3 EClk */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(27, 2, KNL_PCI_UNCORE_EDC_ECLK, 3), + }, + { /* EDC4 EClk */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835), + .driver_data = 
UNCORE_PCI_DEV_FULL_DATA(28, 2, KNL_PCI_UNCORE_EDC_ECLK, 4), + }, + { /* EDC5 EClk */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(29, 2, KNL_PCI_UNCORE_EDC_ECLK, 5), + }, + { /* EDC6 EClk */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(30, 2, KNL_PCI_UNCORE_EDC_ECLK, 6), + }, + { /* EDC7 EClk */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(31, 2, KNL_PCI_UNCORE_EDC_ECLK, 7), }, { /* M2PCIe */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7817), @@ -2868,27 +2948,10 @@ static struct intel_uncore_type bdx_uncore_cbox = { .format_group = &hswep_uncore_cbox_format_group, }; -static struct intel_uncore_type bdx_uncore_sbox = { - .name = "sbox", - .num_counters = 4, - .num_boxes = 4, - .perf_ctr_bits = 48, - .event_ctl = HSWEP_S0_MSR_PMON_CTL0, - .perf_ctr = HSWEP_S0_MSR_PMON_CTR0, - .event_mask = HSWEP_S_MSR_PMON_RAW_EVENT_MASK, - .box_ctl = HSWEP_S0_MSR_PMON_BOX_CTL, - .msr_offset = HSWEP_SBOX_MSR_OFFSET, - .ops = &hswep_uncore_sbox_msr_ops, - .format_group = &hswep_uncore_sbox_format_group, -}; - -#define BDX_MSR_UNCORE_SBOX 3 - static struct intel_uncore_type *bdx_msr_uncores[] = { &bdx_uncore_ubox, &bdx_uncore_cbox, &hswep_uncore_pcu, - &bdx_uncore_sbox, NULL, }; @@ -2897,10 +2960,6 @@ void bdx_uncore_cpu_init(void) if (bdx_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) bdx_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; uncore_msr_uncores = bdx_msr_uncores; - - /* BDX-DE doesn't have SBOX */ - if (boot_cpu_data.x86_model == 86) - uncore_msr_uncores[BDX_MSR_UNCORE_SBOX] = NULL; } static struct intel_uncore_type bdx_uncore_ha = { diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index 85ef3c2e80e0..50b3a056f96b 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -1,4 +1,5 @@ #include <linux/perf_event.h> +#include <asm/intel-family.h> enum perf_msr_id { PERF_MSR_TSC = 0, @@ -34,39 +35,43 @@ static bool test_intel(int idx) 
return false; switch (boot_cpu_data.x86_model) { - case 30: /* 45nm Nehalem */ - case 26: /* 45nm Nehalem-EP */ - case 46: /* 45nm Nehalem-EX */ - - case 37: /* 32nm Westmere */ - case 44: /* 32nm Westmere-EP */ - case 47: /* 32nm Westmere-EX */ - - case 42: /* 32nm SandyBridge */ - case 45: /* 32nm SandyBridge-E/EN/EP */ - - case 58: /* 22nm IvyBridge */ - case 62: /* 22nm IvyBridge-EP/EX */ - - case 60: /* 22nm Haswell Core */ - case 63: /* 22nm Haswell Server */ - case 69: /* 22nm Haswell ULT */ - case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */ - - case 61: /* 14nm Broadwell Core-M */ - case 86: /* 14nm Broadwell Xeon D */ - case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */ - case 79: /* 14nm Broadwell Server */ - - case 55: /* 22nm Atom "Silvermont" */ - case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */ - case 76: /* 14nm Atom "Airmont" */ + case INTEL_FAM6_NEHALEM: + case INTEL_FAM6_NEHALEM_EP: + case INTEL_FAM6_NEHALEM_EX: + + case INTEL_FAM6_WESTMERE: + case INTEL_FAM6_WESTMERE2: + case INTEL_FAM6_WESTMERE_EP: + case INTEL_FAM6_WESTMERE_EX: + + case INTEL_FAM6_SANDYBRIDGE: + case INTEL_FAM6_SANDYBRIDGE_X: + + case INTEL_FAM6_IVYBRIDGE: + case INTEL_FAM6_IVYBRIDGE_X: + + case INTEL_FAM6_HASWELL_CORE: + case INTEL_FAM6_HASWELL_X: + case INTEL_FAM6_HASWELL_ULT: + case INTEL_FAM6_HASWELL_GT3E: + + case INTEL_FAM6_BROADWELL_CORE: + case INTEL_FAM6_BROADWELL_XEON_D: + case INTEL_FAM6_BROADWELL_GT3E: + case INTEL_FAM6_BROADWELL_X: + + case INTEL_FAM6_ATOM_SILVERMONT1: + case INTEL_FAM6_ATOM_SILVERMONT2: + case INTEL_FAM6_ATOM_AIRMONT: if (idx == PERF_MSR_SMI) return true; break; - case 78: /* 14nm Skylake Mobile */ - case 94: /* 14nm Skylake Desktop */ + case INTEL_FAM6_SKYLAKE_MOBILE: + case INTEL_FAM6_SKYLAKE_DESKTOP: + case INTEL_FAM6_SKYLAKE_X: + case INTEL_FAM6_KABYLAKE_MOBILE: + case INTEL_FAM6_KABYLAKE_DESKTOP: if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF) return true; break; diff --git a/arch/x86/events/perf_event.h 
b/arch/x86/events/perf_event.h index 8bd764df815d..8c4a47706296 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -668,6 +668,14 @@ static struct perf_pmu_events_attr event_attr_##v = { \ .event_str = str, \ }; +#define EVENT_ATTR_STR_HT(_name, v, noht, ht) \ +static struct perf_pmu_events_ht_attr event_attr_##v = { \ + .attr = __ATTR(_name, 0444, events_ht_sysfs_show, NULL),\ + .id = 0, \ + .event_str_noht = noht, \ + .event_str_ht = ht, \ +} + extern struct x86_pmu x86_pmu __read_mostly; static inline bool x86_pmu_has_lbr_callstack(void) @@ -803,6 +811,8 @@ struct attribute **merge_attr(struct attribute **a, struct attribute **b); ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, char *page); +ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr, + char *page); #ifdef CONFIG_CPU_SUP_AMD @@ -892,6 +902,8 @@ void intel_ds_init(void); void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in); +u64 lbr_from_signext_quirk_wr(u64 val); + void intel_pmu_lbr_reset(void); void intel_pmu_lbr_enable(struct perf_event *event); diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index ae6aad1d24f7..cb26f18d43af 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -116,13 +116,13 @@ static struct linux_binfmt aout_format = { .min_coredump = PAGE_SIZE }; -static void set_brk(unsigned long start, unsigned long end) +static int set_brk(unsigned long start, unsigned long end) { start = PAGE_ALIGN(start); end = PAGE_ALIGN(end); if (end <= start) - return; - vm_brk(start, end - start); + return 0; + return vm_brk(start, end - start); } #ifdef CONFIG_COREDUMP @@ -321,7 +321,7 @@ static int load_aout_binary(struct linux_binprm *bprm) error = vm_brk(text_addr & PAGE_MASK, map_size); - if (error != (text_addr & PAGE_MASK)) + if (error) return error; error = read_code(bprm->file, text_addr, 32, @@ -349,7 +349,10 @@ static int load_aout_binary(struct 
linux_binprm *bprm) #endif if (!bprm->file->f_op->mmap || (fd_offset & ~PAGE_MASK) != 0) { - vm_brk(N_TXTADDR(ex), ex.a_text+ex.a_data); + error = vm_brk(N_TXTADDR(ex), ex.a_text+ex.a_data); + if (error) + return error; + read_code(bprm->file, N_TXTADDR(ex), fd_offset, ex.a_text+ex.a_data); goto beyond_if; @@ -372,10 +375,13 @@ static int load_aout_binary(struct linux_binprm *bprm) if (error != N_DATADDR(ex)) return error; } + beyond_if: - set_binfmt(&aout_format); + error = set_brk(current->mm->start_brk, current->mm->brk); + if (error) + return error; - set_brk(current->mm->start_brk, current->mm->brk); + set_binfmt(&aout_format); current->mm->start_stack = (unsigned long)create_aout_tables((char __user *)bprm->p, bprm); @@ -434,7 +440,9 @@ static int load_aout_library(struct file *file) error_time = jiffies; } #endif - vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss); + retval = vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss); + if (retval) + goto out; read_code(file, start_addr, N_TXTOFF(ex), ex.a_text + ex.a_data); @@ -453,9 +461,8 @@ static int load_aout_library(struct file *file) len = PAGE_ALIGN(ex.a_text + ex.a_data); bss = ex.a_text + ex.a_data + ex.a_bss; if (bss > len) { - error = vm_brk(start_addr + len, bss - len); - retval = error; - if (error != start_addr + len) + retval = vm_brk(start_addr + len, bss - len); + if (retval) goto out; } retval = 0; diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index aeac434c9feb..2cfed174e3c9 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -1,5 +1,11 @@ +generated-y += syscalls_32.h +generated-y += syscalls_64.h +generated-y += unistd_32_ia32.h +generated-y += unistd_64_x32.h +generated-y += xen-hypercalls.h + genhdr-y += unistd_32.h genhdr-y += unistd_64.h genhdr-y += unistd_x32.h diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 94c18ebfd68c..5391b0ae7cc3 100644 --- a/arch/x86/include/asm/acpi.h +++ 
b/arch/x86/include/asm/acpi.h @@ -145,7 +145,6 @@ static inline void disable_acpi(void) { } #define ARCH_HAS_POWER_INIT 1 #ifdef CONFIG_ACPI_NUMA -extern int acpi_numa; extern int x86_acpi_numa_init(void); #endif /* CONFIG_ACPI_NUMA */ @@ -170,4 +169,6 @@ static inline pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr) } #endif +#define ACPI_TABLE_UPGRADE_MAX_PHYS (max_low_pfn_mapped << PAGE_SHIFT) + #endif /* _ASM_X86_ACPI_H */ diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index bc27611fa58f..f5befd4945f2 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -300,7 +300,6 @@ struct apic { unsigned int (*get_apic_id)(unsigned long x); unsigned long (*set_apic_id)(unsigned int id); - unsigned long apic_id_mask; int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask, const struct cpumask *andmask, diff --git a/arch/x86/include/asm/apm.h b/arch/x86/include/asm/apm.h index 20370c6db74b..93eebc636c76 100644 --- a/arch/x86/include/asm/apm.h +++ b/arch/x86/include/asm/apm.h @@ -45,11 +45,11 @@ static inline void apm_bios_call_asm(u32 func, u32 ebx_in, u32 ecx_in, : "memory", "cc"); } -static inline u8 apm_bios_call_simple_asm(u32 func, u32 ebx_in, - u32 ecx_in, u32 *eax) +static inline bool apm_bios_call_simple_asm(u32 func, u32 ebx_in, + u32 ecx_in, u32 *eax) { int cx, dx, si; - u8 error; + bool error; /* * N.B. 
We do NOT need a cld after the BIOS call diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h index 02e799fa43d1..e7cd63175de4 100644 --- a/arch/x86/include/asm/arch_hweight.h +++ b/arch/x86/include/asm/arch_hweight.h @@ -4,8 +4,8 @@ #include <asm/cpufeatures.h> #ifdef CONFIG_64BIT -/* popcnt %edi, %eax -- redundant REX prefix for alignment */ -#define POPCNT32 ".byte 0xf3,0x40,0x0f,0xb8,0xc7" +/* popcnt %edi, %eax */ +#define POPCNT32 ".byte 0xf3,0x0f,0xb8,0xc7" /* popcnt %rdi, %rax */ #define POPCNT64 ".byte 0xf3,0x48,0x0f,0xb8,0xc7" #define REG_IN "D" @@ -17,19 +17,15 @@ #define REG_OUT "a" #endif -/* - * __sw_hweightXX are called from within the alternatives below - * and callee-clobbered registers need to be taken care of. See - * ARCH_HWEIGHT_CFLAGS in <arch/x86/Kconfig> for the respective - * compiler switches. - */ +#define __HAVE_ARCH_SW_HWEIGHT + static __always_inline unsigned int __arch_hweight32(unsigned int w) { - unsigned int res = 0; + unsigned int res; asm (ALTERNATIVE("call __sw_hweight32", POPCNT32, X86_FEATURE_POPCNT) - : "="REG_OUT (res) - : REG_IN (w)); + : "="REG_OUT (res) + : REG_IN (w)); return res; } @@ -53,11 +49,11 @@ static inline unsigned long __arch_hweight64(__u64 w) #else static __always_inline unsigned long __arch_hweight64(__u64 w) { - unsigned long res = 0; + unsigned long res; asm (ALTERNATIVE("call __sw_hweight64", POPCNT64, X86_FEATURE_POPCNT) - : "="REG_OUT (res) - : REG_IN (w)); + : "="REG_OUT (res) + : REG_IN (w)); return res; } diff --git a/arch/x86/include/asm/archrandom.h b/arch/x86/include/asm/archrandom.h index 69f1366f1aa3..5b0579abb398 100644 --- a/arch/x86/include/asm/archrandom.h +++ b/arch/x86/include/asm/archrandom.h @@ -25,8 +25,6 @@ #include <asm/processor.h> #include <asm/cpufeature.h> -#include <asm/alternative.h> -#include <asm/nops.h> #define RDRAND_RETRY_LOOPS 10 @@ -40,97 +38,91 @@ # define RDSEED_LONG RDSEED_INT #endif -#ifdef CONFIG_ARCH_RANDOM +/* Unconditional 
execution of RDRAND and RDSEED */ -/* Instead of arch_get_random_long() when alternatives haven't run. */ -static inline int rdrand_long(unsigned long *v) +static inline bool rdrand_long(unsigned long *v) { - int ok; - asm volatile("1: " RDRAND_LONG "\n\t" - "jc 2f\n\t" - "decl %0\n\t" - "jnz 1b\n\t" - "2:" - : "=r" (ok), "=a" (*v) - : "0" (RDRAND_RETRY_LOOPS)); - return ok; + bool ok; + unsigned int retry = RDRAND_RETRY_LOOPS; + do { + asm volatile(RDRAND_LONG "\n\t" + CC_SET(c) + : CC_OUT(c) (ok), "=a" (*v)); + if (ok) + return true; + } while (--retry); + return false; +} + +static inline bool rdrand_int(unsigned int *v) +{ + bool ok; + unsigned int retry = RDRAND_RETRY_LOOPS; + do { + asm volatile(RDRAND_INT "\n\t" + CC_SET(c) + : CC_OUT(c) (ok), "=a" (*v)); + if (ok) + return true; + } while (--retry); + return false; } -/* A single attempt at RDSEED */ static inline bool rdseed_long(unsigned long *v) { - unsigned char ok; + bool ok; asm volatile(RDSEED_LONG "\n\t" - "setc %0" - : "=qm" (ok), "=a" (*v)); + CC_SET(c) + : CC_OUT(c) (ok), "=a" (*v)); return ok; } -#define GET_RANDOM(name, type, rdrand, nop) \ -static inline int name(type *v) \ -{ \ - int ok; \ - alternative_io("movl $0, %0\n\t" \ - nop, \ - "\n1: " rdrand "\n\t" \ - "jc 2f\n\t" \ - "decl %0\n\t" \ - "jnz 1b\n\t" \ - "2:", \ - X86_FEATURE_RDRAND, \ - ASM_OUTPUT2("=r" (ok), "=a" (*v)), \ - "0" (RDRAND_RETRY_LOOPS)); \ - return ok; \ -} - -#define GET_SEED(name, type, rdseed, nop) \ -static inline int name(type *v) \ -{ \ - unsigned char ok; \ - alternative_io("movb $0, %0\n\t" \ - nop, \ - rdseed "\n\t" \ - "setc %0", \ - X86_FEATURE_RDSEED, \ - ASM_OUTPUT2("=q" (ok), "=a" (*v))); \ - return ok; \ +static inline bool rdseed_int(unsigned int *v) +{ + bool ok; + asm volatile(RDSEED_INT "\n\t" + CC_SET(c) + : CC_OUT(c) (ok), "=a" (*v)); + return ok; } -#ifdef CONFIG_X86_64 - -GET_RANDOM(arch_get_random_long, unsigned long, RDRAND_LONG, ASM_NOP5); -GET_RANDOM(arch_get_random_int, unsigned int, 
RDRAND_INT, ASM_NOP4); - -GET_SEED(arch_get_random_seed_long, unsigned long, RDSEED_LONG, ASM_NOP5); -GET_SEED(arch_get_random_seed_int, unsigned int, RDSEED_INT, ASM_NOP4); - -#else - -GET_RANDOM(arch_get_random_long, unsigned long, RDRAND_LONG, ASM_NOP3); -GET_RANDOM(arch_get_random_int, unsigned int, RDRAND_INT, ASM_NOP3); - -GET_SEED(arch_get_random_seed_long, unsigned long, RDSEED_LONG, ASM_NOP4); -GET_SEED(arch_get_random_seed_int, unsigned int, RDSEED_INT, ASM_NOP4); - -#endif /* CONFIG_X86_64 */ - +/* Conditional execution based on CPU type */ #define arch_has_random() static_cpu_has(X86_FEATURE_RDRAND) #define arch_has_random_seed() static_cpu_has(X86_FEATURE_RDSEED) -#else +/* + * These are the generic interfaces; they must not be declared if the + * stubs in <linux/random.h> are to be invoked, + * i.e. CONFIG_ARCH_RANDOM is not defined. + */ +#ifdef CONFIG_ARCH_RANDOM -static inline int rdrand_long(unsigned long *v) +static inline bool arch_get_random_long(unsigned long *v) { - return 0; + return arch_has_random() ? rdrand_long(v) : false; } -static inline bool rdseed_long(unsigned long *v) +static inline bool arch_get_random_int(unsigned int *v) { - return 0; + return arch_has_random() ? rdrand_int(v) : false; } -#endif /* CONFIG_ARCH_RANDOM */ +static inline bool arch_get_random_seed_long(unsigned long *v) +{ + return arch_has_random_seed() ? rdseed_long(v) : false; +} + +static inline bool arch_get_random_seed_int(unsigned int *v) +{ + return arch_has_random_seed() ? 
rdseed_int(v) : false; +} extern void x86_init_rdrand(struct cpuinfo_x86 *c); +#else /* !CONFIG_ARCH_RANDOM */ + +static inline void x86_init_rdrand(struct cpuinfo_x86 *c) { } + +#endif /* !CONFIG_ARCH_RANDOM */ + #endif /* ASM_X86_ARCHRANDOM_H */ diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index f5063b6659eb..7acb51c49fec 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -42,6 +42,18 @@ #define _ASM_SI __ASM_REG(si) #define _ASM_DI __ASM_REG(di) +/* + * Macros to generate condition code outputs from inline assembly, + * The output operand must be type "bool". + */ +#ifdef __GCC_ASM_FLAG_OUTPUTS__ +# define CC_SET(c) "\n\t/* output condition code " #c "*/\n" +# define CC_OUT(c) "=@cc" #c +#else +# define CC_SET(c) "\n\tset" #c " %[_cc_" #c "]\n" +# define CC_OUT(c) [_cc_ ## c] "=qm" +#endif + /* Exception table entry */ #ifdef __ASSEMBLY__ # define _ASM_EXTABLE_HANDLE(from, to, handler) \ diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index 3e8674288198..14635c5ea025 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -75,9 +75,9 @@ static __always_inline void atomic_sub(int i, atomic_t *v) * true if the result is zero, or false for all * other cases. */ -static __always_inline int atomic_sub_and_test(int i, atomic_t *v) +static __always_inline bool atomic_sub_and_test(int i, atomic_t *v) { - GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, "er", i, "%0", "e"); + GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, "er", i, "%0", e); } /** @@ -112,9 +112,9 @@ static __always_inline void atomic_dec(atomic_t *v) * returns true if the result is 0, or false for all other * cases. 
*/ -static __always_inline int atomic_dec_and_test(atomic_t *v) +static __always_inline bool atomic_dec_and_test(atomic_t *v) { - GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, "%0", "e"); + GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, "%0", e); } /** @@ -125,9 +125,9 @@ static __always_inline int atomic_dec_and_test(atomic_t *v) * and returns true if the result is zero, or false for all * other cases. */ -static __always_inline int atomic_inc_and_test(atomic_t *v) +static __always_inline bool atomic_inc_and_test(atomic_t *v) { - GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, "%0", "e"); + GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, "%0", e); } /** @@ -139,9 +139,9 @@ static __always_inline int atomic_inc_and_test(atomic_t *v) * if the result is negative, or false when * result is greater than or equal to zero. */ -static __always_inline int atomic_add_negative(int i, atomic_t *v) +static __always_inline bool atomic_add_negative(int i, atomic_t *v) { - GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, "er", i, "%0", "s"); + GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, "er", i, "%0", s); } /** @@ -171,6 +171,16 @@ static __always_inline int atomic_sub_return(int i, atomic_t *v) #define atomic_inc_return(v) (atomic_add_return(1, v)) #define atomic_dec_return(v) (atomic_sub_return(1, v)) +static __always_inline int atomic_fetch_add(int i, atomic_t *v) +{ + return xadd(&v->counter, i); +} + +static __always_inline int atomic_fetch_sub(int i, atomic_t *v) +{ + return xadd(&v->counter, -i); +} + static __always_inline int atomic_cmpxchg(atomic_t *v, int old, int new) { return cmpxchg(&v->counter, old, new); @@ -190,10 +200,29 @@ static inline void atomic_##op(int i, atomic_t *v) \ : "memory"); \ } -ATOMIC_OP(and) -ATOMIC_OP(or) -ATOMIC_OP(xor) +#define ATOMIC_FETCH_OP(op, c_op) \ +static inline int atomic_fetch_##op(int i, atomic_t *v) \ +{ \ + int old, val = atomic_read(v); \ + for (;;) { \ + old = atomic_cmpxchg(v, val, val c_op i); \ + if (old == val) \ + 
break; \ + val = old; \ + } \ + return old; \ +} + +#define ATOMIC_OPS(op, c_op) \ + ATOMIC_OP(op) \ + ATOMIC_FETCH_OP(op, c_op) + +ATOMIC_OPS(and, &) +ATOMIC_OPS(or , |) +ATOMIC_OPS(xor, ^) +#undef ATOMIC_OPS +#undef ATOMIC_FETCH_OP #undef ATOMIC_OP /** diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h index a984111135b1..71d7705fb303 100644 --- a/arch/x86/include/asm/atomic64_32.h +++ b/arch/x86/include/asm/atomic64_32.h @@ -320,10 +320,29 @@ static inline void atomic64_##op(long long i, atomic64_t *v) \ c = old; \ } -ATOMIC64_OP(and, &) -ATOMIC64_OP(or, |) -ATOMIC64_OP(xor, ^) +#define ATOMIC64_FETCH_OP(op, c_op) \ +static inline long long atomic64_fetch_##op(long long i, atomic64_t *v) \ +{ \ + long long old, c = 0; \ + while ((old = atomic64_cmpxchg(v, c, c c_op i)) != c) \ + c = old; \ + return old; \ +} + +ATOMIC64_FETCH_OP(add, +) + +#define atomic64_fetch_sub(i, v) atomic64_fetch_add(-(i), (v)) + +#define ATOMIC64_OPS(op, c_op) \ + ATOMIC64_OP(op, c_op) \ + ATOMIC64_FETCH_OP(op, c_op) + +ATOMIC64_OPS(and, &) +ATOMIC64_OPS(or, |) +ATOMIC64_OPS(xor, ^) +#undef ATOMIC64_OPS +#undef ATOMIC64_FETCH_OP #undef ATOMIC64_OP #endif /* _ASM_X86_ATOMIC64_32_H */ diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index 037351022f54..89ed2f6ae2f7 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -70,9 +70,9 @@ static inline void atomic64_sub(long i, atomic64_t *v) * true if the result is zero, or false for all * other cases. */ -static inline int atomic64_sub_and_test(long i, atomic64_t *v) +static inline bool atomic64_sub_and_test(long i, atomic64_t *v) { - GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, "er", i, "%0", "e"); + GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, "er", i, "%0", e); } /** @@ -109,9 +109,9 @@ static __always_inline void atomic64_dec(atomic64_t *v) * returns true if the result is 0, or false for all other * cases. 
*/ -static inline int atomic64_dec_and_test(atomic64_t *v) +static inline bool atomic64_dec_and_test(atomic64_t *v) { - GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, "%0", "e"); + GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, "%0", e); } /** @@ -122,9 +122,9 @@ static inline int atomic64_dec_and_test(atomic64_t *v) * and returns true if the result is zero, or false for all * other cases. */ -static inline int atomic64_inc_and_test(atomic64_t *v) +static inline bool atomic64_inc_and_test(atomic64_t *v) { - GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, "%0", "e"); + GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, "%0", e); } /** @@ -136,9 +136,9 @@ static inline int atomic64_inc_and_test(atomic64_t *v) * if the result is negative, or false when * result is greater than or equal to zero. */ -static inline int atomic64_add_negative(long i, atomic64_t *v) +static inline bool atomic64_add_negative(long i, atomic64_t *v) { - GEN_BINARY_RMWcc(LOCK_PREFIX "addq", v->counter, "er", i, "%0", "s"); + GEN_BINARY_RMWcc(LOCK_PREFIX "addq", v->counter, "er", i, "%0", s); } /** @@ -158,6 +158,16 @@ static inline long atomic64_sub_return(long i, atomic64_t *v) return atomic64_add_return(-i, v); } +static inline long atomic64_fetch_add(long i, atomic64_t *v) +{ + return xadd(&v->counter, i); +} + +static inline long atomic64_fetch_sub(long i, atomic64_t *v) +{ + return xadd(&v->counter, -i); +} + #define atomic64_inc_return(v) (atomic64_add_return(1, (v))) #define atomic64_dec_return(v) (atomic64_sub_return(1, (v))) @@ -180,7 +190,7 @@ static inline long atomic64_xchg(atomic64_t *v, long new) * Atomically adds @a to @v, so long as it was not @u. * Returns the old value of @v. 
*/ -static inline int atomic64_add_unless(atomic64_t *v, long a, long u) +static inline bool atomic64_add_unless(atomic64_t *v, long a, long u) { long c, old; c = atomic64_read(v); @@ -229,10 +239,29 @@ static inline void atomic64_##op(long i, atomic64_t *v) \ : "memory"); \ } -ATOMIC64_OP(and) -ATOMIC64_OP(or) -ATOMIC64_OP(xor) +#define ATOMIC64_FETCH_OP(op, c_op) \ +static inline long atomic64_fetch_##op(long i, atomic64_t *v) \ +{ \ + long old, val = atomic64_read(v); \ + for (;;) { \ + old = atomic64_cmpxchg(v, val, val c_op i); \ + if (old == val) \ + break; \ + val = old; \ + } \ + return old; \ +} + +#define ATOMIC64_OPS(op, c_op) \ + ATOMIC64_OP(op) \ + ATOMIC64_FETCH_OP(op, c_op) + +ATOMIC64_OPS(and, &) +ATOMIC64_OPS(or, |) +ATOMIC64_OPS(xor, ^) +#undef ATOMIC64_OPS +#undef ATOMIC64_FETCH_OP #undef ATOMIC64_OP #endif /* _ASM_X86_ATOMIC64_64_H */ diff --git a/arch/x86/include/asm/bios_ebda.h b/arch/x86/include/asm/bios_ebda.h index 2b00c776f223..4b7b8e71607e 100644 --- a/arch/x86/include/asm/bios_ebda.h +++ b/arch/x86/include/asm/bios_ebda.h @@ -17,7 +17,7 @@ static inline unsigned int get_bios_ebda(void) return address; /* 0 means none */ } -void reserve_ebda_region(void); +void reserve_bios_regions(void); #ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION /* diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 7766d1cf096e..68557f52b961 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -201,9 +201,9 @@ static __always_inline void change_bit(long nr, volatile unsigned long *addr) * This operation is atomic and cannot be reordered. * It also implies a memory barrier. 
*/ -static __always_inline int test_and_set_bit(long nr, volatile unsigned long *addr) +static __always_inline bool test_and_set_bit(long nr, volatile unsigned long *addr) { - GEN_BINARY_RMWcc(LOCK_PREFIX "bts", *addr, "Ir", nr, "%0", "c"); + GEN_BINARY_RMWcc(LOCK_PREFIX "bts", *addr, "Ir", nr, "%0", c); } /** @@ -213,7 +213,7 @@ static __always_inline int test_and_set_bit(long nr, volatile unsigned long *add * * This is the same as test_and_set_bit on x86. */ -static __always_inline int +static __always_inline bool test_and_set_bit_lock(long nr, volatile unsigned long *addr) { return test_and_set_bit(nr, addr); @@ -228,13 +228,13 @@ test_and_set_bit_lock(long nr, volatile unsigned long *addr) * If two examples of this operation race, one can appear to succeed * but actually fail. You must protect multiple accesses with a lock. */ -static __always_inline int __test_and_set_bit(long nr, volatile unsigned long *addr) +static __always_inline bool __test_and_set_bit(long nr, volatile unsigned long *addr) { - int oldbit; + bool oldbit; asm("bts %2,%1\n\t" - "sbb %0,%0" - : "=r" (oldbit), ADDR + CC_SET(c) + : CC_OUT(c) (oldbit), ADDR : "Ir" (nr)); return oldbit; } @@ -247,9 +247,9 @@ static __always_inline int __test_and_set_bit(long nr, volatile unsigned long *a * This operation is atomic and cannot be reordered. * It also implies a memory barrier. 
*/ -static __always_inline int test_and_clear_bit(long nr, volatile unsigned long *addr) +static __always_inline bool test_and_clear_bit(long nr, volatile unsigned long *addr) { - GEN_BINARY_RMWcc(LOCK_PREFIX "btr", *addr, "Ir", nr, "%0", "c"); + GEN_BINARY_RMWcc(LOCK_PREFIX "btr", *addr, "Ir", nr, "%0", c); } /** @@ -268,25 +268,25 @@ static __always_inline int test_and_clear_bit(long nr, volatile unsigned long *a * accessed from a hypervisor on the same CPU if running in a VM: don't change * this without also updating arch/x86/kernel/kvm.c */ -static __always_inline int __test_and_clear_bit(long nr, volatile unsigned long *addr) +static __always_inline bool __test_and_clear_bit(long nr, volatile unsigned long *addr) { - int oldbit; + bool oldbit; asm volatile("btr %2,%1\n\t" - "sbb %0,%0" - : "=r" (oldbit), ADDR + CC_SET(c) + : CC_OUT(c) (oldbit), ADDR : "Ir" (nr)); return oldbit; } /* WARNING: non atomic and it can be reordered! */ -static __always_inline int __test_and_change_bit(long nr, volatile unsigned long *addr) +static __always_inline bool __test_and_change_bit(long nr, volatile unsigned long *addr) { - int oldbit; + bool oldbit; asm volatile("btc %2,%1\n\t" - "sbb %0,%0" - : "=r" (oldbit), ADDR + CC_SET(c) + : CC_OUT(c) (oldbit), ADDR : "Ir" (nr) : "memory"); return oldbit; @@ -300,24 +300,24 @@ static __always_inline int __test_and_change_bit(long nr, volatile unsigned long * This operation is atomic and cannot be reordered. * It also implies a memory barrier. 
*/ -static __always_inline int test_and_change_bit(long nr, volatile unsigned long *addr) +static __always_inline bool test_and_change_bit(long nr, volatile unsigned long *addr) { - GEN_BINARY_RMWcc(LOCK_PREFIX "btc", *addr, "Ir", nr, "%0", "c"); + GEN_BINARY_RMWcc(LOCK_PREFIX "btc", *addr, "Ir", nr, "%0", c); } -static __always_inline int constant_test_bit(long nr, const volatile unsigned long *addr) +static __always_inline bool constant_test_bit(long nr, const volatile unsigned long *addr) { return ((1UL << (nr & (BITS_PER_LONG-1))) & (addr[nr >> _BITOPS_LONG_SHIFT])) != 0; } -static __always_inline int variable_test_bit(long nr, volatile const unsigned long *addr) +static __always_inline bool variable_test_bit(long nr, volatile const unsigned long *addr) { - int oldbit; + bool oldbit; asm volatile("bt %2,%1\n\t" - "sbb %0,%0" - : "=r" (oldbit) + CC_SET(c) + : CC_OUT(c) (oldbit) : "m" (*(unsigned long *)addr), "Ir" (nr)); return oldbit; @@ -329,7 +329,7 @@ static __always_inline int variable_test_bit(long nr, volatile const unsigned lo * @nr: bit number to test * @addr: Address to start counting from */ -static int test_bit(int nr, const volatile unsigned long *addr); +static bool test_bit(int nr, const volatile unsigned long *addr); #endif #define test_bit(nr, addr) \ diff --git a/arch/x86/include/asm/bugs.h b/arch/x86/include/asm/bugs.h index 08abf639075f..5490bbaf71d5 100644 --- a/arch/x86/include/asm/bugs.h +++ b/arch/x86/include/asm/bugs.h @@ -1,8 +1,16 @@ #ifndef _ASM_X86_BUGS_H #define _ASM_X86_BUGS_H +#include <asm/processor.h> + extern void check_bugs(void); +#if defined(CONFIG_CPU_SUP_INTEL) +void check_mpx_erratum(struct cpuinfo_x86 *c); +#else +static inline void check_mpx_erratum(struct cpuinfo_x86 *c) {} +#endif + #if defined(CONFIG_CPU_SUP_INTEL) && defined(CONFIG_X86_32) int ppro_with_ram_bug(void); #else diff --git a/arch/x86/include/asm/checksum_32.h b/arch/x86/include/asm/checksum_32.h index 532f85e6651f..7b53743ed267 100644 --- 
a/arch/x86/include/asm/checksum_32.h +++ b/arch/x86/include/asm/checksum_32.h @@ -2,8 +2,7 @@ #define _ASM_X86_CHECKSUM_32_H #include <linux/in6.h> - -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* * computes the checksum of a memory block at buff, length len, diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 5a3b2c119ed0..a18806165fe4 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -40,6 +40,7 @@ typedef s32 compat_long_t; typedef s64 __attribute__((aligned(4))) compat_s64; typedef u32 compat_uint_t; typedef u32 compat_ulong_t; +typedef u32 compat_u32; typedef u64 __attribute__((aligned(4))) compat_u64; typedef u32 compat_uptr_t; @@ -181,6 +182,16 @@ typedef struct compat_siginfo { /* SIGILL, SIGFPE, SIGSEGV, SIGBUS */ struct { unsigned int _addr; /* faulting insn/memory ref. */ + short int _addr_lsb; /* Valid LSB of the reported address. */ + union { + /* used when si_code=SEGV_BNDERR */ + struct { + compat_uptr_t _lower; + compat_uptr_t _upper; + } _addr_bnd; + /* used when si_code=SEGV_PKUERR */ + compat_u32 _pkey; + }; } _sigfault; /* SIGPOLL */ diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index 678637ad7476..9b7fa6313f1a 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -16,8 +16,8 @@ extern void prefill_possible_map(void); static inline void prefill_possible_map(void) {} #define cpu_physical_id(cpu) boot_cpu_physical_apicid +#define cpu_acpi_id(cpu) 0 #define safe_smp_processor_id() 0 -#define stack_smp_processor_id() 0 #endif /* CONFIG_SMP */ diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 25ebb54905e0..483fb547e3c0 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -64,9 +64,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; (((bit)>>5)==11 && (1UL<<((bit)&31) & REQUIRED_MASK11)) || \ (((bit)>>5)==12 && (1UL<<((bit)&31) & REQUIRED_MASK12)) || \ 
(((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK13)) || \ - (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK14)) || \ - (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK15)) || \ - (((bit)>>5)==14 && (1UL<<((bit)&31) & REQUIRED_MASK16)) ) + (((bit)>>5)==14 && (1UL<<((bit)&31) & REQUIRED_MASK14)) || \ + (((bit)>>5)==15 && (1UL<<((bit)&31) & REQUIRED_MASK15)) || \ + (((bit)>>5)==16 && (1UL<<((bit)&31) & REQUIRED_MASK16)) ) #define DISABLED_MASK_BIT_SET(bit) \ ( (((bit)>>5)==0 && (1UL<<((bit)&31) & DISABLED_MASK0 )) || \ @@ -83,9 +83,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; (((bit)>>5)==11 && (1UL<<((bit)&31) & DISABLED_MASK11)) || \ (((bit)>>5)==12 && (1UL<<((bit)&31) & DISABLED_MASK12)) || \ (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK13)) || \ - (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK14)) || \ - (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK15)) || \ - (((bit)>>5)==14 && (1UL<<((bit)&31) & DISABLED_MASK16)) ) + (((bit)>>5)==14 && (1UL<<((bit)&31) & DISABLED_MASK14)) || \ + (((bit)>>5)==15 && (1UL<<((bit)&31) & DISABLED_MASK15)) || \ + (((bit)>>5)==16 && (1UL<<((bit)&31) & DISABLED_MASK16)) ) #define cpu_has(c, bit) \ (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 4a413485f9eb..c64b1e9c5d1a 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -301,10 +301,6 @@ #define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */ #define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */ #define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */ -#define X86_BUG_NULL_SEG X86_BUG(9) /* Nulling a selector preserves the base */ -#define X86_BUG_SWAPGS_FENCE X86_BUG(10) /* SWAPGS without input dep on GS */ - - #ifdef CONFIG_X86_32 /* * 64-bit kernels don't use X86_BUG_ESPFIX. 
Make the define conditional @@ -312,5 +308,7 @@ */ #define X86_BUG_ESPFIX X86_BUG(9) /* "" IRET to 16-bit SS corrupts ESP/RSP high bits */ #endif +#define X86_BUG_NULL_SEG X86_BUG(10) /* Nulling a selector preserves the base */ +#define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 39343be7d4f4..911e9358ceb1 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -29,11 +29,11 @@ #endif /* CONFIG_X86_64 */ #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS -# define DISABLE_PKU (1<<(X86_FEATURE_PKU)) -# define DISABLE_OSPKE (1<<(X86_FEATURE_OSPKE)) -#else # define DISABLE_PKU 0 # define DISABLE_OSPKE 0 +#else +# define DISABLE_PKU (1<<(X86_FEATURE_PKU & 31)) +# define DISABLE_OSPKE (1<<(X86_FEATURE_OSPKE & 31)) #endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */ /* diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 78d1e7467eae..d0bb76d81402 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -41,10 +41,9 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...); /* * Wrap all the virtual calls in a way that forces the parameters on the stack. */ -#define arch_efi_call_virt(f, args...) \ +#define arch_efi_call_virt(p, f, args...) \ ({ \ - ((efi_##f##_t __attribute__((regparm(0)))*) \ - efi.systab->runtime->f)(args); \ + ((efi_##f##_t __attribute__((regparm(0)))*) p->f)(args); \ }) #define efi_ioremap(addr, size, type, attr) ioremap_cache(addr, size) @@ -81,8 +80,8 @@ struct efi_scratch { } \ }) -#define arch_efi_call_virt(f, args...) \ - efi_call((void *)efi.systab->runtime->f, args) \ +#define arch_efi_call_virt(p, f, args...) 
\ + efi_call((void *)p->f, args) \ #define arch_efi_call_virt_teardown() \ ({ \ @@ -125,7 +124,6 @@ extern void __init efi_map_region_fixed(efi_memory_desc_t *md); extern void efi_sync_low_kernel_mappings(void); extern int __init efi_alloc_page_tables(void); extern int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages); -extern void __init efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages); extern void __init old_map_region(efi_memory_desc_t *md); extern void __init runtime_code_page_mkexec(void); extern void __init efi_runtime_update_mappings(void); diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 31ac8e6d9f36..116b58347501 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -18,6 +18,7 @@ #include <asm/fpu/api.h> #include <asm/fpu/xstate.h> #include <asm/cpufeature.h> +#include <asm/trace/fpu.h> /* * High level FPU state handling functions: @@ -524,6 +525,7 @@ static inline void __fpregs_deactivate(struct fpu *fpu) fpu->fpregs_active = 0; this_cpu_write(fpu_fpregs_owner_ctx, NULL); + trace_x86_fpu_regs_deactivated(fpu); } /* Must be paired with a 'clts' (fpregs_activate_hw()) before! */ @@ -533,6 +535,7 @@ static inline void __fpregs_activate(struct fpu *fpu) fpu->fpregs_active = 1; this_cpu_write(fpu_fpregs_owner_ctx, fpu); + trace_x86_fpu_regs_activated(fpu); } /* @@ -604,11 +607,13 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu) /* But leave fpu_fpregs_owner_ctx! */ old_fpu->fpregs_active = 0; + trace_x86_fpu_regs_deactivated(old_fpu); /* Don't change CR0.TS if we just switch! 
*/ if (fpu.preload) { new_fpu->counter++; __fpregs_activate(new_fpu); + trace_x86_fpu_regs_activated(new_fpu); prefetch(&new_fpu->state); } else { __fpregs_deactivate_hw(); diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 36b90bbfc69f..48df486b02f9 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -122,6 +122,7 @@ enum xfeature { #define XFEATURE_MASK_OPMASK (1 << XFEATURE_OPMASK) #define XFEATURE_MASK_ZMM_Hi256 (1 << XFEATURE_ZMM_Hi256) #define XFEATURE_MASK_Hi16_ZMM (1 << XFEATURE_Hi16_ZMM) +#define XFEATURE_MASK_PT (1 << XFEATURE_PT_UNIMPLEMENTED_SO_FAR) #define XFEATURE_MASK_PKRU (1 << XFEATURE_PKRU) #define XFEATURE_MASK_FPSSE (XFEATURE_MASK_FP | XFEATURE_MASK_SSE) @@ -231,6 +232,12 @@ struct xstate_header { } __attribute__((packed)); /* + * xstate_header.xcomp_bv[63] indicates that the extended_state_area + * is in compacted format. + */ +#define XCOMP_BV_COMPACTED_FORMAT ((u64)1 << 63) + +/* * This is our most modern FPU state format, as saved by the XSAVE * and restored by the XRSTOR instructions. 
* diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 38951b0fcc5a..ae55a43e09c0 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -18,6 +18,9 @@ #define XSAVE_YMM_SIZE 256 #define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET) +/* Supervisor features */ +#define XFEATURE_MASK_SUPERVISOR (XFEATURE_MASK_PT) + /* Supported features which support lazy state saving */ #define XFEATURE_MASK_LAZY (XFEATURE_MASK_FP | \ XFEATURE_MASK_SSE | \ @@ -39,7 +42,6 @@ #define REX_PREFIX #endif -extern unsigned int xstate_size; extern u64 xfeatures_mask; extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; @@ -48,5 +50,9 @@ extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask); void fpu__xstate_clear_all_cpu_caps(void); void *get_xsave_addr(struct xregs_state *xsave, int xstate); const void *get_xsave_field_ptr(int xstate_field); - +int using_compacted_format(void); +int copyout_from_xsaves(unsigned int pos, unsigned int count, void *kbuf, + void __user *ubuf, struct xregs_state *xsave); +int copyin_to_xsaves(const void *kbuf, const void __user *ubuf, + struct xregs_state *xsave); #endif diff --git a/arch/x86/include/asm/inat.h b/arch/x86/include/asm/inat.h index 74a2e312e8a2..02aff0867211 100644 --- a/arch/x86/include/asm/inat.h +++ b/arch/x86/include/asm/inat.h @@ -48,6 +48,7 @@ /* AVX VEX prefixes */ #define INAT_PFX_VEX2 13 /* 2-bytes VEX prefix */ #define INAT_PFX_VEX3 14 /* 3-bytes VEX prefix */ +#define INAT_PFX_EVEX 15 /* EVEX prefix */ #define INAT_LSTPFX_MAX 3 #define INAT_LGCPFX_MAX 11 @@ -89,6 +90,7 @@ #define INAT_VARIANT (1 << (INAT_FLAG_OFFS + 4)) #define INAT_VEXOK (1 << (INAT_FLAG_OFFS + 5)) #define INAT_VEXONLY (1 << (INAT_FLAG_OFFS + 6)) +#define INAT_EVEXONLY (1 << (INAT_FLAG_OFFS + 7)) /* Attribute making macros for attribute tables */ #define INAT_MAKE_PREFIX(pfx) (pfx << INAT_PFX_OFFS) #define INAT_MAKE_ESCAPE(esc) (esc << INAT_ESC_OFFS) @@ -141,7 +143,13 
@@ static inline int inat_last_prefix_id(insn_attr_t attr) static inline int inat_is_vex_prefix(insn_attr_t attr) { attr &= INAT_PFX_MASK; - return attr == INAT_PFX_VEX2 || attr == INAT_PFX_VEX3; + return attr == INAT_PFX_VEX2 || attr == INAT_PFX_VEX3 || + attr == INAT_PFX_EVEX; +} + +static inline int inat_is_evex_prefix(insn_attr_t attr) +{ + return (attr & INAT_PFX_MASK) == INAT_PFX_EVEX; } static inline int inat_is_vex3_prefix(insn_attr_t attr) @@ -216,6 +224,11 @@ static inline int inat_accept_vex(insn_attr_t attr) static inline int inat_must_vex(insn_attr_t attr) { - return attr & INAT_VEXONLY; + return attr & (INAT_VEXONLY | INAT_EVEXONLY); +} + +static inline int inat_must_evex(insn_attr_t attr) +{ + return attr & INAT_EVEXONLY; } #endif diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h index e7814b74caf8..b3e32b010ab1 100644 --- a/arch/x86/include/asm/insn.h +++ b/arch/x86/include/asm/insn.h @@ -91,6 +91,7 @@ struct insn { #define X86_VEX_B(vex) ((vex) & 0x20) /* VEX3 Byte1 */ #define X86_VEX_L(vex) ((vex) & 0x04) /* VEX3 Byte2, VEX2 Byte1 */ /* VEX bit fields */ +#define X86_EVEX_M(vex) ((vex) & 0x03) /* EVEX Byte1 */ #define X86_VEX3_M(vex) ((vex) & 0x1f) /* VEX3 Byte1 */ #define X86_VEX2_M 1 /* VEX2.M always 1 */ #define X86_VEX_V(vex) (((vex) & 0x78) >> 3) /* VEX3 Byte2, VEX2 Byte1 */ @@ -133,6 +134,13 @@ static inline int insn_is_avx(struct insn *insn) return (insn->vex_prefix.value != 0); } +static inline int insn_is_evex(struct insn *insn) +{ + if (!insn->prefixes.got) + insn_get_prefixes(insn); + return (insn->vex_prefix.nbytes == 4); +} + /* Ensure this instruction is decoded completely */ static inline int insn_complete(struct insn *insn) { @@ -144,8 +152,10 @@ static inline insn_byte_t insn_vex_m_bits(struct insn *insn) { if (insn->vex_prefix.nbytes == 2) /* 2 bytes VEX */ return X86_VEX2_M; - else + else if (insn->vex_prefix.nbytes == 3) /* 3 bytes VEX */ return X86_VEX3_M(insn->vex_prefix.bytes[1]); + else /* EVEX */ + 
return X86_EVEX_M(insn->vex_prefix.bytes[1]); } static inline insn_byte_t insn_vex_p_bits(struct insn *insn) diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h new file mode 100644 index 000000000000..6999f7d01a0d --- /dev/null +++ b/arch/x86/include/asm/intel-family.h @@ -0,0 +1,68 @@ +#ifndef _ASM_X86_INTEL_FAMILY_H +#define _ASM_X86_INTEL_FAMILY_H + +/* + * "Big Core" Processors (Branded as Core, Xeon, etc...) + * + * The "_X" parts are generally the EP and EX Xeons, or the + * "Extreme" ones, like Broadwell-E. + * + * Things ending in "2" are usually because we have no better + * name for them. There's no processor called "WESTMERE2". + */ + +#define INTEL_FAM6_CORE_YONAH 0x0E +#define INTEL_FAM6_CORE2_MEROM 0x0F +#define INTEL_FAM6_CORE2_MEROM_L 0x16 +#define INTEL_FAM6_CORE2_PENRYN 0x17 +#define INTEL_FAM6_CORE2_DUNNINGTON 0x1D + +#define INTEL_FAM6_NEHALEM 0x1E +#define INTEL_FAM6_NEHALEM_EP 0x1A +#define INTEL_FAM6_NEHALEM_EX 0x2E +#define INTEL_FAM6_WESTMERE 0x25 +#define INTEL_FAM6_WESTMERE2 0x1F +#define INTEL_FAM6_WESTMERE_EP 0x2C +#define INTEL_FAM6_WESTMERE_EX 0x2F + +#define INTEL_FAM6_SANDYBRIDGE 0x2A +#define INTEL_FAM6_SANDYBRIDGE_X 0x2D +#define INTEL_FAM6_IVYBRIDGE 0x3A +#define INTEL_FAM6_IVYBRIDGE_X 0x3E + +#define INTEL_FAM6_HASWELL_CORE 0x3C +#define INTEL_FAM6_HASWELL_X 0x3F +#define INTEL_FAM6_HASWELL_ULT 0x45 +#define INTEL_FAM6_HASWELL_GT3E 0x46 + +#define INTEL_FAM6_BROADWELL_CORE 0x3D +#define INTEL_FAM6_BROADWELL_XEON_D 0x56 +#define INTEL_FAM6_BROADWELL_GT3E 0x47 +#define INTEL_FAM6_BROADWELL_X 0x4F + +#define INTEL_FAM6_SKYLAKE_MOBILE 0x4E +#define INTEL_FAM6_SKYLAKE_DESKTOP 0x5E +#define INTEL_FAM6_SKYLAKE_X 0x55 +#define INTEL_FAM6_KABYLAKE_MOBILE 0x8E +#define INTEL_FAM6_KABYLAKE_DESKTOP 0x9E + +/* "Small Core" Processors (Atom) */ + +#define INTEL_FAM6_ATOM_PINEVIEW 0x1C +#define INTEL_FAM6_ATOM_LINCROFT 0x26 +#define INTEL_FAM6_ATOM_PENWELL 0x27 +#define INTEL_FAM6_ATOM_CLOVERVIEW 0x35 
+#define INTEL_FAM6_ATOM_CEDARVIEW 0x36 +#define INTEL_FAM6_ATOM_SILVERMONT1 0x37 /* BayTrail/BYT / Valleyview */ +#define INTEL_FAM6_ATOM_SILVERMONT2 0x4D /* Avaton/Rangely */ +#define INTEL_FAM6_ATOM_AIRMONT 0x4C /* CherryTrail / Braswell */ +#define INTEL_FAM6_ATOM_MERRIFIELD1 0x4A /* Tangier */ +#define INTEL_FAM6_ATOM_MERRIFIELD2 0x5A /* Annidale */ +#define INTEL_FAM6_ATOM_GOLDMONT 0x5C +#define INTEL_FAM6_ATOM_DENVERTON 0x5F /* Goldmont Microserver */ + +/* Xeon Phi */ + +#define INTEL_FAM6_XEON_PHI_KNL 0x57 /* Knights Landing */ + +#endif /* _ASM_X86_INTEL_FAMILY_H */ diff --git a/arch/x86/include/asm/intel-mid.h b/arch/x86/include/asm/intel-mid.h index 7c5af123bdbd..9d6b097aa73d 100644 --- a/arch/x86/include/asm/intel-mid.h +++ b/arch/x86/include/asm/intel-mid.h @@ -12,9 +12,17 @@ #define _ASM_X86_INTEL_MID_H #include <linux/sfi.h> +#include <linux/pci.h> #include <linux/platform_device.h> extern int intel_mid_pci_init(void); +extern int intel_mid_pci_set_power_state(struct pci_dev *pdev, pci_power_t state); + +#define INTEL_MID_PWR_LSS_OFFSET 4 +#define INTEL_MID_PWR_LSS_TYPE (1 << 7) + +extern int intel_mid_pwr_get_lss_id(struct pci_dev *pdev); + extern int get_gpio_by_name(const char *name); extern void intel_scu_device_register(struct platform_device *pdev); extern int __init sfi_parse_mrtc(struct sfi_table_header *table); @@ -34,13 +42,28 @@ struct devs_id { void *(*get_platform_data)(void *info); /* Custom handler for devices */ void (*device_handler)(struct sfi_device_table_entry *pentry, - struct devs_id *dev); + struct devs_id *dev); }; -#define sfi_device(i) \ - static const struct devs_id *const __intel_mid_sfi_##i##_dev __used \ +#define sfi_device(i) \ + static const struct devs_id *const __intel_mid_sfi_##i##_dev __used \ __attribute__((__section__(".x86_intel_mid_dev.init"))) = &i +/** +* struct mid_sd_board_info - template for SD device creation +* @name: identifies the driver +* @bus_num: board-specific identifier for a given SD controller 
+* @max_clk: the maximum frequency device supports +* @platform_data: the particular data stored there is driver-specific +*/ +struct mid_sd_board_info { + char name[SFI_NAME_LEN]; + int bus_num; + unsigned short addr; + u32 max_clk; + void *platform_data; +}; + /* * Medfield is the follow-up of Moorestown, it combines two chip solution into * one. Other than that it also added always-on and constant tsc and lapic @@ -60,7 +83,7 @@ extern enum intel_mid_cpu_type __intel_mid_cpu_chip; /** * struct intel_mid_ops - Interface between intel-mid & sub archs * @arch_setup: arch_setup function to re-initialize platform - * structures (x86_init, x86_platform_init) + * structures (x86_init, x86_platform_init) * * This structure can be extended if any new interface is required * between intel-mid & its sub arch files. @@ -70,20 +93,20 @@ struct intel_mid_ops { }; /* Helper API's for INTEL_MID_OPS_INIT */ -#define DECLARE_INTEL_MID_OPS_INIT(cpuname, cpuid) \ - [cpuid] = get_##cpuname##_ops +#define DECLARE_INTEL_MID_OPS_INIT(cpuname, cpuid) \ + [cpuid] = get_##cpuname##_ops /* Maximum number of CPU ops */ -#define MAX_CPU_OPS(a) (sizeof(a)/sizeof(void *)) +#define MAX_CPU_OPS(a) (sizeof(a)/sizeof(void *)) /* * For every new cpu addition, a weak get_<cpuname>_ops() function needs be * declared in arch/x86/platform/intel_mid/intel_mid_weak_decls.h. 
*/ -#define INTEL_MID_OPS_INIT {\ - DECLARE_INTEL_MID_OPS_INIT(penwell, INTEL_MID_CPU_CHIP_PENWELL), \ - DECLARE_INTEL_MID_OPS_INIT(cloverview, INTEL_MID_CPU_CHIP_CLOVERVIEW), \ - DECLARE_INTEL_MID_OPS_INIT(tangier, INTEL_MID_CPU_CHIP_TANGIER) \ +#define INTEL_MID_OPS_INIT { \ + DECLARE_INTEL_MID_OPS_INIT(penwell, INTEL_MID_CPU_CHIP_PENWELL), \ + DECLARE_INTEL_MID_OPS_INIT(cloverview, INTEL_MID_CPU_CHIP_CLOVERVIEW), \ + DECLARE_INTEL_MID_OPS_INIT(tangier, INTEL_MID_CPU_CHIP_TANGIER) \ }; #ifdef CONFIG_X86_INTEL_MID @@ -100,8 +123,8 @@ static inline bool intel_mid_has_msic(void) #else /* !CONFIG_X86_INTEL_MID */ -#define intel_mid_identify_cpu() (0) -#define intel_mid_has_msic() (0) +#define intel_mid_identify_cpu() 0 +#define intel_mid_has_msic() 0 #endif /* !CONFIG_X86_INTEL_MID */ @@ -117,35 +140,38 @@ extern enum intel_mid_timer_options intel_mid_timer_options; * Penwell uses spread spectrum clock, so the freq number is not exactly * the same as reported by MSR based on SDM. */ -#define FSB_FREQ_83SKU 83200 -#define FSB_FREQ_100SKU 99840 -#define FSB_FREQ_133SKU 133000 +#define FSB_FREQ_83SKU 83200 +#define FSB_FREQ_100SKU 99840 +#define FSB_FREQ_133SKU 133000 -#define FSB_FREQ_167SKU 167000 -#define FSB_FREQ_200SKU 200000 -#define FSB_FREQ_267SKU 267000 -#define FSB_FREQ_333SKU 333000 -#define FSB_FREQ_400SKU 400000 +#define FSB_FREQ_167SKU 167000 +#define FSB_FREQ_200SKU 200000 +#define FSB_FREQ_267SKU 267000 +#define FSB_FREQ_333SKU 333000 +#define FSB_FREQ_400SKU 400000 /* Bus Select SoC Fuse value */ -#define BSEL_SOC_FUSE_MASK 0x7 -#define BSEL_SOC_FUSE_001 0x1 /* FSB 133MHz */ -#define BSEL_SOC_FUSE_101 0x5 /* FSB 100MHz */ -#define BSEL_SOC_FUSE_111 0x7 /* FSB 83MHz */ +#define BSEL_SOC_FUSE_MASK 0x7 +/* FSB 133MHz */ +#define BSEL_SOC_FUSE_001 0x1 +/* FSB 100MHz */ +#define BSEL_SOC_FUSE_101 0x5 +/* FSB 83MHz */ +#define BSEL_SOC_FUSE_111 0x7 -#define SFI_MTMR_MAX_NUM 8 -#define SFI_MRTC_MAX 8 +#define SFI_MTMR_MAX_NUM 8 +#define SFI_MRTC_MAX 8 extern 
void intel_scu_devices_create(void); extern void intel_scu_devices_destroy(void); /* VRTC timer */ -#define MRST_VRTC_MAP_SZ (1024) -/*#define MRST_VRTC_PGOFFSET (0xc00) */ +#define MRST_VRTC_MAP_SZ 1024 +/* #define MRST_VRTC_PGOFFSET 0xc00 */ extern void intel_mid_rtc_init(void); -/* the offset for the mapping of global gpio pin to irq */ -#define INTEL_MID_IRQ_OFFSET 0x100 +/* The offset for the mapping of global gpio pin to irq */ +#define INTEL_MID_IRQ_OFFSET 0x100 #endif /* _ASM_X86_INTEL_MID_H */ diff --git a/arch/x86/include/asm/intel_telemetry.h b/arch/x86/include/asm/intel_telemetry.h index ed65fe701de5..85029b58d0cd 100644 --- a/arch/x86/include/asm/intel_telemetry.h +++ b/arch/x86/include/asm/intel_telemetry.h @@ -99,7 +99,7 @@ struct telemetry_core_ops { int (*reset_events)(void); }; -int telemetry_set_pltdata(struct telemetry_core_ops *ops, +int telemetry_set_pltdata(const struct telemetry_core_ops *ops, struct telemetry_plt_config *pltconfig); int telemetry_clear_pltdata(void); diff --git a/arch/x86/include/asm/kaslr.h b/arch/x86/include/asm/kaslr.h new file mode 100644 index 000000000000..2674ee3de748 --- /dev/null +++ b/arch/x86/include/asm/kaslr.h @@ -0,0 +1,15 @@ +#ifndef _ASM_KASLR_H_ +#define _ASM_KASLR_H_ + +unsigned long kaslr_get_random_long(const char *purpose); + +#ifdef CONFIG_RANDOMIZE_MEMORY +extern unsigned long page_offset_base; +extern unsigned long vmalloc_base; + +void kernel_randomize_memory(void); +#else +static inline void kernel_randomize_memory(void) { } +#endif /* CONFIG_RANDOMIZE_MEMORY */ + +#endif diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h index e5f5dc9787d5..1ef9d581b5d9 100644 --- a/arch/x86/include/asm/kdebug.h +++ b/arch/x86/include/asm/kdebug.h @@ -26,6 +26,7 @@ extern void die(const char *, struct pt_regs *,long); extern int __must_check __die(const char *, struct pt_regs *, long); extern void show_trace(struct task_struct *t, struct pt_regs *regs, unsigned long *sp, unsigned long bp); 
+extern void show_stack_regs(struct pt_regs *regs); extern void __show_regs(struct pt_regs *regs, int all); extern unsigned long oops_begin(void); extern void oops_end(unsigned long, struct pt_regs *, int signr); diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index 4421b5da409d..d1d1e5094c28 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -38,12 +38,11 @@ typedef u8 kprobe_opcode_t; #define RELATIVECALL_OPCODE 0xe8 #define RELATIVE_ADDR_SIZE 4 #define MAX_STACK_SIZE 64 -#define MIN_STACK_SIZE(ADDR) \ - (((MAX_STACK_SIZE) < (((unsigned long)current_thread_info()) + \ - THREAD_SIZE - (unsigned long)(ADDR))) \ - ? (MAX_STACK_SIZE) \ - : (((unsigned long)current_thread_info()) + \ - THREAD_SIZE - (unsigned long)(ADDR))) +#define CUR_STACK_SIZE(ADDR) \ + (current_top_of_stack() - (unsigned long)(ADDR)) +#define MIN_STACK_SIZE(ADDR) \ + (MAX_STACK_SIZE < CUR_STACK_SIZE(ADDR) ? \ + MAX_STACK_SIZE : CUR_STACK_SIZE(ADDR)) #define flush_insn_slot(p) do { } while (0) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b7e394485a5f..69e62862b622 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -27,6 +27,7 @@ #include <linux/irqbypass.h> #include <linux/hyperv.h> +#include <asm/apic.h> #include <asm/pvclock-abi.h> #include <asm/desc.h> #include <asm/mtrr.h> @@ -562,7 +563,6 @@ struct kvm_vcpu_arch { struct { u64 msr_val; u64 last_steal; - u64 accum_steal; struct gfn_to_hva_cache stime; struct kvm_steal_time steal; } st; @@ -774,6 +774,11 @@ struct kvm_arch { u8 nr_reserved_ioapic_pins; bool disabled_lapic_found; + + /* Struct members for AVIC */ + u32 ldr_mode; + struct page *avic_logical_id_table_page; + struct page *avic_physical_id_table_page; }; struct kvm_vm_stat { @@ -804,6 +809,7 @@ struct kvm_vcpu_stat { u32 halt_exits; u32 halt_successful_poll; u32 halt_attempted_poll; + u32 halt_poll_invalid; u32 halt_wakeup; u32 request_irq_exits; 
u32 irq_exits; @@ -848,6 +854,9 @@ struct kvm_x86_ops { bool (*cpu_has_high_real_mode_segbase)(void); void (*cpuid_update)(struct kvm_vcpu *vcpu); + int (*vm_init)(struct kvm *kvm); + void (*vm_destroy)(struct kvm *kvm); + /* Create, but do not attach this VCPU */ struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id); void (*vcpu_free)(struct kvm_vcpu *vcpu); @@ -914,7 +923,7 @@ struct kvm_x86_ops { bool (*get_enable_apicv)(void); void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu); void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); - void (*hwapic_isr_update)(struct kvm *kvm, int isr); + void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr); void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set); void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa); @@ -990,8 +999,13 @@ struct kvm_x86_ops { */ int (*pre_block)(struct kvm_vcpu *vcpu); void (*post_block)(struct kvm_vcpu *vcpu); + + void (*vcpu_blocking)(struct kvm_vcpu *vcpu); + void (*vcpu_unblocking)(struct kvm_vcpu *vcpu); + int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); + void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu); }; struct kvm_arch_async_pf { @@ -1341,7 +1355,28 @@ bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm_lapic_irq *irq); -static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} -static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) +{ + if (kvm_x86_ops->vcpu_blocking) + kvm_x86_ops->vcpu_blocking(vcpu); +} + +static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) +{ + if (kvm_x86_ops->vcpu_unblocking) + kvm_x86_ops->vcpu_unblocking(vcpu); +} + +static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {} + +static 
inline int kvm_cpu_get_apicid(int mps_cpu) +{ +#ifdef CONFIG_X86_LOCAL_APIC + return __default_cpu_present_to_apicid(mps_cpu); +#else + WARN_ON_ONCE(1); + return BAD_APICID; +#endif +} #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h index 7e68f9558552..a7f9181f63f3 100644 --- a/arch/x86/include/asm/livepatch.h +++ b/arch/x86/include/asm/livepatch.h @@ -32,8 +32,6 @@ static inline int klp_check_compiler_support(void) #endif return 0; } -int klp_write_module_reloc(struct module *mod, unsigned long type, - unsigned long loc, unsigned long value); static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip) { diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h index 4ad6560847b1..7511978093eb 100644 --- a/arch/x86/include/asm/local.h +++ b/arch/x86/include/asm/local.h @@ -50,9 +50,9 @@ static inline void local_sub(long i, local_t *l) * true if the result is zero, or false for all * other cases. */ -static inline int local_sub_and_test(long i, local_t *l) +static inline bool local_sub_and_test(long i, local_t *l) { - GEN_BINARY_RMWcc(_ASM_SUB, l->a.counter, "er", i, "%0", "e"); + GEN_BINARY_RMWcc(_ASM_SUB, l->a.counter, "er", i, "%0", e); } /** @@ -63,9 +63,9 @@ static inline int local_sub_and_test(long i, local_t *l) * returns true if the result is 0, or false for all other * cases. */ -static inline int local_dec_and_test(local_t *l) +static inline bool local_dec_and_test(local_t *l) { - GEN_UNARY_RMWcc(_ASM_DEC, l->a.counter, "%0", "e"); + GEN_UNARY_RMWcc(_ASM_DEC, l->a.counter, "%0", e); } /** @@ -76,9 +76,9 @@ static inline int local_dec_and_test(local_t *l) * and returns true if the result is zero, or false for all * other cases. 
*/ -static inline int local_inc_and_test(local_t *l) +static inline bool local_inc_and_test(local_t *l) { - GEN_UNARY_RMWcc(_ASM_INC, l->a.counter, "%0", "e"); + GEN_UNARY_RMWcc(_ASM_INC, l->a.counter, "%0", e); } /** @@ -90,9 +90,9 @@ static inline int local_inc_and_test(local_t *l) * if the result is negative, or false when * result is greater than or equal to zero. */ -static inline int local_add_negative(long i, local_t *l) +static inline bool local_add_negative(long i, local_t *l) { - GEN_BINARY_RMWcc(_ASM_ADD, l->a.counter, "er", i, "%0", "s"); + GEN_BINARY_RMWcc(_ASM_ADD, l->a.counter, "er", i, "%0", s); } /** diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 5a73a9c62c39..56f4c6676b29 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -64,8 +64,6 @@ #define MSR_OFFCORE_RSP_0 0x000001a6 #define MSR_OFFCORE_RSP_1 0x000001a7 -#define MSR_NHM_TURBO_RATIO_LIMIT 0x000001ad -#define MSR_IVT_TURBO_RATIO_LIMIT 0x000001ae #define MSR_TURBO_RATIO_LIMIT 0x000001ad #define MSR_TURBO_RATIO_LIMIT1 0x000001ae #define MSR_TURBO_RATIO_LIMIT2 0x000001af diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 7dc1d8fef7fd..b5fee97813cd 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -122,7 +122,7 @@ notrace static inline void native_write_msr(unsigned int msr, "2:\n" _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_wrmsr_unsafe) : : "c" (msr), "a"(low), "d" (high) : "memory"); - if (msr_tracepoint_active(__tracepoint_read_msr)) + if (msr_tracepoint_active(__tracepoint_write_msr)) do_trace_write_msr(msr, ((u64)high << 32 | low), 0); } @@ -141,7 +141,7 @@ notrace static inline int native_write_msr_safe(unsigned int msr, : "c" (msr), "0" (low), "d" (high), [fault] "i" (-EIO) : "memory"); - if (msr_tracepoint_active(__tracepoint_read_msr)) + if (msr_tracepoint_active(__tracepoint_write_msr)) do_trace_write_msr(msr, ((u64)high << 32 | low), err); return err; } diff --git 
a/arch/x86/include/asm/mutex_32.h b/arch/x86/include/asm/mutex_32.h index 85e6cda45a02..e9355a84fc67 100644 --- a/arch/x86/include/asm/mutex_32.h +++ b/arch/x86/include/asm/mutex_32.h @@ -101,7 +101,7 @@ static inline int __mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *)) { /* cmpxchg because it never induces a false contention state. */ - if (likely(atomic_cmpxchg(count, 1, 0) == 1)) + if (likely(atomic_read(count) == 1 && atomic_cmpxchg(count, 1, 0) == 1)) return 1; return 0; diff --git a/arch/x86/include/asm/mutex_64.h b/arch/x86/include/asm/mutex_64.h index 07537a44216e..d9850758464e 100644 --- a/arch/x86/include/asm/mutex_64.h +++ b/arch/x86/include/asm/mutex_64.h @@ -118,10 +118,10 @@ do { \ static inline int __mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *)) { - if (likely(atomic_cmpxchg(count, 1, 0) == 1)) + if (likely(atomic_read(count) == 1 && atomic_cmpxchg(count, 1, 0) == 1)) return 1; - else - return 0; + + return 0; } #endif /* _ASM_X86_MUTEX_64_H */ diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index d5c2f8b40faa..9215e0527647 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -1,6 +1,10 @@ #ifndef _ASM_X86_PAGE_64_DEFS_H #define _ASM_X86_PAGE_64_DEFS_H +#ifndef __ASSEMBLY__ +#include <asm/kaslr.h> +#endif + #ifdef CONFIG_KASAN #define KASAN_STACK_ORDER 1 #else @@ -32,7 +36,12 @@ * hypervisor to fit. Choosing 16 slots here is arbitrary, but it's * what Xen requires. 
*/ -#define __PAGE_OFFSET _AC(0xffff880000000000, UL) +#define __PAGE_OFFSET_BASE _AC(0xffff880000000000, UL) +#ifdef CONFIG_RANDOMIZE_MEMORY +#define __PAGE_OFFSET page_offset_base +#else +#define __PAGE_OFFSET __PAGE_OFFSET_BASE +#endif /* CONFIG_RANDOMIZE_MEMORY */ #define __START_KERNEL_map _AC(0xffffffff80000000, UL) diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index e0ba66ca68c6..e02e3f80d363 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -510,14 +510,15 @@ do { \ /* This is not atomic against other CPUs -- CPU preemption needs to be off */ #define x86_test_and_clear_bit_percpu(bit, var) \ ({ \ - int old__; \ - asm volatile("btr %2,"__percpu_arg(1)"\n\tsbbl %0,%0" \ - : "=r" (old__), "+m" (var) \ + bool old__; \ + asm volatile("btr %2,"__percpu_arg(1)"\n\t" \ + CC_SET(c) \ + : CC_OUT(c) (old__), "+m" (var) \ : "dIr" (bit)); \ old__; \ }) -static __always_inline int x86_this_cpu_constant_test_bit(unsigned int nr, +static __always_inline bool x86_this_cpu_constant_test_bit(unsigned int nr, const unsigned long __percpu *addr) { unsigned long __percpu *a = (unsigned long *)addr + nr / BITS_PER_LONG; @@ -529,14 +530,14 @@ static __always_inline int x86_this_cpu_constant_test_bit(unsigned int nr, #endif } -static inline int x86_this_cpu_variable_test_bit(int nr, +static inline bool x86_this_cpu_variable_test_bit(int nr, const unsigned long __percpu *addr) { - int oldbit; + bool oldbit; asm volatile("bt "__percpu_arg(2)",%1\n\t" - "sbb %0,%0" - : "=r" (oldbit) + CC_SET(c) + : CC_OUT(c) (oldbit) : "m" (*(unsigned long *)addr), "Ir" (nr)); return oldbit; diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index bf7f8b55b0f9..b6d425999f99 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -81,7 +81,11 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { 
struct page *page; - page = alloc_pages(GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO, 0); + gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO; + + if (mm == &init_mm) + gfp &= ~__GFP_ACCOUNT; + page = alloc_pages(gfp, 0); if (!page) return NULL; if (!pgtable_pmd_page_ctor(page)) { @@ -125,7 +129,11 @@ static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) { - return (pud_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); + gfp_t gfp = GFP_KERNEL_ACCOUNT; + + if (mm == &init_mm) + gfp &= ~__GFP_ACCOUNT; + return (pud_t *)get_zeroed_page(gfp); } static inline void pud_free(struct mm_struct *mm, pud_t *pud) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index f86491a7bc9d..437feb436efa 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -181,6 +181,7 @@ static inline int pmd_trans_huge(pmd_t pmd) return (pmd_val(pmd) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE; } +#define has_transparent_hugepage has_transparent_hugepage static inline int has_transparent_hugepage(void) { return boot_cpu_has(X86_FEATURE_PSE); @@ -479,7 +480,7 @@ pte_t *populate_extra_pte(unsigned long vaddr); static inline int pte_none(pte_t pte) { - return !pte.pte; + return !(pte.pte & ~(_PAGE_KNL_ERRATUM_MASK)); } #define __HAVE_ARCH_PTE_SAME @@ -551,7 +552,8 @@ static inline int pmd_none(pmd_t pmd) { /* Only check low word on 32-bit platforms, since it might be out of sync with upper half. 
*/ - return (unsigned long)native_pmd_val(pmd) == 0; + unsigned long val = native_pmd_val(pmd); + return (val & ~_PAGE_KNL_ERRATUM_MASK) == 0; } static inline unsigned long pmd_page_vaddr(pmd_t pmd) @@ -615,7 +617,7 @@ static inline unsigned long pages_to_mb(unsigned long npg) #if CONFIG_PGTABLE_LEVELS > 2 static inline int pud_none(pud_t pud) { - return native_pud_val(pud) == 0; + return (native_pud_val(pud) & ~(_PAGE_KNL_ERRATUM_MASK)) == 0; } static inline int pud_present(pud_t pud) @@ -693,6 +695,12 @@ static inline int pgd_bad(pgd_t pgd) static inline int pgd_none(pgd_t pgd) { + /* + * There is no need to do a workaround for the KNL stray + * A/D bit erratum here. PGDs only point to page tables + * except on 32-bit non-PAE which is not supported on + * KNL. + */ return !native_pgd_val(pgd); } #endif /* CONFIG_PGTABLE_LEVELS > 3 */ @@ -728,6 +736,23 @@ extern int direct_gbpages; void init_mem_mapping(void); void early_alloc_pgt_buf(void); +#ifdef CONFIG_X86_64 +/* Realmode trampoline initialization. 
*/ +extern pgd_t trampoline_pgd_entry; +static inline void __meminit init_trampoline_default(void) +{ + /* Default trampoline pgd value */ + trampoline_pgd_entry = init_level4_pgt[pgd_index(__PAGE_OFFSET)]; +} +# ifdef CONFIG_RANDOMIZE_MEMORY +void __meminit init_trampoline(void); +# else +# define init_trampoline init_trampoline_default +# endif +#else +static inline void init_trampoline(void) { } +#endif + /* local pte updates need not use xchg for locking */ static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep) { diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 2ee781114d34..7e8ec7ae10fa 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -140,18 +140,32 @@ static inline int pgd_large(pgd_t pgd) { return 0; } #define pte_offset_map(dir, address) pte_offset_kernel((dir), (address)) #define pte_unmap(pte) ((void)(pte))/* NOP */ -/* Encode and de-code a swap entry */ +/* + * Encode and de-code a swap entry + * + * | ... | 11| 10| 9|8|7|6|5| 4| 3|2|1|0| <- bit number + * | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U|W|P| <- bit names + * | OFFSET (14->63) | TYPE (10-13) |0|X|X|X| X| X|X|X|0| <- swp entry + * + * G (8) is aliased and used as a PROT_NONE indicator for + * !present ptes. We need to start storing swap entries above + * there. We also need to avoid using A and D because of an + * erratum where they can be incorrectly set by hardware on + * non-present PTEs. 
+ */ +#define SWP_TYPE_FIRST_BIT (_PAGE_BIT_PROTNONE + 1) #define SWP_TYPE_BITS 5 -#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) +/* Place the offset above the type: */ +#define SWP_OFFSET_FIRST_BIT (SWP_TYPE_FIRST_BIT + SWP_TYPE_BITS + 1) #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) -#define __swp_type(x) (((x).val >> (_PAGE_BIT_PRESENT + 1)) \ +#define __swp_type(x) (((x).val >> (SWP_TYPE_FIRST_BIT)) \ & ((1U << SWP_TYPE_BITS) - 1)) -#define __swp_offset(x) ((x).val >> SWP_OFFSET_SHIFT) +#define __swp_offset(x) ((x).val >> SWP_OFFSET_FIRST_BIT) #define __swp_entry(type, offset) ((swp_entry_t) { \ - ((type) << (_PAGE_BIT_PRESENT + 1)) \ - | ((offset) << SWP_OFFSET_SHIFT) }) + ((type) << (SWP_TYPE_FIRST_BIT)) \ + | ((offset) << SWP_OFFSET_FIRST_BIT) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) }) #define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val }) diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index e6844dfb4471..6fdef9eef2d5 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -5,6 +5,7 @@ #ifndef __ASSEMBLY__ #include <linux/types.h> +#include <asm/kaslr.h> /* * These are used to make use of C type-checking.. @@ -53,10 +54,16 @@ typedef struct { pteval_t pte; } pte_t; #define PGDIR_MASK (~(PGDIR_SIZE - 1)) /* See Documentation/x86/x86_64/mm.txt for a description of the memory map. 
*/ -#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) -#define VMALLOC_START _AC(0xffffc90000000000, UL) -#define VMALLOC_END _AC(0xffffe8ffffffffff, UL) -#define VMEMMAP_START _AC(0xffffea0000000000, UL) +#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) +#define VMALLOC_SIZE_TB _AC(32, UL) +#define __VMALLOC_BASE _AC(0xffffc90000000000, UL) +#define VMEMMAP_START _AC(0xffffea0000000000, UL) +#ifdef CONFIG_RANDOMIZE_MEMORY +#define VMALLOC_START vmalloc_base +#else +#define VMALLOC_START __VMALLOC_BASE +#endif /* CONFIG_RANDOMIZE_MEMORY */ +#define VMALLOC_END (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL)) #define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE) #define MODULES_END _AC(0xffffffffff000000, UL) #define MODULES_LEN (MODULES_END - MODULES_VADDR) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 7b5efe264eff..f1218f512f62 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -70,6 +70,12 @@ _PAGE_PKEY_BIT2 | \ _PAGE_PKEY_BIT3) +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) +#define _PAGE_KNL_ERRATUM_MASK (_PAGE_DIRTY | _PAGE_ACCESSED) +#else +#define _PAGE_KNL_ERRATUM_MASK 0 +#endif + #ifdef CONFIG_KMEMCHECK #define _PAGE_HIDDEN (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN) #else @@ -475,8 +481,6 @@ extern pmd_t *lookup_pmd_address(unsigned long address); extern phys_addr_t slow_virt_to_phys(void *__address); extern int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, unsigned numpages, unsigned long page_flags); -void kernel_unmap_pages_in_pgd(pgd_t *root, unsigned long address, - unsigned numpages); #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_PGTABLE_DEFS_H */ diff --git a/arch/x86/include/asm/pmc_core.h b/arch/x86/include/asm/pmc_core.h new file mode 100644 index 000000000000..d4855f11136d --- /dev/null +++ b/arch/x86/include/asm/pmc_core.h @@ -0,0 +1,27 @@ +/* + * Intel Core SoC Power Management Controller Header 
File + * + * Copyright (c) 2016, Intel Corporation. + * All Rights Reserved. + * + * Authors: Rajneesh Bhardwaj <rajneesh.bhardwaj@intel.com> + * Vishwanath Somayaji <vishwanath.somayaji@intel.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ + +#ifndef _ASM_PMC_CORE_H +#define _ASM_PMC_CORE_H + +/* API to read SLP_S0_RESIDENCY counter */ +int intel_pmc_slp_s0_counter_read(u32 *data); + +#endif /* _ASM_PMC_CORE_H */ diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index d397deb58146..17f218645701 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -81,7 +81,7 @@ static __always_inline void __preempt_count_sub(int val) */ static __always_inline bool __preempt_count_dec_and_test(void) { - GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), "e"); + GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), e); } /* diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 62c6cc3cc5d3..63def9537a2d 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -367,10 +367,15 @@ DECLARE_PER_CPU(struct irq_stack *, hardirq_stack); DECLARE_PER_CPU(struct irq_stack *, softirq_stack); #endif /* X86_64 */ -extern unsigned int xstate_size; +extern unsigned int fpu_kernel_xstate_size; +extern unsigned int fpu_user_xstate_size; struct perf_event; +typedef struct { + unsigned long seg; +} mm_segment_t; + struct thread_struct { /* Cached TLS descriptors: */ struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES]; @@ -419,6 +424,11 @@ struct 
thread_struct { /* Max allowed port in the bitmap, in bytes: */ unsigned io_bitmap_max; + mm_segment_t addr_limit; + + unsigned int sig_on_uaccess_err:1; + unsigned int uaccess_err:1; /* uaccess failed */ + /* Floating point and extended processor state */ struct fpu fpu; /* @@ -490,11 +500,6 @@ static inline void load_sp0(struct tss_struct *tss, #define set_iopl_mask native_set_iopl_mask #endif /* CONFIG_PARAVIRT */ -typedef struct { - unsigned long seg; -} mm_segment_t; - - /* Free all resources held by a thread. */ extern void release_thread(struct task_struct *); @@ -716,6 +721,7 @@ static inline void spin_lock_prefetch(const void *x) .sp0 = TOP_OF_INIT_STACK, \ .sysenter_cs = __KERNEL_CS, \ .io_bitmap_ptr = NULL, \ + .addr_limit = KERNEL_DS, \ } extern unsigned long thread_saved_pc(struct task_struct *tsk); @@ -765,8 +771,9 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); #define STACK_TOP TASK_SIZE #define STACK_TOP_MAX TASK_SIZE_MAX -#define INIT_THREAD { \ - .sp0 = TOP_OF_INIT_STACK \ +#define INIT_THREAD { \ + .sp0 = TOP_OF_INIT_STACK, \ + .addr_limit = KERNEL_DS, \ } /* diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index fdcc04020636..7c1c89598688 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -69,29 +69,22 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift) } static __always_inline -u64 pvclock_get_nsec_offset(const struct pvclock_vcpu_time_info *src) -{ - u64 delta = rdtsc_ordered() - src->tsc_timestamp; - return pvclock_scale_delta(delta, src->tsc_to_system_mul, - src->tsc_shift); -} - -static __always_inline unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src, cycle_t *cycles, u8 *flags) { unsigned version; - cycle_t ret, offset; - u8 ret_flags; + cycle_t offset; + u64 delta; version = src->version; + /* Make the latest version visible */ + smp_rmb(); - offset = pvclock_get_nsec_offset(src); - ret = src->system_time + 
offset; - ret_flags = src->flags; - - *cycles = ret; - *flags = ret_flags; + delta = rdtsc_ordered() - src->tsc_timestamp; + offset = pvclock_scale_delta(delta, src->tsc_to_system_mul, + src->tsc_shift); + *cycles = src->system_time + offset; + *flags = src->flags; return version; } diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h index 8f7866a5b9a4..661dd305694a 100644 --- a/arch/x86/include/asm/rmwcc.h +++ b/arch/x86/include/asm/rmwcc.h @@ -1,11 +1,13 @@ #ifndef _ASM_X86_RMWcc #define _ASM_X86_RMWcc -#ifdef CC_HAVE_ASM_GOTO +#if !defined(__GCC_ASM_FLAG_OUTPUTS__) && defined(CC_HAVE_ASM_GOTO) + +/* Use asm goto */ #define __GEN_RMWcc(fullop, var, cc, ...) \ do { \ - asm_volatile_goto (fullop "; j" cc " %l[cc_label]" \ + asm_volatile_goto (fullop "; j" #cc " %l[cc_label]" \ : : "m" (var), ## __VA_ARGS__ \ : "memory" : cc_label); \ return 0; \ @@ -19,15 +21,17 @@ cc_label: \ #define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc) \ __GEN_RMWcc(op " %1, " arg0, var, cc, vcon (val)) -#else /* !CC_HAVE_ASM_GOTO */ +#else /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */ + +/* Use flags output or a set instruction */ #define __GEN_RMWcc(fullop, var, cc, ...) 
\ do { \ - char c; \ - asm volatile (fullop "; set" cc " %1" \ - : "+m" (var), "=qm" (c) \ + bool c; \ + asm volatile (fullop ";" CC_SET(cc) \ + : "+m" (var), CC_OUT(cc) (c) \ : __VA_ARGS__ : "memory"); \ - return c != 0; \ + return c; \ } while (0) #define GEN_UNARY_RMWcc(op, var, arg0, cc) \ @@ -36,6 +40,6 @@ do { \ #define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc) \ __GEN_RMWcc(op " %2, " arg0, var, cc, vcon (val)) -#endif /* CC_HAVE_ASM_GOTO */ +#endif /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */ #endif /* _ASM_X86_RMWcc */ diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h index 453744c1d347..8dbc762ad132 100644 --- a/arch/x86/include/asm/rwsem.h +++ b/arch/x86/include/asm/rwsem.h @@ -77,7 +77,7 @@ static inline void __down_read(struct rw_semaphore *sem) /* * trylock for reading -- returns 1 if successful, 0 if contention */ -static inline int __down_read_trylock(struct rw_semaphore *sem) +static inline bool __down_read_trylock(struct rw_semaphore *sem) { long result, tmp; asm volatile("# beginning __down_read_trylock\n\t" @@ -93,7 +93,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) : "+m" (sem->count), "=&a" (result), "=&r" (tmp) : "i" (RWSEM_ACTIVE_READ_BIAS) : "memory", "cc"); - return result >= 0 ? 1 : 0; + return result >= 0; } /* @@ -134,9 +134,10 @@ static inline int __down_write_killable(struct rw_semaphore *sem) /* * trylock for writing -- returns 1 if successful, 0 if contention */ -static inline int __down_write_trylock(struct rw_semaphore *sem) +static inline bool __down_write_trylock(struct rw_semaphore *sem) { - long result, tmp; + bool result; + long tmp0, tmp1; asm volatile("# beginning __down_write_trylock\n\t" " mov %0,%1\n\t" "1:\n\t" @@ -144,14 +145,14 @@ static inline int __down_write_trylock(struct rw_semaphore *sem) /* was the active mask 0 before? 
*/ " jnz 2f\n\t" " mov %1,%2\n\t" - " add %3,%2\n\t" + " add %4,%2\n\t" LOCK_PREFIX " cmpxchg %2,%0\n\t" " jnz 1b\n\t" "2:\n\t" - " sete %b1\n\t" - " movzbl %b1, %k1\n\t" + CC_SET(e) "# ending __down_write_trylock\n\t" - : "+m" (sem->count), "=&a" (result), "=&r" (tmp) + : "+m" (sem->count), "=&a" (tmp0), "=&r" (tmp1), + CC_OUT(e) (result) : "er" (RWSEM_ACTIVE_WRITE_BIAS) : "memory", "cc"); return result; @@ -213,23 +214,5 @@ static inline void __downgrade_write(struct rw_semaphore *sem) : "memory", "cc"); } -/* - * implement atomic add functionality - */ -static inline void rwsem_atomic_add(long delta, struct rw_semaphore *sem) -{ - asm volatile(LOCK_PREFIX _ASM_ADD "%1,%0" - : "+m" (sem->count) - : "er" (delta)); -} - -/* - * implement exchange and add functionality - */ -static inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem) -{ - return delta + xadd(&sem->count, delta); -} - #endif /* __KERNEL__ */ #endif /* _ASM_X86_RWSEM_H */ diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h index 2138c9ae19ee..dd1e7d6387ab 100644 --- a/arch/x86/include/asm/signal.h +++ b/arch/x86/include/asm/signal.h @@ -81,9 +81,9 @@ static inline int __const_sigismember(sigset_t *set, int _sig) static inline int __gen_sigismember(sigset_t *set, int _sig) { - int ret; - asm("btl %2,%1\n\tsbbl %0,%0" - : "=r"(ret) : "m"(*set), "Ir"(_sig-1) : "cc"); + unsigned char ret; + asm("btl %2,%1\n\tsetc %0" + : "=qm"(ret) : "m"(*set), "Ir"(_sig-1) : "cc"); return ret; } diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 66b057306f40..ebd0c164cd4e 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -33,6 +33,7 @@ static inline struct cpumask *cpu_llc_shared_mask(int cpu) } DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid); +DECLARE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid); DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid); #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) 
DECLARE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid); @@ -135,6 +136,7 @@ int native_cpu_up(unsigned int cpunum, struct task_struct *tidle); int native_cpu_disable(void); int common_cpu_die(unsigned int cpu); void native_cpu_die(unsigned int cpu); +void hlt_play_dead(void); void native_play_dead(void); void play_dead_common(void); void wbinvd_on_cpu(int cpu); @@ -147,6 +149,7 @@ void x86_idle_thread_init(unsigned int cpu, struct task_struct *idle); void smp_store_boot_cpu_info(void); void smp_store_cpu_info(int id); #define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu) +#define cpu_acpi_id(cpu) per_cpu(x86_cpu_to_acpiid, cpu) #else /* !CONFIG_SMP */ #define wbinvd_on_cpu(cpu) wbinvd() @@ -172,12 +175,6 @@ extern int safe_smp_processor_id(void); #elif defined(CONFIG_X86_64_SMP) #define raw_smp_processor_id() (this_cpu_read(cpu_number)) -#define stack_smp_processor_id() \ -({ \ - struct thread_info *ti; \ - __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \ - ti->cpu; \ -}) #define safe_smp_processor_id() smp_processor_id() #endif diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index 7c247e7404be..0944218af9e2 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -14,7 +14,7 @@ extern int kstack_depth_to_print; struct thread_info; struct stacktrace_ops; -typedef unsigned long (*walk_stack_t)(struct thread_info *tinfo, +typedef unsigned long (*walk_stack_t)(struct task_struct *task, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, @@ -23,13 +23,13 @@ typedef unsigned long (*walk_stack_t)(struct thread_info *tinfo, int *graph); extern unsigned long -print_context_stack(struct thread_info *tinfo, +print_context_stack(struct task_struct *task, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data, unsigned long *end, int *graph); extern unsigned long -print_context_stack_bp(struct thread_info *tinfo, 
+print_context_stack_bp(struct task_struct *task, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data, unsigned long *end, int *graph); diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 6136d99f537b..d0fe23ec7e98 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -78,7 +78,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area { u32 exit_int_info; u32 exit_int_info_err; u64 nested_ctl; - u8 reserved_4[16]; + u64 avic_vapic_bar; + u8 reserved_4[8]; u32 event_inj; u32 event_inj_err; u64 nested_cr3; @@ -88,7 +89,11 @@ struct __attribute__ ((__packed__)) vmcb_control_area { u64 next_rip; u8 insn_len; u8 insn_bytes[15]; - u8 reserved_6[800]; + u64 avic_backing_page; /* Offset 0xe0 */ + u8 reserved_6[8]; /* Offset 0xe8 */ + u64 avic_logical_id; /* Offset 0xf0 */ + u64 avic_physical_id; /* Offset 0xf8 */ + u8 reserved_7[768]; }; @@ -111,6 +116,9 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define V_INTR_MASKING_SHIFT 24 #define V_INTR_MASKING_MASK (1 << V_INTR_MASKING_SHIFT) +#define AVIC_ENABLE_SHIFT 31 +#define AVIC_ENABLE_MASK (1 << AVIC_ENABLE_SHIFT) + #define SVM_INTERRUPT_SHADOW_MASK 1 #define SVM_IOIO_STR_SHIFT 2 diff --git a/arch/x86/include/asm/sync_bitops.h b/arch/x86/include/asm/sync_bitops.h index f28a24b51dc7..cbf8847d02a0 100644 --- a/arch/x86/include/asm/sync_bitops.h +++ b/arch/x86/include/asm/sync_bitops.h @@ -79,10 +79,10 @@ static inline void sync_change_bit(long nr, volatile unsigned long *addr) */ static inline int sync_test_and_set_bit(long nr, volatile unsigned long *addr) { - int oldbit; + unsigned char oldbit; - asm volatile("lock; bts %2,%1\n\tsbbl %0,%0" - : "=r" (oldbit), "+m" (ADDR) + asm volatile("lock; bts %2,%1\n\tsetc %0" + : "=qm" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory"); return oldbit; } @@ -97,10 +97,10 @@ static inline int sync_test_and_set_bit(long nr, volatile unsigned long *addr) */ static inline int 
sync_test_and_clear_bit(long nr, volatile unsigned long *addr) { - int oldbit; + unsigned char oldbit; - asm volatile("lock; btr %2,%1\n\tsbbl %0,%0" - : "=r" (oldbit), "+m" (ADDR) + asm volatile("lock; btr %2,%1\n\tsetc %0" + : "=qm" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory"); return oldbit; } @@ -115,10 +115,10 @@ static inline int sync_test_and_clear_bit(long nr, volatile unsigned long *addr) */ static inline int sync_test_and_change_bit(long nr, volatile unsigned long *addr) { - int oldbit; + unsigned char oldbit; - asm volatile("lock; btc %2,%1\n\tsbbl %0,%0" - : "=r" (oldbit), "+m" (ADDR) + asm volatile("lock; btc %2,%1\n\tsetc %0" + : "=qm" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory"); return oldbit; } diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 30c133ac05cd..89bff044a6f5 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -57,9 +57,6 @@ struct thread_info { __u32 flags; /* low level flags */ __u32 status; /* thread synchronous flags */ __u32 cpu; /* current CPU */ - mm_segment_t addr_limit; - unsigned int sig_on_uaccess_error:1; - unsigned int uaccess_err:1; /* uaccess failed */ }; #define INIT_THREAD_INFO(tsk) \ @@ -67,7 +64,6 @@ struct thread_info { .task = &tsk, \ .flags = 0, \ .cpu = 0, \ - .addr_limit = KERNEL_DS, \ } #define init_thread_info (init_thread_union.thread_info) @@ -186,11 +182,6 @@ static inline unsigned long current_stack_pointer(void) # define cpu_current_top_of_stack (cpu_tss + TSS_sp0) #endif -/* Load thread_info address into "reg" */ -#define GET_THREAD_INFO(reg) \ - _ASM_MOV PER_CPU_VAR(cpu_current_top_of_stack),reg ; \ - _ASM_SUB $(THREAD_SIZE),reg ; - /* * ASM operand which evaluates to a 'thread_info' address of * the current task, if it is known that "reg" is exactly "off" diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 7f991bd5031b..43e87a3dd95c 100644 --- a/arch/x86/include/asm/topology.h +++ 
b/arch/x86/include/asm/topology.h @@ -25,16 +25,6 @@ #ifndef _ASM_X86_TOPOLOGY_H #define _ASM_X86_TOPOLOGY_H -#ifdef CONFIG_X86_32 -# ifdef CONFIG_SMP -# define ENABLE_TOPO_DEFINES -# endif -#else -# ifdef CONFIG_SMP -# define ENABLE_TOPO_DEFINES -# endif -#endif - /* * to preserve the visibility of NUMA_NO_NODE definition, * moved to there from here. May be used independent of @@ -123,12 +113,20 @@ extern const struct cpumask *cpu_coregroup_mask(int cpu); #define topology_physical_package_id(cpu) (cpu_data(cpu).phys_proc_id) #define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id) -#ifdef ENABLE_TOPO_DEFINES +#ifdef CONFIG_SMP #define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu)) #define topology_sibling_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu)) extern unsigned int __max_logical_packages; #define topology_max_packages() (__max_logical_packages) + +extern int __max_smt_threads; + +static inline int topology_max_smt_threads(void) +{ + return __max_smt_threads; +} + int topology_update_package_map(unsigned int apicid, unsigned int cpu); extern int topology_phys_to_logical_pkg(unsigned int pkg); #else @@ -136,6 +134,7 @@ extern int topology_phys_to_logical_pkg(unsigned int pkg); static inline int topology_update_package_map(unsigned int apicid, unsigned int cpu) { return 0; } static inline int topology_phys_to_logical_pkg(unsigned int pkg) { return 0; } +static inline int topology_max_smt_threads(void) { return 1; } #endif static inline void arch_fix_phys_package_id(int num, u32 slot) diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h new file mode 100644 index 000000000000..9217ab1f5bf6 --- /dev/null +++ b/arch/x86/include/asm/trace/fpu.h @@ -0,0 +1,119 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM x86_fpu + +#if !defined(_TRACE_FPU_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_FPU_H + +#include <linux/tracepoint.h> + +DECLARE_EVENT_CLASS(x86_fpu, + TP_PROTO(struct fpu *fpu), + TP_ARGS(fpu), + + TP_STRUCT__entry( + 
__field(struct fpu *, fpu) + __field(bool, fpregs_active) + __field(bool, fpstate_active) + __field(int, counter) + __field(u64, xfeatures) + __field(u64, xcomp_bv) + ), + + TP_fast_assign( + __entry->fpu = fpu; + __entry->fpregs_active = fpu->fpregs_active; + __entry->fpstate_active = fpu->fpstate_active; + __entry->counter = fpu->counter; + if (boot_cpu_has(X86_FEATURE_OSXSAVE)) { + __entry->xfeatures = fpu->state.xsave.header.xfeatures; + __entry->xcomp_bv = fpu->state.xsave.header.xcomp_bv; + } + ), + TP_printk("x86/fpu: %p fpregs_active: %d fpstate_active: %d counter: %d xfeatures: %llx xcomp_bv: %llx", + __entry->fpu, + __entry->fpregs_active, + __entry->fpstate_active, + __entry->counter, + __entry->xfeatures, + __entry->xcomp_bv + ) +); + +DEFINE_EVENT(x86_fpu, x86_fpu_state, + TP_PROTO(struct fpu *fpu), + TP_ARGS(fpu) +); + +DEFINE_EVENT(x86_fpu, x86_fpu_before_save, + TP_PROTO(struct fpu *fpu), + TP_ARGS(fpu) +); + +DEFINE_EVENT(x86_fpu, x86_fpu_after_save, + TP_PROTO(struct fpu *fpu), + TP_ARGS(fpu) +); + +DEFINE_EVENT(x86_fpu, x86_fpu_before_restore, + TP_PROTO(struct fpu *fpu), + TP_ARGS(fpu) +); + +DEFINE_EVENT(x86_fpu, x86_fpu_after_restore, + TP_PROTO(struct fpu *fpu), + TP_ARGS(fpu) +); + +DEFINE_EVENT(x86_fpu, x86_fpu_regs_activated, + TP_PROTO(struct fpu *fpu), + TP_ARGS(fpu) +); + +DEFINE_EVENT(x86_fpu, x86_fpu_regs_deactivated, + TP_PROTO(struct fpu *fpu), + TP_ARGS(fpu) +); + +DEFINE_EVENT(x86_fpu, x86_fpu_activate_state, + TP_PROTO(struct fpu *fpu), + TP_ARGS(fpu) +); + +DEFINE_EVENT(x86_fpu, x86_fpu_deactivate_state, + TP_PROTO(struct fpu *fpu), + TP_ARGS(fpu) +); + +DEFINE_EVENT(x86_fpu, x86_fpu_init_state, + TP_PROTO(struct fpu *fpu), + TP_ARGS(fpu) +); + +DEFINE_EVENT(x86_fpu, x86_fpu_dropped, + TP_PROTO(struct fpu *fpu), + TP_ARGS(fpu) +); + +DEFINE_EVENT(x86_fpu, x86_fpu_copy_src, + TP_PROTO(struct fpu *fpu), + TP_ARGS(fpu) +); + +DEFINE_EVENT(x86_fpu, x86_fpu_copy_dst, + TP_PROTO(struct fpu *fpu), + TP_ARGS(fpu) +); + 
+DEFINE_EVENT(x86_fpu, x86_fpu_xstate_check_failed, + TP_PROTO(struct fpu *fpu), + TP_ARGS(fpu) +); + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH asm/trace/ +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE fpu +#endif /* _TRACE_FPU_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 7428697c5b8d..33b6365c22fe 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -35,7 +35,7 @@ extern void tsc_init(void); extern void mark_tsc_unstable(char *reason); extern int unsynchronized_tsc(void); extern int check_tsc_unstable(void); -extern int check_tsc_disabled(void); +extern unsigned long native_calibrate_cpu(void); extern unsigned long native_calibrate_tsc(void); extern unsigned long long native_sched_clock_from_tsc(u64 tsc); @@ -52,7 +52,6 @@ extern int notsc_setup(char *); extern void tsc_save_sched_clock_state(void); extern void tsc_restore_sched_clock_state(void); -/* MSR based TSC calibration for Intel Atom SoC platforms */ -unsigned long try_msr_calibrate_tsc(void); +unsigned long cpu_khz_from_msr(void); #endif /* _ASM_X86_TSC_H */ diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 12f9653bde8d..c03bfb68c503 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -5,6 +5,7 @@ */ #include <linux/errno.h> #include <linux/compiler.h> +#include <linux/kasan-checks.h> #include <linux/thread_info.h> #include <linux/string.h> #include <asm/asm.h> @@ -28,12 +29,12 @@ #define USER_DS MAKE_MM_SEG(TASK_SIZE_MAX) #define get_ds() (KERNEL_DS) -#define get_fs() (current_thread_info()->addr_limit) -#define set_fs(x) (current_thread_info()->addr_limit = (x)) +#define get_fs() (current->thread.addr_limit) +#define set_fs(x) (current->thread.addr_limit = (x)) #define segment_eq(a, b) ((a).seg == (b).seg) -#define user_addr_max() (current_thread_info()->addr_limit.seg) +#define 
user_addr_max() (current->thread.addr_limit.seg) #define __addr_ok(addr) \ ((unsigned long __force)(addr) < user_addr_max()) @@ -341,7 +342,26 @@ do { \ } while (0) #ifdef CONFIG_X86_32 -#define __get_user_asm_u64(x, ptr, retval, errret) (x) = __get_user_bad() +#define __get_user_asm_u64(x, ptr, retval, errret) \ +({ \ + __typeof__(ptr) __ptr = (ptr); \ + asm volatile(ASM_STAC "\n" \ + "1: movl %2,%%eax\n" \ + "2: movl %3,%%edx\n" \ + "3: " ASM_CLAC "\n" \ + ".section .fixup,\"ax\"\n" \ + "4: mov %4,%0\n" \ + " xorl %%eax,%%eax\n" \ + " xorl %%edx,%%edx\n" \ + " jmp 3b\n" \ + ".previous\n" \ + _ASM_EXTABLE(1b, 4b) \ + _ASM_EXTABLE(2b, 4b) \ + : "=r" (retval), "=A"(x) \ + : "m" (__m(__ptr)), "m" __m(((u32 *)(__ptr)) + 1), \ + "i" (errret), "0" (retval)); \ +}) + #define __get_user_asm_ex_u64(x, ptr) (x) = __get_user_bad() #else #define __get_user_asm_u64(x, ptr, retval, errret) \ @@ -428,7 +448,7 @@ do { \ #define __get_user_nocheck(x, ptr, size) \ ({ \ int __gu_err; \ - unsigned long __gu_val; \ + __inttype(*(ptr)) __gu_val; \ __uaccess_begin(); \ __get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \ __uaccess_end(); \ @@ -467,13 +487,13 @@ struct __large_struct { unsigned long buf[100]; }; * uaccess_try and catch */ #define uaccess_try do { \ - current_thread_info()->uaccess_err = 0; \ + current->thread.uaccess_err = 0; \ __uaccess_begin(); \ barrier(); #define uaccess_catch(err) \ __uaccess_end(); \ - (err) |= (current_thread_info()->uaccess_err ? -EFAULT : 0); \ + (err) |= (current->thread.uaccess_err ? 
-EFAULT : 0); \ } while (0) /** @@ -721,6 +741,8 @@ copy_from_user(void *to, const void __user *from, unsigned long n) might_fault(); + kasan_check_write(to, n); + /* * While we would like to have the compiler do the checking for us * even in the non-constant size case, any false positives there are @@ -754,6 +776,8 @@ copy_to_user(void __user *to, const void *from, unsigned long n) { int sz = __compiletime_object_size(from); + kasan_check_read(from, n); + might_fault(); /* See the comment in copy_from_user() above. */ diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index 3fe0eac59462..4b32da24faaf 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -33,46 +33,10 @@ unsigned long __must_check __copy_from_user_ll_nocache_nozero * the specified block with access_ok() before calling this function. * The caller should also make sure he pins the user space address * so that we don't result in page fault and sleep. - * - * Here we special-case 1, 2 and 4-byte copy_*_user invocations. On a fault - * we return the initial request size (1, 2 or 4), as copy_*_user should do. - * If a store crosses a page boundary and gets a fault, the x86 will not write - * anything, so this is accurate. 
*/ - static __always_inline unsigned long __must_check __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) { - if (__builtin_constant_p(n)) { - unsigned long ret; - - switch (n) { - case 1: - __uaccess_begin(); - __put_user_size(*(u8 *)from, (u8 __user *)to, - 1, ret, 1); - __uaccess_end(); - return ret; - case 2: - __uaccess_begin(); - __put_user_size(*(u16 *)from, (u16 __user *)to, - 2, ret, 2); - __uaccess_end(); - return ret; - case 4: - __uaccess_begin(); - __put_user_size(*(u32 *)from, (u32 __user *)to, - 4, ret, 4); - __uaccess_end(); - return ret; - case 8: - __uaccess_begin(); - __put_user_size(*(u64 *)from, (u64 __user *)to, - 8, ret, 8); - __uaccess_end(); - return ret; - } - } return __copy_to_user_ll(to, from, n); } @@ -101,32 +65,6 @@ __copy_to_user(void __user *to, const void *from, unsigned long n) static __always_inline unsigned long __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n) { - /* Avoid zeroing the tail if the copy fails.. - * If 'n' is constant and 1, 2, or 4, we do still zero on a failure, - * but as the zeroing behaviour is only significant when n is not - * constant, that shouldn't be a problem. 
- */ - if (__builtin_constant_p(n)) { - unsigned long ret; - - switch (n) { - case 1: - __uaccess_begin(); - __get_user_size(*(u8 *)to, from, 1, ret, 1); - __uaccess_end(); - return ret; - case 2: - __uaccess_begin(); - __get_user_size(*(u16 *)to, from, 2, ret, 2); - __uaccess_end(); - return ret; - case 4: - __uaccess_begin(); - __get_user_size(*(u32 *)to, from, 4, ret, 4); - __uaccess_end(); - return ret; - } - } return __copy_from_user_ll_nozero(to, from, n); } diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 307698688fa1..2eac2aa3e37f 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -7,6 +7,7 @@ #include <linux/compiler.h> #include <linux/errno.h> #include <linux/lockdep.h> +#include <linux/kasan-checks.h> #include <asm/alternative.h> #include <asm/cpufeatures.h> #include <asm/page.h> @@ -109,6 +110,7 @@ static __always_inline __must_check int __copy_from_user(void *dst, const void __user *src, unsigned size) { might_fault(); + kasan_check_write(dst, size); return __copy_from_user_nocheck(dst, src, size); } @@ -175,6 +177,7 @@ static __always_inline __must_check int __copy_to_user(void __user *dst, const void *src, unsigned size) { might_fault(); + kasan_check_read(src, size); return __copy_to_user_nocheck(dst, src, size); } @@ -242,12 +245,14 @@ int __copy_in_user(void __user *dst, const void __user *src, unsigned size) static __must_check __always_inline int __copy_from_user_inatomic(void *dst, const void __user *src, unsigned size) { + kasan_check_write(dst, size); return __copy_from_user_nocheck(dst, src, size); } static __must_check __always_inline int __copy_to_user_inatomic(void __user *dst, const void *src, unsigned size) { + kasan_check_read(src, size); return __copy_to_user_nocheck(dst, src, size); } @@ -258,6 +263,7 @@ static inline int __copy_from_user_nocache(void *dst, const void __user *src, unsigned size) { might_fault(); + kasan_check_write(dst, size); return 
__copy_user_nocache(dst, src, size, 1); } @@ -265,6 +271,7 @@ static inline int __copy_from_user_inatomic_nocache(void *dst, const void __user *src, unsigned size) { + kasan_check_write(dst, size); return __copy_user_nocache(dst, src, size, 0); } diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h index 2b19caa4081c..32712a925f26 100644 --- a/arch/x86/include/asm/unistd.h +++ b/arch/x86/include/asm/unistd.h @@ -26,6 +26,8 @@ # define __ARCH_WANT_COMPAT_SYS_GETDENTS64 # define __ARCH_WANT_COMPAT_SYS_PREADV64 # define __ARCH_WANT_COMPAT_SYS_PWRITEV64 +# define __ARCH_WANT_COMPAT_SYS_PREADV64V2 +# define __ARCH_WANT_COMPAT_SYS_PWRITEV64V2 # endif diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 4dcdf74dfed8..6ba793178441 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -168,20 +168,22 @@ struct x86_legacy_devices { * struct x86_legacy_features - legacy x86 features * * @rtc: this device has a CMOS real-time clock present - * @ebda_search: it's safe to search for the EBDA signature in the hardware's - * low RAM + * @reserve_bios_regions: boot code will search for the EBDA address and the + * start of the 640k - 1M BIOS region. If false, the platform must + * ensure that its memory map correctly reserves sub-1MB regions as needed. * @devices: legacy x86 devices, refer to struct x86_legacy_devices * documentation for further details. */ struct x86_legacy_features { int rtc; - int ebda_search; + int reserve_bios_regions; struct x86_legacy_devices devices; }; /** * struct x86_platform_ops - platform specific runtime functions - * @calibrate_tsc: calibrate TSC + * @calibrate_cpu: calibrate CPU + * @calibrate_tsc: calibrate TSC, if different from CPU * @get_wallclock: get time from HW clock like RTC etc. * @set_wallclock: set time back to HW clock * @is_untracked_pat_range exclude from PAT logic @@ -200,6 +202,7 @@ struct x86_legacy_features { * semantics. 
*/ struct x86_platform_ops { + unsigned long (*calibrate_cpu)(void); unsigned long (*calibrate_tsc)(void); void (*get_wallclock)(struct timespec *ts); int (*set_wallclock)(const struct timespec *ts); diff --git a/arch/x86/include/asm/xen/cpuid.h b/arch/x86/include/asm/xen/cpuid.h index 0d809e9fc975..3bdd10d71223 100644 --- a/arch/x86/include/asm/xen/cpuid.h +++ b/arch/x86/include/asm/xen/cpuid.h @@ -76,15 +76,18 @@ /* * Leaf 5 (0x40000x04) * HVM-specific features + * EAX: Features + * EBX: vcpu id (iff EAX has XEN_HVM_CPUID_VCPU_ID_PRESENT flag) */ -/* EAX Features */ /* Virtualized APIC registers */ #define XEN_HVM_CPUID_APIC_ACCESS_VIRT (1u << 0) /* Virtualized x2APIC accesses */ #define XEN_HVM_CPUID_X2APIC_VIRT (1u << 1) /* Memory mapped from other domains has valid IOMMU entries */ #define XEN_HVM_CPUID_IOMMU_MAPPINGS (1u << 2) +/* vcpu id is present in EBX */ +#define XEN_HVM_CPUID_VCPU_ID_PRESENT (1u << 3) #define XEN_CPUID_MAX_NUM_LEAVES 4 diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index cd54147cb365..739c0c594022 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -216,9 +216,9 @@ struct kvm_cpuid_entry2 { __u32 padding[3]; }; -#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX BIT(0) -#define KVM_CPUID_FLAG_STATEFUL_FUNC BIT(1) -#define KVM_CPUID_FLAG_STATE_READ_NEXT BIT(2) +#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX (1 << 0) +#define KVM_CPUID_FLAG_STATEFUL_FUNC (1 << 1) +#define KVM_CPUID_FLAG_STATE_READ_NEXT (1 << 2) /* for KVM_SET_CPUID2 */ struct kvm_cpuid2 { diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h index 8a4add8e4639..3725e145aa58 100644 --- a/arch/x86/include/uapi/asm/svm.h +++ b/arch/x86/include/uapi/asm/svm.h @@ -2,10 +2,12 @@ #define _UAPI__SVM_H #define SVM_EXIT_READ_CR0 0x000 +#define SVM_EXIT_READ_CR2 0x002 #define SVM_EXIT_READ_CR3 0x003 #define SVM_EXIT_READ_CR4 0x004 #define SVM_EXIT_READ_CR8 0x008 #define SVM_EXIT_WRITE_CR0 0x010 +#define 
SVM_EXIT_WRITE_CR2 0x012 #define SVM_EXIT_WRITE_CR3 0x013 #define SVM_EXIT_WRITE_CR4 0x014 #define SVM_EXIT_WRITE_CR8 0x018 @@ -73,15 +75,19 @@ #define SVM_EXIT_MWAIT_COND 0x08c #define SVM_EXIT_XSETBV 0x08d #define SVM_EXIT_NPF 0x400 +#define SVM_EXIT_AVIC_INCOMPLETE_IPI 0x401 +#define SVM_EXIT_AVIC_UNACCELERATED_ACCESS 0x402 #define SVM_EXIT_ERR -1 #define SVM_EXIT_REASONS \ { SVM_EXIT_READ_CR0, "read_cr0" }, \ + { SVM_EXIT_READ_CR2, "read_cr2" }, \ { SVM_EXIT_READ_CR3, "read_cr3" }, \ { SVM_EXIT_READ_CR4, "read_cr4" }, \ { SVM_EXIT_READ_CR8, "read_cr8" }, \ { SVM_EXIT_WRITE_CR0, "write_cr0" }, \ + { SVM_EXIT_WRITE_CR2, "write_cr2" }, \ { SVM_EXIT_WRITE_CR3, "write_cr3" }, \ { SVM_EXIT_WRITE_CR4, "write_cr4" }, \ { SVM_EXIT_WRITE_CR8, "write_cr8" }, \ @@ -89,32 +95,66 @@ { SVM_EXIT_READ_DR1, "read_dr1" }, \ { SVM_EXIT_READ_DR2, "read_dr2" }, \ { SVM_EXIT_READ_DR3, "read_dr3" }, \ + { SVM_EXIT_READ_DR4, "read_dr4" }, \ + { SVM_EXIT_READ_DR5, "read_dr5" }, \ + { SVM_EXIT_READ_DR6, "read_dr6" }, \ + { SVM_EXIT_READ_DR7, "read_dr7" }, \ { SVM_EXIT_WRITE_DR0, "write_dr0" }, \ { SVM_EXIT_WRITE_DR1, "write_dr1" }, \ { SVM_EXIT_WRITE_DR2, "write_dr2" }, \ { SVM_EXIT_WRITE_DR3, "write_dr3" }, \ + { SVM_EXIT_WRITE_DR4, "write_dr4" }, \ { SVM_EXIT_WRITE_DR5, "write_dr5" }, \ + { SVM_EXIT_WRITE_DR6, "write_dr6" }, \ { SVM_EXIT_WRITE_DR7, "write_dr7" }, \ + { SVM_EXIT_EXCP_BASE + DE_VECTOR, "DE excp" }, \ { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" }, \ { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" }, \ + { SVM_EXIT_EXCP_BASE + OF_VECTOR, "OF excp" }, \ + { SVM_EXIT_EXCP_BASE + BR_VECTOR, "BR excp" }, \ { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" }, \ - { SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" }, \ { SVM_EXIT_EXCP_BASE + NM_VECTOR, "NM excp" }, \ + { SVM_EXIT_EXCP_BASE + DF_VECTOR, "DF excp" }, \ + { SVM_EXIT_EXCP_BASE + TS_VECTOR, "TS excp" }, \ + { SVM_EXIT_EXCP_BASE + NP_VECTOR, "NP excp" }, \ + { SVM_EXIT_EXCP_BASE + SS_VECTOR, "SS excp" }, \ + { SVM_EXIT_EXCP_BASE + 
GP_VECTOR, "GP excp" }, \ + { SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" }, \ + { SVM_EXIT_EXCP_BASE + MF_VECTOR, "MF excp" }, \ { SVM_EXIT_EXCP_BASE + AC_VECTOR, "AC excp" }, \ { SVM_EXIT_EXCP_BASE + MC_VECTOR, "MC excp" }, \ + { SVM_EXIT_EXCP_BASE + XM_VECTOR, "XF excp" }, \ { SVM_EXIT_INTR, "interrupt" }, \ { SVM_EXIT_NMI, "nmi" }, \ { SVM_EXIT_SMI, "smi" }, \ { SVM_EXIT_INIT, "init" }, \ { SVM_EXIT_VINTR, "vintr" }, \ + { SVM_EXIT_CR0_SEL_WRITE, "cr0_sel_write" }, \ + { SVM_EXIT_IDTR_READ, "read_idtr" }, \ + { SVM_EXIT_GDTR_READ, "read_gdtr" }, \ + { SVM_EXIT_LDTR_READ, "read_ldtr" }, \ + { SVM_EXIT_TR_READ, "read_rt" }, \ + { SVM_EXIT_IDTR_WRITE, "write_idtr" }, \ + { SVM_EXIT_GDTR_WRITE, "write_gdtr" }, \ + { SVM_EXIT_LDTR_WRITE, "write_ldtr" }, \ + { SVM_EXIT_TR_WRITE, "write_rt" }, \ + { SVM_EXIT_RDTSC, "rdtsc" }, \ + { SVM_EXIT_RDPMC, "rdpmc" }, \ + { SVM_EXIT_PUSHF, "pushf" }, \ + { SVM_EXIT_POPF, "popf" }, \ { SVM_EXIT_CPUID, "cpuid" }, \ + { SVM_EXIT_RSM, "rsm" }, \ + { SVM_EXIT_IRET, "iret" }, \ + { SVM_EXIT_SWINT, "swint" }, \ { SVM_EXIT_INVD, "invd" }, \ + { SVM_EXIT_PAUSE, "pause" }, \ { SVM_EXIT_HLT, "hlt" }, \ { SVM_EXIT_INVLPG, "invlpg" }, \ { SVM_EXIT_INVLPGA, "invlpga" }, \ { SVM_EXIT_IOIO, "io" }, \ { SVM_EXIT_MSR, "msr" }, \ { SVM_EXIT_TASK_SWITCH, "task_switch" }, \ + { SVM_EXIT_FERR_FREEZE, "ferr_freeze" }, \ { SVM_EXIT_SHUTDOWN, "shutdown" }, \ { SVM_EXIT_VMRUN, "vmrun" }, \ { SVM_EXIT_VMMCALL, "hypercall" }, \ @@ -123,11 +163,16 @@ { SVM_EXIT_STGI, "stgi" }, \ { SVM_EXIT_CLGI, "clgi" }, \ { SVM_EXIT_SKINIT, "skinit" }, \ + { SVM_EXIT_RDTSCP, "rdtscp" }, \ + { SVM_EXIT_ICEBP, "icebp" }, \ { SVM_EXIT_WBINVD, "wbinvd" }, \ { SVM_EXIT_MONITOR, "monitor" }, \ { SVM_EXIT_MWAIT, "mwait" }, \ { SVM_EXIT_XSETBV, "xsetbv" }, \ - { SVM_EXIT_NPF, "npf" } + { SVM_EXIT_NPF, "npf" }, \ + { SVM_EXIT_AVIC_INCOMPLETE_IPI, "avic_incomplete_ipi" }, \ + { SVM_EXIT_AVIC_UNACCELERATED_ACCESS, "avic_unaccelerated_access" }, \ + { SVM_EXIT_ERR, 
"invalid_guest_state" } #endif /* _UAPI__SVM_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 9abf8551c7e4..0503f5bfb18d 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -83,7 +83,6 @@ obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-y += apic/ obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o -obj-$(CONFIG_LIVEPATCH) += livepatch.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o obj-$(CONFIG_X86_TSC) += trace_clock.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 9414f84584e4..6738e5c82cca 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -161,13 +161,15 @@ static int __init acpi_parse_madt(struct acpi_table_header *table) /** * acpi_register_lapic - register a local apic and generates a logic cpu number * @id: local apic id to register + * @acpiid: ACPI id to register * @enabled: this cpu is enabled or not * * Returns the logic cpu number which maps to the local apic */ -static int acpi_register_lapic(int id, u8 enabled) +static int acpi_register_lapic(int id, u32 acpiid, u8 enabled) { unsigned int ver = 0; + int cpu; if (id >= MAX_LOCAL_APIC) { printk(KERN_INFO PREFIX "skipped apicid that is too big\n"); @@ -182,7 +184,11 @@ static int acpi_register_lapic(int id, u8 enabled) if (boot_cpu_physical_apicid != -1U) ver = apic_version[boot_cpu_physical_apicid]; - return generic_processor_info(id, ver); + cpu = generic_processor_info(id, ver); + if (cpu >= 0) + early_per_cpu(x86_cpu_to_acpiid, cpu) = acpiid; + + return cpu; } static int __init @@ -212,7 +218,7 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end) if (!apic->apic_id_valid(apic_id) && enabled) printk(KERN_WARNING PREFIX "x2apic entry ignored\n"); else - acpi_register_lapic(apic_id, enabled); + acpi_register_lapic(apic_id, processor->uid, enabled); #else printk(KERN_WARNING PREFIX "x2apic 
entry ignored\n"); #endif @@ -240,6 +246,7 @@ acpi_parse_lapic(struct acpi_subtable_header * header, const unsigned long end) * when we use CPU hotplug. */ acpi_register_lapic(processor->id, /* APIC ID */ + processor->processor_id, /* ACPI ID */ processor->lapic_flags & ACPI_MADT_ENABLED); return 0; @@ -258,6 +265,7 @@ acpi_parse_sapic(struct acpi_subtable_header *header, const unsigned long end) acpi_table_print_madt_entry(header); acpi_register_lapic((processor->id << 8) | processor->eid,/* APIC ID */ + processor->processor_id, /* ACPI ID */ processor->lapic_flags & ACPI_MADT_ENABLED); return 0; @@ -714,7 +722,7 @@ int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, int *pcpu) { int cpu; - cpu = acpi_register_lapic(physid, ACPI_MADT_ENABLED); + cpu = acpi_register_lapic(physid, U32_MAX, ACPI_MADT_ENABLED); if (cpu < 0) { pr_info(PREFIX "Unable to map lapic to logical cpu number\n"); return cpu; diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index a147e676fc7b..e991d5c8bb3a 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -71,8 +71,8 @@ int amd_cache_northbridges(void) while ((misc = next_northbridge(misc, amd_nb_misc_ids)) != NULL) i++; - if (i == 0) - return 0; + if (!i) + return -ENODEV; nb = kzalloc(i * sizeof(struct amd_northbridge), GFP_KERNEL); if (!nb) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 60078a67d7e3..ac8d8ad8b009 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -92,8 +92,10 @@ static int apic_extnmi = APIC_EXTNMI_BSP; */ DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID); DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid, BAD_APICID); +DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid, U32_MAX); EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid); EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); +EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_acpiid); #ifdef CONFIG_X86_32 @@ -2045,7 +2047,7 @@ int 
generic_processor_info(int apicid, int version) int thiscpu = max + disabled_cpus - 1; pr_warning( - "ACPI: NR_CPUS/possible_cpus limit of %i almost" + "APIC: NR_CPUS/possible_cpus limit of %i almost" " reached. Keeping one slot for boot cpu." " Processor %d/0x%x ignored.\n", max, thiscpu, apicid); @@ -2057,7 +2059,7 @@ int generic_processor_info(int apicid, int version) int thiscpu = max + disabled_cpus; pr_warning( - "ACPI: NR_CPUS/possible_cpus limit of %i reached." + "APIC: NR_CPUS/possible_cpus limit of %i reached." " Processor %d/0x%x ignored.\n", max, thiscpu, apicid); disabled_cpus++; @@ -2085,7 +2087,7 @@ int generic_processor_info(int apicid, int version) if (topology_update_package_map(apicid, cpu) < 0) { int thiscpu = max + disabled_cpus; - pr_warning("ACPI: Package limit reached. Processor %d/0x%x ignored.\n", + pr_warning("APIC: Package limit reached. Processor %d/0x%x ignored.\n", thiscpu, apicid); disabled_cpus++; return -ENOSPC; diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 76f89e2b245a..048747778d37 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -181,7 +181,6 @@ static struct apic apic_flat = { .get_apic_id = flat_get_apic_id, .set_apic_id = set_apic_id, - .apic_id_mask = 0xFFu << 24, .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, @@ -278,7 +277,6 @@ static struct apic apic_physflat = { .get_apic_id = flat_get_apic_id, .set_apic_id = set_apic_id, - .apic_id_mask = 0xFFu << 24, .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 13d19ed58514..2cebf59092d8 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -141,7 +141,6 @@ struct apic apic_noop = { .get_apic_id = noop_get_apic_id, .set_apic_id = NULL, - .apic_id_mask = 0x0F << 24, .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, diff --git 
a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index ab5c2c685a3c..714d4fda0d52 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -269,7 +269,6 @@ static const struct apic apic_numachip1 __refconst = { .get_apic_id = numachip1_get_apic_id, .set_apic_id = numachip1_set_apic_id, - .apic_id_mask = 0xffU << 24, .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, @@ -321,7 +320,6 @@ static const struct apic apic_numachip2 __refconst = { .get_apic_id = numachip2_get_apic_id, .set_apic_id = numachip2_set_apic_id, - .apic_id_mask = 0xffU << 24, .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index cf9bd896c12d..06dbaa458bfe 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -171,7 +171,6 @@ static struct apic apic_bigsmp = { .get_apic_id = bigsmp_get_apic_id, .set_apic_id = NULL, - .apic_id_mask = 0xFF << 24, .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 045e424fb368..7788ce643bf4 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -18,7 +18,6 @@ #include <linux/nmi.h> #include <linux/module.h> #include <linux/delay.h> -#include <linux/seq_buf.h> #ifdef CONFIG_HARDLOCKUP_DETECTOR u64 hw_nmi_get_sample_period(int watchdog_thresh) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 84e33ff5a6d5..f072b9572634 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -981,7 +981,7 @@ static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi, return __irq_domain_alloc_irqs(domain, irq, 1, ioapic_alloc_attr_node(info), - info, legacy); + info, legacy, NULL); } /* @@ -1014,7 +1014,8 @@ static int alloc_isa_irq_from_domain(struct irq_domain *domain, info->ioapic_pin)) return 
-ENOMEM; } else { - irq = __irq_domain_alloc_irqs(domain, irq, 1, node, info, true); + irq = __irq_domain_alloc_irqs(domain, irq, 1, node, info, true, + NULL); if (irq >= 0) { irq_data = irq_domain_get_irq_data(domain, irq); data = irq_data->chip_data; @@ -2567,29 +2568,25 @@ static struct resource * __init ioapic_setup_resources(void) unsigned long n; struct resource *res; char *mem; - int i, num = 0; + int i; - for_each_ioapic(i) - num++; - if (num == 0) + if (nr_ioapics == 0) return NULL; n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource); - n *= num; + n *= nr_ioapics; mem = alloc_bootmem(n); res = (void *)mem; - mem += sizeof(struct resource) * num; + mem += sizeof(struct resource) * nr_ioapics; - num = 0; for_each_ioapic(i) { - res[num].name = mem; - res[num].flags = IORESOURCE_MEM | IORESOURCE_BUSY; + res[i].name = mem; + res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i); mem += IOAPIC_RESOURCE_NAME_SIZE; - num++; - ioapics[i].iomem_res = res; + ioapics[i].iomem_res = &res[i]; } ioapic_resources = res; diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index f316e34abb42..93edfa01b408 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -101,7 +101,6 @@ static struct apic apic_default = { .get_apic_id = default_get_apic_id, .set_apic_id = NULL, - .apic_id_mask = 0x0F << 24, .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index aca8b75c1552..24170d0809ba 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -270,7 +270,6 @@ static struct apic apic_x2apic_cluster = { .get_apic_id = x2apic_get_apic_id, .set_apic_id = x2apic_set_apic_id, - .apic_id_mask = 0xFFFFFFFFu, .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and, diff --git a/arch/x86/kernel/apic/x2apic_phys.c 
b/arch/x86/kernel/apic/x2apic_phys.c index a1242e2c12e6..4f13f54f1b1f 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -126,7 +126,6 @@ static struct apic apic_x2apic_phys = { .get_apic_id = x2apic_get_apic_id, .set_apic_id = x2apic_set_apic_id, - .apic_id_mask = 0xFFFFFFFFu, .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 29003154fafd..64dd38fbf218 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -582,7 +582,6 @@ static struct apic __refdata apic_x2apic_uv_x = { .get_apic_id = x2apic_get_apic_id, .set_apic_id = set_apic_id, - .apic_id_mask = 0xFFFFFFFFu, .cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and, @@ -919,7 +918,7 @@ static void uv_heartbeat(unsigned long ignored) uv_set_scir_bits(bits); /* enable next timer period */ - mod_timer_pinned(timer, jiffies + SCIR_CPU_HB_INTERVAL); + mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL); } static void uv_heartbeat_enable(int cpu) @@ -928,7 +927,7 @@ static void uv_heartbeat_enable(int cpu) struct timer_list *timer = &uv_cpu_scir_info(cpu)->timer; uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY); - setup_timer(timer, uv_heartbeat, cpu); + setup_pinned_timer(timer, uv_heartbeat, cpu); timer->expires = jiffies + SCIR_CPU_HB_INTERVAL; add_timer_on(timer, cpu); uv_cpu_scir_info(cpu)->enabled = 1; diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 674134e9f5e5..2bd5c6ff7ee7 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -31,7 +31,9 @@ void common(void) { BLANK(); OFFSET(TI_flags, thread_info, flags); OFFSET(TI_status, thread_info, status); - OFFSET(TI_addr_limit, thread_info, addr_limit); + + BLANK(); + OFFSET(TASK_addr_limit, task_struct, thread.addr_limit); BLANK(); OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); diff --git a/arch/x86/kernel/cpu/amd.c 
b/arch/x86/kernel/cpu/amd.c index c343a54bed39..f5c69d8974e1 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -674,14 +674,14 @@ static void init_amd_bd(struct cpuinfo_x86 *c) u64 value; /* re-enable TopologyExtensions if switched off by BIOS */ - if ((c->x86_model >= 0x10) && (c->x86_model <= 0x1f) && + if ((c->x86_model >= 0x10) && (c->x86_model <= 0x6f) && !cpu_has(c, X86_FEATURE_TOPOEXT)) { if (msr_set_bit(0xc0011005, 54) > 0) { rdmsrl(0xc0011005, value); if (value & BIT_64(54)) { set_cpu_cap(c, X86_FEATURE_TOPOEXT); - pr_info(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n"); + pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n"); } } } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 6ef6ed9ccca6..d22a7b9c4f0e 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -37,6 +37,7 @@ #include <asm/mtrr.h> #include <linux/numa.h> #include <asm/asm.h> +#include <asm/bugs.h> #include <asm/cpu.h> #include <asm/mce.h> #include <asm/msr.h> @@ -270,6 +271,8 @@ static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c) static __init int setup_disable_smep(char *arg) { setup_clear_cpu_cap(X86_FEATURE_SMEP); + /* Check for things that depend on SMEP being enabled: */ + check_mpx_erratum(&boot_cpu_data); return 1; } __setup("nosmep", setup_disable_smep); @@ -310,6 +313,10 @@ static bool pku_disabled; static __always_inline void setup_pku(struct cpuinfo_x86 *c) { + /* check the boot processor, plus compile options for PKU: */ + if (!cpu_feature_enabled(X86_FEATURE_PKU)) + return; + /* checks the actual processor's cpuid bits: */ if (!cpu_has(c, X86_FEATURE_PKU)) return; if (pku_disabled) @@ -1445,7 +1452,7 @@ void cpu_init(void) struct task_struct *me; struct tss_struct *t; unsigned long v; - int cpu = stack_smp_processor_id(); + int cpu = raw_smp_processor_id(); int i; wait_for_master_cpu(cpu); diff --git a/arch/x86/kernel/cpu/intel.c 
b/arch/x86/kernel/cpu/intel.c index 8dae51fd3db1..c1a89bc026ac 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -25,6 +25,41 @@ #include <asm/apic.h> #endif +/* + * Just in case our CPU detection goes bad, or you have a weird system, + * allow a way to override the automatic disabling of MPX. + */ +static int forcempx; + +static int __init forcempx_setup(char *__unused) +{ + forcempx = 1; + + return 1; +} +__setup("intel-skd-046-workaround=disable", forcempx_setup); + +void check_mpx_erratum(struct cpuinfo_x86 *c) +{ + if (forcempx) + return; + /* + * Turn off the MPX feature on CPUs where SMEP is not + * available or disabled. + * + * Works around Intel Erratum SKD046: "Branch Instructions + * May Initialize MPX Bound Registers Incorrectly". + * + * This might falsely disable MPX on systems without + * SMEP, like Atom processors without SMEP. But there + * is no such hardware known at the moment. + */ + if (cpu_has(c, X86_FEATURE_MPX) && !cpu_has(c, X86_FEATURE_SMEP)) { + setup_clear_cpu_cap(X86_FEATURE_MPX); + pr_warn("x86/mpx: Disabling MPX since SMEP not present\n"); + } +} + static void early_init_intel(struct cpuinfo_x86 *c) { u64 misc_enable; @@ -173,6 +208,8 @@ static void early_init_intel(struct cpuinfo_x86 *c) if (edx & (1U << 28)) c->x86_coreid_bits = get_count_order((ebx >> 16) & 0xff); } + + check_mpx_erratum(c); } #ifdef CONFIG_X86_32 @@ -263,15 +300,14 @@ static void intel_workarounds(struct cpuinfo_x86 *c) } /* - * P4 Xeon errata 037 workaround. + * P4 Xeon erratum 037 workaround. * Hardware prefetcher may cause stale data to be loaded into the cache. 
*/ if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) { if (msr_set_bit(MSR_IA32_MISC_ENABLE, - MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT) - > 0) { + MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT) > 0) { pr_info("CPU: C0 stepping P4 Xeon detected.\n"); - pr_info("CPU: Disabling hardware prefetching (Errata 037)\n"); + pr_info("CPU: Disabling hardware prefetching (Erratum 037)\n"); } } diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c index 34c89a3e8260..83f1a98d37db 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-apei.c +++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c @@ -46,7 +46,7 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) return; mce_setup(&m); - m.bank = 1; + m.bank = -1; /* Fake a memory read error with unknown channel */ m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 92e5e37d97bf..79d8ec849468 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -425,7 +425,7 @@ static u64 mce_rdmsrl(u32 msr) } if (rdmsrl_safe(msr, &v)) { - WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr); + WARN_ONCE(1, "mce: Unable to read MSR 0x%x!\n", msr); /* * Return zero in case the access faulted. 
This should * not happen normally but can happen if the CPU does @@ -1309,7 +1309,7 @@ static void __restart_timer(struct timer_list *t, unsigned long interval) if (timer_pending(t)) { if (time_before(when, t->expires)) - mod_timer_pinned(t, when); + mod_timer(t, when); } else { t->expires = round_jiffies(when); add_timer_on(t, smp_processor_id()); @@ -1735,7 +1735,7 @@ static void __mcheck_cpu_init_timer(void) struct timer_list *t = this_cpu_ptr(&mce_timer); unsigned int cpu = smp_processor_id(); - setup_timer(t, mce_timer_fn, cpu); + setup_pinned_timer(t, mce_timer_fn, cpu); mce_start_timer(cpu, t); } diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 10b0661651e0..7b7f3be783d4 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -93,7 +93,7 @@ const char * const amd_df_mcablock_names[] = { EXPORT_SYMBOL_GPL(amd_df_mcablock_names); static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); -static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ +static DEFINE_PER_CPU(unsigned int, bank_map); /* see which banks are on */ static void amd_threshold_interrupt(void); static void amd_deferred_error_interrupt(void); diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c index f6f50c4ceaec..cfa97ff67bda 100644 --- a/arch/x86/kernel/cpu/rdrand.c +++ b/arch/x86/kernel/cpu/rdrand.c @@ -39,9 +39,9 @@ __setup("nordrand", x86_rdrand_setup); */ #define SANITY_CHECK_LOOPS 8 +#ifdef CONFIG_ARCH_RANDOM void x86_init_rdrand(struct cpuinfo_x86 *c) { -#ifdef CONFIG_ARCH_RANDOM unsigned long tmp; int i; @@ -55,5 +55,5 @@ void x86_init_rdrand(struct cpuinfo_x86 *c) return; } } -#endif } +#endif diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 2bb25c3fe2e8..92e8f0a7159c 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -42,16 +42,14 @@ void printk_address(unsigned long address) static void 
print_ftrace_graph_addr(unsigned long addr, void *data, const struct stacktrace_ops *ops, - struct thread_info *tinfo, int *graph) + struct task_struct *task, int *graph) { - struct task_struct *task; unsigned long ret_addr; int index; if (addr != (unsigned long)return_to_handler) return; - task = tinfo->task; index = task->curr_ret_stack; if (!task->ret_stack || index < *graph) @@ -68,7 +66,7 @@ print_ftrace_graph_addr(unsigned long addr, void *data, static inline void print_ftrace_graph_addr(unsigned long addr, void *data, const struct stacktrace_ops *ops, - struct thread_info *tinfo, int *graph) + struct task_struct *task, int *graph) { } #endif @@ -79,28 +77,36 @@ print_ftrace_graph_addr(unsigned long addr, void *data, * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack */ -static inline int valid_stack_ptr(struct thread_info *tinfo, +static inline int valid_stack_ptr(struct task_struct *task, void *p, unsigned int size, void *end) { - void *t = tinfo; + void *t = task_stack_page(task); if (end) { if (p < end && p >= (end-THREAD_SIZE)) return 1; else return 0; } - return p > t && p < t + THREAD_SIZE - size; + return p >= t && p < t + THREAD_SIZE - size; } unsigned long -print_context_stack(struct thread_info *tinfo, +print_context_stack(struct task_struct *task, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data, unsigned long *end, int *graph) { struct stack_frame *frame = (struct stack_frame *)bp; - while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) { + /* + * If we overflowed the stack into a guard page, jump back to the + * bottom of the usable stack. 
+ */ + if ((unsigned long)task_stack_page(task) - (unsigned long)stack < + PAGE_SIZE) + stack = (unsigned long *)task_stack_page(task); + + while (valid_stack_ptr(task, stack, sizeof(*stack), end)) { unsigned long addr; addr = *stack; @@ -112,7 +118,7 @@ print_context_stack(struct thread_info *tinfo, } else { ops->address(data, addr, 0); } - print_ftrace_graph_addr(addr, data, ops, tinfo, graph); + print_ftrace_graph_addr(addr, data, ops, task, graph); } stack++; } @@ -121,7 +127,7 @@ print_context_stack(struct thread_info *tinfo, EXPORT_SYMBOL_GPL(print_context_stack); unsigned long -print_context_stack_bp(struct thread_info *tinfo, +print_context_stack_bp(struct task_struct *task, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data, unsigned long *end, int *graph) @@ -129,7 +135,7 @@ print_context_stack_bp(struct thread_info *tinfo, struct stack_frame *frame = (struct stack_frame *)bp; unsigned long *ret_addr = &frame->return_address; - while (valid_stack_ptr(tinfo, ret_addr, sizeof(*ret_addr), end)) { + while (valid_stack_ptr(task, ret_addr, sizeof(*ret_addr), end)) { unsigned long addr = *ret_addr; if (!__kernel_text_address(addr)) @@ -139,7 +145,7 @@ print_context_stack_bp(struct thread_info *tinfo, break; frame = frame->next_frame; ret_addr = &frame->return_address; - print_ftrace_graph_addr(addr, data, ops, tinfo, graph); + print_ftrace_graph_addr(addr, data, ops, task, graph); } return (unsigned long)frame; @@ -199,6 +205,11 @@ void show_stack(struct task_struct *task, unsigned long *sp) show_stack_log_lvl(task, NULL, sp, bp, ""); } +void show_stack_regs(struct pt_regs *regs) +{ + show_stack_log_lvl(current, regs, (unsigned long *)regs->sp, regs->bp, ""); +} + static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED; static int die_owner = -1; static unsigned int die_nest_count; @@ -228,6 +239,8 @@ unsigned long oops_begin(void) EXPORT_SYMBOL_GPL(oops_begin); NOKPROBE_SYMBOL(oops_begin); +void __noreturn 
rewind_stack_do_exit(int signr); + void oops_end(unsigned long flags, struct pt_regs *regs, int signr) { if (regs && kexec_should_crash(current)) @@ -249,7 +262,13 @@ void oops_end(unsigned long flags, struct pt_regs *regs, int signr) panic("Fatal exception in interrupt"); if (panic_on_oops) panic("Fatal exception"); - do_exit(signr); + + /* + * We're not going to return, but we might be on an IST stack or + * have very little stack space left. Rewind the stack and kill + * the task. + */ + rewind_stack_do_exit(signr); } NOKPROBE_SYMBOL(oops_end); diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 464ffd69b92e..948d77da3881 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -61,15 +61,13 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, bp = stack_frame(task, regs); for (;;) { - struct thread_info *context; void *end_stack; end_stack = is_hardirq_stack(stack, cpu); if (!end_stack) end_stack = is_softirq_stack(stack, cpu); - context = task_thread_info(task); - bp = ops->walk_stack(context, stack, bp, ops, data, + bp = ops->walk_stack(task, stack, bp, ops, data, end_stack, &graph); /* Stop if not on irq stack */ @@ -98,7 +96,9 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, int i; if (sp == NULL) { - if (task) + if (regs) + sp = (unsigned long *)regs->sp; + else if (task) sp = (unsigned long *)task->thread.sp; else sp = (unsigned long *)&sp; diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 5f1c6266eb30..6dede08dd98b 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -153,7 +153,6 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, const struct stacktrace_ops *ops, void *data) { const unsigned cpu = get_cpu(); - struct thread_info *tinfo; unsigned long *irq_stack = (unsigned long *)per_cpu(irq_stack_ptr, cpu); unsigned long dummy; unsigned used = 0; @@ -179,7 +178,6 @@ void 
dump_trace(struct task_struct *task, struct pt_regs *regs, * current stack address. If the stacks consist of nested * exceptions */ - tinfo = task_thread_info(task); while (!done) { unsigned long *stack_end; enum stack_type stype; @@ -202,7 +200,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, if (ops->stack(data, id) < 0) break; - bp = ops->walk_stack(tinfo, stack, bp, ops, + bp = ops->walk_stack(task, stack, bp, ops, data, stack_end, &graph); ops->stack(data, "<EOE>"); /* @@ -218,7 +216,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, if (ops->stack(data, "IRQ") < 0) break; - bp = ops->walk_stack(tinfo, stack, bp, + bp = ops->walk_stack(task, stack, bp, ops, data, stack_end, &graph); /* * We link to the next stack (which would be @@ -240,7 +238,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, /* * This handles the process stack: */ - bp = ops->walk_stack(tinfo, stack, bp, ops, data, NULL, &graph); + bp = ops->walk_stack(task, stack, bp, ops, data, NULL, &graph); put_cpu(); } EXPORT_SYMBOL(dump_trace); @@ -266,7 +264,9 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, * back trace for this cpu: */ if (sp == NULL) { - if (task) + if (regs) + sp = (unsigned long *)regs->sp; + else if (task) sp = (unsigned long *)task->thread.sp; else sp = (unsigned long *)&sp; @@ -274,6 +274,8 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, stack = sp; for (i = 0; i < kstack_depth_to_print; i++) { + unsigned long word; + if (stack >= irq_stack && stack <= irq_stack_end) { if (stack == irq_stack_end) { stack = (unsigned long *) (irq_stack_end[-1]); @@ -283,12 +285,18 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, if (kstack_end(stack)) break; } + + if (probe_kernel_address(stack, word)) + break; + if ((i % STACKSLOTS_PER_LINE) == 0) { if (i != 0) pr_cont("\n"); - printk("%s %016lx", log_lvl, *stack++); + printk("%s %016lx", log_lvl, word); } else - pr_cont(" 
%016lx", *stack++); + pr_cont(" %016lx", word); + + stack++; touch_nmi_watchdog(); } preempt_enable(); diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index bca14c899137..57b71373bae3 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -11,7 +11,11 @@ #include <linux/pci.h> #include <linux/acpi.h> +#include <linux/delay.h> +#include <linux/dmi.h> #include <linux/pci_ids.h> +#include <linux/bcma/bcma.h> +#include <linux/bcma/bcma_regs.h> #include <drm/i915_drm.h> #include <asm/pci-direct.h> #include <asm/dma.h> @@ -21,6 +25,9 @@ #include <asm/iommu.h> #include <asm/gart.h> #include <asm/irq_remapping.h> +#include <asm/early_ioremap.h> + +#define dev_err(msg) pr_err("pci 0000:%02x:%02x.%d: %s", bus, slot, func, msg) static void __init fix_hypertransport_config(int num, int slot, int func) { @@ -76,6 +83,13 @@ static void __init nvidia_bugs(int num, int slot, int func) #ifdef CONFIG_ACPI #ifdef CONFIG_X86_IO_APIC /* + * Only applies to Nvidia root ports (bus 0) and not to + * Nvidia graphics cards with PCI ports on secondary buses. + */ + if (num) + return; + + /* * All timer overrides on Nvidia are * wrong unless HPET is enabled. * Unfortunately that's not true on many Asus boards. 
@@ -590,6 +604,61 @@ static void __init force_disable_hpet(int num, int slot, int func) #endif } +#define BCM4331_MMIO_SIZE 16384 +#define BCM4331_PM_CAP 0x40 +#define bcma_aread32(reg) ioread32(mmio + 1 * BCMA_CORE_SIZE + reg) +#define bcma_awrite32(reg, val) iowrite32(val, mmio + 1 * BCMA_CORE_SIZE + reg) + +static void __init apple_airport_reset(int bus, int slot, int func) +{ + void __iomem *mmio; + u16 pmcsr; + u64 addr; + int i; + + if (!dmi_match(DMI_SYS_VENDOR, "Apple Inc.")) + return; + + /* Card may have been put into PCI_D3hot by grub quirk */ + pmcsr = read_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL); + + if ((pmcsr & PCI_PM_CTRL_STATE_MASK) != PCI_D0) { + pmcsr &= ~PCI_PM_CTRL_STATE_MASK; + write_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL, pmcsr); + mdelay(10); + + pmcsr = read_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL); + if ((pmcsr & PCI_PM_CTRL_STATE_MASK) != PCI_D0) { + dev_err("Cannot power up Apple AirPort card\n"); + return; + } + } + + addr = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0); + addr |= (u64)read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_1) << 32; + addr &= PCI_BASE_ADDRESS_MEM_MASK; + + mmio = early_ioremap(addr, BCM4331_MMIO_SIZE); + if (!mmio) { + dev_err("Cannot iomap Apple AirPort card\n"); + return; + } + + pr_info("Resetting Apple AirPort card (left enabled by EFI)\n"); + + for (i = 0; bcma_aread32(BCMA_RESET_ST) && i < 30; i++) + udelay(10); + + bcma_awrite32(BCMA_RESET_CTL, BCMA_RESET_CTL_RESET); + bcma_aread32(BCMA_RESET_CTL); + udelay(1); + + bcma_awrite32(BCMA_RESET_CTL, 0); + bcma_aread32(BCMA_RESET_CTL); + udelay(10); + + early_iounmap(mmio, BCM4331_MMIO_SIZE); +} #define QFLAG_APPLY_ONCE 0x1 #define QFLAG_APPLIED 0x2 @@ -603,12 +672,6 @@ struct chipset { void (*f)(int num, int slot, int func); }; -/* - * Only works for devices on the root bus. If you add any devices - * not on bus 0 readd another loop level in early_quirks(). 
But - * be careful because at least the Nvidia quirk here relies on - * only matching on bus 0. - */ static struct chipset early_qrk[] __initdata = { { PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID, PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, nvidia_bugs }, @@ -638,9 +701,13 @@ static struct chipset early_qrk[] __initdata = { */ { PCI_VENDOR_ID_INTEL, 0x0f00, PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, + { PCI_VENDOR_ID_BROADCOM, 0x4331, + PCI_CLASS_NETWORK_OTHER, PCI_ANY_ID, 0, apple_airport_reset}, {} }; +static void __init early_pci_scan_bus(int bus); + /** * check_dev_quirk - apply early quirks to a given PCI device * @num: bus number @@ -649,7 +716,7 @@ static struct chipset early_qrk[] __initdata = { * * Check the vendor & device ID against the early quirks table. * - * If the device is single function, let early_quirks() know so we don't + * If the device is single function, let early_pci_scan_bus() know so we don't * poke at this device again. */ static int __init check_dev_quirk(int num, int slot, int func) @@ -658,6 +725,7 @@ static int __init check_dev_quirk(int num, int slot, int func) u16 vendor; u16 device; u8 type; + u8 sec; int i; class = read_pci_config_16(num, slot, func, PCI_CLASS_DEVICE); @@ -685,25 +753,36 @@ static int __init check_dev_quirk(int num, int slot, int func) type = read_pci_config_byte(num, slot, func, PCI_HEADER_TYPE); + + if ((type & 0x7f) == PCI_HEADER_TYPE_BRIDGE) { + sec = read_pci_config_byte(num, slot, func, PCI_SECONDARY_BUS); + if (sec > num) + early_pci_scan_bus(sec); + } + if (!(type & 0x80)) return -1; return 0; } -void __init early_quirks(void) +static void __init early_pci_scan_bus(int bus) { int slot, func; - if (!early_pci_allowed()) - return; - /* Poor man's PCI discovery */ - /* Only scan the root bus */ for (slot = 0; slot < 32; slot++) for (func = 0; func < 8; func++) { /* Only probe function 0 on single fn devices */ - if (check_dev_quirk(0, slot, func)) + if (check_dev_quirk(bus, slot, func)) break; 
} } + +void __init early_quirks(void) +{ + if (!early_pci_allowed()) + return; + + early_pci_scan_bus(0); +} diff --git a/arch/x86/kernel/ebda.c b/arch/x86/kernel/ebda.c index afe65dffee80..4312f8ae71b7 100644 --- a/arch/x86/kernel/ebda.c +++ b/arch/x86/kernel/ebda.c @@ -6,66 +6,92 @@ #include <asm/bios_ebda.h> /* + * This function reserves all conventional PC system BIOS related + * firmware memory areas (some of which are data, some of which + * are code), that must not be used by the kernel as available + * RAM. + * * The BIOS places the EBDA/XBDA at the top of conventional * memory, and usually decreases the reported amount of - * conventional memory (int 0x12) too. This also contains a - * workaround for Dell systems that neglect to reserve EBDA. - * The same workaround also avoids a problem with the AMD768MPX - * chipset: reserve a page before VGA to prevent PCI prefetch - * into it (errata #56). Usually the page is reserved anyways, - * unless you have no PS/2 mouse plugged in. + * conventional memory (int 0x12) too. + * + * This means that as a first approximation on most systems we can + * guess the reserved BIOS area by looking at the low BIOS RAM size + * value and assume that everything above that value (up to 1MB) is + * reserved. + * + * But life in firmware country is not that simple: + * + * - This code also contains a quirk for Dell systems that neglect + * to reserve the EBDA area in the 'RAM size' value ... + * + * - The same quirk also avoids a problem with the AMD768MPX + * chipset: reserve a page before VGA to prevent PCI prefetch + * into it (errata #56). (Usually the page is reserved anyways, + * unless you have no PS/2 mouse plugged in.) + * + * - Plus paravirt systems don't have a reliable value in the + * 'BIOS RAM size' pointer we can rely on, so we must quirk + * them too. 
+ * + * Due to those various problems this function is deliberately + * very conservative and tries to err on the side of reserving + * too much, to not risk reserving too little. + * + * Losing a small amount of memory in the bottom megabyte is + * rarely a problem, as long as we have enough memory to install + * the SMP bootup trampoline which *must* be in this area. * - * This functions is deliberately very conservative. Losing - * memory in the bottom megabyte is rarely a problem, as long - * as we have enough memory to install the trampoline. Using - * memory that is in use by the BIOS or by some DMA device - * the BIOS didn't shut down *is* a big problem. + * Using memory that is in use by the BIOS or by some DMA device + * the BIOS didn't shut down *is* a big problem to the kernel, + * obviously. */ -#define BIOS_LOWMEM_KILOBYTES 0x413 -#define LOWMEM_CAP 0x9f000U /* Absolute maximum */ -#define INSANE_CUTOFF 0x20000U /* Less than this = insane */ +#define BIOS_RAM_SIZE_KB_PTR 0x413 -void __init reserve_ebda_region(void) +#define BIOS_START_MIN 0x20000U /* 128K, less than this is insane */ +#define BIOS_START_MAX 0x9f000U /* 640K, absolute maximum */ + +void __init reserve_bios_regions(void) { - unsigned int lowmem, ebda_addr; + unsigned int bios_start, ebda_start; /* - * To determine the position of the EBDA and the - * end of conventional memory, we need to look at - * the BIOS data area. In a paravirtual environment - * that area is absent. We'll just have to assume - * that the paravirt case can handle memory setup - * correctly, without our help. + * NOTE: In a paravirtual environment the BIOS reserved + * area is absent. We'll just have to assume that the + * paravirt case can handle memory setup correctly, + * without our help. 
*/ - if (!x86_platform.legacy.ebda_search) + if (!x86_platform.legacy.reserve_bios_regions) return; - /* end of low (conventional) memory */ - lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES); - lowmem <<= 10; - - /* start of EBDA area */ - ebda_addr = get_bios_ebda(); - /* - * Note: some old Dells seem to need 4k EBDA without - * reporting so, so just consider the memory above 0x9f000 - * to be off limits (bugzilla 2990). + * BIOS RAM size is encoded in kilobytes, convert it + * to bytes to get a first guess at where the BIOS + * firmware area starts: */ + bios_start = *(unsigned short *)__va(BIOS_RAM_SIZE_KB_PTR); + bios_start <<= 10; - /* If the EBDA address is below 128K, assume it is bogus */ - if (ebda_addr < INSANE_CUTOFF) - ebda_addr = LOWMEM_CAP; + /* + * If bios_start is less than 128K, assume it is bogus + * and bump it up to 640K. Similarly, if bios_start is above 640K, + * don't trust it. + */ + if (bios_start < BIOS_START_MIN || bios_start > BIOS_START_MAX) + bios_start = BIOS_START_MAX; - /* If lowmem is less than 128K, assume it is bogus */ - if (lowmem < INSANE_CUTOFF) - lowmem = LOWMEM_CAP; + /* Get the start address of the EBDA page: */ + ebda_start = get_bios_ebda(); - /* Use the lower of the lowmem and EBDA markers as the cutoff */ - lowmem = min(lowmem, ebda_addr); - lowmem = min(lowmem, LOWMEM_CAP); /* Absolute cap */ + /* + * If the EBDA start address is sane and is below the BIOS region, + * then also reserve everything from the EBDA start address up to + * the BIOS region. 
+ */ + if (ebda_start >= BIOS_START_MIN && ebda_start < bios_start) + bios_start = ebda_start; - /* reserve all memory between lowmem and the 1MB mark */ - memblock_reserve(lowmem, 0x100000 - lowmem); + /* Reserve all memory between bios_start and the 1MB mark: */ + memblock_reserve(bios_start, 0x100000 - bios_start); } diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 4d38416e2a7f..04f89caef9c4 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -57,7 +57,7 @@ # error "Need more than one PGD for the ESPFIX hack" #endif -#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO) +#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) /* This contains the *bottom* address of the espfix stack */ DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 97027545a72d..3fc03a09a93b 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -8,10 +8,14 @@ #include <asm/fpu/internal.h> #include <asm/fpu/regset.h> #include <asm/fpu/signal.h> +#include <asm/fpu/types.h> #include <asm/traps.h> #include <linux/hardirq.h> +#define CREATE_TRACE_POINTS +#include <asm/trace/fpu.h> + /* * Represents the initial FPU state. It's mostly (but not completely) zeroes, * depending on the FPU hardware format: @@ -192,6 +196,7 @@ void fpu__save(struct fpu *fpu) WARN_ON_FPU(fpu != &current->thread.fpu); preempt_disable(); + trace_x86_fpu_before_save(fpu); if (fpu->fpregs_active) { if (!copy_fpregs_to_fpstate(fpu)) { if (use_eager_fpu()) @@ -200,6 +205,7 @@ void fpu__save(struct fpu *fpu) fpregs_deactivate(fpu); } } + trace_x86_fpu_after_save(fpu); preempt_enable(); } EXPORT_SYMBOL_GPL(fpu__save); @@ -222,7 +228,14 @@ void fpstate_init(union fpregs_state *state) return; } - memset(state, 0, xstate_size); + memset(state, 0, fpu_kernel_xstate_size); + + /* + * XRSTORS requires that this bit is set in xcomp_bv, or + * it will #GP. 
Make sure it is replaced after the memset(). + */ + if (static_cpu_has(X86_FEATURE_XSAVES)) + state->xsave.header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT; if (static_cpu_has(X86_FEATURE_FXSR)) fpstate_init_fxstate(&state->fxsave); @@ -247,7 +260,7 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) * leak into the child task: */ if (use_eager_fpu()) - memset(&dst_fpu->state.xsave, 0, xstate_size); + memset(&dst_fpu->state.xsave, 0, fpu_kernel_xstate_size); /* * Save current FPU registers directly into the child @@ -266,7 +279,8 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) */ preempt_disable(); if (!copy_fpregs_to_fpstate(dst_fpu)) { - memcpy(&src_fpu->state, &dst_fpu->state, xstate_size); + memcpy(&src_fpu->state, &dst_fpu->state, + fpu_kernel_xstate_size); if (use_eager_fpu()) copy_kernel_to_fpregs(&src_fpu->state); @@ -275,6 +289,9 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) } preempt_enable(); + trace_x86_fpu_copy_src(src_fpu); + trace_x86_fpu_copy_dst(dst_fpu); + return 0; } @@ -288,7 +305,9 @@ void fpu__activate_curr(struct fpu *fpu) if (!fpu->fpstate_active) { fpstate_init(&fpu->state); + trace_x86_fpu_init_state(fpu); + trace_x86_fpu_activate_state(fpu); /* Safe to do for the current task: */ fpu->fpstate_active = 1; } @@ -314,7 +333,9 @@ void fpu__activate_fpstate_read(struct fpu *fpu) } else { if (!fpu->fpstate_active) { fpstate_init(&fpu->state); + trace_x86_fpu_init_state(fpu); + trace_x86_fpu_activate_state(fpu); /* Safe to do for current and for stopped child tasks: */ fpu->fpstate_active = 1; } @@ -347,7 +368,9 @@ void fpu__activate_fpstate_write(struct fpu *fpu) fpu->last_cpu = -1; } else { fpstate_init(&fpu->state); + trace_x86_fpu_init_state(fpu); + trace_x86_fpu_activate_state(fpu); /* Safe to do for stopped child tasks: */ fpu->fpstate_active = 1; } @@ -432,9 +455,11 @@ void fpu__restore(struct fpu *fpu) /* Avoid __kernel_fpu_begin() right after fpregs_activate() */ kernel_fpu_disable(); + 
trace_x86_fpu_before_restore(fpu); fpregs_activate(fpu); copy_kernel_to_fpregs(&fpu->state); fpu->counter++; + trace_x86_fpu_after_restore(fpu); kernel_fpu_enable(); } EXPORT_SYMBOL_GPL(fpu__restore); @@ -463,6 +488,8 @@ void fpu__drop(struct fpu *fpu) fpu->fpstate_active = 0; + trace_x86_fpu_dropped(fpu); + preempt_enable(); } diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index aacfd7a82cec..93982aebb398 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -145,8 +145,8 @@ static void __init fpu__init_system_generic(void) * This is inherent to the XSAVE architecture which puts all state * components into a single, continuous memory block: */ -unsigned int xstate_size; -EXPORT_SYMBOL_GPL(xstate_size); +unsigned int fpu_kernel_xstate_size; +EXPORT_SYMBOL_GPL(fpu_kernel_xstate_size); /* Get alignment of the TYPE. */ #define TYPE_ALIGN(TYPE) offsetof(struct { char x; TYPE test; }, test) @@ -178,7 +178,7 @@ static void __init fpu__init_task_struct_size(void) * Add back the dynamically-calculated register state * size. */ - task_size += xstate_size; + task_size += fpu_kernel_xstate_size; /* * We dynamically size 'struct fpu', so we require that @@ -195,7 +195,7 @@ static void __init fpu__init_task_struct_size(void) } /* - * Set up the xstate_size based on the legacy FPU context size. + * Set up the user and kernel xstate sizes based on the legacy FPU context size. * * We set this up first, and later it will be overwritten by * fpu__init_system_xstate() if the CPU knows about xstates. @@ -208,7 +208,7 @@ static void __init fpu__init_system_xstate_size_legacy(void) on_boot_cpu = 0; /* - * Note that xstate_size might be overwriten later during + * Note that xstate sizes might be overwritten later during * fpu__init_system_xstate(). 
*/ @@ -219,27 +219,17 @@ static void __init fpu__init_system_xstate_size_legacy(void) */ setup_clear_cpu_cap(X86_FEATURE_XSAVE); setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); - xstate_size = sizeof(struct swregs_state); + fpu_kernel_xstate_size = sizeof(struct swregs_state); } else { if (boot_cpu_has(X86_FEATURE_FXSR)) - xstate_size = sizeof(struct fxregs_state); + fpu_kernel_xstate_size = + sizeof(struct fxregs_state); else - xstate_size = sizeof(struct fregs_state); + fpu_kernel_xstate_size = + sizeof(struct fregs_state); } - /* - * Quirk: we don't yet handle the XSAVES* instructions - * correctly, as we don't correctly convert between - * standard and compacted format when interfacing - * with user-space - so disable it for now. - * - * The difference is small: with recent CPUs the - * compacted format is only marginally smaller than - * the standard FPU state format. - * - * ( This is easy to backport while we are fixing - * XSAVES* support. ) - */ - setup_clear_cpu_cap(X86_FEATURE_XSAVES); + + fpu_user_xstate_size = fpu_kernel_xstate_size; } /* diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 81422dfb152b..c114b132d121 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -4,6 +4,7 @@ #include <asm/fpu/internal.h> #include <asm/fpu/signal.h> #include <asm/fpu/regset.h> +#include <asm/fpu/xstate.h> /* * The xstateregs_active() routine is the same as the regset_fpregs_active() routine, @@ -85,21 +86,26 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, if (!boot_cpu_has(X86_FEATURE_XSAVE)) return -ENODEV; - fpu__activate_fpstate_read(fpu); - xsave = &fpu->state.xsave; - /* - * Copy the 48bytes defined by the software first into the xstate - * memory layout in the thread struct, so that we can copy the entire - * xstateregs to the user using one user_regset_copyout(). 
- */ - memcpy(&xsave->i387.sw_reserved, - xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes)); - /* - * Copy the xstate memory layout. - */ - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); + fpu__activate_fpstate_read(fpu); + + if (using_compacted_format()) { + ret = copyout_from_xsaves(pos, count, kbuf, ubuf, xsave); + } else { + fpstate_sanitize_xstate(fpu); + /* + * Copy the 48 bytes defined by the software into the xsave + * area in the thread struct, so that we can copy the whole + * area to user using one user_regset_copyout(). + */ + memcpy(&xsave->i387.sw_reserved, xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes)); + + /* + * Copy the xstate memory layout. + */ + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); + } return ret; } @@ -114,11 +120,27 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, if (!boot_cpu_has(X86_FEATURE_XSAVE)) return -ENODEV; - fpu__activate_fpstate_write(fpu); + /* + * A whole standard-format XSAVE buffer is needed: + */ + if ((pos != 0) || (count < fpu_user_xstate_size)) + return -EFAULT; xsave = &fpu->state.xsave; - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); + fpu__activate_fpstate_write(fpu); + + if (boot_cpu_has(X86_FEATURE_XSAVES)) + ret = copyin_to_xsaves(kbuf, ubuf, xsave); + else + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); + + /* + * In case of failure, mark all states as init: + */ + if (ret) + fpstate_init(&fpu->state); + /* * mxcsr reserved bits must be masked to zero for security reasons. 
*/ diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 31c6a60505e6..9e231d88bb33 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -8,8 +8,10 @@ #include <asm/fpu/internal.h> #include <asm/fpu/signal.h> #include <asm/fpu/regset.h> +#include <asm/fpu/xstate.h> #include <asm/sigframe.h> +#include <asm/trace/fpu.h> static struct _fpx_sw_bytes fx_sw_reserved, fx_sw_reserved_ia32; @@ -31,7 +33,7 @@ static inline int check_for_xstate(struct fxregs_state __user *buf, /* Check for the first magic field and other error scenarios. */ if (fx_sw->magic1 != FP_XSTATE_MAGIC1 || fx_sw->xstate_size < min_xstate_size || - fx_sw->xstate_size > xstate_size || + fx_sw->xstate_size > fpu_user_xstate_size || fx_sw->xstate_size > fx_sw->extended_size) return -1; @@ -88,7 +90,8 @@ static inline int save_xstate_epilog(void __user *buf, int ia32_frame) if (!use_xsave()) return err; - err |= __put_user(FP_XSTATE_MAGIC2, (__u32 *)(buf + xstate_size)); + err |= __put_user(FP_XSTATE_MAGIC2, + (__u32 *)(buf + fpu_user_xstate_size)); /* * Read the xfeatures which we copied (directly from the cpu or @@ -125,7 +128,7 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) else err = copy_fregs_to_user((struct fregs_state __user *) buf); - if (unlikely(err) && __clear_user(buf, xstate_size)) + if (unlikely(err) && __clear_user(buf, fpu_user_xstate_size)) err = -EFAULT; return err; } @@ -167,7 +170,7 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) sizeof(struct user_i387_ia32_struct), NULL, (struct _fpstate_32 __user *) buf) ? -1 : 1; - if (fpregs_active()) { + if (fpregs_active() || using_compacted_format()) { /* Save the live register state to the user directly. 
*/ if (copy_fpregs_to_sigframe(buf_fx)) return -1; @@ -175,8 +178,19 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) if (ia32_fxstate) copy_fxregs_to_kernel(&tsk->thread.fpu); } else { + /* + * It is a *bug* if kernel uses compacted-format for xsave + * area and we copy it out directly to a signal frame. It + * should have been handled above by saving the registers + * directly. + */ + if (boot_cpu_has(X86_FEATURE_XSAVES)) { + WARN_ONCE(1, "x86/fpu: saving compacted-format xsave area to a signal frame!\n"); + return -1; + } + fpstate_sanitize_xstate(&tsk->thread.fpu); - if (__copy_to_user(buf_fx, xsave, xstate_size)) + if (__copy_to_user(buf_fx, xsave, fpu_user_xstate_size)) return -1; } @@ -250,7 +264,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) int ia32_fxstate = (buf != buf_fx); struct task_struct *tsk = current; struct fpu *fpu = &tsk->thread.fpu; - int state_size = xstate_size; + int state_size = fpu_kernel_xstate_size; u64 xfeatures = 0; int fx_only = 0; @@ -282,6 +296,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) */ state_size = sizeof(struct fxregs_state); fx_only = 1; + trace_x86_fpu_xstate_check_failed(fpu); } else { state_size = fx_sw_user.xstate_size; xfeatures = fx_sw_user.xfeatures; @@ -308,9 +323,17 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) */ fpu__drop(fpu); - if (__copy_from_user(&fpu->state.xsave, buf_fx, state_size) || - __copy_from_user(&env, buf, sizeof(env))) { + if (using_compacted_format()) { + err = copyin_to_xsaves(NULL, buf_fx, + &fpu->state.xsave); + } else { + err = __copy_from_user(&fpu->state.xsave, + buf_fx, state_size); + } + + if (err || __copy_from_user(&env, buf, sizeof(env))) { fpstate_init(&fpu->state); + trace_x86_fpu_init_state(fpu); err = -1; } else { sanitize_restored_xstate(tsk, &env, xfeatures, fx_only); @@ -341,7 +364,8 @@ static int __fpu__restore_sig(void __user *buf, 
void __user *buf_fx, int size) static inline int xstate_sigframe_size(void) { - return use_xsave() ? xstate_size + FP_XSTATE_MAGIC2_SIZE : xstate_size; + return use_xsave() ? fpu_user_xstate_size + FP_XSTATE_MAGIC2_SIZE : + fpu_user_xstate_size; } /* @@ -385,12 +409,12 @@ fpu__alloc_mathframe(unsigned long sp, int ia32_frame, */ void fpu__init_prepare_fx_sw_frame(void) { - int size = xstate_size + FP_XSTATE_MAGIC2_SIZE; + int size = fpu_user_xstate_size + FP_XSTATE_MAGIC2_SIZE; fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1; fx_sw_reserved.extended_size = size; fx_sw_reserved.xfeatures = xfeatures_mask; - fx_sw_reserved.xstate_size = xstate_size; + fx_sw_reserved.xstate_size = fpu_user_xstate_size; if (config_enabled(CONFIG_IA32_EMULATION) || config_enabled(CONFIG_X86_32)) { diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 4ea2a59483c7..680049aa4593 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -11,6 +11,7 @@ #include <asm/fpu/internal.h> #include <asm/fpu/signal.h> #include <asm/fpu/regset.h> +#include <asm/fpu/xstate.h> #include <asm/tlbflush.h> @@ -44,6 +45,13 @@ static unsigned int xstate_sizes[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask)*8]; /* + * The XSAVE area of kernel can be in standard or compacted format; + * it is always in standard format for user mode. This is the user + * mode standard format size used for signal and ptrace frames. + */ +unsigned int fpu_user_xstate_size; + +/* * Clear all of the X86_FEATURE_* bits that are unavailable * when the CPU has no XSAVE support. */ @@ -105,6 +113,27 @@ int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name) } EXPORT_SYMBOL_GPL(cpu_has_xfeatures); +static int xfeature_is_supervisor(int xfeature_nr) +{ + /* + * We currently do not support supervisor states, but if + * we did, we could find out like this. 
+ * + * SDM says: If state component 'i' is a user state component, + * ECX[0] return 0; if state component i is a supervisor + * state component, ECX[0] returns 1. + */ + u32 eax, ebx, ecx, edx; + + cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); + return !!(ecx & 1); +} + +static int xfeature_is_user(int xfeature_nr) +{ + return !xfeature_is_supervisor(xfeature_nr); +} + /* * When executing XSAVEOPT (or other optimized XSAVE instructions), if * a processor implementation detects that an FPU state component is still @@ -171,7 +200,7 @@ void fpstate_sanitize_xstate(struct fpu *fpu) */ while (xfeatures) { if (xfeatures & 0x1) { - int offset = xstate_offsets[feature_bit]; + int offset = xstate_comp_offsets[feature_bit]; int size = xstate_sizes[feature_bit]; memcpy((void *)fx + offset, @@ -192,6 +221,15 @@ void fpu__init_cpu_xstate(void) { if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask) return; + /* + * Make it clear that XSAVES supervisor states are not yet + * implemented should anyone expect it to work by changing + * bits in XFEATURE_MASK_* macros and XCR0. + */ + WARN_ONCE((xfeatures_mask & XFEATURE_MASK_SUPERVISOR), + "x86/fpu: XSAVES supervisor states are not yet implemented.\n"); + + xfeatures_mask &= ~XFEATURE_MASK_SUPERVISOR; cr4_set_bits(X86_CR4_OSXSAVE); xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask); @@ -217,13 +255,29 @@ static void __init setup_xstate_features(void) /* start at the beginnning of the "extended state" */ unsigned int last_good_offset = offsetof(struct xregs_state, extended_state_area); + /* + * The FP xstates and SSE xstates are legacy states. They are always + * in the fixed offsets in the xsave area in either compacted form + * or standard form. 
+ */ + xstate_offsets[0] = 0; + xstate_sizes[0] = offsetof(struct fxregs_state, xmm_space); + xstate_offsets[1] = xstate_sizes[0]; + xstate_sizes[1] = FIELD_SIZEOF(struct fxregs_state, xmm_space); for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { if (!xfeature_enabled(i)) continue; cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); - xstate_offsets[i] = ebx; + + /* + * If an xfeature is supervisor state, the offset + * in EBX is invalid. We leave it to -1. + */ + if (xfeature_is_user(i)) + xstate_offsets[i] = ebx; + xstate_sizes[i] = eax; /* * In our xstate size checks, we assume that the @@ -233,8 +287,6 @@ static void __init setup_xstate_features(void) WARN_ONCE(last_good_offset > xstate_offsets[i], "x86/fpu: misordered xstate at %d\n", last_good_offset); last_good_offset = xstate_offsets[i]; - - printk(KERN_INFO "x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n", i, ebx, i, eax); } } @@ -263,6 +315,33 @@ static void __init print_xstate_features(void) } /* + * This check is important because it is easy to get XSTATE_* + * confused with XSTATE_BIT_*. + */ +#define CHECK_XFEATURE(nr) do { \ + WARN_ON(nr < FIRST_EXTENDED_XFEATURE); \ + WARN_ON(nr >= XFEATURE_MAX); \ +} while (0) + +/* + * We could cache this like xstate_size[], but we only use + * it here, so it would be a waste of space. + */ +static int xfeature_is_aligned(int xfeature_nr) +{ + u32 eax, ebx, ecx, edx; + + CHECK_XFEATURE(xfeature_nr); + cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); + /* + * The value returned by ECX[1] indicates the alignment + * of state component 'i' when the compacted format + * of the extended region of an XSAVE area is used: + */ + return !!(ecx & 2); +} + +/* * This function sets up offsets and sizes of all extended states in * xsave area. This supports both standard format and compacted format * of the xsave aread. 
@@ -299,10 +378,29 @@ static void __init setup_xstate_comp(void) else xstate_comp_sizes[i] = 0; - if (i > FIRST_EXTENDED_XFEATURE) + if (i > FIRST_EXTENDED_XFEATURE) { xstate_comp_offsets[i] = xstate_comp_offsets[i-1] + xstate_comp_sizes[i-1]; + if (xfeature_is_aligned(i)) + xstate_comp_offsets[i] = + ALIGN(xstate_comp_offsets[i], 64); + } + } +} + +/* + * Print out xstate component offsets and sizes + */ +static void __init print_xstate_offset_size(void) +{ + int i; + + for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { + if (!xfeature_enabled(i)) + continue; + pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n", + i, xstate_comp_offsets[i], i, xstate_sizes[i]); } } @@ -322,13 +420,11 @@ static void __init setup_init_fpu_buf(void) setup_xstate_features(); print_xstate_features(); - if (boot_cpu_has(X86_FEATURE_XSAVES)) { + if (boot_cpu_has(X86_FEATURE_XSAVES)) init_fpstate.xsave.header.xcomp_bv = (u64)1 << 63 | xfeatures_mask; - init_fpstate.xsave.header.xfeatures = xfeatures_mask; - } /* - * Init all the features state with header_bv being 0x0 + * Init all the features state with header.xfeatures being 0x0 */ copy_kernel_to_xregs_booting(&init_fpstate.xsave); @@ -339,58 +435,19 @@ static void __init setup_init_fpu_buf(void) copy_xregs_to_kernel_booting(&init_fpstate.xsave); } -static int xfeature_is_supervisor(int xfeature_nr) -{ - /* - * We currently do not support supervisor states, but if - * we did, we could find out like this. - * - * SDM says: If state component i is a user state component, - * ECX[0] return 0; if state component i is a supervisor - * state component, ECX[0] returns 1. - u32 eax, ebx, ecx, edx; - cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx; - return !!(ecx & 1); - */ - return 0; -} -/* -static int xfeature_is_user(int xfeature_nr) -{ - return !xfeature_is_supervisor(xfeature_nr); -} -*/ - -/* - * This check is important because it is easy to get XSTATE_* - * confused with XSTATE_BIT_*. 
- */ -#define CHECK_XFEATURE(nr) do { \ - WARN_ON(nr < FIRST_EXTENDED_XFEATURE); \ - WARN_ON(nr >= XFEATURE_MAX); \ -} while (0) - -/* - * We could cache this like xstate_size[], but we only use - * it here, so it would be a waste of space. - */ -static int xfeature_is_aligned(int xfeature_nr) +static int xfeature_uncompacted_offset(int xfeature_nr) { u32 eax, ebx, ecx, edx; - CHECK_XFEATURE(xfeature_nr); - cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); /* - * The value returned by ECX[1] indicates the alignment - * of state component i when the compacted format - * of the extended region of an XSAVE area is used + * Only XSAVES supports supervisor states and it uses compacted + * format. Checking a supervisor state's uncompacted offset is + * an error. */ - return !!(ecx & 2); -} - -static int xfeature_uncompacted_offset(int xfeature_nr) -{ - u32 eax, ebx, ecx, edx; + if (XFEATURE_MASK_SUPERVISOR & (1 << xfeature_nr)) { + WARN_ONCE(1, "No fixed offset for xstate %d\n", xfeature_nr); + return -1; + } CHECK_XFEATURE(xfeature_nr); cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); @@ -415,7 +472,7 @@ static int xfeature_size(int xfeature_nr) * that it is obvious which aspect of 'XSAVES' is being handled * by the calling code. */ -static int using_compacted_format(void) +int using_compacted_format(void) { return boot_cpu_has(X86_FEATURE_XSAVES); } @@ -530,11 +587,12 @@ static void do_extra_xstate_size_checks(void) */ paranoid_xstate_size += xfeature_size(i); } - XSTATE_WARN_ON(paranoid_xstate_size != xstate_size); + XSTATE_WARN_ON(paranoid_xstate_size != fpu_kernel_xstate_size); } + /* - * Calculate total size of enabled xstates in XCR0/xfeatures_mask. + * Get total size of enabled xstates in XCR0/xfeatures_mask. * * Note the SDM's wording here. "sub-function 0" only enumerates * the size of the *user* states. 
If we use it to size a buffer @@ -544,34 +602,33 @@ static void do_extra_xstate_size_checks(void) * Note that we do not currently set any bits on IA32_XSS so * 'XCR0 | IA32_XSS == XCR0' for now. */ -static unsigned int __init calculate_xstate_size(void) +static unsigned int __init get_xsaves_size(void) { unsigned int eax, ebx, ecx, edx; - unsigned int calculated_xstate_size; + /* + * - CPUID function 0DH, sub-function 1: + * EBX enumerates the size (in bytes) required by + * the XSAVES instruction for an XSAVE area + * containing all the state components + * corresponding to bits currently set in + * XCR0 | IA32_XSS. + */ + cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); + return ebx; +} - if (!boot_cpu_has(X86_FEATURE_XSAVES)) { - /* - * - CPUID function 0DH, sub-function 0: - * EBX enumerates the size (in bytes) required by - * the XSAVE instruction for an XSAVE area - * containing all the *user* state components - * corresponding to bits currently set in XCR0. - */ - cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); - calculated_xstate_size = ebx; - } else { - /* - * - CPUID function 0DH, sub-function 1: - * EBX enumerates the size (in bytes) required by - * the XSAVES instruction for an XSAVE area - * containing all the state components - * corresponding to bits currently set in - * XCR0 | IA32_XSS. - */ - cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); - calculated_xstate_size = ebx; - } - return calculated_xstate_size; +static unsigned int __init get_xsave_size(void) +{ + unsigned int eax, ebx, ecx, edx; + /* + * - CPUID function 0DH, sub-function 0: + * EBX enumerates the size (in bytes) required by + * the XSAVE instruction for an XSAVE area + * containing all the *user* state components + * corresponding to bits currently set in XCR0. 
+ */ + cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); + return ebx; } /* @@ -591,7 +648,15 @@ static bool is_supported_xstate_size(unsigned int test_xstate_size) static int init_xstate_size(void) { /* Recompute the context size for enabled features: */ - unsigned int possible_xstate_size = calculate_xstate_size(); + unsigned int possible_xstate_size; + unsigned int xsave_size; + + xsave_size = get_xsave_size(); + + if (boot_cpu_has(X86_FEATURE_XSAVES)) + possible_xstate_size = get_xsaves_size(); + else + possible_xstate_size = xsave_size; /* Ensure we have the space to store all enabled: */ if (!is_supported_xstate_size(possible_xstate_size)) @@ -601,8 +666,13 @@ static int init_xstate_size(void) * The size is OK, we are definitely going to use xsave, * make it known to the world that we need more space. */ - xstate_size = possible_xstate_size; + fpu_kernel_xstate_size = possible_xstate_size; do_extra_xstate_size_checks(); + + /* + * User space is always in standard format. + */ + fpu_user_xstate_size = xsave_size; return 0; } @@ -644,8 +714,13 @@ void __init fpu__init_system_xstate(void) xfeatures_mask = eax + ((u64)edx << 32); if ((xfeatures_mask & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { + /* + * This indicates that something really unexpected happened + * with the enumeration. Disable XSAVE and try to continue + * booting without it. This is too early to BUG(). 
+ */ pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", xfeatures_mask); - BUG(); + goto out_disable; } xfeatures_mask &= fpu__get_supported_xfeatures_mask(); @@ -653,21 +728,29 @@ void __init fpu__init_system_xstate(void) /* Enable xstate instructions to be able to continue with initialization: */ fpu__init_cpu_xstate(); err = init_xstate_size(); - if (err) { - /* something went wrong, boot without any XSAVE support */ - fpu__init_disable_system_xstate(); - return; - } + if (err) + goto out_disable; + + /* + * Update info used for ptrace frames; use standard-format size and no + * supervisor xstates: + */ + update_regset_xstate_info(fpu_user_xstate_size, xfeatures_mask & ~XFEATURE_MASK_SUPERVISOR); - update_regset_xstate_info(xstate_size, xfeatures_mask); fpu__init_prepare_fx_sw_frame(); setup_init_fpu_buf(); setup_xstate_comp(); + print_xstate_offset_size(); pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", xfeatures_mask, - xstate_size, + fpu_kernel_xstate_size, boot_cpu_has(X86_FEATURE_XSAVES) ? "compacted" : "standard"); + return; + +out_disable: + /* something went wrong, try to boot without any XSAVE support */ + fpu__init_disable_system_xstate(); } /* @@ -693,6 +776,11 @@ void *__raw_xsave_addr(struct xregs_state *xsave, int xstate_feature_mask) { int feature_nr = fls64(xstate_feature_mask) - 1; + if (!xfeature_enabled(feature_nr)) { + WARN_ON_FPU(1); + return NULL; + } + return (void *)xsave + xstate_comp_offsets[feature_nr]; } /* @@ -887,16 +975,16 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, if (!boot_cpu_has(X86_FEATURE_OSPKE)) return -EINVAL; - /* Set the bits we need in PKRU */ + /* Set the bits we need in PKRU: */ if (init_val & PKEY_DISABLE_ACCESS) new_pkru_bits |= PKRU_AD_BIT; if (init_val & PKEY_DISABLE_WRITE) new_pkru_bits |= PKRU_WD_BIT; - /* Shift the bits in to the correct place in PKRU for pkey. 
*/ + /* Shift the bits in to the correct place in PKRU for pkey: */ new_pkru_bits <<= pkey_shift; - /* Locate old copy of the state in the xsave buffer */ + /* Locate old copy of the state in the xsave buffer: */ old_pkru_state = get_xsave_addr(xsave, XFEATURE_MASK_PKRU); /* @@ -909,9 +997,10 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, else new_pkru_state.pkru = old_pkru_state->pkru; - /* mask off any old bits in place */ + /* Mask off any old bits in place: */ new_pkru_state.pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift); - /* Set the newly-requested bits */ + + /* Set the newly-requested bits: */ new_pkru_state.pkru |= new_pkru_bits; /* @@ -925,8 +1014,168 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, */ new_pkru_state.pad = 0; - fpu__xfeature_set_state(XFEATURE_MASK_PKRU, &new_pkru_state, - sizeof(new_pkru_state)); + fpu__xfeature_set_state(XFEATURE_MASK_PKRU, &new_pkru_state, sizeof(new_pkru_state)); + + return 0; +} + +/* + * This is similar to user_regset_copyout(), but will not add offset to + * the source data pointer or increment pos, count, kbuf, and ubuf. + */ +static inline int xstate_copyout(unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf, + const void *data, const int start_pos, + const int end_pos) +{ + if ((count == 0) || (pos < start_pos)) + return 0; + + if (end_pos < 0 || pos < end_pos) { + unsigned int copy = (end_pos < 0 ? count : min(count, end_pos - pos)); + + if (kbuf) { + memcpy(kbuf + pos, data, copy); + } else { + if (__copy_to_user(ubuf + pos, data, copy)) + return -EFAULT; + } + } + return 0; +} + +/* + * Convert from kernel XSAVES compacted format to standard format and copy + * to a ptrace buffer. It supports partial copy but pos always starts from + * zero. This is called from xstateregs_get() and there we check the CPU + * has XSAVES. 
+ */ +int copyout_from_xsaves(unsigned int pos, unsigned int count, void *kbuf, + void __user *ubuf, struct xregs_state *xsave) +{ + unsigned int offset, size; + int ret, i; + struct xstate_header header; + + /* + * Currently copy_regset_to_user() starts from pos 0: + */ + if (unlikely(pos != 0)) + return -EFAULT; + + /* + * The destination is a ptrace buffer; we put in only user xstates: + */ + memset(&header, 0, sizeof(header)); + header.xfeatures = xsave->header.xfeatures; + header.xfeatures &= ~XFEATURE_MASK_SUPERVISOR; + + /* + * Copy xregs_state->header: + */ + offset = offsetof(struct xregs_state, header); + size = sizeof(header); + + ret = xstate_copyout(offset, size, kbuf, ubuf, &header, 0, count); + + if (ret) + return ret; + + for (i = 0; i < XFEATURE_MAX; i++) { + /* + * Copy only in-use xstates: + */ + if ((header.xfeatures >> i) & 1) { + void *src = __raw_xsave_addr(xsave, 1 << i); + + offset = xstate_offsets[i]; + size = xstate_sizes[i]; + + ret = xstate_copyout(offset, size, kbuf, ubuf, src, 0, count); + + if (ret) + return ret; + + if (offset + size >= count) + break; + } + + } + + /* + * Fill xsave->i387.sw_reserved value for ptrace frame: + */ + offset = offsetof(struct fxregs_state, sw_reserved); + size = sizeof(xstate_fx_sw_bytes); + + ret = xstate_copyout(offset, size, kbuf, ubuf, xstate_fx_sw_bytes, 0, count); + + if (ret) + return ret; + + return 0; +} + +/* + * Convert from a ptrace standard-format buffer to kernel XSAVES format + * and copy to the target thread. This is called from xstateregs_set() and + * there we check the CPU has XSAVES and a whole standard-sized buffer + * exists. 
+ */ +int copyin_to_xsaves(const void *kbuf, const void __user *ubuf, + struct xregs_state *xsave) +{ + unsigned int offset, size; + int i; + u64 xfeatures; + u64 allowed_features; + + offset = offsetof(struct xregs_state, header); + size = sizeof(xfeatures); + + if (kbuf) { + memcpy(&xfeatures, kbuf + offset, size); + } else { + if (__copy_from_user(&xfeatures, ubuf + offset, size)) + return -EFAULT; + } + + /* + * Reject if the user sets any disabled or supervisor features: + */ + allowed_features = xfeatures_mask & ~XFEATURE_MASK_SUPERVISOR; + + if (xfeatures & ~allowed_features) + return -EINVAL; + + for (i = 0; i < XFEATURE_MAX; i++) { + u64 mask = ((u64)1 << i); + + if (xfeatures & mask) { + void *dst = __raw_xsave_addr(xsave, 1 << i); + + offset = xstate_offsets[i]; + size = xstate_sizes[i]; + + if (kbuf) { + memcpy(dst, kbuf + offset, size); + } else { + if (__copy_from_user(dst, ubuf + offset, size)) + return -EFAULT; + } + } + } + + /* + * The state that came in from userspace was user-state only. 
+ * Mask all the user states out of 'xfeatures': + */ + xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR; + + /* + * Add back in the features that came in from userspace: + */ + xsave->header.xfeatures |= xfeatures; return 0; } diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index d784bb547a9d..2dda0bc4576e 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -26,7 +26,7 @@ static void __init i386_default_early_setup(void) x86_init.resources.reserve_resources = i386_reserve_resources; x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; - reserve_ebda_region(); + reserve_bios_regions(); } asmlinkage __visible void __init i386_start_kernel(void) diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index b72fb0b71dd1..99d48e7d2974 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -183,7 +183,7 @@ void __init x86_64_start_reservations(char *real_mode_data) copy_bootdata(__va(real_mode_data)); x86_early_init_platform_quirks(); - reserve_ebda_region(); + reserve_bios_regions(); switch (boot_params.hdr.hardware_subarch) { case X86_SUBARCH_INTEL_MID: diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 5df831ef1442..9f8efc9f0075 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -38,7 +38,7 @@ #define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) -L4_PAGE_OFFSET = pgd_index(__PAGE_OFFSET) +L4_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE) L4_START_KERNEL = pgd_index(__START_KERNEL_map) L3_START_KERNEL = pud_index(__START_KERNEL_map) @@ -299,6 +299,7 @@ ENTRY(secondary_startup_64) pushq $__KERNEL_CS # set correct cs pushq %rax # target address in negative space lretq +ENDPROC(secondary_startup_64) #include "verify_cpu.S" diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c index 64341aa485ae..d40ee8a38fed 100644 --- a/arch/x86/kernel/i386_ksyms_32.c +++ b/arch/x86/kernel/i386_ksyms_32.c @@ -42,3 +42,5 @@ 
EXPORT_SYMBOL(empty_zero_page); EXPORT_SYMBOL(___preempt_schedule); EXPORT_SYMBOL(___preempt_schedule_notrace); #endif + +EXPORT_SYMBOL(__sw_hweight32); diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 38da8f29a9c8..c627bf8d98ad 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -130,11 +130,9 @@ void irq_ctx_init(int cpu) void do_softirq_own_stack(void) { - struct thread_info *curstk; struct irq_stack *irqstk; u32 *isp, *prev_esp; - curstk = current_stack(); irqstk = __this_cpu_read(softirq_stack); /* build the stack frame on the softirq stack */ diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index 2af478e3fd4e..f2356bda2b05 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -19,8 +19,7 @@ #include <linux/kernel.h> #include <linux/mm.h> #include <linux/efi.h> -#include <linux/verify_pefile.h> -#include <keys/system_keyring.h> +#include <linux/verification.h> #include <asm/bootparam.h> #include <asm/setup.h> @@ -529,18 +528,9 @@ static int bzImage64_cleanup(void *loader_data) #ifdef CONFIG_KEXEC_BZIMAGE_VERIFY_SIG static int bzImage64_verify_sig(const char *kernel, unsigned long kernel_len) { - bool trusted; - int ret; - - ret = verify_pefile_signature(kernel, kernel_len, - system_trusted_keyring, - VERIFYING_KEXEC_PE_SIGNATURE, - &trusted); - if (ret < 0) - return ret; - if (!trusted) - return -EKEYREJECTED; - return 0; + return verify_pefile_signature(kernel, kernel_len, + NULL, + VERIFYING_KEXEC_PE_SIGNATURE); } #endif diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 38cf7a741250..7847e5c0e0b5 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -961,7 +961,19 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr) * normal page fault. 
*/ regs->ip = (unsigned long)cur->addr; + /* + * Trap flag (TF) has been set here because this fault + * happened where the single stepping will be done. + * So clear it by resetting the current kprobe: + */ + regs->flags &= ~X86_EFLAGS_TF; + + /* + * If the TF flag was set before the kprobe hit, + * don't touch it: + */ regs->flags |= kcb->kprobe_old_flags; + if (kcb->kprobe_status == KPROBE_REENTER) restore_previous_kprobe(kcb); else diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index eea2a6f72b31..1ef5e48b3a36 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -301,8 +301,6 @@ static void kvm_register_steal_time(void) if (!has_steal_clock) return; - memset(st, 0, sizeof(*st)); - wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED)); pr_info("kvm-stealtime: cpu %d, msr %llx\n", cpu, (unsigned long long) slow_virt_to_phys(st)); diff --git a/arch/x86/kernel/livepatch.c b/arch/x86/kernel/livepatch.c deleted file mode 100644 index 92fc1a51f994..000000000000 --- a/arch/x86/kernel/livepatch.c +++ /dev/null @@ -1,70 +0,0 @@ -/* - * livepatch.c - x86-specific Kernel Live Patching Core - * - * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com> - * Copyright (C) 2014 SUSE - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see <http://www.gnu.org/licenses/>. 
- */ - -#include <linux/module.h> -#include <linux/uaccess.h> -#include <asm/elf.h> -#include <asm/livepatch.h> - -/** - * klp_write_module_reloc() - write a relocation in a module - * @mod: module in which the section to be modified is found - * @type: ELF relocation type (see asm/elf.h) - * @loc: address that the relocation should be written to - * @value: relocation value (sym address + addend) - * - * This function writes a relocation to the specified location for - * a particular module. - */ -int klp_write_module_reloc(struct module *mod, unsigned long type, - unsigned long loc, unsigned long value) -{ - size_t size = 4; - unsigned long val; - unsigned long core = (unsigned long)mod->core_layout.base; - unsigned long core_size = mod->core_layout.size; - - switch (type) { - case R_X86_64_NONE: - return 0; - case R_X86_64_64: - val = value; - size = 8; - break; - case R_X86_64_32: - val = (u32)value; - break; - case R_X86_64_32S: - val = (s32)value; - break; - case R_X86_64_PC32: - val = (u32)(value - loc); - break; - default: - /* unsupported relocation type */ - return -EINVAL; - } - - if (loc < core || loc >= core + core_size) - /* loc does not point to any symbol inside the module */ - return -EINVAL; - - return probe_kernel_write((void *)loc, &val, size); -} diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index ba7fbba9831b..5a294e48b185 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -538,3 +538,48 @@ overflow: return -ENOEXEC; } #endif /* CONFIG_KEXEC_FILE */ + +static int +kexec_mark_range(unsigned long start, unsigned long end, bool protect) +{ + struct page *page; + unsigned int nr_pages; + + /* + * For physical range: [start, end]. We must skip the unassigned + * crashk resource with zero-valued "end" member. 
+ */ + if (!end || start > end) + return 0; + + page = pfn_to_page(start >> PAGE_SHIFT); + nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; + if (protect) + return set_pages_ro(page, nr_pages); + else + return set_pages_rw(page, nr_pages); +} + +static void kexec_mark_crashkres(bool protect) +{ + unsigned long control; + + kexec_mark_range(crashk_low_res.start, crashk_low_res.end, protect); + + /* Don't touch the control code page used in crash_kexec().*/ + control = PFN_PHYS(page_to_pfn(kexec_crash_image->control_code_page)); + /* Control code page is located in the 2nd page. */ + kexec_mark_range(crashk_res.start, control + PAGE_SIZE - 1, protect); + control += KEXEC_CONTROL_PAGE_SIZE; + kexec_mark_range(control, crashk_res.end, protect); +} + +void arch_kexec_protect_crashkres(void) +{ + kexec_mark_crashkres(true); +} + +void arch_kexec_unprotect_crashkres(void) +{ + kexec_mark_crashkres(false); +} diff --git a/arch/x86/kernel/mcount_64.S b/arch/x86/kernel/mcount_64.S index ed48a9f465f8..61924222a9e1 100644 --- a/arch/x86/kernel/mcount_64.S +++ b/arch/x86/kernel/mcount_64.S @@ -182,7 +182,8 @@ GLOBAL(ftrace_graph_call) jmp ftrace_stub #endif -GLOBAL(ftrace_stub) +/* This is weak to keep gas from relaxing the jumps */ +WEAK(ftrace_stub) retq END(ftrace_caller) diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c index b2f8a33b36ff..24a50301f150 100644 --- a/arch/x86/kernel/platform-quirks.c +++ b/arch/x86/kernel/platform-quirks.c @@ -7,12 +7,12 @@ void __init x86_early_init_platform_quirks(void) { x86_platform.legacy.rtc = 1; - x86_platform.legacy.ebda_search = 0; + x86_platform.legacy.reserve_bios_regions = 0; x86_platform.legacy.devices.pnpbios = 1; switch (boot_params.hdr.hardware_subarch) { case X86_SUBARCH_PC: - x86_platform.legacy.ebda_search = 1; + x86_platform.legacy.reserve_bios_regions = 1; break; case X86_SUBARCH_XEN: case X86_SUBARCH_LGUEST: diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c 
index 2915d54e9dd5..96becbbb52e0 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -97,10 +97,9 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) /* * Free current thread data structures etc.. */ -void exit_thread(void) +void exit_thread(struct task_struct *tsk) { - struct task_struct *me = current; - struct thread_struct *t = &me->thread; + struct thread_struct *t = &tsk->thread; unsigned long *bp = t->io_bitmap_ptr; struct fpu *fpu = &t->fpu; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 6b16c36f0939..6e789ca1f841 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -532,7 +532,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) switch (code) { case ARCH_SET_GS: - if (addr >= TASK_SIZE_OF(task)) + if (addr >= TASK_SIZE_MAX) return -EPERM; cpu = get_cpu(); task->thread.gsindex = 0; @@ -546,7 +546,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) case ARCH_SET_FS: /* Not strictly needed for fs, but do it for symmetry with gs */ - if (addr >= TASK_SIZE_OF(task)) + if (addr >= TASK_SIZE_MAX) return -EPERM; cpu = get_cpu(); task->thread.fsindex = 0; diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index e60ef918f53d..600edd225e81 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -392,7 +392,7 @@ static int putreg(struct task_struct *child, #ifdef CONFIG_X86_64 case offsetof(struct user_regs_struct,fs_base): - if (value >= TASK_SIZE_OF(child)) + if (value >= TASK_SIZE_MAX) return -EIO; /* * When changing the segment base, use do_arch_prctl @@ -406,7 +406,7 @@ static int putreg(struct task_struct *child, /* * Exactly the same here as the %fs handling above. 
*/ - if (value >= TASK_SIZE_OF(child)) + if (value >= TASK_SIZE_MAX) return -EIO; if (child->thread.gsbase != value) return do_arch_prctl(child, ARCH_SET_GS, value); diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 99bfc025111d..06c58ce46762 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -61,11 +61,16 @@ void pvclock_resume(void) u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src) { unsigned version; - cycle_t ret; u8 flags; do { - version = __pvclock_read_cycles(src, &ret, &flags); + version = src->version; + /* Make the latest version visible */ + smp_rmb(); + + flags = src->flags; + /* Make sure that the version double-check is last. */ + smp_rmb(); } while ((src->version & 1) || version != src->version); return flags & valid_flags; @@ -80,6 +85,8 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) do { version = __pvclock_read_cycles(src, &ret, &flags); + /* Make sure that the version double-check is last. */ + smp_rmb(); } while ((src->version & 1) || version != src->version); if (unlikely((flags & PVCLOCK_GUEST_STOPPED) != 0)) { diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index a9b31eb815f2..15ed70f8278b 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -55,6 +55,19 @@ bool port_cf9_safe = false; */ /* + * Some machines require the "reboot=a" commandline options + */ +static int __init set_acpi_reboot(const struct dmi_system_id *d) +{ + if (reboot_type != BOOT_ACPI) { + reboot_type = BOOT_ACPI; + pr_info("%s series board detected. Selecting %s-method for reboots.\n", + d->ident, "ACPI"); + } + return 0; +} + +/* * Some machines require the "reboot=b" or "reboot=k" commandline options, * this quirk makes that automatic. 
*/ @@ -395,6 +408,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "Dell XPS710"), }, }, + { /* Handle problems with rebooting on Dell Optiplex 7450 AIO */ + .callback = set_acpi_reboot, + .ident = "Dell OptiPlex 7450 AIO", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 7450 AIO"), + }, + }, /* Hewlett-Packard */ { /* Handle problems with rebooting on HP laptops */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index c4e7b3991b60..6cb2b02fcc87 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -113,6 +113,7 @@ #include <asm/prom.h> #include <asm/microcode.h> #include <asm/mmu_context.h> +#include <asm/kaslr.h> /* * max_low_pfn_mapped: highest direct mapped pfn under 4GB @@ -399,10 +400,6 @@ static void __init reserve_initrd(void) memblock_free(ramdisk_image, ramdisk_end - ramdisk_image); } -static void __init early_initrd_acpi_init(void) -{ - early_acpi_table_init((void *)initrd_start, initrd_end - initrd_start); -} #else static void __init early_reserve_initrd(void) { @@ -410,9 +407,6 @@ static void __init early_reserve_initrd(void) static void __init reserve_initrd(void) { } -static void __init early_initrd_acpi_init(void) -{ -} #endif /* CONFIG_BLK_DEV_INITRD */ static void __init parse_setup_data(void) @@ -942,6 +936,8 @@ void __init setup_arch(char **cmdline_p) x86_init.oem.arch_setup(); + kernel_randomize_memory(); + iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1; setup_memory_map(); parse_setup_data(); @@ -1146,7 +1142,7 @@ void __init setup_arch(char **cmdline_p) reserve_initrd(); - early_initrd_acpi_init(); + acpi_table_upgrade(); vsmp_init(); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index e4fcb87ba7a6..7a40e068302d 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -236,6 +236,8 @@ void __init setup_per_cpu_areas(void) 
early_per_cpu_map(x86_cpu_to_apicid, cpu); per_cpu(x86_bios_cpu_apicid, cpu) = early_per_cpu_map(x86_bios_cpu_apicid, cpu); + per_cpu(x86_cpu_to_acpiid, cpu) = + early_per_cpu_map(x86_cpu_to_acpiid, cpu); #endif #ifdef CONFIG_X86_32 per_cpu(x86_cpu_to_logical_apicid, cpu) = @@ -271,6 +273,7 @@ void __init setup_per_cpu_areas(void) #ifdef CONFIG_X86_LOCAL_APIC early_per_cpu_ptr(x86_cpu_to_apicid) = NULL; early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL; + early_per_cpu_ptr(x86_cpu_to_acpiid) = NULL; #endif #ifdef CONFIG_X86_32 early_per_cpu_ptr(x86_cpu_to_logical_apicid) = NULL; diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index dc3c0b1c816f..b44564bf86a8 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -1,11 +1,104 @@ #include <linux/compat.h> #include <linux/uaccess.h> +/* + * The compat_siginfo_t structure and handling code is very easy + * to break in several ways. It must always be updated when new + * updates are made to the main siginfo_t, and + * copy_siginfo_to_user32() must be updated when the + * (arch-independent) copy_siginfo_to_user() is updated. + * + * It is also easy to put a new member in the compat_siginfo_t + * which has implicit alignment which can move internal structure + * alignment around breaking the ABI. This can happen if you, + * for instance, put a plain 64-bit value in there. + */ +static inline void signal_compat_build_tests(void) +{ + int _sifields_offset = offsetof(compat_siginfo_t, _sifields); + + /* + * If adding a new si_code, there is probably new data in + * the siginfo. Make sure folks bumping the si_code + * limits also have to look at this code. Make sure any + * new fields are handled in copy_siginfo_to_user32()! 
+ */ + BUILD_BUG_ON(NSIGILL != 8); + BUILD_BUG_ON(NSIGFPE != 8); + BUILD_BUG_ON(NSIGSEGV != 4); + BUILD_BUG_ON(NSIGBUS != 5); + BUILD_BUG_ON(NSIGTRAP != 4); + BUILD_BUG_ON(NSIGCHLD != 6); + BUILD_BUG_ON(NSIGSYS != 1); + + /* This is part of the ABI and can never change in size: */ + BUILD_BUG_ON(sizeof(compat_siginfo_t) != 128); + /* + * The offsets of all the (unioned) si_fields are fixed + * in the ABI, of course. Make sure none of them ever + * move and are always at the beginning: + */ + BUILD_BUG_ON(offsetof(compat_siginfo_t, _sifields) != 3 * sizeof(int)); +#define CHECK_CSI_OFFSET(name) BUILD_BUG_ON(_sifields_offset != offsetof(compat_siginfo_t, _sifields.name)) + + /* + * Ensure that the size of each si_field never changes. + * If it does, it is a sign that the + * copy_siginfo_to_user32() code below needs to be updated + * along with the size in the CHECK_SI_SIZE(). + * + * We repeat this check for both the generic and compat + * siginfos. + * + * Note: it is OK for these to grow as long as the whole + * structure stays within the padding size (checked + * above). 
+ */ +#define CHECK_CSI_SIZE(name, size) BUILD_BUG_ON(size != sizeof(((compat_siginfo_t *)0)->_sifields.name)) +#define CHECK_SI_SIZE(name, size) BUILD_BUG_ON(size != sizeof(((siginfo_t *)0)->_sifields.name)) + + CHECK_CSI_OFFSET(_kill); + CHECK_CSI_SIZE (_kill, 2*sizeof(int)); + CHECK_SI_SIZE (_kill, 2*sizeof(int)); + + CHECK_CSI_OFFSET(_timer); + CHECK_CSI_SIZE (_timer, 5*sizeof(int)); + CHECK_SI_SIZE (_timer, 6*sizeof(int)); + + CHECK_CSI_OFFSET(_rt); + CHECK_CSI_SIZE (_rt, 3*sizeof(int)); + CHECK_SI_SIZE (_rt, 4*sizeof(int)); + + CHECK_CSI_OFFSET(_sigchld); + CHECK_CSI_SIZE (_sigchld, 5*sizeof(int)); + CHECK_SI_SIZE (_sigchld, 8*sizeof(int)); + + CHECK_CSI_OFFSET(_sigchld_x32); + CHECK_CSI_SIZE (_sigchld_x32, 7*sizeof(int)); + /* no _sigchld_x32 in the generic siginfo_t */ + + CHECK_CSI_OFFSET(_sigfault); + CHECK_CSI_SIZE (_sigfault, 4*sizeof(int)); + CHECK_SI_SIZE (_sigfault, 8*sizeof(int)); + + CHECK_CSI_OFFSET(_sigpoll); + CHECK_CSI_SIZE (_sigpoll, 2*sizeof(int)); + CHECK_SI_SIZE (_sigpoll, 4*sizeof(int)); + + CHECK_CSI_OFFSET(_sigsys); + CHECK_CSI_SIZE (_sigsys, 3*sizeof(int)); + CHECK_SI_SIZE (_sigsys, 4*sizeof(int)); + + /* any new si_fields should be added here */ +} + int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) { int err = 0; bool ia32 = test_thread_flag(TIF_IA32); + signal_compat_build_tests(); + if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t))) return -EFAULT; @@ -32,6 +125,21 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) &to->_sifields._pad[0]); switch (from->si_code >> 16) { case __SI_FAULT >> 16: + if (from->si_signo == SIGBUS && + (from->si_code == BUS_MCEERR_AR || + from->si_code == BUS_MCEERR_AO)) + put_user_ex(from->si_addr_lsb, &to->si_addr_lsb); + + if (from->si_signo == SIGSEGV) { + if (from->si_code == SEGV_BNDERR) { + compat_uptr_t lower = (unsigned long)&to->si_lower; + compat_uptr_t upper = (unsigned long)&to->si_upper; + put_user_ex(lower, &to->si_lower); + 
put_user_ex(upper, &to->si_upper); + } + if (from->si_code == SEGV_PKUERR) + put_user_ex(from->si_pkey, &to->si_pkey); + } break; case __SI_SYS >> 16: put_user_ex(from->si_syscall, &to->si_syscall); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index fafe8b923cac..c93609c97406 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -105,6 +105,9 @@ static unsigned int max_physical_pkg_id __read_mostly; unsigned int __max_logical_packages __read_mostly; EXPORT_SYMBOL(__max_logical_packages); +/* Maximum number of SMT threads on any online core */ +int __max_smt_threads __read_mostly; + static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) { unsigned long flags; @@ -493,7 +496,7 @@ void set_cpu_sibling_map(int cpu) bool has_mp = has_smt || boot_cpu_data.x86_max_cores > 1; struct cpuinfo_x86 *c = &cpu_data(cpu); struct cpuinfo_x86 *o; - int i; + int i, threads; cpumask_set_cpu(cpu, cpu_sibling_setup_mask); @@ -550,6 +553,10 @@ void set_cpu_sibling_map(int cpu) if (match_die(c, o) && !topology_same_node(c, o)) primarily_use_numa_for_topology(); } + + threads = cpumask_weight(topology_sibling_cpumask(cpu)); + if (threads > __max_smt_threads) + __max_smt_threads = threads; } /* maps the cpu to the sched domain representing multi-core */ @@ -1285,7 +1292,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) cpumask_copy(cpu_callin_mask, cpumask_of(0)); mb(); - current_thread_info()->cpu = 0; /* needed? 
*/ for_each_possible_cpu(i) { zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); @@ -1441,6 +1447,21 @@ __init void prefill_possible_map(void) #ifdef CONFIG_HOTPLUG_CPU +/* Recompute SMT state for all CPUs on offline */ +static void recompute_smt_state(void) +{ + int max_threads, cpu; + + max_threads = 0; + for_each_online_cpu (cpu) { + int threads = cpumask_weight(topology_sibling_cpumask(cpu)); + + if (threads > max_threads) + max_threads = threads; + } + __max_smt_threads = max_threads; +} + static void remove_siblinginfo(int cpu) { int sibling; @@ -1465,6 +1486,7 @@ static void remove_siblinginfo(int cpu) c->phys_proc_id = 0; c->cpu_core_id = 0; cpumask_clear_cpu(cpu, cpu_sibling_setup_mask); + recompute_smt_state(); } static void remove_cpu_from_maps(int cpu) @@ -1622,7 +1644,7 @@ static inline void mwait_play_dead(void) } } -static inline void hlt_play_dead(void) +void hlt_play_dead(void) { if (__this_cpu_read(cpu_info.x86) >= 4) wbinvd(); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index d1590486204a..00f03d82e69a 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -96,6 +96,12 @@ static inline void cond_local_irq_disable(struct pt_regs *regs) local_irq_disable(); } +/* + * In IST context, we explicitly disable preemption. This serves two + * purposes: it makes it much less likely that we would accidentally + * schedule in IST context and it will force a warning if we somehow + * manage to schedule by accident. + */ void ist_enter(struct pt_regs *regs) { if (user_mode(regs)) { @@ -110,13 +116,7 @@ void ist_enter(struct pt_regs *regs) rcu_nmi_enter(); } - /* - * We are atomic because we're on the IST stack; or we're on - * x86_32, in which case we still shouldn't schedule; or we're - * on x86_64 and entered from user mode, in which case we're - * still atomic unless ist_begin_non_atomic is called. 
- */ - preempt_count_add(HARDIRQ_OFFSET); + preempt_disable(); /* This code is a bit fragile. Test it. */ RCU_LOCKDEP_WARN(!rcu_is_watching(), "ist_enter didn't work"); @@ -124,7 +124,7 @@ void ist_enter(struct pt_regs *regs) void ist_exit(struct pt_regs *regs) { - preempt_count_sub(HARDIRQ_OFFSET); + preempt_enable_no_resched(); if (!user_mode(regs)) rcu_nmi_exit(); @@ -155,7 +155,7 @@ void ist_begin_non_atomic(struct pt_regs *regs) BUG_ON((unsigned long)(current_top_of_stack() - current_stack_pointer()) >= THREAD_SIZE); - preempt_count_sub(HARDIRQ_OFFSET); + preempt_enable_no_resched(); } /** @@ -165,7 +165,7 @@ void ist_begin_non_atomic(struct pt_regs *regs) */ void ist_end_non_atomic(void) { - preempt_count_add(HARDIRQ_OFFSET); + preempt_disable(); } static nokprobe_inline int diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 38ba6de56ede..a804b5ab32d0 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -239,7 +239,7 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc) return ns; } -static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) +static void set_cyc2ns_scale(unsigned long khz, int cpu) { unsigned long long tsc_now, ns_now; struct cyc2ns_data *data; @@ -248,7 +248,7 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) local_irq_save(flags); sched_clock_idle_sleep_event(); - if (!cpu_khz) + if (!khz) goto done; data = cyc2ns_write_begin(cpu); @@ -261,7 +261,7 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) * time function is continuous; see the comment near struct * cyc2ns_data. 
*/ - clocks_calc_mult_shift(&data->cyc2ns_mul, &data->cyc2ns_shift, cpu_khz, + clocks_calc_mult_shift(&data->cyc2ns_mul, &data->cyc2ns_shift, khz, NSEC_PER_MSEC, 0); /* @@ -335,12 +335,6 @@ int check_tsc_unstable(void) } EXPORT_SYMBOL_GPL(check_tsc_unstable); -int check_tsc_disabled(void) -{ - return tsc_disabled; -} -EXPORT_SYMBOL_GPL(check_tsc_disabled); - #ifdef CONFIG_X86_TSC int __init notsc_setup(char *str) { @@ -665,19 +659,77 @@ success: } /** - * native_calibrate_tsc - calibrate the tsc on boot + * native_calibrate_tsc + * Determine TSC frequency via CPUID, else return 0. */ unsigned long native_calibrate_tsc(void) { + unsigned int eax_denominator, ebx_numerator, ecx_hz, edx; + unsigned int crystal_khz; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return 0; + + if (boot_cpu_data.cpuid_level < 0x15) + return 0; + + eax_denominator = ebx_numerator = ecx_hz = edx = 0; + + /* CPUID 15H TSC/Crystal ratio, plus optionally Crystal Hz */ + cpuid(0x15, &eax_denominator, &ebx_numerator, &ecx_hz, &edx); + + if (ebx_numerator == 0 || eax_denominator == 0) + return 0; + + crystal_khz = ecx_hz / 1000; + + if (crystal_khz == 0) { + switch (boot_cpu_data.x86_model) { + case 0x4E: /* SKL */ + case 0x5E: /* SKL */ + crystal_khz = 24000; /* 24.0 MHz */ + break; + case 0x5C: /* BXT */ + crystal_khz = 19200; /* 19.2 MHz */ + break; + } + } + + return crystal_khz * ebx_numerator / eax_denominator; +} + +static unsigned long cpu_khz_from_cpuid(void) +{ + unsigned int eax_base_mhz, ebx_max_mhz, ecx_bus_mhz, edx; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return 0; + + if (boot_cpu_data.cpuid_level < 0x16) + return 0; + + eax_base_mhz = ebx_max_mhz = ecx_bus_mhz = edx = 0; + + cpuid(0x16, &eax_base_mhz, &ebx_max_mhz, &ecx_bus_mhz, &edx); + + return eax_base_mhz * 1000; +} + +/** + * native_calibrate_cpu - calibrate the cpu on boot + */ +unsigned long native_calibrate_cpu(void) +{ u64 tsc1, tsc2, delta, ref1, ref2; unsigned long tsc_pit_min = ULONG_MAX, 
tsc_ref_min = ULONG_MAX; unsigned long flags, latch, ms, fast_calibrate; int hpet = is_hpet_enabled(), i, loopmin; - /* Calibrate TSC using MSR for Intel Atom SoCs */ - local_irq_save(flags); - fast_calibrate = try_msr_calibrate_tsc(); - local_irq_restore(flags); + fast_calibrate = cpu_khz_from_cpuid(); + if (fast_calibrate) + return fast_calibrate; + + fast_calibrate = cpu_khz_from_msr(); if (fast_calibrate) return fast_calibrate; @@ -837,8 +889,12 @@ int recalibrate_cpu_khz(void) if (!boot_cpu_has(X86_FEATURE_TSC)) return -ENODEV; + cpu_khz = x86_platform.calibrate_cpu(); tsc_khz = x86_platform.calibrate_tsc(); - cpu_khz = tsc_khz; + if (tsc_khz == 0) + tsc_khz = cpu_khz; + else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz) + cpu_khz = tsc_khz; cpu_data(0).loops_per_jiffy = cpufreq_scale(cpu_data(0).loops_per_jiffy, cpu_khz_old, cpu_khz); @@ -1244,8 +1300,18 @@ void __init tsc_init(void) return; } + cpu_khz = x86_platform.calibrate_cpu(); tsc_khz = x86_platform.calibrate_tsc(); - cpu_khz = tsc_khz; + + /* + * Trust non-zero tsc_khz as authorative, + * and use it to sanity check cpu_khz, + * which will be off if system timer is off. + */ + if (tsc_khz == 0) + tsc_khz = cpu_khz; + else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz) + cpu_khz = tsc_khz; if (!tsc_khz) { mark_tsc_unstable("could not calculate TSC khz"); @@ -1265,7 +1331,7 @@ void __init tsc_init(void) */ for_each_possible_cpu(cpu) { cyc2ns_init(cpu); - set_cyc2ns_scale(cpu_khz, cpu); + set_cyc2ns_scale(tsc_khz, cpu); } if (tsc_disabled > 0) diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c index 6aa0f4d9eea6..0fe720d64fef 100644 --- a/arch/x86/kernel/tsc_msr.c +++ b/arch/x86/kernel/tsc_msr.c @@ -1,14 +1,5 @@ /* - * tsc_msr.c - MSR based TSC calibration on Intel Atom SoC platforms. 
- * - * TSC in Intel Atom SoC runs at a constant rate which can be figured - * by this formula: - * <maximum core-clock to bus-clock ratio> * <maximum resolved frequency> - * See Intel 64 and IA-32 System Programming Guid section 16.12 and 30.11.5 - * for details. - * Especially some Intel Atom SoCs don't have PIT(i8254) or HPET, so MSR - * based calibration is the only option. - * + * tsc_msr.c - TSC frequency enumeration via MSR * * Copyright (C) 2013 Intel Corporation * Author: Bin Gao <bin.gao@intel.com> @@ -22,17 +13,10 @@ #include <asm/apic.h> #include <asm/param.h> -/* CPU reference clock frequency: in KHz */ -#define FREQ_83 83200 -#define FREQ_100 99840 -#define FREQ_133 133200 -#define FREQ_166 166400 - -#define MAX_NUM_FREQS 8 +#define MAX_NUM_FREQS 9 /* - * According to Intel 64 and IA-32 System Programming Guide, - * if MSR_PERF_STAT[31] is set, the maximum resolved bus ratio can be + * If MSR_PERF_STAT[31] is set, the maximum resolved bus ratio can be * read in MSR_PLATFORM_ID[12:8], otherwise in MSR_PERF_STAT[44:40]. * Unfortunately some Intel Atom SoCs aren't quite compliant to this, * so we need manually differentiate SoC families. 
This is what the @@ -47,15 +31,18 @@ struct freq_desc { static struct freq_desc freq_desc_tables[] = { /* PNW */ - { 6, 0x27, 0, { 0, 0, 0, 0, 0, FREQ_100, 0, FREQ_83 } }, + { 6, 0x27, 0, { 0, 0, 0, 0, 0, 99840, 0, 83200 } }, /* CLV+ */ - { 6, 0x35, 0, { 0, FREQ_133, 0, 0, 0, FREQ_100, 0, FREQ_83 } }, - /* TNG */ - { 6, 0x4a, 1, { 0, FREQ_100, FREQ_133, 0, 0, 0, 0, 0 } }, - /* VLV2 */ - { 6, 0x37, 1, { FREQ_83, FREQ_100, FREQ_133, FREQ_166, 0, 0, 0, 0 } }, - /* ANN */ - { 6, 0x5a, 1, { FREQ_83, FREQ_100, FREQ_133, FREQ_100, 0, 0, 0, 0 } }, + { 6, 0x35, 0, { 0, 133200, 0, 0, 0, 99840, 0, 83200 } }, + /* TNG - Intel Atom processor Z3400 series */ + { 6, 0x4a, 1, { 0, 100000, 133300, 0, 0, 0, 0, 0 } }, + /* VLV2 - Intel Atom processor E3000, Z3600, Z3700 series */ + { 6, 0x37, 1, { 83300, 100000, 133300, 116700, 80000, 0, 0, 0 } }, + /* ANN - Intel Atom processor Z3500 series */ + { 6, 0x5a, 1, { 83300, 100000, 133300, 100000, 0, 0, 0, 0 } }, + /* AMT - Intel Atom processor X7-Z8000 and X5-Z8000 series */ + { 6, 0x4c, 1, { 83300, 100000, 133300, 116700, + 80000, 93300, 90000, 88900, 87500 } }, }; static int match_cpu(u8 family, u8 model) @@ -76,16 +63,20 @@ static int match_cpu(u8 family, u8 model) (freq_desc_tables[cpu_index].freqs[freq_id]) /* - * Do MSR calibration only for known/supported CPUs. + * MSR-based CPU/TSC frequency discovery for certain CPUs. * - * Returns the calibration value or 0 if MSR calibration failed. + * Set global "lapic_timer_frequency" to bus_clock_cycles/jiffy + * Return processor base frequency in KHz, or 0 on failure. 
*/ -unsigned long try_msr_calibrate_tsc(void) +unsigned long cpu_khz_from_msr(void) { u32 lo, hi, ratio, freq_id, freq; unsigned long res; int cpu_index; + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return 0; + cpu_index = match_cpu(boot_cpu_data.x86, boot_cpu_data.x86_model); if (cpu_index < 0) return 0; @@ -97,31 +88,17 @@ unsigned long try_msr_calibrate_tsc(void) rdmsr(MSR_IA32_PERF_STATUS, lo, hi); ratio = (hi >> 8) & 0x1f; } - pr_info("Maximum core-clock to bus-clock ratio: 0x%x\n", ratio); - - if (!ratio) - goto fail; /* Get FSB FREQ ID */ rdmsr(MSR_FSB_FREQ, lo, hi); freq_id = lo & 0x7; freq = id_to_freq(cpu_index, freq_id); - pr_info("Resolved frequency ID: %u, frequency: %u KHz\n", - freq_id, freq); - if (!freq) - goto fail; /* TSC frequency = maximum resolved freq * maximum resolved bus ratio */ res = freq * ratio; - pr_info("TSC runs at %lu KHz\n", res); #ifdef CONFIG_X86_LOCAL_APIC lapic_timer_frequency = (freq * 1000) / HZ; - pr_info("lapic_timer_frequency = %d\n", lapic_timer_frequency); #endif return res; - -fail: - pr_warn("Fast TSC calibration using MSR failed\n"); - return 0; } diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 3dce1ca0a653..01f30e56f99e 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -440,10 +440,7 @@ static inline unsigned long get_vflags(struct kernel_vm86_regs *regs) static inline int is_revectored(int nr, struct revectored_struct *bitmap) { - __asm__ __volatile__("btl %2,%1\n\tsbbl %0,%0" - :"=r" (nr) - :"m" (*bitmap), "r" (nr)); - return nr; + return test_bit(nr, bitmap->__map); } #define val_byte(val, n) (((__u8 *)&val)[n]) diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index cd05942bc918..f1aebfb49c36 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -44,6 +44,9 @@ EXPORT_SYMBOL(clear_page); EXPORT_SYMBOL(csum_partial); +EXPORT_SYMBOL(__sw_hweight32); +EXPORT_SYMBOL(__sw_hweight64); + /* * Export 
string functions. We normally rely on gcc builtin for most of these, * but gcc sometimes decides not to inline them. diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index dad5fe9633a3..58b459296e13 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -92,6 +92,7 @@ static void default_nmi_init(void) { }; static int default_i8042_detect(void) { return 1; }; struct x86_platform_ops x86_platform = { + .calibrate_cpu = native_calibrate_cpu, .calibrate_tsc = native_calibrate_tsc, .get_wallclock = mach_get_cmos_time, .set_wallclock = mach_set_rtc_mmss, diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 769af907f824..7597b42a8a88 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -181,19 +181,22 @@ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry __user *entries) { int r, i; - struct kvm_cpuid_entry *cpuid_entries; + struct kvm_cpuid_entry *cpuid_entries = NULL; r = -E2BIG; if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) goto out; r = -ENOMEM; - cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent); - if (!cpuid_entries) - goto out; - r = -EFAULT; - if (copy_from_user(cpuid_entries, entries, - cpuid->nent * sizeof(struct kvm_cpuid_entry))) - goto out_free; + if (cpuid->nent) { + cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * + cpuid->nent); + if (!cpuid_entries) + goto out; + r = -EFAULT; + if (copy_from_user(cpuid_entries, entries, + cpuid->nent * sizeof(struct kvm_cpuid_entry))) + goto out; + } for (i = 0; i < cpuid->nent; i++) { vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; @@ -212,9 +215,8 @@ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, kvm_x86_ops->cpuid_update(vcpu); r = kvm_update_cpuid(vcpu); -out_free: - vfree(cpuid_entries); out: + vfree(cpuid_entries); return r; } diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 9db47090ead0..5f42d038fcb4 100644 --- 
a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -443,7 +443,7 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, spin_lock(&ioapic->lock); if (trigger_mode != IOAPIC_LEVEL_TRIG || - kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) + kvm_lapic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) continue; ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG); diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 54ead79e444b..dfb4c6476877 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -382,9 +382,6 @@ void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u32 i, nr_ioapic_pins; int idx; - /* kvm->irq_routing must be read after clearing - * KVM_SCAN_IOAPIC. */ - smp_mb(); idx = srcu_read_lock(&kvm->irq_srcu); table = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); nr_ioapic_pins = min_t(u32, table->nr_rt_entries, diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 1a2da0e5a373..a397200281c1 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -59,9 +59,8 @@ /* #define apic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */ #define apic_debug(fmt, arg...) 
-#define APIC_LVT_NUM 6 /* 14 is the version for Xeon and Pentium 8.4.8*/ -#define APIC_VERSION (0x14UL | ((APIC_LVT_NUM - 1) << 16)) +#define APIC_VERSION (0x14UL | ((KVM_APIC_LVT_NUM - 1) << 16)) #define LAPIC_MMIO_LENGTH (1 << 12) /* followed define is not in apicdef.h */ #define APIC_SHORT_MASK 0xc0000 @@ -73,14 +72,6 @@ #define APIC_BROADCAST 0xFF #define X2APIC_BROADCAST 0xFFFFFFFFul -#define VEC_POS(v) ((v) & (32 - 1)) -#define REG_POS(v) (((v) >> 5) << 4) - -static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val) -{ - *((u32 *) (apic->regs + reg_off)) = val; -} - static inline int apic_test_vector(int vec, void *bitmap) { return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); @@ -94,11 +85,6 @@ bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector) apic_test_vector(vector, apic->regs + APIC_IRR); } -static inline void apic_set_vector(int vec, void *bitmap) -{ - set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); -} - static inline void apic_clear_vector(int vec, void *bitmap) { clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); @@ -173,7 +159,7 @@ static void recalculate_apic_map(struct kvm *kvm) continue; aid = kvm_apic_id(apic); - ldr = kvm_apic_get_reg(apic, APIC_LDR); + ldr = kvm_lapic_get_reg(apic, APIC_LDR); if (aid < ARRAY_SIZE(new->phys_map)) new->phys_map[aid] = apic; @@ -182,7 +168,7 @@ static void recalculate_apic_map(struct kvm *kvm) new->mode |= KVM_APIC_MODE_X2APIC; } else if (ldr) { ldr = GET_APIC_LOGICAL_ID(ldr); - if (kvm_apic_get_reg(apic, APIC_DFR) == APIC_DFR_FLAT) + if (kvm_lapic_get_reg(apic, APIC_DFR) == APIC_DFR_FLAT) new->mode |= KVM_APIC_MODE_XAPIC_FLAT; else new->mode |= KVM_APIC_MODE_XAPIC_CLUSTER; @@ -212,7 +198,7 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) { bool enabled = val & APIC_SPIV_APIC_ENABLED; - apic_set_reg(apic, APIC_SPIV, val); + kvm_lapic_set_reg(apic, APIC_SPIV, val); if (enabled != apic->sw_enabled) { apic->sw_enabled = enabled; @@ -226,13 +212,13 @@ static inline 
void apic_set_spiv(struct kvm_lapic *apic, u32 val) static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id) { - apic_set_reg(apic, APIC_ID, id << 24); + kvm_lapic_set_reg(apic, APIC_ID, id << 24); recalculate_apic_map(apic->vcpu->kvm); } static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id) { - apic_set_reg(apic, APIC_LDR, id); + kvm_lapic_set_reg(apic, APIC_LDR, id); recalculate_apic_map(apic->vcpu->kvm); } @@ -240,19 +226,19 @@ static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u8 id) { u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf)); - apic_set_reg(apic, APIC_ID, id << 24); - apic_set_reg(apic, APIC_LDR, ldr); + kvm_lapic_set_reg(apic, APIC_ID, id << 24); + kvm_lapic_set_reg(apic, APIC_LDR, ldr); recalculate_apic_map(apic->vcpu->kvm); } static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type) { - return !(kvm_apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED); + return !(kvm_lapic_get_reg(apic, lvt_type) & APIC_LVT_MASKED); } static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type) { - return kvm_apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK; + return kvm_lapic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK; } static inline int apic_lvtt_oneshot(struct kvm_lapic *apic) @@ -287,10 +273,10 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu) feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0); if (feat && (feat->ecx & (1 << (X86_FEATURE_X2APIC & 31)))) v |= APIC_LVR_DIRECTED_EOI; - apic_set_reg(apic, APIC_LVR, v); + kvm_lapic_set_reg(apic, APIC_LVR, v); } -static const unsigned int apic_lvt_mask[APIC_LVT_NUM] = { +static const unsigned int apic_lvt_mask[KVM_APIC_LVT_NUM] = { LVT_MASK , /* part LVTT mask, timer mode mask added at runtime */ LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */ LVT_MASK | APIC_MODE_MASK, /* LVTPC */ @@ -349,16 +335,6 @@ void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir) } EXPORT_SYMBOL_GPL(kvm_apic_update_irr); -static inline void apic_set_irr(int vec, struct kvm_lapic *apic) -{ 
- apic_set_vector(vec, apic->regs + APIC_IRR); - /* - * irr_pending must be true if any interrupt is pending; set it after - * APIC_IRR to avoid race with apic_clear_irr - */ - apic->irr_pending = true; -} - static inline int apic_search_irr(struct kvm_lapic *apic) { return find_highest_vector(apic->regs + APIC_IRR); @@ -416,7 +392,7 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic) * just set SVI. */ if (unlikely(vcpu->arch.apicv_active)) - kvm_x86_ops->hwapic_isr_update(vcpu->kvm, vec); + kvm_x86_ops->hwapic_isr_update(vcpu, vec); else { ++apic->isr_count; BUG_ON(apic->isr_count > MAX_APIC_VECTOR); @@ -464,7 +440,7 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) * and must be left alone. */ if (unlikely(vcpu->arch.apicv_active)) - kvm_x86_ops->hwapic_isr_update(vcpu->kvm, + kvm_x86_ops->hwapic_isr_update(vcpu, apic_find_highest_isr(apic)); else { --apic->isr_count; @@ -549,8 +525,8 @@ static void apic_update_ppr(struct kvm_lapic *apic) u32 tpr, isrv, ppr, old_ppr; int isr; - old_ppr = kvm_apic_get_reg(apic, APIC_PROCPRI); - tpr = kvm_apic_get_reg(apic, APIC_TASKPRI); + old_ppr = kvm_lapic_get_reg(apic, APIC_PROCPRI); + tpr = kvm_lapic_get_reg(apic, APIC_TASKPRI); isr = apic_find_highest_isr(apic); isrv = (isr != -1) ? 
isr : 0; @@ -563,7 +539,7 @@ static void apic_update_ppr(struct kvm_lapic *apic) apic, ppr, isr, isrv); if (old_ppr != ppr) { - apic_set_reg(apic, APIC_PROCPRI, ppr); + kvm_lapic_set_reg(apic, APIC_PROCPRI, ppr); if (ppr < old_ppr) kvm_make_request(KVM_REQ_EVENT, apic->vcpu); } @@ -571,7 +547,7 @@ static void apic_update_ppr(struct kvm_lapic *apic) static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) { - apic_set_reg(apic, APIC_TASKPRI, tpr); + kvm_lapic_set_reg(apic, APIC_TASKPRI, tpr); apic_update_ppr(apic); } @@ -601,7 +577,7 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) if (kvm_apic_broadcast(apic, mda)) return true; - logical_id = kvm_apic_get_reg(apic, APIC_LDR); + logical_id = kvm_lapic_get_reg(apic, APIC_LDR); if (apic_x2apic_mode(apic)) return ((logical_id >> 16) == (mda >> 16)) @@ -610,7 +586,7 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) logical_id = GET_APIC_LOGICAL_ID(logical_id); mda = GET_APIC_DEST_FIELD(mda); - switch (kvm_apic_get_reg(apic, APIC_DFR)) { + switch (kvm_lapic_get_reg(apic, APIC_DFR)) { case APIC_DFR_FLAT: return (logical_id & mda) != 0; case APIC_DFR_CLUSTER: @@ -618,7 +594,7 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) && (logical_id & mda & 0xf) != 0; default: apic_debug("Bad DFR vcpu %d: %08x\n", - apic->vcpu->vcpu_id, kvm_apic_get_reg(apic, APIC_DFR)); + apic->vcpu->vcpu_id, kvm_lapic_get_reg(apic, APIC_DFR)); return false; } } @@ -668,6 +644,7 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, return false; } } +EXPORT_SYMBOL_GPL(kvm_apic_match_dest); int kvm_vector_to_index(u32 vector, u32 dest_vcpus, const unsigned long *bitmap, u32 bitmap_size) @@ -921,7 +898,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) { if (trig_mode) - apic_set_vector(vector, apic->regs + APIC_TMR); + kvm_lapic_set_vector(vector, 
apic->regs + APIC_TMR); else apic_clear_vector(vector, apic->regs + APIC_TMR); } @@ -929,7 +906,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, if (vcpu->arch.apicv_active) kvm_x86_ops->deliver_posted_interrupt(vcpu, vector); else { - apic_set_irr(vector, apic); + kvm_lapic_set_irr(vector, apic); kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); @@ -1073,8 +1050,8 @@ EXPORT_SYMBOL_GPL(kvm_apic_set_eoi_accelerated); static void apic_send_ipi(struct kvm_lapic *apic) { - u32 icr_low = kvm_apic_get_reg(apic, APIC_ICR); - u32 icr_high = kvm_apic_get_reg(apic, APIC_ICR2); + u32 icr_low = kvm_lapic_get_reg(apic, APIC_ICR); + u32 icr_high = kvm_lapic_get_reg(apic, APIC_ICR2); struct kvm_lapic_irq irq; irq.vector = icr_low & APIC_VECTOR_MASK; @@ -1111,7 +1088,7 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic) ASSERT(apic != NULL); /* if initial count is 0, current count should also be 0 */ - if (kvm_apic_get_reg(apic, APIC_TMICT) == 0 || + if (kvm_lapic_get_reg(apic, APIC_TMICT) == 0 || apic->lapic_timer.period == 0) return 0; @@ -1168,13 +1145,13 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) break; case APIC_PROCPRI: apic_update_ppr(apic); - val = kvm_apic_get_reg(apic, offset); + val = kvm_lapic_get_reg(apic, offset); break; case APIC_TASKPRI: report_tpr_access(apic, false); /* fall thru */ default: - val = kvm_apic_get_reg(apic, offset); + val = kvm_lapic_get_reg(apic, offset); break; } @@ -1186,7 +1163,7 @@ static inline struct kvm_lapic *to_lapic(struct kvm_io_device *dev) return container_of(dev, struct kvm_lapic, dev); } -static int apic_reg_read(struct kvm_lapic *apic, u32 offset, int len, +int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, void *data) { unsigned char alignment = offset & 0xf; @@ -1223,6 +1200,7 @@ static int apic_reg_read(struct kvm_lapic *apic, u32 offset, int len, } return 0; } +EXPORT_SYMBOL_GPL(kvm_lapic_reg_read); static int apic_mmio_in_range(struct kvm_lapic 
*apic, gpa_t addr) { @@ -1240,7 +1218,7 @@ static int apic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, if (!apic_mmio_in_range(apic, address)) return -EOPNOTSUPP; - apic_reg_read(apic, offset, len, data); + kvm_lapic_reg_read(apic, offset, len, data); return 0; } @@ -1249,7 +1227,7 @@ static void update_divide_count(struct kvm_lapic *apic) { u32 tmp1, tmp2, tdcr; - tdcr = kvm_apic_get_reg(apic, APIC_TDCR); + tdcr = kvm_lapic_get_reg(apic, APIC_TDCR); tmp1 = tdcr & 0xf; tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1; apic->divide_count = 0x1 << (tmp2 & 0x7); @@ -1260,7 +1238,7 @@ static void update_divide_count(struct kvm_lapic *apic) static void apic_update_lvtt(struct kvm_lapic *apic) { - u32 timer_mode = kvm_apic_get_reg(apic, APIC_LVTT) & + u32 timer_mode = kvm_lapic_get_reg(apic, APIC_LVTT) & apic->lapic_timer.timer_mode_mask; if (apic->lapic_timer.timer_mode != timer_mode) { @@ -1296,7 +1274,7 @@ static void apic_timer_expired(struct kvm_lapic *apic) static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - u32 reg = kvm_apic_get_reg(apic, APIC_LVTT); + u32 reg = kvm_lapic_get_reg(apic, APIC_LVTT); if (kvm_apic_hw_enabled(apic)) { int vec = reg & APIC_VECTOR_MASK; @@ -1332,7 +1310,8 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu) /* __delay is delay_tsc whenever the hardware has TSC, thus always. 
*/ if (guest_tsc < tsc_deadline) - __delay(tsc_deadline - guest_tsc); + __delay(min(tsc_deadline - guest_tsc, + nsec_to_cycles(vcpu, lapic_timer_advance_ns))); } static void start_apic_timer(struct kvm_lapic *apic) @@ -1344,7 +1323,7 @@ static void start_apic_timer(struct kvm_lapic *apic) if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) { /* lapic timer in oneshot or periodic mode */ now = apic->lapic_timer.timer.base->get_time(); - apic->lapic_timer.period = (u64)kvm_apic_get_reg(apic, APIC_TMICT) + apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT) * APIC_BUS_CYCLE_NS * apic->divide_count; if (!apic->lapic_timer.period) @@ -1376,7 +1355,7 @@ static void start_apic_timer(struct kvm_lapic *apic) "timer initial count 0x%x, period %lldns, " "expire @ 0x%016" PRIx64 ".\n", __func__, APIC_BUS_CYCLE_NS, ktime_to_ns(now), - kvm_apic_get_reg(apic, APIC_TMICT), + kvm_lapic_get_reg(apic, APIC_TMICT), apic->lapic_timer.period, ktime_to_ns(ktime_add_ns(now, apic->lapic_timer.period))); @@ -1425,7 +1404,7 @@ static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) } } -static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) +int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) { int ret = 0; @@ -1457,7 +1436,7 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) case APIC_DFR: if (!apic_x2apic_mode(apic)) { - apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF); + kvm_lapic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF); recalculate_apic_map(apic->vcpu->kvm); } else ret = 1; @@ -1465,17 +1444,17 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) case APIC_SPIV: { u32 mask = 0x3ff; - if (kvm_apic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI) + if (kvm_lapic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI) mask |= APIC_SPIV_DIRECTED_EOI; apic_set_spiv(apic, val & mask); if (!(val & APIC_SPIV_APIC_ENABLED)) { int i; u32 lvt_val; - for (i = 0; i < APIC_LVT_NUM; i++) { - lvt_val = 
kvm_apic_get_reg(apic, + for (i = 0; i < KVM_APIC_LVT_NUM; i++) { + lvt_val = kvm_lapic_get_reg(apic, APIC_LVTT + 0x10 * i); - apic_set_reg(apic, APIC_LVTT + 0x10 * i, + kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i, lvt_val | APIC_LVT_MASKED); } apic_update_lvtt(apic); @@ -1486,14 +1465,14 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) } case APIC_ICR: /* No delay here, so we always clear the pending bit */ - apic_set_reg(apic, APIC_ICR, val & ~(1 << 12)); + kvm_lapic_set_reg(apic, APIC_ICR, val & ~(1 << 12)); apic_send_ipi(apic); break; case APIC_ICR2: if (!apic_x2apic_mode(apic)) val &= 0xff000000; - apic_set_reg(apic, APIC_ICR2, val); + kvm_lapic_set_reg(apic, APIC_ICR2, val); break; case APIC_LVT0: @@ -1507,7 +1486,7 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) val |= APIC_LVT_MASKED; val &= apic_lvt_mask[(reg - APIC_LVTT) >> 4]; - apic_set_reg(apic, reg, val); + kvm_lapic_set_reg(apic, reg, val); break; @@ -1515,7 +1494,7 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) if (!kvm_apic_sw_enabled(apic)) val |= APIC_LVT_MASKED; val &= (apic_lvt_mask[0] | apic->lapic_timer.timer_mode_mask); - apic_set_reg(apic, APIC_LVTT, val); + kvm_lapic_set_reg(apic, APIC_LVTT, val); apic_update_lvtt(apic); break; @@ -1524,14 +1503,14 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) break; hrtimer_cancel(&apic->lapic_timer.timer); - apic_set_reg(apic, APIC_TMICT, val); + kvm_lapic_set_reg(apic, APIC_TMICT, val); start_apic_timer(apic); break; case APIC_TDCR: if (val & 4) apic_debug("KVM_WRITE:TDCR %x\n", val); - apic_set_reg(apic, APIC_TDCR, val); + kvm_lapic_set_reg(apic, APIC_TDCR, val); update_divide_count(apic); break; @@ -1544,7 +1523,7 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) case APIC_SELF_IPI: if (apic_x2apic_mode(apic)) { - apic_reg_write(apic, APIC_ICR, 0x40000 | (val & 0xff)); + kvm_lapic_reg_write(apic, APIC_ICR, 0x40000 | (val & 0xff)); } else 
ret = 1; break; @@ -1556,6 +1535,7 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) apic_debug("Local APIC Write to read-only register %x\n", reg); return ret; } +EXPORT_SYMBOL_GPL(kvm_lapic_reg_write); static int apic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t address, int len, const void *data) @@ -1585,14 +1565,14 @@ static int apic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, apic_debug("%s: offset 0x%x with length 0x%x, and value is " "0x%x\n", __func__, offset, len, val); - apic_reg_write(apic, offset & 0xff0, val); + kvm_lapic_reg_write(apic, offset & 0xff0, val); return 0; } void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu) { - apic_reg_write(vcpu->arch.apic, APIC_EOI, 0); + kvm_lapic_reg_write(vcpu->arch.apic, APIC_EOI, 0); } EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); @@ -1604,10 +1584,10 @@ void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset) /* hw has done the conditional check and inst decode */ offset &= 0xff0; - apic_reg_read(vcpu->arch.apic, offset, 4, &val); + kvm_lapic_reg_read(vcpu->arch.apic, offset, 4, &val); /* TODO: optimize to just emulate side effect w/o one more write */ - apic_reg_write(vcpu->arch.apic, offset, val); + kvm_lapic_reg_write(vcpu->arch.apic, offset, val); } EXPORT_SYMBOL_GPL(kvm_apic_write_nodecode); @@ -1667,14 +1647,14 @@ void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8) struct kvm_lapic *apic = vcpu->arch.apic; apic_set_tpr(apic, ((cr8 & 0x0f) << 4) - | (kvm_apic_get_reg(apic, APIC_TASKPRI) & 4)); + | (kvm_lapic_get_reg(apic, APIC_TASKPRI) & 4)); } u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) { u64 tpr; - tpr = (u64) kvm_apic_get_reg(vcpu->arch.apic, APIC_TASKPRI); + tpr = (u64) kvm_lapic_get_reg(vcpu->arch.apic, APIC_TASKPRI); return (tpr & 0xf0) >> 4; } @@ -1740,28 +1720,28 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_apic_set_id(apic, vcpu->vcpu_id); kvm_apic_set_version(apic->vcpu); - for (i = 0; i < APIC_LVT_NUM; 
i++) - apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED); + for (i = 0; i < KVM_APIC_LVT_NUM; i++) + kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED); apic_update_lvtt(apic); if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LINT0_REENABLED)) - apic_set_reg(apic, APIC_LVT0, + kvm_lapic_set_reg(apic, APIC_LVT0, SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT)); - apic_manage_nmi_watchdog(apic, kvm_apic_get_reg(apic, APIC_LVT0)); + apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0)); - apic_set_reg(apic, APIC_DFR, 0xffffffffU); + kvm_lapic_set_reg(apic, APIC_DFR, 0xffffffffU); apic_set_spiv(apic, 0xff); - apic_set_reg(apic, APIC_TASKPRI, 0); + kvm_lapic_set_reg(apic, APIC_TASKPRI, 0); if (!apic_x2apic_mode(apic)) kvm_apic_set_ldr(apic, 0); - apic_set_reg(apic, APIC_ESR, 0); - apic_set_reg(apic, APIC_ICR, 0); - apic_set_reg(apic, APIC_ICR2, 0); - apic_set_reg(apic, APIC_TDCR, 0); - apic_set_reg(apic, APIC_TMICT, 0); + kvm_lapic_set_reg(apic, APIC_ESR, 0); + kvm_lapic_set_reg(apic, APIC_ICR, 0); + kvm_lapic_set_reg(apic, APIC_ICR2, 0); + kvm_lapic_set_reg(apic, APIC_TDCR, 0); + kvm_lapic_set_reg(apic, APIC_TMICT, 0); for (i = 0; i < 8; i++) { - apic_set_reg(apic, APIC_IRR + 0x10 * i, 0); - apic_set_reg(apic, APIC_ISR + 0x10 * i, 0); - apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); + kvm_lapic_set_reg(apic, APIC_IRR + 0x10 * i, 0); + kvm_lapic_set_reg(apic, APIC_ISR + 0x10 * i, 0); + kvm_lapic_set_reg(apic, APIC_TMR + 0x10 * i, 0); } apic->irr_pending = vcpu->arch.apicv_active; apic->isr_count = vcpu->arch.apicv_active ? 
1 : 0; @@ -1806,7 +1786,7 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu) int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type) { - u32 reg = kvm_apic_get_reg(apic, lvt_type); + u32 reg = kvm_lapic_get_reg(apic, lvt_type); int vector, mode, trig_mode; if (kvm_apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) { @@ -1901,14 +1881,14 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) apic_update_ppr(apic); highest_irr = apic_find_highest_irr(apic); if ((highest_irr == -1) || - ((highest_irr & 0xF0) <= kvm_apic_get_reg(apic, APIC_PROCPRI))) + ((highest_irr & 0xF0) <= kvm_lapic_get_reg(apic, APIC_PROCPRI))) return -1; return highest_irr; } int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) { - u32 lvt0 = kvm_apic_get_reg(vcpu->arch.apic, APIC_LVT0); + u32 lvt0 = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVT0); int r = 0; if (!kvm_apic_hw_enabled(vcpu->arch.apic)) @@ -1974,7 +1954,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, apic_update_ppr(apic); hrtimer_cancel(&apic->lapic_timer.timer); apic_update_lvtt(apic); - apic_manage_nmi_watchdog(apic, kvm_apic_get_reg(apic, APIC_LVT0)); + apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0)); update_divide_count(apic); start_apic_timer(apic); apic->irr_pending = true; @@ -1982,9 +1962,11 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, 1 : count_vectors(apic->regs + APIC_ISR); apic->highest_isr_cache = -1; if (vcpu->arch.apicv_active) { + if (kvm_x86_ops->apicv_post_state_restore) + kvm_x86_ops->apicv_post_state_restore(vcpu); kvm_x86_ops->hwapic_irr_update(vcpu, apic_find_highest_irr(apic)); - kvm_x86_ops->hwapic_isr_update(vcpu->kvm, + kvm_x86_ops->hwapic_isr_update(vcpu, apic_find_highest_isr(apic)); } kvm_make_request(KVM_REQ_EVENT, vcpu); @@ -2097,7 +2079,7 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu) if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) return; - tpr = kvm_apic_get_reg(apic, APIC_TASKPRI) & 0xff; + tpr = kvm_lapic_get_reg(apic, 
APIC_TASKPRI) & 0xff; max_irr = apic_find_highest_irr(apic); if (max_irr < 0) max_irr = 0; @@ -2139,8 +2121,8 @@ int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data) /* if this is ICR write vector before command */ if (reg == APIC_ICR) - apic_reg_write(apic, APIC_ICR2, (u32)(data >> 32)); - return apic_reg_write(apic, reg, (u32)data); + kvm_lapic_reg_write(apic, APIC_ICR2, (u32)(data >> 32)); + return kvm_lapic_reg_write(apic, reg, (u32)data); } int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data) @@ -2157,10 +2139,10 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data) return 1; } - if (apic_reg_read(apic, reg, 4, &low)) + if (kvm_lapic_reg_read(apic, reg, 4, &low)) return 1; if (reg == APIC_ICR) - apic_reg_read(apic, APIC_ICR2, 4, &high); + kvm_lapic_reg_read(apic, APIC_ICR2, 4, &high); *data = (((u64)high) << 32) | low; @@ -2176,8 +2158,8 @@ int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data) /* if this is ICR write vector before command */ if (reg == APIC_ICR) - apic_reg_write(apic, APIC_ICR2, (u32)(data >> 32)); - return apic_reg_write(apic, reg, (u32)data); + kvm_lapic_reg_write(apic, APIC_ICR2, (u32)(data >> 32)); + return kvm_lapic_reg_write(apic, reg, (u32)data); } int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data) @@ -2188,10 +2170,10 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data) if (!lapic_in_kernel(vcpu)) return 1; - if (apic_reg_read(apic, reg, 4, &low)) + if (kvm_lapic_reg_read(apic, reg, 4, &low)) return 1; if (reg == APIC_ICR) - apic_reg_read(apic, APIC_ICR2, 4, &high); + kvm_lapic_reg_read(apic, APIC_ICR2, 4, &high); *data = (((u64)high) << 32) | low; diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index f71183e502ee..891c6da7d4aa 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -7,6 +7,10 @@ #define KVM_APIC_INIT 0 #define KVM_APIC_SIPI 1 +#define KVM_APIC_LVT_NUM 6 + +#define KVM_APIC_SHORT_MASK 0xc0000 +#define 
KVM_APIC_DEST_MASK 0x800 struct kvm_timer { struct hrtimer timer; @@ -59,6 +63,11 @@ void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu); void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); void kvm_apic_set_version(struct kvm_vcpu *vcpu); +int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val); +int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, + void *data); +bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, + int short_hand, unsigned int dest, int dest_mode); void __kvm_apic_update_irr(u32 *pir, void *regs); void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir); @@ -99,9 +108,32 @@ static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu) int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data); void kvm_lapic_init(void); -static inline u32 kvm_apic_get_reg(struct kvm_lapic *apic, int reg_off) +#define VEC_POS(v) ((v) & (32 - 1)) +#define REG_POS(v) (((v) >> 5) << 4) + +static inline void kvm_lapic_set_vector(int vec, void *bitmap) +{ + set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); +} + +static inline void kvm_lapic_set_irr(int vec, struct kvm_lapic *apic) +{ + kvm_lapic_set_vector(vec, apic->regs + APIC_IRR); + /* + * irr_pending must be true if any interrupt is pending; set it after + * APIC_IRR to avoid race with apic_clear_irr + */ + apic->irr_pending = true; +} + +static inline u32 kvm_lapic_get_reg(struct kvm_lapic *apic, int reg_off) +{ + return *((u32 *) (apic->regs + reg_off)); +} + +static inline void kvm_lapic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val) { - return *((u32 *) (apic->regs + reg_off)); + *((u32 *) (apic->regs + reg_off)) = val; } extern struct static_key kvm_no_apic_vcpu; @@ -169,7 +201,7 @@ static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu) static inline int kvm_apic_id(struct kvm_lapic *apic) { - return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff; + return (kvm_lapic_get_reg(apic, 
APIC_ID) >> 24) & 0xff; } bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 38c0c32926c9..def97b3a392b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -336,12 +336,12 @@ static gfn_t pse36_gfn_delta(u32 gpte) #ifdef CONFIG_X86_64 static void __set_spte(u64 *sptep, u64 spte) { - *sptep = spte; + WRITE_ONCE(*sptep, spte); } static void __update_clear_spte_fast(u64 *sptep, u64 spte) { - *sptep = spte; + WRITE_ONCE(*sptep, spte); } static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) @@ -390,7 +390,7 @@ static void __set_spte(u64 *sptep, u64 spte) */ smp_wmb(); - ssptep->spte_low = sspte.spte_low; + WRITE_ONCE(ssptep->spte_low, sspte.spte_low); } static void __update_clear_spte_fast(u64 *sptep, u64 spte) @@ -400,7 +400,7 @@ static void __update_clear_spte_fast(u64 *sptep, u64 spte) ssptep = (union split_spte *)sptep; sspte = (union split_spte)spte; - ssptep->spte_low = sspte.spte_low; + WRITE_ONCE(ssptep->spte_low, sspte.spte_low); /* * If we map the spte from present to nonpresent, we should clear @@ -1909,18 +1909,17 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, * since it has been deleted from active_mmu_pages but still can be found * at hast list. * - * for_each_gfn_indirect_valid_sp has skipped that kind of page and - * kvm_mmu_get_page(), the only user of for_each_gfn_sp(), has skipped - * all the obsolete pages. + * for_each_gfn_valid_sp() has skipped that kind of pages. 
*/ -#define for_each_gfn_sp(_kvm, _sp, _gfn) \ +#define for_each_gfn_valid_sp(_kvm, _sp, _gfn) \ hlist_for_each_entry(_sp, \ &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \ - if ((_sp)->gfn != (_gfn)) {} else + if ((_sp)->gfn != (_gfn) || is_obsolete_sp((_kvm), (_sp)) \ + || (_sp)->role.invalid) {} else #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \ - for_each_gfn_sp(_kvm, _sp, _gfn) \ - if ((_sp)->role.direct || (_sp)->role.invalid) {} else + for_each_gfn_valid_sp(_kvm, _sp, _gfn) \ + if ((_sp)->role.direct) {} else /* @sp->gfn should be write-protected at the call site */ static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, @@ -1961,6 +1960,11 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { } static void mmu_audit_disable(void) { } #endif +static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen); +} + static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, struct list_head *invalid_list) { @@ -2105,11 +2109,6 @@ static void clear_sp_write_flooding_count(u64 *spte) __clear_sp_write_flooding_count(sp); } -static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp) -{ - return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen); -} - static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gaddr, @@ -2136,10 +2135,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; role.quadrant = quadrant; } - for_each_gfn_sp(vcpu->kvm, sp, gfn) { - if (is_obsolete_sp(vcpu->kvm, sp)) - continue; - + for_each_gfn_valid_sp(vcpu->kvm, sp, gfn) { if (!need_sync && sp->unsync) need_sync = true; diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index 3f8c732117ec..0149ac59c273 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -44,8 +44,6 @@ static bool msr_mtrr_valid(unsigned msr) 
case MSR_MTRRdefType: case MSR_IA32_CR_PAT: return true; - case 0x2f8: - return true; } return false; } @@ -541,6 +539,7 @@ static void mtrr_lookup_var_start(struct mtrr_iter *iter) iter->fixed = false; iter->start_max = iter->start; + iter->range = NULL; iter->range = list_prepare_entry(iter->range, &mtrr_state->head, node); __mtrr_lookup_var_next(iter); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index fafd720ce10a..16ef31b87452 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -14,6 +14,9 @@ * the COPYING file in the top-level directory. * */ + +#define pr_fmt(fmt) "SVM: " fmt + #include <linux/kvm_host.h> #include "irq.h" @@ -32,6 +35,7 @@ #include <linux/trace_events.h> #include <linux/slab.h> +#include <asm/apic.h> #include <asm/perf_event.h> #include <asm/tlbflush.h> #include <asm/desc.h> @@ -68,6 +72,8 @@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id); #define SVM_FEATURE_DECODE_ASSIST (1 << 7) #define SVM_FEATURE_PAUSE_FILTER (1 << 10) +#define SVM_AVIC_DOORBELL 0xc001011b + #define NESTED_EXIT_HOST 0 /* Exit handled on host level */ #define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */ #define NESTED_EXIT_CONTINUE 2 /* Further checks needed */ @@ -78,6 +84,18 @@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id); #define TSC_RATIO_MIN 0x0000000000000001ULL #define TSC_RATIO_MAX 0x000000ffffffffffULL +#define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF) + +/* + * 0xff is broadcast, so the max index allowed for physical APIC ID + * table is 0xfe. APIC IDs above 0xff are reserved. 
+ */ +#define AVIC_MAX_PHYSICAL_ID_COUNT 255 + +#define AVIC_UNACCEL_ACCESS_WRITE_MASK 1 +#define AVIC_UNACCEL_ACCESS_OFFSET_MASK 0xFF0 +#define AVIC_UNACCEL_ACCESS_VECTOR_MASK 0xFFFFFFFF + static bool erratum_383_found __read_mostly; static const u32 host_save_user_msrs[] = { @@ -162,8 +180,21 @@ struct vcpu_svm { /* cached guest cpuid flags for faster access */ bool nrips_enabled : 1; + + u32 ldr_reg; + struct page *avic_backing_page; + u64 *avic_physical_id_cache; + bool avic_is_running; }; +#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF) +#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31) + +#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK (0xFFULL) +#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12) +#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62) +#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63) + static DEFINE_PER_CPU(u64, current_tsc_ratio); #define TSC_RATIO_DEFAULT 0x0100000000ULL @@ -205,6 +236,12 @@ module_param(npt, int, S_IRUGO); static int nested = true; module_param(nested, int, S_IRUGO); +/* enable / disable AVIC */ +static int avic; +#ifdef CONFIG_X86_LOCAL_APIC +module_param(avic, int, S_IRUGO); +#endif + static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); static void svm_flush_tlb(struct kvm_vcpu *vcpu); static void svm_complete_interrupts(struct vcpu_svm *svm); @@ -228,12 +265,18 @@ enum { VMCB_SEG, /* CS, DS, SS, ES, CPL */ VMCB_CR2, /* CR2 only */ VMCB_LBR, /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */ + VMCB_AVIC, /* AVIC APIC_BAR, AVIC APIC_BACKING_PAGE, + * AVIC PHYSICAL_TABLE pointer, + * AVIC LOGICAL_TABLE pointer + */ VMCB_DIRTY_MAX, }; /* TPR and CR2 are always written before VMRUN */ #define VMCB_ALWAYS_DIRTY_MASK ((1U << VMCB_INTR) | (1U << VMCB_CR2)) +#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL + static inline void mark_all_dirty(struct vmcb *vmcb) { vmcb->control.clean = 0; @@ -255,6 +298,23 @@ static inline struct vcpu_svm *to_svm(struct 
kvm_vcpu *vcpu) return container_of(vcpu, struct vcpu_svm, vcpu); } +static inline void avic_update_vapic_bar(struct vcpu_svm *svm, u64 data) +{ + svm->vmcb->control.avic_vapic_bar = data & VMCB_AVIC_APIC_BAR_MASK; + mark_dirty(svm->vmcb, VMCB_AVIC); +} + +static inline bool avic_vcpu_is_running(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + u64 *entry = svm->avic_physical_id_cache; + + if (!entry) + return false; + + return (READ_ONCE(*entry) & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK); +} + static void recalc_intercepts(struct vcpu_svm *svm) { struct vmcb_control_area *c, *h; @@ -923,6 +983,15 @@ static __init int svm_hardware_setup(void) } else kvm_disable_tdp(); + if (avic) { + if (!npt_enabled || + !boot_cpu_has(X86_FEATURE_AVIC) || + !IS_ENABLED(CONFIG_X86_LOCAL_APIC)) + avic = false; + else + pr_info("AVIC enabled\n"); + } + return 0; err: @@ -1000,6 +1069,22 @@ static void svm_adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment) mark_dirty(svm->vmcb, VMCB_INTERCEPTS); } +static void avic_init_vmcb(struct vcpu_svm *svm) +{ + struct vmcb *vmcb = svm->vmcb; + struct kvm_arch *vm_data = &svm->vcpu.kvm->arch; + phys_addr_t bpa = page_to_phys(svm->avic_backing_page); + phys_addr_t lpa = page_to_phys(vm_data->avic_logical_id_table_page); + phys_addr_t ppa = page_to_phys(vm_data->avic_physical_id_table_page); + + vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK; + vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK; + vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK; + vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT; + vmcb->control.int_ctl |= AVIC_ENABLE_MASK; + svm->vcpu.arch.apicv_active = true; +} + static void init_vmcb(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; @@ -1014,7 +1099,8 @@ static void init_vmcb(struct vcpu_svm *svm) set_cr_intercept(svm, INTERCEPT_CR0_WRITE); set_cr_intercept(svm, INTERCEPT_CR3_WRITE); set_cr_intercept(svm, INTERCEPT_CR4_WRITE); - 
set_cr_intercept(svm, INTERCEPT_CR8_WRITE); + if (!kvm_vcpu_apicv_active(&svm->vcpu)) + set_cr_intercept(svm, INTERCEPT_CR8_WRITE); set_dr_intercepts(svm); @@ -1110,9 +1196,197 @@ static void init_vmcb(struct vcpu_svm *svm) set_intercept(svm, INTERCEPT_PAUSE); } + if (avic) + avic_init_vmcb(svm); + mark_all_dirty(svm->vmcb); enable_gif(svm); + +} + +static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, int index) +{ + u64 *avic_physical_id_table; + struct kvm_arch *vm_data = &vcpu->kvm->arch; + + if (index >= AVIC_MAX_PHYSICAL_ID_COUNT) + return NULL; + + avic_physical_id_table = page_address(vm_data->avic_physical_id_table_page); + + return &avic_physical_id_table[index]; +} + +/** + * Note: + * AVIC hardware walks the nested page table to check permissions, + * but does not use the SPA address specified in the leaf page + * table entry since it uses address in the AVIC_BACKING_PAGE pointer + * field of the VMCB. Therefore, we set up the + * APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (4KB) here. 
+ */ +static int avic_init_access_page(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + int ret; + + if (kvm->arch.apic_access_page_done) + return 0; + + ret = x86_set_memory_region(kvm, + APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, + APIC_DEFAULT_PHYS_BASE, + PAGE_SIZE); + if (ret) + return ret; + + kvm->arch.apic_access_page_done = true; + return 0; +} + +static int avic_init_backing_page(struct kvm_vcpu *vcpu) +{ + int ret; + u64 *entry, new_entry; + int id = vcpu->vcpu_id; + struct vcpu_svm *svm = to_svm(vcpu); + + ret = avic_init_access_page(vcpu); + if (ret) + return ret; + + if (id >= AVIC_MAX_PHYSICAL_ID_COUNT) + return -EINVAL; + + if (!svm->vcpu.arch.apic->regs) + return -EINVAL; + + svm->avic_backing_page = virt_to_page(svm->vcpu.arch.apic->regs); + + /* Setting AVIC backing page address in the phy APIC ID table */ + entry = avic_get_physical_id_entry(vcpu, id); + if (!entry) + return -EINVAL; + + new_entry = READ_ONCE(*entry); + new_entry = (page_to_phys(svm->avic_backing_page) & + AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) | + AVIC_PHYSICAL_ID_ENTRY_VALID_MASK; + WRITE_ONCE(*entry, new_entry); + + svm->avic_physical_id_cache = entry; + + return 0; +} + +static void avic_vm_destroy(struct kvm *kvm) +{ + struct kvm_arch *vm_data = &kvm->arch; + + if (vm_data->avic_logical_id_table_page) + __free_page(vm_data->avic_logical_id_table_page); + if (vm_data->avic_physical_id_table_page) + __free_page(vm_data->avic_physical_id_table_page); +} + +static int avic_vm_init(struct kvm *kvm) +{ + int err = -ENOMEM; + struct kvm_arch *vm_data = &kvm->arch; + struct page *p_page; + struct page *l_page; + + if (!avic) + return 0; + + /* Allocating physical APIC ID table (4KB) */ + p_page = alloc_page(GFP_KERNEL); + if (!p_page) + goto free_avic; + + vm_data->avic_physical_id_table_page = p_page; + clear_page(page_address(p_page)); + + /* Allocating logical APIC ID table (4KB) */ + l_page = alloc_page(GFP_KERNEL); + if (!l_page) + goto free_avic; + + 
vm_data->avic_logical_id_table_page = l_page; + clear_page(page_address(l_page)); + + return 0; + +free_avic: + avic_vm_destroy(kvm); + return err; +} + +/** + * This function is called during VCPU halt/unhalt. + */ +static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run) +{ + u64 entry; + int h_physical_id = kvm_cpu_get_apicid(vcpu->cpu); + struct vcpu_svm *svm = to_svm(vcpu); + + if (!kvm_vcpu_apicv_active(vcpu)) + return; + + svm->avic_is_running = is_run; + + /* ID = 0xff (broadcast), ID > 0xff (reserved) */ + if (WARN_ON(h_physical_id >= AVIC_MAX_PHYSICAL_ID_COUNT)) + return; + + entry = READ_ONCE(*(svm->avic_physical_id_cache)); + WARN_ON(is_run == !!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)); + + entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; + if (is_run) + entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; + WRITE_ONCE(*(svm->avic_physical_id_cache), entry); +} + +static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + u64 entry; + /* ID = 0xff (broadcast), ID > 0xff (reserved) */ + int h_physical_id = kvm_cpu_get_apicid(cpu); + struct vcpu_svm *svm = to_svm(vcpu); + + if (!kvm_vcpu_apicv_active(vcpu)) + return; + + if (WARN_ON(h_physical_id >= AVIC_MAX_PHYSICAL_ID_COUNT)) + return; + + entry = READ_ONCE(*(svm->avic_physical_id_cache)); + WARN_ON(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK); + + entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK; + entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK); + + entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; + if (svm->avic_is_running) + entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; + + WRITE_ONCE(*(svm->avic_physical_id_cache), entry); +} + +static void avic_vcpu_put(struct kvm_vcpu *vcpu) +{ + u64 entry; + struct vcpu_svm *svm = to_svm(vcpu); + + if (!kvm_vcpu_apicv_active(vcpu)) + return; + + entry = READ_ONCE(*(svm->avic_physical_id_cache)); + entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; + WRITE_ONCE(*(svm->avic_physical_id_cache), entry); } static 
void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) @@ -1131,6 +1405,9 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy); kvm_register_write(vcpu, VCPU_REGS_RDX, eax); + + if (kvm_vcpu_apicv_active(vcpu) && !init_event) + avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE); } static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) @@ -1169,6 +1446,17 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) if (!hsave_page) goto free_page3; + if (avic) { + err = avic_init_backing_page(&svm->vcpu); + if (err) + goto free_page4; + } + + /* We initialize this flag to true to make sure that the is_running + * bit would be set the first time the vcpu is loaded. + */ + svm->avic_is_running = true; + svm->nested.hsave = page_address(hsave_page); svm->msrpm = page_address(msrpm_pages); @@ -1187,6 +1475,8 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) return &svm->vcpu; +free_page4: + __free_page(hsave_page); free_page3: __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER); free_page2: @@ -1243,6 +1533,8 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) /* This assumes that the kernel never uses MSR_TSC_AUX */ if (static_cpu_has(X86_FEATURE_RDTSCP)) wrmsrl(MSR_TSC_AUX, svm->tsc_aux); + + avic_vcpu_load(vcpu, cpu); } static void svm_vcpu_put(struct kvm_vcpu *vcpu) @@ -1250,6 +1542,8 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); int i; + avic_vcpu_put(vcpu); + ++vcpu->stat.host_state_reload; kvm_load_ldt(svm->host.ldt); #ifdef CONFIG_X86_64 @@ -1265,6 +1559,16 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu) wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); } +static void svm_vcpu_blocking(struct kvm_vcpu *vcpu) +{ + avic_set_running(vcpu, false); +} + +static void svm_vcpu_unblocking(struct kvm_vcpu *vcpu) +{ + avic_set_running(vcpu, true); +} + static unsigned long 
svm_get_rflags(struct kvm_vcpu *vcpu) { return to_svm(vcpu)->vmcb->save.rflags; @@ -2673,10 +2977,11 @@ static int clgi_interception(struct vcpu_svm *svm) disable_gif(svm); /* After a CLGI no interrupts should come */ - svm_clear_vintr(svm); - svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; - - mark_dirty(svm->vmcb, VMCB_INTR); + if (!kvm_vcpu_apicv_active(&svm->vcpu)) { + svm_clear_vintr(svm); + svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; + mark_dirty(svm->vmcb, VMCB_INTR); + } return 1; } @@ -3212,6 +3517,10 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) case MSR_VM_IGNNE: vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); break; + case MSR_IA32_APICBASE: + if (kvm_vcpu_apicv_active(vcpu)) + avic_update_vapic_bar(to_svm(vcpu), data); + /* Follow through */ default: return kvm_set_msr_common(vcpu, msr); } @@ -3281,6 +3590,278 @@ static int mwait_interception(struct vcpu_svm *svm) return nop_interception(svm); } +enum avic_ipi_failure_cause { + AVIC_IPI_FAILURE_INVALID_INT_TYPE, + AVIC_IPI_FAILURE_TARGET_NOT_RUNNING, + AVIC_IPI_FAILURE_INVALID_TARGET, + AVIC_IPI_FAILURE_INVALID_BACKING_PAGE, +}; + +static int avic_incomplete_ipi_interception(struct vcpu_svm *svm) +{ + u32 icrh = svm->vmcb->control.exit_info_1 >> 32; + u32 icrl = svm->vmcb->control.exit_info_1; + u32 id = svm->vmcb->control.exit_info_2 >> 32; + u32 index = svm->vmcb->control.exit_info_2 & 0xFF; + struct kvm_lapic *apic = svm->vcpu.arch.apic; + + trace_kvm_avic_incomplete_ipi(svm->vcpu.vcpu_id, icrh, icrl, id, index); + + switch (id) { + case AVIC_IPI_FAILURE_INVALID_INT_TYPE: + /* + * AVIC hardware handles the generation of + * IPIs when the specified Message Type is Fixed + * (also known as fixed delivery mode) and + * the Trigger Mode is edge-triggered. The hardware + * also supports self and broadcast delivery modes + * specified via the Destination Shorthand(DSH) + * field of the ICRL. Logical and physical APIC ID + * formats are supported. 
All other IPI types cause + * a #VMEXIT, which needs to emulated. + */ + kvm_lapic_reg_write(apic, APIC_ICR2, icrh); + kvm_lapic_reg_write(apic, APIC_ICR, icrl); + break; + case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING: { + int i; + struct kvm_vcpu *vcpu; + struct kvm *kvm = svm->vcpu.kvm; + struct kvm_lapic *apic = svm->vcpu.arch.apic; + + /* + * At this point, we expect that the AVIC HW has already + * set the appropriate IRR bits on the valid target + * vcpus. So, we just need to kick the appropriate vcpu. + */ + kvm_for_each_vcpu(i, vcpu, kvm) { + bool m = kvm_apic_match_dest(vcpu, apic, + icrl & KVM_APIC_SHORT_MASK, + GET_APIC_DEST_FIELD(icrh), + icrl & KVM_APIC_DEST_MASK); + + if (m && !avic_vcpu_is_running(vcpu)) + kvm_vcpu_wake_up(vcpu); + } + break; + } + case AVIC_IPI_FAILURE_INVALID_TARGET: + break; + case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE: + WARN_ONCE(1, "Invalid backing page\n"); + break; + default: + pr_err("Unknown IPI interception\n"); + } + + return 1; +} + +static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat) +{ + struct kvm_arch *vm_data = &vcpu->kvm->arch; + int index; + u32 *logical_apic_id_table; + int dlid = GET_APIC_LOGICAL_ID(ldr); + + if (!dlid) + return NULL; + + if (flat) { /* flat */ + index = ffs(dlid) - 1; + if (index > 7) + return NULL; + } else { /* cluster */ + int cluster = (dlid & 0xf0) >> 4; + int apic = ffs(dlid & 0x0f) - 1; + + if ((apic < 0) || (apic > 7) || + (cluster >= 0xf)) + return NULL; + index = (cluster << 2) + apic; + } + + logical_apic_id_table = (u32 *) page_address(vm_data->avic_logical_id_table_page); + + return &logical_apic_id_table[index]; +} + +static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr, + bool valid) +{ + bool flat; + u32 *entry, new_entry; + + flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT; + entry = avic_get_logical_id_entry(vcpu, ldr, flat); + if (!entry) + return -EINVAL; + + new_entry = READ_ONCE(*entry); + new_entry &= 
~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK; + new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK); + if (valid) + new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK; + else + new_entry &= ~AVIC_LOGICAL_ID_ENTRY_VALID_MASK; + WRITE_ONCE(*entry, new_entry); + + return 0; +} + +static int avic_handle_ldr_update(struct kvm_vcpu *vcpu) +{ + int ret; + struct vcpu_svm *svm = to_svm(vcpu); + u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR); + + if (!ldr) + return 1; + + ret = avic_ldr_write(vcpu, vcpu->vcpu_id, ldr, true); + if (ret && svm->ldr_reg) { + avic_ldr_write(vcpu, 0, svm->ldr_reg, false); + svm->ldr_reg = 0; + } else { + svm->ldr_reg = ldr; + } + return ret; +} + +static int avic_handle_apic_id_update(struct kvm_vcpu *vcpu) +{ + u64 *old, *new; + struct vcpu_svm *svm = to_svm(vcpu); + u32 apic_id_reg = kvm_lapic_get_reg(vcpu->arch.apic, APIC_ID); + u32 id = (apic_id_reg >> 24) & 0xff; + + if (vcpu->vcpu_id == id) + return 0; + + old = avic_get_physical_id_entry(vcpu, vcpu->vcpu_id); + new = avic_get_physical_id_entry(vcpu, id); + if (!new || !old) + return 1; + + /* We need to move physical_id_entry to new offset */ + *new = *old; + *old = 0ULL; + to_svm(vcpu)->avic_physical_id_cache = new; + + /* + * Also update the guest physical APIC ID in the logical + * APIC ID table entry if already setup the LDR. + */ + if (svm->ldr_reg) + avic_handle_ldr_update(vcpu); + + return 0; +} + +static int avic_handle_dfr_update(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct kvm_arch *vm_data = &vcpu->kvm->arch; + u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR); + u32 mod = (dfr >> 28) & 0xf; + + /* + * We assume that all local APICs are using the same type. + * If this changes, we need to flush the AVIC logical + * APID id table. 
+ */ + if (vm_data->ldr_mode == mod) + return 0; + + clear_page(page_address(vm_data->avic_logical_id_table_page)); + vm_data->ldr_mode = mod; + + if (svm->ldr_reg) + avic_handle_ldr_update(vcpu); + return 0; +} + +static int avic_unaccel_trap_write(struct vcpu_svm *svm) +{ + struct kvm_lapic *apic = svm->vcpu.arch.apic; + u32 offset = svm->vmcb->control.exit_info_1 & + AVIC_UNACCEL_ACCESS_OFFSET_MASK; + + switch (offset) { + case APIC_ID: + if (avic_handle_apic_id_update(&svm->vcpu)) + return 0; + break; + case APIC_LDR: + if (avic_handle_ldr_update(&svm->vcpu)) + return 0; + break; + case APIC_DFR: + avic_handle_dfr_update(&svm->vcpu); + break; + default: + break; + } + + kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset)); + + return 1; +} + +static bool is_avic_unaccelerated_access_trap(u32 offset) +{ + bool ret = false; + + switch (offset) { + case APIC_ID: + case APIC_EOI: + case APIC_RRR: + case APIC_LDR: + case APIC_DFR: + case APIC_SPIV: + case APIC_ESR: + case APIC_ICR: + case APIC_LVTT: + case APIC_LVTTHMR: + case APIC_LVTPC: + case APIC_LVT0: + case APIC_LVT1: + case APIC_LVTERR: + case APIC_TMICT: + case APIC_TDCR: + ret = true; + break; + default: + break; + } + return ret; +} + +static int avic_unaccelerated_access_interception(struct vcpu_svm *svm) +{ + int ret = 0; + u32 offset = svm->vmcb->control.exit_info_1 & + AVIC_UNACCEL_ACCESS_OFFSET_MASK; + u32 vector = svm->vmcb->control.exit_info_2 & + AVIC_UNACCEL_ACCESS_VECTOR_MASK; + bool write = (svm->vmcb->control.exit_info_1 >> 32) & + AVIC_UNACCEL_ACCESS_WRITE_MASK; + bool trap = is_avic_unaccelerated_access_trap(offset); + + trace_kvm_avic_unaccelerated_access(svm->vcpu.vcpu_id, offset, + trap, write, vector); + if (trap) { + /* Handling Trap */ + WARN_ONCE(!write, "svm: Handling trap read.\n"); + ret = avic_unaccel_trap_write(svm); + } else { + /* Handling Fault */ + ret = (emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE); + } + + return ret; +} + static int (*const 
svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_READ_CR0] = cr_interception, [SVM_EXIT_READ_CR3] = cr_interception, @@ -3344,6 +3925,8 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_XSETBV] = xsetbv_interception, [SVM_EXIT_NPF] = pf_interception, [SVM_EXIT_RSM] = emulate_on_interception, + [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception, + [SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception, }; static void dump_vmcb(struct kvm_vcpu *vcpu) @@ -3375,10 +3958,14 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err); pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl); pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3); + pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar); pr_err("%-20s%08x\n", "event_inj:", control->event_inj); pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err); pr_err("%-20s%lld\n", "lbr_ctl:", control->lbr_ctl); pr_err("%-20s%016llx\n", "next_rip:", control->next_rip); + pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page); + pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id); + pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id); pr_err("VMCB State Save Area:\n"); pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", "es:", @@ -3562,6 +4149,7 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) { struct vmcb_control_area *control; + /* The following fields are ignored when AVIC is enabled */ control = &svm->vmcb->control; control->int_vector = irq; control->int_ctl &= ~V_INTR_PRIO_MASK; @@ -3583,11 +4171,17 @@ static void svm_set_irq(struct kvm_vcpu *vcpu) SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; } +static inline bool svm_nested_virtualize_tpr(struct kvm_vcpu *vcpu) +{ + return is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK); +} + static void 
update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) { struct vcpu_svm *svm = to_svm(vcpu); - if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK)) + if (svm_nested_virtualize_tpr(vcpu) || + kvm_vcpu_apicv_active(vcpu)) return; clr_cr_intercept(svm, INTERCEPT_CR8_WRITE); @@ -3606,11 +4200,28 @@ static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) static bool svm_get_enable_apicv(void) { - return false; + return avic; +} + +static void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) +{ } +static void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) +{ +} + +/* Note: Currently only used by Hyper-V. */ static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb *vmcb = svm->vmcb; + + if (!avic) + return; + + vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK; + mark_dirty(vmcb, VMCB_INTR); } static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) @@ -3623,6 +4234,18 @@ static void svm_sync_pir_to_irr(struct kvm_vcpu *vcpu) return; } +static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec) +{ + kvm_lapic_set_irr(vec, vcpu->arch.apic); + smp_mb__after_atomic(); + + if (avic_vcpu_is_running(vcpu)) + wrmsrl(SVM_AVIC_DOORBELL, + kvm_cpu_get_apicid(vcpu->cpu)); + else + kvm_vcpu_wake_up(vcpu); +} + static int svm_nmi_allowed(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3677,6 +4300,9 @@ static void enable_irq_window(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + if (kvm_vcpu_apicv_active(vcpu)) + return; + /* * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes * 1, because that's a separate STGI/VMRUN intercept. 
The next time we @@ -3728,7 +4354,7 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK)) + if (svm_nested_virtualize_tpr(vcpu)) return; if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) { @@ -3742,7 +4368,8 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); u64 cr8; - if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK)) + if (svm_nested_virtualize_tpr(vcpu) || + kvm_vcpu_apicv_active(vcpu)) return; cr8 = kvm_get_cr8(vcpu); @@ -4045,14 +4672,26 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) static void svm_cpuid_update(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + struct kvm_cpuid_entry2 *entry; /* Update nrips enabled cache */ svm->nrips_enabled = !!guest_cpuid_has_nrips(&svm->vcpu); + + if (!kvm_vcpu_apicv_active(vcpu)) + return; + + entry = kvm_find_cpuid_entry(vcpu, 1, 0); + if (entry) + entry->ecx &= ~bit(X86_FEATURE_X2APIC); } static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) { switch (func) { + case 0x1: + if (avic) + entry->ecx &= ~bit(X86_FEATURE_X2APIC); + break; case 0x80000001: if (nested) entry->ecx |= (1 << 2); /* Set SVM bit */ @@ -4307,6 +4946,15 @@ static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu) { } +static inline void avic_post_state_restore(struct kvm_vcpu *vcpu) +{ + if (avic_handle_apic_id_update(vcpu) != 0) + return; + if (avic_handle_dfr_update(vcpu) != 0) + return; + avic_handle_ldr_update(vcpu); +} + static struct kvm_x86_ops svm_x86_ops = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -4322,9 +4970,14 @@ static struct kvm_x86_ops svm_x86_ops = { .vcpu_free = svm_free_vcpu, .vcpu_reset = svm_vcpu_reset, + .vm_init = avic_vm_init, + .vm_destroy = avic_vm_destroy, + .prepare_guest_switch = svm_prepare_guest_switch, .vcpu_load = svm_vcpu_load, .vcpu_put = 
svm_vcpu_put, + .vcpu_blocking = svm_vcpu_blocking, + .vcpu_unblocking = svm_vcpu_unblocking, .update_bp_intercept = update_bp_intercept, .get_msr = svm_get_msr, @@ -4382,6 +5035,9 @@ static struct kvm_x86_ops svm_x86_ops = { .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl, .load_eoi_exitmap = svm_load_eoi_exitmap, .sync_pir_to_irr = svm_sync_pir_to_irr, + .hwapic_irr_update = svm_hwapic_irr_update, + .hwapic_isr_update = svm_hwapic_isr_update, + .apicv_post_state_restore = avic_post_state_restore, .set_tss_addr = svm_set_tss_addr, .get_tdp_level = get_npt_level, @@ -4415,6 +5071,7 @@ static struct kvm_x86_ops svm_x86_ops = { .sched_in = svm_sched_in, .pmu_ops = &amd_pmu_ops, + .deliver_posted_interrupt = svm_deliver_avic_intr, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index b72743c5668d..8de925031b5c 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -1291,6 +1291,63 @@ TRACE_EVENT(kvm_hv_stimer_cleanup, __entry->vcpu_id, __entry->timer_index) ); +/* + * Tracepoint for AMD AVIC + */ +TRACE_EVENT(kvm_avic_incomplete_ipi, + TP_PROTO(u32 vcpu, u32 icrh, u32 icrl, u32 id, u32 index), + TP_ARGS(vcpu, icrh, icrl, id, index), + + TP_STRUCT__entry( + __field(u32, vcpu) + __field(u32, icrh) + __field(u32, icrl) + __field(u32, id) + __field(u32, index) + ), + + TP_fast_assign( + __entry->vcpu = vcpu; + __entry->icrh = icrh; + __entry->icrl = icrl; + __entry->id = id; + __entry->index = index; + ), + + TP_printk("vcpu=%u, icrh:icrl=%#010x:%08x, id=%u, index=%u\n", + __entry->vcpu, __entry->icrh, __entry->icrl, + __entry->id, __entry->index) +); + +TRACE_EVENT(kvm_avic_unaccelerated_access, + TP_PROTO(u32 vcpu, u32 offset, bool ft, bool rw, u32 vec), + TP_ARGS(vcpu, offset, ft, rw, vec), + + TP_STRUCT__entry( + __field(u32, vcpu) + __field(u32, offset) + __field(bool, ft) + __field(bool, rw) + __field(u32, vec) + ), + + TP_fast_assign( + __entry->vcpu = vcpu; + __entry->offset = offset; + __entry->ft = ft; 
+ __entry->rw = rw; + __entry->vec = vec; + ), + + TP_printk("vcpu=%u, offset=%#x(%s), %s, %s, vec=%#x\n", + __entry->vcpu, + __entry->offset, + __print_symbolic(__entry->offset, kvm_trace_symbol_apic), + __entry->ft ? "trap" : "fault", + __entry->rw ? "write" : "read", + __entry->vec) +); + #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index cb47fe3da292..7758680db20b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2072,7 +2072,8 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) unsigned int dest; if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP)) + !irq_remapping_cap(IRQ_POSTING_CAP) || + !kvm_vcpu_apicv_active(vcpu)) return; do { @@ -2180,7 +2181,8 @@ static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP)) + !irq_remapping_cap(IRQ_POSTING_CAP) || + !kvm_vcpu_apicv_active(vcpu)) return; /* Set SN when the vCPU is preempted */ @@ -2418,7 +2420,9 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu) if (is_guest_mode(vcpu)) msr_bitmap = vmx_msr_bitmap_nested; - else if (vcpu->arch.apic_base & X2APIC_ENABLE) { + else if (cpu_has_secondary_exec_ctrls() && + (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) & + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { if (is_long_mode(vcpu)) msr_bitmap = vmx_msr_bitmap_longmode_x2apic; else @@ -4787,6 +4791,19 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx)); + if (cpu_has_secondary_exec_ctrls()) { + if (kvm_vcpu_apicv_active(vcpu)) + vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); + else + vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, + SECONDARY_EXEC_APIC_REGISTER_VIRT | + 
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); + } + + if (cpu_has_vmx_msr_bitmap()) + vmx_set_msr_bitmap(vcpu); } static u32 vmx_exec_control(struct vcpu_vmx *vmx) @@ -4962,6 +4979,12 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) if (vmx_xsaves_supported()) vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP); + if (enable_pml) { + ASSERT(vmx->pml_pg); + vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); + vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); + } + return 0; } @@ -5050,8 +5073,8 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; - vmx_set_cr0(vcpu, cr0); /* enter rmode */ vmx->vcpu.arch.cr0 = cr0; + vmx_set_cr0(vcpu, cr0); /* enter rmode */ vmx_set_cr4(vcpu, 0); vmx_set_efer(vcpu, 0); vmx_fpu_activate(vcpu); @@ -6333,23 +6356,20 @@ static __init int hardware_setup(void) set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ - if (enable_apicv) { - for (msr = 0x800; msr <= 0x8ff; msr++) - vmx_disable_intercept_msr_read_x2apic(msr); - - /* According SDM, in x2apic mode, the whole id reg is used. - * But in KVM, it only use the highest eight bits. Need to - * intercept it */ - vmx_enable_intercept_msr_read_x2apic(0x802); - /* TMCCT */ - vmx_enable_intercept_msr_read_x2apic(0x839); - /* TPR */ - vmx_disable_intercept_msr_write_x2apic(0x808); - /* EOI */ - vmx_disable_intercept_msr_write_x2apic(0x80b); - /* SELF-IPI */ - vmx_disable_intercept_msr_write_x2apic(0x83f); - } + for (msr = 0x800; msr <= 0x8ff; msr++) + vmx_disable_intercept_msr_read_x2apic(msr); + + /* According SDM, in x2apic mode, the whole id reg is used. But in + * KVM, it only use the highest eight bits. 
Need to intercept it */ + vmx_enable_intercept_msr_read_x2apic(0x802); + /* TMCCT */ + vmx_enable_intercept_msr_read_x2apic(0x839); + /* TPR */ + vmx_disable_intercept_msr_write_x2apic(0x808); + /* EOI */ + vmx_disable_intercept_msr_write_x2apic(0x80b); + /* SELF-IPI */ + vmx_disable_intercept_msr_write_x2apic(0x83f); if (enable_ept) { kvm_mmu_set_mask_ptes(0ull, @@ -6657,7 +6677,13 @@ static int get_vmx_mem_address(struct kvm_vcpu *vcpu, /* Checks for #GP/#SS exceptions. */ exn = false; - if (is_protmode(vcpu)) { + if (is_long_mode(vcpu)) { + /* Long mode: #GP(0)/#SS(0) if the memory address is in a + * non-canonical form. This is the only check on the memory + * destination for long mode! + */ + exn = is_noncanonical_address(*ret); + } else if (is_protmode(vcpu)) { /* Protected mode: apply checks for segment validity in the * following order: * - segment type check (#GP(0) may be thrown) @@ -6674,17 +6700,10 @@ static int get_vmx_mem_address(struct kvm_vcpu *vcpu, * execute-only code segment */ exn = ((s.type & 0xa) == 8); - } - if (exn) { - kvm_queue_exception_e(vcpu, GP_VECTOR, 0); - return 1; - } - if (is_long_mode(vcpu)) { - /* Long mode: #GP(0)/#SS(0) if the memory address is in a - * non-canonical form. This is an only check for long mode. - */ - exn = is_noncanonical_address(*ret); - } else if (is_protmode(vcpu)) { + if (exn) { + kvm_queue_exception_e(vcpu, GP_VECTOR, 0); + return 1; + } /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. 
*/ exn = (s.unusable != 0); @@ -7924,22 +7943,6 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) *info2 = vmcs_read32(VM_EXIT_INTR_INFO); } -static int vmx_create_pml_buffer(struct vcpu_vmx *vmx) -{ - struct page *pml_pg; - - pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!pml_pg) - return -ENOMEM; - - vmx->pml_pg = pml_pg; - - vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); - vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); - - return 0; -} - static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx) { if (vmx->pml_pg) { @@ -8211,6 +8214,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) if ((vectoring_info & VECTORING_INFO_VALID_MASK) && (exit_reason != EXIT_REASON_EXCEPTION_NMI && exit_reason != EXIT_REASON_EPT_VIOLATION && + exit_reason != EXIT_REASON_PML_FULL && exit_reason != EXIT_REASON_TASK_SWITCH)) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; @@ -8318,19 +8322,19 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa) vmcs_write64(APIC_ACCESS_ADDR, hpa); } -static void vmx_hwapic_isr_update(struct kvm *kvm, int isr) +static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) { u16 status; u8 old; - if (isr == -1) - isr = 0; + if (max_isr == -1) + max_isr = 0; status = vmcs_read16(GUEST_INTR_STATUS); old = status >> 8; - if (isr != old) { + if (max_isr != old) { status &= 0xff; - status |= isr << 8; + status |= max_isr << 8; vmcs_write16(GUEST_INTR_STATUS, status); } } @@ -8841,6 +8845,22 @@ static void vmx_load_vmcs01(struct kvm_vcpu *vcpu) put_cpu(); } +/* + * Ensure that the current vmcs of the logical processor is the + * vmcs01 of the vcpu before calling free_nested(). 
+ */ +static void vmx_free_vcpu_nested(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int r; + + r = vcpu_load(vcpu); + BUG_ON(r); + vmx_load_vmcs01(vcpu); + free_nested(vmx); + vcpu_put(vcpu); +} + static void vmx_free_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -8849,8 +8869,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) vmx_destroy_pml_buffer(vmx); free_vpid(vmx->vpid); leave_guest_mode(vcpu); - vmx_load_vmcs01(vcpu); - free_nested(vmx); + vmx_free_vcpu_nested(vcpu); free_loaded_vmcs(vmx->loaded_vmcs); kfree(vmx->guest_msrs); kvm_vcpu_uninit(vcpu); @@ -8872,14 +8891,26 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (err) goto free_vcpu; + err = -ENOMEM; + + /* + * If PML is turned on, failure on enabling PML just results in failure + * of creating the vcpu, therefore we can simplify PML logic (by + * avoiding dealing with cases, such as enabling PML partially on vcpus + * for the guest, etc. + */ + if (enable_pml) { + vmx->pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!vmx->pml_pg) + goto uninit_vcpu; + } + vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0]) > PAGE_SIZE); - err = -ENOMEM; - if (!vmx->guest_msrs) { - goto uninit_vcpu; - } + if (!vmx->guest_msrs) + goto free_pml; vmx->loaded_vmcs = &vmx->vmcs01; vmx->loaded_vmcs->vmcs = alloc_vmcs(); @@ -8923,18 +8954,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) vmx->nested.current_vmptr = -1ull; vmx->nested.current_vmcs12 = NULL; - /* - * If PML is turned on, failure on enabling PML just results in failure - * of creating the vcpu, therefore we can simplify PML logic (by - * avoiding dealing with cases, such as enabling PML partially on vcpus - * for the guest, etc. 
- */ - if (enable_pml) { - err = vmx_create_pml_buffer(vmx); - if (err) - goto free_vmcs; - } - return &vmx->vcpu; free_vmcs: @@ -8942,6 +8961,8 @@ free_vmcs: free_loaded_vmcs(vmx->loaded_vmcs); free_msrs: kfree(vmx->guest_msrs); +free_pml: + vmx_destroy_pml_buffer(vmx); uninit_vcpu: kvm_vcpu_uninit(&vmx->vcpu); free_vcpu: @@ -10702,7 +10723,8 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu) struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP)) + !irq_remapping_cap(IRQ_POSTING_CAP) || + !kvm_vcpu_apicv_active(vcpu)) return 0; vcpu->pre_pcpu = vcpu->cpu; @@ -10768,7 +10790,8 @@ static void vmx_post_block(struct kvm_vcpu *vcpu) unsigned long flags; if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP)) + !irq_remapping_cap(IRQ_POSTING_CAP) || + !kvm_vcpu_apicv_active(vcpu)) return; do { @@ -10821,7 +10844,8 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, int idx, ret = -EINVAL; if (!kvm_arch_has_assigned_device(kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP)) + !irq_remapping_cap(IRQ_POSTING_CAP) || + !kvm_vcpu_apicv_active(kvm->vcpus[0])) return 0; idx = srcu_read_lock(&kvm->irq_srcu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 12f33e662382..b2766723c951 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -55,9 +55,6 @@ #include <linux/irqbypass.h> #include <trace/events/kvm.h> -#define CREATE_TRACE_POINTS -#include "trace.h" - #include <asm/debugreg.h> #include <asm/msr.h> #include <asm/desc.h> @@ -68,6 +65,9 @@ #include <asm/div64.h> #include <asm/irq_remapping.h> +#define CREATE_TRACE_POINTS +#include "trace.h" + #define MAX_IO_MSRS 256 #define KVM_MAX_MCE_BANKS 32 #define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P) @@ -161,6 +161,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "halt_exits", VCPU_STAT(halt_exits) }, { "halt_successful_poll", VCPU_STAT(halt_successful_poll) }, { 
"halt_attempted_poll", VCPU_STAT(halt_attempted_poll) }, + { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) }, { "halt_wakeup", VCPU_STAT(halt_wakeup) }, { "hypercalls", VCPU_STAT(hypercalls) }, { "request_irq", VCPU_STAT(request_irq_exits) }, @@ -1243,12 +1244,6 @@ static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0); static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); static unsigned long max_tsc_khz; -static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) -{ - return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult, - vcpu->arch.virtual_tsc_shift); -} - static u32 adjust_tsc_khz(u32 khz, s32 ppm) { u64 v = (u64)khz * (1000000 + ppm); @@ -2002,22 +1997,8 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu) vcpu->arch.pv_time_enabled = false; } -static void accumulate_steal_time(struct kvm_vcpu *vcpu) -{ - u64 delta; - - if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) - return; - - delta = current->sched_info.run_delay - vcpu->arch.st.last_steal; - vcpu->arch.st.last_steal = current->sched_info.run_delay; - vcpu->arch.st.accum_steal = delta; -} - static void record_steal_time(struct kvm_vcpu *vcpu) { - accumulate_steal_time(vcpu); - if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; @@ -2025,9 +2006,26 @@ static void record_steal_time(struct kvm_vcpu *vcpu) &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) return; - vcpu->arch.st.steal.steal += vcpu->arch.st.accum_steal; - vcpu->arch.st.steal.version += 2; - vcpu->arch.st.accum_steal = 0; + if (vcpu->arch.st.steal.version & 1) + vcpu->arch.st.steal.version += 1; /* first time write, random junk */ + + vcpu->arch.st.steal.version += 1; + + kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, + &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); + + smp_wmb(); + + vcpu->arch.st.steal.steal += current->sched_info.run_delay - + vcpu->arch.st.last_steal; + vcpu->arch.st.last_steal = current->sched_info.run_delay; + + kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, + 
&vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); + + smp_wmb(); + + vcpu->arch.st.steal.version += 1; kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); @@ -2310,6 +2308,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_AMD64_NB_CFG: case MSR_FAM10H_MMIO_CONF_BASE: case MSR_AMD64_BU_CFG2: + case MSR_IA32_PERF_CTL: msr_info->data = 0; break; case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: @@ -2968,6 +2967,10 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, | KVM_VCPUEVENT_VALID_SMM)) return -EINVAL; + if (events->exception.injected && + (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR)) + return -EINVAL; + process_nmi(vcpu); vcpu->arch.exception.pending = events->exception.injected; vcpu->arch.exception.nr = events->exception.nr; @@ -3032,6 +3035,11 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, if (dbgregs->flags) return -EINVAL; + if (dbgregs->dr6 & ~0xffffffffull) + return -EINVAL; + if (dbgregs->dr7 & ~0xffffffffull) + return -EINVAL; + memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db)); kvm_update_dr0123(vcpu); vcpu->arch.dr6 = dbgregs->dr6; @@ -7752,6 +7760,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm_page_track_init(kvm); kvm_mmu_init_vm(kvm); + if (kvm_x86_ops->vm_init) + return kvm_x86_ops->vm_init(kvm); + return 0; } @@ -7808,7 +7819,7 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) slot = id_to_memslot(slots, id); if (size) { - if (WARN_ON(slot->npages)) + if (slot->npages) return -EEXIST; /* @@ -7873,6 +7884,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm) x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, 0, 0); x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0); } + if (kvm_x86_ops->vm_destroy) + kvm_x86_ops->vm_destroy(kvm); kvm_iommu_unmap_guest(kvm); kfree(kvm->arch.vpic); kfree(kvm->arch.vioapic); @@ -8355,19 +8368,21 @@ 
bool kvm_arch_has_noncoherent_dma(struct kvm *kvm) } EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma); +bool kvm_arch_has_irq_bypass(void) +{ + return kvm_x86_ops->update_pi_irte != NULL; +} + int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, struct irq_bypass_producer *prod) { struct kvm_kernel_irqfd *irqfd = container_of(cons, struct kvm_kernel_irqfd, consumer); - if (kvm_x86_ops->update_pi_irte) { - irqfd->producer = prod; - return kvm_x86_ops->update_pi_irte(irqfd->kvm, - prod->irq, irqfd->gsi, 1); - } + irqfd->producer = prod; - return -EINVAL; + return kvm_x86_ops->update_pi_irte(irqfd->kvm, + prod->irq, irqfd->gsi, 1); } void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, @@ -8377,11 +8392,6 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, struct kvm_kernel_irqfd *irqfd = container_of(cons, struct kvm_kernel_irqfd, consumer); - if (!kvm_x86_ops->update_pi_irte) { - WARN_ON(irqfd->producer != NULL); - return; - } - WARN_ON(irqfd->producer != prod); irqfd->producer = NULL; @@ -8429,3 +8439,5 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 7ce3634ab5fe..a82ca466b62e 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -2,6 +2,7 @@ #define ARCH_X86_KVM_X86_H #include <linux/kvm_host.h> +#include <asm/pvclock.h> #include "kvm_cache_regs.h" #define MSR_IA32_CR_PAT_DEFAULT 0x0007040600070406ULL @@ -195,6 +196,12 @@ extern unsigned int lapic_timer_advance_ns; extern struct static_key kvm_no_apic_vcpu; +static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) +{ + return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult, + vcpu->arch.virtual_tsc_shift); +} + /* 
Same "calling convention" as do_div: * - divide (n << 32) by base * - put result in n diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 3847e736702e..25da5bc8d83d 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1233,8 +1233,6 @@ static void write_bar_via_cfg(u32 cfg_offset, u32 off, u32 val) static void probe_pci_console(void) { u8 cap, common_cap = 0, device_cap = 0; - /* Offset within BAR0 */ - u32 device_offset; u32 device_len; /* Avoid recursive printk into here. */ @@ -1258,24 +1256,16 @@ static void probe_pci_console(void) u8 vndr = read_pci_config_byte(0, 1, 0, cap); if (vndr == PCI_CAP_ID_VNDR) { u8 type, bar; - u32 offset, length; type = read_pci_config_byte(0, 1, 0, cap + offsetof(struct virtio_pci_cap, cfg_type)); bar = read_pci_config_byte(0, 1, 0, cap + offsetof(struct virtio_pci_cap, bar)); - offset = read_pci_config(0, 1, 0, - cap + offsetof(struct virtio_pci_cap, offset)); - length = read_pci_config(0, 1, 0, - cap + offsetof(struct virtio_pci_cap, length)); switch (type) { case VIRTIO_PCI_CAP_DEVICE_CFG: - if (bar == 0) { + if (bar == 0) device_cap = cap; - device_offset = offset; - device_len = length; - } break; case VIRTIO_PCI_CAP_PCI_CFG: console_access_cap = cap; @@ -1297,13 +1287,16 @@ static void probe_pci_console(void) * emerg_wr. If it doesn't support VIRTIO_CONSOLE_F_EMERG_WRITE * it should ignore the access. 
*/ + device_len = read_pci_config(0, 1, 0, + device_cap + offsetof(struct virtio_pci_cap, length)); if (device_len < (offsetof(struct virtio_console_config, emerg_wr) + sizeof(u32))) { printk(KERN_ERR "lguest: console missing emerg_wr field\n"); return; } - console_cfg_offset = device_offset; + console_cfg_offset = read_pci_config(0, 1, 0, + device_cap + offsetof(struct virtio_pci_cap, offset)); printk(KERN_INFO "lguest: Console via virtio-pci emerg_wr\n"); } diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 72a576752a7e..34a74131a12c 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -24,8 +24,9 @@ lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o lib-y += memcpy_$(BITS).o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o +lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o -obj-y += msr.o msr-reg.o msr-reg-export.o +obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o ifeq ($(CONFIG_X86_32),y) obj-y += atomic64_32.o diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 2b0ef26da0bd..bf603ebbfd8e 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -17,11 +17,11 @@ /* Standard copy_to_user with segment limit checking */ ENTRY(_copy_to_user) - GET_THREAD_INFO(%rax) + mov PER_CPU_VAR(current_task), %rax movq %rdi,%rcx addq %rdx,%rcx jc bad_to_user - cmpq TI_addr_limit(%rax),%rcx + cmpq TASK_addr_limit(%rax),%rcx ja bad_to_user ALTERNATIVE_2 "jmp copy_user_generic_unrolled", \ "jmp copy_user_generic_string", \ @@ -32,11 +32,11 @@ ENDPROC(_copy_to_user) /* Standard copy_from_user with segment limit checking */ ENTRY(_copy_from_user) - GET_THREAD_INFO(%rax) + mov PER_CPU_VAR(current_task), %rax movq %rsi,%rcx addq %rdx,%rcx jc bad_from_user - cmpq TI_addr_limit(%rax),%rcx + cmpq TASK_addr_limit(%rax),%rcx ja bad_from_user ALTERNATIVE_2 "jmp copy_user_generic_unrolled", \ "jmp copy_user_generic_string", \ diff --git 
a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c index 28a6654f0d08..b6fcb9a9ddbc 100644 --- a/arch/x86/lib/csum-wrappers_64.c +++ b/arch/x86/lib/csum-wrappers_64.c @@ -6,6 +6,7 @@ */ #include <asm/checksum.h> #include <linux/module.h> +#include <linux/uaccess.h> #include <asm/smap.h> /** diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S index 46668cda4ffd..0ef5128c2de8 100644 --- a/arch/x86/lib/getuser.S +++ b/arch/x86/lib/getuser.S @@ -35,8 +35,8 @@ .text ENTRY(__get_user_1) - GET_THREAD_INFO(%_ASM_DX) - cmp TI_addr_limit(%_ASM_DX),%_ASM_AX + mov PER_CPU_VAR(current_task), %_ASM_DX + cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX jae bad_get_user ASM_STAC 1: movzbl (%_ASM_AX),%edx @@ -48,8 +48,8 @@ ENDPROC(__get_user_1) ENTRY(__get_user_2) add $1,%_ASM_AX jc bad_get_user - GET_THREAD_INFO(%_ASM_DX) - cmp TI_addr_limit(%_ASM_DX),%_ASM_AX + mov PER_CPU_VAR(current_task), %_ASM_DX + cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX jae bad_get_user ASM_STAC 2: movzwl -1(%_ASM_AX),%edx @@ -61,8 +61,8 @@ ENDPROC(__get_user_2) ENTRY(__get_user_4) add $3,%_ASM_AX jc bad_get_user - GET_THREAD_INFO(%_ASM_DX) - cmp TI_addr_limit(%_ASM_DX),%_ASM_AX + mov PER_CPU_VAR(current_task), %_ASM_DX + cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX jae bad_get_user ASM_STAC 3: movl -3(%_ASM_AX),%edx @@ -75,8 +75,8 @@ ENTRY(__get_user_8) #ifdef CONFIG_X86_64 add $7,%_ASM_AX jc bad_get_user - GET_THREAD_INFO(%_ASM_DX) - cmp TI_addr_limit(%_ASM_DX),%_ASM_AX + mov PER_CPU_VAR(current_task), %_ASM_DX + cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX jae bad_get_user ASM_STAC 4: movq -7(%_ASM_AX),%rdx @@ -86,8 +86,8 @@ ENTRY(__get_user_8) #else add $7,%_ASM_AX jc bad_get_user_8 - GET_THREAD_INFO(%_ASM_DX) - cmp TI_addr_limit(%_ASM_DX),%_ASM_AX + mov PER_CPU_VAR(current_task), %_ASM_DX + cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX jae bad_get_user_8 ASM_STAC 4: movl -7(%_ASM_AX),%edx diff --git a/arch/x86/lib/hweight.S b/arch/x86/lib/hweight.S new file mode 100644 index 000000000000..02de3d74d2c5 --- 
/dev/null +++ b/arch/x86/lib/hweight.S @@ -0,0 +1,77 @@ +#include <linux/linkage.h> + +#include <asm/asm.h> + +/* + * unsigned int __sw_hweight32(unsigned int w) + * %rdi: w + */ +ENTRY(__sw_hweight32) + +#ifdef CONFIG_X86_64 + movl %edi, %eax # w +#endif + __ASM_SIZE(push,) %__ASM_REG(dx) + movl %eax, %edx # w -> t + shrl %edx # t >>= 1 + andl $0x55555555, %edx # t &= 0x55555555 + subl %edx, %eax # w -= t + + movl %eax, %edx # w -> t + shrl $2, %eax # w_tmp >>= 2 + andl $0x33333333, %edx # t &= 0x33333333 + andl $0x33333333, %eax # w_tmp &= 0x33333333 + addl %edx, %eax # w = w_tmp + t + + movl %eax, %edx # w -> t + shrl $4, %edx # t >>= 4 + addl %edx, %eax # w_tmp += t + andl $0x0f0f0f0f, %eax # w_tmp &= 0x0f0f0f0f + imull $0x01010101, %eax, %eax # w_tmp *= 0x01010101 + shrl $24, %eax # w = w_tmp >> 24 + __ASM_SIZE(pop,) %__ASM_REG(dx) + ret +ENDPROC(__sw_hweight32) + +ENTRY(__sw_hweight64) +#ifdef CONFIG_X86_64 + pushq %rdx + + movq %rdi, %rdx # w -> t + movabsq $0x5555555555555555, %rax + shrq %rdx # t >>= 1 + andq %rdx, %rax # t &= 0x5555555555555555 + movabsq $0x3333333333333333, %rdx + subq %rax, %rdi # w -= t + + movq %rdi, %rax # w -> t + shrq $2, %rdi # w_tmp >>= 2 + andq %rdx, %rax # t &= 0x3333333333333333 + andq %rdi, %rdx # w_tmp &= 0x3333333333333333 + addq %rdx, %rax # w = w_tmp + t + + movq %rax, %rdx # w -> t + shrq $4, %rdx # t >>= 4 + addq %rdx, %rax # w_tmp += t + movabsq $0x0f0f0f0f0f0f0f0f, %rdx + andq %rdx, %rax # w_tmp &= 0x0f0f0f0f0f0f0f0f + movabsq $0x0101010101010101, %rdx + imulq %rdx, %rax # w_tmp *= 0x0101010101010101 + shrq $56, %rax # w = w_tmp >> 56 + + popq %rdx + ret +#else /* CONFIG_X86_32 */ + /* We're getting an u64 arg in (%eax,%edx): unsigned long hweight64(__u64 w) */ + pushl %ecx + + call __sw_hweight32 + movl %eax, %ecx # stash away result + movl %edx, %eax # second part of input + call __sw_hweight32 + addl %ecx, %eax # result + + popl %ecx + ret +#endif +ENDPROC(__sw_hweight64) diff --git a/arch/x86/lib/insn.c 
b/arch/x86/lib/insn.c index 1a416935bac9..1088eb8f3a5f 100644 --- a/arch/x86/lib/insn.c +++ b/arch/x86/lib/insn.c @@ -155,14 +155,24 @@ found: /* * In 32-bits mode, if the [7:6] bits (mod bits of * ModRM) on the second byte are not 11b, it is - * LDS or LES. + * LDS or LES or BOUND. */ if (X86_MODRM_MOD(b2) != 3) goto vex_end; } insn->vex_prefix.bytes[0] = b; insn->vex_prefix.bytes[1] = b2; - if (inat_is_vex3_prefix(attr)) { + if (inat_is_evex_prefix(attr)) { + b2 = peek_nbyte_next(insn_byte_t, insn, 2); + insn->vex_prefix.bytes[2] = b2; + b2 = peek_nbyte_next(insn_byte_t, insn, 3); + insn->vex_prefix.bytes[3] = b2; + insn->vex_prefix.nbytes = 4; + insn->next_byte += 4; + if (insn->x86_64 && X86_VEX_W(b2)) + /* VEX.W overrides opnd_size */ + insn->opnd_bytes = 8; + } else if (inat_is_vex3_prefix(attr)) { b2 = peek_nbyte_next(insn_byte_t, insn, 2); insn->vex_prefix.bytes[2] = b2; insn->vex_prefix.nbytes = 3; @@ -221,7 +231,9 @@ void insn_get_opcode(struct insn *insn) m = insn_vex_m_bits(insn); p = insn_vex_p_bits(insn); insn->attr = inat_get_avx_attribute(op, m, p); - if (!inat_accept_vex(insn->attr) && !inat_is_group(insn->attr)) + if ((inat_must_evex(insn->attr) && !insn_is_evex(insn)) || + (!inat_accept_vex(insn->attr) && + !inat_is_group(insn->attr))) insn->attr = 0; /* This instruction is bad */ goto end; /* VEX has only 1 byte for opcode */ } diff --git a/arch/x86/lib/kaslr.c b/arch/x86/lib/kaslr.c new file mode 100644 index 000000000000..f7dfeda83e5c --- /dev/null +++ b/arch/x86/lib/kaslr.c @@ -0,0 +1,90 @@ +/* + * Entropy functions used on early boot for KASLR base and memory + * randomization. The base randomization is done in the compressed + * kernel and memory randomization is done early when the regular + * kernel starts. This file is included in the compressed kernel and + * normally linked in the regular. 
+ */ +#include <asm/kaslr.h> +#include <asm/msr.h> +#include <asm/archrandom.h> +#include <asm/e820.h> +#include <asm/io.h> + +/* + * When built for the regular kernel, several functions need to be stubbed out + * or changed to their regular kernel equivalent. + */ +#ifndef KASLR_COMPRESSED_BOOT +#include <asm/cpufeature.h> +#include <asm/setup.h> + +#define debug_putstr(v) early_printk(v) +#define has_cpuflag(f) boot_cpu_has(f) +#define get_boot_seed() kaslr_offset() +#endif + +#define I8254_PORT_CONTROL 0x43 +#define I8254_PORT_COUNTER0 0x40 +#define I8254_CMD_READBACK 0xC0 +#define I8254_SELECT_COUNTER0 0x02 +#define I8254_STATUS_NOTREADY 0x40 +static inline u16 i8254(void) +{ + u16 status, timer; + + do { + outb(I8254_PORT_CONTROL, + I8254_CMD_READBACK | I8254_SELECT_COUNTER0); + status = inb(I8254_PORT_COUNTER0); + timer = inb(I8254_PORT_COUNTER0); + timer |= inb(I8254_PORT_COUNTER0) << 8; + } while (status & I8254_STATUS_NOTREADY); + + return timer; +} + +unsigned long kaslr_get_random_long(const char *purpose) +{ +#ifdef CONFIG_X86_64 + const unsigned long mix_const = 0x5d6008cbf3848dd3UL; +#else + const unsigned long mix_const = 0x3f39e593UL; +#endif + unsigned long raw, random = get_boot_seed(); + bool use_i8254 = true; + + debug_putstr(purpose); + debug_putstr(" KASLR using"); + + if (has_cpuflag(X86_FEATURE_RDRAND)) { + debug_putstr(" RDRAND"); + if (rdrand_long(&raw)) { + random ^= raw; + use_i8254 = false; + } + } + + if (has_cpuflag(X86_FEATURE_TSC)) { + debug_putstr(" RDTSC"); + raw = rdtsc(); + + random ^= raw; + use_i8254 = false; + } + + if (use_i8254) { + debug_putstr(" i8254"); + random ^= i8254(); + } + + /* Circular multiply for better bit diffusion */ + asm("mul %3" + : "=a" (random), "=d" (raw) + : "a" (random), "rm" (mix_const)); + random += raw; + + debug_putstr("...\n"); + + return random; +} diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S index e0817a12d323..c891ece81e5b 100644 --- a/arch/x86/lib/putuser.S +++ 
b/arch/x86/lib/putuser.S @@ -29,14 +29,14 @@ * as they get called from within inline assembly. */ -#define ENTER GET_THREAD_INFO(%_ASM_BX) +#define ENTER mov PER_CPU_VAR(current_task), %_ASM_BX #define EXIT ASM_CLAC ; \ ret .text ENTRY(__put_user_1) ENTER - cmp TI_addr_limit(%_ASM_BX),%_ASM_CX + cmp TASK_addr_limit(%_ASM_BX),%_ASM_CX jae bad_put_user ASM_STAC 1: movb %al,(%_ASM_CX) @@ -46,7 +46,7 @@ ENDPROC(__put_user_1) ENTRY(__put_user_2) ENTER - mov TI_addr_limit(%_ASM_BX),%_ASM_BX + mov TASK_addr_limit(%_ASM_BX),%_ASM_BX sub $1,%_ASM_BX cmp %_ASM_BX,%_ASM_CX jae bad_put_user @@ -58,7 +58,7 @@ ENDPROC(__put_user_2) ENTRY(__put_user_4) ENTER - mov TI_addr_limit(%_ASM_BX),%_ASM_BX + mov TASK_addr_limit(%_ASM_BX),%_ASM_BX sub $3,%_ASM_BX cmp %_ASM_BX,%_ASM_CX jae bad_put_user @@ -70,7 +70,7 @@ ENDPROC(__put_user_4) ENTRY(__put_user_8) ENTER - mov TI_addr_limit(%_ASM_BX),%_ASM_BX + mov TASK_addr_limit(%_ASM_BX),%_ASM_BX sub $7,%_ASM_BX cmp %_ASM_BX,%_ASM_CX jae bad_put_user diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index 0a42327a59d7..9f760cdcaf40 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -6,7 +6,7 @@ * Copyright 2002 Andi Kleen <ak@suse.de> */ #include <linux/module.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* * Zero Userspace diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index d388de72eaca..ec378cd7b71e 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -13,12 +13,17 @@ # opcode: escape # escaped-name # EndTable # +# mnemonics that begin with lowercase 'v' accept a VEX or EVEX prefix +# mnemonics that begin with lowercase 'k' accept a VEX prefix +# #<group maps> # GrpTable: GrpXXX # reg: mnemonic [operand1[,operand2...]] [(extra1)[,(extra2)...] [| 2nd-mnemonic ...] # EndTable # # AVX Superscripts +# (ev): this opcode requires EVEX prefix. 
+# (evo): this opcode is changed by EVEX prefix (EVEX opcode) # (v): this opcode requires VEX prefix. # (v1): this opcode only supports 128bit VEX. # @@ -137,7 +142,7 @@ AVXcode: # 0x60 - 0x6f 60: PUSHA/PUSHAD (i64) 61: POPA/POPAD (i64) -62: BOUND Gv,Ma (i64) +62: BOUND Gv,Ma (i64) | EVEX (Prefix) 63: ARPL Ew,Gw (i64) | MOVSXD Gv,Ev (o64) 64: SEG=FS (Prefix) 65: SEG=GS (Prefix) @@ -399,17 +404,17 @@ AVXcode: 1 3f: # 0x0f 0x40-0x4f 40: CMOVO Gv,Ev -41: CMOVNO Gv,Ev -42: CMOVB/C/NAE Gv,Ev +41: CMOVNO Gv,Ev | kandw/q Vk,Hk,Uk | kandb/d Vk,Hk,Uk (66) +42: CMOVB/C/NAE Gv,Ev | kandnw/q Vk,Hk,Uk | kandnb/d Vk,Hk,Uk (66) 43: CMOVAE/NB/NC Gv,Ev -44: CMOVE/Z Gv,Ev -45: CMOVNE/NZ Gv,Ev -46: CMOVBE/NA Gv,Ev -47: CMOVA/NBE Gv,Ev +44: CMOVE/Z Gv,Ev | knotw/q Vk,Uk | knotb/d Vk,Uk (66) +45: CMOVNE/NZ Gv,Ev | korw/q Vk,Hk,Uk | korb/d Vk,Hk,Uk (66) +46: CMOVBE/NA Gv,Ev | kxnorw/q Vk,Hk,Uk | kxnorb/d Vk,Hk,Uk (66) +47: CMOVA/NBE Gv,Ev | kxorw/q Vk,Hk,Uk | kxorb/d Vk,Hk,Uk (66) 48: CMOVS Gv,Ev 49: CMOVNS Gv,Ev -4a: CMOVP/PE Gv,Ev -4b: CMOVNP/PO Gv,Ev +4a: CMOVP/PE Gv,Ev | kaddw/q Vk,Hk,Uk | kaddb/d Vk,Hk,Uk (66) +4b: CMOVNP/PO Gv,Ev | kunpckbw Vk,Hk,Uk (66) | kunpckwd/dq Vk,Hk,Uk 4c: CMOVL/NGE Gv,Ev 4d: CMOVNL/GE Gv,Ev 4e: CMOVLE/NG Gv,Ev @@ -426,7 +431,7 @@ AVXcode: 1 58: vaddps Vps,Hps,Wps | vaddpd Vpd,Hpd,Wpd (66) | vaddss Vss,Hss,Wss (F3),(v1) | vaddsd Vsd,Hsd,Wsd (F2),(v1) 59: vmulps Vps,Hps,Wps | vmulpd Vpd,Hpd,Wpd (66) | vmulss Vss,Hss,Wss (F3),(v1) | vmulsd Vsd,Hsd,Wsd (F2),(v1) 5a: vcvtps2pd Vpd,Wps | vcvtpd2ps Vps,Wpd (66) | vcvtss2sd Vsd,Hx,Wss (F3),(v1) | vcvtsd2ss Vss,Hx,Wsd (F2),(v1) -5b: vcvtdq2ps Vps,Wdq | vcvtps2dq Vdq,Wps (66) | vcvttps2dq Vdq,Wps (F3) +5b: vcvtdq2ps Vps,Wdq | vcvtqq2ps Vps,Wqq (evo) | vcvtps2dq Vdq,Wps (66) | vcvttps2dq Vdq,Wps (F3) 5c: vsubps Vps,Hps,Wps | vsubpd Vpd,Hpd,Wpd (66) | vsubss Vss,Hss,Wss (F3),(v1) | vsubsd Vsd,Hsd,Wsd (F2),(v1) 5d: vminps Vps,Hps,Wps | vminpd Vpd,Hpd,Wpd (66) | vminss Vss,Hss,Wss (F3),(v1) | vminsd Vsd,Hsd,Wsd 
(F2),(v1) 5e: vdivps Vps,Hps,Wps | vdivpd Vpd,Hpd,Wpd (66) | vdivss Vss,Hss,Wss (F3),(v1) | vdivsd Vsd,Hsd,Wsd (F2),(v1) @@ -447,7 +452,7 @@ AVXcode: 1 6c: vpunpcklqdq Vx,Hx,Wx (66),(v1) 6d: vpunpckhqdq Vx,Hx,Wx (66),(v1) 6e: movd/q Pd,Ey | vmovd/q Vy,Ey (66),(v1) -6f: movq Pq,Qq | vmovdqa Vx,Wx (66) | vmovdqu Vx,Wx (F3) +6f: movq Pq,Qq | vmovdqa Vx,Wx (66) | vmovdqa32/64 Vx,Wx (66),(evo) | vmovdqu Vx,Wx (F3) | vmovdqu32/64 Vx,Wx (F3),(evo) | vmovdqu8/16 Vx,Wx (F2),(ev) # 0x0f 0x70-0x7f 70: pshufw Pq,Qq,Ib | vpshufd Vx,Wx,Ib (66),(v1) | vpshufhw Vx,Wx,Ib (F3),(v1) | vpshuflw Vx,Wx,Ib (F2),(v1) 71: Grp12 (1A) @@ -458,14 +463,14 @@ AVXcode: 1 76: pcmpeqd Pq,Qq | vpcmpeqd Vx,Hx,Wx (66),(v1) # Note: Remove (v), because vzeroall and vzeroupper becomes emms without VEX. 77: emms | vzeroupper | vzeroall -78: VMREAD Ey,Gy -79: VMWRITE Gy,Ey -7a: -7b: +78: VMREAD Ey,Gy | vcvttps2udq/pd2udq Vx,Wpd (evo) | vcvttsd2usi Gv,Wx (F2),(ev) | vcvttss2usi Gv,Wx (F3),(ev) | vcvttps2uqq/pd2uqq Vx,Wx (66),(ev) +79: VMWRITE Gy,Ey | vcvtps2udq/pd2udq Vx,Wpd (evo) | vcvtsd2usi Gv,Wx (F2),(ev) | vcvtss2usi Gv,Wx (F3),(ev) | vcvtps2uqq/pd2uqq Vx,Wx (66),(ev) +7a: vcvtudq2pd/uqq2pd Vpd,Wx (F3),(ev) | vcvtudq2ps/uqq2ps Vpd,Wx (F2),(ev) | vcvttps2qq/pd2qq Vx,Wx (66),(ev) +7b: vcvtusi2sd Vpd,Hpd,Ev (F2),(ev) | vcvtusi2ss Vps,Hps,Ev (F3),(ev) | vcvtps2qq/pd2qq Vx,Wx (66),(ev) 7c: vhaddpd Vpd,Hpd,Wpd (66) | vhaddps Vps,Hps,Wps (F2) 7d: vhsubpd Vpd,Hpd,Wpd (66) | vhsubps Vps,Hps,Wps (F2) 7e: movd/q Ey,Pd | vmovd/q Ey,Vy (66),(v1) | vmovq Vq,Wq (F3),(v1) -7f: movq Qq,Pq | vmovdqa Wx,Vx (66) | vmovdqu Wx,Vx (F3) +7f: movq Qq,Pq | vmovdqa Wx,Vx (66) | vmovdqa32/64 Wx,Vx (66),(evo) | vmovdqu Wx,Vx (F3) | vmovdqu32/64 Wx,Vx (F3),(evo) | vmovdqu8/16 Wx,Vx (F2),(ev) # 0x0f 0x80-0x8f # Note: "forced64" is Intel CPU behavior (see comment about CALL insn). 
80: JO Jz (f64) @@ -485,16 +490,16 @@ AVXcode: 1 8e: JLE/JNG Jz (f64) 8f: JNLE/JG Jz (f64) # 0x0f 0x90-0x9f -90: SETO Eb -91: SETNO Eb -92: SETB/C/NAE Eb -93: SETAE/NB/NC Eb +90: SETO Eb | kmovw/q Vk,Wk | kmovb/d Vk,Wk (66) +91: SETNO Eb | kmovw/q Mv,Vk | kmovb/d Mv,Vk (66) +92: SETB/C/NAE Eb | kmovw Vk,Rv | kmovb Vk,Rv (66) | kmovq/d Vk,Rv (F2) +93: SETAE/NB/NC Eb | kmovw Gv,Uk | kmovb Gv,Uk (66) | kmovq/d Gv,Uk (F2) 94: SETE/Z Eb 95: SETNE/NZ Eb 96: SETBE/NA Eb 97: SETA/NBE Eb -98: SETS Eb -99: SETNS Eb +98: SETS Eb | kortestw/q Vk,Uk | kortestb/d Vk,Uk (66) +99: SETNS Eb | ktestw/q Vk,Uk | ktestb/d Vk,Uk (66) 9a: SETP/PE Eb 9b: SETNP/PO Eb 9c: SETL/NGE Eb @@ -564,11 +569,11 @@ d7: pmovmskb Gd,Nq | vpmovmskb Gd,Ux (66),(v1) d8: psubusb Pq,Qq | vpsubusb Vx,Hx,Wx (66),(v1) d9: psubusw Pq,Qq | vpsubusw Vx,Hx,Wx (66),(v1) da: pminub Pq,Qq | vpminub Vx,Hx,Wx (66),(v1) -db: pand Pq,Qq | vpand Vx,Hx,Wx (66),(v1) +db: pand Pq,Qq | vpand Vx,Hx,Wx (66),(v1) | vpandd/q Vx,Hx,Wx (66),(evo) dc: paddusb Pq,Qq | vpaddusb Vx,Hx,Wx (66),(v1) dd: paddusw Pq,Qq | vpaddusw Vx,Hx,Wx (66),(v1) de: pmaxub Pq,Qq | vpmaxub Vx,Hx,Wx (66),(v1) -df: pandn Pq,Qq | vpandn Vx,Hx,Wx (66),(v1) +df: pandn Pq,Qq | vpandn Vx,Hx,Wx (66),(v1) | vpandnd/q Vx,Hx,Wx (66),(evo) # 0x0f 0xe0-0xef e0: pavgb Pq,Qq | vpavgb Vx,Hx,Wx (66),(v1) e1: psraw Pq,Qq | vpsraw Vx,Hx,Wx (66),(v1) @@ -576,16 +581,16 @@ e2: psrad Pq,Qq | vpsrad Vx,Hx,Wx (66),(v1) e3: pavgw Pq,Qq | vpavgw Vx,Hx,Wx (66),(v1) e4: pmulhuw Pq,Qq | vpmulhuw Vx,Hx,Wx (66),(v1) e5: pmulhw Pq,Qq | vpmulhw Vx,Hx,Wx (66),(v1) -e6: vcvttpd2dq Vx,Wpd (66) | vcvtdq2pd Vx,Wdq (F3) | vcvtpd2dq Vx,Wpd (F2) +e6: vcvttpd2dq Vx,Wpd (66) | vcvtdq2pd Vx,Wdq (F3) | vcvtdq2pd/qq2pd Vx,Wdq (F3),(evo) | vcvtpd2dq Vx,Wpd (F2) e7: movntq Mq,Pq | vmovntdq Mx,Vx (66) e8: psubsb Pq,Qq | vpsubsb Vx,Hx,Wx (66),(v1) e9: psubsw Pq,Qq | vpsubsw Vx,Hx,Wx (66),(v1) ea: pminsw Pq,Qq | vpminsw Vx,Hx,Wx (66),(v1) -eb: por Pq,Qq | vpor Vx,Hx,Wx (66),(v1) +eb: por Pq,Qq | vpor 
Vx,Hx,Wx (66),(v1) | vpord/q Vx,Hx,Wx (66),(evo) ec: paddsb Pq,Qq | vpaddsb Vx,Hx,Wx (66),(v1) ed: paddsw Pq,Qq | vpaddsw Vx,Hx,Wx (66),(v1) ee: pmaxsw Pq,Qq | vpmaxsw Vx,Hx,Wx (66),(v1) -ef: pxor Pq,Qq | vpxor Vx,Hx,Wx (66),(v1) +ef: pxor Pq,Qq | vpxor Vx,Hx,Wx (66),(v1) | vpxord/q Vx,Hx,Wx (66),(evo) # 0x0f 0xf0-0xff f0: vlddqu Vx,Mx (F2) f1: psllw Pq,Qq | vpsllw Vx,Hx,Wx (66),(v1) @@ -626,81 +631,105 @@ AVXcode: 2 0e: vtestps Vx,Wx (66),(v) 0f: vtestpd Vx,Wx (66),(v) # 0x0f 0x38 0x10-0x1f -10: pblendvb Vdq,Wdq (66) -11: -12: -13: vcvtph2ps Vx,Wx,Ib (66),(v) -14: blendvps Vdq,Wdq (66) -15: blendvpd Vdq,Wdq (66) -16: vpermps Vqq,Hqq,Wqq (66),(v) +10: pblendvb Vdq,Wdq (66) | vpsrlvw Vx,Hx,Wx (66),(evo) | vpmovuswb Wx,Vx (F3),(ev) +11: vpmovusdb Wx,Vd (F3),(ev) | vpsravw Vx,Hx,Wx (66),(ev) +12: vpmovusqb Wx,Vq (F3),(ev) | vpsllvw Vx,Hx,Wx (66),(ev) +13: vcvtph2ps Vx,Wx (66),(v) | vpmovusdw Wx,Vd (F3),(ev) +14: blendvps Vdq,Wdq (66) | vpmovusqw Wx,Vq (F3),(ev) | vprorvd/q Vx,Hx,Wx (66),(evo) +15: blendvpd Vdq,Wdq (66) | vpmovusqd Wx,Vq (F3),(ev) | vprolvd/q Vx,Hx,Wx (66),(evo) +16: vpermps Vqq,Hqq,Wqq (66),(v) | vpermps/d Vqq,Hqq,Wqq (66),(evo) 17: vptest Vx,Wx (66) 18: vbroadcastss Vx,Wd (66),(v) -19: vbroadcastsd Vqq,Wq (66),(v) -1a: vbroadcastf128 Vqq,Mdq (66),(v) -1b: +19: vbroadcastsd Vqq,Wq (66),(v) | vbroadcastf32x2 Vqq,Wq (66),(evo) +1a: vbroadcastf128 Vqq,Mdq (66),(v) | vbroadcastf32x4/64x2 Vqq,Wq (66),(evo) +1b: vbroadcastf32x8/64x4 Vqq,Mdq (66),(ev) 1c: pabsb Pq,Qq | vpabsb Vx,Wx (66),(v1) 1d: pabsw Pq,Qq | vpabsw Vx,Wx (66),(v1) 1e: pabsd Pq,Qq | vpabsd Vx,Wx (66),(v1) -1f: +1f: vpabsq Vx,Wx (66),(ev) # 0x0f 0x38 0x20-0x2f -20: vpmovsxbw Vx,Ux/Mq (66),(v1) -21: vpmovsxbd Vx,Ux/Md (66),(v1) -22: vpmovsxbq Vx,Ux/Mw (66),(v1) -23: vpmovsxwd Vx,Ux/Mq (66),(v1) -24: vpmovsxwq Vx,Ux/Md (66),(v1) -25: vpmovsxdq Vx,Ux/Mq (66),(v1) -26: -27: -28: vpmuldq Vx,Hx,Wx (66),(v1) -29: vpcmpeqq Vx,Hx,Wx (66),(v1) -2a: vmovntdqa Vx,Mx (66),(v1) +20: vpmovsxbw Vx,Ux/Mq 
(66),(v1) | vpmovswb Wx,Vx (F3),(ev) +21: vpmovsxbd Vx,Ux/Md (66),(v1) | vpmovsdb Wx,Vd (F3),(ev) +22: vpmovsxbq Vx,Ux/Mw (66),(v1) | vpmovsqb Wx,Vq (F3),(ev) +23: vpmovsxwd Vx,Ux/Mq (66),(v1) | vpmovsdw Wx,Vd (F3),(ev) +24: vpmovsxwq Vx,Ux/Md (66),(v1) | vpmovsqw Wx,Vq (F3),(ev) +25: vpmovsxdq Vx,Ux/Mq (66),(v1) | vpmovsqd Wx,Vq (F3),(ev) +26: vptestmb/w Vk,Hx,Wx (66),(ev) | vptestnmb/w Vk,Hx,Wx (F3),(ev) +27: vptestmd/q Vk,Hx,Wx (66),(ev) | vptestnmd/q Vk,Hx,Wx (F3),(ev) +28: vpmuldq Vx,Hx,Wx (66),(v1) | vpmovm2b/w Vx,Uk (F3),(ev) +29: vpcmpeqq Vx,Hx,Wx (66),(v1) | vpmovb2m/w2m Vk,Ux (F3),(ev) +2a: vmovntdqa Vx,Mx (66),(v1) | vpbroadcastmb2q Vx,Uk (F3),(ev) 2b: vpackusdw Vx,Hx,Wx (66),(v1) -2c: vmaskmovps Vx,Hx,Mx (66),(v) -2d: vmaskmovpd Vx,Hx,Mx (66),(v) +2c: vmaskmovps Vx,Hx,Mx (66),(v) | vscalefps/d Vx,Hx,Wx (66),(evo) +2d: vmaskmovpd Vx,Hx,Mx (66),(v) | vscalefss/d Vx,Hx,Wx (66),(evo) 2e: vmaskmovps Mx,Hx,Vx (66),(v) 2f: vmaskmovpd Mx,Hx,Vx (66),(v) # 0x0f 0x38 0x30-0x3f -30: vpmovzxbw Vx,Ux/Mq (66),(v1) -31: vpmovzxbd Vx,Ux/Md (66),(v1) -32: vpmovzxbq Vx,Ux/Mw (66),(v1) -33: vpmovzxwd Vx,Ux/Mq (66),(v1) -34: vpmovzxwq Vx,Ux/Md (66),(v1) -35: vpmovzxdq Vx,Ux/Mq (66),(v1) -36: vpermd Vqq,Hqq,Wqq (66),(v) +30: vpmovzxbw Vx,Ux/Mq (66),(v1) | vpmovwb Wx,Vx (F3),(ev) +31: vpmovzxbd Vx,Ux/Md (66),(v1) | vpmovdb Wx,Vd (F3),(ev) +32: vpmovzxbq Vx,Ux/Mw (66),(v1) | vpmovqb Wx,Vq (F3),(ev) +33: vpmovzxwd Vx,Ux/Mq (66),(v1) | vpmovdw Wx,Vd (F3),(ev) +34: vpmovzxwq Vx,Ux/Md (66),(v1) | vpmovqw Wx,Vq (F3),(ev) +35: vpmovzxdq Vx,Ux/Mq (66),(v1) | vpmovqd Wx,Vq (F3),(ev) +36: vpermd Vqq,Hqq,Wqq (66),(v) | vpermd/q Vqq,Hqq,Wqq (66),(evo) 37: vpcmpgtq Vx,Hx,Wx (66),(v1) -38: vpminsb Vx,Hx,Wx (66),(v1) -39: vpminsd Vx,Hx,Wx (66),(v1) -3a: vpminuw Vx,Hx,Wx (66),(v1) -3b: vpminud Vx,Hx,Wx (66),(v1) +38: vpminsb Vx,Hx,Wx (66),(v1) | vpmovm2d/q Vx,Uk (F3),(ev) +39: vpminsd Vx,Hx,Wx (66),(v1) | vpminsd/q Vx,Hx,Wx (66),(evo) | vpmovd2m/q2m Vk,Ux (F3),(ev) +3a: vpminuw Vx,Hx,Wx 
(66),(v1) | vpbroadcastmw2d Vx,Uk (F3),(ev) +3b: vpminud Vx,Hx,Wx (66),(v1) | vpminud/q Vx,Hx,Wx (66),(evo) 3c: vpmaxsb Vx,Hx,Wx (66),(v1) -3d: vpmaxsd Vx,Hx,Wx (66),(v1) +3d: vpmaxsd Vx,Hx,Wx (66),(v1) | vpmaxsd/q Vx,Hx,Wx (66),(evo) 3e: vpmaxuw Vx,Hx,Wx (66),(v1) -3f: vpmaxud Vx,Hx,Wx (66),(v1) +3f: vpmaxud Vx,Hx,Wx (66),(v1) | vpmaxud/q Vx,Hx,Wx (66),(evo) # 0x0f 0x38 0x40-0x8f -40: vpmulld Vx,Hx,Wx (66),(v1) +40: vpmulld Vx,Hx,Wx (66),(v1) | vpmulld/q Vx,Hx,Wx (66),(evo) 41: vphminposuw Vdq,Wdq (66),(v1) -42: -43: -44: +42: vgetexpps/d Vx,Wx (66),(ev) +43: vgetexpss/d Vx,Hx,Wx (66),(ev) +44: vplzcntd/q Vx,Wx (66),(ev) 45: vpsrlvd/q Vx,Hx,Wx (66),(v) -46: vpsravd Vx,Hx,Wx (66),(v) +46: vpsravd Vx,Hx,Wx (66),(v) | vpsravd/q Vx,Hx,Wx (66),(evo) 47: vpsllvd/q Vx,Hx,Wx (66),(v) -# Skip 0x48-0x57 +# Skip 0x48-0x4b +4c: vrcp14ps/d Vpd,Wpd (66),(ev) +4d: vrcp14ss/d Vsd,Hpd,Wsd (66),(ev) +4e: vrsqrt14ps/d Vpd,Wpd (66),(ev) +4f: vrsqrt14ss/d Vsd,Hsd,Wsd (66),(ev) +# Skip 0x50-0x57 58: vpbroadcastd Vx,Wx (66),(v) -59: vpbroadcastq Vx,Wx (66),(v) -5a: vbroadcasti128 Vqq,Mdq (66),(v) -# Skip 0x5b-0x77 +59: vpbroadcastq Vx,Wx (66),(v) | vbroadcasti32x2 Vx,Wx (66),(evo) +5a: vbroadcasti128 Vqq,Mdq (66),(v) | vbroadcasti32x4/64x2 Vx,Wx (66),(evo) +5b: vbroadcasti32x8/64x4 Vqq,Mdq (66),(ev) +# Skip 0x5c-0x63 +64: vpblendmd/q Vx,Hx,Wx (66),(ev) +65: vblendmps/d Vx,Hx,Wx (66),(ev) +66: vpblendmb/w Vx,Hx,Wx (66),(ev) +# Skip 0x67-0x74 +75: vpermi2b/w Vx,Hx,Wx (66),(ev) +76: vpermi2d/q Vx,Hx,Wx (66),(ev) +77: vpermi2ps/d Vx,Hx,Wx (66),(ev) 78: vpbroadcastb Vx,Wx (66),(v) 79: vpbroadcastw Vx,Wx (66),(v) -# Skip 0x7a-0x7f +7a: vpbroadcastb Vx,Rv (66),(ev) +7b: vpbroadcastw Vx,Rv (66),(ev) +7c: vpbroadcastd/q Vx,Rv (66),(ev) +7d: vpermt2b/w Vx,Hx,Wx (66),(ev) +7e: vpermt2d/q Vx,Hx,Wx (66),(ev) +7f: vpermt2ps/d Vx,Hx,Wx (66),(ev) 80: INVEPT Gy,Mdq (66) 81: INVPID Gy,Mdq (66) 82: INVPCID Gy,Mdq (66) +83: vpmultishiftqb Vx,Hx,Wx (66),(ev) +88: vexpandps/d Vpd,Wpd (66),(ev) +89: 
vpexpandd/q Vx,Wx (66),(ev) +8a: vcompressps/d Wx,Vx (66),(ev) +8b: vpcompressd/q Wx,Vx (66),(ev) 8c: vpmaskmovd/q Vx,Hx,Mx (66),(v) +8d: vpermb/w Vx,Hx,Wx (66),(ev) 8e: vpmaskmovd/q Mx,Vx,Hx (66),(v) # 0x0f 0x38 0x90-0xbf (FMA) -90: vgatherdd/q Vx,Hx,Wx (66),(v) -91: vgatherqd/q Vx,Hx,Wx (66),(v) +90: vgatherdd/q Vx,Hx,Wx (66),(v) | vpgatherdd/q Vx,Wx (66),(evo) +91: vgatherqd/q Vx,Hx,Wx (66),(v) | vpgatherqd/q Vx,Wx (66),(evo) 92: vgatherdps/d Vx,Hx,Wx (66),(v) 93: vgatherqps/d Vx,Hx,Wx (66),(v) 94: @@ -715,6 +744,10 @@ AVXcode: 2 9d: vfnmadd132ss/d Vx,Hx,Wx (66),(v),(v1) 9e: vfnmsub132ps/d Vx,Hx,Wx (66),(v) 9f: vfnmsub132ss/d Vx,Hx,Wx (66),(v),(v1) +a0: vpscatterdd/q Wx,Vx (66),(ev) +a1: vpscatterqd/q Wx,Vx (66),(ev) +a2: vscatterdps/d Wx,Vx (66),(ev) +a3: vscatterqps/d Wx,Vx (66),(ev) a6: vfmaddsub213ps/d Vx,Hx,Wx (66),(v) a7: vfmsubadd213ps/d Vx,Hx,Wx (66),(v) a8: vfmadd213ps/d Vx,Hx,Wx (66),(v) @@ -725,6 +758,8 @@ ac: vfnmadd213ps/d Vx,Hx,Wx (66),(v) ad: vfnmadd213ss/d Vx,Hx,Wx (66),(v),(v1) ae: vfnmsub213ps/d Vx,Hx,Wx (66),(v) af: vfnmsub213ss/d Vx,Hx,Wx (66),(v),(v1) +b4: vpmadd52luq Vx,Hx,Wx (66),(ev) +b5: vpmadd52huq Vx,Hx,Wx (66),(ev) b6: vfmaddsub231ps/d Vx,Hx,Wx (66),(v) b7: vfmsubadd231ps/d Vx,Hx,Wx (66),(v) b8: vfmadd231ps/d Vx,Hx,Wx (66),(v) @@ -736,12 +771,15 @@ bd: vfnmadd231ss/d Vx,Hx,Wx (66),(v),(v1) be: vfnmsub231ps/d Vx,Hx,Wx (66),(v) bf: vfnmsub231ss/d Vx,Hx,Wx (66),(v),(v1) # 0x0f 0x38 0xc0-0xff -c8: sha1nexte Vdq,Wdq +c4: vpconflictd/q Vx,Wx (66),(ev) +c6: Grp18 (1A) +c7: Grp19 (1A) +c8: sha1nexte Vdq,Wdq | vexp2ps/d Vx,Wx (66),(ev) c9: sha1msg1 Vdq,Wdq -ca: sha1msg2 Vdq,Wdq -cb: sha256rnds2 Vdq,Wdq -cc: sha256msg1 Vdq,Wdq -cd: sha256msg2 Vdq,Wdq +ca: sha1msg2 Vdq,Wdq | vrcp28ps/d Vx,Wx (66),(ev) +cb: sha256rnds2 Vdq,Wdq | vrcp28ss/d Vx,Hx,Wx (66),(ev) +cc: sha256msg1 Vdq,Wdq | vrsqrt28ps/d Vx,Wx (66),(ev) +cd: sha256msg2 Vdq,Wdq | vrsqrt28ss/d Vx,Hx,Wx (66),(ev) db: VAESIMC Vdq,Wdq (66),(v1) dc: VAESENC Vdq,Hdq,Wdq (66),(v1) dd: 
VAESENCLAST Vdq,Hdq,Wdq (66),(v1) @@ -763,15 +801,15 @@ AVXcode: 3 00: vpermq Vqq,Wqq,Ib (66),(v) 01: vpermpd Vqq,Wqq,Ib (66),(v) 02: vpblendd Vx,Hx,Wx,Ib (66),(v) -03: +03: valignd/q Vx,Hx,Wx,Ib (66),(ev) 04: vpermilps Vx,Wx,Ib (66),(v) 05: vpermilpd Vx,Wx,Ib (66),(v) 06: vperm2f128 Vqq,Hqq,Wqq,Ib (66),(v) 07: -08: vroundps Vx,Wx,Ib (66) -09: vroundpd Vx,Wx,Ib (66) -0a: vroundss Vss,Wss,Ib (66),(v1) -0b: vroundsd Vsd,Wsd,Ib (66),(v1) +08: vroundps Vx,Wx,Ib (66) | vrndscaleps Vx,Wx,Ib (66),(evo) +09: vroundpd Vx,Wx,Ib (66) | vrndscalepd Vx,Wx,Ib (66),(evo) +0a: vroundss Vss,Wss,Ib (66),(v1) | vrndscaless Vx,Hx,Wx,Ib (66),(evo) +0b: vroundsd Vsd,Wsd,Ib (66),(v1) | vrndscalesd Vx,Hx,Wx,Ib (66),(evo) 0c: vblendps Vx,Hx,Wx,Ib (66) 0d: vblendpd Vx,Hx,Wx,Ib (66) 0e: vpblendw Vx,Hx,Wx,Ib (66),(v1) @@ -780,26 +818,51 @@ AVXcode: 3 15: vpextrw Rd/Mw,Vdq,Ib (66),(v1) 16: vpextrd/q Ey,Vdq,Ib (66),(v1) 17: vextractps Ed,Vdq,Ib (66),(v1) -18: vinsertf128 Vqq,Hqq,Wqq,Ib (66),(v) -19: vextractf128 Wdq,Vqq,Ib (66),(v) +18: vinsertf128 Vqq,Hqq,Wqq,Ib (66),(v) | vinsertf32x4/64x2 Vqq,Hqq,Wqq,Ib (66),(evo) +19: vextractf128 Wdq,Vqq,Ib (66),(v) | vextractf32x4/64x2 Wdq,Vqq,Ib (66),(evo) +1a: vinsertf32x8/64x4 Vqq,Hqq,Wqq,Ib (66),(ev) +1b: vextractf32x8/64x4 Wdq,Vqq,Ib (66),(ev) 1d: vcvtps2ph Wx,Vx,Ib (66),(v) +1e: vpcmpud/q Vk,Hd,Wd,Ib (66),(ev) +1f: vpcmpd/q Vk,Hd,Wd,Ib (66),(ev) 20: vpinsrb Vdq,Hdq,Ry/Mb,Ib (66),(v1) 21: vinsertps Vdq,Hdq,Udq/Md,Ib (66),(v1) 22: vpinsrd/q Vdq,Hdq,Ey,Ib (66),(v1) -38: vinserti128 Vqq,Hqq,Wqq,Ib (66),(v) -39: vextracti128 Wdq,Vqq,Ib (66),(v) +23: vshuff32x4/64x2 Vx,Hx,Wx,Ib (66),(ev) +25: vpternlogd/q Vx,Hx,Wx,Ib (66),(ev) +26: vgetmantps/d Vx,Wx,Ib (66),(ev) +27: vgetmantss/d Vx,Hx,Wx,Ib (66),(ev) +30: kshiftrb/w Vk,Uk,Ib (66),(v) +31: kshiftrd/q Vk,Uk,Ib (66),(v) +32: kshiftlb/w Vk,Uk,Ib (66),(v) +33: kshiftld/q Vk,Uk,Ib (66),(v) +38: vinserti128 Vqq,Hqq,Wqq,Ib (66),(v) | vinserti32x4/64x2 Vqq,Hqq,Wqq,Ib (66),(evo) +39: vextracti128 Wdq,Vqq,Ib 
(66),(v) | vextracti32x4/64x2 Wdq,Vqq,Ib (66),(evo) +3a: vinserti32x8/64x4 Vqq,Hqq,Wqq,Ib (66),(ev) +3b: vextracti32x8/64x4 Wdq,Vqq,Ib (66),(ev) +3e: vpcmpub/w Vk,Hk,Wx,Ib (66),(ev) +3f: vpcmpb/w Vk,Hk,Wx,Ib (66),(ev) 40: vdpps Vx,Hx,Wx,Ib (66) 41: vdppd Vdq,Hdq,Wdq,Ib (66),(v1) -42: vmpsadbw Vx,Hx,Wx,Ib (66),(v1) +42: vmpsadbw Vx,Hx,Wx,Ib (66),(v1) | vdbpsadbw Vx,Hx,Wx,Ib (66),(evo) +43: vshufi32x4/64x2 Vx,Hx,Wx,Ib (66),(ev) 44: vpclmulqdq Vdq,Hdq,Wdq,Ib (66),(v1) 46: vperm2i128 Vqq,Hqq,Wqq,Ib (66),(v) 4a: vblendvps Vx,Hx,Wx,Lx (66),(v) 4b: vblendvpd Vx,Hx,Wx,Lx (66),(v) 4c: vpblendvb Vx,Hx,Wx,Lx (66),(v1) +50: vrangeps/d Vx,Hx,Wx,Ib (66),(ev) +51: vrangess/d Vx,Hx,Wx,Ib (66),(ev) +54: vfixupimmps/d Vx,Hx,Wx,Ib (66),(ev) +55: vfixupimmss/d Vx,Hx,Wx,Ib (66),(ev) +56: vreduceps/d Vx,Wx,Ib (66),(ev) +57: vreducess/d Vx,Hx,Wx,Ib (66),(ev) 60: vpcmpestrm Vdq,Wdq,Ib (66),(v1) 61: vpcmpestri Vdq,Wdq,Ib (66),(v1) 62: vpcmpistrm Vdq,Wdq,Ib (66),(v1) 63: vpcmpistri Vdq,Wdq,Ib (66),(v1) +66: vfpclassps/d Vk,Wx,Ib (66),(ev) +67: vfpclassss/d Vk,Wx,Ib (66),(ev) cc: sha1rnds4 Vdq,Wdq,Ib df: VAESKEYGEN Vdq,Wdq,Ib (66),(v1) f0: RORX Gy,Ey,Ib (F2),(v) @@ -927,8 +990,10 @@ GrpTable: Grp12 EndTable GrpTable: Grp13 +0: vprord/q Hx,Wx,Ib (66),(ev) +1: vprold/q Hx,Wx,Ib (66),(ev) 2: psrld Nq,Ib (11B) | vpsrld Hx,Ux,Ib (66),(11B),(v1) -4: psrad Nq,Ib (11B) | vpsrad Hx,Ux,Ib (66),(11B),(v1) +4: psrad Nq,Ib (11B) | vpsrad Hx,Ux,Ib (66),(11B),(v1) | vpsrad/q Hx,Ux,Ib (66),(evo) 6: pslld Nq,Ib (11B) | vpslld Hx,Ux,Ib (66),(11B),(v1) EndTable @@ -963,6 +1028,20 @@ GrpTable: Grp17 3: BLSI By,Ey (v) EndTable +GrpTable: Grp18 +1: vgatherpf0dps/d Wx (66),(ev) +2: vgatherpf1dps/d Wx (66),(ev) +5: vscatterpf0dps/d Wx (66),(ev) +6: vscatterpf1dps/d Wx (66),(ev) +EndTable + +GrpTable: Grp19 +1: vgatherpf0qps/d Wx (66),(ev) +2: vgatherpf1qps/d Wx (66),(ev) +5: vscatterpf0qps/d Wx (66),(ev) +6: vscatterpf1qps/d Wx (66),(ev) +EndTable + # AMD's Prefetch Group GrpTable: GrpP 0: PREFETCH diff --git 
a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 62c0043a5fd5..96d2b847e09e 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -37,4 +37,5 @@ obj-$(CONFIG_NUMA_EMU) += numa_emulation.o obj-$(CONFIG_X86_INTEL_MPX) += mpx.o obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o +obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 99bfb192803f..9a17250bcbe0 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -72,9 +72,9 @@ static struct addr_marker address_markers[] = { { 0, "User Space" }, #ifdef CONFIG_X86_64 { 0x8000000000000000UL, "Kernel Space" }, - { PAGE_OFFSET, "Low Kernel Mapping" }, - { VMALLOC_START, "vmalloc() Area" }, - { VMEMMAP_START, "Vmemmap" }, + { 0/* PAGE_OFFSET */, "Low Kernel Mapping" }, + { 0/* VMALLOC_START */, "vmalloc() Area" }, + { 0/* VMEMMAP_START */, "Vmemmap" }, # ifdef CONFIG_X86_ESPFIX64 { ESPFIX_BASE_ADDR, "ESPfix Area", 16 }, # endif @@ -434,8 +434,16 @@ void ptdump_walk_pgd_level_checkwx(void) static int __init pt_dump_init(void) { + /* + * Various markers are not compile-time constants, so assign them + * here. 
+ */ +#ifdef CONFIG_X86_64 + address_markers[LOW_KERNEL_NR].start_address = PAGE_OFFSET; + address_markers[VMALLOC_START_NR].start_address = VMALLOC_START; + address_markers[VMEMMAP_START_NR].start_address = VMEMMAP_START; +#endif #ifdef CONFIG_X86_32 - /* Not a compile-time constant on x86-32 */ address_markers[VMALLOC_START_NR].start_address = VMALLOC_START; address_markers[VMALLOC_END_NR].start_address = VMALLOC_END; # ifdef CONFIG_HIGHMEM diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 4bb53b89f3c5..832b98f822be 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -1,6 +1,7 @@ #include <linux/module.h> #include <asm/uaccess.h> #include <asm/traps.h> +#include <asm/kdebug.h> typedef bool (*ex_handler_t)(const struct exception_table_entry *, struct pt_regs *, int); @@ -37,7 +38,7 @@ bool ex_handler_ext(const struct exception_table_entry *fixup, struct pt_regs *regs, int trapnr) { /* Special hack for uaccess_err */ - current_thread_info()->uaccess_err = 1; + current->thread.uaccess_err = 1; regs->ip = ex_fixup_addr(fixup); return true; } @@ -46,8 +47,9 @@ EXPORT_SYMBOL(ex_handler_ext); bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup, struct pt_regs *regs, int trapnr) { - WARN_ONCE(1, "unchecked MSR access error: RDMSR from 0x%x\n", - (unsigned int)regs->cx); + if (pr_warn_once("unchecked MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pF)\n", + (unsigned int)regs->cx, regs->ip, (void *)regs->ip)) + show_stack_regs(regs); /* Pretend that the read succeeded and returned 0. 
*/ regs->ip = ex_fixup_addr(fixup); @@ -60,9 +62,10 @@ EXPORT_SYMBOL(ex_handler_rdmsr_unsafe); bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup, struct pt_regs *regs, int trapnr) { - WARN_ONCE(1, "unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x)\n", - (unsigned int)regs->cx, - (unsigned int)regs->dx, (unsigned int)regs->ax); + if (pr_warn_once("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pF)\n", + (unsigned int)regs->cx, (unsigned int)regs->dx, + (unsigned int)regs->ax, regs->ip, (void *)regs->ip)) + show_stack_regs(regs); /* Pretend that the write succeeded. */ regs->ip = ex_fixup_addr(fixup); diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 5ce1ed02f7e8..dc8023060456 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -292,7 +292,7 @@ void vmalloc_sync_all(void) return; for (address = VMALLOC_START & PMD_MASK; - address >= TASK_SIZE && address < FIXADDR_TOP; + address >= TASK_SIZE_MAX && address < FIXADDR_TOP; address += PMD_SIZE) { struct page *page; @@ -439,7 +439,7 @@ static noinline int vmalloc_fault(unsigned long address) * happen within a race in page table update. In the later * case just flush: */ - pgd = pgd_offset(current->active_mm, address); + pgd = (pgd_t *)__va(read_cr3()) + pgd_index(address); pgd_ref = pgd_offset_k(address); if (pgd_none(*pgd_ref)) return -1; @@ -737,7 +737,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, * In this case we need to make sure we're not recursively * faulting through the emulate_vsyscall() logic. 
*/ - if (current_thread_info()->sig_on_uaccess_error && signal) { + if (current->thread.sig_on_uaccess_err && signal) { tsk->thread.trap_nr = X86_TRAP_PF; tsk->thread.error_code = error_code | PF_USER; tsk->thread.cr2 = address; @@ -854,8 +854,13 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, return; } #endif - /* Kernel addresses are always protection faults: */ - if (address >= TASK_SIZE) + + /* + * To avoid leaking information about the kernel page table + * layout, pretend that user-mode accesses to kernel addresses + * are always protection faults. + */ + if (address >= TASK_SIZE_MAX) error_code |= PF_PROT; if (likely(show_unhandled_signals)) @@ -1348,7 +1353,7 @@ good_area: * the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if * we get VM_FAULT_RETRY back, the mmap_sem has been unlocked. */ - fault = handle_mm_fault(mm, vma, address, flags); + fault = handle_mm_fault(vma, address, flags); major |= fault & VM_FAULT_MAJOR; /* diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 14a95054d4e0..2ae8584b44c7 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -165,6 +165,7 @@ static __init int setup_hugepagesz(char *opt) } else if (ps == PUD_SIZE && boot_cpu_has(X86_FEATURE_GBPAGES)) { hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); } else { + hugetlb_bad_size(); printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n", ps >> 20); return 0; diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 372aad2b3291..cc82830bc8c4 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -17,6 +17,7 @@ #include <asm/proto.h> #include <asm/dma.h> /* for MAX_DMA_PFN */ #include <asm/microcode.h> +#include <asm/kaslr.h> /* * We need to define the tracepoints somewhere, and tlb.c @@ -590,6 +591,9 @@ void __init init_mem_mapping(void) /* the ISA range is always mapped regardless of memory holes */ init_memory_mapping(0, ISA_END_ADDRESS); + /* Init the trampoline, possibly with KASLR memory offset */ + 
init_trampoline(); + /* * If the allocation is in bottom-up direction, we setup direct mapping * in bottom-up, otherwise we setup direct mapping in top-down. diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index bce2e5d9edd4..53cc2256cf23 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -328,22 +328,30 @@ void __init cleanup_highmap(void) } } +/* + * Create PTE level page table mapping for physical addresses. + * It returns the last physical address mapped. + */ static unsigned long __meminit -phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end, +phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end, pgprot_t prot) { - unsigned long pages = 0, next; - unsigned long last_map_addr = end; + unsigned long pages = 0, paddr_next; + unsigned long paddr_last = paddr_end; + pte_t *pte; int i; - pte_t *pte = pte_page + pte_index(addr); + pte = pte_page + pte_index(paddr); + i = pte_index(paddr); - for (i = pte_index(addr); i < PTRS_PER_PTE; i++, addr = next, pte++) { - next = (addr & PAGE_MASK) + PAGE_SIZE; - if (addr >= end) { + for (; i < PTRS_PER_PTE; i++, paddr = paddr_next, pte++) { + paddr_next = (paddr & PAGE_MASK) + PAGE_SIZE; + if (paddr >= paddr_end) { if (!after_bootmem && - !e820_any_mapped(addr & PAGE_MASK, next, E820_RAM) && - !e820_any_mapped(addr & PAGE_MASK, next, E820_RESERVED_KERN)) + !e820_any_mapped(paddr & PAGE_MASK, paddr_next, + E820_RAM) && + !e820_any_mapped(paddr & PAGE_MASK, paddr_next, + E820_RESERVED_KERN)) set_pte(pte, __pte(0)); continue; } @@ -354,54 +362,61 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end, * pagetable pages as RO. So assume someone who pre-setup * these mappings are more intelligent. 
*/ - if (pte_val(*pte)) { + if (!pte_none(*pte)) { if (!after_bootmem) pages++; continue; } if (0) - printk(" pte=%p addr=%lx pte=%016lx\n", - pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte); + pr_info(" pte=%p addr=%lx pte=%016lx\n", pte, paddr, + pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL).pte); pages++; - set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, prot)); - last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE; + set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot)); + paddr_last = (paddr & PAGE_MASK) + PAGE_SIZE; } update_page_count(PG_LEVEL_4K, pages); - return last_map_addr; + return paddr_last; } +/* + * Create PMD level page table mapping for physical addresses. The virtual + * and physical address have to be aligned at this level. + * It returns the last physical address mapped. + */ static unsigned long __meminit -phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, +phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end, unsigned long page_size_mask, pgprot_t prot) { - unsigned long pages = 0, next; - unsigned long last_map_addr = end; + unsigned long pages = 0, paddr_next; + unsigned long paddr_last = paddr_end; - int i = pmd_index(address); + int i = pmd_index(paddr); - for (; i < PTRS_PER_PMD; i++, address = next) { - pmd_t *pmd = pmd_page + pmd_index(address); + for (; i < PTRS_PER_PMD; i++, paddr = paddr_next) { + pmd_t *pmd = pmd_page + pmd_index(paddr); pte_t *pte; pgprot_t new_prot = prot; - next = (address & PMD_MASK) + PMD_SIZE; - if (address >= end) { + paddr_next = (paddr & PMD_MASK) + PMD_SIZE; + if (paddr >= paddr_end) { if (!after_bootmem && - !e820_any_mapped(address & PMD_MASK, next, E820_RAM) && - !e820_any_mapped(address & PMD_MASK, next, E820_RESERVED_KERN)) + !e820_any_mapped(paddr & PMD_MASK, paddr_next, + E820_RAM) && + !e820_any_mapped(paddr & PMD_MASK, paddr_next, + E820_RESERVED_KERN)) set_pmd(pmd, __pmd(0)); continue; } - if (pmd_val(*pmd)) { + if (!pmd_none(*pmd)) { if (!pmd_large(*pmd)) { 
spin_lock(&init_mm.page_table_lock); pte = (pte_t *)pmd_page_vaddr(*pmd); - last_map_addr = phys_pte_init(pte, address, - end, prot); + paddr_last = phys_pte_init(pte, paddr, + paddr_end, prot); spin_unlock(&init_mm.page_table_lock); continue; } @@ -420,7 +435,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, if (page_size_mask & (1 << PG_LEVEL_2M)) { if (!after_bootmem) pages++; - last_map_addr = next; + paddr_last = paddr_next; continue; } new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd)); @@ -430,51 +445,65 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, pages++; spin_lock(&init_mm.page_table_lock); set_pte((pte_t *)pmd, - pfn_pte((address & PMD_MASK) >> PAGE_SHIFT, + pfn_pte((paddr & PMD_MASK) >> PAGE_SHIFT, __pgprot(pgprot_val(prot) | _PAGE_PSE))); spin_unlock(&init_mm.page_table_lock); - last_map_addr = next; + paddr_last = paddr_next; continue; } pte = alloc_low_page(); - last_map_addr = phys_pte_init(pte, address, end, new_prot); + paddr_last = phys_pte_init(pte, paddr, paddr_end, new_prot); spin_lock(&init_mm.page_table_lock); pmd_populate_kernel(&init_mm, pmd, pte); spin_unlock(&init_mm.page_table_lock); } update_page_count(PG_LEVEL_2M, pages); - return last_map_addr; + return paddr_last; } +/* + * Create PUD level page table mapping for physical addresses. The virtual + * and physical address do not have to be aligned at this level. KASLR can + * randomize virtual addresses up to this level. + * It returns the last physical address mapped. 
+ */ static unsigned long __meminit -phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, - unsigned long page_size_mask) +phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end, + unsigned long page_size_mask) { - unsigned long pages = 0, next; - unsigned long last_map_addr = end; - int i = pud_index(addr); + unsigned long pages = 0, paddr_next; + unsigned long paddr_last = paddr_end; + unsigned long vaddr = (unsigned long)__va(paddr); + int i = pud_index(vaddr); - for (; i < PTRS_PER_PUD; i++, addr = next) { - pud_t *pud = pud_page + pud_index(addr); + for (; i < PTRS_PER_PUD; i++, paddr = paddr_next) { + pud_t *pud; pmd_t *pmd; pgprot_t prot = PAGE_KERNEL; - next = (addr & PUD_MASK) + PUD_SIZE; - if (addr >= end) { + vaddr = (unsigned long)__va(paddr); + pud = pud_page + pud_index(vaddr); + paddr_next = (paddr & PUD_MASK) + PUD_SIZE; + + if (paddr >= paddr_end) { if (!after_bootmem && - !e820_any_mapped(addr & PUD_MASK, next, E820_RAM) && - !e820_any_mapped(addr & PUD_MASK, next, E820_RESERVED_KERN)) + !e820_any_mapped(paddr & PUD_MASK, paddr_next, + E820_RAM) && + !e820_any_mapped(paddr & PUD_MASK, paddr_next, + E820_RESERVED_KERN)) set_pud(pud, __pud(0)); continue; } - if (pud_val(*pud)) { + if (!pud_none(*pud)) { if (!pud_large(*pud)) { pmd = pmd_offset(pud, 0); - last_map_addr = phys_pmd_init(pmd, addr, end, - page_size_mask, prot); + paddr_last = phys_pmd_init(pmd, paddr, + paddr_end, + page_size_mask, + prot); __flush_tlb_all(); continue; } @@ -493,7 +522,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, if (page_size_mask & (1 << PG_LEVEL_1G)) { if (!after_bootmem) pages++; - last_map_addr = next; + paddr_last = paddr_next; continue; } prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud)); @@ -503,16 +532,16 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, pages++; spin_lock(&init_mm.page_table_lock); set_pte((pte_t *)pud, - pfn_pte((addr & PUD_MASK) >> PAGE_SHIFT, + 
pfn_pte((paddr & PUD_MASK) >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); spin_unlock(&init_mm.page_table_lock); - last_map_addr = next; + paddr_last = paddr_next; continue; } pmd = alloc_low_page(); - last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask, - prot); + paddr_last = phys_pmd_init(pmd, paddr, paddr_end, + page_size_mask, prot); spin_lock(&init_mm.page_table_lock); pud_populate(&init_mm, pud, pmd); @@ -522,38 +551,44 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, update_page_count(PG_LEVEL_1G, pages); - return last_map_addr; + return paddr_last; } +/* + * Create page table mapping for the physical memory for specific physical + * addresses. The virtual and physical addresses have to be aligned on PMD level + * down. It returns the last physical address mapped. + */ unsigned long __meminit -kernel_physical_mapping_init(unsigned long start, - unsigned long end, +kernel_physical_mapping_init(unsigned long paddr_start, + unsigned long paddr_end, unsigned long page_size_mask) { bool pgd_changed = false; - unsigned long next, last_map_addr = end; - unsigned long addr; + unsigned long vaddr, vaddr_start, vaddr_end, vaddr_next, paddr_last; - start = (unsigned long)__va(start); - end = (unsigned long)__va(end); - addr = start; + paddr_last = paddr_end; + vaddr = (unsigned long)__va(paddr_start); + vaddr_end = (unsigned long)__va(paddr_end); + vaddr_start = vaddr; - for (; start < end; start = next) { - pgd_t *pgd = pgd_offset_k(start); + for (; vaddr < vaddr_end; vaddr = vaddr_next) { + pgd_t *pgd = pgd_offset_k(vaddr); pud_t *pud; - next = (start & PGDIR_MASK) + PGDIR_SIZE; + vaddr_next = (vaddr & PGDIR_MASK) + PGDIR_SIZE; if (pgd_val(*pgd)) { pud = (pud_t *)pgd_page_vaddr(*pgd); - last_map_addr = phys_pud_init(pud, __pa(start), - __pa(end), page_size_mask); + paddr_last = phys_pud_init(pud, __pa(vaddr), + __pa(vaddr_end), + page_size_mask); continue; } pud = alloc_low_page(); - last_map_addr = phys_pud_init(pud, __pa(start), __pa(end), - 
page_size_mask); + paddr_last = phys_pud_init(pud, __pa(vaddr), __pa(vaddr_end), + page_size_mask); spin_lock(&init_mm.page_table_lock); pgd_populate(&init_mm, pgd, pud); @@ -562,11 +597,11 @@ kernel_physical_mapping_init(unsigned long start, } if (pgd_changed) - sync_global_pgds(addr, end - 1, 0); + sync_global_pgds(vaddr_start, vaddr_end - 1, 0); __flush_tlb_all(); - return last_map_addr; + return paddr_last; } #ifndef CONFIG_NUMA @@ -673,7 +708,7 @@ static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd) for (i = 0; i < PTRS_PER_PTE; i++) { pte = pte_start + i; - if (pte_val(*pte)) + if (!pte_none(*pte)) return; } @@ -691,7 +726,7 @@ static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud) for (i = 0; i < PTRS_PER_PMD; i++) { pmd = pmd_start + i; - if (pmd_val(*pmd)) + if (!pmd_none(*pmd)) return; } @@ -702,27 +737,6 @@ static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud) spin_unlock(&init_mm.page_table_lock); } -/* Return true if pgd is changed, otherwise return false. 
*/ -static bool __meminit free_pud_table(pud_t *pud_start, pgd_t *pgd) -{ - pud_t *pud; - int i; - - for (i = 0; i < PTRS_PER_PUD; i++) { - pud = pud_start + i; - if (pud_val(*pud)) - return false; - } - - /* free a pud table */ - free_pagetable(pgd_page(*pgd), 0); - spin_lock(&init_mm.page_table_lock); - pgd_clear(pgd); - spin_unlock(&init_mm.page_table_lock); - - return true; -} - static void __meminit remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end, bool direct) @@ -913,7 +927,6 @@ remove_pagetable(unsigned long start, unsigned long end, bool direct) unsigned long addr; pgd_t *pgd; pud_t *pud; - bool pgd_changed = false; for (addr = start; addr < end; addr = next) { next = pgd_addr_end(addr, end); @@ -924,13 +937,8 @@ remove_pagetable(unsigned long start, unsigned long end, bool direct) pud = (pud_t *)pgd_page_vaddr(*pgd); remove_pud_table(pud, addr, next, direct); - if (free_pud_table(pud, pgd)) - pgd_changed = true; } - if (pgd_changed) - sync_global_pgds(start, end - 1, 1); - flush_tlb_all(); } diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 1b1110fa0057..0493c17b8a51 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -54,8 +54,8 @@ static int kasan_die_handler(struct notifier_block *self, void *data) { if (val == DIE_GPF) { - pr_emerg("CONFIG_KASAN_INLINE enabled"); - pr_emerg("GPF could be caused by NULL-ptr deref or user memory access"); + pr_emerg("CONFIG_KASAN_INLINE enabled\n"); + pr_emerg("GPF could be caused by NULL-ptr deref or user memory access\n"); } return NOTIFY_OK; } diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c new file mode 100644 index 000000000000..26dccd6c0df1 --- /dev/null +++ b/arch/x86/mm/kaslr.c @@ -0,0 +1,172 @@ +/* + * This file implements KASLR memory randomization for x86_64. It randomizes + * the virtual address space of kernel memory regions (physical memory + * mapping, vmalloc & vmemmap) for x86_64. 
This security feature mitigates + * exploits relying on predictable kernel addresses. + * + * Entropy is generated using the KASLR early boot functions now shared in + * the lib directory (originally written by Kees Cook). Randomization is + * done on PGD & PUD page table levels to increase possible addresses. The + * physical memory mapping code was adapted to support PUD level virtual + * addresses. This implementation on the best configuration provides 30,000 + * possible virtual addresses on average for each memory region. An additional + * low memory page is used to ensure each CPU can start with a PGD aligned + * virtual address (for realmode). + * + * The order of each memory region is not changed. The feature looks at + * the available space for the regions based on different configuration + * options and randomizes the base and space between each. The size of the + * physical memory mapping is the available physical memory. + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/random.h> + +#include <asm/pgalloc.h> +#include <asm/pgtable.h> +#include <asm/setup.h> +#include <asm/kaslr.h> + +#include "mm_internal.h" + +#define TB_SHIFT 40 + +/* + * Virtual address start and end range for randomization. The end changes based + * on configuration to have the highest amount of space for randomization. + * It increases the possible random position for each randomized region. + * + * You need to add an if/def entry if you introduce a new memory region + * compatible with KASLR. Your entry must be in logical order with memory + * layout. For example, ESPFIX is before EFI because its virtual address is + * before. You also need to add a BUILD_BUG_ON in kernel_randomize_memory to + * ensure that this order is correct and won't be changed. 
+ */ +static const unsigned long vaddr_start = __PAGE_OFFSET_BASE; +static const unsigned long vaddr_end = VMEMMAP_START; + +/* Default values */ +unsigned long page_offset_base = __PAGE_OFFSET_BASE; +EXPORT_SYMBOL(page_offset_base); +unsigned long vmalloc_base = __VMALLOC_BASE; +EXPORT_SYMBOL(vmalloc_base); + +/* + * Memory regions randomized by KASLR (except modules that use a separate logic + * earlier during boot). The list is ordered based on virtual addresses. This + * order is kept after randomization. + */ +static __initdata struct kaslr_memory_region { + unsigned long *base; + unsigned long size_tb; +} kaslr_regions[] = { + { &page_offset_base, 64/* Maximum */ }, + { &vmalloc_base, VMALLOC_SIZE_TB }, +}; + +/* Get size in bytes used by the memory region */ +static inline unsigned long get_padding(struct kaslr_memory_region *region) +{ + return (region->size_tb << TB_SHIFT); +} + +/* + * Apply no randomization if KASLR was disabled at boot or if KASAN + * is enabled. KASAN shadow mappings rely on regions being PGD aligned. + */ +static inline bool kaslr_memory_enabled(void) +{ + return kaslr_enabled() && !config_enabled(CONFIG_KASAN); +} + +/* Initialize base and padding for each memory region randomized with KASLR */ +void __init kernel_randomize_memory(void) +{ + size_t i; + unsigned long vaddr = vaddr_start; + unsigned long rand, memory_tb; + struct rnd_state rand_state; + unsigned long remain_entropy; + + if (!kaslr_memory_enabled()) + return; + + /* + * Update Physical memory mapping to available and + * add padding if needed (especially for memory hotplug support). 
+ */ + BUG_ON(kaslr_regions[0].base != &page_offset_base); + memory_tb = ((max_pfn << PAGE_SHIFT) >> TB_SHIFT) + + CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING; + + /* Adapt physical memory region size based on available memory */ + if (memory_tb < kaslr_regions[0].size_tb) + kaslr_regions[0].size_tb = memory_tb; + + /* Calculate entropy available between regions */ + remain_entropy = vaddr_end - vaddr_start; + for (i = 0; i < ARRAY_SIZE(kaslr_regions); i++) + remain_entropy -= get_padding(&kaslr_regions[i]); + + prandom_seed_state(&rand_state, kaslr_get_random_long("Memory")); + + for (i = 0; i < ARRAY_SIZE(kaslr_regions); i++) { + unsigned long entropy; + + /* + * Select a random virtual address using the extra entropy + * available. + */ + entropy = remain_entropy / (ARRAY_SIZE(kaslr_regions) - i); + prandom_bytes_state(&rand_state, &rand, sizeof(rand)); + entropy = (rand % (entropy + 1)) & PUD_MASK; + vaddr += entropy; + *kaslr_regions[i].base = vaddr; + + /* + * Jump the region and add a minimum padding based on + * randomization alignment. + */ + vaddr += get_padding(&kaslr_regions[i]); + vaddr = round_up(vaddr + 1, PUD_SIZE); + remain_entropy -= entropy; + } +} + +/* + * Create PGD aligned trampoline table to allow real mode initialization + * of additional CPUs. Consume only 1 low memory page. 
+ */ +void __meminit init_trampoline(void) +{ + unsigned long paddr, paddr_next; + pgd_t *pgd; + pud_t *pud_page, *pud_page_tramp; + int i; + + if (!kaslr_memory_enabled()) { + init_trampoline_default(); + return; + } + + pud_page_tramp = alloc_low_page(); + + paddr = 0; + pgd = pgd_offset_k((unsigned long)__va(paddr)); + pud_page = (pud_t *) pgd_page_vaddr(*pgd); + + for (i = pud_index(paddr); i < PTRS_PER_PUD; i++, paddr = paddr_next) { + pud_t *pud, *pud_tramp; + unsigned long vaddr = (unsigned long)__va(paddr); + + pud_tramp = pud_page_tramp + pud_index(paddr); + pud = pud_page + pud_index(vaddr); + paddr_next = (paddr & PUD_MASK) + PUD_SIZE; + + *pud_tramp = *pud; + } + + set_pgd(&trampoline_pgd_entry, + __pgd(_KERNPG_TABLE | __pa(pud_page_tramp))); +} diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index f70c1ff46125..968ac028c34e 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -1,4 +1,5 @@ /* Common code for 32 and 64-bit NUMA */ +#include <linux/acpi.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/string.h> @@ -15,7 +16,6 @@ #include <asm/e820.h> #include <asm/proto.h> #include <asm/dma.h> -#include <asm/acpi.h> #include <asm/amd_nb.h> #include "numa_internal.h" @@ -617,9 +617,7 @@ static void __init numa_init_array(void) if (early_cpu_to_node(i) != NUMA_NO_NODE) continue; numa_set_node(i, rr); - rr = next_node(rr, node_online_map); - if (rr == MAX_NUMNODES) - rr = first_node(node_online_map); + rr = next_node_in(rr, node_online_map); } } diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 7a1f7bbf4105..849dc09fa4f0 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -101,7 +101,8 @@ static inline unsigned long highmap_start_pfn(void) static inline unsigned long highmap_end_pfn(void) { - return __pa_symbol(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT; + /* Do not reference physical address outside the kernel. 
*/ + return __pa_symbol(roundup(_brk_end, PMD_SIZE) - 1) >> PAGE_SHIFT; } #endif @@ -112,6 +113,12 @@ within(unsigned long addr, unsigned long start, unsigned long end) return addr >= start && addr < end; } +static inline int +within_inclusive(unsigned long addr, unsigned long start, unsigned long end) +{ + return addr >= start && addr <= end; +} + /* * Flushing functions */ @@ -746,18 +753,6 @@ static bool try_to_free_pmd_page(pmd_t *pmd) return true; } -static bool try_to_free_pud_page(pud_t *pud) -{ - int i; - - for (i = 0; i < PTRS_PER_PUD; i++) - if (!pud_none(pud[i])) - return false; - - free_page((unsigned long)pud); - return true; -} - static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end) { pte_t *pte = pte_offset_kernel(pmd, start); @@ -871,16 +866,6 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) */ } -static void unmap_pgd_range(pgd_t *root, unsigned long addr, unsigned long end) -{ - pgd_t *pgd_entry = root + pgd_index(addr); - - unmap_pud_range(pgd_entry, addr, end); - - if (try_to_free_pud_page((pud_t *)pgd_page_vaddr(*pgd_entry))) - pgd_clear(pgd_entry); -} - static int alloc_pte_page(pmd_t *pmd) { pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); @@ -1113,7 +1098,12 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr) ret = populate_pud(cpa, addr, pgd_entry, pgprot); if (ret < 0) { - unmap_pgd_range(cpa->pgd, addr, + /* + * Leave the PUD page in place in case some other CPU or thread + * already found it, but remove any useless entries we just + * added to it. 
+ */ + unmap_pud_range(pgd_entry, addr, addr + (cpa->numpages << PAGE_SHIFT)); return ret; } @@ -1185,7 +1175,7 @@ repeat: return __cpa_process_fault(cpa, address, primary); old_pte = *kpte; - if (!pte_val(old_pte)) + if (pte_none(old_pte)) return __cpa_process_fault(cpa, address, primary); if (level == PG_LEVEL_4K) { @@ -1316,7 +1306,8 @@ static int cpa_process_alias(struct cpa_data *cpa) * to touch the high mapped kernel as well: */ if (!within(vaddr, (unsigned long)_text, _brk_end) && - within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn())) { + within_inclusive(cpa->pfn, highmap_start_pfn(), + highmap_end_pfn())) { unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base; alias_cpa = *cpa; @@ -1991,12 +1982,6 @@ out: return retval; } -void kernel_unmap_pages_in_pgd(pgd_t *root, unsigned long address, - unsigned numpages) -{ - unmap_pgd_range(root, address, address + (numpages << PAGE_SHIFT)); -} - /* * The testcases use internal knowledge of the implementation that shouldn't * be exposed to the rest of the kernel. Include these directly here. 
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index fb0604f11eec..db00e3e2f3dc 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -755,11 +755,8 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size) return 1; while (cursor < to) { - if (!devmem_is_allowed(pfn)) { - pr_info("x86/PAT: Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx], PAT prevents it\n", - current->comm, from, to - 1); + if (!devmem_is_allowed(pfn)) return 0; - } cursor += PAGE_SIZE; pfn++; } diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 4eb287e25043..3feec5af4e67 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -6,7 +6,7 @@ #include <asm/fixmap.h> #include <asm/mtrr.h> -#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO +#define PGALLOC_GFP (GFP_KERNEL_ACCOUNT | __GFP_NOTRACK | __GFP_ZERO) #ifdef CONFIG_HIGHPTE #define PGALLOC_USER_GFP __GFP_HIGHMEM @@ -18,7 +18,7 @@ gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP; pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { - return (pte_t *)__get_free_page(PGALLOC_GFP); + return (pte_t *)__get_free_page(PGALLOC_GFP & ~__GFP_ACCOUNT); } pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) @@ -207,9 +207,13 @@ static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[]) { int i; bool failed = false; + gfp_t gfp = PGALLOC_GFP; + + if (mm == &init_mm) + gfp &= ~__GFP_ACCOUNT; for(i = 0; i < PREALLOCATED_PMDS; i++) { - pmd_t *pmd = (pmd_t *)__get_free_page(PGALLOC_GFP); + pmd_t *pmd = (pmd_t *)__get_free_page(gfp); if (!pmd) failed = true; if (pmd && !pgtable_pmd_page_ctor(virt_to_page(pmd))) { diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 75cc0978d45d..e67ae0e6c59d 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -47,7 +47,7 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pteval) return; } pte = pte_offset_kernel(pmd, vaddr); - if (pte_val(pteval)) + 
if (!pte_none(pteval)) set_pte_at(&init_mm, vaddr, pte, pteval); else pte_clear(&init_mm, vaddr, pte); diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index b5f821881465..b1ecff460a46 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -15,8 +15,6 @@ #include <linux/bitmap.h> #include <linux/module.h> #include <linux/topology.h> -#include <linux/bootmem.h> -#include <linux/memblock.h> #include <linux/mm.h> #include <asm/proto.h> #include <asm/numa.h> @@ -24,51 +22,6 @@ #include <asm/apic.h> #include <asm/uv/uv.h> -int acpi_numa __initdata; - -static __init int setup_node(int pxm) -{ - return acpi_map_pxm_to_node(pxm); -} - -static __init void bad_srat(void) -{ - printk(KERN_ERR "SRAT: SRAT not used.\n"); - acpi_numa = -1; -} - -static __init inline int srat_disabled(void) -{ - return acpi_numa < 0; -} - -/* - * Callback for SLIT parsing. pxm_to_node() returns NUMA_NO_NODE for - * I/O localities since SRAT does not list them. I/O localities are - * not supported at this point. 
- */ -void __init acpi_numa_slit_init(struct acpi_table_slit *slit) -{ - int i, j; - - for (i = 0; i < slit->locality_count; i++) { - const int from_node = pxm_to_node(i); - - if (from_node == NUMA_NO_NODE) - continue; - - for (j = 0; j < slit->locality_count; j++) { - const int to_node = pxm_to_node(j); - - if (to_node == NUMA_NO_NODE) - continue; - - numa_set_distance(from_node, to_node, - slit->entry[slit->locality_count * i + j]); - } - } -} - /* Callback for Proximity Domain -> x2APIC mapping */ void __init acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) @@ -91,7 +44,7 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) pxm, apic_id); return; } - node = setup_node(pxm); + node = acpi_map_pxm_to_node(pxm); if (node < 0) { printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm); bad_srat(); @@ -104,7 +57,6 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) } set_apicid_to_node(apic_id, node); node_set(node, numa_nodes_parsed); - acpi_numa = 1; printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", pxm, apic_id, node); } @@ -127,7 +79,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) pxm = pa->proximity_domain_lo; if (acpi_srat_revision >= 2) pxm |= *((unsigned int*)pa->proximity_domain_hi) << 8; - node = setup_node(pxm); + node = acpi_map_pxm_to_node(pxm); if (node < 0) { printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm); bad_srat(); @@ -146,74 +98,10 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) set_apicid_to_node(apic_id, node); node_set(node, numa_nodes_parsed); - acpi_numa = 1; printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", pxm, apic_id, node); } -#ifdef CONFIG_MEMORY_HOTPLUG -static inline int save_add_info(void) {return 1;} -#else -static inline int save_add_info(void) {return 0;} -#endif - -/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ -int __init 
-acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) -{ - u64 start, end; - u32 hotpluggable; - int node, pxm; - - if (srat_disabled()) - goto out_err; - if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) - goto out_err_bad_srat; - if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0) - goto out_err; - hotpluggable = ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE; - if (hotpluggable && !save_add_info()) - goto out_err; - - start = ma->base_address; - end = start + ma->length; - pxm = ma->proximity_domain; - if (acpi_srat_revision <= 1) - pxm &= 0xff; - - node = setup_node(pxm); - if (node < 0) { - printk(KERN_ERR "SRAT: Too many proximity domains.\n"); - goto out_err_bad_srat; - } - - if (numa_add_memblk(node, start, end) < 0) - goto out_err_bad_srat; - - node_set(node, numa_nodes_parsed); - - pr_info("SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]%s%s\n", - node, pxm, - (unsigned long long) start, (unsigned long long) end - 1, - hotpluggable ? " hotplug" : "", - ma->flags & ACPI_SRAT_MEM_NON_VOLATILE ? " non-volatile" : ""); - - /* Mark hotplug range in memblock. 
*/ - if (hotpluggable && memblock_mark_hotplug(start, ma->length)) - pr_warn("SRAT: Failed to mark hotplug range [mem %#010Lx-%#010Lx] in memblock\n", - (unsigned long long)start, (unsigned long long)end - 1); - - max_possible_pfn = max(max_possible_pfn, PFN_UP(end - 1)); - - return 0; -out_err_bad_srat: - bad_srat(); -out_err: - return -1; -} - -void __init acpi_numa_arch_fixup(void) {} - int __init x86_acpi_numa_init(void) { int ret; diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index b2a4e2a61f6b..3cd69832d7f4 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -396,6 +396,7 @@ int __init pci_acpi_init(void) return -ENODEV; printk(KERN_INFO "PCI: Using ACPI for IRQ routing\n"); + acpi_irq_penalty_init(); pcibios_enable_irq = acpi_pci_irq_enable; pcibios_disable_irq = acpi_pci_irq_disable; x86_init.pci.init_irq = x86_init_noop; diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 381a43c40bf7..8196054fedb0 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -516,7 +516,7 @@ void __init pcibios_set_cache_line_size(void) int __init pcibios_init(void) { - if (!raw_pci_ops) { + if (!raw_pci_ops && !raw_pci_ext_ops) { printk(KERN_WARNING "PCI: System does not support PCI\n"); return 0; } diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index b7de1929714b..837ea36a837d 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -552,9 +552,16 @@ static void twinhead_reserve_killing_zone(struct pci_dev *dev) } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x27B9, twinhead_reserve_killing_zone); +/* + * Broadwell EP Home Agent BARs erroneously return non-zero values when read. + * + * See http://www.intel.com/content/www/us/en/processors/xeon/xeon-e5-v4-spec-update.html + * entry BDF2. 
+ */ static void pci_bdwep_bar(struct pci_dev *dev) { dev->non_compliant_bars = 1; } +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6f60, pci_bdwep_bar); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fa0, pci_bdwep_bar); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, pci_bdwep_bar); diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c index 8b93e634af84..5a18aedcb341 100644 --- a/arch/x86/pci/intel_mid_pci.c +++ b/arch/x86/pci/intel_mid_pci.c @@ -36,7 +36,8 @@ #define PCIE_CAP_OFFSET 0x100 /* Quirks for the listed devices */ -#define PCI_DEVICE_ID_INTEL_MRFL_MMC 0x1190 +#define PCI_DEVICE_ID_INTEL_MRFLD_MMC 0x1190 +#define PCI_DEVICE_ID_INTEL_MRFLD_HSU 0x1191 /* Fixed BAR fields */ #define PCIE_VNDR_CAP_ID_FIXED_BAR 0x00 /* Fixed BAR (TBD) */ @@ -225,13 +226,20 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev) /* Special treatment for IRQ0 */ if (dev->irq == 0) { /* + * Skip HS UART common registers device since it has + * IRQ0 assigned and not used by the kernel. + */ + if (dev->device == PCI_DEVICE_ID_INTEL_MRFLD_HSU) + return -EBUSY; + /* * TNG has IRQ0 assigned to eMMC controller. But there * are also other devices with bogus PCI configuration * that have IRQ0 assigned. This check ensures that - * eMMC gets it. + * eMMC gets it. The rest of devices still could be + * enabled without interrupt line being allocated. */ - if (dev->device != PCI_DEVICE_ID_INTEL_MRFL_MMC) - return -EBUSY; + if (dev->device != PCI_DEVICE_ID_INTEL_MRFLD_MMC) + return 0; } break; default: @@ -308,14 +316,39 @@ static void pci_d3delay_fixup(struct pci_dev *dev) } DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, pci_d3delay_fixup); -static void mrst_power_off_unused_dev(struct pci_dev *dev) +static void mid_power_off_one_device(struct pci_dev *dev) { + u16 pmcsr; + + /* + * Update current state first, otherwise PCI core enforces PCI_D0 in + * pci_set_power_state() for devices which status was PCI_UNKNOWN. 
+ */ + pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr); + dev->current_state = (pci_power_t __force)(pmcsr & PCI_PM_CTRL_STATE_MASK); + pci_set_power_state(dev, PCI_D3hot); } -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0801, mrst_power_off_unused_dev); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0809, mrst_power_off_unused_dev); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x080C, mrst_power_off_unused_dev); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0815, mrst_power_off_unused_dev); + +static void mid_power_off_devices(struct pci_dev *dev) +{ + int id; + + if (!pci_soc_mode) + return; + + id = intel_mid_pwr_get_lss_id(dev); + if (id < 0) + return; + + /* + * This sets only PMCSR bits. The actual power off will happen in + * arch/x86/platform/intel-mid/pwr.c. + */ + mid_power_off_one_device(dev); +} + +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, mid_power_off_devices); /* * Langwell devices reside at fixed offsets, don't try to move them. diff --git a/arch/x86/pci/vmd.c b/arch/x86/pci/vmd.c index 7792aba266df..613cac7395c4 100644 --- a/arch/x86/pci/vmd.c +++ b/arch/x86/pci/vmd.c @@ -195,7 +195,7 @@ static int vmd_msi_init(struct irq_domain *domain, struct msi_domain_info *info, vmdirq->virq = virq; irq_domain_set_info(domain, virq, vmdirq->irq->vmd_vector, info->chip, - vmdirq, handle_simple_irq, vmd, NULL); + vmdirq, handle_untracked_irq, vmd, NULL); return 0; } diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 4bd08b0fc8ea..99ddab79215e 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -491,8 +491,11 @@ int __init pci_xen_initial_domain(void) #endif __acpi_register_gsi = acpi_register_gsi_xen; __acpi_unregister_gsi = NULL; - /* Pre-allocate legacy irqs */ - for (irq = 0; irq < nr_legacy_irqs(); irq++) { + /* + * Pre-allocate the legacy IRQs. Use NR_IRQS_LEGACY here + * because we don't have a PIC and thus nr_legacy_irqs() is zero. 
+ */ + for (irq = 0; irq < NR_IRQS_LEGACY; irq++) { int trigger, polarity; if (acpi_get_override_irq(irq, &trigger, &polarity) == -1) diff --git a/arch/x86/platform/atom/punit_atom_debug.c b/arch/x86/platform/atom/punit_atom_debug.c index 81c769e80614..8ff7b9355416 100644 --- a/arch/x86/platform/atom/punit_atom_debug.c +++ b/arch/x86/platform/atom/punit_atom_debug.c @@ -23,10 +23,9 @@ #include <linux/seq_file.h> #include <linux/io.h> #include <asm/cpu_device_id.h> +#include <asm/intel-family.h> #include <asm/iosf_mbi.h> -/* Power gate status reg */ -#define PWRGT_STATUS 0x61 /* Subsystem config/status Video processor */ #define VED_SS_PM0 0x32 /* Subsystem config/status ISP (Image Signal Processor) */ @@ -35,12 +34,16 @@ #define MIO_SS_PM 0x3B /* Shift bits for getting status for video, isp and i/o */ #define SSS_SHIFT 24 + +/* Power gate status reg */ +#define PWRGT_STATUS 0x61 /* Shift bits for getting status for graphics rendering */ #define RENDER_POS 0 /* Shift bits for getting status for media control */ #define MEDIA_POS 2 /* Shift bits for getting status for Valley View/Baytrail display */ #define VLV_DISPLAY_POS 6 + /* Subsystem config/status display for Cherry Trail SOC */ #define CHT_DSP_SSS 0x36 /* Shift bits for getting status for display */ @@ -52,6 +55,14 @@ struct punit_device { int sss_pos; }; +static const struct punit_device punit_device_tng[] = { + { "DISPLAY", CHT_DSP_SSS, SSS_SHIFT }, + { "VED", VED_SS_PM0, SSS_SHIFT }, + { "ISP", ISP_SS_PM0, SSS_SHIFT }, + { "MIO", MIO_SS_PM, SSS_SHIFT }, + { NULL } +}; + static const struct punit_device punit_device_byt[] = { { "GFX RENDER", PWRGT_STATUS, RENDER_POS }, { "GFX MEDIA", PWRGT_STATUS, MEDIA_POS }, @@ -143,8 +154,9 @@ static void punit_dbgfs_unregister(void) (kernel_ulong_t)&drv_data } static const struct x86_cpu_id intel_punit_cpu_ids[] = { - ICPU(55, punit_device_byt), /* Valleyview, Bay Trail */ - ICPU(76, punit_device_cht), /* Braswell, Cherry Trail */ + ICPU(INTEL_FAM6_ATOM_SILVERMONT1, 
punit_device_byt), + ICPU(INTEL_FAM6_ATOM_MERRIFIELD1, punit_device_tng), + ICPU(INTEL_FAM6_ATOM_AIRMONT, punit_device_cht), {} }; diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index f93545e7dc54..17c8bbd4e2f0 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -98,21 +98,6 @@ static efi_status_t __init phys_efi_set_virtual_address_map( return status; } -void efi_get_time(struct timespec *now) -{ - efi_status_t status; - efi_time_t eft; - efi_time_cap_t cap; - - status = efi.get_time(&eft, &cap); - if (status != EFI_SUCCESS) - pr_err("Oops: efitime: can't read time!\n"); - - now->tv_sec = mktime(eft.year, eft.month, eft.day, eft.hour, - eft.minute, eft.second); - now->tv_nsec = 0; -} - void __init efi_find_mirror(void) { efi_memory_desc_t *md; @@ -978,8 +963,6 @@ static void __init __efi_enter_virtual_mode(void) * EFI mixed mode we need all of memory to be accessible when * we pass parameters to the EFI runtime services in the * thunking code. 
- * - * efi_cleanup_page_tables(__pa(new_memmap), 1 << pg_shift); */ free_pages((unsigned long)new_memmap, pg_shift); diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c index 338402b91d2e..cef39b097649 100644 --- a/arch/x86/platform/efi/efi_32.c +++ b/arch/x86/platform/efi/efi_32.c @@ -49,9 +49,6 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) { return 0; } -void __init efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages) -{ -} void __init efi_map_region(efi_memory_desc_t *md) { diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 6e7242be1c87..3e12c44f88a2 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -139,7 +139,7 @@ int __init efi_alloc_page_tables(void) if (efi_enabled(EFI_OLD_MEMMAP)) return 0; - gfp_mask = GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO; + gfp_mask = GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO; efi_pgd = (pgd_t *)__get_free_page(gfp_mask); if (!efi_pgd) return -ENOMEM; @@ -285,11 +285,6 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) return 0; } -void __init efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages) -{ - kernel_unmap_pages_in_pgd(efi_pgd, pa_memmap, num_pages); -} - static void __init __map_region(efi_memory_desc_t *md, u64 va) { unsigned long flags = _PAGE_RW; @@ -466,22 +461,17 @@ extern efi_status_t efi64_thunk(u32, ...); #define efi_thunk(f, ...) 
\ ({ \ efi_status_t __s; \ - unsigned long flags; \ - u32 func; \ - \ - efi_sync_low_kernel_mappings(); \ - local_irq_save(flags); \ + unsigned long __flags; \ + u32 __func; \ \ - efi_scratch.prev_cr3 = read_cr3(); \ - write_cr3((unsigned long)efi_scratch.efi_pgt); \ - __flush_tlb_all(); \ + local_irq_save(__flags); \ + arch_efi_call_virt_setup(); \ \ - func = runtime_service32(f); \ - __s = efi64_thunk(func, __VA_ARGS__); \ + __func = runtime_service32(f); \ + __s = efi64_thunk(__func, __VA_ARGS__); \ \ - write_cr3(efi_scratch.prev_cr3); \ - __flush_tlb_all(); \ - local_irq_restore(flags); \ + arch_efi_call_virt_teardown(); \ + local_irq_restore(__flags); \ \ __s; \ }) diff --git a/arch/x86/platform/efi/efi_stub_64.S b/arch/x86/platform/efi/efi_stub_64.S index 92723aeae0f9..cd95075944ab 100644 --- a/arch/x86/platform/efi/efi_stub_64.S +++ b/arch/x86/platform/efi/efi_stub_64.S @@ -11,7 +11,6 @@ #include <asm/msr.h> #include <asm/processor-flags.h> #include <asm/page_types.h> -#include <asm/frame.h> #define SAVE_XMM \ mov %rsp, %rax; \ @@ -40,10 +39,10 @@ mov (%rsp), %rsp ENTRY(efi_call) - FRAME_BEGIN + pushq %rbp + movq %rsp, %rbp SAVE_XMM - mov (%rsp), %rax - mov 8(%rax), %rax + mov 16(%rbp), %rax subq $48, %rsp mov %r9, 32(%rsp) mov %rax, 40(%rsp) @@ -53,6 +52,6 @@ ENTRY(efi_call) call *%rdi addq $48, %rsp RESTORE_XMM - FRAME_END + popq %rbp ret ENDPROC(efi_call) diff --git a/arch/x86/platform/intel-mid/Makefile b/arch/x86/platform/intel-mid/Makefile index 0ce1b1913673..fa021dfab088 100644 --- a/arch/x86/platform/intel-mid/Makefile +++ b/arch/x86/platform/intel-mid/Makefile @@ -1,4 +1,4 @@ -obj-$(CONFIG_X86_INTEL_MID) += intel-mid.o intel_mid_vrtc.o mfld.o mrfl.o +obj-$(CONFIG_X86_INTEL_MID) += intel-mid.o intel_mid_vrtc.o mfld.o mrfld.o pwr.o # SFI specific code ifdef CONFIG_X86_INTEL_MID diff --git a/arch/x86/platform/intel-mid/device_libs/Makefile b/arch/x86/platform/intel-mid/device_libs/Makefile index 91ec9f8704bf..fc135bf70511 100644 --- 
a/arch/x86/platform/intel-mid/device_libs/Makefile +++ b/arch/x86/platform/intel-mid/device_libs/Makefile @@ -1,3 +1,5 @@ +# Family-Level Interface Shim (FLIS) +obj-$(subst m,y,$(CONFIG_PINCTRL_MERRIFIELD)) += platform_mrfld_pinctrl.o # IPC Devices obj-y += platform_ipc.o obj-$(subst m,y,$(CONFIG_MFD_INTEL_MSIC)) += platform_msic.o @@ -8,14 +10,18 @@ obj-$(subst m,y,$(CONFIG_MFD_INTEL_MSIC)) += platform_msic_battery.o obj-$(subst m,y,$(CONFIG_INTEL_MID_POWER_BUTTON)) += platform_msic_power_btn.o obj-$(subst m,y,$(CONFIG_GPIO_INTEL_PMIC)) += platform_pmic_gpio.o obj-$(subst m,y,$(CONFIG_INTEL_MFLD_THERMAL)) += platform_msic_thermal.o +# SPI Devices +obj-$(subst m,y,$(CONFIG_SPI_SPIDEV)) += platform_spidev.o # I2C Devices obj-$(subst m,y,$(CONFIG_SENSORS_EMC1403)) += platform_emc1403.o obj-$(subst m,y,$(CONFIG_SENSORS_LIS3LV02D)) += platform_lis331.o -obj-$(subst m,y,$(CONFIG_GPIO_PCA953X)) += platform_max7315.o obj-$(subst m,y,$(CONFIG_INPUT_MPU3050)) += platform_mpu3050.o obj-$(subst m,y,$(CONFIG_INPUT_BMA150)) += platform_bma023.o -obj-$(subst m,y,$(CONFIG_GPIO_PCA953X)) += platform_tca6416.o obj-$(subst m,y,$(CONFIG_DRM_MEDFIELD)) += platform_tc35876x.o +# I2C GPIO Expanders +obj-$(subst m,y,$(CONFIG_GPIO_PCA953X)) += platform_max7315.o +obj-$(subst m,y,$(CONFIG_GPIO_PCA953X)) += platform_pcal9555a.o +obj-$(subst m,y,$(CONFIG_GPIO_PCA953X)) += platform_tca6416.o # MISC Devices obj-$(subst m,y,$(CONFIG_KEYBOARD_GPIO)) += platform_gpio_keys.o obj-$(subst m,y,$(CONFIG_INTEL_MID_WATCHDOG)) += platform_wdt.o diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_pinctrl.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_pinctrl.c new file mode 100644 index 000000000000..4de8a664e6a1 --- /dev/null +++ b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_pinctrl.c @@ -0,0 +1,43 @@ +/* + * Intel Merrifield FLIS platform device initialization file + * + * Copyright (C) 2016, Intel Corporation + * + * Author: Andy Shevchenko 
<andriy.shevchenko@linux.intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include <linux/init.h> +#include <linux/ioport.h> +#include <linux/platform_device.h> + +#include <asm/intel-mid.h> + +#define FLIS_BASE_ADDR 0xff0c0000 +#define FLIS_LENGTH 0x8000 + +static struct resource mrfld_pinctrl_mmio_resource = { + .start = FLIS_BASE_ADDR, + .end = FLIS_BASE_ADDR + FLIS_LENGTH - 1, + .flags = IORESOURCE_MEM, +}; + +static struct platform_device mrfld_pinctrl_device = { + .name = "pinctrl-merrifield", + .id = PLATFORM_DEVID_NONE, + .resource = &mrfld_pinctrl_mmio_resource, + .num_resources = 1, +}; + +static int __init mrfld_pinctrl_init(void) +{ + if (intel_mid_identify_cpu() == INTEL_MID_CPU_CHIP_TANGIER) + return platform_device_register(&mrfld_pinctrl_device); + + return -ENODEV; +} +arch_initcall(mrfld_pinctrl_init); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_pcal9555a.c b/arch/x86/platform/intel-mid/device_libs/platform_pcal9555a.c new file mode 100644 index 000000000000..429a94192671 --- /dev/null +++ b/arch/x86/platform/intel-mid/device_libs/platform_pcal9555a.c @@ -0,0 +1,99 @@ +/* + * PCAL9555a platform data initialization file + * + * Copyright (C) 2016, Intel Corporation + * + * Authors: Andy Shevchenko <andriy.shevchenko@linux.intel.com> + * Dan O'Donovan <dan@emutex.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License.
+ */ + +#include <linux/gpio.h> +#include <linux/init.h> +#include <linux/i2c.h> +#include <linux/platform_data/pca953x.h> +#include <linux/sfi.h> + +#include <asm/intel-mid.h> + +#define PCAL9555A_NUM 4 + +static struct pca953x_platform_data pcal9555a_pdata[PCAL9555A_NUM]; +static int nr; + +static void __init *pcal9555a_platform_data(void *info) +{ + struct i2c_board_info *i2c_info = info; + char *type = i2c_info->type; + struct pca953x_platform_data *pcal9555a; + char base_pin_name[SFI_NAME_LEN + 1]; + char intr_pin_name[SFI_NAME_LEN + 1]; + int gpio_base, intr; + + snprintf(base_pin_name, sizeof(base_pin_name), "%s_base", type); + snprintf(intr_pin_name, sizeof(intr_pin_name), "%s_int", type); + + gpio_base = get_gpio_by_name(base_pin_name); + intr = get_gpio_by_name(intr_pin_name); + + /* Check if the SFI record is valid */ + if (gpio_base == -1) + return NULL; + + if (nr >= PCAL9555A_NUM) { + pr_err("%s: Too many instances, only %d supported\n", __func__, + PCAL9555A_NUM); + return NULL; + } + + pcal9555a = &pcal9555a_pdata[nr++]; + pcal9555a->gpio_base = gpio_base; + + if (intr >= 0) { + i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET; + pcal9555a->irq_base = gpio_base + INTEL_MID_IRQ_OFFSET; + } else { + i2c_info->irq = -1; + pcal9555a->irq_base = -1; + } + + strcpy(type, "pcal9555a"); + return pcal9555a; +} + +static const struct devs_id pcal9555a_1_dev_id __initconst = { + .name = "pcal9555a-1", + .type = SFI_DEV_TYPE_I2C, + .delay = 1, + .get_platform_data = &pcal9555a_platform_data, +}; + +static const struct devs_id pcal9555a_2_dev_id __initconst = { + .name = "pcal9555a-2", + .type = SFI_DEV_TYPE_I2C, + .delay = 1, + .get_platform_data = &pcal9555a_platform_data, +}; + +static const struct devs_id pcal9555a_3_dev_id __initconst = { + .name = "pcal9555a-3", + .type = SFI_DEV_TYPE_I2C, + .delay = 1, + .get_platform_data = &pcal9555a_platform_data, +}; + +static const struct devs_id pcal9555a_4_dev_id __initconst = { + .name = "pcal9555a-4", + .type = 
SFI_DEV_TYPE_I2C, + .delay = 1, + .get_platform_data = &pcal9555a_platform_data, +}; + +sfi_device(pcal9555a_1_dev_id); +sfi_device(pcal9555a_2_dev_id); +sfi_device(pcal9555a_3_dev_id); +sfi_device(pcal9555a_4_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_spidev.c b/arch/x86/platform/intel-mid/device_libs/platform_spidev.c new file mode 100644 index 000000000000..30c601b399ee --- /dev/null +++ b/arch/x86/platform/intel-mid/device_libs/platform_spidev.c @@ -0,0 +1,50 @@ +/* + * spidev platform data initialization file + * + * (C) Copyright 2014, 2016 Intel Corporation + * Authors: Andy Shevchenko <andriy.shevchenko@linux.intel.com> + * Dan O'Donovan <dan@emutex.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include <linux/init.h> +#include <linux/sfi.h> +#include <linux/spi/pxa2xx_spi.h> +#include <linux/spi/spi.h> + +#include <asm/intel-mid.h> + +#define MRFLD_SPI_DEFAULT_DMA_BURST 8 +#define MRFLD_SPI_DEFAULT_TIMEOUT 500 + +/* GPIO pin for spidev chipselect */ +#define MRFLD_SPIDEV_GPIO_CS 111 + +static struct pxa2xx_spi_chip spidev_spi_chip = { + .dma_burst_size = MRFLD_SPI_DEFAULT_DMA_BURST, + .timeout = MRFLD_SPI_DEFAULT_TIMEOUT, + .gpio_cs = MRFLD_SPIDEV_GPIO_CS, +}; + +static void __init *spidev_platform_data(void *info) +{ + struct spi_board_info *spi_info = info; + + spi_info->mode = SPI_MODE_0; + spi_info->controller_data = &spidev_spi_chip; + + return NULL; +} + +static const struct devs_id spidev_dev_id __initconst = { + .name = "spidev", + .type = SFI_DEV_TYPE_SPI, + .delay = 0, + .get_platform_data = &spidev_platform_data, +}; + +sfi_device(spidev_dev_id); diff --git a/arch/x86/platform/intel-mid/intel-mid.c b/arch/x86/platform/intel-mid/intel-mid.c index 90bb997ed0a2..abbf49c6e9d3 100644 --- a/arch/x86/platform/intel-mid/intel-mid.c +++ 
b/arch/x86/platform/intel-mid/intel-mid.c @@ -16,6 +16,7 @@ #include <linux/init.h> #include <linux/kernel.h> #include <linux/interrupt.h> +#include <linux/regulator/machine.h> #include <linux/scatterlist.h> #include <linux/sfi.h> #include <linux/irq.h> @@ -144,6 +145,15 @@ static void intel_mid_arch_setup(void) out: if (intel_mid_ops->arch_setup) intel_mid_ops->arch_setup(); + + /* + * Intel MID platforms are using explicitly defined regulators. + * + * Let the regulator core know that we do not have any additional + * regulators left. This lets it substitute unprovided regulators with + * dummy ones: + */ + regulator_has_full_constraints(); } /* MID systems don't have i8042 controller */ diff --git a/arch/x86/platform/intel-mid/mrfl.c b/arch/x86/platform/intel-mid/mrfld.c index bd1adc621781..59253db41bbc 100644 --- a/arch/x86/platform/intel-mid/mrfl.c +++ b/arch/x86/platform/intel-mid/mrfld.c @@ -1,5 +1,5 @@ /* - * mrfl.c: Intel Merrifield platform specific setup code + * Intel Merrifield platform specific setup code * * (C) Copyright 2013 Intel Corporation * diff --git a/arch/x86/platform/intel-mid/pwr.c b/arch/x86/platform/intel-mid/pwr.c new file mode 100644 index 000000000000..5bc90dd102d4 --- /dev/null +++ b/arch/x86/platform/intel-mid/pwr.c @@ -0,0 +1,418 @@ +/* + * Intel MID Power Management Unit (PWRMU) device driver + * + * Copyright (C) 2016, Intel Corporation + * + * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * Intel MID Power Management Unit device driver handles the South Complex PCI + * devices such as GPDMA, SPI, I2C, PWM, and so on. By default PCI core + * modifies bits in PMCSR register in the PCI configuration space. This is not + * enough on some SoCs like Intel Tangier. 
In such case PCI core sets a new + * power state of the device in question through a PM hook registered in struct + * pci_platform_pm_ops (see drivers/pci/pci-mid.c). + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/delay.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/interrupt.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/pci.h> + +#include <asm/intel-mid.h> + +/* Registers */ +#define PM_STS 0x00 +#define PM_CMD 0x04 +#define PM_ICS 0x08 +#define PM_WKC(x) (0x10 + (x) * 4) +#define PM_WKS(x) (0x18 + (x) * 4) +#define PM_SSC(x) (0x20 + (x) * 4) +#define PM_SSS(x) (0x30 + (x) * 4) + +/* Bits in PM_STS */ +#define PM_STS_BUSY (1 << 8) + +/* Bits in PM_CMD */ +#define PM_CMD_CMD(x) ((x) << 0) +#define PM_CMD_IOC (1 << 8) +#define PM_CMD_D3cold (1 << 21) + +/* List of commands */ +#define CMD_SET_CFG 0x01 + +/* Bits in PM_ICS */ +#define PM_ICS_INT_STATUS(x) ((x) & 0xff) +#define PM_ICS_IE (1 << 8) +#define PM_ICS_IP (1 << 9) +#define PM_ICS_SW_INT_STS (1 << 10) + +/* List of interrupts */ +#define INT_INVALID 0 +#define INT_CMD_COMPLETE 1 +#define INT_CMD_ERR 2 +#define INT_WAKE_EVENT 3 +#define INT_LSS_POWER_ERR 4 +#define INT_S0iX_MSG_ERR 5 +#define INT_NO_C6 6 +#define INT_TRIGGER_ERR 7 +#define INT_INACTIVITY 8 + +/* South Complex devices */ +#define LSS_MAX_SHARED_DEVS 4 +#define LSS_MAX_DEVS 64 + +#define LSS_WS_BITS 1 /* wake state width */ +#define LSS_PWS_BITS 2 /* power state width */ + +/* Supported device IDs */ +#define PCI_DEVICE_ID_PENWELL 0x0828 +#define PCI_DEVICE_ID_TANGIER 0x11a1 + +struct mid_pwr_dev { + struct pci_dev *pdev; + pci_power_t state; +}; + +struct mid_pwr { + struct device *dev; + void __iomem *regs; + int irq; + bool available; + + struct mutex lock; + struct mid_pwr_dev lss[LSS_MAX_DEVS][LSS_MAX_SHARED_DEVS]; +}; + +static struct mid_pwr *midpwr; + +static u32 mid_pwr_get_state(struct mid_pwr *pwr, int reg) +{ + return readl(pwr->regs 
+ PM_SSS(reg)); +} + +static void mid_pwr_set_state(struct mid_pwr *pwr, int reg, u32 value) +{ + writel(value, pwr->regs + PM_SSC(reg)); +} + +static void mid_pwr_set_wake(struct mid_pwr *pwr, int reg, u32 value) +{ + writel(value, pwr->regs + PM_WKC(reg)); +} + +static void mid_pwr_interrupt_disable(struct mid_pwr *pwr) +{ + writel(~PM_ICS_IE, pwr->regs + PM_ICS); +} + +static bool mid_pwr_is_busy(struct mid_pwr *pwr) +{ + return !!(readl(pwr->regs + PM_STS) & PM_STS_BUSY); +} + +/* Wait up to 500ms for the latest PWRMU command to finish */ +static int mid_pwr_wait(struct mid_pwr *pwr) +{ + unsigned int count = 500000; + bool busy; + + do { + busy = mid_pwr_is_busy(pwr); + if (!busy) + return 0; + udelay(1); + } while (--count); + + return -EBUSY; +} + +static int mid_pwr_wait_for_cmd(struct mid_pwr *pwr, u8 cmd) +{ + writel(PM_CMD_CMD(cmd), pwr->regs + PM_CMD); + return mid_pwr_wait(pwr); +} + +static int __update_power_state(struct mid_pwr *pwr, int reg, int bit, int new) +{ + int curstate; + u32 power; + int ret; + + /* Check if the device is already in the desired state */ + power = mid_pwr_get_state(pwr, reg); + curstate = (power >> bit) & 3; + if (curstate == new) + return 0; + + /* Update the power state */ + mid_pwr_set_state(pwr, reg, (power & ~(3 << bit)) | (new << bit)); + + /* Send command to SCU */ + ret = mid_pwr_wait_for_cmd(pwr, CMD_SET_CFG); + if (ret) + return ret; + + /* Check that the device has reached the desired state */ + power = mid_pwr_get_state(pwr, reg); + curstate = (power >> bit) & 3; + if (curstate != new) + return -EAGAIN; + + return 0; +} + +static pci_power_t __find_weakest_power_state(struct mid_pwr_dev *lss, + struct pci_dev *pdev, + pci_power_t state) +{ + pci_power_t weakest = PCI_D3hot; + unsigned int j; + + /* Find device in cache or first free cell */ + for (j = 0; j < LSS_MAX_SHARED_DEVS; j++) { + if (lss[j].pdev == pdev || !lss[j].pdev) + break; + } + + /* Store the desired state in cache */ + if (j < LSS_MAX_SHARED_DEVS) { + 
lss[j].pdev = pdev; + lss[j].state = state; + } else { + dev_WARN(&pdev->dev, "No room for device in PWRMU LSS cache\n"); + weakest = state; + } + + /* Find the power state we may use */ + for (j = 0; j < LSS_MAX_SHARED_DEVS; j++) { + if (lss[j].state < weakest) + weakest = lss[j].state; + } + + return weakest; +} + +static int __set_power_state(struct mid_pwr *pwr, struct pci_dev *pdev, + pci_power_t state, int id, int reg, int bit) +{ + const char *name; + int ret; + + state = __find_weakest_power_state(pwr->lss[id], pdev, state); + name = pci_power_name(state); + + ret = __update_power_state(pwr, reg, bit, (__force int)state); + if (ret) { + dev_warn(&pdev->dev, "Can't set power state %s: %d\n", name, ret); + return ret; + } + + dev_vdbg(&pdev->dev, "Set power state %s\n", name); + return 0; +} + +static int mid_pwr_set_power_state(struct mid_pwr *pwr, struct pci_dev *pdev, + pci_power_t state) +{ + int id, reg, bit; + int ret; + + id = intel_mid_pwr_get_lss_id(pdev); + if (id < 0) + return id; + + reg = (id * LSS_PWS_BITS) / 32; + bit = (id * LSS_PWS_BITS) % 32; + + /* We support states between PCI_D0 and PCI_D3hot */ + if (state < PCI_D0) + state = PCI_D0; + if (state > PCI_D3hot) + state = PCI_D3hot; + + mutex_lock(&pwr->lock); + ret = __set_power_state(pwr, pdev, state, id, reg, bit); + mutex_unlock(&pwr->lock); + return ret; +} + +int intel_mid_pci_set_power_state(struct pci_dev *pdev, pci_power_t state) +{ + struct mid_pwr *pwr = midpwr; + int ret = 0; + + might_sleep(); + + if (pwr && pwr->available) + ret = mid_pwr_set_power_state(pwr, pdev, state); + dev_vdbg(&pdev->dev, "set_power_state() returns %d\n", ret); + + return 0; +} +EXPORT_SYMBOL_GPL(intel_mid_pci_set_power_state); + +int intel_mid_pwr_get_lss_id(struct pci_dev *pdev) +{ + int vndr; + u8 id; + + /* + * Mapping to PWRMU index is kept in the Logical SubSystem ID byte of + * Vendor capability. 
+ */ + vndr = pci_find_capability(pdev, PCI_CAP_ID_VNDR); + if (!vndr) + return -EINVAL; + + /* Read the Logical SubSystem ID byte */ + pci_read_config_byte(pdev, vndr + INTEL_MID_PWR_LSS_OFFSET, &id); + if (!(id & INTEL_MID_PWR_LSS_TYPE)) + return -ENODEV; + + id &= ~INTEL_MID_PWR_LSS_TYPE; + if (id >= LSS_MAX_DEVS) + return -ERANGE; + + return id; +} + +static irqreturn_t mid_pwr_irq_handler(int irq, void *dev_id) +{ + struct mid_pwr *pwr = dev_id; + u32 ics; + + ics = readl(pwr->regs + PM_ICS); + if (!(ics & PM_ICS_IP)) + return IRQ_NONE; + + writel(ics | PM_ICS_IP, pwr->regs + PM_ICS); + + dev_warn(pwr->dev, "Unexpected IRQ: %#x\n", PM_ICS_INT_STATUS(ics)); + return IRQ_HANDLED; +} + +struct mid_pwr_device_info { + int (*set_initial_state)(struct mid_pwr *pwr); +}; + +static int mid_pwr_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + struct mid_pwr_device_info *info = (void *)id->driver_data; + struct device *dev = &pdev->dev; + struct mid_pwr *pwr; + int ret; + + ret = pcim_enable_device(pdev); + if (ret < 0) { + dev_err(&pdev->dev, "error: could not enable device\n"); + return ret; + } + + ret = pcim_iomap_regions(pdev, 1 << 0, pci_name(pdev)); + if (ret) { + dev_err(&pdev->dev, "I/O memory remapping failed\n"); + return ret; + } + + pwr = devm_kzalloc(dev, sizeof(*pwr), GFP_KERNEL); + if (!pwr) + return -ENOMEM; + + pwr->dev = dev; + pwr->regs = pcim_iomap_table(pdev)[0]; + pwr->irq = pdev->irq; + + mutex_init(&pwr->lock); + + /* Disable interrupts */ + mid_pwr_interrupt_disable(pwr); + + if (info && info->set_initial_state) { + ret = info->set_initial_state(pwr); + if (ret) + dev_warn(dev, "Can't set initial state: %d\n", ret); + } + + ret = devm_request_irq(dev, pdev->irq, mid_pwr_irq_handler, + IRQF_NO_SUSPEND, pci_name(pdev), pwr); + if (ret) + return ret; + + pwr->available = true; + midpwr = pwr; + + pci_set_drvdata(pdev, pwr); + return 0; +} + +static int mid_set_initial_state(struct mid_pwr *pwr) +{ + unsigned int i, j; + int ret; + 
+ /* + * Enable wake events. + * + * PWRMU supports up to 32 sources for waking up the system. Ungate them + * all here. + */ + mid_pwr_set_wake(pwr, 0, 0xffffffff); + mid_pwr_set_wake(pwr, 1, 0xffffffff); + + /* + * Power off South Complex devices. + * + * There is a map (see a note below) of 64 devices with 2 bits each + * on 32-bit HW registers. The following calls set all devices to one + * known initial state, i.e. PCI_D3hot. This is done in conjunction + * with PMCSR setting in arch/x86/pci/intel_mid_pci.c. + * + * NOTE: The actual device mapping is provided by a platform at run + * time using vendor capability of PCI configuration space. + */ + mid_pwr_set_state(pwr, 0, 0xffffffff); + mid_pwr_set_state(pwr, 1, 0xffffffff); + mid_pwr_set_state(pwr, 2, 0xffffffff); + mid_pwr_set_state(pwr, 3, 0xffffffff); + + /* Send command to SCU */ + ret = mid_pwr_wait_for_cmd(pwr, CMD_SET_CFG); + if (ret) + return ret; + + for (i = 0; i < LSS_MAX_DEVS; i++) { + for (j = 0; j < LSS_MAX_SHARED_DEVS; j++) + pwr->lss[i][j].state = PCI_D3hot; + } + + return 0; +} + +static const struct mid_pwr_device_info mid_info = { + .set_initial_state = mid_set_initial_state, +}; + +static const struct pci_device_id mid_pwr_pci_ids[] = { + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_PENWELL), (kernel_ulong_t)&mid_info }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_TANGIER), (kernel_ulong_t)&mid_info }, + {} +}; +MODULE_DEVICE_TABLE(pci, mid_pwr_pci_ids); + +static struct pci_driver mid_pwr_pci_driver = { + .name = "intel_mid_pwr", + .probe = mid_pwr_probe, + .id_table = mid_pwr_pci_ids, +}; + +builtin_pci_driver(mid_pwr_pci_driver); diff --git a/arch/x86/platform/intel-mid/sfi.c b/arch/x86/platform/intel-mid/sfi.c index 5ee360a951ce..1555672d436f 100644 --- a/arch/x86/platform/intel-mid/sfi.c +++ b/arch/x86/platform/intel-mid/sfi.c @@ -407,6 +407,32 @@ static void __init sfi_handle_i2c_dev(struct sfi_device_table_entry *pentry, i2c_register_board_info(pentry->host_num, &i2c_info, 1); } +static void 
__init sfi_handle_sd_dev(struct sfi_device_table_entry *pentry, + struct devs_id *dev) +{ + struct mid_sd_board_info sd_info; + void *pdata; + + memset(&sd_info, 0, sizeof(sd_info)); + strncpy(sd_info.name, pentry->name, SFI_NAME_LEN); + sd_info.bus_num = pentry->host_num; + sd_info.max_clk = pentry->max_freq; + sd_info.addr = pentry->addr; + pr_debug("SD bus = %d, name = %16.16s, max_clk = %d, addr = 0x%x\n", + sd_info.bus_num, + sd_info.name, + sd_info.max_clk, + sd_info.addr); + pdata = intel_mid_sfi_get_pdata(dev, &sd_info); + if (IS_ERR(pdata)) + return; + + /* Nothing we can do with this for now */ + sd_info.platform_data = pdata; + + pr_debug("Successfully registered %16.16s", sd_info.name); +} + extern struct devs_id *const __x86_intel_mid_dev_start[], *const __x86_intel_mid_dev_end[]; @@ -490,6 +516,9 @@ static int __init sfi_parse_devs(struct sfi_table_header *table) case SFI_DEV_TYPE_I2C: sfi_handle_i2c_dev(pentry, dev); break; + case SFI_DEV_TYPE_SD: + sfi_handle_sd_dev(pentry, dev); + break; case SFI_DEV_TYPE_UART: case SFI_DEV_TYPE_HSI: default: diff --git a/arch/x86/platform/uv/bios_uv.c b/arch/x86/platform/uv/bios_uv.c index 815fec6e05e2..66b2166ea4a1 100644 --- a/arch/x86/platform/uv/bios_uv.c +++ b/arch/x86/platform/uv/bios_uv.c @@ -40,8 +40,7 @@ s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) */ return BIOS_STATUS_UNIMPLEMENTED; - ret = efi_call((void *)__va(tab->function), (u64)which, - a1, a2, a3, a4, a5); + ret = efi_call_virt_pointer(tab, function, (u64)which, a1, a2, a3, a4, a5); return ret; } EXPORT_SYMBOL_GPL(uv_bios_call); diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index d5f64996394a..b12c26e2e309 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -12,6 +12,7 @@ #include <linux/export.h> #include <linux/smp.h> #include <linux/perf_event.h> +#include <linux/tboot.h> #include <asm/pgtable.h> #include <asm/proto.h> @@ -266,6 +267,35 @@ void notrace restore_processor_state(void) 
EXPORT_SYMBOL(restore_processor_state); #endif +#if defined(CONFIG_HIBERNATION) && defined(CONFIG_HOTPLUG_CPU) +static void resume_play_dead(void) +{ + play_dead_common(); + tboot_shutdown(TB_SHUTDOWN_WFS); + hlt_play_dead(); +} + +int hibernate_resume_nonboot_cpu_disable(void) +{ + void (*play_dead)(void) = smp_ops.play_dead; + int ret; + + /* + * Ensure that MONITOR/MWAIT will not be used in the "play dead" loop + * during hibernate image restoration, because it is likely that the + * monitored address will be actually written to at that time and then + * the "dead" CPU will attempt to execute instructions again, but the + * address in its instruction pointer may not be possible to resolve + * any more at that point (the page tables used by it previously may + * have been overwritten by hibernate image data). + */ + smp_ops.play_dead = resume_play_dead; + ret = disable_nonboot_cpus(); + smp_ops.play_dead = play_dead; + return ret; +} +#endif + /* * When bsp_check() is called in hibernate and suspend, cpu hotplug * is disabled already. So it's unnessary to handle race condition between diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index 009947d419a6..f2b5e6a5cf95 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -19,6 +19,7 @@ #include <asm/mtrr.h> #include <asm/sections.h> #include <asm/suspend.h> +#include <asm/tlbflush.h> /* Defined in hibernate_asm_64.S */ extern asmlinkage __visible int restore_image(void); @@ -28,6 +29,7 @@ extern asmlinkage __visible int restore_image(void); * kernel's text (this value is passed in the image header). 
*/ unsigned long restore_jump_address __visible; +unsigned long jump_address_phys; /* * Value of the cr3 register from before the hibernation (this value is passed @@ -37,7 +39,43 @@ unsigned long restore_cr3 __visible; pgd_t *temp_level4_pgt __visible; -void *relocated_restore_code __visible; +unsigned long relocated_restore_code __visible; + +static int set_up_temporary_text_mapping(void) +{ + pmd_t *pmd; + pud_t *pud; + + /* + * The new mapping only has to cover the page containing the image + * kernel's entry point (jump_address_phys), because the switch over to + * it is carried out by relocated code running from a page allocated + * specifically for this purpose and covered by the identity mapping, so + * the temporary kernel text mapping is only needed for the final jump. + * Moreover, in that mapping the virtual address of the image kernel's + * entry point must be the same as its virtual address in the image + * kernel (restore_jump_address), so the image kernel's + * restore_registers() code doesn't find itself in a different area of + * the virtual address space after switching over to the original page + * tables used by the image kernel. 
+ */ + pud = (pud_t *)get_safe_page(GFP_ATOMIC); + if (!pud) + return -ENOMEM; + + pmd = (pmd_t *)get_safe_page(GFP_ATOMIC); + if (!pmd) + return -ENOMEM; + + set_pmd(pmd + pmd_index(restore_jump_address), + __pmd((jump_address_phys & PMD_MASK) | __PAGE_KERNEL_LARGE_EXEC)); + set_pud(pud + pud_index(restore_jump_address), + __pud(__pa(pmd) | _KERNPG_TABLE)); + set_pgd(temp_level4_pgt + pgd_index(restore_jump_address), + __pgd(__pa(pud) | _KERNPG_TABLE)); + + return 0; +} static void *alloc_pgt_page(void *context) { @@ -59,9 +97,10 @@ static int set_up_temporary_mappings(void) if (!temp_level4_pgt) return -ENOMEM; - /* It is safe to reuse the original kernel mapping */ - set_pgd(temp_level4_pgt + pgd_index(__START_KERNEL_map), - init_level4_pgt[pgd_index(__START_KERNEL_map)]); + /* Prepare a temporary mapping for the kernel text */ + result = set_up_temporary_text_mapping(); + if (result) + return result; /* Set up the direct mapping from scratch */ for (i = 0; i < nr_pfn_mapped; i++) { @@ -78,19 +117,50 @@ static int set_up_temporary_mappings(void) return 0; } +static int relocate_restore_code(void) +{ + pgd_t *pgd; + pud_t *pud; + + relocated_restore_code = get_safe_page(GFP_ATOMIC); + if (!relocated_restore_code) + return -ENOMEM; + + memcpy((void *)relocated_restore_code, &core_restore_code, PAGE_SIZE); + + /* Make the page containing the relocated code executable */ + pgd = (pgd_t *)__va(read_cr3()) + pgd_index(relocated_restore_code); + pud = pud_offset(pgd, relocated_restore_code); + if (pud_large(*pud)) { + set_pud(pud, __pud(pud_val(*pud) & ~_PAGE_NX)); + } else { + pmd_t *pmd = pmd_offset(pud, relocated_restore_code); + + if (pmd_large(*pmd)) { + set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_NX)); + } else { + pte_t *pte = pte_offset_kernel(pmd, relocated_restore_code); + + set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_NX)); + } + } + __flush_tlb_all(); + + return 0; +} + int swsusp_arch_resume(void) { int error; /* We have got enough memory and from now on we 
cannot recover */ - if ((error = set_up_temporary_mappings())) + error = set_up_temporary_mappings(); + if (error) return error; - relocated_restore_code = (void *)get_safe_page(GFP_ATOMIC); - if (!relocated_restore_code) - return -ENOMEM; - memcpy(relocated_restore_code, &core_restore_code, - &restore_registers - &core_restore_code); + error = relocate_restore_code(); + if (error) + return error; restore_image(); return 0; @@ -109,11 +179,12 @@ int pfn_is_nosave(unsigned long pfn) struct restore_data_record { unsigned long jump_address; + unsigned long jump_address_phys; unsigned long cr3; unsigned long magic; }; -#define RESTORE_MAGIC 0x0123456789ABCDEFUL +#define RESTORE_MAGIC 0x123456789ABCDEF0UL /** * arch_hibernation_header_save - populate the architecture specific part @@ -126,7 +197,8 @@ int arch_hibernation_header_save(void *addr, unsigned int max_size) if (max_size < sizeof(struct restore_data_record)) return -EOVERFLOW; - rdr->jump_address = restore_jump_address; + rdr->jump_address = (unsigned long)&restore_registers; + rdr->jump_address_phys = __pa_symbol(&restore_registers); rdr->cr3 = restore_cr3; rdr->magic = RESTORE_MAGIC; return 0; @@ -142,6 +214,7 @@ int arch_hibernation_header_restore(void *addr) struct restore_data_record *rdr = addr; restore_jump_address = rdr->jump_address; + jump_address_phys = rdr->jump_address_phys; restore_cr3 = rdr->cr3; return (rdr->magic == RESTORE_MAGIC) ? 
0 : -EINVAL; } diff --git a/arch/x86/power/hibernate_asm_64.S b/arch/x86/power/hibernate_asm_64.S index 4400a43b9e28..3177c2bc26f6 100644 --- a/arch/x86/power/hibernate_asm_64.S +++ b/arch/x86/power/hibernate_asm_64.S @@ -44,9 +44,6 @@ ENTRY(swsusp_arch_suspend) pushfq popq pt_regs_flags(%rax) - /* save the address of restore_registers */ - movq $restore_registers, %rax - movq %rax, restore_jump_address(%rip) /* save cr3 */ movq %cr3, %rax movq %rax, restore_cr3(%rip) @@ -57,31 +54,34 @@ ENTRY(swsusp_arch_suspend) ENDPROC(swsusp_arch_suspend) ENTRY(restore_image) - /* switch to temporary page tables */ - movq $__PAGE_OFFSET, %rdx - movq temp_level4_pgt(%rip), %rax - subq %rdx, %rax - movq %rax, %cr3 - /* Flush TLB */ - movq mmu_cr4_features(%rip), %rax - movq %rax, %rdx - andq $~(X86_CR4_PGE), %rdx - movq %rdx, %cr4; # turn off PGE - movq %cr3, %rcx; # flush TLB - movq %rcx, %cr3; - movq %rax, %cr4; # turn PGE back on - /* prepare to jump to the image kernel */ - movq restore_jump_address(%rip), %rax - movq restore_cr3(%rip), %rbx + movq restore_jump_address(%rip), %r8 + movq restore_cr3(%rip), %r9 + + /* prepare to switch to temporary page tables */ + movq temp_level4_pgt(%rip), %rax + movq mmu_cr4_features(%rip), %rbx /* prepare to copy image data to their original locations */ movq restore_pblist(%rip), %rdx + + /* jump to relocated restore code */ movq relocated_restore_code(%rip), %rcx jmpq *%rcx /* code below has been relocated to a safe page */ ENTRY(core_restore_code) + /* switch to temporary page tables */ + movq $__PAGE_OFFSET, %rcx + subq %rcx, %rax + movq %rax, %cr3 + /* flush TLB */ + movq %rbx, %rcx + andq $~(X86_CR4_PGE), %rcx + movq %rcx, %cr4; # turn off PGE + movq %cr3, %rcx; # flush TLB + movq %rcx, %cr3; + movq %rbx, %cr4; # turn PGE back on .Lloop: testq %rdx, %rdx jz .Ldone @@ -96,24 +96,17 @@ ENTRY(core_restore_code) /* progress to the next pbe */ movq pbe_next(%rdx), %rdx jmp .Lloop + .Ldone: /* jump to the restore_registers address from the 
image header */ - jmpq *%rax - /* - * NOTE: This assumes that the boot kernel's text mapping covers the - * image kernel's page containing restore_registers and the address of - * this page is the same as in the image kernel's text mapping (it - * should always be true, because the text mapping is linear, starting - * from 0, and is supposed to cover the entire kernel text for every - * kernel). - * - * code below belongs to the image kernel - */ + jmpq *%r8 + /* code below belongs to the image kernel */ + .align PAGE_SIZE ENTRY(restore_registers) FRAME_BEGIN /* go back to the original page tables */ - movq %rbx, %cr3 + movq %r9, %cr3 /* Flush TLB, including "global" things (vmalloc) */ movq mmu_cr4_features(%rip), %rax diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index 92e3e1d84c1d..12734a96df47 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -26,7 +26,5 @@ quiet_cmd_bin2c = BIN2C $@ $(obj)/kexec-purgatory.c: $(obj)/purgatory.ro FORCE $(call if_changed,bin2c) - @: - obj-$(CONFIG_KEXEC_FILE) += kexec-purgatory.o diff --git a/arch/x86/ras/mce_amd_inj.c b/arch/x86/ras/mce_amd_inj.c index e69f4701a076..1104515d5ad2 100644 --- a/arch/x86/ras/mce_amd_inj.c +++ b/arch/x86/ras/mce_amd_inj.c @@ -241,6 +241,31 @@ static void toggle_nb_mca_mst_cpu(u16 nid) __func__, PCI_FUNC(F3->devfn), NBCFG); } +static void prepare_msrs(void *info) +{ + struct mce i_mce = *(struct mce *)info; + u8 b = i_mce.bank; + + wrmsrl(MSR_IA32_MCG_STATUS, i_mce.mcgstatus); + + if (boot_cpu_has(X86_FEATURE_SMCA)) { + if (i_mce.inject_flags == DFR_INT_INJ) { + wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(b), i_mce.status); + wrmsrl(MSR_AMD64_SMCA_MCx_DEADDR(b), i_mce.addr); + } else { + wrmsrl(MSR_AMD64_SMCA_MCx_STATUS(b), i_mce.status); + wrmsrl(MSR_AMD64_SMCA_MCx_ADDR(b), i_mce.addr); + } + + wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), i_mce.misc); + } else { + wrmsrl(MSR_IA32_MCx_STATUS(b), i_mce.status); + wrmsrl(MSR_IA32_MCx_ADDR(b), i_mce.addr); + 
wrmsrl(MSR_IA32_MCx_MISC(b), i_mce.misc); + } + +} + static void do_inject(void) { u64 mcg_status = 0; @@ -287,36 +312,9 @@ static void do_inject(void) toggle_hw_mce_inject(cpu, true); - wrmsr_on_cpu(cpu, MSR_IA32_MCG_STATUS, - (u32)mcg_status, (u32)(mcg_status >> 32)); - - if (boot_cpu_has(X86_FEATURE_SMCA)) { - if (inj_type == DFR_INT_INJ) { - wrmsr_on_cpu(cpu, MSR_AMD64_SMCA_MCx_DESTAT(b), - (u32)i_mce.status, (u32)(i_mce.status >> 32)); - - wrmsr_on_cpu(cpu, MSR_AMD64_SMCA_MCx_DEADDR(b), - (u32)i_mce.addr, (u32)(i_mce.addr >> 32)); - } else { - wrmsr_on_cpu(cpu, MSR_AMD64_SMCA_MCx_STATUS(b), - (u32)i_mce.status, (u32)(i_mce.status >> 32)); - - wrmsr_on_cpu(cpu, MSR_AMD64_SMCA_MCx_ADDR(b), - (u32)i_mce.addr, (u32)(i_mce.addr >> 32)); - } - - wrmsr_on_cpu(cpu, MSR_AMD64_SMCA_MCx_MISC(b), - (u32)i_mce.misc, (u32)(i_mce.misc >> 32)); - } else { - wrmsr_on_cpu(cpu, MSR_IA32_MCx_STATUS(b), - (u32)i_mce.status, (u32)(i_mce.status >> 32)); - - wrmsr_on_cpu(cpu, MSR_IA32_MCx_ADDR(b), - (u32)i_mce.addr, (u32)(i_mce.addr >> 32)); - - wrmsr_on_cpu(cpu, MSR_IA32_MCx_MISC(b), - (u32)i_mce.misc, (u32)(i_mce.misc >> 32)); - } + i_mce.mcgstatus = mcg_status; + i_mce.inject_flags = inj_type; + smp_call_function_single(cpu, prepare_msrs, &i_mce, 0); toggle_hw_mce_inject(cpu, false); diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index 0b7a63d98440..705e3fffb4a1 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -8,6 +8,9 @@ struct real_mode_header *real_mode_header; u32 *trampoline_cr4_features; +/* Hold the pgd entry used on booting additional CPUs */ +pgd_t trampoline_pgd_entry; + void __init reserve_real_mode(void) { phys_addr_t mem; @@ -84,7 +87,7 @@ void __init setup_real_mode(void) *trampoline_cr4_features = __read_cr4(); trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd); - trampoline_pgd[0] = init_level4_pgt[pgd_index(__PAGE_OFFSET)].pgd; + trampoline_pgd[0] = trampoline_pgd_entry.pgd; trampoline_pgd[511] = 
init_level4_pgt[511].pgd; #endif } diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index b95964610ea7..c556c5ae8de5 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -59,7 +59,6 @@ OBJCOPYFLAGS_realmode.bin := -O binary targets += realmode.bin $(obj)/realmode.bin: $(obj)/realmode.elf $(obj)/realmode.relocs FORCE $(call if_changed,objcopy) - @: quiet_cmd_relocs = RELOCS $@ cmd_relocs = arch/x86/tools/relocs --realmode $< > $@ diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk index 093a892026f9..a3d2c62fd805 100644 --- a/arch/x86/tools/gen-insn-attr-x86.awk +++ b/arch/x86/tools/gen-insn-attr-x86.awk @@ -72,12 +72,14 @@ BEGIN { lprefix_expr = "\\((66|F2|F3)\\)" max_lprefix = 4 - # All opcodes starting with lower-case 'v' or with (v1) superscript + # All opcodes starting with lower-case 'v', 'k' or with (v1) superscript # accepts VEX prefix - vexok_opcode_expr = "^v.*" + vexok_opcode_expr = "^[vk].*" vexok_expr = "\\(v1\\)" # All opcodes with (v) superscript supports *only* VEX prefix vexonly_expr = "\\(v\\)" + # All opcodes with (ev) superscript supports *only* EVEX prefix + evexonly_expr = "\\(ev\\)" prefix_expr = "\\(Prefix\\)" prefix_num["Operand-Size"] = "INAT_PFX_OPNDSZ" @@ -95,6 +97,7 @@ BEGIN { prefix_num["Address-Size"] = "INAT_PFX_ADDRSZ" prefix_num["VEX+1byte"] = "INAT_PFX_VEX2" prefix_num["VEX+2byte"] = "INAT_PFX_VEX3" + prefix_num["EVEX"] = "INAT_PFX_EVEX" clear_vars() } @@ -319,7 +322,9 @@ function convert_operands(count,opnd, i,j,imm,mod) flags = add_flags(flags, "INAT_MODRM") # check VEX codes - if (match(ext, vexonly_expr)) + if (match(ext, evexonly_expr)) + flags = add_flags(flags, "INAT_VEXOK | INAT_EVEXONLY") + else if (match(ext, vexonly_expr)) flags = add_flags(flags, "INAT_VEXOK | INAT_VEXONLY") else if (match(ext, vexok_expr) || match(opcode, vexok_opcode_expr)) flags = add_flags(flags, "INAT_VEXOK") diff --git a/arch/x86/um/os-Linux/registers.c 
b/arch/x86/um/os-Linux/registers.c index 41bfe84e11ab..00f54a91bb4b 100644 --- a/arch/x86/um/os-Linux/registers.c +++ b/arch/x86/um/os-Linux/registers.c @@ -11,21 +11,56 @@ #endif #include <longjmp.h> #include <sysdep/ptrace_user.h> +#include <sys/uio.h> +#include <asm/sigcontext.h> +#include <linux/elf.h> -int save_fp_registers(int pid, unsigned long *fp_regs) +int have_xstate_support; + +int save_i387_registers(int pid, unsigned long *fp_regs) { if (ptrace(PTRACE_GETFPREGS, pid, 0, fp_regs) < 0) return -errno; return 0; } -int restore_fp_registers(int pid, unsigned long *fp_regs) +int save_fp_registers(int pid, unsigned long *fp_regs) +{ + struct iovec iov; + + if (have_xstate_support) { + iov.iov_base = fp_regs; + iov.iov_len = sizeof(struct _xstate); + if (ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, &iov) < 0) + return -errno; + return 0; + } else { + return save_i387_registers(pid, fp_regs); + } +} + +int restore_i387_registers(int pid, unsigned long *fp_regs) { if (ptrace(PTRACE_SETFPREGS, pid, 0, fp_regs) < 0) return -errno; return 0; } +int restore_fp_registers(int pid, unsigned long *fp_regs) +{ + struct iovec iov; + + if (have_xstate_support) { + iov.iov_base = fp_regs; + iov.iov_len = sizeof(struct _xstate); + if (ptrace(PTRACE_SETREGSET, pid, NT_X86_XSTATE, &iov) < 0) + return -errno; + return 0; + } else { + return restore_i387_registers(pid, fp_regs); + } +} + #ifdef __i386__ int have_fpx_regs = 1; int save_fpx_registers(int pid, unsigned long *fp_regs) @@ -85,6 +120,16 @@ int put_fp_registers(int pid, unsigned long *regs) return restore_fp_registers(pid, regs); } +void arch_init_registers(int pid) +{ + struct _xstate fp_regs; + struct iovec iov; + + iov.iov_base = &fp_regs; + iov.iov_len = sizeof(struct _xstate); + if (ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, &iov) == 0) + have_xstate_support = 1; +} #endif unsigned long get_thread_reg(int reg, jmp_buf *buf) diff --git a/arch/x86/um/ptrace_32.c b/arch/x86/um/ptrace_32.c index 
47c78d5e5c32..ebd4dd6ef73b 100644 --- a/arch/x86/um/ptrace_32.c +++ b/arch/x86/um/ptrace_32.c @@ -194,7 +194,8 @@ static int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *c int err, n, cpu = ((struct thread_info *) child->stack)->cpu; struct user_i387_struct fpregs; - err = save_fp_registers(userspace_pid[cpu], (unsigned long *) &fpregs); + err = save_i387_registers(userspace_pid[cpu], + (unsigned long *) &fpregs); if (err) return err; @@ -214,7 +215,7 @@ static int set_fpregs(struct user_i387_struct __user *buf, struct task_struct *c if (n > 0) return -EFAULT; - return restore_fp_registers(userspace_pid[cpu], + return restore_i387_registers(userspace_pid[cpu], (unsigned long *) &fpregs); } diff --git a/arch/x86/um/ptrace_64.c b/arch/x86/um/ptrace_64.c index a629694ee750..faab418876ce 100644 --- a/arch/x86/um/ptrace_64.c +++ b/arch/x86/um/ptrace_64.c @@ -222,14 +222,14 @@ int is_syscall(unsigned long addr) static int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *child) { int err, n, cpu = ((struct thread_info *) child->stack)->cpu; - long fpregs[HOST_FP_SIZE]; + struct user_i387_struct fpregs; - BUG_ON(sizeof(*buf) != sizeof(fpregs)); - err = save_fp_registers(userspace_pid[cpu], fpregs); + err = save_i387_registers(userspace_pid[cpu], + (unsigned long *) &fpregs); if (err) return err; - n = copy_to_user(buf, fpregs, sizeof(fpregs)); + n = copy_to_user(buf, &fpregs, sizeof(fpregs)); if (n > 0) return -EFAULT; @@ -239,14 +239,14 @@ static int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *c static int set_fpregs(struct user_i387_struct __user *buf, struct task_struct *child) { int n, cpu = ((struct thread_info *) child->stack)->cpu; - long fpregs[HOST_FP_SIZE]; + struct user_i387_struct fpregs; - BUG_ON(sizeof(*buf) != sizeof(fpregs)); - n = copy_from_user(fpregs, buf, sizeof(fpregs)); + n = copy_from_user(&fpregs, buf, sizeof(fpregs)); if (n > 0) return -EFAULT; - return 
restore_fp_registers(userspace_pid[cpu], fpregs); + return restore_i387_registers(userspace_pid[cpu], + (unsigned long *) &fpregs); } long subarch_ptrace(struct task_struct *child, long request, diff --git a/arch/x86/um/shared/sysdep/ptrace_64.h b/arch/x86/um/shared/sysdep/ptrace_64.h index 919789f1071e..0dc223aa1c2d 100644 --- a/arch/x86/um/shared/sysdep/ptrace_64.h +++ b/arch/x86/um/shared/sysdep/ptrace_64.h @@ -57,8 +57,6 @@ #define UPT_SYSCALL_ARG5(r) UPT_R8(r) #define UPT_SYSCALL_ARG6(r) UPT_R9(r) -static inline void arch_init_registers(int pid) -{ -} +extern void arch_init_registers(int pid); #endif diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c index 14fcd01ed992..49e503697022 100644 --- a/arch/x86/um/signal.c +++ b/arch/x86/um/signal.c @@ -225,26 +225,16 @@ static int copy_sc_from_user(struct pt_regs *regs, } else #endif { - struct user_i387_struct fp; - - err = copy_from_user(&fp, (void *)sc.fpstate, - sizeof(struct user_i387_struct)); + err = copy_from_user(regs->regs.fp, (void *)sc.fpstate, + sizeof(struct _xstate)); if (err) return 1; - - err = restore_fp_registers(pid, (unsigned long *) &fp); - if (err < 0) { - printk(KERN_ERR "copy_sc_from_user - " - "restore_fp_registers failed, errno = %d\n", - -err); - return 1; - } } return 0; } static int copy_sc_to_user(struct sigcontext __user *to, - struct _fpstate __user *to_fp, struct pt_regs *regs, + struct _xstate __user *to_fp, struct pt_regs *regs, unsigned long mask) { struct sigcontext sc; @@ -310,25 +300,22 @@ static int copy_sc_to_user(struct sigcontext __user *to, return 1; } - err = convert_fxsr_to_user(to_fp, &fpx); + err = convert_fxsr_to_user(&to_fp->fpstate, &fpx); if (err) return 1; - err |= __put_user(fpx.swd, &to_fp->status); - err |= __put_user(X86_FXSR_MAGIC, &to_fp->magic); + err |= __put_user(fpx.swd, &to_fp->fpstate.status); + err |= __put_user(X86_FXSR_MAGIC, &to_fp->fpstate.magic); if (err) return 1; - if (copy_to_user(&to_fp->_fxsr_env[0], &fpx, + if 
(copy_to_user(&to_fp->fpstate._fxsr_env[0], &fpx, sizeof(struct user_fxsr_struct))) return 1; } else #endif { - struct user_i387_struct fp; - - err = save_fp_registers(pid, (unsigned long *) &fp); - if (copy_to_user(to_fp, &fp, sizeof(struct user_i387_struct))) + if (copy_to_user(to_fp, regs->regs.fp, sizeof(struct _xstate))) return 1; } @@ -337,7 +324,7 @@ static int copy_sc_to_user(struct sigcontext __user *to, #ifdef CONFIG_X86_32 static int copy_ucontext_to_user(struct ucontext __user *uc, - struct _fpstate __user *fp, sigset_t *set, + struct _xstate __user *fp, sigset_t *set, unsigned long sp) { int err = 0; @@ -353,7 +340,7 @@ struct sigframe char __user *pretcode; int sig; struct sigcontext sc; - struct _fpstate fpstate; + struct _xstate fpstate; unsigned long extramask[_NSIG_WORDS-1]; char retcode[8]; }; @@ -366,7 +353,7 @@ struct rt_sigframe void __user *puc; struct siginfo info; struct ucontext uc; - struct _fpstate fpstate; + struct _xstate fpstate; char retcode[8]; }; @@ -495,7 +482,7 @@ struct rt_sigframe char __user *pretcode; struct ucontext uc; struct siginfo info; - struct _fpstate fpstate; + struct _xstate fpstate; }; int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig, diff --git a/arch/x86/um/user-offsets.c b/arch/x86/um/user-offsets.c index 470564bbd08e..cb3c22370cf5 100644 --- a/arch/x86/um/user-offsets.c +++ b/arch/x86/um/user-offsets.c @@ -50,7 +50,7 @@ void foo(void) DEFINE(HOST_GS, GS); DEFINE(HOST_ORIG_AX, ORIG_EAX); #else - DEFINE(HOST_FP_SIZE, sizeof(struct _fpstate) / sizeof(unsigned long)); + DEFINE(HOST_FP_SIZE, sizeof(struct _xstate) / sizeof(unsigned long)); DEFINE_LONGS(HOST_BX, RBX); DEFINE_LONGS(HOST_CX, RCX); DEFINE_LONGS(HOST_DI, RDI); diff --git a/arch/x86/um/vdso/vma.c b/arch/x86/um/vdso/vma.c index 237c6831e095..6be22f991b59 100644 --- a/arch/x86/um/vdso/vma.c +++ b/arch/x86/um/vdso/vma.c @@ -61,7 +61,8 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) if (!vdso_enabled) 
return 0; - down_write(&mm->mmap_sem); + if (down_write_killable(&mm->mmap_sem)) + return -EINTR; err = install_special_mapping(mm, um_vdso_addr, PAGE_SIZE, VM_READ|VM_EXEC| diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index db52a7fafcc2..44c88ad1841a 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -177,7 +177,6 @@ static struct apic xen_pv_apic = { .get_apic_id = xen_get_apic_id, .set_apic_id = xen_set_apic_id, /* Can be NULL on 32-bit. */ - .apic_id_mask = 0xFF << 24, /* Used by verify_local_APIC. Match with what xen_get_apic_id does. */ .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, diff --git a/arch/x86/xen/efi.c b/arch/x86/xen/efi.c index be14cc3e48d5..3be012115853 100644 --- a/arch/x86/xen/efi.c +++ b/arch/x86/xen/efi.c @@ -20,10 +20,121 @@ #include <linux/init.h> #include <linux/string.h> +#include <xen/xen.h> #include <xen/xen-ops.h> +#include <xen/interface/platform.h> #include <asm/page.h> #include <asm/setup.h> +#include <asm/xen/hypercall.h> + +static efi_char16_t vendor[100] __initdata; + +static efi_system_table_t efi_systab_xen __initdata = { + .hdr = { + .signature = EFI_SYSTEM_TABLE_SIGNATURE, + .revision = 0, /* Initialized later. */ + .headersize = 0, /* Ignored by Linux Kernel. */ + .crc32 = 0, /* Ignored by Linux Kernel. */ + .reserved = 0 + }, + .fw_vendor = EFI_INVALID_TABLE_ADDR, /* Initialized later. */ + .fw_revision = 0, /* Initialized later. */ + .con_in_handle = EFI_INVALID_TABLE_ADDR, /* Not used under Xen. */ + .con_in = EFI_INVALID_TABLE_ADDR, /* Not used under Xen. */ + .con_out_handle = EFI_INVALID_TABLE_ADDR, /* Not used under Xen. */ + .con_out = EFI_INVALID_TABLE_ADDR, /* Not used under Xen. */ + .stderr_handle = EFI_INVALID_TABLE_ADDR, /* Not used under Xen. */ + .stderr = EFI_INVALID_TABLE_ADDR, /* Not used under Xen. */ + .runtime = (efi_runtime_services_t *)EFI_INVALID_TABLE_ADDR, + /* Not used under Xen. */ + .boottime = (efi_boot_services_t *)EFI_INVALID_TABLE_ADDR, + /* Not used under Xen. 
*/ + .nr_tables = 0, /* Initialized later. */ + .tables = EFI_INVALID_TABLE_ADDR /* Initialized later. */ +}; + +static const struct efi efi_xen __initconst = { + .systab = NULL, /* Initialized later. */ + .runtime_version = 0, /* Initialized later. */ + .mps = EFI_INVALID_TABLE_ADDR, + .acpi = EFI_INVALID_TABLE_ADDR, + .acpi20 = EFI_INVALID_TABLE_ADDR, + .smbios = EFI_INVALID_TABLE_ADDR, + .smbios3 = EFI_INVALID_TABLE_ADDR, + .sal_systab = EFI_INVALID_TABLE_ADDR, + .boot_info = EFI_INVALID_TABLE_ADDR, + .hcdp = EFI_INVALID_TABLE_ADDR, + .uga = EFI_INVALID_TABLE_ADDR, + .uv_systab = EFI_INVALID_TABLE_ADDR, + .fw_vendor = EFI_INVALID_TABLE_ADDR, + .runtime = EFI_INVALID_TABLE_ADDR, + .config_table = EFI_INVALID_TABLE_ADDR, + .get_time = xen_efi_get_time, + .set_time = xen_efi_set_time, + .get_wakeup_time = xen_efi_get_wakeup_time, + .set_wakeup_time = xen_efi_set_wakeup_time, + .get_variable = xen_efi_get_variable, + .get_next_variable = xen_efi_get_next_variable, + .set_variable = xen_efi_set_variable, + .query_variable_info = xen_efi_query_variable_info, + .update_capsule = xen_efi_update_capsule, + .query_capsule_caps = xen_efi_query_capsule_caps, + .get_next_high_mono_count = xen_efi_get_next_high_mono_count, + .reset_system = NULL, /* Functionality provided by Xen. */ + .set_virtual_address_map = NULL, /* Not used under Xen. */ + .flags = 0 /* Initialized later. */ +}; + +static efi_system_table_t __init *xen_efi_probe(void) +{ + struct xen_platform_op op = { + .cmd = XENPF_firmware_info, + .u.firmware_info = { + .type = XEN_FW_EFI_INFO, + .index = XEN_FW_EFI_CONFIG_TABLE + } + }; + union xenpf_efi_info *info = &op.u.firmware_info.u.efi_info; + + if (!xen_initial_domain() || HYPERVISOR_platform_op(&op) < 0) + return NULL; + + /* Here we know that Xen runs on EFI platform. 
*/ + + efi = efi_xen; + + efi_systab_xen.tables = info->cfg.addr; + efi_systab_xen.nr_tables = info->cfg.nent; + + op.cmd = XENPF_firmware_info; + op.u.firmware_info.type = XEN_FW_EFI_INFO; + op.u.firmware_info.index = XEN_FW_EFI_VENDOR; + info->vendor.bufsz = sizeof(vendor); + set_xen_guest_handle(info->vendor.name, vendor); + + if (HYPERVISOR_platform_op(&op) == 0) { + efi_systab_xen.fw_vendor = __pa_symbol(vendor); + efi_systab_xen.fw_revision = info->vendor.revision; + } else + efi_systab_xen.fw_vendor = __pa_symbol(L"UNKNOWN"); + + op.cmd = XENPF_firmware_info; + op.u.firmware_info.type = XEN_FW_EFI_INFO; + op.u.firmware_info.index = XEN_FW_EFI_VERSION; + + if (HYPERVISOR_platform_op(&op) == 0) + efi_systab_xen.hdr.revision = info->version; + + op.cmd = XENPF_firmware_info; + op.u.firmware_info.type = XEN_FW_EFI_INFO; + op.u.firmware_info.index = XEN_FW_EFI_RT_VERSION; + + if (HYPERVISOR_platform_op(&op) == 0) + efi.runtime_version = info->version; + + return &efi_systab_xen; +} void __init xen_efi_init(void) { diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 760789ae8562..69b4b6d29738 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -59,6 +59,7 @@ #include <asm/xen/pci.h> #include <asm/xen/hypercall.h> #include <asm/xen/hypervisor.h> +#include <asm/xen/cpuid.h> #include <asm/fixmap.h> #include <asm/processor.h> #include <asm/proto.h> @@ -118,6 +119,10 @@ DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); */ DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); +/* Linux <-> Xen vCPU id mapping */ +DEFINE_PER_CPU(int, xen_vcpu_id) = -1; +EXPORT_PER_CPU_SYMBOL(xen_vcpu_id); + enum xen_domain_type xen_domain_type = XEN_NATIVE; EXPORT_SYMBOL_GPL(xen_domain_type); @@ -179,7 +184,7 @@ static void clamp_max_cpus(void) #endif } -static void xen_vcpu_setup(int cpu) +void xen_vcpu_setup(int cpu) { struct vcpu_register_vcpu_info info; int err; @@ -202,8 +207,9 @@ static void xen_vcpu_setup(int cpu) if (per_cpu(xen_vcpu, cpu) == 
&per_cpu(xen_vcpu_info, cpu)) return; } - if (cpu < MAX_VIRT_CPUS) - per_cpu(xen_vcpu,cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; + if (xen_vcpu_nr(cpu) < MAX_VIRT_CPUS) + per_cpu(xen_vcpu, cpu) = + &HYPERVISOR_shared_info->vcpu_info[xen_vcpu_nr(cpu)]; if (!have_vcpu_info_placement) { if (cpu >= MAX_VIRT_CPUS) @@ -223,7 +229,8 @@ static void xen_vcpu_setup(int cpu) hypervisor has no unregister variant and this hypercall does not allow to over-write info.mfn and info.offset. */ - err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info); + err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, xen_vcpu_nr(cpu), + &info); if (err) { printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err); @@ -247,10 +254,11 @@ void xen_vcpu_restore(void) for_each_possible_cpu(cpu) { bool other_cpu = (cpu != smp_processor_id()); - bool is_up = HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL); + bool is_up = HYPERVISOR_vcpu_op(VCPUOP_is_up, xen_vcpu_nr(cpu), + NULL); if (other_cpu && is_up && - HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL)) + HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(cpu), NULL)) BUG(); xen_setup_runstate_info(cpu); @@ -259,7 +267,7 @@ void xen_vcpu_restore(void) xen_vcpu_setup(cpu); if (other_cpu && is_up && - HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL)) + HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL)) BUG(); } } @@ -521,9 +529,7 @@ static void set_aliased_prot(void *v, pgprot_t prot) preempt_disable(); - pagefault_disable(); /* Avoid warnings due to being atomic. 
*/ - __get_user(dummy, (unsigned char __user __force *)v); - pagefault_enable(); + probe_kernel_read(&dummy, v, 1); if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0)) BUG(); @@ -590,7 +596,7 @@ static void xen_load_gdt(const struct desc_ptr *dtr) { unsigned long va = dtr->address; unsigned int size = dtr->size + 1; - unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE; + unsigned pages = DIV_ROUND_UP(size, PAGE_SIZE); unsigned long frames[pages]; int f; @@ -639,7 +645,7 @@ static void __init xen_load_gdt_boot(const struct desc_ptr *dtr) { unsigned long va = dtr->address; unsigned int size = dtr->size + 1; - unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE; + unsigned pages = DIV_ROUND_UP(size, PAGE_SIZE); unsigned long frames[pages]; int f; @@ -1137,8 +1143,11 @@ void xen_setup_vcpu_info_placement(void) { int cpu; - for_each_possible_cpu(cpu) + for_each_possible_cpu(cpu) { + /* Set up direct vCPU id mapping for PV guests. */ + per_cpu(xen_vcpu_id, cpu) = cpu; xen_vcpu_setup(cpu); + } /* xen_vcpu_setup managed to place the vcpu_info within the * percpu area for all cpus, so make use of it. Note that for @@ -1729,6 +1738,9 @@ asmlinkage __visible void __init xen_start_kernel(void) #endif xen_raw_console_write("about to get started...\n"); + /* Let's presume PV guests always boot on vCPU with id 0. */ + per_cpu(xen_vcpu_id, 0) = 0; + xen_setup_runstate_info(0); xen_efi_init(); @@ -1770,9 +1782,10 @@ void __ref xen_hvm_init_shared_info(void) * in that case multiple vcpus might be online. */ for_each_online_cpu(cpu) { /* Leave it to be NULL. 
*/ - if (cpu >= MAX_VIRT_CPUS) + if (xen_vcpu_nr(cpu) >= MAX_VIRT_CPUS) continue; - per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; + per_cpu(xen_vcpu, cpu) = + &HYPERVISOR_shared_info->vcpu_info[xen_vcpu_nr(cpu)]; } } @@ -1797,6 +1810,12 @@ static void __init init_hvm_pv_info(void) xen_setup_features(); + cpuid(base + 4, &eax, &ebx, &ecx, &edx); + if (eax & XEN_HVM_CPUID_VCPU_ID_PRESENT) + this_cpu_write(xen_vcpu_id, ebx); + else + this_cpu_write(xen_vcpu_id, smp_processor_id()); + pv_info.name = "Xen HVM"; xen_domain_type = XEN_HVM_DOMAIN; @@ -1808,6 +1827,10 @@ static int xen_hvm_cpu_notify(struct notifier_block *self, unsigned long action, int cpu = (long)hcpu; switch (action) { case CPU_UP_PREPARE: + if (cpu_acpi_id(cpu) != U32_MAX) + per_cpu(xen_vcpu_id, cpu) = cpu_acpi_id(cpu); + else + per_cpu(xen_vcpu_id, cpu) = cpu; xen_vcpu_setup(cpu); if (xen_have_vector_callback) { if (xen_feature(XENFEAT_hvm_safe_pvclock)) diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c index e079500b17f3..de4144c24f1c 100644 --- a/arch/x86/xen/grant-table.c +++ b/arch/x86/xen/grant-table.c @@ -111,63 +111,18 @@ int arch_gnttab_init(unsigned long nr_shared) } #ifdef CONFIG_XEN_PVH -#include <xen/balloon.h> #include <xen/events.h> -#include <linux/slab.h> -static int __init xlated_setup_gnttab_pages(void) -{ - struct page **pages; - xen_pfn_t *pfns; - void *vaddr; - int rc; - unsigned int i; - unsigned long nr_grant_frames = gnttab_max_grant_frames(); - - BUG_ON(nr_grant_frames == 0); - pages = kcalloc(nr_grant_frames, sizeof(pages[0]), GFP_KERNEL); - if (!pages) - return -ENOMEM; - - pfns = kcalloc(nr_grant_frames, sizeof(pfns[0]), GFP_KERNEL); - if (!pfns) { - kfree(pages); - return -ENOMEM; - } - rc = alloc_xenballooned_pages(nr_grant_frames, pages); - if (rc) { - pr_warn("%s Couldn't balloon alloc %ld pfns rc:%d\n", __func__, - nr_grant_frames, rc); - kfree(pages); - kfree(pfns); - return rc; - } - for (i = 0; i < nr_grant_frames; i++) - 
pfns[i] = page_to_pfn(pages[i]); - - vaddr = vmap(pages, nr_grant_frames, 0, PAGE_KERNEL); - if (!vaddr) { - pr_warn("%s Couldn't map %ld pfns rc:%d\n", __func__, - nr_grant_frames, rc); - free_xenballooned_pages(nr_grant_frames, pages); - kfree(pages); - kfree(pfns); - return -ENOMEM; - } - kfree(pages); - - xen_auto_xlat_grant_frames.pfn = pfns; - xen_auto_xlat_grant_frames.count = nr_grant_frames; - xen_auto_xlat_grant_frames.vaddr = vaddr; - - return 0; -} - +#include <xen/xen-ops.h> static int __init xen_pvh_gnttab_setup(void) { if (!xen_pvh_domain()) return -ENODEV; - return xlated_setup_gnttab_pages(); + xen_auto_xlat_grant_frames.count = gnttab_max_grant_frames(); + + return xen_xlate_map_ballooned_pages(&xen_auto_xlat_grant_frames.pfn, + &xen_auto_xlat_grant_frames.vaddr, + xen_auto_xlat_grant_frames.count); } /* Call it _before_ __gnttab_init as we need to initialize the * xen_auto_xlat_grant_frames first. */ diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index a1207cb6472a..33e92955e09d 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -109,7 +109,8 @@ static void xen_safe_halt(void) static void xen_halt(void) { if (irqs_disabled()) - HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); + HYPERVISOR_vcpu_op(VCPUOP_down, + xen_vcpu_nr(smp_processor_id()), NULL); else xen_safe_halt(); } diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 478a2de543a5..67433714b791 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1113,7 +1113,7 @@ static void __init xen_cleanhighmap(unsigned long vaddr, /* NOTE: The loop is more greedy than the cleanup_highmap variant. * We include the PMD passed in on _both_ boundaries. 
*/ - for (; vaddr <= vaddr_end && (pmd < (level2_kernel_pgt + PAGE_SIZE)); + for (; vaddr <= vaddr_end && (pmd < (level2_kernel_pgt + PTRS_PER_PMD)); pmd++, vaddr += PMD_SIZE) { if (pmd_none(*pmd)) continue; @@ -1551,41 +1551,6 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) #endif } -#ifdef CONFIG_X86_32 -static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) -{ - /* If there's an existing pte, then don't allow _PAGE_RW to be set */ - if (pte_val_ma(*ptep) & _PAGE_PRESENT) - pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) & - pte_val_ma(pte)); - - return pte; -} -#else /* CONFIG_X86_64 */ -static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) -{ - unsigned long pfn; - - if (xen_feature(XENFEAT_writable_page_tables) || - xen_feature(XENFEAT_auto_translated_physmap) || - xen_start_info->mfn_list >= __START_KERNEL_map) - return pte; - - /* - * Pages belonging to the initial p2m list mapped outside the default - * address range must be mapped read-only. This region contains the - * page tables for mapping the p2m list, too, and page tables MUST be - * mapped read-only. - */ - pfn = pte_pfn(pte); - if (pfn >= xen_start_info->first_p2m_pfn && - pfn < xen_start_info->first_p2m_pfn + xen_start_info->nr_p2m_frames) - pte = __pte_ma(pte_val_ma(pte) & ~_PAGE_RW); - - return pte; -} -#endif /* CONFIG_X86_64 */ - /* * Init-time set_pte while constructing initial pagetables, which * doesn't allow RO page table pages to be remapped RW. @@ -1600,13 +1565,37 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) * so always write the PTE directly and rely on Xen trapping and * emulating any updates as necessary. 
*/ -static void __init xen_set_pte_init(pte_t *ptep, pte_t pte) +__visible pte_t xen_make_pte_init(pteval_t pte) { - if (pte_mfn(pte) != INVALID_P2M_ENTRY) - pte = mask_rw_pte(ptep, pte); - else - pte = __pte_ma(0); +#ifdef CONFIG_X86_64 + unsigned long pfn; + + /* + * Pages belonging to the initial p2m list mapped outside the default + * address range must be mapped read-only. This region contains the + * page tables for mapping the p2m list, too, and page tables MUST be + * mapped read-only. + */ + pfn = (pte & PTE_PFN_MASK) >> PAGE_SHIFT; + if (xen_start_info->mfn_list < __START_KERNEL_map && + pfn >= xen_start_info->first_p2m_pfn && + pfn < xen_start_info->first_p2m_pfn + xen_start_info->nr_p2m_frames) + pte &= ~_PAGE_RW; +#endif + pte = pte_pfn_to_mfn(pte); + return native_make_pte(pte); +} +PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_init); +static void __init xen_set_pte_init(pte_t *ptep, pte_t pte) +{ +#ifdef CONFIG_X86_32 + /* If there's an existing pte, then don't allow _PAGE_RW to be set */ + if (pte_mfn(pte) != INVALID_P2M_ENTRY + && pte_val_ma(*ptep) & _PAGE_PRESENT) + pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) & + pte_val_ma(pte)); +#endif native_set_pte(ptep, pte); } @@ -2407,6 +2396,7 @@ static void __init xen_post_allocator_init(void) pv_mmu_ops.alloc_pud = xen_alloc_pud; pv_mmu_ops.release_pud = xen_release_pud; #endif + pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte); #ifdef CONFIG_X86_64 pv_mmu_ops.write_cr3 = &xen_write_cr3; @@ -2455,7 +2445,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { .pte_val = PV_CALLEE_SAVE(xen_pte_val), .pgd_val = PV_CALLEE_SAVE(xen_pgd_val), - .make_pte = PV_CALLEE_SAVE(xen_make_pte), + .make_pte = PV_CALLEE_SAVE(xen_make_pte_init), .make_pgd = PV_CALLEE_SAVE(xen_make_pgd), #ifdef CONFIG_X86_PAE diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index cab9f766bb06..dd2a49a8aacc 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -182,7 +182,7 @@ static void * __ref 
alloc_p2m_page(void) if (unlikely(!slab_is_available())) return alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE); - return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT); + return (void *)__get_free_page(GFP_KERNEL); } static void __ref free_p2m_page(void *p) diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c index 9466354d3e49..32bdc2c90297 100644 --- a/arch/x86/xen/pmu.c +++ b/arch/x86/xen/pmu.c @@ -547,7 +547,7 @@ void xen_pmu_init(int cpu) return; fail: - pr_warn_once("Could not initialize VPMU for cpu %d, error %d\n", + pr_info_once("Could not initialize VPMU for cpu %d, error %d\n", cpu, err); free_pages((unsigned long)xenpmu_data, 0); } diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 7ab29518a3b9..e345891450c3 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -393,6 +393,9 @@ static unsigned long __init xen_set_identity_and_remap_chunk( unsigned long i = 0; unsigned long n = end_pfn - start_pfn; + if (remap_pfn == 0) + remap_pfn = nr_pages; + while (i < n) { unsigned long cur_pfn = start_pfn + i; unsigned long left = n - i; @@ -438,17 +441,29 @@ static unsigned long __init xen_set_identity_and_remap_chunk( return remap_pfn; } -static void __init xen_set_identity_and_remap(unsigned long nr_pages) +static unsigned long __init xen_count_remap_pages( + unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages, + unsigned long remap_pages) +{ + if (start_pfn >= nr_pages) + return remap_pages; + + return remap_pages + min(end_pfn, nr_pages) - start_pfn; +} + +static unsigned long __init xen_foreach_remap_area(unsigned long nr_pages, + unsigned long (*func)(unsigned long start_pfn, unsigned long end_pfn, + unsigned long nr_pages, unsigned long last_val)) { phys_addr_t start = 0; - unsigned long last_pfn = nr_pages; + unsigned long ret_val = 0; const struct e820entry *entry = xen_e820_map; int i; /* * Combine non-RAM regions and gaps until a RAM region (or the - * end of the map) is reached, then set the 1:1 map and - * remap 
the memory in those non-RAM regions. + * end of the map) is reached, then call the provided function + * to perform its duty on the non-RAM region. * * The combined non-RAM regions are rounded to a whole number * of pages so any partial pages are accessible via the 1:1 @@ -466,14 +481,13 @@ static void __init xen_set_identity_and_remap(unsigned long nr_pages) end_pfn = PFN_UP(entry->addr); if (start_pfn < end_pfn) - last_pfn = xen_set_identity_and_remap_chunk( - start_pfn, end_pfn, nr_pages, - last_pfn); + ret_val = func(start_pfn, end_pfn, nr_pages, + ret_val); start = end; } } - pr_info("Released %ld page(s)\n", xen_released_pages); + return ret_val; } /* @@ -596,35 +610,6 @@ static void __init xen_ignore_unusable(void) } } -static unsigned long __init xen_count_remap_pages(unsigned long max_pfn) -{ - unsigned long extra = 0; - unsigned long start_pfn, end_pfn; - const struct e820entry *entry = xen_e820_map; - int i; - - end_pfn = 0; - for (i = 0; i < xen_e820_map_entries; i++, entry++) { - start_pfn = PFN_DOWN(entry->addr); - /* Adjacent regions on non-page boundaries handling! */ - end_pfn = min(end_pfn, start_pfn); - - if (start_pfn >= max_pfn) - return extra + max_pfn - end_pfn; - - /* Add any holes in map to result. */ - extra += start_pfn - end_pfn; - - end_pfn = PFN_UP(entry->addr + entry->size); - end_pfn = min(end_pfn, max_pfn); - - if (entry->type != E820_RAM) - extra += end_pfn - start_pfn; - } - - return extra; -} - bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size) { struct e820entry *entry; @@ -804,7 +789,7 @@ char * __init xen_memory_setup(void) max_pages = xen_get_max_pages(); /* How many extra pages do we need due to remapping? 
*/ - max_pages += xen_count_remap_pages(max_pfn); + max_pages += xen_foreach_remap_area(max_pfn, xen_count_remap_pages); if (max_pages > max_pfn) extra_pages += max_pages - max_pfn; @@ -922,7 +907,9 @@ char * __init xen_memory_setup(void) * Set identity map on non-RAM pages and prepare remapping the * underlying RAM. */ - xen_set_identity_and_remap(max_pfn); + xen_foreach_remap_area(max_pfn, xen_set_identity_and_remap_chunk); + + pr_info("Released %ld page(s)\n", xen_released_pages); return "Xen"; } diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 719cf291dcdf..0b4d04c8ab4d 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -322,6 +322,13 @@ static void __init xen_smp_prepare_boot_cpu(void) xen_filter_cpu_maps(); xen_setup_vcpu_info_placement(); } + + /* + * Setup vcpu_info for boot CPU. + */ + if (xen_hvm_domain()) + xen_vcpu_setup(0); + /* * The alternative logic (which patches the unlock/lock) runs before * the smp bootup up code is activated. Hence we need to set this up @@ -454,7 +461,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) #endif ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_gfn(swapper_pg_dir)); - if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt)) + if (HYPERVISOR_vcpu_op(VCPUOP_initialise, xen_vcpu_nr(cpu), ctxt)) BUG(); kfree(ctxt); @@ -492,7 +499,7 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle) if (rc) return rc; - rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL); + rc = HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL); BUG_ON(rc); while (cpu_report_state(cpu) != CPU_ONLINE) @@ -520,7 +527,8 @@ static int xen_cpu_disable(void) static void xen_cpu_die(unsigned int cpu) { - while (xen_pv_domain() && HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) { + while (xen_pv_domain() && HYPERVISOR_vcpu_op(VCPUOP_is_up, + xen_vcpu_nr(cpu), NULL)) { __set_current_state(TASK_UNINTERRUPTIBLE); schedule_timeout(HZ/10); } @@ -536,7 +544,7 
@@ static void xen_cpu_die(unsigned int cpu) static void xen_play_dead(void) /* used only with HOTPLUG_CPU */ { play_dead_common(); - HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); + HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(smp_processor_id()), NULL); cpu_bringup(); /* * commit 4b0c0f294 (tick: Cleanup NOHZ per cpu data on cpu down) @@ -576,7 +584,7 @@ static void stop_self(void *v) set_cpu_online(cpu, false); - HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL); + HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(cpu), NULL); BUG(); } diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index a0a4e554c6f1..67356d29d74d 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -11,8 +11,6 @@ #include <linux/interrupt.h> #include <linux/clocksource.h> #include <linux/clockchips.h> -#include <linux/kernel_stat.h> -#include <linux/math64.h> #include <linux/gfp.h> #include <linux/slab.h> #include <linux/pvclock_gtod.h> @@ -31,44 +29,6 @@ /* Xen may fire a timer up to this many ns early */ #define TIMER_SLOP 100000 -#define NS_PER_TICK (1000000000LL / HZ) - -/* snapshots of runstate info */ -static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate_snapshot); - -/* unused ns of stolen time */ -static DEFINE_PER_CPU(u64, xen_residual_stolen); - -static void do_stolen_accounting(void) -{ - struct vcpu_runstate_info state; - struct vcpu_runstate_info *snap; - s64 runnable, offline, stolen; - cputime_t ticks; - - xen_get_runstate_snapshot(&state); - - WARN_ON(state.state != RUNSTATE_running); - - snap = this_cpu_ptr(&xen_runstate_snapshot); - - /* work out how much time the VCPU has not been runn*ing* */ - runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable]; - offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline]; - - *snap = state; - - /* Add the appropriate number of ticks of stolen time, - including any left-overs from last time. 
*/ - stolen = runnable + offline + __this_cpu_read(xen_residual_stolen); - - if (stolen < 0) - stolen = 0; - - ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen); - __this_cpu_write(xen_residual_stolen, stolen); - account_steal_ticks(ticks); -} /* Get the TSC speed from Xen */ static unsigned long xen_tsc_khz(void) @@ -263,8 +223,10 @@ static int xen_vcpuop_shutdown(struct clock_event_device *evt) { int cpu = smp_processor_id(); - if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) || - HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL)) + if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, xen_vcpu_nr(cpu), + NULL) || + HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, xen_vcpu_nr(cpu), + NULL)) BUG(); return 0; @@ -274,7 +236,8 @@ static int xen_vcpuop_set_oneshot(struct clock_event_device *evt) { int cpu = smp_processor_id(); - if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL)) + if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, xen_vcpu_nr(cpu), + NULL)) BUG(); return 0; @@ -290,11 +253,12 @@ static int xen_vcpuop_set_next_event(unsigned long delta, WARN_ON(!clockevent_state_oneshot(evt)); single.timeout_abs_ns = get_abs_timeout(delta); - single.flags = VCPU_SSHOTTMR_future; - - ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single); + /* Get an event anyway, even if the timeout is already expired */ + single.flags = 0; - BUG_ON(ret != 0 && ret != -ETIME); + ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, xen_vcpu_nr(cpu), + &single); + BUG_ON(ret != 0); return ret; } @@ -335,8 +299,6 @@ static irqreturn_t xen_timer_interrupt(int irq, void *dev_id) ret = IRQ_HANDLED; } - do_stolen_accounting(); - return ret; } @@ -394,13 +356,15 @@ void xen_timer_resume(void) return; for_each_online_cpu(cpu) { - if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL)) + if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, + xen_vcpu_nr(cpu), NULL)) BUG(); } } static const struct pv_time_ops xen_time_ops __initconst = { 
.sched_clock = xen_clocksource_read, + .steal_clock = xen_steal_clock, }; static void __init xen_time_init(void) @@ -414,7 +378,8 @@ static void __init xen_time_init(void) clocksource_register_hz(&xen_clocksource, NSEC_PER_SEC); - if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) { + if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, xen_vcpu_nr(cpu), + NULL) == 0) { /* Successfully turned off 100Hz tick, so we have the vcpuop-based timer interface */ printk(KERN_DEBUG "Xen: using vcpuop timer interface\n"); @@ -431,6 +396,8 @@ static void __init xen_time_init(void) xen_setup_timer(cpu); xen_setup_cpu_clockevents(); + xen_time_setup_guest(); + if (xen_initial_domain()) pvclock_gtod_register_notifier(&xen_pvclock_gtod_notifier); } diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 4140b070f2e9..3cbce3b085e7 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -76,6 +76,7 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id); bool xen_vcpu_stolen(int vcpu); +void xen_vcpu_setup(int cpu); void xen_setup_vcpu_info_placement(void); #ifdef CONFIG_SMP |