Diffstat (limited to 'arch/x86')
99 files changed, 1664 insertions, 736 deletions
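For reference, a path-limited diffstat like the one above can be reproduced locally with plain git. This is a minimal sketch; the <old>..<new> commit range is a placeholder for the two ends of this merge window and is not taken from this page:

    # Summary form: files changed / insertions / deletions under arch/x86 only
    git diff --stat <old>..<new> -- arch/x86
    # Full patch text for the same range, again limited to arch/x86
    git diff <old>..<new> -- arch/x86

The full patch output is what follows below.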
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 25d2c6f7325e..7d5feb5908dd 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -261,6 +261,9 @@ config ARCH_HWEIGHT_CFLAGS config ARCH_SUPPORTS_UPROBES def_bool y +config FIX_EARLYCON_MEM + def_bool y + source "init/Kconfig" source "kernel/Kconfig.freezer" @@ -415,7 +418,6 @@ config X86_UV config X86_GOLDFISH bool "Goldfish (Virtual Platform)" - depends on X86_32 depends on X86_EXTENDED_PLATFORM ---help--- Enable support for the Goldfish virtual platform used primarily @@ -2375,12 +2377,9 @@ config X86_DMA_REMAP depends on STA2X11 config IOSF_MBI - bool + tristate + default m depends on PCI - ---help--- - To be selected by modules requiring access to the Intel OnChip System - Fabric (IOSF) Sideband MailBox Interface (MBI). For MBI platforms - enumerable by PCI. source "net/Kconfig" diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 602f57e590b5..33f71b01fd22 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -79,11 +79,14 @@ else UTS_MACHINE := x86_64 CHECKFLAGS += -D__x86_64__ -m64 + biarch := -m64 KBUILD_AFLAGS += -m64 KBUILD_CFLAGS += -m64 # Don't autogenerate traditional x87, MMX or SSE instructions - KBUILD_CFLAGS += -mno-mmx -mno-sse -mno-80387 -mno-fp-ret-in-387 + KBUILD_CFLAGS += -mno-mmx -mno-sse + KBUILD_CFLAGS += $(call cc-option,-mno-80387) + KBUILD_CFLAGS += $(call cc-option,-mno-fp-ret-in-387) # Use -mpreferred-stack-boundary=3 if supported. KBUILD_CFLAGS += $(call cc-option,-mpreferred-stack-boundary=3) @@ -250,8 +253,8 @@ archclean: PHONY += kvmconfig kvmconfig: $(if $(wildcard $(objtree)/.config),, $(error You need an existing .config for this target)) - $(Q)$(CONFIG_SHELL) $(srctree)/scripts/kconfig/merge_config.sh -m -O $(objtree) $(objtree)/.config arch/x86/configs/kvm_guest.config - $(Q)yes "" | $(MAKE) oldconfig + $(Q)$(CONFIG_SHELL) $(srctree)/scripts/kconfig/merge_config.sh -m -O $(objtree) $(objtree)/.config $(srctree)/arch/x86/configs/kvm_guest.config + $(Q)yes "" | $(MAKE) -f $(srctree)/Makefile oldconfig define archhelp echo '* bzImage - Compressed kernel image (arch/x86/boot/bzImage)' diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index abb9eba61b50..dbe8dd2fe247 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -71,7 +71,7 @@ $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE SETUP_OBJS = $(addprefix $(obj)/,$(setup-y)) -sed-voffset := -e 's/^\([0-9a-fA-F]*\) . \(_text\|_end\)$$/\#define VO_\2 0x\1/p' +sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(_text\|_end\)$$/\#define VO_\2 0x\1/p' quiet_cmd_voffset = VOFFSET $@ cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@ @@ -80,7 +80,7 @@ targets += voffset.h $(obj)/voffset.h: vmlinux FORCE $(call if_changed,voffset) -sed-zoffset := -e 's/^\([0-9a-fA-F]*\) . 
\(startup_32\|startup_64\|efi32_stub_entry\|efi64_stub_entry\|efi_pe_entry\|input_data\|_end\|z_.*\)$$/\#define ZO_\2 0x\1/p' +sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(startup_32\|startup_64\|efi32_stub_entry\|efi64_stub_entry\|efi_pe_entry\|input_data\|_end\|z_.*\)$$/\#define ZO_\2 0x\1/p' quiet_cmd_zoffset = ZOFFSET $@ cmd_zoffset = $(NM) $< | sed -n $(sed-zoffset) > $@ diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 17684615374b..57ab74df7eea 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -354,7 +354,7 @@ static void parse_elf(void *output) free(phdrs); } -asmlinkage void *decompress_kernel(void *rmode, memptr heap, +asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap, unsigned char *input_data, unsigned long input_len, unsigned char *output, diff --git a/arch/x86/boot/compressed/string.c b/arch/x86/boot/compressed/string.c index f3c57e341402..00e788be1db9 100644 --- a/arch/x86/boot/compressed/string.c +++ b/arch/x86/boot/compressed/string.c @@ -1,9 +1,5 @@ -#include "misc.h" #include "../string.c" -/* misc.h might pull in string_32.h which has a macro for memcpy. undef that */ -#undef memcpy - #ifdef CONFIG_X86_32 void *memcpy(void *dest, const void *src, size_t n) { diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c index 5339040ef86e..493f3fd9f139 100644 --- a/arch/x86/boot/string.c +++ b/arch/x86/boot/string.c @@ -12,14 +12,9 @@ * Very basic string functions */ -#include "boot.h" +#include <linux/types.h> +#include "ctype.h" -/* - * This file gets included in compressed/string.c which might pull in - * string_32.h and which in turn maps memcmp to __builtin_memcmp(). Undo - * that first. - */ -#undef memcmp int memcmp(const void *s1, const void *s2, size_t len) { u8 diff; diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 619e7f7426c6..32d2e7056c87 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -244,7 +244,6 @@ CONFIG_HID_TOPSEED=y CONFIG_HID_PID=y CONFIG_USB_HIDDEV=y CONFIG_USB=y -CONFIG_USB_DEBUG=y CONFIG_USB_ANNOUNCE_NEW_DEVICES=y CONFIG_USB_MON=y CONFIG_USB_EHCI_HCD=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 6181c69b786b..a481dd4755d5 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -239,7 +239,6 @@ CONFIG_HID_TOPSEED=y CONFIG_HID_PID=y CONFIG_USB_HIDDEV=y CONFIG_USB=y -CONFIG_USB_DEBUG=y CONFIG_USB_ANNOUNCE_NEW_DEVICES=y CONFIG_USB_MON=y CONFIG_USB_EHCI_HCD=y diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index b17f4f48ecd7..6dd1c7dd0473 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -7,6 +7,7 @@ #include <asm/alternative.h> #include <asm/cmpxchg.h> #include <asm/rmwcc.h> +#include <asm/barrier.h> /* * Atomic operations that C can't guarantee us. 
Useful for @@ -243,12 +244,6 @@ static inline void atomic_or_long(unsigned long *v1, unsigned long v2) : : "r" ((unsigned)(mask)), "m" (*(addr)) \ : "memory") -/* Atomic operations are already serializing on x86 */ -#define smp_mb__before_atomic_dec() barrier() -#define smp_mb__after_atomic_dec() barrier() -#define smp_mb__before_atomic_inc() barrier() -#define smp_mb__after_atomic_inc() barrier() - #ifdef CONFIG_X86_32 # include <asm/atomic64_32.h> #else diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index 69bbb4845020..5c7198cca5ed 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -137,6 +137,10 @@ do { \ #endif +/* Atomic operations are already serializing on x86 */ +#define smp_mb__before_atomic() barrier() +#define smp_mb__after_atomic() barrier() + /* * Stop RDTSC speculation. This is needed when you need to use RDTSC * (or get_cycles or vread that possibly accesses the TSC) in a defined diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 9fc1af74dc83..afcd35d331de 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -15,6 +15,7 @@ #include <linux/compiler.h> #include <asm/alternative.h> #include <asm/rmwcc.h> +#include <asm/barrier.h> #if BITS_PER_LONG == 32 # define _BITOPS_LONG_SHIFT 5 @@ -102,7 +103,7 @@ static inline void __set_bit(long nr, volatile unsigned long *addr) * * clear_bit() is atomic and may not be reordered. However, it does * not contain a memory barrier, so if it is used for locking purposes, - * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit() + * you should call smp_mb__before_atomic() and/or smp_mb__after_atomic() * in order to ensure changes are visible on other processors. 
*/ static __always_inline void @@ -156,9 +157,6 @@ static inline void __clear_bit_unlock(long nr, volatile unsigned long *addr) __clear_bit(nr, addr); } -#define smp_mb__before_clear_bit() barrier() -#define smp_mb__after_clear_bit() barrier() - /** * __change_bit - Toggle a bit in memory * @nr: the bit to change diff --git a/arch/x86/include/asm/cmdline.h b/arch/x86/include/asm/cmdline.h new file mode 100644 index 000000000000..e01f7f7ccb0c --- /dev/null +++ b/arch/x86/include/asm/cmdline.h @@ -0,0 +1,6 @@ +#ifndef _ASM_X86_CMDLINE_H +#define _ASM_X86_CMDLINE_H + +int cmdline_find_option_bool(const char *cmdline_ptr, const char *option); + +#endif /* _ASM_X86_CMDLINE_H */ diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h index b18df579c0e9..36f7125945e3 100644 --- a/arch/x86/include/asm/hpet.h +++ b/arch/x86/include/asm/hpet.h @@ -63,6 +63,7 @@ /* hpet memory map physical address */ extern unsigned long hpet_address; extern unsigned long force_hpet_address; +extern int boot_hpet_disable; extern u8 hpet_blockid; extern int hpet_force_user; extern u8 hpet_msi_disable; diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h index a8091216963b..68c05398bba9 100644 --- a/arch/x86/include/asm/hugetlb.h +++ b/arch/x86/include/asm/hugetlb.h @@ -52,6 +52,7 @@ static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { + ptep_clear_flush(vma, addr, ptep); } static inline int huge_pte_none(pte_t pte) diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index a307b7530e54..4615906d83df 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -190,8 +190,8 @@ extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void); #define trace_interrupt interrupt #endif -#define VECTOR_UNDEFINED -1 -#define VECTOR_RETRIGGERED -2 +#define VECTOR_UNDEFINED (-1) +#define VECTOR_RETRIGGERED (-2) typedef int vector_irq_t[NR_VECTORS]; DECLARE_PER_CPU(vector_irq_t, vector_irq); diff --git a/arch/x86/include/asm/iosf_mbi.h b/arch/x86/include/asm/iosf_mbi.h index 8e71c7941767..57995f0596a6 100644 --- a/arch/x86/include/asm/iosf_mbi.h +++ b/arch/x86/include/asm/iosf_mbi.h @@ -50,6 +50,32 @@ #define BT_MBI_PCIE_READ 0x00 #define BT_MBI_PCIE_WRITE 0x01 +/* Quark available units */ +#define QRK_MBI_UNIT_HBA 0x00 +#define QRK_MBI_UNIT_HB 0x03 +#define QRK_MBI_UNIT_RMU 0x04 +#define QRK_MBI_UNIT_MM 0x05 +#define QRK_MBI_UNIT_MMESRAM 0x05 +#define QRK_MBI_UNIT_SOC 0x31 + +/* Quark read/write opcodes */ +#define QRK_MBI_HBA_READ 0x10 +#define QRK_MBI_HBA_WRITE 0x11 +#define QRK_MBI_HB_READ 0x10 +#define QRK_MBI_HB_WRITE 0x11 +#define QRK_MBI_RMU_READ 0x10 +#define QRK_MBI_RMU_WRITE 0x11 +#define QRK_MBI_MM_READ 0x10 +#define QRK_MBI_MM_WRITE 0x11 +#define QRK_MBI_MMESRAM_READ 0x12 +#define QRK_MBI_MMESRAM_WRITE 0x13 +#define QRK_MBI_SOC_READ 0x06 +#define QRK_MBI_SOC_WRITE 0x07 + +#if IS_ENABLED(CONFIG_IOSF_MBI) + +bool iosf_mbi_available(void); + /** * iosf_mbi_read() - MailBox Interface read command * @port: port indicating subunit being accessed @@ -87,4 +113,33 @@ int iosf_mbi_write(u8 port, u8 opcode, u32 offset, u32 mdr); */ int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask); +#else /* CONFIG_IOSF_MBI is not enabled */ +static inline +bool iosf_mbi_available(void) +{ + return false; +} + +static inline +int iosf_mbi_read(u8 port, u8 opcode, u32 offset, u32 *mdr) +{ + WARN(1, "IOSF_MBI driver 
not available"); + return -EPERM; +} + +static inline +int iosf_mbi_write(u8 port, u8 opcode, u32 offset, u32 mdr) +{ + WARN(1, "IOSF_MBI driver not available"); + return -EPERM; +} + +static inline +int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask) +{ + WARN(1, "IOSF_MBI driver not available"); + return -EPERM; +} +#endif /* CONFIG_IOSF_MBI */ + #endif /* IOSF_MBI_SYMS_H */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index fcaf9c961265..7de069afb382 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -60,7 +60,7 @@ | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \ | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \ - | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) + | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE | X86_CR4_SMAP)) #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index b59827e76529..64dc362506b7 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -25,6 +25,7 @@ struct cpu_signature { struct device; enum ucode_state { UCODE_ERROR, UCODE_OK, UCODE_NFOUND }; +extern bool dis_ucode_ldr; struct microcode_ops { enum ucode_state (*request_microcode_user) (int cpu, diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 8de6d9cf3b95..678205195ae1 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -1,7 +1,7 @@ #ifndef _ASM_X86_PAGE_64_DEFS_H #define _ASM_X86_PAGE_64_DEFS_H -#define THREAD_SIZE_ORDER 1 +#define THREAD_SIZE_ORDER 2 #define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) #define CURRENT_MASK (~(THREAD_SIZE - 1)) diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index 96ae4f4040bb..0892ea0e683f 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -68,7 +68,6 @@ void pcibios_config_init(void); void pcibios_scan_root(int bus); void pcibios_set_master(struct pci_dev *dev); -void pcibios_penalize_isa_irq(int irq, int active); struct irq_routing_table *pcibios_get_irq_routing_table(void); int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq); diff --git a/arch/x86/include/asm/sync_bitops.h b/arch/x86/include/asm/sync_bitops.h index 05af3b31d522..f28a24b51dc7 100644 --- a/arch/x86/include/asm/sync_bitops.h +++ b/arch/x86/include/asm/sync_bitops.h @@ -41,7 +41,7 @@ static inline void sync_set_bit(long nr, volatile unsigned long *addr) * * sync_clear_bit() is atomic and may not be reordered. However, it does * not contain a memory barrier, so if it is used for locking purposes, - * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit() + * you should call smp_mb__before_atomic() and/or smp_mb__after_atomic() * in order to ensure changes are visible on other processors. 
*/ static inline void sync_clear_bit(long nr, volatile unsigned long *addr) diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 47e5de25ba79..854053889d4d 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -83,6 +83,7 @@ struct thread_info { #define TIF_FORK 18 /* ret_from_fork */ #define TIF_NOHZ 19 /* in adaptive nohz mode */ #define TIF_MEMDIE 20 /* is terminating due to OOM killer */ +#define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */ #define TIF_IO_BITMAP 22 /* uses I/O bitmap */ #define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ #define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */ @@ -106,6 +107,7 @@ struct thread_info { #define _TIF_IA32 (1 << TIF_IA32) #define _TIF_FORK (1 << TIF_FORK) #define _TIF_NOHZ (1 << TIF_NOHZ) +#define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) #define _TIF_FORCED_TF (1 << TIF_FORCED_TF) #define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP) @@ -191,8 +193,6 @@ static inline struct thread_info *current_thread_info(void) * have to worry about atomic accesses. */ #define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/ -#define TS_POLLING 0x0004 /* idle task polling need_resched, - skip sending interrupt */ #define TS_RESTORE_SIGMASK 0x0008 /* restore signal mask in do_signal() */ #ifndef __ASSEMBLY__ diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h index 3087ea9c5f2e..93bee7b93854 100644 --- a/arch/x86/include/asm/uprobes.h +++ b/arch/x86/include/asm/uprobes.h @@ -33,15 +33,27 @@ typedef u8 uprobe_opcode_t; #define UPROBE_SWBP_INSN 0xcc #define UPROBE_SWBP_INSN_SIZE 1 +struct uprobe_xol_ops; + struct arch_uprobe { - u16 fixups; union { u8 insn[MAX_UINSN_BYTES]; u8 ixol[MAX_UINSN_BYTES]; }; + + u16 fixups; + const struct uprobe_xol_ops *ops; + + union { #ifdef CONFIG_X86_64 - unsigned long rip_rela_target_address; + unsigned long rip_rela_target_address; #endif + struct { + s32 offs; + u8 ilen; + u8 opc1; + } branch; + }; }; struct arch_uprobe_task { diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index a30836c8ac4d..c63e925fd6b7 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -5,7 +5,7 @@ * * SGI UV architectural definitions * - * Copyright (C) 2007-2013 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2007-2014 Silicon Graphics, Inc. All rights reserved. */ #ifndef _ASM_X86_UV_UV_HUB_H @@ -204,16 +204,6 @@ static inline int is_uvx_hub(void) return uv_hub_info->hub_revision >= UV2_HUB_REVISION_BASE; } -static inline int is_uv2_1_hub(void) -{ - return uv_hub_info->hub_revision == UV2_HUB_REVISION_BASE; -} - -static inline int is_uv2_2_hub(void) -{ - return uv_hub_info->hub_revision == UV2_HUB_REVISION_BASE + 1; -} - union uvh_apicid { unsigned long v; struct uvh_apicid_s { diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h index e42249bcf7e1..ddd8db6b6e70 100644 --- a/arch/x86/include/asm/uv/uv_mmrs.h +++ b/arch/x86/include/asm/uv/uv_mmrs.h @@ -5,7 +5,7 @@ * * SGI UV MMR definitions * - * Copyright (C) 2007-2013 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2007-2014 Silicon Graphics, Inc. All rights reserved. 
*/ #ifndef _ASM_X86_UV_UV_MMRS_H @@ -2803,6 +2803,46 @@ union uv1h_lb_target_physical_apic_id_mask_u { }; /* ========================================================================= */ +/* UV3H_GR0_GAM_GR_CONFIG */ +/* ========================================================================= */ +#define UV3H_GR0_GAM_GR_CONFIG 0xc00028UL + +#define UV3H_GR0_GAM_GR_CONFIG_M_SKT_SHFT 0 +#define UV3H_GR0_GAM_GR_CONFIG_SUBSPACE_SHFT 10 +#define UV3H_GR0_GAM_GR_CONFIG_M_SKT_MASK 0x000000000000003fUL +#define UV3H_GR0_GAM_GR_CONFIG_SUBSPACE_MASK 0x0000000000000400UL + +union uv3h_gr0_gam_gr_config_u { + unsigned long v; + struct uv3h_gr0_gam_gr_config_s { + unsigned long m_skt:6; /* RW */ + unsigned long undef_6_9:4; /* Undefined */ + unsigned long subspace:1; /* RW */ + unsigned long reserved:53; + } s3; +}; + +/* ========================================================================= */ +/* UV3H_GR1_GAM_GR_CONFIG */ +/* ========================================================================= */ +#define UV3H_GR1_GAM_GR_CONFIG 0x1000028UL + +#define UV3H_GR1_GAM_GR_CONFIG_M_SKT_SHFT 0 +#define UV3H_GR1_GAM_GR_CONFIG_SUBSPACE_SHFT 10 +#define UV3H_GR1_GAM_GR_CONFIG_M_SKT_MASK 0x000000000000003fUL +#define UV3H_GR1_GAM_GR_CONFIG_SUBSPACE_MASK 0x0000000000000400UL + +union uv3h_gr1_gam_gr_config_u { + unsigned long v; + struct uv3h_gr1_gam_gr_config_s { + unsigned long m_skt:6; /* RW */ + unsigned long undef_6_9:4; /* Undefined */ + unsigned long subspace:1; /* RW */ + unsigned long reserved:53; + } s3; +}; + +/* ========================================================================= */ /* UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR */ /* ========================================================================= */ #define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR 0x1603000UL diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index e709884d0ef9..ca08a27b90b3 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -343,7 +343,7 @@ HYPERVISOR_memory_op(unsigned int cmd, void *arg) } static inline int -HYPERVISOR_multicall(void *call_list, int nr_calls) +HYPERVISOR_multicall(void *call_list, uint32_t nr_calls) { return _hypercall2(int, multicall, call_list, nr_calls); } diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h index fd9cb7695b5f..3400dbaec3c3 100644 --- a/arch/x86/include/asm/xen/interface.h +++ b/arch/x86/include/asm/xen/interface.h @@ -54,6 +54,9 @@ typedef unsigned long xen_pfn_t; #define PRI_xen_pfn "lx" typedef unsigned long xen_ulong_t; #define PRI_xen_ulong "lx" +typedef long xen_long_t; +#define PRI_xen_long "lx" + /* Guest handles for primitive C types. 
*/ __DEFINE_GUEST_HANDLE(uchar, unsigned char); __DEFINE_GUEST_HANDLE(uint, unsigned int); diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h index c827ace3121b..fcf2b3ae1bf0 100644 --- a/arch/x86/include/uapi/asm/msr-index.h +++ b/arch/x86/include/uapi/asm/msr-index.h @@ -384,7 +384,7 @@ #define MSR_IA32_MISC_ENABLE_MWAIT_BIT 18 #define MSR_IA32_MISC_ENABLE_MWAIT (1ULL << MSR_IA32_MISC_ENABLE_MWAIT_BIT) #define MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT 22 -#define MSR_IA32_MISC_ENABLE_LIMIT_CPUID (1ULL << MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT); +#define MSR_IA32_MISC_ENABLE_LIMIT_CPUID (1ULL << MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT) #define MSR_IA32_MISC_ENABLE_XTPR_DISABLE_BIT 23 #define MSR_IA32_MISC_ENABLE_XTPR_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_XTPR_DISABLE_BIT) #define MSR_IA32_MISC_ENABLE_XD_DISABLE_BIT 34 diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 3a2ae4c88948..31368207837c 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -31,7 +31,7 @@ static char temp_stack[4096]; * * Wrapper around acpi_enter_sleep_state() to be called by assmebly. */ -acpi_status asmlinkage x86_acpi_enter_sleep_state(u8 state) +acpi_status asmlinkage __visible x86_acpi_enter_sleep_state(u8 state) { return acpi_enter_sleep_state(state); } diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 9fa8aa051f54..76164e173a24 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -10,6 +10,8 @@ * * Copyright 2002 Andi Kleen, SuSE Labs. */ +#define pr_fmt(fmt) "AGP: " fmt + #include <linux/kernel.h> #include <linux/types.h> #include <linux/init.h> @@ -75,14 +77,13 @@ static u32 __init allocate_aperture(void) addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR, aper_size, aper_size); if (!addr) { - printk(KERN_ERR - "Cannot allocate aperture memory hole (%lx,%uK)\n", - addr, aper_size>>10); + pr_err("Cannot allocate aperture memory hole [mem %#010lx-%#010lx] (%uKB)\n", + addr, addr + aper_size - 1, aper_size >> 10); return 0; } memblock_reserve(addr, aper_size); - printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n", - aper_size >> 10, addr); + pr_info("Mapping aperture over RAM [mem %#010lx-%#010lx] (%uKB)\n", + addr, addr + aper_size - 1, aper_size >> 10); register_nosave_region(addr >> PAGE_SHIFT, (addr+aper_size) >> PAGE_SHIFT); @@ -126,10 +127,11 @@ static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order) u64 aper; u32 old_order; - printk(KERN_INFO "AGP bridge at %02x:%02x:%02x\n", bus, slot, func); + pr_info("pci 0000:%02x:%02x:%02x: AGP bridge\n", bus, slot, func); apsizereg = read_pci_config_16(bus, slot, func, cap + 0x14); if (apsizereg == 0xffffffff) { - printk(KERN_ERR "APSIZE in AGP bridge unreadable\n"); + pr_err("pci 0000:%02x:%02x.%d: APSIZE unreadable\n", + bus, slot, func); return 0; } @@ -153,16 +155,18 @@ static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order) * On some sick chips, APSIZE is 0. 
It means it wants 4G * so let double check that order, and lets trust AMD NB settings: */ - printk(KERN_INFO "Aperture from AGP @ %Lx old size %u MB\n", - aper, 32 << old_order); + pr_info("pci 0000:%02x:%02x.%d: AGP aperture [bus addr %#010Lx-%#010Lx] (old size %uMB)\n", + bus, slot, func, aper, aper + (32ULL << (old_order + 20)) - 1, + 32 << old_order); if (aper + (32ULL<<(20 + *order)) > 0x100000000ULL) { - printk(KERN_INFO "Aperture size %u MB (APSIZE %x) is not right, using settings from NB\n", - 32 << *order, apsizereg); + pr_info("pci 0000:%02x:%02x.%d: AGP aperture size %uMB (APSIZE %#x) is not right, using settings from NB\n", + bus, slot, func, 32 << *order, apsizereg); *order = old_order; } - printk(KERN_INFO "Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n", - aper, 32 << *order, apsizereg); + pr_info("pci 0000:%02x:%02x.%d: AGP aperture [bus addr %#010Lx-%#010Lx] (%uMB, APSIZE %#x)\n", + bus, slot, func, aper, aper + (32ULL << (*order + 20)) - 1, + 32 << *order, apsizereg); if (!aperture_valid(aper, (32*1024*1024) << *order, 32<<20)) return 0; @@ -218,7 +222,7 @@ static u32 __init search_agp_bridge(u32 *order, int *valid_agp) } } } - printk(KERN_INFO "No AGP bridge found\n"); + pr_info("No AGP bridge found\n"); return 0; } @@ -310,7 +314,8 @@ void __init early_gart_iommu_check(void) if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) { /* reserve it, so we can reuse it in second kernel */ - printk(KERN_INFO "update e820 for GART\n"); + pr_info("e820: reserve [mem %#010Lx-%#010Lx] for GART\n", + aper_base, aper_base + aper_size - 1); e820_add_region(aper_base, aper_size, E820_RESERVED); update_e820(); } @@ -354,7 +359,7 @@ int __init gart_iommu_hole_init(void) !early_pci_allowed()) return -ENODEV; - printk(KERN_INFO "Checking aperture...\n"); + pr_info("Checking aperture...\n"); if (!fallback_aper_force) agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp); @@ -395,8 +400,9 @@ int __init gart_iommu_hole_init(void) aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff; aper_base <<= 25; - printk(KERN_INFO "Node %d: aperture @ %Lx size %u MB\n", - node, aper_base, aper_size >> 20); + pr_info("Node %d: aperture [bus addr %#010Lx-%#010Lx] (%uMB)\n", + node, aper_base, aper_base + aper_size - 1, + aper_size >> 20); node++; if (!aperture_valid(aper_base, aper_size, 64<<20)) { @@ -407,9 +413,9 @@ int __init gart_iommu_hole_init(void) if (!no_iommu && max_pfn > MAX_DMA32_PFN && !printed_gart_size_msg) { - printk(KERN_ERR "you are using iommu with agp, but GART size is less than 64M\n"); - printk(KERN_ERR "please increase GART size in your BIOS setup\n"); - printk(KERN_ERR "if BIOS doesn't have that option, contact your HW vendor!\n"); + pr_err("you are using iommu with agp, but GART size is less than 64MB\n"); + pr_err("please increase GART size in your BIOS setup\n"); + pr_err("if BIOS doesn't have that option, contact your HW vendor!\n"); printed_gart_size_msg = 1; } } else { @@ -446,13 +452,10 @@ out: force_iommu || valid_agp || fallback_aper_force) { - printk(KERN_INFO - "Your BIOS doesn't leave a aperture memory hole\n"); - printk(KERN_INFO - "Please enable the IOMMU option in the BIOS setup\n"); - printk(KERN_INFO - "This costs you %d MB of RAM\n", - 32 << fallback_aper_order); + pr_info("Your BIOS doesn't leave a aperture memory hole\n"); + pr_info("Please enable the IOMMU option in the BIOS setup\n"); + pr_info("This costs you %dMB of RAM\n", + 32 << fallback_aper_order); aper_order = fallback_aper_order; aper_alloc = 
allocate_aperture(); diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index a698d7165c96..eab67047dec3 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -57,7 +57,7 @@ void arch_trigger_all_cpu_backtrace(void) } clear_bit(0, &backtrace_flag); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); } static int __kprobes diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 6ad4658de705..992060e09897 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2189,7 +2189,7 @@ void send_cleanup_vector(struct irq_cfg *cfg) cfg->move_in_progress = 0; } -asmlinkage void smp_irq_move_cleanup_interrupt(void) +asmlinkage __visible void smp_irq_move_cleanup_interrupt(void) { unsigned vector, me; @@ -3425,6 +3425,11 @@ int get_nr_irqs_gsi(void) return nr_irqs_gsi; } +unsigned int arch_dynirq_lower_bound(unsigned int from) +{ + return from < nr_irqs_gsi ? nr_irqs_gsi : from; +} + int __init arch_probe_nr_irqs(void) { int nr; diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 7834389ba5be..293b41df54ef 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -5,7 +5,7 @@ * * SGI UV APIC functions (note: not an Intel compatible APIC) * - * Copyright (C) 2007-2013 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2007-2014 Silicon Graphics, Inc. All rights reserved. */ #include <linux/cpumask.h> #include <linux/hardirq.h> @@ -440,6 +440,20 @@ static __initdata struct redir_addr redir_addrs[] = { {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR}, }; +static unsigned char get_n_lshift(int m_val) +{ + union uv3h_gr0_gam_gr_config_u m_gr_config; + + if (is_uv1_hub()) + return m_val; + + if (is_uv2_hub()) + return m_val == 40 ? 40 : 39; + + m_gr_config.v = uv_read_local_mmr(UV3H_GR0_GAM_GR_CONFIG); + return m_gr_config.s3.m_skt; +} + static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) { union uvh_rh_gam_alias210_overlay_config_2_mmr_u alias; @@ -849,6 +863,7 @@ void __init uv_system_init(void) int gnode_extra, min_pnode = 999999, max_pnode = -1; unsigned long mmr_base, present, paddr; unsigned short pnode_mask; + unsigned char n_lshift; char *hub = (is_uv1_hub() ? "UV1" : (is_uv2_hub() ? "UV2" : "UV3")); @@ -860,6 +875,7 @@ void __init uv_system_init(void) m_val = m_n_config.s.m_skt; n_val = m_n_config.s.n_skt; pnode_mask = (1 << n_val) - 1; + n_lshift = get_n_lshift(m_val); mmr_base = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & ~UV_MMR_ENABLE; @@ -867,8 +883,9 @@ void __init uv_system_init(void) node_id.v = uv_read_local_mmr(UVH_NODE_ID); gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1; gnode_upper = ((unsigned long)gnode_extra << m_val); - pr_info("UV: N:%d M:%d pnode_mask:0x%x gnode_upper/extra:0x%lx/0x%x\n", - n_val, m_val, pnode_mask, gnode_upper, gnode_extra); + pr_info("UV: N:%d M:%d pnode_mask:0x%x gnode_upper/extra:0x%lx/0x%x n_lshift 0x%x\n", + n_val, m_val, pnode_mask, gnode_upper, gnode_extra, + n_lshift); pr_info("UV: global MMR base 0x%lx\n", mmr_base); @@ -935,8 +952,7 @@ void __init uv_system_init(void) uv_cpu_hub_info(cpu)->hub_revision = uv_hub_info->hub_revision; uv_cpu_hub_info(cpu)->m_shift = 64 - m_val; - uv_cpu_hub_info(cpu)->n_lshift = is_uv2_1_hub() ? - (m_val == 40 ? 
40 : 39) : m_val; + uv_cpu_hub_info(cpu)->n_lshift = n_lshift; pnode = uv_apicid_to_pnode(apicid); blade = boot_pnode_to_blade(pnode); diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 3ab03430211d..f3a1f04ed4cb 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -844,21 +844,10 @@ static int apm_do_idle(void) int polling; int err = 0; - polling = !!(current_thread_info()->status & TS_POLLING); - if (polling) { - current_thread_info()->status &= ~TS_POLLING; - /* - * TS_POLLING-cleared state must be visible before we - * test NEED_RESCHED: - */ - smp_mb(); - } if (!need_resched()) { idled = 1; ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax, &err); } - if (polling) - current_thread_info()->status |= TS_POLLING; if (!idled) return 0; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index eeee23ff75ef..6cc800381d14 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -598,7 +598,6 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) { struct mce m; int i; - unsigned long *v; this_cpu_inc(mce_poll_count); @@ -618,8 +617,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) if (!(m.status & MCI_STATUS_VAL)) continue; - v = &get_cpu_var(mce_polled_error); - set_bit(0, v); + this_cpu_write(mce_polled_error, 1); /* * Uncorrected or signalled events are handled by the exception * handler when it is enabled, so don't process those here. @@ -706,8 +704,7 @@ static int mce_timed_out(u64 *t) if (!mca_cfg.monarch_timeout) goto out; if ((s64)*t < SPINUNIT) { - /* CHECKME: Make panic default for 1 too? */ - if (mca_cfg.tolerant < 1) + if (mca_cfg.tolerant <= 1) mce_panic("Timeout synchronizing machine check over CPUs", NULL, NULL); cpu_missing = 1; @@ -2439,32 +2436,65 @@ static __init int mcheck_init_device(void) int err; int i = 0; - if (!mce_available(&boot_cpu_data)) - return -EIO; + if (!mce_available(&boot_cpu_data)) { + err = -EIO; + goto err_out; + } - zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL); + if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) { + err = -ENOMEM; + goto err_out; + } mce_init_banks(); err = subsys_system_register(&mce_subsys, NULL); if (err) - return err; + goto err_out_mem; cpu_notifier_register_begin(); for_each_online_cpu(i) { err = mce_device_create(i); if (err) { cpu_notifier_register_done(); - return err; + goto err_device_create; } } - register_syscore_ops(&mce_syscore_ops); __register_hotcpu_notifier(&mce_cpu_notifier); cpu_notifier_register_done(); + register_syscore_ops(&mce_syscore_ops); + /* register character device /dev/mcelog */ - misc_register(&mce_chrdev_device); + err = misc_register(&mce_chrdev_device); + if (err) + goto err_register; + + return 0; + +err_register: + unregister_syscore_ops(&mce_syscore_ops); + + cpu_notifier_register_begin(); + __unregister_hotcpu_notifier(&mce_cpu_notifier); + cpu_notifier_register_done(); + +err_device_create: + /* + * We didn't keep track of which devices were created above, but + * even if we had, the set of online cpus might have changed. + * Play safe and remove for every possible cpu, since + * mce_device_remove() will do the right thing. 
+ */ + for_each_possible_cpu(i) + mce_device_remove(i); + +err_out_mem: + free_cpumask_var(mce_device_initialized); + +err_out: + pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err); return err; } diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 3bdb95ae8c43..9a316b21df8b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -42,7 +42,7 @@ static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); * cmci_discover_lock protects against parallel discovery attempts * which could race against each other. */ -static DEFINE_RAW_SPINLOCK(cmci_discover_lock); +static DEFINE_SPINLOCK(cmci_discover_lock); #define CMCI_THRESHOLD 1 #define CMCI_POLL_INTERVAL (30 * HZ) @@ -144,14 +144,14 @@ static void cmci_storm_disable_banks(void) int bank; u64 val; - raw_spin_lock_irqsave(&cmci_discover_lock, flags); + spin_lock_irqsave(&cmci_discover_lock, flags); owned = __get_cpu_var(mce_banks_owned); for_each_set_bit(bank, owned, MAX_NR_BANKS) { rdmsrl(MSR_IA32_MCx_CTL2(bank), val); val &= ~MCI_CTL2_CMCI_EN; wrmsrl(MSR_IA32_MCx_CTL2(bank), val); } - raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); + spin_unlock_irqrestore(&cmci_discover_lock, flags); } static bool cmci_storm_detect(void) @@ -211,7 +211,7 @@ static void cmci_discover(int banks) int i; int bios_wrong_thresh = 0; - raw_spin_lock_irqsave(&cmci_discover_lock, flags); + spin_lock_irqsave(&cmci_discover_lock, flags); for (i = 0; i < banks; i++) { u64 val; int bios_zero_thresh = 0; @@ -266,7 +266,7 @@ static void cmci_discover(int banks) WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks))); } } - raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); + spin_unlock_irqrestore(&cmci_discover_lock, flags); if (mca_cfg.bios_cmci_threshold && bios_wrong_thresh) { pr_info_once( "bios_cmci_threshold: Some banks do not have valid thresholds set\n"); @@ -316,10 +316,10 @@ void cmci_clear(void) if (!cmci_supported(&banks)) return; - raw_spin_lock_irqsave(&cmci_discover_lock, flags); + spin_lock_irqsave(&cmci_discover_lock, flags); for (i = 0; i < banks; i++) __cmci_disable_bank(i); - raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); + spin_unlock_irqrestore(&cmci_discover_lock, flags); } static void cmci_rediscover_work_func(void *arg) @@ -360,9 +360,9 @@ void cmci_disable_bank(int bank) if (!cmci_supported(&banks)) return; - raw_spin_lock_irqsave(&cmci_discover_lock, flags); + spin_lock_irqsave(&cmci_discover_lock, flags); __cmci_disable_bank(bank); - raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); + spin_unlock_irqrestore(&cmci_discover_lock, flags); } static void intel_init_cmci(void) diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index d921b7ee6595..36a1bb6d1ee0 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -429,14 +429,14 @@ static inline void __smp_thermal_interrupt(void) smp_thermal_vector(); } -asmlinkage void smp_thermal_interrupt(struct pt_regs *regs) +asmlinkage __visible void smp_thermal_interrupt(struct pt_regs *regs) { entering_irq(); __smp_thermal_interrupt(); exiting_ack_irq(); } -asmlinkage void smp_trace_thermal_interrupt(struct pt_regs *regs) +asmlinkage __visible void smp_trace_thermal_interrupt(struct pt_regs *regs) { entering_irq(); trace_thermal_apic_entry(THERMAL_APIC_VECTOR); diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c index fe6b1c86645b..7245980186ee 100644 --- 
a/arch/x86/kernel/cpu/mcheck/threshold.c +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -24,14 +24,14 @@ static inline void __smp_threshold_interrupt(void) mce_threshold_vector(); } -asmlinkage void smp_threshold_interrupt(void) +asmlinkage __visible void smp_threshold_interrupt(void) { entering_irq(); __smp_threshold_interrupt(); exiting_ack_irq(); } -asmlinkage void smp_trace_threshold_interrupt(void) +asmlinkage __visible void smp_trace_threshold_interrupt(void) { entering_irq(); trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR); diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 15c987698b0f..dd9d6190b08d 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -97,6 +97,9 @@ MODULE_LICENSE("GPL"); static struct microcode_ops *microcode_ops; +bool dis_ucode_ldr; +module_param(dis_ucode_ldr, bool, 0); + /* * Synchronization. * @@ -546,6 +549,9 @@ static int __init microcode_init(void) struct cpuinfo_x86 *c = &cpu_data(0); int error; + if (dis_ucode_ldr) + return 0; + if (c->x86_vendor == X86_VENDOR_INTEL) microcode_ops = init_intel_microcode(); else if (c->x86_vendor == X86_VENDOR_AMD) diff --git a/arch/x86/kernel/cpu/microcode/core_early.c b/arch/x86/kernel/cpu/microcode/core_early.c index be7f8514f577..5f28a64e71ea 100644 --- a/arch/x86/kernel/cpu/microcode/core_early.c +++ b/arch/x86/kernel/cpu/microcode/core_early.c @@ -17,9 +17,11 @@ * 2 of the License, or (at your option) any later version. */ #include <linux/module.h> +#include <asm/microcode.h> #include <asm/microcode_intel.h> #include <asm/microcode_amd.h> #include <asm/processor.h> +#include <asm/cmdline.h> #define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24)) #define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u') @@ -72,10 +74,33 @@ static int x86_family(void) return x86; } +static bool __init check_loader_disabled_bsp(void) +{ +#ifdef CONFIG_X86_32 + const char *cmdline = (const char *)__pa_nodebug(boot_command_line); + const char *opt = "dis_ucode_ldr"; + const char *option = (const char *)__pa_nodebug(opt); + bool *res = (bool *)__pa_nodebug(&dis_ucode_ldr); + +#else /* CONFIG_X86_64 */ + const char *cmdline = boot_command_line; + const char *option = "dis_ucode_ldr"; + bool *res = &dis_ucode_ldr; +#endif + + if (cmdline_find_option_bool(cmdline, option)) + *res = true; + + return *res; +} + void __init load_ucode_bsp(void) { int vendor, x86; + if (check_loader_disabled_bsp()) + return; + if (!have_cpuid_p()) return; @@ -96,10 +121,22 @@ void __init load_ucode_bsp(void) } } +static bool check_loader_disabled_ap(void) +{ +#ifdef CONFIG_X86_32 + return __pa_nodebug(dis_ucode_ldr); +#else + return dis_ucode_ldr; +#endif +} + void load_ucode_ap(void) { int vendor, x86; + if (check_loader_disabled_ap()) + return; + if (!have_cpuid_p()) return; diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index ae407f7226c8..89f3b7c1af20 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -721,6 +721,7 @@ int perf_assign_events(struct perf_event **events, int n, return sched.state.unassigned; } +EXPORT_SYMBOL_GPL(perf_assign_events); int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) { diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index aa333d966886..adb02aa62af5 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -169,7 +169,6 @@ static struct event_constraint 
intel_slm_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - FIXED_EVENT_CONSTRAINT(0x013c, 2), /* CPU_CLK_UNHALTED.REF */ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* pseudo CPU_CLK_UNHALTED.REF */ EVENT_CONSTRAINT_END }; diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index ae96cfa5eddd..980970cb744d 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -108,15 +108,31 @@ static u64 precise_store_data(u64 status) return val; } -static u64 precise_store_data_hsw(u64 status) +static u64 precise_store_data_hsw(struct perf_event *event, u64 status) { union perf_mem_data_src dse; + u64 cfg = event->hw.config & INTEL_ARCH_EVENT_MASK; dse.val = 0; dse.mem_op = PERF_MEM_OP_STORE; dse.mem_lvl = PERF_MEM_LVL_NA; + + /* + * L1 info only valid for following events: + * + * MEM_UOPS_RETIRED.STLB_MISS_STORES + * MEM_UOPS_RETIRED.LOCK_STORES + * MEM_UOPS_RETIRED.SPLIT_STORES + * MEM_UOPS_RETIRED.ALL_STORES + */ + if (cfg != 0x12d0 && cfg != 0x22d0 && cfg != 0x42d0 && cfg != 0x82d0) + return dse.mem_lvl; + if (status & 1) - dse.mem_lvl = PERF_MEM_LVL_L1; + dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT; + else + dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_MISS; + /* Nothing else supported. Sorry. */ return dse.val; } @@ -887,7 +903,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event, data.data_src.val = load_latency_data(pebs->dse); else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW) data.data_src.val = - precise_store_data_hsw(pebs->dse); + precise_store_data_hsw(event, pebs->dse); else data.data_src.val = precise_store_data(pebs->dse); } diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c index 059218ed5208..619f7699487a 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c +++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c @@ -59,7 +59,7 @@ #define INTEL_RAPL_PKG 0x2 /* pseudo-encoding */ #define RAPL_IDX_RAM_NRG_STAT 2 /* DRAM */ #define INTEL_RAPL_RAM 0x3 /* pseudo-encoding */ -#define RAPL_IDX_PP1_NRG_STAT 3 /* DRAM */ +#define RAPL_IDX_PP1_NRG_STAT 3 /* gpu */ #define INTEL_RAPL_PP1 0x4 /* pseudo-encoding */ /* Clients have PP0, PKG */ @@ -72,6 +72,12 @@ 1<<RAPL_IDX_PKG_NRG_STAT|\ 1<<RAPL_IDX_RAM_NRG_STAT) +/* Servers have PP0, PKG, RAM, PP1 */ +#define RAPL_IDX_HSW (1<<RAPL_IDX_PP0_NRG_STAT|\ + 1<<RAPL_IDX_PKG_NRG_STAT|\ + 1<<RAPL_IDX_RAM_NRG_STAT|\ + 1<<RAPL_IDX_PP1_NRG_STAT) + /* * event code: LSB 8 bits, passed in attr->config * any other bit is reserved @@ -425,6 +431,24 @@ static struct attribute *rapl_events_cln_attr[] = { NULL, }; +static struct attribute *rapl_events_hsw_attr[] = { + EVENT_PTR(rapl_cores), + EVENT_PTR(rapl_pkg), + EVENT_PTR(rapl_gpu), + EVENT_PTR(rapl_ram), + + EVENT_PTR(rapl_cores_unit), + EVENT_PTR(rapl_pkg_unit), + EVENT_PTR(rapl_gpu_unit), + EVENT_PTR(rapl_ram_unit), + + EVENT_PTR(rapl_cores_scale), + EVENT_PTR(rapl_pkg_scale), + EVENT_PTR(rapl_gpu_scale), + EVENT_PTR(rapl_ram_scale), + NULL, +}; + static struct attribute_group rapl_pmu_events_group = { .name = "events", .attrs = NULL, /* patched at runtime */ @@ -511,6 +535,7 @@ static int rapl_cpu_prepare(int cpu) struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); int phys_id = topology_physical_package_id(cpu); u64 ms; + u64 msr_rapl_power_unit_bits; if (pmu) return 0; @@ -518,6 +543,10 @@ static int rapl_cpu_prepare(int cpu) if (phys_id < 0) 
return -1; + /* protect rdmsrl() to handle virtualization */ + if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits)) + return -1; + pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); if (!pmu) return -1; @@ -531,8 +560,7 @@ static int rapl_cpu_prepare(int cpu) * * we cache in local PMU instance */ - rdmsrl(MSR_RAPL_POWER_UNIT, pmu->hw_unit); - pmu->hw_unit = (pmu->hw_unit >> 8) & 0x1FULL; + pmu->hw_unit = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; pmu->pmu = &rapl_pmu_class; /* @@ -631,11 +659,14 @@ static int __init rapl_pmu_init(void) switch (boot_cpu_data.x86_model) { case 42: /* Sandy Bridge */ case 58: /* Ivy Bridge */ - case 60: /* Haswell */ - case 69: /* Haswell-Celeron */ rapl_cntr_mask = RAPL_IDX_CLN; rapl_pmu_events_group.attrs = rapl_events_cln_attr; break; + case 60: /* Haswell */ + case 69: /* Haswell-Celeron */ + rapl_cntr_mask = RAPL_IDX_HSW; + rapl_pmu_events_group.attrs = rapl_events_hsw_attr; + break; case 45: /* Sandy Bridge-EP */ case 62: /* IvyTown */ rapl_cntr_mask = RAPL_IDX_SRV; @@ -650,7 +681,9 @@ static int __init rapl_pmu_init(void) cpu_notifier_register_begin(); for_each_online_cpu(cpu) { - rapl_cpu_prepare(cpu); + ret = rapl_cpu_prepare(cpu); + if (ret) + goto out; rapl_cpu_init(cpu); } @@ -673,6 +706,7 @@ static int __init rapl_pmu_init(void) hweight32(rapl_cntr_mask), ktime_to_ms(pmu->timer_interval)); +out: cpu_notifier_register_done(); return 0; diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c index 384df5105fbc..136ac74dee82 100644 --- a/arch/x86/kernel/cpu/rdrand.c +++ b/arch/x86/kernel/cpu/rdrand.c @@ -27,6 +27,7 @@ static int __init x86_rdrand_setup(char *s) { setup_clear_cpu_cap(X86_FEATURE_RDRAND); + setup_clear_cpu_cap(X86_FEATURE_RDSEED); return 1; } __setup("nordrand", x86_rdrand_setup); diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index b0cc3809723d..6cda0baeac9d 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -17,6 +17,7 @@ #include <asm/dma.h> #include <asm/io_apic.h> #include <asm/apic.h> +#include <asm/hpet.h> #include <asm/iommu.h> #include <asm/gart.h> #include <asm/irq_remapping.h> @@ -240,7 +241,7 @@ static u32 __init intel_stolen_base(int num, int slot, int func, size_t stolen_s return base; } -#define KB(x) ((x) * 1024) +#define KB(x) ((x) * 1024UL) #define MB(x) (KB (KB (x))) #define GB(x) (MB (KB (x))) @@ -530,6 +531,15 @@ static void __init intel_graphics_stolen(int num, int slot, int func) } } +static void __init force_disable_hpet(int num, int slot, int func) +{ +#ifdef CONFIG_HPET_TIMER + boot_hpet_disable = 1; + pr_info("x86/hpet: Will disable the HPET for this platform because it's not reliable\n"); +#endif +} + + #define QFLAG_APPLY_ONCE 0x1 #define QFLAG_APPLIED 0x2 #define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED) @@ -567,6 +577,12 @@ static struct chipset early_qrk[] __initdata = { PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check }, { PCI_VENDOR_ID_INTEL, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA, PCI_ANY_ID, QFLAG_APPLY_ONCE, intel_graphics_stolen }, + /* + * HPET on current version of Baytrail platform has accuracy + * problems, disable it for now: + */ + { PCI_VENDOR_ID_INTEL, 0x0f00, + PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, {} }; diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 1e96c3628bf2..be846d2468f7 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -36,7 +36,7 @@ * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack * 
frame that is otherwise undefined after a SYSCALL * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging. - * - errorentry/paranoidentry/zeroentry - Define exception entry points. + * - idtentry - Define exception entry points. */ #include <linux/linkage.h> @@ -1203,125 +1203,100 @@ apicinterrupt IRQ_WORK_VECTOR \ /* * Exception entry points. */ -.macro zeroentry sym do_sym -ENTRY(\sym) - INTR_FRAME - ASM_CLAC - PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ - subq $ORIG_RAX-R15, %rsp - CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 - call error_entry - DEFAULT_FRAME 0 - movq %rsp,%rdi /* pt_regs pointer */ - xorl %esi,%esi /* no error code */ - call \do_sym - jmp error_exit /* %ebx: no swapgs flag */ - CFI_ENDPROC -END(\sym) -.endm +#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8) -.macro paranoidzeroentry sym do_sym +.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ENTRY(\sym) - INTR_FRAME - ASM_CLAC - PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ - subq $ORIG_RAX-R15, %rsp - CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 - call save_paranoid - TRACE_IRQS_OFF - movq %rsp,%rdi /* pt_regs pointer */ - xorl %esi,%esi /* no error code */ - call \do_sym - jmp paranoid_exit /* %ebx: no swapgs flag */ - CFI_ENDPROC -END(\sym) -.endm + /* Sanity check */ + .if \shift_ist != -1 && \paranoid == 0 + .error "using shift_ist requires paranoid=1" + .endif -#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8) -.macro paranoidzeroentry_ist sym do_sym ist -ENTRY(\sym) + .if \has_error_code + XCPT_FRAME + .else INTR_FRAME - ASM_CLAC - PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ - subq $ORIG_RAX-R15, %rsp - CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 - call save_paranoid - TRACE_IRQS_OFF_DEBUG - movq %rsp,%rdi /* pt_regs pointer */ - xorl %esi,%esi /* no error code */ - subq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist) - call \do_sym - addq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist) - jmp paranoid_exit /* %ebx: no swapgs flag */ - CFI_ENDPROC -END(\sym) -.endm + .endif -.macro errorentry sym do_sym -ENTRY(\sym) - XCPT_FRAME ASM_CLAC PARAVIRT_ADJUST_EXCEPTION_FRAME + + .ifeq \has_error_code + pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ + .endif + subq $ORIG_RAX-R15, %rsp CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 + + .if \paranoid + call save_paranoid + .else call error_entry + .endif + DEFAULT_FRAME 0 + + .if \paranoid + .if \shift_ist != -1 + TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */ + .else + TRACE_IRQS_OFF + .endif + .endif + movq %rsp,%rdi /* pt_regs pointer */ + + .if \has_error_code movq ORIG_RAX(%rsp),%rsi /* get error code */ movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ + .else + xorl %esi,%esi /* no error code */ + .endif + + .if \shift_ist != -1 + subq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist) + .endif + call \do_sym + + .if \shift_ist != -1 + addq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist) + .endif + + .if \paranoid + jmp paranoid_exit /* %ebx: no swapgs flag */ + .else jmp error_exit /* %ebx: no swapgs flag */ + .endif + CFI_ENDPROC END(\sym) .endm #ifdef CONFIG_TRACING -.macro trace_errorentry sym do_sym -errorentry trace(\sym) trace(\do_sym) -errorentry \sym \do_sym +.macro trace_idtentry sym do_sym has_error_code:req +idtentry trace(\sym) trace(\do_sym) has_error_code=\has_error_code +idtentry \sym \do_sym has_error_code=\has_error_code .endm #else -.macro trace_errorentry sym do_sym -errorentry \sym \do_sym +.macro 
trace_idtentry sym do_sym has_error_code:req +idtentry \sym \do_sym has_error_code=\has_error_code .endm #endif - /* error code is on the stack already */ -.macro paranoiderrorentry sym do_sym -ENTRY(\sym) - XCPT_FRAME - ASM_CLAC - PARAVIRT_ADJUST_EXCEPTION_FRAME - subq $ORIG_RAX-R15, %rsp - CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 - call save_paranoid - DEFAULT_FRAME 0 - TRACE_IRQS_OFF - movq %rsp,%rdi /* pt_regs pointer */ - movq ORIG_RAX(%rsp),%rsi /* get error code */ - movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ - call \do_sym - jmp paranoid_exit /* %ebx: no swapgs flag */ - CFI_ENDPROC -END(\sym) -.endm - -zeroentry divide_error do_divide_error -zeroentry overflow do_overflow -zeroentry bounds do_bounds -zeroentry invalid_op do_invalid_op -zeroentry device_not_available do_device_not_available -paranoiderrorentry double_fault do_double_fault -zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun -errorentry invalid_TSS do_invalid_TSS -errorentry segment_not_present do_segment_not_present -zeroentry spurious_interrupt_bug do_spurious_interrupt_bug -zeroentry coprocessor_error do_coprocessor_error -errorentry alignment_check do_alignment_check -zeroentry simd_coprocessor_error do_simd_coprocessor_error +idtentry divide_error do_divide_error has_error_code=0 +idtentry overflow do_overflow has_error_code=0 +idtentry bounds do_bounds has_error_code=0 +idtentry invalid_op do_invalid_op has_error_code=0 +idtentry device_not_available do_device_not_available has_error_code=0 +idtentry double_fault do_double_fault has_error_code=1 paranoid=1 +idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0 +idtentry invalid_TSS do_invalid_TSS has_error_code=1 +idtentry segment_not_present do_segment_not_present has_error_code=1 +idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0 +idtentry coprocessor_error do_coprocessor_error has_error_code=0 +idtentry alignment_check do_alignment_check has_error_code=1 +idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0 /* Reload gs selector with exception handling */ @@ -1371,7 +1346,7 @@ ENTRY(do_softirq_own_stack) END(do_softirq_own_stack) #ifdef CONFIG_XEN -zeroentry xen_hypervisor_callback xen_do_hypervisor_callback +idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0 /* * A note on the "critical region" in our callback handler. 
@@ -1482,21 +1457,21 @@ apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ */ .pushsection .kprobes.text, "ax" -paranoidzeroentry_ist debug do_debug DEBUG_STACK -paranoidzeroentry_ist int3 do_int3 DEBUG_STACK -paranoiderrorentry stack_segment do_stack_segment +idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK +idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK +idtentry stack_segment do_stack_segment has_error_code=1 paranoid=1 #ifdef CONFIG_XEN -zeroentry xen_debug do_debug -zeroentry xen_int3 do_int3 -errorentry xen_stack_segment do_stack_segment +idtentry xen_debug do_debug has_error_code=0 +idtentry xen_int3 do_int3 has_error_code=0 +idtentry xen_stack_segment do_stack_segment has_error_code=1 #endif -errorentry general_protection do_general_protection -trace_errorentry page_fault do_page_fault +idtentry general_protection do_general_protection has_error_code=1 +trace_idtentry page_fault do_page_fault has_error_code=1 #ifdef CONFIG_KVM_GUEST -errorentry async_page_fault do_async_page_fault +idtentry async_page_fault do_async_page_fault has_error_code=1 #endif #ifdef CONFIG_X86_MCE -paranoidzeroentry machine_check *machine_check_vector(%rip) +idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip) #endif /* diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index c61a14a4a310..d6c1b9836995 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -29,7 +29,7 @@ static void __init i386_default_early_setup(void) reserve_ebda_region(); } -asmlinkage void __init i386_start_kernel(void) +asmlinkage __visible void __init i386_start_kernel(void) { sanitize_boot_params(&boot_params); diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 85126ccbdf6b..068054f4bf20 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -137,7 +137,7 @@ static void __init copy_bootdata(char *real_mode_data) } } -asmlinkage void __init x86_64_start_kernel(char * real_mode_data) +asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) { int i; diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 8d80ae011603..4177bfbc80b0 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -88,7 +88,7 @@ static inline void hpet_clear_mapping(void) /* * HPET command line enable / disable */ -static int boot_hpet_disable; +int boot_hpet_disable; int hpet_force_user; static int hpet_verbose; diff --git a/arch/x86/kernel/iosf_mbi.c b/arch/x86/kernel/iosf_mbi.c index c3aae6672843..d30acdc1229d 100644 --- a/arch/x86/kernel/iosf_mbi.c +++ b/arch/x86/kernel/iosf_mbi.c @@ -25,6 +25,9 @@ #include <asm/iosf_mbi.h> +#define PCI_DEVICE_ID_BAYTRAIL 0x0F00 +#define PCI_DEVICE_ID_QUARK_X1000 0x0958 + static DEFINE_SPINLOCK(iosf_mbi_lock); static inline u32 iosf_mbi_form_mcr(u8 op, u8 port, u8 offset) @@ -177,6 +180,13 @@ int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask) } EXPORT_SYMBOL(iosf_mbi_modify); +bool iosf_mbi_available(void) +{ + /* Mbi isn't hot-pluggable. 
No remove routine is provided */ + return mbi_pdev; +} +EXPORT_SYMBOL(iosf_mbi_available); + static int iosf_mbi_probe(struct pci_dev *pdev, const struct pci_device_id *unused) { @@ -193,7 +203,8 @@ static int iosf_mbi_probe(struct pci_dev *pdev, } static DEFINE_PCI_DEVICE_TABLE(iosf_mbi_pci_ids) = { - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x0F00) }, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_BAYTRAIL) }, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_QUARK_X1000) }, { 0, }, }; MODULE_DEVICE_TABLE(pci, iosf_mbi_pci_ids); diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 79a3f9682871..61b17dc2c277 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -897,9 +897,10 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) struct kprobe *cur = kprobe_running(); struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - switch (kcb->kprobe_status) { - case KPROBE_HIT_SS: - case KPROBE_REENTER: + if (unlikely(regs->ip == (unsigned long)cur->ainsn.insn)) { + /* This must happen on single-stepping */ + WARN_ON(kcb->kprobe_status != KPROBE_HIT_SS && + kcb->kprobe_status != KPROBE_REENTER); /* * We are here because the instruction being single * stepped caused a page fault. We reset the current @@ -914,9 +915,8 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) else reset_current_kprobe(); preempt_enable_no_resched(); - break; - case KPROBE_HIT_ACTIVE: - case KPROBE_HIT_SSDONE: + } else if (kcb->kprobe_status == KPROBE_HIT_ACTIVE || + kcb->kprobe_status == KPROBE_HIT_SSDONE) { /* * We increment the nmissed count for accounting, * we can also use npre/npostfault count for accounting @@ -945,10 +945,8 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) * fixup routine could not handle it, * Let do_page_fault() fix it. */ - break; - default: - break; } + return 0; } diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index af1d14a9ebda..dcbbaa165bde 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -20,6 +20,8 @@ #include <asm/mmu_context.h> #include <asm/syscalls.h> +int sysctl_ldt16 = 0; + #ifdef CONFIG_SMP static void flush_ldt(void *current_mm) { @@ -234,7 +236,7 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) * IRET leaking the high bits of the kernel stack address. */ #ifdef CONFIG_X86_64 - if (!ldt_info.seg_32bit) { + if (!ldt_info.seg_32bit && !sysctl_ldt16) { error = -EINVAL; goto out_unlock; } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 9c0280f93d05..898d077617a9 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -52,7 +52,7 @@ asmlinkage extern void ret_from_fork(void); -asmlinkage DEFINE_PER_CPU(unsigned long, old_rsp); +__visible DEFINE_PER_CPU(unsigned long, old_rsp); /* Prints also some state that isn't saved in the pt_regs */ void __show_regs(struct pt_regs *regs, int all) diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 654b46574b91..52b1157c53eb 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -114,8 +114,8 @@ EXPORT_SYMBOL(machine_real_restart); */ static int __init set_pci_reboot(const struct dmi_system_id *d) { - if (reboot_type != BOOT_CF9) { - reboot_type = BOOT_CF9; + if (reboot_type != BOOT_CF9_FORCE) { + reboot_type = BOOT_CF9_FORCE; pr_info("%s series board detected. 
Selecting %s-method for reboots.\n", d->ident, "PCI"); } @@ -191,6 +191,16 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { }, }, + /* Certec */ + { /* Handle problems with rebooting on Certec BPC600 */ + .callback = set_pci_reboot, + .ident = "Certec BPC600", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Certec"), + DMI_MATCH(DMI_PRODUCT_NAME, "BPC600"), + }, + }, + /* Dell */ { /* Handle problems with rebooting on Dell DXP061 */ .callback = set_bios_reboot, @@ -458,20 +468,23 @@ void __attribute__((weak)) mach_reboot_fixups(void) } /* - * Windows compatible x86 hardware expects the following on reboot: + * To the best of our knowledge Windows compatible x86 hardware expects + * the following on reboot: * * 1) If the FADT has the ACPI reboot register flag set, try it * 2) If still alive, write to the keyboard controller * 3) If still alive, write to the ACPI reboot register again * 4) If still alive, write to the keyboard controller again * 5) If still alive, call the EFI runtime service to reboot - * 6) If still alive, write to the PCI IO port 0xCF9 to reboot - * 7) If still alive, inform BIOS to do a proper reboot + * 6) If no EFI runtime service, call the BIOS to do a reboot + * + * We default to following the same pattern. We also have + * two other reboot methods: 'triple fault' and 'PCI', which + * can be triggered via the reboot= kernel boot option or + * via quirks. * - * If the machine is still alive at this stage, it gives up. We default to - * following the same pattern, except that if we're still alive after (7) we'll - * try to force a triple fault and then cycle between hitting the keyboard - * controller and doing that + * This means that this function can never return, it can misbehave + * by not rebooting properly and hanging. */ static void native_machine_emergency_restart(void) { @@ -492,6 +505,11 @@ static void native_machine_emergency_restart(void) for (;;) { /* Could also try the reset bit in the Hammer NB */ switch (reboot_type) { + case BOOT_ACPI: + acpi_reboot(); + reboot_type = BOOT_KBD; + break; + case BOOT_KBD: mach_reboot_fixups(); /* For board specific fixups */ @@ -509,43 +527,29 @@ static void native_machine_emergency_restart(void) } break; - case BOOT_TRIPLE: - load_idt(&no_idt); - __asm__ __volatile__("int3"); - - /* We're probably dead after this, but... */ - reboot_type = BOOT_KBD; - break; - - case BOOT_BIOS: - machine_real_restart(MRR_BIOS); - - /* We're probably dead after this, but... */ - reboot_type = BOOT_TRIPLE; - break; - - case BOOT_ACPI: - acpi_reboot(); - reboot_type = BOOT_KBD; - break; - case BOOT_EFI: if (efi_enabled(EFI_RUNTIME_SERVICES)) efi.reset_system(reboot_mode == REBOOT_WARM ? EFI_RESET_WARM : EFI_RESET_COLD, EFI_SUCCESS, 0, NULL); - reboot_type = BOOT_CF9_COND; + reboot_type = BOOT_BIOS; + break; + + case BOOT_BIOS: + machine_real_restart(MRR_BIOS); + + /* We're probably dead after this, but... */ + reboot_type = BOOT_CF9_SAFE; break; - case BOOT_CF9: + case BOOT_CF9_FORCE: port_cf9_safe = true; /* Fall through */ - case BOOT_CF9_COND: + case BOOT_CF9_SAFE: if (port_cf9_safe) { - u8 reboot_code = reboot_mode == REBOOT_WARM ? - 0x06 : 0x0E; + u8 reboot_code = reboot_mode == REBOOT_WARM ? 
0x06 : 0x0E; u8 cf9 = inb(0xcf9) & ~reboot_code; outb(cf9|2, 0xcf9); /* Request hard reset */ udelay(50); @@ -553,7 +557,15 @@ static void native_machine_emergency_restart(void) outb(cf9|reboot_code, 0xcf9); udelay(50); } - reboot_type = BOOT_BIOS; + reboot_type = BOOT_TRIPLE; + break; + + case BOOT_TRIPLE: + load_idt(&no_idt); + __asm__ __volatile__("int3"); + + /* We're probably dead after this, but... */ + reboot_type = BOOT_KBD; break; } } diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 7c3a5a61f2e4..be8e1bde07aa 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -168,7 +168,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) * this function calls the 'stop' function on all other CPUs in the system. */ -asmlinkage void smp_reboot_interrupt(void) +asmlinkage __visible void smp_reboot_interrupt(void) { ack_APIC_irq(); irq_enter(); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 57409f6b8c62..f73b5d435bdc 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -357,7 +357,7 @@ exit: * for scheduling or signal handling. The actual stack switch is done in * entry.S */ -asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) +asmlinkage __visible __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) { struct pt_regs *regs = eregs; /* Did already sync */ @@ -601,11 +601,11 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) #endif } -asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) +asmlinkage __visible void __attribute__((weak)) smp_thermal_interrupt(void) { } -asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void) +asmlinkage __visible void __attribute__((weak)) smp_threshold_interrupt(void) { } diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 2ed845928b5f..ace22916ade3 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -53,7 +53,7 @@ #define OPCODE1(insn) ((insn)->opcode.bytes[0]) #define OPCODE2(insn) ((insn)->opcode.bytes[1]) #define OPCODE3(insn) ((insn)->opcode.bytes[2]) -#define MODRM_REG(insn) X86_MODRM_REG(insn->modrm.value) +#define MODRM_REG(insn) X86_MODRM_REG((insn)->modrm.value) #define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\ (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ @@ -229,63 +229,6 @@ static int validate_insn_32bits(struct arch_uprobe *auprobe, struct insn *insn) return -ENOTSUPP; } -/* - * Figure out which fixups arch_uprobe_post_xol() will need to perform, and - * annotate arch_uprobe->fixups accordingly. To start with, - * arch_uprobe->fixups is either zero or it reflects rip-related fixups. - */ -static void prepare_fixups(struct arch_uprobe *auprobe, struct insn *insn) -{ - bool fix_ip = true, fix_call = false; /* defaults */ - int reg; - - insn_get_opcode(insn); /* should be a nop */ - - switch (OPCODE1(insn)) { - case 0x9d: - /* popf */ - auprobe->fixups |= UPROBE_FIX_SETF; - break; - case 0xc3: /* ret/lret */ - case 0xcb: - case 0xc2: - case 0xca: - /* ip is correct */ - fix_ip = false; - break; - case 0xe8: /* call relative - Fix return addr */ - fix_call = true; - break; - case 0x9a: /* call absolute - Fix return addr, not ip */ - fix_call = true; - fix_ip = false; - break; - case 0xff: - insn_get_modrm(insn); - reg = MODRM_REG(insn); - if (reg == 2 || reg == 3) { - /* call or lcall, indirect */ - /* Fix return addr; ip is correct. 
*/ - fix_call = true; - fix_ip = false; - } else if (reg == 4 || reg == 5) { - /* jmp or ljmp, indirect */ - /* ip is correct. */ - fix_ip = false; - } - break; - case 0xea: /* jmp absolute -- ip is correct */ - fix_ip = false; - break; - default: - break; - } - if (fix_ip) - auprobe->fixups |= UPROBE_FIX_IP; - if (fix_call) - auprobe->fixups |= UPROBE_FIX_CALL; -} - #ifdef CONFIG_X86_64 /* * If arch_uprobe->insn doesn't use rip-relative addressing, return @@ -310,15 +253,11 @@ static void prepare_fixups(struct arch_uprobe *auprobe, struct insn *insn) * - The displacement is always 4 bytes. */ static void -handle_riprel_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn) +handle_riprel_insn(struct arch_uprobe *auprobe, struct insn *insn) { u8 *cursor; u8 reg; - if (mm->context.ia32_compat) - return; - - auprobe->rip_rela_target_address = 0x0; if (!insn_rip_relative(insn)) return; @@ -372,7 +311,48 @@ handle_riprel_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, struct ins cursor++; memmove(cursor, cursor + insn->displacement.nbytes, insn->immediate.nbytes); } - return; +} + +/* + * If we're emulating a rip-relative instruction, save the contents + * of the scratch register and store the target address in that register. + */ +static void +pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs, + struct arch_uprobe_task *autask) +{ + if (auprobe->fixups & UPROBE_FIX_RIP_AX) { + autask->saved_scratch_register = regs->ax; + regs->ax = current->utask->vaddr; + regs->ax += auprobe->rip_rela_target_address; + } else if (auprobe->fixups & UPROBE_FIX_RIP_CX) { + autask->saved_scratch_register = regs->cx; + regs->cx = current->utask->vaddr; + regs->cx += auprobe->rip_rela_target_address; + } +} + +static void +handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction) +{ + if (auprobe->fixups & (UPROBE_FIX_RIP_AX | UPROBE_FIX_RIP_CX)) { + struct arch_uprobe_task *autask; + + autask = ¤t->utask->autask; + if (auprobe->fixups & UPROBE_FIX_RIP_AX) + regs->ax = autask->saved_scratch_register; + else + regs->cx = autask->saved_scratch_register; + + /* + * The original instruction includes a displacement, and so + * is 4 bytes longer than what we've just single-stepped. + * Caller may need to apply other fixups to handle stuff + * like "jmpq *...(%rip)" and "callq *...(%rip)". + */ + if (correction) + *correction += 4; + } } static int validate_insn_64bits(struct arch_uprobe *auprobe, struct insn *insn) @@ -401,9 +381,19 @@ static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, return validate_insn_64bits(auprobe, insn); } #else /* 32-bit: */ -static void handle_riprel_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn) +/* + * No RIP-relative addressing on 32-bit + */ +static void handle_riprel_insn(struct arch_uprobe *auprobe, struct insn *insn) +{ +} +static void pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs, + struct arch_uprobe_task *autask) +{ +} +static void handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, + long *correction) { - /* No RIP-relative addressing on 32-bit */ } static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn) @@ -412,141 +402,311 @@ static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, } #endif /* CONFIG_X86_64 */ -/** - * arch_uprobe_analyze_insn - instruction analysis including validity and fixups. - * @mm: the probed address space. 
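For readers tracking the RIP-relative handling in this hunk: handle_riprel_insn() rewrites the out-of-line copy of the probed instruction to address through a scratch register instead of %rip, pre_xol_rip_insn() points that register at the original target, and handle_riprel_post_xol() undoes it. A rough worked example (hypothetical instruction bytes, not taken from the patch):

    /*
     * Probed instruction at vaddr, 6 bytes:
     *     8b 1d 78 56 34 12        mov 0x12345678(%rip),%ebx
     * Copy executed out of line, displacement squeezed out, %rip -> %rax:
     *     8b 18                    mov (%rax),%ebx
     * UPROBE_FIX_RIP_AX makes the pre-xol step save %rax and load it with
     * the absolute target, roughly vaddr + 6 + 0x12345678.  After the
     * single step, handle_riprel_post_xol() restores %rax and adds 4 to
     * the ip correction, since the copy is 4 bytes shorter than the
     * original instruction.
     */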
- * @arch_uprobe: the probepoint information. - * @addr: virtual address at which to install the probepoint - * Return 0 on success or a -ve number on error. +struct uprobe_xol_ops { + bool (*emulate)(struct arch_uprobe *, struct pt_regs *); + int (*pre_xol)(struct arch_uprobe *, struct pt_regs *); + int (*post_xol)(struct arch_uprobe *, struct pt_regs *); +}; + +static inline int sizeof_long(void) +{ + return is_ia32_task() ? 4 : 8; +} + +static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + pre_xol_rip_insn(auprobe, regs, ¤t->utask->autask); + return 0; +} + +/* + * Adjust the return address pushed by a call insn executed out of line. */ -int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr) +static int adjust_ret_addr(unsigned long sp, long correction) { - int ret; - struct insn insn; + int rasize = sizeof_long(); + long ra; - auprobe->fixups = 0; - ret = validate_insn_bits(auprobe, mm, &insn); - if (ret != 0) - return ret; + if (copy_from_user(&ra, (void __user *)sp, rasize)) + return -EFAULT; - handle_riprel_insn(auprobe, mm, &insn); - prepare_fixups(auprobe, &insn); + ra += correction; + if (copy_to_user((void __user *)sp, &ra, rasize)) + return -EFAULT; return 0; } -#ifdef CONFIG_X86_64 -/* - * If we're emulating a rip-relative instruction, save the contents - * of the scratch register and store the target address in that register. - */ -static void -pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs, - struct arch_uprobe_task *autask) +static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) { - if (auprobe->fixups & UPROBE_FIX_RIP_AX) { - autask->saved_scratch_register = regs->ax; - regs->ax = current->utask->vaddr; - regs->ax += auprobe->rip_rela_target_address; - } else if (auprobe->fixups & UPROBE_FIX_RIP_CX) { - autask->saved_scratch_register = regs->cx; - regs->cx = current->utask->vaddr; - regs->cx += auprobe->rip_rela_target_address; + struct uprobe_task *utask = current->utask; + long correction = (long)(utask->vaddr - utask->xol_vaddr); + + handle_riprel_post_xol(auprobe, regs, &correction); + if (auprobe->fixups & UPROBE_FIX_IP) + regs->ip += correction; + + if (auprobe->fixups & UPROBE_FIX_CALL) { + if (adjust_ret_addr(regs->sp, correction)) { + regs->sp += sizeof_long(); + return -ERESTART; + } } + + return 0; } -#else -static void -pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs, - struct arch_uprobe_task *autask) + +static struct uprobe_xol_ops default_xol_ops = { + .pre_xol = default_pre_xol_op, + .post_xol = default_post_xol_op, +}; + +static bool branch_is_call(struct arch_uprobe *auprobe) { - /* No RIP-relative addressing on 32-bit */ + return auprobe->branch.opc1 == 0xe8; } -#endif -/* - * arch_uprobe_pre_xol - prepare to execute out of line. - * @auprobe: the probepoint information. - * @regs: reflects the saved user state of current task. 
- */ -int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) -{ - struct arch_uprobe_task *autask; +#define CASE_COND \ + COND(70, 71, XF(OF)) \ + COND(72, 73, XF(CF)) \ + COND(74, 75, XF(ZF)) \ + COND(78, 79, XF(SF)) \ + COND(7a, 7b, XF(PF)) \ + COND(76, 77, XF(CF) || XF(ZF)) \ + COND(7c, 7d, XF(SF) != XF(OF)) \ + COND(7e, 7f, XF(ZF) || XF(SF) != XF(OF)) - autask = ¤t->utask->autask; - autask->saved_trap_nr = current->thread.trap_nr; - current->thread.trap_nr = UPROBE_TRAP_NR; - regs->ip = current->utask->xol_vaddr; - pre_xol_rip_insn(auprobe, regs, autask); +#define COND(op_y, op_n, expr) \ + case 0x ## op_y: DO((expr) != 0) \ + case 0x ## op_n: DO((expr) == 0) - autask->saved_tf = !!(regs->flags & X86_EFLAGS_TF); - regs->flags |= X86_EFLAGS_TF; - if (test_tsk_thread_flag(current, TIF_BLOCKSTEP)) - set_task_blockstep(current, false); +#define XF(xf) (!!(flags & X86_EFLAGS_ ## xf)) - return 0; +static bool is_cond_jmp_opcode(u8 opcode) +{ + switch (opcode) { + #define DO(expr) \ + return true; + CASE_COND + #undef DO + + default: + return false; + } } -/* - * This function is called by arch_uprobe_post_xol() to adjust the return - * address pushed by a call instruction executed out of line. - */ -static int adjust_ret_addr(unsigned long sp, long correction) +static bool check_jmp_cond(struct arch_uprobe *auprobe, struct pt_regs *regs) { - int rasize, ncopied; - long ra = 0; + unsigned long flags = regs->flags; - if (is_ia32_task()) - rasize = 4; - else - rasize = 8; + switch (auprobe->branch.opc1) { + #define DO(expr) \ + return expr; + CASE_COND + #undef DO - ncopied = copy_from_user(&ra, (void __user *)sp, rasize); - if (unlikely(ncopied)) - return -EFAULT; + default: /* not a conditional jmp */ + return true; + } +} - ra += correction; - ncopied = copy_to_user((void __user *)sp, &ra, rasize); - if (unlikely(ncopied)) - return -EFAULT; +#undef XF +#undef COND +#undef CASE_COND - return 0; +static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + unsigned long new_ip = regs->ip += auprobe->branch.ilen; + unsigned long offs = (long)auprobe->branch.offs; + + if (branch_is_call(auprobe)) { + unsigned long new_sp = regs->sp - sizeof_long(); + /* + * If it fails we execute this (mangled, see the comment in + * branch_clear_offset) insn out-of-line. In the likely case + * this should trigger the trap, and the probed application + * should die or restart the same insn after it handles the + * signal, arch_uprobe_post_xol() won't be even called. + * + * But there is corner case, see the comment in ->post_xol(). + */ + if (copy_to_user((void __user *)new_sp, &new_ip, sizeof_long())) + return false; + regs->sp = new_sp; + } else if (!check_jmp_cond(auprobe, regs)) { + offs = 0; + } + + regs->ip = new_ip + offs; + return true; } -#ifdef CONFIG_X86_64 -static bool is_riprel_insn(struct arch_uprobe *auprobe) +static int branch_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) { - return ((auprobe->fixups & (UPROBE_FIX_RIP_AX | UPROBE_FIX_RIP_CX)) != 0); + BUG_ON(!branch_is_call(auprobe)); + /* + * We can only get here if branch_emulate_op() failed to push the ret + * address _and_ another thread expanded our stack before the (mangled) + * "call" insn was executed out-of-line. Just restore ->sp and restart. + * We could also restore ->ip and try to call branch_emulate_op() again. 
+ */ + regs->sp += sizeof_long(); + return -ERESTART; } -static void -handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction) +static void branch_clear_offset(struct arch_uprobe *auprobe, struct insn *insn) { - if (is_riprel_insn(auprobe)) { - struct arch_uprobe_task *autask; + /* + * Turn this insn into "call 1f; 1:", this is what we will execute + * out-of-line if ->emulate() fails. We only need this to generate + * a trap, so that the probed task receives the correct signal with + * the properly filled siginfo. + * + * But see the comment in ->post_xol(), in the unlikely case it can + * succeed. So we need to ensure that the new ->ip can not fall into + * the non-canonical area and trigger #GP. + * + * We could turn it into (say) "pushf", but then we would need to + * divorce ->insn[] and ->ixol[]. We need to preserve the 1st byte + * of ->insn[] for set_orig_insn(). + */ + memset(auprobe->insn + insn_offset_immediate(insn), + 0, insn->immediate.nbytes); +} - autask = ¤t->utask->autask; - if (auprobe->fixups & UPROBE_FIX_RIP_AX) - regs->ax = autask->saved_scratch_register; - else - regs->cx = autask->saved_scratch_register; +static struct uprobe_xol_ops branch_xol_ops = { + .emulate = branch_emulate_op, + .post_xol = branch_post_xol_op, +}; + +/* Returns -ENOSYS if branch_xol_ops doesn't handle this insn */ +static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) +{ + u8 opc1 = OPCODE1(insn); + + /* has the side-effect of processing the entire instruction */ + insn_get_length(insn); + if (WARN_ON_ONCE(!insn_complete(insn))) + return -ENOEXEC; + + switch (opc1) { + case 0xeb: /* jmp 8 */ + case 0xe9: /* jmp 32 */ + case 0x90: /* prefix* + nop; same as jmp with .offs = 0 */ + break; + + case 0xe8: /* call relative */ + branch_clear_offset(auprobe, insn); + break; + case 0x0f: + if (insn->opcode.nbytes != 2) + return -ENOSYS; /* - * The original instruction includes a displacement, and so - * is 4 bytes longer than what we've just single-stepped. - * Fall through to handle stuff like "jmpq *...(%rip)" and - * "callq *...(%rip)". + * If it is a "near" conditional jmp, OPCODE2() - 0x10 matches + * OPCODE1() of the "short" jmp which checks the same condition. */ - if (correction) - *correction += 4; + opc1 = OPCODE2(insn) - 0x10; + default: + if (!is_cond_jmp_opcode(opc1)) + return -ENOSYS; } + + auprobe->branch.opc1 = opc1; + auprobe->branch.ilen = insn->length; + auprobe->branch.offs = insn->immediate.value; + + auprobe->ops = &branch_xol_ops; + return 0; } -#else -static void -handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction) + +/** + * arch_uprobe_analyze_insn - instruction analysis including validity and fixups. + * @mm: the probed address space. + * @arch_uprobe: the probepoint information. + * @addr: virtual address at which to install the probepoint + * Return 0 on success or a -ve number on error. + */ +int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr) +{ + struct insn insn; + bool fix_ip = true, fix_call = false; + int ret; + + ret = validate_insn_bits(auprobe, mm, &insn); + if (ret) + return ret; + + ret = branch_setup_xol_ops(auprobe, &insn); + if (ret != -ENOSYS) + return ret; + + /* + * Figure out which fixups arch_uprobe_post_xol() will need to perform, + * and annotate arch_uprobe->fixups accordingly. To start with, ->fixups + * is either zero or it reflects rip-related fixups. 
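branch_setup_xol_ops() above relies on two x86 encoding facts: a two-byte near conditional jump 0f 8x tests the same condition as the one-byte short form 7x (hence the OPCODE2() - 0x10 mapping), and every condition is a simple predicate over EFLAGS, which is what the CASE_COND table encodes. A minimal self-contained sketch of that predicate, using the standard flag bit values (the FL_* names and the sample scenario are illustrative only, not part of the patch):

    #include <stdbool.h>
    #include <stdint.h>

    #define FL_CF 0x0001
    #define FL_ZF 0x0040
    #define FL_SF 0x0080
    #define FL_OF 0x0800

    /* Is the short-form Jcc opcode (0x70..0x7f) taken for these flags? */
    static bool jcc_taken(uint8_t opc, unsigned long flags)
    {
            bool cond;

            switch (opc & ~1) {     /* odd opcodes negate the even ones */
            case 0x70: cond = flags & FL_OF; break;
            case 0x72: cond = flags & FL_CF; break;
            case 0x74: cond = flags & FL_ZF; break;
            case 0x76: cond = (flags & FL_CF) || (flags & FL_ZF); break;
            case 0x78: cond = flags & FL_SF; break;
            case 0x7c: cond = !(flags & FL_SF) != !(flags & FL_OF); break;
            case 0x7e: cond = (flags & FL_ZF) ||
                              (!(flags & FL_SF) != !(flags & FL_OF)); break;
            default:   cond = false; break;  /* 0x7a/0x7b (PF) omitted */
            }
            return (opc & 1) ? !cond : cond;
    }

For example, a 6-byte "je rel32" (0f 84) maps to opc1 = 0x84 - 0x10 = 0x74; with ZF set the emulation advances ip by 6 plus the jump offset, with ZF clear by 6 only.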
+ */ + switch (OPCODE1(&insn)) { + case 0x9d: /* popf */ + auprobe->fixups |= UPROBE_FIX_SETF; + break; + case 0xc3: /* ret or lret -- ip is correct */ + case 0xcb: + case 0xc2: + case 0xca: + fix_ip = false; + break; + case 0x9a: /* call absolute - Fix return addr, not ip */ + fix_call = true; + fix_ip = false; + break; + case 0xea: /* jmp absolute -- ip is correct */ + fix_ip = false; + break; + case 0xff: + insn_get_modrm(&insn); + switch (MODRM_REG(&insn)) { + case 2: case 3: /* call or lcall, indirect */ + fix_call = true; + case 4: case 5: /* jmp or ljmp, indirect */ + fix_ip = false; + } + /* fall through */ + default: + handle_riprel_insn(auprobe, &insn); + } + + if (fix_ip) + auprobe->fixups |= UPROBE_FIX_IP; + if (fix_call) + auprobe->fixups |= UPROBE_FIX_CALL; + + auprobe->ops = &default_xol_ops; + return 0; +} + +/* + * arch_uprobe_pre_xol - prepare to execute out of line. + * @auprobe: the probepoint information. + * @regs: reflects the saved user state of current task. + */ +int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { - /* No RIP-relative addressing on 32-bit */ + struct uprobe_task *utask = current->utask; + + regs->ip = utask->xol_vaddr; + utask->autask.saved_trap_nr = current->thread.trap_nr; + current->thread.trap_nr = UPROBE_TRAP_NR; + + utask->autask.saved_tf = !!(regs->flags & X86_EFLAGS_TF); + regs->flags |= X86_EFLAGS_TF; + if (test_tsk_thread_flag(current, TIF_BLOCKSTEP)) + set_task_blockstep(current, false); + + if (auprobe->ops->pre_xol) + return auprobe->ops->pre_xol(auprobe, regs); + return 0; } -#endif /* * If xol insn itself traps and generates a signal(Say, @@ -592,22 +752,25 @@ bool arch_uprobe_xol_was_trapped(struct task_struct *t) */ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { - struct uprobe_task *utask; - long correction; - int result = 0; + struct uprobe_task *utask = current->utask; WARN_ON_ONCE(current->thread.trap_nr != UPROBE_TRAP_NR); - utask = current->utask; - current->thread.trap_nr = utask->autask.saved_trap_nr; - correction = (long)(utask->vaddr - utask->xol_vaddr); - handle_riprel_post_xol(auprobe, regs, &correction); - if (auprobe->fixups & UPROBE_FIX_IP) - regs->ip += correction; - - if (auprobe->fixups & UPROBE_FIX_CALL) - result = adjust_ret_addr(regs->sp, correction); + if (auprobe->ops->post_xol) { + int err = auprobe->ops->post_xol(auprobe, regs); + if (err) { + arch_uprobe_abort_xol(auprobe, regs); + /* + * Restart the probed insn. ->post_xol() must ensure + * this is really possible if it returns -ERESTART. + */ + if (err == -ERESTART) + return 0; + return err; + } + } + current->thread.trap_nr = utask->autask.saved_trap_nr; /* * arch_uprobe_pre_xol() doesn't save the state of TIF_BLOCKSTEP * so we can get an extra SIGTRAP if we do not clear TF. We need @@ -618,7 +781,7 @@ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) else if (!(auprobe->fixups & UPROBE_FIX_SETF)) regs->flags &= ~X86_EFLAGS_TF; - return result; + return 0; } /* callback routine for handling exceptions. */ @@ -652,8 +815,9 @@ int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, /* * This function gets called when XOL instruction either gets trapped or - * the thread has a fatal signal, so reset the instruction pointer to its - * probed address. + * the thread has a fatal signal, or if arch_uprobe_post_xol() failed. + * Reset the instruction pointer to its probed address for the potential + * restart or for post mortem analysis. 
*/ void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { @@ -668,25 +832,10 @@ void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) regs->flags &= ~X86_EFLAGS_TF; } -/* - * Skip these instructions as per the currently known x86 ISA. - * rep=0x66*; nop=0x90 - */ static bool __skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) { - int i; - - for (i = 0; i < MAX_UINSN_BYTES; i++) { - if (auprobe->insn[i] == 0x66) - continue; - - if (auprobe->insn[i] == 0x90) { - regs->ip += i + 1; - return true; - } - - break; - } + if (auprobe->ops->emulate) + return auprobe->ops->emulate(auprobe, regs); return false; } @@ -701,23 +850,21 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) unsigned long arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs) { - int rasize, ncopied; + int rasize = sizeof_long(), nleft; unsigned long orig_ret_vaddr = 0; /* clear high bits for 32-bit apps */ - rasize = is_ia32_task() ? 4 : 8; - ncopied = copy_from_user(&orig_ret_vaddr, (void __user *)regs->sp, rasize); - if (unlikely(ncopied)) + if (copy_from_user(&orig_ret_vaddr, (void __user *)regs->sp, rasize)) return -1; /* check whether address has been already hijacked */ if (orig_ret_vaddr == trampoline_vaddr) return orig_ret_vaddr; - ncopied = copy_to_user((void __user *)regs->sp, &trampoline_vaddr, rasize); - if (likely(!ncopied)) + nleft = copy_to_user((void __user *)regs->sp, &trampoline_vaddr, rasize); + if (likely(!nleft)) return orig_ret_vaddr; - if (ncopied != rasize) { + if (nleft != rasize) { pr_err("uprobe: return address clobbered: pid=%d, %%sp=%#lx, " "%%ip=%#lx\n", current->pid, regs->sp, regs->ip); diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c index f6584a90aba3..b99b9ad8540c 100644 --- a/arch/x86/kernel/vsmp_64.c +++ b/arch/x86/kernel/vsmp_64.c @@ -26,6 +26,9 @@ #define TOPOLOGY_REGISTER_OFFSET 0x10 +/* Flag below is initialized once during vSMP PCI initialization. */ +static int irq_routing_comply = 1; + #if defined CONFIG_PCI && defined CONFIG_PARAVIRT /* * Interrupt control on vSMPowered systems: @@ -33,7 +36,7 @@ * and vice versa. 
*/ -asmlinkage unsigned long vsmp_save_fl(void) +asmlinkage __visible unsigned long vsmp_save_fl(void) { unsigned long flags = native_save_fl(); @@ -53,7 +56,7 @@ __visible void vsmp_restore_fl(unsigned long flags) } PV_CALLEE_SAVE_REGS_THUNK(vsmp_restore_fl); -asmlinkage void vsmp_irq_disable(void) +asmlinkage __visible void vsmp_irq_disable(void) { unsigned long flags = native_save_fl(); @@ -61,7 +64,7 @@ asmlinkage void vsmp_irq_disable(void) } PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_disable); -asmlinkage void vsmp_irq_enable(void) +asmlinkage __visible void vsmp_irq_enable(void) { unsigned long flags = native_save_fl(); @@ -101,6 +104,10 @@ static void __init set_vsmp_pv_ops(void) #ifdef CONFIG_SMP if (cap & ctl & BIT(8)) { ctl &= ~BIT(8); + + /* Interrupt routing set to ignore */ + irq_routing_comply = 0; + #ifdef CONFIG_PROC_FS /* Don't let users change irq affinity via procfs */ no_irq_affinity = 1; @@ -218,7 +225,9 @@ static void vsmp_apic_post_init(void) { /* need to update phys_pkg_id */ apic->phys_pkg_id = apicid_phys_pkg_id; - apic->vector_allocation_domain = fill_vector_allocation_domain; + + if (!irq_routing_comply) + apic->vector_allocation_domain = fill_vector_allocation_domain; } void __init vsmp_init(void) diff --git a/arch/x86/kernel/vsyscall_gtod.c b/arch/x86/kernel/vsyscall_gtod.c index f9c6e56e14b5..9531fbb123ba 100644 --- a/arch/x86/kernel/vsyscall_gtod.c +++ b/arch/x86/kernel/vsyscall_gtod.c @@ -43,7 +43,7 @@ void update_vsyscall(struct timekeeper *tk) vdata->monotonic_time_sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; vdata->monotonic_time_snsec = tk->xtime_nsec - + (tk->wall_to_monotonic.tv_nsec + + ((u64)tk->wall_to_monotonic.tv_nsec << tk->shift); while (vdata->monotonic_time_snsec >= (((u64)NSEC_PER_SEC) << tk->shift)) { diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index bea60671ef8a..f47a104a749c 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -308,7 +308,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, const u32 kvm_supported_word9_x86_features = F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | - F(ADX); + F(ADX) | F(SMAP); /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index a2a1bb7ed8c1..eeecbed26ac7 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -48,6 +48,14 @@ static inline bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu) return best && (best->ebx & bit(X86_FEATURE_SMEP)); } +static inline bool guest_cpuid_has_smap(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->ebx & bit(X86_FEATURE_SMAP)); +} + static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index f5704d9e5ddc..813d31038b93 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3601,20 +3601,27 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, } } -static void update_permission_bitmask(struct kvm_vcpu *vcpu, +void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, bool ept) { unsigned bit, byte, pfec; u8 map; - bool fault, x, w, u, wf, uf, ff, smep; + bool fault, x, w, u, wf, uf, ff, smapf, cr4_smap, cr4_smep, smap = 0; - smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); + cr4_smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); + cr4_smap = kvm_read_cr4_bits(vcpu, 
X86_CR4_SMAP); for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) { pfec = byte << 1; map = 0; wf = pfec & PFERR_WRITE_MASK; uf = pfec & PFERR_USER_MASK; ff = pfec & PFERR_FETCH_MASK; + /* + * PFERR_RSVD_MASK bit is set in PFEC if the access is not + * subject to SMAP restrictions, and cleared otherwise. The + * bit is only meaningful if the SMAP bit is set in CR4. + */ + smapf = !(pfec & PFERR_RSVD_MASK); for (bit = 0; bit < 8; ++bit) { x = bit & ACC_EXEC_MASK; w = bit & ACC_WRITE_MASK; @@ -3626,12 +3633,33 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, /* Allow supervisor writes if !cr0.wp */ w |= !is_write_protection(vcpu) && !uf; /* Disallow supervisor fetches of user code if cr4.smep */ - x &= !(smep && u && !uf); + x &= !(cr4_smep && u && !uf); + + /* + * SMAP:kernel-mode data accesses from user-mode + * mappings should fault. A fault is considered + * as a SMAP violation if all of the following + * conditions are ture: + * - X86_CR4_SMAP is set in CR4 + * - An user page is accessed + * - Page fault in kernel mode + * - if CPL = 3 or X86_EFLAGS_AC is clear + * + * Here, we cover the first three conditions. + * The fourth is computed dynamically in + * permission_fault() and is in smapf. + * + * Also, SMAP does not affect instruction + * fetches, add the !ff check here to make it + * clearer. + */ + smap = cr4_smap && u && !uf && !ff; } else /* Not really needed: no U/S accesses on ept */ u = 1; - fault = (ff && !x) || (uf && !u) || (wf && !w); + fault = (ff && !x) || (uf && !u) || (wf && !w) || + (smapf && smap); map |= fault << bit; } mmu->permissions[byte] = map; diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 292615274358..3842e70bdb7c 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -44,11 +44,17 @@ #define PT_DIRECTORY_LEVEL 2 #define PT_PAGE_TABLE_LEVEL 1 -#define PFERR_PRESENT_MASK (1U << 0) -#define PFERR_WRITE_MASK (1U << 1) -#define PFERR_USER_MASK (1U << 2) -#define PFERR_RSVD_MASK (1U << 3) -#define PFERR_FETCH_MASK (1U << 4) +#define PFERR_PRESENT_BIT 0 +#define PFERR_WRITE_BIT 1 +#define PFERR_USER_BIT 2 +#define PFERR_RSVD_BIT 3 +#define PFERR_FETCH_BIT 4 + +#define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT) +#define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT) +#define PFERR_USER_MASK (1U << PFERR_USER_BIT) +#define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT) +#define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT) int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask); @@ -73,6 +79,8 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, bool execonly); +void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + bool ept); static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) { @@ -110,10 +118,30 @@ static inline bool is_write_protection(struct kvm_vcpu *vcpu) * Will a fault with a given page-fault error code (pfec) cause a permission * fault with the given access (in ACC_* format)? 
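A worked example of the SMAP arithmetic in this hunk (the scenario is made up; the bit positions are the architectural ones, and the lookup-side shift appears just below in permission_fault()):

    /*
     * CR4.SMAP = 1, CPL = 0, EFLAGS.AC = 0, kernel data read of a user
     * page: pfec has USER/WRITE/FETCH clear, so the table entry built
     * above has smapf = 1 and smap = 1 for user pages, and the access
     * faults.
     *
     * Same access with EFLAGS.AC = 1: X86_EFLAGS_AC is bit 18 and
     * PFERR_RSVD_BIT is 3, so shifting right by 18 - 3 + 1 = 16 turns
     * 0x40000 into 0x4, which is exactly the PFERR_RSVD position within
     * the (pfec >> 1) index.  That selects a table entry built with
     * smapf = 0, so no SMAP fault is reported.
     */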
*/ -static inline bool permission_fault(struct kvm_mmu *mmu, unsigned pte_access, - unsigned pfec) +static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + unsigned pte_access, unsigned pfec) { - return (mmu->permissions[pfec >> 1] >> pte_access) & 1; + int cpl = kvm_x86_ops->get_cpl(vcpu); + unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); + + /* + * If CPL < 3, SMAP prevention are disabled if EFLAGS.AC = 1. + * + * If CPL = 3, SMAP applies to all supervisor-mode data accesses + * (these are implicit supervisor accesses) regardless of the value + * of EFLAGS.AC. + * + * This computes (cpl < 3) && (rflags & X86_EFLAGS_AC), leaving + * the result in X86_EFLAGS_AC. We then insert it in place of + * the PFERR_RSVD_MASK bit; this bit will always be zero in pfec, + * but it will be one in index if SMAP checks are being overridden. + * It is important to keep this branchless. + */ + unsigned long smap = (cpl - 3) & (rflags & X86_EFLAGS_AC); + int index = (pfec >> 1) + + (smap >> (X86_EFLAGS_AC_BIT - PFERR_RSVD_BIT + 1)); + + return (mmu->permissions[index] >> pte_access) & 1; } void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index b1e6c1bf68d3..123efd3ec29f 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -353,7 +353,7 @@ retry_walk: walker->ptes[walker->level - 1] = pte; } while (!is_last_gpte(mmu, walker->level, pte)); - if (unlikely(permission_fault(mmu, pte_access, access))) { + if (unlikely(permission_fault(vcpu, mmu, pte_access, access))) { errcode |= PFERR_PRESENT_MASK; goto error; } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1320e0f8e611..138ceffc6377 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -503,7 +503,7 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) [number##_HIGH] = VMCS12_OFFSET(name)+4 -static const unsigned long shadow_read_only_fields[] = { +static unsigned long shadow_read_only_fields[] = { /* * We do NOT shadow fields that are modified when L0 * traps and emulates any vmx instruction (e.g. 
VMPTRLD, @@ -526,10 +526,10 @@ static const unsigned long shadow_read_only_fields[] = { GUEST_LINEAR_ADDRESS, GUEST_PHYSICAL_ADDRESS }; -static const int max_shadow_read_only_fields = +static int max_shadow_read_only_fields = ARRAY_SIZE(shadow_read_only_fields); -static const unsigned long shadow_read_write_fields[] = { +static unsigned long shadow_read_write_fields[] = { GUEST_RIP, GUEST_RSP, GUEST_CR0, @@ -558,7 +558,7 @@ static const unsigned long shadow_read_write_fields[] = { HOST_FS_SELECTOR, HOST_GS_SELECTOR }; -static const int max_shadow_read_write_fields = +static int max_shadow_read_write_fields = ARRAY_SIZE(shadow_read_write_fields); static const unsigned short vmcs_field_to_offset_table[] = { @@ -3009,6 +3009,41 @@ static void free_kvm_area(void) } } +static void init_vmcs_shadow_fields(void) +{ + int i, j; + + /* No checks for read only fields yet */ + + for (i = j = 0; i < max_shadow_read_write_fields; i++) { + switch (shadow_read_write_fields[i]) { + case GUEST_BNDCFGS: + if (!vmx_mpx_supported()) + continue; + break; + default: + break; + } + + if (j < i) + shadow_read_write_fields[j] = + shadow_read_write_fields[i]; + j++; + } + max_shadow_read_write_fields = j; + + /* shadowed fields guest access without vmexit */ + for (i = 0; i < max_shadow_read_write_fields; i++) { + clear_bit(shadow_read_write_fields[i], + vmx_vmwrite_bitmap); + clear_bit(shadow_read_write_fields[i], + vmx_vmread_bitmap); + } + for (i = 0; i < max_shadow_read_only_fields; i++) + clear_bit(shadow_read_only_fields[i], + vmx_vmread_bitmap); +} + static __init int alloc_kvm_area(void) { int cpu; @@ -3039,6 +3074,8 @@ static __init int hardware_setup(void) enable_vpid = 0; if (!cpu_has_vmx_shadow_vmcs()) enable_shadow_vmcs = 0; + if (enable_shadow_vmcs) + init_vmcs_shadow_fields(); if (!cpu_has_vmx_ept() || !cpu_has_vmx_ept_4levels()) { @@ -3484,13 +3521,14 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) hw_cr4 &= ~X86_CR4_PAE; hw_cr4 |= X86_CR4_PSE; /* - * SMEP is disabled if CPU is in non-paging mode in - * hardware. However KVM always uses paging mode to + * SMEP/SMAP is disabled if CPU is in non-paging mode + * in hardware. However KVM always uses paging mode to * emulate guest non-paging mode with TDP. - * To emulate this behavior, SMEP needs to be manually - * disabled when guest switches to non-paging mode. + * To emulate this behavior, SMEP/SMAP needs to be + * manually disabled when guest switches to non-paging + * mode. 
*/ - hw_cr4 &= ~X86_CR4_SMEP; + hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP); } else if (!(cr4 & X86_CR4_PAE)) { hw_cr4 &= ~X86_CR4_PAE; } @@ -7740,7 +7778,8 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) exec_control = vmcs12->pin_based_vm_exec_control; exec_control |= vmcs_config.pin_based_exec_ctrl; - exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + exec_control &= ~(PIN_BASED_VMX_PREEMPTION_TIMER | + PIN_BASED_POSTED_INTR); vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control); vmx->nested.preemption_timer_expired = false; @@ -7777,7 +7816,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) if (!vmx->rdtscp_enabled) exec_control &= ~SECONDARY_EXEC_RDTSCP; /* Take the following fields only from vmcs12 */ - exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | + SECONDARY_EXEC_APIC_REGISTER_VIRT); if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) exec_control |= vmcs12->secondary_vm_exec_control; @@ -8802,14 +8843,6 @@ static int __init vmx_init(void) memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); - /* shadowed read/write fields */ - for (i = 0; i < max_shadow_read_write_fields; i++) { - clear_bit(shadow_read_write_fields[i], vmx_vmwrite_bitmap); - clear_bit(shadow_read_write_fields[i], vmx_vmread_bitmap); - } - /* shadowed read only fields */ - for (i = 0; i < max_shadow_read_only_fields; i++) - clear_bit(shadow_read_only_fields[i], vmx_vmread_bitmap); /* * Allow direct access to the PC debug port (it is often used for I/O diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9d1b5cd4d34c..20316c67b824 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -106,6 +106,8 @@ EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); static u32 tsc_tolerance_ppm = 250; module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); +static bool backwards_tsc_observed = false; + #define KVM_NR_SHARED_MSRS 16 struct kvm_shared_msrs_global { @@ -280,7 +282,7 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } EXPORT_SYMBOL_GPL(kvm_set_apic_base); -asmlinkage void kvm_spurious_fault(void) +asmlinkage __visible void kvm_spurious_fault(void) { /* Fault while not rebooting. We want the trace. 
*/ BUG(); @@ -652,6 +654,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (!guest_cpuid_has_smep(vcpu) && (cr4 & X86_CR4_SMEP)) return 1; + if (!guest_cpuid_has_smap(vcpu) && (cr4 & X86_CR4_SMAP)) + return 1; + if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_FSGSBASE)) return 1; @@ -680,6 +685,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE))) kvm_mmu_reset_context(vcpu); + if ((cr4 ^ old_cr4) & X86_CR4_SMAP) + update_permission_bitmask(vcpu, vcpu->arch.walk_mmu, false); + if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE) kvm_update_cpuid(vcpu); @@ -1117,7 +1125,6 @@ static inline u64 get_kernel_ns(void) { struct timespec ts; - WARN_ON(preemptible()); ktime_get_ts(&ts); monotonic_to_bootbased(&ts); return timespec_to_ns(&ts); @@ -1481,7 +1488,8 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm) &ka->master_kernel_ns, &ka->master_cycle_now); - ka->use_master_clock = host_tsc_clocksource & vcpus_matched; + ka->use_master_clock = host_tsc_clocksource && vcpus_matched + && !backwards_tsc_observed; if (ka->use_master_clock) atomic_set(&kvm_guest_has_master_clock, 1); @@ -4164,7 +4172,8 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, | (write ? PFERR_WRITE_MASK : 0); if (vcpu_match_mmio_gva(vcpu, gva) - && !permission_fault(vcpu->arch.walk_mmu, vcpu->arch.access, access)) { + && !permission_fault(vcpu, vcpu->arch.walk_mmu, + vcpu->arch.access, access)) { *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT | (gva & (PAGE_SIZE - 1)); trace_vcpu_match_mmio(gva, *gpa, write, false); @@ -6939,6 +6948,7 @@ int kvm_arch_hardware_enable(void *garbage) */ if (backwards_tsc) { u64 delta_cyc = max_tsc - local_tsc; + backwards_tsc_observed = true; list_for_each_entry(kvm, &vm_list, vm_list) { kvm_for_each_vcpu(i, vcpu, kvm) { vcpu->arch.tsc_offset_adjustment += delta_cyc; diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index ad1fb5f53925..aae94132bc24 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -233,13 +233,13 @@ static void lguest_end_context_switch(struct task_struct *next) * flags word contains all kind of stuff, but in practice Linux only cares * about the interrupt flag. Our "save_flags()" just returns that. */ -asmlinkage unsigned long lguest_save_fl(void) +asmlinkage __visible unsigned long lguest_save_fl(void) { return lguest_data.irq_enabled; } /* Interrupts go off... */ -asmlinkage void lguest_irq_disable(void) +asmlinkage __visible void lguest_irq_disable(void) { lguest_data.irq_enabled = 0; } diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index eabcb6e6a900..4d4f96a27638 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -16,7 +16,7 @@ clean-files := inat-tables.c obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o -lib-y := delay.o misc.o +lib-y := delay.o misc.o cmdline.o lib-y += thunk_$(BITS).o lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o lib-y += memcpy_$(BITS).o diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c new file mode 100644 index 000000000000..422db000d727 --- /dev/null +++ b/arch/x86/lib/cmdline.c @@ -0,0 +1,84 @@ +/* + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2. + * + * Misc librarized functions for cmdline poking. 
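The new arch/x86/lib/cmdline.c below adds cmdline_find_option_bool(), which returns the 1-based position of a bare option word on the command line, 0 when the option is absent, and -1 for a NULL command line. A hypothetical caller (the pairing with boot_hpet_disable is illustrative only; the patch does not show the actual user):

    /* Hypothetical early-boot check, e.g. in HPET setup code. */
    if (cmdline_find_option_bool(boot_command_line, "nohpet") > 0)
            boot_hpet_disable = 1;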
+ */ +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/ctype.h> +#include <asm/setup.h> + +static inline int myisspace(u8 c) +{ + return c <= ' '; /* Close enough approximation */ +} + +/** + * Find a boolean option (like quiet,noapic,nosmp....) + * + * @cmdline: the cmdline string + * @option: option string to look for + * + * Returns the position of that @option (starts counting with 1) + * or 0 on not found. + */ +int cmdline_find_option_bool(const char *cmdline, const char *option) +{ + char c; + int len, pos = 0, wstart = 0; + const char *opptr = NULL; + enum { + st_wordstart = 0, /* Start of word/after whitespace */ + st_wordcmp, /* Comparing this word */ + st_wordskip, /* Miscompare, skip */ + } state = st_wordstart; + + if (!cmdline) + return -1; /* No command line */ + + len = min_t(int, strlen(cmdline), COMMAND_LINE_SIZE); + if (!len) + return 0; + + while (len--) { + c = *(char *)cmdline++; + pos++; + + switch (state) { + case st_wordstart: + if (!c) + return 0; + else if (myisspace(c)) + break; + + state = st_wordcmp; + opptr = option; + wstart = pos; + /* fall through */ + + case st_wordcmp: + if (!*opptr) + if (!c || myisspace(c)) + return wstart; + else + state = st_wordskip; + else if (!c) + return 0; + else if (c != *opptr++) + state = st_wordskip; + else if (!len) /* last word and is matching */ + return wstart; + break; + + case st_wordskip: + if (!c) + return 0; + else if (myisspace(c)) + state = st_wordstart; + break; + } + } + + return 0; /* Buffer overrun */ +} diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c index db9db446b71a..43623739c7cf 100644 --- a/arch/x86/lib/msr.c +++ b/arch/x86/lib/msr.c @@ -76,7 +76,7 @@ static inline int __flip_bit(u32 msr, u8 bit, bool set) if (m1.q == m.q) return 0; - err = msr_write(msr, &m); + err = msr_write(msr, &m1); if (err) return err; diff --git a/arch/x86/math-emu/errors.c b/arch/x86/math-emu/errors.c index a5449089cd9f..9e6545f269e5 100644 --- a/arch/x86/math-emu/errors.c +++ b/arch/x86/math-emu/errors.c @@ -302,7 +302,7 @@ static struct { 0x242 in div_Xsig.S */ -asmlinkage void FPU_exception(int n) +asmlinkage __visible void FPU_exception(int n) { int i, int_type; @@ -492,7 +492,7 @@ int real_2op_NaN(FPU_REG const *b, u_char tagb, /* Invalid arith operation on Valid registers */ /* Returns < 0 if the exception is unmasked */ -asmlinkage int arith_invalid(int deststnr) +asmlinkage __visible int arith_invalid(int deststnr) { EXCEPTION(EX_Invalid); @@ -507,7 +507,7 @@ asmlinkage int arith_invalid(int deststnr) } /* Divide a finite number by zero */ -asmlinkage int FPU_divide_by_zero(int deststnr, u_char sign) +asmlinkage __visible int FPU_divide_by_zero(int deststnr, u_char sign) { FPU_REG *dest = &st(deststnr); int tag = TAG_Valid; @@ -539,7 +539,7 @@ int set_precision_flag(int flags) } /* This may be called often, so keep it lean */ -asmlinkage void set_precision_flag_up(void) +asmlinkage __visible void set_precision_flag_up(void) { if (control_word & CW_Precision) partial_status |= (SW_Precision | SW_C1); /* The masked response */ @@ -548,7 +548,7 @@ asmlinkage void set_precision_flag_up(void) } /* This may be called often, so keep it lean */ -asmlinkage void set_precision_flag_down(void) +asmlinkage __visible void set_precision_flag_down(void) { if (control_word & CW_Precision) { /* The masked response */ partial_status &= ~SW_C1; @@ -557,7 +557,7 @@ asmlinkage void set_precision_flag_down(void) EXCEPTION(EX_Precision); } -asmlinkage int denormal_operand(void) +asmlinkage __visible int 
denormal_operand(void) { if (control_word & CW_Denormal) { /* The masked response */ partial_status |= SW_Denorm_Op; @@ -568,7 +568,7 @@ asmlinkage int denormal_operand(void) } } -asmlinkage int arith_overflow(FPU_REG *dest) +asmlinkage __visible int arith_overflow(FPU_REG *dest) { int tag = TAG_Valid; @@ -596,7 +596,7 @@ asmlinkage int arith_overflow(FPU_REG *dest) } -asmlinkage int arith_underflow(FPU_REG *dest) +asmlinkage __visible int arith_underflow(FPU_REG *dest) { int tag = TAG_Valid; diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 597ac155c91c..bc7527e109c8 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -50,6 +50,21 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size, return err; } +static int __ioremap_check_ram(unsigned long start_pfn, unsigned long nr_pages, + void *arg) +{ + unsigned long i; + + for (i = 0; i < nr_pages; ++i) + if (pfn_valid(start_pfn + i) && + !PageReserved(pfn_to_page(start_pfn + i))) + return 1; + + WARN_ONCE(1, "ioremap on RAM pfn 0x%lx\n", start_pfn); + + return 0; +} + /* * Remap an arbitrary physical address space into the kernel virtual * address space. Needed when the kernel wants to access high addresses @@ -93,14 +108,11 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, /* * Don't allow anybody to remap normal RAM that we're using.. */ + pfn = phys_addr >> PAGE_SHIFT; last_pfn = last_addr >> PAGE_SHIFT; - for (pfn = phys_addr >> PAGE_SHIFT; pfn <= last_pfn; pfn++) { - int is_ram = page_is_ram(pfn); - - if (is_ram && pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn))) - return NULL; - WARN_ON_ONCE(is_ram); - } + if (walk_system_ram_range(pfn, last_pfn - pfn + 1, NULL, + __ioremap_check_ram) == 1) + return NULL; /* * Mappings have to be page-aligned diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index c96314abd144..0004ac72dbdd 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -399,13 +399,20 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma, int ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { - int young; - - young = ptep_test_and_clear_young(vma, address, ptep); - if (young) - flush_tlb_page(vma, address); - - return young; + /* + * On x86 CPUs, clearing the accessed bit without a TLB flush + * doesn't cause data corruption. [ It could cause incorrect + * page aging and the (mistaken) reclaim of hot pages, but the + * chance of that should be relatively low. ] + * + * So as a performance optimization don't flush the TLB when + * clearing the accessed bit, it will eventually be flushed by + * a context switch or a VM operation anyway. [ In the rare + * event of it not getting flushed for a long time the delay + * shouldn't really matter because there's no real memory + * pressure for swapout to react to. 
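The reworked __ioremap_check_ram() in arch/x86/mm/ioremap.c above replaces an open-coded pfn loop with walk_system_ram_range(), which calls a callback once per chunk of "System RAM" overlapping the range and stops as soon as the callback returns non-zero. A rough sketch of that callback pattern (hypothetical helper names, not part of the patch):

    #include <linux/ioport.h>
    #include <linux/mm.h>

    /* Count how many pages of a physical range are backed by System RAM. */
    static int count_ram_pages(unsigned long start_pfn, unsigned long nr_pages,
                               void *arg)
    {
            *(unsigned long *)arg += nr_pages;
            return 0;               /* zero keeps the walk going */
    }

    static unsigned long ram_pages_in_range(resource_size_t phys,
                                            unsigned long size)
    {
            unsigned long count = 0;

            walk_system_ram_range(phys >> PAGE_SHIFT, size >> PAGE_SHIFT,
                                  &count, count_ram_pages);
            return count;
    }

__ioremap_check_ram() itself returns 1 on the first usable RAM page it sees, which is why the caller treats a walk result of 1 as a reason to refuse the mapping.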
] + */ + return ptep_test_and_clear_young(vma, address, ptep); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index dc017735bb91..6d5663a599a7 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -171,7 +171,7 @@ static struct bpf_binary_header *bpf_alloc_binary(unsigned int proglen, memset(header, 0xcc, sz); /* fill whole space with int3 instructions */ header->pages = sz / PAGE_SIZE; - hole = sz - (proglen + sizeof(*header)); + hole = min(sz - (proglen + sizeof(*header)), PAGE_SIZE - sizeof(*header)); /* insert a random number of int3 instructions before BPF code */ *image_ptr = &header->image[prandom_u32() % hole]; diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 01edac6c5e18..5075371ab593 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -489,8 +489,12 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) } node = acpi_get_node(device->handle); - if (node == NUMA_NO_NODE) + if (node == NUMA_NO_NODE) { node = x86_pci_root_bus_node(busnum); + if (node != 0 && node != NUMA_NO_NODE) + dev_info(&device->dev, FW_BUG "no _PXM; falling back to node %d from hardware (may be inconsistent with ACPI node numbers)\n", + node); + } if (node != NUMA_NO_NODE && !node_online(node)) node = NUMA_NO_NODE; diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index e88f4c53d7f6..c20d2cc7ef64 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -11,27 +11,33 @@ #include "bus_numa.h" -/* - * This discovers the pcibus <-> node mapping on AMD K8. - * also get peer root bus resource for io,mmio - */ +#define AMD_NB_F0_NODE_ID 0x60 +#define AMD_NB_F0_UNIT_ID 0x64 +#define AMD_NB_F1_CONFIG_MAP_REG 0xe0 + +#define RANGE_NUM 16 +#define AMD_NB_F1_CONFIG_MAP_RANGES 4 -struct pci_hostbridge_probe { +struct amd_hostbridge { u32 bus; u32 slot; - u32 vendor; u32 device; }; -static struct pci_hostbridge_probe pci_probes[] __initdata = { - { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1100 }, - { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1200 }, - { 0xff, 0, PCI_VENDOR_ID_AMD, 0x1200 }, - { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1300 }, +/* + * IMPORTANT NOTE: + * hb_probes[] and early_root_info_init() is in maintenance mode. + * It only supports K8, Fam10h, Fam11h, and Fam15h_00h-0fh . + * Future processor will rely on information in ACPI. + */ +static struct amd_hostbridge hb_probes[] __initdata = { + { 0, 0x18, 0x1100 }, /* K8 */ + { 0, 0x18, 0x1200 }, /* Family10h */ + { 0xff, 0, 0x1200 }, /* Family10h */ + { 0, 0x18, 0x1300 }, /* Family11h */ + { 0, 0x18, 0x1600 }, /* Family15h */ }; -#define RANGE_NUM 16 - static struct pci_root_info __init *find_pci_root_info(int node, int link) { struct pci_root_info *info; @@ -45,12 +51,12 @@ static struct pci_root_info __init *find_pci_root_info(int node, int link) } /** - * early_fill_mp_bus_to_node() + * early_root_info_init() * called before pcibios_scan_root and pci_scan_bus - * fills the mp_bus_to_cpumask array based according to the LDT Bus Number - * Registers found in the K8 northbridge + * fills the mp_bus_to_cpumask array based according + * to the LDT Bus Number Registers found in the northbridge. 
*/ -static int __init early_fill_mp_bus_info(void) +static int __init early_root_info_init(void) { int i; unsigned bus; @@ -75,19 +81,21 @@ static int __init early_fill_mp_bus_info(void) return -1; found = false; - for (i = 0; i < ARRAY_SIZE(pci_probes); i++) { + for (i = 0; i < ARRAY_SIZE(hb_probes); i++) { u32 id; u16 device; u16 vendor; - bus = pci_probes[i].bus; - slot = pci_probes[i].slot; + bus = hb_probes[i].bus; + slot = hb_probes[i].slot; id = read_pci_config(bus, slot, 0, PCI_VENDOR_ID); - vendor = id & 0xffff; device = (id>>16) & 0xffff; - if (pci_probes[i].vendor == vendor && - pci_probes[i].device == device) { + + if (vendor != PCI_VENDOR_ID_AMD) + continue; + + if (hb_probes[i].device == device) { found = true; break; } @@ -96,10 +104,16 @@ static int __init early_fill_mp_bus_info(void) if (!found) return 0; - for (i = 0; i < 4; i++) { + /* + * We should learn topology and routing information from _PXM and + * _CRS methods in the ACPI namespace. We extract node numbers + * here to work around BIOSes that don't supply _PXM. + */ + for (i = 0; i < AMD_NB_F1_CONFIG_MAP_RANGES; i++) { int min_bus; int max_bus; - reg = read_pci_config(bus, slot, 1, 0xe0 + (i << 2)); + reg = read_pci_config(bus, slot, 1, + AMD_NB_F1_CONFIG_MAP_REG + (i << 2)); /* Check if that register is enabled for bus range */ if ((reg & 7) != 3) @@ -113,10 +127,21 @@ static int __init early_fill_mp_bus_info(void) info = alloc_pci_root_info(min_bus, max_bus, node, link); } + /* + * The following code extracts routing information for use on old + * systems where Linux doesn't automatically use host bridge _CRS + * methods (or when the user specifies "pci=nocrs"). + * + * We only do this through Fam11h, because _CRS should be enough on + * newer systems. + */ + if (boot_cpu_data.x86 > 0x11) + return 0; + /* get the default node and link for left over res */ - reg = read_pci_config(bus, slot, 0, 0x60); + reg = read_pci_config(bus, slot, 0, AMD_NB_F0_NODE_ID); def_node = (reg >> 8) & 0x07; - reg = read_pci_config(bus, slot, 0, 0x64); + reg = read_pci_config(bus, slot, 0, AMD_NB_F0_UNIT_ID); def_link = (reg >> 8) & 0x03; memset(range, 0, sizeof(range)); @@ -363,7 +388,7 @@ static int __init pci_io_ecs_init(void) int cpu; /* assume all cpus from fam10h have IO ECS */ - if (boot_cpu_data.x86 < 0x10) + if (boot_cpu_data.x86 < 0x10) return 0; /* Try the PCI method first. 
*/ @@ -387,7 +412,7 @@ static int __init amd_postcore_init(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) return 0; - early_fill_mp_bus_info(); + early_root_info_init(); pci_io_ecs_init(); return 0; diff --git a/arch/x86/pci/broadcom_bus.c b/arch/x86/pci/broadcom_bus.c index 614392ced7d6..bb461cfd01ab 100644 --- a/arch/x86/pci/broadcom_bus.c +++ b/arch/x86/pci/broadcom_bus.c @@ -60,8 +60,8 @@ static void __init cnb20le_res(u8 bus, u8 slot, u8 func) word1 = read_pci_config_16(bus, slot, func, 0xc4); word2 = read_pci_config_16(bus, slot, func, 0xc6); if (word1 != word2) { - res.start = (word1 << 16) | 0x0000; - res.end = (word2 << 16) | 0xffff; + res.start = ((resource_size_t) word1 << 16) | 0x0000; + res.end = ((resource_size_t) word2 << 16) | 0xffff; res.flags = IORESOURCE_MEM | IORESOURCE_PREFETCH; update_res(info, res.start, res.end, res.flags, 0); } diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 94ae9ae9574f..b5e60268d93f 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -6,6 +6,7 @@ #include <linux/dmi.h> #include <linux/pci.h> #include <linux/vgaarb.h> +#include <asm/hpet.h> #include <asm/pci_x86.h> static void pci_fixup_i450nx(struct pci_dev *d) @@ -337,9 +338,7 @@ static void pci_fixup_video(struct pci_dev *pdev) * type BRIDGE, or CARDBUS. Host to PCI controllers use * PCI header type NORMAL. */ - if (bridge - && ((bridge->hdr_type == PCI_HEADER_TYPE_BRIDGE) - || (bridge->hdr_type == PCI_HEADER_TYPE_CARDBUS))) { + if (bridge && (pci_is_bridge(bridge))) { pci_read_config_word(bridge, PCI_BRIDGE_CONTROL, &config); if (!(config & PCI_BRIDGE_CTL_VGA)) @@ -526,6 +525,19 @@ static void sb600_disable_hpet_bar(struct pci_dev *dev) } DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_ATI, 0x4385, sb600_disable_hpet_bar); +#ifdef CONFIG_HPET_TIMER +static void sb600_hpet_quirk(struct pci_dev *dev) +{ + struct resource *r = &dev->resource[1]; + + if (r->flags & IORESOURCE_MEM && r->start == hpet_address) { + r->flags |= IORESOURCE_PCI_FIXED; + dev_info(&dev->dev, "reg 0x14 contains HPET; making it immovable\n"); + } +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, 0x4385, sb600_hpet_quirk); +#endif + /* * Twinhead H12Y needs us to block out a region otherwise we map devices * there and any access kills the box. 
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index db6b1ab43255..a19ed92e74e4 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -271,11 +271,16 @@ static void pcibios_allocate_dev_resources(struct pci_dev *dev, int pass) "BAR %d: reserving %pr (d=%d, p=%d)\n", idx, r, disabled, pass); if (pci_claim_resource(dev, idx) < 0) { - /* We'll assign a new address later */ - pcibios_save_fw_addr(dev, - idx, r->start); - r->end -= r->start; - r->start = 0; + if (r->flags & IORESOURCE_PCI_FIXED) { + dev_info(&dev->dev, "BAR %d %pR is immovable\n", + idx, r); + } else { + /* We'll assign a new address later */ + pcibios_save_fw_addr(dev, + idx, r->start); + r->end -= r->start; + r->start = 0; + } } } } @@ -356,6 +361,12 @@ static int __init pcibios_assign_resources(void) return 0; } +/** + * called in fs_initcall (one below subsys_initcall), + * give a chance for motherboard reserve resources + */ +fs_initcall(pcibios_assign_resources); + void pcibios_resource_survey_bus(struct pci_bus *bus) { dev_printk(KERN_DEBUG, &bus->dev, "Allocating resources\n"); @@ -392,12 +403,6 @@ void __init pcibios_resource_survey(void) ioapic_insert_resources(); } -/** - * called in fs_initcall (one below subsys_initcall), - * give a chance for motherboard reserve resources - */ -fs_initcall(pcibios_assign_resources); - static const struct vm_operations_struct pci_mmap_ops = { .access = generic_access_phys, }; diff --git a/arch/x86/platform/efi/early_printk.c b/arch/x86/platform/efi/early_printk.c index 81b506d5befd..524142117296 100644 --- a/arch/x86/platform/efi/early_printk.c +++ b/arch/x86/platform/efi/early_printk.c @@ -14,48 +14,92 @@ static const struct font_desc *font; static u32 efi_x, efi_y; +static void *efi_fb; +static bool early_efi_keep; -static __init void early_efi_clear_scanline(unsigned int y) +/* + * efi earlyprintk need use early_ioremap to map the framebuffer. + * But early_ioremap is not usable for earlyprintk=efi,keep, ioremap should + * be used instead. ioremap will be available after paging_init() which is + * earlier than initcall callbacks. Thus adding this early initcall function + * early_efi_map_fb to map the whole efi framebuffer. + */ +static __init int early_efi_map_fb(void) { - unsigned long base, *dst; - u16 len; + unsigned long base, size; + + if (!early_efi_keep) + return 0; base = boot_params.screen_info.lfb_base; - len = boot_params.screen_info.lfb_linelength; + size = boot_params.screen_info.lfb_size; + efi_fb = ioremap(base, size); + + return efi_fb ? 0 : -ENOMEM; +} +early_initcall(early_efi_map_fb); + +/* + * early_efi_map maps efi framebuffer region [start, start + len -1] + * In case earlyprintk=efi,keep we have the whole framebuffer mapped already + * so just return the offset efi_fb + start. 
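The helpers introduced above implement a simple fallback: if the whole framebuffer was mapped once at early_initcall time (the earlyprintk=efi,keep case), return an offset into that persistent mapping; otherwise take a temporary per-call mapping that must be torn down after use. A stripped-down, userspace-only sketch of that pattern, with stub map/unmap functions standing in for ioremap()/early_ioremap() (the real calls are kernel APIs and are not reproduced here):

#include <stdio.h>
#include <string.h>

static unsigned char *persistent_fb;		/* set once for the "keep" case */
static unsigned char fake_device[4096];		/* stands in for the framebuffer */

/* stand-ins for ioremap()/early_ioremap()/early_iounmap() */
static void *map_window(unsigned long start, unsigned long len)
{
	(void)len;
	return fake_device + start;
}

static void unmap_window(void *addr, unsigned long len)
{
	(void)addr;
	(void)len;	/* nothing to do for the stub */
}

static void *fb_map(unsigned long start, unsigned long len)
{
	if (persistent_fb)
		return persistent_fb + start;	/* whole fb already mapped */
	return map_window(start, len);		/* temporary window */
}

static void fb_unmap(void *addr, unsigned long len)
{
	/* persistent mappings stay; only per-call windows are torn down */
	if (!persistent_fb)
		unmap_window(addr, len);
}

int main(void)
{
	unsigned char *line;

	/* the "keep" case: everything mapped up front */
	persistent_fb = fake_device;

	line = fb_map(128, 64);		/* one scanline */
	memset(line, 0, 64);		/* clear it, as the scanline helper does */
	fb_unmap(line, 64);

	printf("scanline cleared at offset 128\n");
	return 0;
}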
+ */ +static __init_refok void *early_efi_map(unsigned long start, unsigned long len) +{ + unsigned long base; + + base = boot_params.screen_info.lfb_base; + + if (efi_fb) + return (efi_fb + start); + else + return early_ioremap(base + start, len); +} - dst = early_ioremap(base + y*len, len); +static __init_refok void early_efi_unmap(void *addr, unsigned long len) +{ + if (!efi_fb) + early_iounmap(addr, len); +} + +static void early_efi_clear_scanline(unsigned int y) +{ + unsigned long *dst; + u16 len; + + len = boot_params.screen_info.lfb_linelength; + dst = early_efi_map(y*len, len); if (!dst) return; memset(dst, 0, len); - early_iounmap(dst, len); + early_efi_unmap(dst, len); } -static __init void early_efi_scroll_up(void) +static void early_efi_scroll_up(void) { - unsigned long base, *dst, *src; + unsigned long *dst, *src; u16 len; u32 i, height; - base = boot_params.screen_info.lfb_base; len = boot_params.screen_info.lfb_linelength; height = boot_params.screen_info.lfb_height; for (i = 0; i < height - font->height; i++) { - dst = early_ioremap(base + i*len, len); + dst = early_efi_map(i*len, len); if (!dst) return; - src = early_ioremap(base + (i + font->height) * len, len); + src = early_efi_map((i + font->height) * len, len); if (!src) { - early_iounmap(dst, len); + early_efi_unmap(dst, len); return; } memmove(dst, src, len); - early_iounmap(src, len); - early_iounmap(dst, len); + early_efi_unmap(src, len); + early_efi_unmap(dst, len); } } @@ -79,16 +123,14 @@ static void early_efi_write_char(u32 *dst, unsigned char c, unsigned int h) } } -static __init void +static void early_efi_write(struct console *con, const char *str, unsigned int num) { struct screen_info *si; - unsigned long base; unsigned int len; const char *s; void *dst; - base = boot_params.screen_info.lfb_base; si = &boot_params.screen_info; len = si->lfb_linelength; @@ -109,7 +151,7 @@ early_efi_write(struct console *con, const char *str, unsigned int num) for (h = 0; h < font->height; h++) { unsigned int n, x; - dst = early_ioremap(base + (efi_y + h) * len, len); + dst = early_efi_map((efi_y + h) * len, len); if (!dst) return; @@ -123,7 +165,7 @@ early_efi_write(struct console *con, const char *str, unsigned int num) s++; } - early_iounmap(dst, len); + early_efi_unmap(dst, len); } num -= count; @@ -179,6 +221,9 @@ static __init int early_efi_setup(struct console *con, char *options) for (i = 0; i < (yres - efi_y) / font->height; i++) early_efi_scroll_up(); + /* early_console_register will unset CON_BOOT in case ,keep */ + if (!(con->flags & CON_BOOT)) + early_efi_keep = true; return 0; } diff --git a/arch/x86/platform/olpc/olpc-xo1-pm.c b/arch/x86/platform/olpc/olpc-xo1-pm.c index ff0174dda810..a9acde72d4ed 100644 --- a/arch/x86/platform/olpc/olpc-xo1-pm.c +++ b/arch/x86/platform/olpc/olpc-xo1-pm.c @@ -75,7 +75,7 @@ static int xo1_power_state_enter(suspend_state_t pm_state) return 0; } -asmlinkage int xo1_do_sleep(u8 sleep_state) +asmlinkage __visible int xo1_do_sleep(u8 sleep_state) { void *pgd_addr = __va(read_cr3()); diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index 304fca20d96e..35e2bb6c0f37 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -23,7 +23,7 @@ extern __visible const void __nosave_begin, __nosave_end; /* Defined in hibernate_asm_64.S */ -extern asmlinkage int restore_image(void); +extern asmlinkage __visible int restore_image(void); /* * Address to jump to in the last phase of restore in order to get to the image diff --git 
a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index 3497f14e4dea..7c0d7be176a5 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -52,8 +52,9 @@ $(obj)/realmode.elf: $(obj)/realmode.lds $(REALMODE_OBJS) FORCE OBJCOPYFLAGS_realmode.bin := -O binary targets += realmode.bin -$(obj)/realmode.bin: $(obj)/realmode.elf $(obj)/realmode.relocs +$(obj)/realmode.bin: $(obj)/realmode.elf $(obj)/realmode.relocs FORCE $(call if_changed,objcopy) + @: quiet_cmd_relocs = RELOCS $@ cmd_relocs = arch/x86/tools/relocs --realmode $< > $@ diff --git a/arch/x86/syscalls/Makefile b/arch/x86/syscalls/Makefile index f325af26107c..3323c2745248 100644 --- a/arch/x86/syscalls/Makefile +++ b/arch/x86/syscalls/Makefile @@ -54,5 +54,7 @@ syshdr-$(CONFIG_X86_64) += syscalls_64.h targets += $(uapisyshdr-y) $(syshdr-y) +PHONY += all all: $(addprefix $(uapi)/,$(uapisyshdr-y)) all: $(addprefix $(out)/,$(syshdr-y)) + @: diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl index 96bc506ac6de..d6b867921612 100644 --- a/arch/x86/syscalls/syscall_32.tbl +++ b/arch/x86/syscalls/syscall_32.tbl @@ -359,3 +359,4 @@ 350 i386 finit_module sys_finit_module 351 i386 sched_setattr sys_sched_setattr 352 i386 sched_getattr sys_sched_getattr +353 i386 renameat2 sys_renameat2 diff --git a/arch/x86/tools/Makefile b/arch/x86/tools/Makefile index e8120346903b..604a37efd4d5 100644 --- a/arch/x86/tools/Makefile +++ b/arch/x86/tools/Makefile @@ -40,4 +40,6 @@ $(obj)/insn_sanity.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/ina HOST_EXTRACFLAGS += -I$(srctree)/tools/include hostprogs-y += relocs relocs-objs := relocs_32.o relocs_64.o relocs_common.o +PHONY += relocs relocs: $(obj)/relocs + @: diff --git a/arch/x86/vdso/vdso-layout.lds.S b/arch/x86/vdso/vdso-layout.lds.S index 2e263f367b13..9df017ab2285 100644 --- a/arch/x86/vdso/vdso-layout.lds.S +++ b/arch/x86/vdso/vdso-layout.lds.S @@ -9,12 +9,9 @@ SECTIONS #ifdef BUILD_VDSO32 #include <asm/vdso32.h> - .hpet_sect : { - hpet_page = . - VDSO_OFFSET(VDSO_HPET_PAGE); - } :text :hpet_sect + hpet_page = . - VDSO_OFFSET(VDSO_HPET_PAGE); - .vvar_sect : { - vvar = . - VDSO_OFFSET(VDSO_VVAR_PAGE); + vvar = . - VDSO_OFFSET(VDSO_VVAR_PAGE); /* Place all vvars at the offsets in asm/vvar.h. */ #define EMIT_VVAR(name, offset) vvar_ ## name = vvar + offset; @@ -22,7 +19,6 @@ SECTIONS #include <asm/vvar.h> #undef __VVAR_KERNEL_LDS #undef EMIT_VVAR - } :text :vvar_sect #endif . = SIZEOF_HEADERS; @@ -61,7 +57,12 @@ SECTIONS */ . 
= ALIGN(0x100); - .text : { *(.text*) } :text =0x90909090 + .text : { *(.text*) } :text =0x90909090, + + /* + * The comma above works around a bug in gold: + * https://sourceware.org/bugzilla/show_bug.cgi?id=16804 + */ /DISCARD/ : { *(.discard) @@ -84,8 +85,4 @@ PHDRS dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ note PT_NOTE FLAGS(4); /* PF_R */ eh_frame_hdr PT_GNU_EH_FRAME; -#ifdef BUILD_VDSO32 - vvar_sect PT_NULL FLAGS(4); /* PF_R */ - hpet_sect PT_NULL FLAGS(4); /* PF_R */ -#endif } diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index 00348980a3a6..e1f220e3ca68 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -39,6 +39,7 @@ #ifdef CONFIG_X86_64 #define vdso_enabled sysctl_vsyscall32 #define arch_setup_additional_pages syscall32_setup_pages +extern int sysctl_ldt16; #endif /* @@ -249,6 +250,13 @@ static struct ctl_table abi_table2[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "ldt16", + .data = &sysctl_ldt16, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, {} }; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 201d09a7c46b..f17b29210ac4 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1339,6 +1339,7 @@ xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr) static struct notifier_block xen_panic_block = { .notifier_call= xen_panic_event, + .priority = INT_MIN }; int xen_panic_handler_init(void) @@ -1515,7 +1516,7 @@ static void __init xen_pvh_early_guest_init(void) } /* First C function to be called on Xen boot */ -asmlinkage void __init xen_start_kernel(void) +asmlinkage __visible void __init xen_start_kernel(void) { struct physdev_set_iopl set_iopl; int rc; diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index 08f763de26fe..a1207cb6472a 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -23,7 +23,7 @@ void xen_force_evtchn_callback(void) (void)HYPERVISOR_xen_version(0, NULL); } -asmlinkage unsigned long xen_save_fl(void) +asmlinkage __visible unsigned long xen_save_fl(void) { struct vcpu_info *vcpu; unsigned long flags; @@ -63,7 +63,7 @@ __visible void xen_restore_fl(unsigned long flags) } PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl); -asmlinkage void xen_irq_disable(void) +asmlinkage __visible void xen_irq_disable(void) { /* There's a one instruction preempt window here. We need to make sure we're don't switch CPUs between getting the vcpu @@ -74,7 +74,7 @@ asmlinkage void xen_irq_disable(void) } PV_CALLEE_SAVE_REGS_THUNK(xen_irq_disable); -asmlinkage void xen_irq_enable(void) +asmlinkage __visible void xen_irq_enable(void) { struct vcpu_info *vcpu; diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 86e02eabb640..6f6e15d28466 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -2510,6 +2510,95 @@ void __init xen_hvm_init_mmu_ops(void) } #endif +#ifdef CONFIG_XEN_PVH +/* + * Map foreign gfn (fgfn), to local pfn (lpfn). This for the user + * space creating new guest on pvh dom0 and needing to map domU pages. 
+ */ +static int xlate_add_to_p2m(unsigned long lpfn, unsigned long fgfn, + unsigned int domid) +{ + int rc, err = 0; + xen_pfn_t gpfn = lpfn; + xen_ulong_t idx = fgfn; + + struct xen_add_to_physmap_range xatp = { + .domid = DOMID_SELF, + .foreign_domid = domid, + .size = 1, + .space = XENMAPSPACE_gmfn_foreign, + }; + set_xen_guest_handle(xatp.idxs, &idx); + set_xen_guest_handle(xatp.gpfns, &gpfn); + set_xen_guest_handle(xatp.errs, &err); + + rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap_range, &xatp); + if (rc < 0) + return rc; + return err; +} + +static int xlate_remove_from_p2m(unsigned long spfn, int count) +{ + struct xen_remove_from_physmap xrp; + int i, rc; + + for (i = 0; i < count; i++) { + xrp.domid = DOMID_SELF; + xrp.gpfn = spfn+i; + rc = HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp); + if (rc) + break; + } + return rc; +} + +struct xlate_remap_data { + unsigned long fgfn; /* foreign domain's gfn */ + pgprot_t prot; + domid_t domid; + int index; + struct page **pages; +}; + +static int xlate_map_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr, + void *data) +{ + int rc; + struct xlate_remap_data *remap = data; + unsigned long pfn = page_to_pfn(remap->pages[remap->index++]); + pte_t pteval = pte_mkspecial(pfn_pte(pfn, remap->prot)); + + rc = xlate_add_to_p2m(pfn, remap->fgfn, remap->domid); + if (rc) + return rc; + native_set_pte(ptep, pteval); + + return 0; +} + +static int xlate_remap_gfn_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long mfn, + int nr, pgprot_t prot, unsigned domid, + struct page **pages) +{ + int err; + struct xlate_remap_data pvhdata; + + BUG_ON(!pages); + + pvhdata.fgfn = mfn; + pvhdata.prot = prot; + pvhdata.domid = domid; + pvhdata.index = 0; + pvhdata.pages = pages; + err = apply_to_page_range(vma->vm_mm, addr, nr << PAGE_SHIFT, + xlate_map_pte_fn, &pvhdata); + flush_tlb_all(); + return err; +} +#endif + #define REMAP_BATCH_SIZE 16 struct remap_data { @@ -2522,7 +2611,7 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr, void *data) { struct remap_data *rmd = data; - pte_t pte = pte_mkspecial(pfn_pte(rmd->mfn++, rmd->prot)); + pte_t pte = pte_mkspecial(mfn_pte(rmd->mfn++, rmd->prot)); rmd->mmu_update->ptr = virt_to_machine(ptep).maddr; rmd->mmu_update->val = pte_val_ma(pte); @@ -2544,13 +2633,18 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma, unsigned long range; int err = 0; - if (xen_feature(XENFEAT_auto_translated_physmap)) - return -EINVAL; - - prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP); - BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO))); + if (xen_feature(XENFEAT_auto_translated_physmap)) { +#ifdef CONFIG_XEN_PVH + /* We need to update the local page tables and the xen HAP */ + return xlate_remap_gfn_range(vma, addr, mfn, nr, prot, + domid, pages); +#else + return -EINVAL; +#endif + } + rmd.mfn = mfn; rmd.prot = prot; @@ -2588,6 +2682,25 @@ int xen_unmap_domain_mfn_range(struct vm_area_struct *vma, if (!pages || !xen_feature(XENFEAT_auto_translated_physmap)) return 0; +#ifdef CONFIG_XEN_PVH + while (numpgs--) { + /* + * The mmu has already cleaned up the process mmu + * resources at this point (lookup_address will return + * NULL). 
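xlate_remap_gfn_range() above packs its state into a small struct and lets apply_to_page_range() invoke a callback once per page of the VA range, with the callback advancing an index into the backing pages array. A toy, userspace-only model of that walk; the walker and all names below are illustrative, and the real pte_fn_t callback and page-table plumbing are not reproduced:

#include <stdio.h>

#define TOY_PAGE_SHIFT 12
#define TOY_PAGE_SIZE  (1UL << TOY_PAGE_SHIFT)

struct remap_ctx {
	unsigned long foreign_gfn;	/* frame owned by the other domain */
	unsigned long local_pfns[4];	/* stands in for the pages[] array */
	int index;			/* advances once per visited page */
};

typedef int (*page_cb)(unsigned long addr, void *data);

/* toy stand-in for apply_to_page_range(): call fn for every page in
 * [addr, addr + size) and stop at the first failure */
static int apply_to_range(unsigned long addr, unsigned long size,
			  page_cb fn, void *data)
{
	unsigned long end = addr + size;
	int rc;

	for (; addr < end; addr += TOY_PAGE_SIZE) {
		rc = fn(addr, data);
		if (rc)
			return rc;
	}
	return 0;
}

static int map_one(unsigned long addr, void *data)
{
	struct remap_ctx *ctx = data;
	unsigned long pfn = ctx->local_pfns[ctx->index++];

	printf("va %#lx: local pfn %#lx <- foreign gfn %#lx\n",
	       addr, pfn, ctx->foreign_gfn);
	return 0;
}

int main(void)
{
	struct remap_ctx ctx = {
		.foreign_gfn = 0xabcde,
		.local_pfns = { 0x100, 0x101, 0x102, 0x103 },
	};

	return apply_to_range(0x10000000UL, 4UL << TOY_PAGE_SHIFT,
			      map_one, &ctx);
}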
+ */ + unsigned long pfn = page_to_pfn(pages[numpgs]); + + xlate_remove_from_p2m(pfn, 1); + } + /* + * We don't need to flush tlbs because as part of + * xlate_remove_from_p2m, the hypervisor will do tlb flushes + * after removing the p2m entries from the EPT/NPT + */ + return 0; +#else return -EINVAL; +#endif } EXPORT_SYMBOL_GPL(xen_unmap_domain_mfn_range); diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 85e5d78c9874..9bb3d82ffec8 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -36,7 +36,7 @@ * pfn_to_mfn(0xc0000)=0xc0000 * * The benefit of this is, that we can assume for non-RAM regions (think - * PCI BARs, or ACPI spaces), we can create mappings easily b/c we + * PCI BARs, or ACPI spaces), we can create mappings easily because we * get the PFN value to match the MFN. * * For this to work efficiently we have one new page p2m_identity and @@ -60,7 +60,7 @@ * There is also a digram of the P2M at the end that can help. * Imagine your E820 looking as so: * - * 1GB 2GB + * 1GB 2GB 4GB * /-------------------+---------\/----\ /----------\ /---+-----\ * | System RAM | Sys RAM ||ACPI| | reserved | | Sys RAM | * \-------------------+---------/\----/ \----------/ \---+-----/ @@ -77,9 +77,8 @@ * of the PFN and the end PFN (263424 and 512256 respectively). The first step * is to reserve_brk a top leaf page if the p2m[1] is missing. The top leaf page * covers 512^2 of page estate (1GB) and in case the start or end PFN is not - * aligned on 512^2*PAGE_SIZE (1GB) we loop on aligned 1GB PFNs from start pfn - * to end pfn. We reserve_brk top leaf pages if they are missing (means they - * point to p2m_mid_missing). + * aligned on 512^2*PAGE_SIZE (1GB) we reserve_brk new middle and leaf pages as + * required to split any existing p2m_mid_missing middle pages. * * With the E820 example above, 263424 is not 1GB aligned so we allocate a * reserve_brk page which will cover the PFNs estate from 0x40000 to 0x80000. @@ -88,7 +87,7 @@ * Next stage is to determine if we need to do a more granular boundary check * on the 4MB (or 2MB depending on architecture) off the start and end pfn's. * We check if the start pfn and end pfn violate that boundary check, and if - * so reserve_brk a middle (p2m[x][y]) leaf page. This way we have a much finer + * so reserve_brk a (p2m[x][y]) leaf page. This way we have a much finer * granularity of setting which PFNs are missing and which ones are identity. * In our example 263424 and 512256 both fail the check so we reserve_brk two * pages. Populate them with INVALID_P2M_ENTRY (so they both have "missing" @@ -102,9 +101,10 @@ * * The next step is to walk from the start pfn to the end pfn setting * the IDENTITY_FRAME_BIT on each PFN. This is done in set_phys_range_identity. - * If we find that the middle leaf is pointing to p2m_missing we can swap it - * over to p2m_identity - this way covering 4MB (or 2MB) PFN space. At this - * point we do not need to worry about boundary aligment (so no need to + * If we find that the middle entry is pointing to p2m_missing we can swap it + * over to p2m_identity - this way covering 4MB (or 2MB) PFN space (and + * similarly swapping p2m_mid_missing for p2m_mid_identity for larger regions). + * At this point we do not need to worry about boundary aligment (so no need to * reserve_brk a middle page, figure out which PFNs are "missing" and which * ones are identity), as that has been done earlier. 
If we find that the * middle leaf is not occupied by p2m_identity or p2m_missing, we dereference @@ -118,6 +118,9 @@ * considered missing). In our case, p2m[1][2][0->255] and p2m[1][488][257->511] * contain the INVALID_P2M_ENTRY value and are considered "missing." * + * Finally, the region beyond the end of of the E820 (4 GB in this example) + * is set to be identity (in case there are MMIO regions placed here). + * * This is what the p2m ends up looking (for the E820 above) with this * fabulous drawing: * @@ -129,21 +132,27 @@ * |-----| \ | [p2m_identity]+\\ | .... | * | 2 |--\ \-------------------->| ... | \\ \----------------/ * |-----| \ \---------------/ \\ - * | 3 |\ \ \\ p2m_identity - * |-----| \ \-------------------->/---------------\ /-----------------\ - * | .. +->+ | [p2m_identity]+-->| ~0, ~0, ~0, ... | - * \-----/ / | [p2m_identity]+-->| ..., ~0 | - * / /---------------\ | .... | \-----------------/ - * / | IDENTITY[@0] | /-+-[x], ~0, ~0.. | - * / | IDENTITY[@256]|<----/ \---------------/ - * / | ~0, ~0, .... | - * | \---------------/ - * | - * p2m_mid_missing p2m_missing - * /-----------------\ /------------\ - * | [p2m_missing] +---->| ~0, ~0, ~0 | - * | [p2m_missing] +---->| ..., ~0 | - * \-----------------/ \------------/ + * | 3 |-\ \ \\ p2m_identity [1] + * |-----| \ \-------------------->/---------------\ /-----------------\ + * | .. |\ | | [p2m_identity]+-->| ~0, ~0, ~0, ... | + * \-----/ | | | [p2m_identity]+-->| ..., ~0 | + * | | | .... | \-----------------/ + * | | +-[x], ~0, ~0.. +\ + * | | \---------------/ \ + * | | \-> /---------------\ + * | V p2m_mid_missing p2m_missing | IDENTITY[@0] | + * | /-----------------\ /------------\ | IDENTITY[@256]| + * | | [p2m_missing] +---->| ~0, ~0, ...| | ~0, ~0, .... | + * | | [p2m_missing] +---->| ..., ~0 | \---------------/ + * | | ... | \------------/ + * | \-----------------/ + * | + * | p2m_mid_identity + * | /-----------------\ + * \-->| [p2m_identity] +---->[1] + * | [p2m_identity] +---->[1] + * | ... | + * \-----------------/ * * where ~0 is INVALID_P2M_ENTRY. IDENTITY is (PFN | IDENTITY_BIT) */ @@ -187,13 +196,15 @@ static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE); static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE); static RESERVE_BRK_ARRAY(unsigned long, p2m_identity, P2M_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_identity, P2M_MID_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_identity_mfn, P2M_MID_PER_PAGE); RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); /* We might hit two boundary violations at the start and end, at max each * boundary violation will require three middle nodes. */ -RESERVE_BRK(p2m_mid_identity, PAGE_SIZE * 2 * 3); +RESERVE_BRK(p2m_mid_extra, PAGE_SIZE * 2 * 3); /* When we populate back during bootup, the amount of pages can vary. The * max we have is seen is 395979, but that does not mean it can't be more. 
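The comment block above describes a three-level table in which untouched regions point at shared "missing" and "identity" pages instead of getting private leaves. A self-contained toy model of that layout and of the lookup done by get_phys_to_machine(); the sizes, marker bit and all names here are illustrative, and the demo assumes nothing about the real reserve_brk allocator:

#include <stdio.h>

#define LEAF_ENTRIES 512
#define MID_ENTRIES  512
#define TOP_ENTRIES  8				/* tiny top level, demo only */
#define INVALID_ENTRY (~0UL)
#define IDENTITY_BIT (1UL << (8 * sizeof(unsigned long) - 2))

static unsigned long leaf_missing[LEAF_ENTRIES];	/* all INVALID */
static unsigned long leaf_identity[LEAF_ENTRIES];	/* all marked identity */
static unsigned long *mid_missing[MID_ENTRIES];
static unsigned long *mid_identity[MID_ENTRIES];
static unsigned long **top[TOP_ENTRIES];

static unsigned long lookup(unsigned long pfn)
{
	unsigned long topidx = pfn / (MID_ENTRIES * LEAF_ENTRIES);
	unsigned long mididx = (pfn / LEAF_ENTRIES) % MID_ENTRIES;
	unsigned long idx = pfn % LEAF_ENTRIES;
	unsigned long entry = top[topidx][mididx][idx];

	/* identity entries mean "this pfn maps to itself" */
	return (entry & IDENTITY_BIT) ? pfn : entry;
}

int main(void)
{
	unsigned long i;

	for (i = 0; i < LEAF_ENTRIES; i++) {
		leaf_missing[i] = INVALID_ENTRY;
		leaf_identity[i] = IDENTITY_BIT;
	}
	for (i = 0; i < MID_ENTRIES; i++) {
		mid_missing[i] = leaf_missing;		/* one shared leaf ... */
		mid_identity[i] = leaf_identity;	/* ... per default kind */
	}

	/* default everything to missing, then mark the second chunk
	 * identity, the way a reserved/MMIO region would be */
	for (i = 0; i < TOP_ENTRIES; i++)
		top[i] = mid_missing;
	top[1] = mid_identity;

	printf("pfn 0x00100 -> %#lx (missing)\n", lookup(0x00100));
	printf("pfn 0x40123 -> %#lx (identity)\n", lookup(0x40123));
	return 0;
}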
@@ -242,20 +253,20 @@ static void p2m_top_mfn_p_init(unsigned long **top) top[i] = p2m_mid_missing_mfn; } -static void p2m_mid_init(unsigned long **mid) +static void p2m_mid_init(unsigned long **mid, unsigned long *leaf) { unsigned i; for (i = 0; i < P2M_MID_PER_PAGE; i++) - mid[i] = p2m_missing; + mid[i] = leaf; } -static void p2m_mid_mfn_init(unsigned long *mid) +static void p2m_mid_mfn_init(unsigned long *mid, unsigned long *leaf) { unsigned i; for (i = 0; i < P2M_MID_PER_PAGE; i++) - mid[i] = virt_to_mfn(p2m_missing); + mid[i] = virt_to_mfn(leaf); } static void p2m_init(unsigned long *p2m) @@ -286,7 +297,9 @@ void __ref xen_build_mfn_list_list(void) /* Pre-initialize p2m_top_mfn to be completely missing */ if (p2m_top_mfn == NULL) { p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_mid_mfn_init(p2m_mid_missing_mfn); + p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing); + p2m_mid_identity_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_mfn_init(p2m_mid_identity_mfn, p2m_identity); p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); p2m_top_mfn_p_init(p2m_top_mfn_p); @@ -295,7 +308,8 @@ void __ref xen_build_mfn_list_list(void) p2m_top_mfn_init(p2m_top_mfn); } else { /* Reinitialise, mfn's all change after migration */ - p2m_mid_mfn_init(p2m_mid_missing_mfn); + p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing); + p2m_mid_mfn_init(p2m_mid_identity_mfn, p2m_identity); } for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) { @@ -327,7 +341,7 @@ void __ref xen_build_mfn_list_list(void) * it too late. */ mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_mid_mfn_init(mid_mfn_p); + p2m_mid_mfn_init(mid_mfn_p, p2m_missing); p2m_top_mfn_p[topidx] = mid_mfn_p; } @@ -365,16 +379,17 @@ void __init xen_build_dynamic_phys_to_machine(void) p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); p2m_init(p2m_missing); + p2m_identity = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_init(p2m_identity); p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_mid_init(p2m_mid_missing); + p2m_mid_init(p2m_mid_missing, p2m_missing); + p2m_mid_identity = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_init(p2m_mid_identity, p2m_identity); p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE); p2m_top_init(p2m_top); - p2m_identity = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_init(p2m_identity); - /* * The domain builder gives us a pre-constructed p2m array in * mfn_list for all the pages initially given to us, so we just @@ -386,7 +401,7 @@ void __init xen_build_dynamic_phys_to_machine(void) if (p2m_top[topidx] == p2m_mid_missing) { unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_mid_init(mid); + p2m_mid_init(mid, p2m_missing); p2m_top[topidx] = mid; } @@ -492,7 +507,7 @@ unsigned long get_phys_to_machine(unsigned long pfn) unsigned topidx, mididx, idx; if (unlikely(pfn >= MAX_P2M_PFN)) - return INVALID_P2M_ENTRY; + return IDENTITY_FRAME(pfn); topidx = p2m_top_index(pfn); mididx = p2m_mid_index(pfn); @@ -545,7 +560,7 @@ static bool alloc_p2m(unsigned long pfn) if (!mid) return false; - p2m_mid_init(mid); + p2m_mid_init(mid, p2m_missing); if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing) free_p2m_page(mid); @@ -565,7 +580,7 @@ static bool alloc_p2m(unsigned long pfn) if (!mid_mfn) return false; - p2m_mid_mfn_init(mid_mfn); + p2m_mid_mfn_init(mid_mfn, p2m_missing); missing_mfn = virt_to_mfn(p2m_mid_missing_mfn); mid_mfn_mfn = virt_to_mfn(mid_mfn); @@ -596,7 +611,7 @@ static bool alloc_p2m(unsigned long pfn) return true; } -static bool __init early_alloc_p2m_middle(unsigned long pfn, bool 
check_boundary) +static bool __init early_alloc_p2m(unsigned long pfn, bool check_boundary) { unsigned topidx, mididx, idx; unsigned long *p2m; @@ -638,7 +653,7 @@ static bool __init early_alloc_p2m_middle(unsigned long pfn, bool check_boundary return true; } -static bool __init early_alloc_p2m(unsigned long pfn) +static bool __init early_alloc_p2m_middle(unsigned long pfn) { unsigned topidx = p2m_top_index(pfn); unsigned long *mid_mfn_p; @@ -649,7 +664,7 @@ static bool __init early_alloc_p2m(unsigned long pfn) if (mid == p2m_mid_missing) { mid = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_mid_init(mid); + p2m_mid_init(mid, p2m_missing); p2m_top[topidx] = mid; @@ -658,12 +673,12 @@ static bool __init early_alloc_p2m(unsigned long pfn) /* And the save/restore P2M tables.. */ if (mid_mfn_p == p2m_mid_missing_mfn) { mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_mid_mfn_init(mid_mfn_p); + p2m_mid_mfn_init(mid_mfn_p, p2m_missing); p2m_top_mfn_p[topidx] = mid_mfn_p; p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p); /* Note: we don't set mid_mfn_p[midix] here, - * look in early_alloc_p2m_middle */ + * look in early_alloc_p2m() */ } return true; } @@ -739,7 +754,7 @@ found: /* This shouldn't happen */ if (WARN_ON(p2m_top[topidx] == p2m_mid_missing)) - early_alloc_p2m(set_pfn); + early_alloc_p2m_middle(set_pfn); if (WARN_ON(p2m_top[topidx][mididx] != p2m_missing)) return false; @@ -754,13 +769,13 @@ found: bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn) { if (unlikely(!__set_phys_to_machine(pfn, mfn))) { - if (!early_alloc_p2m(pfn)) + if (!early_alloc_p2m_middle(pfn)) return false; if (early_can_reuse_p2m_middle(pfn, mfn)) return __set_phys_to_machine(pfn, mfn); - if (!early_alloc_p2m_middle(pfn, false /* boundary crossover OK!*/)) + if (!early_alloc_p2m(pfn, false /* boundary crossover OK!*/)) return false; if (!__set_phys_to_machine(pfn, mfn)) @@ -769,12 +784,30 @@ bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn) return true; } + +static void __init early_split_p2m(unsigned long pfn) +{ + unsigned long mididx, idx; + + mididx = p2m_mid_index(pfn); + idx = p2m_index(pfn); + + /* + * Allocate new middle and leaf pages if this pfn lies in the + * middle of one. + */ + if (mididx || idx) + early_alloc_p2m_middle(pfn); + if (idx) + early_alloc_p2m(pfn, false); +} + unsigned long __init set_phys_range_identity(unsigned long pfn_s, unsigned long pfn_e) { unsigned long pfn; - if (unlikely(pfn_s >= MAX_P2M_PFN || pfn_e >= MAX_P2M_PFN)) + if (unlikely(pfn_s >= MAX_P2M_PFN)) return 0; if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) @@ -783,19 +816,30 @@ unsigned long __init set_phys_range_identity(unsigned long pfn_s, if (pfn_s > pfn_e) return 0; - for (pfn = (pfn_s & ~(P2M_MID_PER_PAGE * P2M_PER_PAGE - 1)); - pfn < ALIGN(pfn_e, (P2M_MID_PER_PAGE * P2M_PER_PAGE)); - pfn += P2M_MID_PER_PAGE * P2M_PER_PAGE) - { - WARN_ON(!early_alloc_p2m(pfn)); - } + if (pfn_e > MAX_P2M_PFN) + pfn_e = MAX_P2M_PFN; - early_alloc_p2m_middle(pfn_s, true); - early_alloc_p2m_middle(pfn_e, true); + early_split_p2m(pfn_s); + early_split_p2m(pfn_e); + + for (pfn = pfn_s; pfn < pfn_e;) { + unsigned topidx = p2m_top_index(pfn); + unsigned mididx = p2m_mid_index(pfn); - for (pfn = pfn_s; pfn < pfn_e; pfn++) if (!__set_phys_to_machine(pfn, IDENTITY_FRAME(pfn))) break; + pfn++; + + /* + * If the PFN was set to a middle or leaf identity + * page the remainder must also be identity, so skip + * ahead to the next middle or leaf entry. 
+ */ + if (p2m_top[topidx] == p2m_mid_identity) + pfn = ALIGN(pfn, P2M_MID_PER_PAGE * P2M_PER_PAGE); + else if (p2m_top[topidx][mididx] == p2m_identity) + pfn = ALIGN(pfn, P2M_PER_PAGE); + } if (!WARN((pfn - pfn_s) != (pfn_e - pfn_s), "Identity mapping failed. We are %ld short of 1-1 mappings!\n", @@ -825,8 +869,22 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) /* For sparse holes were the p2m leaf has real PFN along with * PCI holes, stick in the PFN as the MFN value. + * + * set_phys_range_identity() will have allocated new middle + * and leaf pages as required so an existing p2m_mid_missing + * or p2m_missing mean that whole range will be identity so + * these can be switched to p2m_mid_identity or p2m_identity. */ if (mfn != INVALID_P2M_ENTRY && (mfn & IDENTITY_FRAME_BIT)) { + if (p2m_top[topidx] == p2m_mid_identity) + return true; + + if (p2m_top[topidx] == p2m_mid_missing) { + WARN_ON(cmpxchg(&p2m_top[topidx], p2m_mid_missing, + p2m_mid_identity) != p2m_mid_missing); + return true; + } + if (p2m_top[topidx][mididx] == p2m_identity) return true; diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 0982233b9b84..210426a26cc0 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -89,10 +89,10 @@ static void __init xen_add_extra_mem(u64 start, u64 size) for (pfn = PFN_DOWN(start); pfn < xen_max_p2m_pfn; pfn++) { unsigned long mfn = pfn_to_mfn(pfn); - if (WARN(mfn == pfn, "Trying to over-write 1-1 mapping (pfn: %lx)\n", pfn)) + if (WARN_ONCE(mfn == pfn, "Trying to over-write 1-1 mapping (pfn: %lx)\n", pfn)) continue; - WARN(mfn != INVALID_P2M_ENTRY, "Trying to remove %lx which has %lx mfn!\n", - pfn, mfn); + WARN_ONCE(mfn != INVALID_P2M_ENTRY, "Trying to remove %lx which has %lx mfn!\n", + pfn, mfn); __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); } @@ -469,6 +469,15 @@ char * __init xen_memory_setup(void) } /* + * Set the rest as identity mapped, in case PCI BARs are + * located here. + * + * PFNs above MAX_P2M_PFN are considered identity mapped as + * well. + */ + set_phys_range_identity(map[i-1].addr / PAGE_SIZE, ~0ul); + + /* * In domU, the ISA region is normal, usable memory, but we * reserve ISA memory anyway because too many things poke * about in there. 
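The skip-ahead in set_phys_range_identity() above rounds the current pfn up to the next middle or leaf boundary once it hits a shared identity page, using the kernel's power-of-two align-up. A short sketch of that rounding; the macro below mirrors the usual ((x)+(a)-1) & ~((a)-1) definition and the pfn value is made up:

#include <stdio.h>

/* power-of-two align-up, as in the kernel's ALIGN() */
#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((unsigned long)(a) - 1))

#define P2M_PER_PAGE     512UL		/* entries per leaf page */
#define P2M_MID_PER_PAGE 512UL		/* entries per middle page */

int main(void)
{
	unsigned long pfn = 263425;	/* just past the 1GB boundary at 0x40000 */

	printf("next leaf boundary:   %lu\n", ALIGN_UP(pfn, P2M_PER_PAGE));
	printf("next middle boundary: %lu\n",
	       ALIGN_UP(pfn, P2M_MID_PER_PAGE * P2M_PER_PAGE));
	return 0;
}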
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index a18eadd8bb40..7005974c3ff3 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -441,10 +441,11 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle) irq_ctx_init(cpu); #else clear_tsk_thread_flag(idle, TIF_FORK); +#endif per_cpu(kernel_stack, cpu) = (unsigned long)task_stack_page(idle) - KERNEL_STACK_OFFSET + THREAD_SIZE; -#endif + xen_setup_runstate_info(cpu); xen_setup_timer(cpu); xen_init_lock_cpu(cpu); diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 4d3acc34a998..0ba5f3b967f0 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -274,7 +274,7 @@ void __init xen_init_spinlocks(void) printk(KERN_DEBUG "xen: PV spinlocks disabled\n"); return; } - + printk(KERN_DEBUG "xen: PV spinlocks enabled\n"); pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(xen_lock_spinning); pv_lock_ops.unlock_kick = xen_unlock_kick; } @@ -290,6 +290,9 @@ static __init int xen_init_spinlocks_jump(void) if (!xen_pvspin) return 0; + if (!xen_domain()) + return 0; + static_key_slow_inc(¶virt_ticketlocks_enabled); return 0; } diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 45329c8c226e..c4df9dbd63b7 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -12,8 +12,10 @@ #include "xen-ops.h" #include "mmu.h" -void xen_arch_pre_suspend(void) +static void xen_pv_pre_suspend(void) { + xen_mm_pin_all(); + xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn); xen_start_info->console.domU.mfn = mfn_to_pfn(xen_start_info->console.domU.mfn); @@ -26,7 +28,7 @@ void xen_arch_pre_suspend(void) BUG(); } -void xen_arch_hvm_post_suspend(int suspend_cancelled) +static void xen_hvm_post_suspend(int suspend_cancelled) { #ifdef CONFIG_XEN_PVHVM int cpu; @@ -41,7 +43,7 @@ void xen_arch_hvm_post_suspend(int suspend_cancelled) #endif } -void xen_arch_post_suspend(int suspend_cancelled) +static void xen_pv_post_suspend(int suspend_cancelled) { xen_build_mfn_list_list(); @@ -60,6 +62,21 @@ void xen_arch_post_suspend(int suspend_cancelled) xen_vcpu_restore(); } + xen_mm_unpin_all(); +} + +void xen_arch_pre_suspend(void) +{ + if (xen_pv_domain()) + xen_pv_pre_suspend(); +} + +void xen_arch_post_suspend(int cancelled) +{ + if (xen_pv_domain()) + xen_pv_post_suspend(cancelled); + else + xen_hvm_post_suspend(cancelled); } static void xen_vcpu_notify_restore(void *data) diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S index 33ca6e42a4ca..fd92a64d748e 100644 --- a/arch/x86/xen/xen-asm_32.S +++ b/arch/x86/xen/xen-asm_32.S @@ -75,6 +75,17 @@ ENDPROC(xen_sysexit) * stack state in whatever form its in, we keep things simple by only * using a single register which is pushed/popped on the stack. */ + +.macro POP_FS +1: + popw %fs +.pushsection .fixup, "ax" +2: movw $0, (%esp) + jmp 1b +.popsection + _ASM_EXTABLE(1b,2b) +.endm + ENTRY(xen_iret) /* test eflags for special cases */ testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp) @@ -83,15 +94,13 @@ ENTRY(xen_iret) push %eax ESP_OFFSET=4 # bytes pushed onto stack - /* - * Store vcpu_info pointer for easy access. 
Do it this way to - * avoid having to reload %fs - */ + /* Store vcpu_info pointer for easy access */ #ifdef CONFIG_SMP - GET_THREAD_INFO(%eax) - movl %ss:TI_cpu(%eax), %eax - movl %ss:__per_cpu_offset(,%eax,4), %eax - mov %ss:xen_vcpu(%eax), %eax + pushw %fs + movl $(__KERNEL_PERCPU), %eax + movl %eax, %fs + movl %fs:xen_vcpu, %eax + POP_FS #else movl %ss:xen_vcpu, %eax #endif diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 1cb6f4c37300..c834d4b231f0 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -31,6 +31,8 @@ void xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); void xen_reserve_top(void); extern unsigned long xen_max_p2m_pfn; +void xen_mm_pin_all(void); +void xen_mm_unpin_all(void); void xen_set_pat(u64); char * __init xen_memory_setup(void); |