From 9ae433bc79f97bae221d53bb1a8e21415ea58625 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 9 Dec 2016 14:33:51 +0000 Subject: crypto: chacha20 - convert generic and x86 versions to skcipher This converts the ChaCha20 code from a blkcipher to a skcipher, which is now the preferred way to implement symmetric block and stream ciphers. This ports the generic and x86 versions at the same time because the latter reuses routines of the former. Note that the skcipher_walk() API guarantees that all presented blocks except the final one are a multiple of the chunk size, so we can simplify the encrypt() routine somewhat. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/x86/crypto/chacha20_glue.c | 69 ++++++++++++++++++----------------------- 1 file changed, 31 insertions(+), 38 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c index f910d1d449f0..78f75b07dc25 100644 --- a/arch/x86/crypto/chacha20_glue.c +++ b/arch/x86/crypto/chacha20_glue.c @@ -11,7 +11,7 @@ #include #include -#include +#include #include #include #include @@ -63,36 +63,34 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src, } } -static int chacha20_simd(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int chacha20_simd(struct skcipher_request *req) { - u32 *state, state_buf[16 + (CHACHA20_STATE_ALIGN / sizeof(u32)) - 1]; - struct blkcipher_walk walk; + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm); + u32 state[16] __aligned(CHACHA20_STATE_ALIGN); + struct skcipher_walk walk; int err; - if (nbytes <= CHACHA20_BLOCK_SIZE || !may_use_simd()) - return crypto_chacha20_crypt(desc, dst, src, nbytes); + if (req->cryptlen <= CHACHA20_BLOCK_SIZE || !may_use_simd()) + return crypto_chacha20_crypt(req); - state = (u32 *)roundup((uintptr_t)state_buf, CHACHA20_STATE_ALIGN); + err = skcipher_walk_virt(&walk, req, true); - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt_block(desc, &walk, CHACHA20_BLOCK_SIZE); - - crypto_chacha20_init(state, crypto_blkcipher_ctx(desc->tfm), walk.iv); + crypto_chacha20_init(state, ctx, walk.iv); kernel_fpu_begin(); while (walk.nbytes >= CHACHA20_BLOCK_SIZE) { chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr, rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE)); - err = blkcipher_walk_done(desc, &walk, - walk.nbytes % CHACHA20_BLOCK_SIZE); + err = skcipher_walk_done(&walk, + walk.nbytes % CHACHA20_BLOCK_SIZE); } if (walk.nbytes) { chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr, walk.nbytes); - err = blkcipher_walk_done(desc, &walk, 0); + err = skcipher_walk_done(&walk, 0); } kernel_fpu_end(); @@ -100,27 +98,22 @@ static int chacha20_simd(struct blkcipher_desc *desc, struct scatterlist *dst, return err; } -static struct crypto_alg alg = { - .cra_name = "chacha20", - .cra_driver_name = "chacha20-simd", - .cra_priority = 300, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = 1, - .cra_type = &crypto_blkcipher_type, - .cra_ctxsize = sizeof(struct chacha20_ctx), - .cra_alignmask = sizeof(u32) - 1, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = CHACHA20_KEY_SIZE, - .max_keysize = CHACHA20_KEY_SIZE, - .ivsize = CHACHA20_IV_SIZE, - .geniv = "seqiv", - .setkey = crypto_chacha20_setkey, - .encrypt = chacha20_simd, - .decrypt = chacha20_simd, - }, - }, +static struct skcipher_alg alg = { + .base.cra_name = 
"chacha20", + .base.cra_driver_name = "chacha20-simd", + .base.cra_priority = 300, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha20_ctx), + .base.cra_alignmask = sizeof(u32) - 1, + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA20_KEY_SIZE, + .max_keysize = CHACHA20_KEY_SIZE, + .ivsize = CHACHA20_IV_SIZE, + .chunksize = CHACHA20_BLOCK_SIZE, + .setkey = crypto_chacha20_setkey, + .encrypt = chacha20_simd, + .decrypt = chacha20_simd, }; static int __init chacha20_simd_mod_init(void) @@ -133,12 +126,12 @@ static int __init chacha20_simd_mod_init(void) boot_cpu_has(X86_FEATURE_AVX2) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL); #endif - return crypto_register_alg(&alg); + return crypto_register_skcipher(&alg); } static void __exit chacha20_simd_mod_fini(void) { - crypto_unregister_alg(&alg); + crypto_unregister_skcipher(&alg); } module_init(chacha20_simd_mod_init); -- cgit From 50fb57042402c819d247ac4231b80b0da86e2fd7 Mon Sep 17 00:00:00 2001 From: Ilya Lesokhin Date: Tue, 13 Dec 2016 16:32:06 +0200 Subject: crypto: aesni-intel - RFC4106 can zero copy when !PageHighMem In the common case of !PageHighMem we can do zero copy crypto even if sg crosses a pages boundary. Signed-off-by: Ilya Lesokhin Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_glue.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 31c34ee131f3..36ca1502630c 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -740,9 +740,11 @@ static int helper_rfc4106_encrypt(struct aead_request *req) *((__be32 *)(iv+12)) = counter; if (sg_is_last(req->src) && - req->src->offset + req->src->length <= PAGE_SIZE && + (!PageHighMem(sg_page(req->src)) || + req->src->offset + req->src->length <= PAGE_SIZE) && sg_is_last(req->dst) && - req->dst->offset + req->dst->length <= PAGE_SIZE) { + (!PageHighMem(sg_page(req->dst)) || + req->dst->offset + req->dst->length <= PAGE_SIZE)) { one_entry_in_sg = 1; scatterwalk_start(&src_sg_walk, req->src); assoc = scatterwalk_map(&src_sg_walk); @@ -822,9 +824,11 @@ static int helper_rfc4106_decrypt(struct aead_request *req) *((__be32 *)(iv+12)) = counter; if (sg_is_last(req->src) && - req->src->offset + req->src->length <= PAGE_SIZE && + (!PageHighMem(sg_page(req->src)) || + req->src->offset + req->src->length <= PAGE_SIZE) && sg_is_last(req->dst) && - req->dst->offset + req->dst->length <= PAGE_SIZE) { + (!PageHighMem(sg_page(req->dst)) || + req->dst->offset + req->dst->length <= PAGE_SIZE)) { one_entry_in_sg = 1; scatterwalk_start(&src_sg_walk, req->src); assoc = scatterwalk_map(&src_sg_walk); -- cgit From c4158ff536439619fa342810cc575ae2c809f03f Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Wed, 4 Jan 2017 12:20:33 +0100 Subject: x86/irq, trace: Add __irq_entry annotation to x86's platform IRQ handlers This patch adds the __irq_entry annotation to the default x86 platform IRQ handlers. ftrace's function_graph tracer uses the __irq_entry annotation to notify the entry and return of IRQ handlers. 
For example, before the patch: 354549.667252 | 3) d..1 | default_idle_call() { 354549.667252 | 3) d..1 | arch_cpu_idle() { 354549.667253 | 3) d..1 | default_idle() { 354549.696886 | 3) d..1 | smp_trace_reschedule_interrupt() { 354549.696886 | 3) d..1 | irq_enter() { 354549.696886 | 3) d..1 | rcu_irq_enter() { After the patch: 366416.254476 | 3) d..1 | arch_cpu_idle() { 366416.254476 | 3) d..1 | default_idle() { 366416.261566 | 3) d..1 ==========> | 366416.261566 | 3) d..1 | smp_trace_reschedule_interrupt() { 366416.261566 | 3) d..1 | irq_enter() { 366416.261566 | 3) d..1 | rcu_irq_enter() { KASAN also uses this annotation. The smp_apic_timer_interrupt() was already annotated. Signed-off-by: Daniel Bristot de Oliveira Acked-by: Steven Rostedt (VMware) Cc: Aaron Lu Cc: Andrew Morton Cc: Baoquan He Cc: Borislav Petkov Cc: Claudio Fontana Cc: Denys Vlasenko Cc: Dou Liyang Cc: Gu Zheng Cc: Hidehiro Kawai Cc: Linus Torvalds Cc: Nicolai Stange Cc: Peter Zijlstra (Intel) Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tony Luck Cc: Wanpeng Li Cc: linux-edac@vger.kernel.org Link: http://lkml.kernel.org/r/059fdf437c2f0c09b13c18c8fe4e69999d3ffe69.1483528431.git.bristot@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 8 ++++---- arch/x86/kernel/apic/vector.c | 2 +- arch/x86/kernel/cpu/mcheck/mce_amd.c | 4 ++-- arch/x86/kernel/cpu/mcheck/therm_throt.c | 6 ++++-- arch/x86/kernel/cpu/mcheck/threshold.c | 4 ++-- arch/x86/kernel/irq.c | 4 ++-- arch/x86/kernel/irq_work.c | 5 +++-- arch/x86/kernel/smp.c | 15 +++++++++------ 8 files changed, 27 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 5b7e43eff139..30b122987906 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1864,14 +1864,14 @@ static void __smp_spurious_interrupt(u8 vector) "should never happen.\n", vector, smp_processor_id()); } -__visible void smp_spurious_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs) { entering_irq(); __smp_spurious_interrupt(~regs->orig_ax); exiting_irq(); } -__visible void smp_trace_spurious_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_trace_spurious_interrupt(struct pt_regs *regs) { u8 vector = ~regs->orig_ax; @@ -1922,14 +1922,14 @@ static void __smp_error_interrupt(struct pt_regs *regs) } -__visible void smp_error_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_error_interrupt(struct pt_regs *regs) { entering_irq(); __smp_error_interrupt(regs); exiting_irq(); } -__visible void smp_trace_error_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_trace_error_interrupt(struct pt_regs *regs) { entering_irq(); trace_error_apic_entry(ERROR_APIC_VECTOR); diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 5d30c5e42bb1..f3557a1eb562 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -559,7 +559,7 @@ void send_cleanup_vector(struct irq_cfg *cfg) __send_cleanup_vector(data); } -asmlinkage __visible void smp_irq_move_cleanup_interrupt(void) +asmlinkage __visible void __irq_entry smp_irq_move_cleanup_interrupt(void) { unsigned vector, me; diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index a5fd137417a2..9e655292cf10 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -814,14 +814,14 @@ static inline void __smp_deferred_error_interrupt(void) 
deferred_error_int_vector(); } -asmlinkage __visible void smp_deferred_error_interrupt(void) +asmlinkage __visible void __irq_entry smp_deferred_error_interrupt(void) { entering_irq(); __smp_deferred_error_interrupt(); exiting_ack_irq(); } -asmlinkage __visible void smp_trace_deferred_error_interrupt(void) +asmlinkage __visible void __irq_entry smp_trace_deferred_error_interrupt(void) { entering_irq(); trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR); diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 465aca8be009..772d9400930d 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -404,14 +404,16 @@ static inline void __smp_thermal_interrupt(void) smp_thermal_vector(); } -asmlinkage __visible void smp_thermal_interrupt(struct pt_regs *regs) +asmlinkage __visible void __irq_entry +smp_thermal_interrupt(struct pt_regs *regs) { entering_irq(); __smp_thermal_interrupt(); exiting_ack_irq(); } -asmlinkage __visible void smp_trace_thermal_interrupt(struct pt_regs *regs) +asmlinkage __visible void __irq_entry +smp_trace_thermal_interrupt(struct pt_regs *regs) { entering_irq(); trace_thermal_apic_entry(THERMAL_APIC_VECTOR); diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c index 9beb092d68a5..bb0e75eed10a 100644 --- a/arch/x86/kernel/cpu/mcheck/threshold.c +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -23,14 +23,14 @@ static inline void __smp_threshold_interrupt(void) mce_threshold_vector(); } -asmlinkage __visible void smp_threshold_interrupt(void) +asmlinkage __visible void __irq_entry smp_threshold_interrupt(void) { entering_irq(); __smp_threshold_interrupt(); exiting_ack_irq(); } -asmlinkage __visible void smp_trace_threshold_interrupt(void) +asmlinkage __visible void __irq_entry smp_trace_threshold_interrupt(void) { entering_irq(); trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR); diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 7c6e9ffe4424..4d8183b5f113 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -264,7 +264,7 @@ void __smp_x86_platform_ipi(void) x86_platform_ipi_callback(); } -__visible void smp_x86_platform_ipi(struct pt_regs *regs) +__visible void __irq_entry smp_x86_platform_ipi(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); @@ -315,7 +315,7 @@ __visible void smp_kvm_posted_intr_wakeup_ipi(struct pt_regs *regs) } #endif -__visible void smp_trace_x86_platform_ipi(struct pt_regs *regs) +__visible void __irq_entry smp_trace_x86_platform_ipi(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c index 3512ba607361..275487872be2 100644 --- a/arch/x86/kernel/irq_work.c +++ b/arch/x86/kernel/irq_work.c @@ -9,6 +9,7 @@ #include #include #include +#include static inline void __smp_irq_work_interrupt(void) { @@ -16,14 +17,14 @@ static inline void __smp_irq_work_interrupt(void) irq_work_run(); } -__visible void smp_irq_work_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_irq_work_interrupt(struct pt_regs *regs) { ipi_entering_ack_irq(); __smp_irq_work_interrupt(); exiting_irq(); } -__visible void smp_trace_irq_work_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_trace_irq_work_interrupt(struct pt_regs *regs) { ipi_entering_ack_irq(); trace_irq_work_entry(IRQ_WORK_VECTOR); diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 68f8cc222f25..d3c66a15bbde 100644 
--- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -259,7 +259,7 @@ static inline void __smp_reschedule_interrupt(void) scheduler_ipi(); } -__visible void smp_reschedule_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); __smp_reschedule_interrupt(); @@ -268,7 +268,7 @@ __visible void smp_reschedule_interrupt(struct pt_regs *regs) */ } -__visible void smp_trace_reschedule_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_trace_reschedule_interrupt(struct pt_regs *regs) { /* * Need to call irq_enter() before calling the trace point. @@ -292,14 +292,15 @@ static inline void __smp_call_function_interrupt(void) inc_irq_stat(irq_call_count); } -__visible void smp_call_function_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_call_function_interrupt(struct pt_regs *regs) { ipi_entering_ack_irq(); __smp_call_function_interrupt(); exiting_irq(); } -__visible void smp_trace_call_function_interrupt(struct pt_regs *regs) +__visible void __irq_entry +smp_trace_call_function_interrupt(struct pt_regs *regs) { ipi_entering_ack_irq(); trace_call_function_entry(CALL_FUNCTION_VECTOR); @@ -314,14 +315,16 @@ static inline void __smp_call_function_single_interrupt(void) inc_irq_stat(irq_call_count); } -__visible void smp_call_function_single_interrupt(struct pt_regs *regs) +__visible void __irq_entry +smp_call_function_single_interrupt(struct pt_regs *regs) { ipi_entering_ack_irq(); __smp_call_function_single_interrupt(); exiting_irq(); } -__visible void smp_trace_call_function_single_interrupt(struct pt_regs *regs) +__visible void __irq_entry +smp_trace_call_function_single_interrupt(struct pt_regs *regs) { ipi_entering_ack_irq(); trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR); -- cgit From 4b5b61eaf8b70838750a1e6dc80ecd044c8f4b3f Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 5 Jan 2017 15:02:34 +0200 Subject: x86/platform/intel-mid: Remove Moorestown code The Moorestown support code was removed by: a8359e411eb ("x86/mid: Remove Intel Moorestown"). Remove this leftover as well. 
Signed-off-by: Andy Shevchenko Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170105130235.177792-1-andriy.shevchenko@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/platform/intel-mid/device_libs/Makefile | 1 - .../platform/intel-mid/device_libs/platform_ipc.c | 9 ---- .../intel-mid/device_libs/platform_pmic_gpio.c | 54 ---------------------- 3 files changed, 64 deletions(-) delete mode 100644 arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c (limited to 'arch/x86') diff --git a/arch/x86/platform/intel-mid/device_libs/Makefile b/arch/x86/platform/intel-mid/device_libs/Makefile index 90e4f2a6625b..4d8c14a783d5 100644 --- a/arch/x86/platform/intel-mid/device_libs/Makefile +++ b/arch/x86/platform/intel-mid/device_libs/Makefile @@ -12,7 +12,6 @@ obj-$(subst m,y,$(CONFIG_GPIO_MSIC)) += platform_msic_gpio.o obj-$(subst m,y,$(CONFIG_MFD_INTEL_MSIC)) += platform_msic_ocd.o obj-$(subst m,y,$(CONFIG_MFD_INTEL_MSIC)) += platform_msic_battery.o obj-$(subst m,y,$(CONFIG_INTEL_MID_POWER_BUTTON)) += platform_msic_power_btn.o -obj-$(subst m,y,$(CONFIG_GPIO_INTEL_PMIC)) += platform_pmic_gpio.o obj-$(subst m,y,$(CONFIG_INTEL_MFLD_THERMAL)) += platform_msic_thermal.o # SPI Devices obj-$(subst m,y,$(CONFIG_SPI_SPIDEV)) += platform_mrfld_spidev.o diff --git a/arch/x86/platform/intel-mid/device_libs/platform_ipc.c b/arch/x86/platform/intel-mid/device_libs/platform_ipc.c index a84b73d6c4a0..a428c051e7bb 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_ipc.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_ipc.c @@ -57,12 +57,3 @@ void __init ipc_device_handler(struct sfi_device_table_entry *pentry, pdev->dev.platform_data = pdata; intel_scu_device_register(pdev); } - -static const struct devs_id pmic_audio_dev_id __initconst = { - .name = "pmic_audio", - .type = SFI_DEV_TYPE_IPC, - .delay = 1, - .device_handler = &ipc_device_handler, -}; - -sfi_device(pmic_audio_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c b/arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c deleted file mode 100644 index e30cb62e3300..000000000000 --- a/arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c +++ /dev/null @@ -1,54 +0,0 @@ -/* - * platform_pmic_gpio.c: PMIC GPIO platform data initialization file - * - * (C) Copyright 2013 Intel Corporation - * Author: Sathyanarayanan Kuppuswamy - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "platform_ipc.h" - -static void __init *pmic_gpio_platform_data(void *info) -{ - static struct intel_pmic_gpio_platform_data pmic_gpio_pdata; - int gpio_base = get_gpio_by_name("pmic_gpio_base"); - - if (gpio_base < 0) - gpio_base = 64; - pmic_gpio_pdata.gpio_base = gpio_base; - pmic_gpio_pdata.irq_base = gpio_base + INTEL_MID_IRQ_OFFSET; - pmic_gpio_pdata.gpiointr = 0xffffeff8; - - return &pmic_gpio_pdata; -} - -static const struct devs_id pmic_gpio_spi_dev_id __initconst = { - .name = "pmic_gpio", - .type = SFI_DEV_TYPE_SPI, - .delay = 1, - .get_platform_data = &pmic_gpio_platform_data, -}; - -static const struct devs_id pmic_gpio_ipc_dev_id __initconst = { - .name = "pmic_gpio", - .type = SFI_DEV_TYPE_IPC, - .delay = 1, - .get_platform_data = &pmic_gpio_platform_data, - .device_handler = &ipc_device_handler -}; - -sfi_device(pmic_gpio_spi_dev_id); -sfi_device(pmic_gpio_ipc_dev_id); -- cgit From a01b3391b542aaaed539f9d9d6d0d4d6502ab9c6 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 5 Jan 2017 15:02:35 +0200 Subject: x86/platform/intel-mid: Get rid of duplication of IPC handler There is no other device handler than ipc_device_handler() and sfi.c already has a handler for IPC devices. Replace a pointer to custom handler by a flag. Due to this change adjust sfi_handle_ipc_dev() to handle it instead of ipc_device_handler(). Signed-off-by: Andy Shevchenko Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170105130235.177792-2-andriy.shevchenko@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/intel-mid.h | 4 +- arch/x86/platform/intel-mid/device_libs/Makefile | 1 - .../platform/intel-mid/device_libs/platform_ipc.c | 59 ---------------------- .../platform/intel-mid/device_libs/platform_ipc.h | 18 ------- .../intel-mid/device_libs/platform_msic_audio.c | 3 +- .../intel-mid/device_libs/platform_msic_battery.c | 3 +- .../intel-mid/device_libs/platform_msic_gpio.c | 3 +- .../intel-mid/device_libs/platform_msic_ocd.c | 3 +- .../device_libs/platform_msic_power_btn.c | 3 +- .../intel-mid/device_libs/platform_msic_thermal.c | 3 +- arch/x86/platform/intel-mid/sfi.c | 55 ++++++++++++-------- 11 files changed, 40 insertions(+), 115 deletions(-) delete mode 100644 arch/x86/platform/intel-mid/device_libs/platform_ipc.c delete mode 100644 arch/x86/platform/intel-mid/device_libs/platform_ipc.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/intel-mid.h b/arch/x86/include/asm/intel-mid.h index 49da9f497b90..91ead0cefa76 100644 --- a/arch/x86/include/asm/intel-mid.h +++ b/arch/x86/include/asm/intel-mid.h @@ -42,10 +42,8 @@ struct devs_id { char name[SFI_NAME_LEN + 1]; u8 type; u8 delay; + u8 msic; void *(*get_platform_data)(void *info); - /* Custom handler for devices */ - void (*device_handler)(struct sfi_device_table_entry *pentry, - struct devs_id *dev); }; #define sfi_device(i) \ diff --git a/arch/x86/platform/intel-mid/device_libs/Makefile b/arch/x86/platform/intel-mid/device_libs/Makefile index 4d8c14a783d5..d4af7785844e 100644 --- a/arch/x86/platform/intel-mid/device_libs/Makefile +++ b/arch/x86/platform/intel-mid/device_libs/Makefile @@ -5,7 +5,6 @@ obj-$(subst m,y,$(CONFIG_MMC_SDHCI_PCI)) += platform_mrfld_sd.o # WiFi obj-$(subst m,y,$(CONFIG_BRCMFMAC_SDIO)) += platform_bcm43xx.o # IPC Devices -obj-y += platform_ipc.o obj-$(subst m,y,$(CONFIG_MFD_INTEL_MSIC)) += platform_msic.o obj-$(subst m,y,$(CONFIG_SND_MFLD_MACHINE)) += 
platform_msic_audio.o obj-$(subst m,y,$(CONFIG_GPIO_MSIC)) += platform_msic_gpio.o diff --git a/arch/x86/platform/intel-mid/device_libs/platform_ipc.c b/arch/x86/platform/intel-mid/device_libs/platform_ipc.c deleted file mode 100644 index a428c051e7bb..000000000000 --- a/arch/x86/platform/intel-mid/device_libs/platform_ipc.c +++ /dev/null @@ -1,59 +0,0 @@ -/* - * platform_ipc.c: IPC platform library file - * - * (C) Copyright 2013 Intel Corporation - * Author: Sathyanarayanan Kuppuswamy - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. - */ - -#include -#include -#include -#include -#include -#include -#include "platform_ipc.h" - -void __init ipc_device_handler(struct sfi_device_table_entry *pentry, - struct devs_id *dev) -{ - struct platform_device *pdev; - void *pdata = NULL; - static struct resource res __initdata = { - .name = "IRQ", - .flags = IORESOURCE_IRQ, - }; - - pr_debug("IPC bus, name = %16.16s, irq = 0x%2x\n", - pentry->name, pentry->irq); - - /* - * We need to call platform init of IPC devices to fill misc_pdata - * structure. It will be used in msic_init for initialization. - */ - if (dev != NULL) - pdata = dev->get_platform_data(pentry); - - /* - * On Medfield the platform device creation is handled by the MSIC - * MFD driver so we don't need to do it here. - */ - if (intel_mid_has_msic()) - return; - - pdev = platform_device_alloc(pentry->name, 0); - if (pdev == NULL) { - pr_err("out of memory for SFI platform device '%s'.\n", - pentry->name); - return; - } - res.start = pentry->irq; - platform_device_add_resources(pdev, &res, 1); - - pdev->dev.platform_data = pdata; - intel_scu_device_register(pdev); -} diff --git a/arch/x86/platform/intel-mid/device_libs/platform_ipc.h b/arch/x86/platform/intel-mid/device_libs/platform_ipc.h deleted file mode 100644 index 79bb09d4f718..000000000000 --- a/arch/x86/platform/intel-mid/device_libs/platform_ipc.h +++ /dev/null @@ -1,18 +0,0 @@ -/* - * platform_ipc.h: IPC platform library header file - * - * (C) Copyright 2013 Intel Corporation - * Author: Sathyanarayanan Kuppuswamy - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. 
- */ -#ifndef _PLATFORM_IPC_H_ -#define _PLATFORM_IPC_H_ - -void __init -ipc_device_handler(struct sfi_device_table_entry *pentry, struct devs_id *dev); - -#endif diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_audio.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_audio.c index cb3490ecb341..d4dc744dd5a5 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_msic_audio.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_audio.c @@ -20,7 +20,6 @@ #include #include "platform_msic.h" -#include "platform_ipc.h" static void *msic_audio_platform_data(void *info) { @@ -40,8 +39,8 @@ static const struct devs_id msic_audio_dev_id __initconst = { .name = "msic_audio", .type = SFI_DEV_TYPE_IPC, .delay = 1, + .msic = 1, .get_platform_data = &msic_audio_platform_data, - .device_handler = &ipc_device_handler, }; sfi_device(msic_audio_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_battery.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_battery.c index 4f72193939a6..5c3e9919633f 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_msic_battery.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_battery.c @@ -19,7 +19,6 @@ #include #include "platform_msic.h" -#include "platform_ipc.h" static void __init *msic_battery_platform_data(void *info) { @@ -30,8 +29,8 @@ static const struct devs_id msic_battery_dev_id __initconst = { .name = "msic_battery", .type = SFI_DEV_TYPE_IPC, .delay = 1, + .msic = 1, .get_platform_data = &msic_battery_platform_data, - .device_handler = &ipc_device_handler, }; sfi_device(msic_battery_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_gpio.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_gpio.c index 70de5b531ba0..9fdb88d460d7 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_msic_gpio.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_gpio.c @@ -20,7 +20,6 @@ #include #include "platform_msic.h" -#include "platform_ipc.h" static void __init *msic_gpio_platform_data(void *info) { @@ -41,8 +40,8 @@ static const struct devs_id msic_gpio_dev_id __initconst = { .name = "msic_gpio", .type = SFI_DEV_TYPE_IPC, .delay = 1, + .msic = 1, .get_platform_data = &msic_gpio_platform_data, - .device_handler = &ipc_device_handler, }; sfi_device(msic_gpio_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_ocd.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_ocd.c index 3d7c2011b6cf..7ae37cdbf256 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_msic_ocd.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_ocd.c @@ -20,7 +20,6 @@ #include #include "platform_msic.h" -#include "platform_ipc.h" static void __init *msic_ocd_platform_data(void *info) { @@ -42,8 +41,8 @@ static const struct devs_id msic_ocd_dev_id __initconst = { .name = "msic_ocd", .type = SFI_DEV_TYPE_IPC, .delay = 1, + .msic = 1, .get_platform_data = &msic_ocd_platform_data, - .device_handler = &ipc_device_handler, }; sfi_device(msic_ocd_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_power_btn.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_power_btn.c index 038f618fbc52..96809b98cf69 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_msic_power_btn.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_power_btn.c @@ -18,7 +18,6 @@ #include #include "platform_msic.h" -#include "platform_ipc.h" static void __init *msic_power_btn_platform_data(void *info) { @@ -29,8 
+28,8 @@ static const struct devs_id msic_power_btn_dev_id __initconst = { .name = "msic_power_btn", .type = SFI_DEV_TYPE_IPC, .delay = 1, + .msic = 1, .get_platform_data = &msic_power_btn_platform_data, - .device_handler = &ipc_device_handler, }; sfi_device(msic_power_btn_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_thermal.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_thermal.c index 114a5755b1e4..3e4167d246cd 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_msic_thermal.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_thermal.c @@ -19,7 +19,6 @@ #include #include "platform_msic.h" -#include "platform_ipc.h" static void __init *msic_thermal_platform_data(void *info) { @@ -30,8 +29,8 @@ static const struct devs_id msic_thermal_dev_id __initconst = { .name = "msic_thermal", .type = SFI_DEV_TYPE_IPC, .delay = 1, + .msic = 1, .get_platform_data = &msic_thermal_platform_data, - .device_handler = &ipc_device_handler, }; sfi_device(msic_thermal_dev_id); diff --git a/arch/x86/platform/intel-mid/sfi.c b/arch/x86/platform/intel-mid/sfi.c index 051d264fce2e..e8f68f652087 100644 --- a/arch/x86/platform/intel-mid/sfi.c +++ b/arch/x86/platform/intel-mid/sfi.c @@ -335,10 +335,22 @@ static void __init sfi_handle_ipc_dev(struct sfi_device_table_entry *pentry, pr_debug("IPC bus, name = %16.16s, irq = 0x%2x\n", pentry->name, pentry->irq); + + /* + * We need to call platform init of IPC devices to fill misc_pdata + * structure. It will be used in msic_init for initialization. + */ pdata = intel_mid_sfi_get_pdata(dev, pentry); if (IS_ERR(pdata)) return; + /* + * On Medfield the platform device creation is handled by the MSIC + * MFD driver so we don't need to do it here. + */ + if (dev->msic && intel_mid_has_msic()) + return; + pdev = platform_device_alloc(pentry->name, 0); if (pdev == NULL) { pr_err("out of memory for SFI platform device '%s'.\n", @@ -348,7 +360,10 @@ static void __init sfi_handle_ipc_dev(struct sfi_device_table_entry *pentry, install_irq_resource(pdev, pentry->irq); pdev->dev.platform_data = pdata; - platform_device_add(pdev); + if (dev->delay) + intel_scu_device_register(pdev); + else + platform_device_add(pdev); } static void __init sfi_handle_spi_dev(struct sfi_device_table_entry *pentry, @@ -503,27 +518,23 @@ static int __init sfi_parse_devs(struct sfi_table_header *table) if (!dev) continue; - if (dev->device_handler) { - dev->device_handler(pentry, dev); - } else { - switch (pentry->type) { - case SFI_DEV_TYPE_IPC: - sfi_handle_ipc_dev(pentry, dev); - break; - case SFI_DEV_TYPE_SPI: - sfi_handle_spi_dev(pentry, dev); - break; - case SFI_DEV_TYPE_I2C: - sfi_handle_i2c_dev(pentry, dev); - break; - case SFI_DEV_TYPE_SD: - sfi_handle_sd_dev(pentry, dev); - break; - case SFI_DEV_TYPE_UART: - case SFI_DEV_TYPE_HSI: - default: - break; - } + switch (pentry->type) { + case SFI_DEV_TYPE_IPC: + sfi_handle_ipc_dev(pentry, dev); + break; + case SFI_DEV_TYPE_SPI: + sfi_handle_spi_dev(pentry, dev); + break; + case SFI_DEV_TYPE_I2C: + sfi_handle_i2c_dev(pentry, dev); + break; + case SFI_DEV_TYPE_SD: + sfi_handle_sd_dev(pentry, dev); + break; + case SFI_DEV_TYPE_UART: + case SFI_DEV_TYPE_HSI: + default: + break; } } return 0; -- cgit From ecc7ea5dd1409d4e6dfba2f0ff0ee1c6ccd855bd Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 5 Jan 2017 18:17:17 +0200 Subject: x86/platform/intel-mid: Enable GPIO keys on Merrifield The Merrifield firmware provides 3 descriptions of buttons connected to GPIO. 
Append them to the list of supported GPIO keys. Signed-off-by: Andy Shevchenko Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170105161717.115261-1-andriy.shevchenko@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c b/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c index 52534ec29765..74283875c7e8 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c @@ -32,6 +32,9 @@ static struct gpio_keys_button gpio_button[] = { {SW_LID, -1, 1, "lid_switch", EV_SW, 0, 20}, {KEY_VOLUMEUP, -1, 1, "vol_up", EV_KEY, 0, 20}, {KEY_VOLUMEDOWN, -1, 1, "vol_down", EV_KEY, 0, 20}, + {KEY_MUTE, -1, 1, "mute_enable", EV_KEY, 0, 20}, + {KEY_VOLUMEUP, -1, 1, "volume_up", EV_KEY, 0, 20}, + {KEY_VOLUMEDOWN, -1, 1, "volume_down", EV_KEY, 0, 20}, {KEY_CAMERA, -1, 1, "camera_full", EV_KEY, 0, 20}, {KEY_CAMERA_FOCUS, -1, 1, "camera_half", EV_KEY, 0, 20}, {SW_KEYPAD_SLIDE, -1, 1, "MagSw1", EV_SW, 0, 20}, -- cgit From 12bf98b91f7aa8a9a526309aba645ccdcc470cab Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Thu, 5 Jan 2017 17:54:43 +0800 Subject: x86/apic: Fix typos in comments s/ID/IDs/ s/inr_logical_cpuidi/nr_logical_cpuids/ s/generic_processor_info()/__generic_processor_info()/ Signed-off-by: Dou Liyang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1483610083-24314-1-git-send-email-douly.fnst@cn.fujitsu.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 5b7e43eff139..5c4fdcfda109 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2028,8 +2028,8 @@ void disconnect_bsp_APIC(int virt_wire_setup) /* * The number of allocated logical CPU IDs. Since logical CPU IDs are allocated * contiguously, it equals to current allocated max logical CPU ID plus 1. - * All allocated CPU ID should be in [0, nr_logical_cpuidi), so the maximum of - * nr_logical_cpuids is nr_cpu_ids. + * All allocated CPU IDs should be in the [0, nr_logical_cpuids) range, + * so the maximum of nr_logical_cpuids is nr_cpu_ids. * * NOTE: Reserve 0 for BSP. */ @@ -2094,7 +2094,7 @@ int __generic_processor_info(int apicid, int version, bool enabled) * Since fixing handling of boot_cpu_physical_apicid requires * another discussion and tests on each platform, we leave it * for now and here we use read_apic_id() directly in this - * function, generic_processor_info(). + * function, __generic_processor_info(). */ if (disabled_cpu_apicid != BAD_APICID && disabled_cpu_apicid != read_apic_id() && -- cgit From 914122c389d091a02f7b5476209af715e77ccb73 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 29 Dec 2016 17:45:49 +0100 Subject: x86/apic: Implement set_state_oneshot_stopped() callback When clock_event_device::set_state_oneshot_stopped() is not implemented, hrtimer_cancel() can't stop the clock when there is no more timer in the queue. 
So the ghost of the freshly cancelled hrtimer haunts us back later with an extra interrupt: -0 [002] d..2 2248.557659: hrtimer_cancel: hrtimer=ffff88021fa92d80 -0 [002] d.h1 2249.303659: local_timer_entry: vector=239 So let's implement this missing callback for the lapic clock. This consist in calling its set_state_shutdown() callback. There don't seem to be a lighter way to stop the clock. Simply writing 0 to APIC_TMICT won't be enough to stop the clock and avoid the extra interrupt, as opposed to what is specified in the specs. We must also mask the timer interrupt in the device. Signed-off-by: Frederic Weisbecker Cc: Borislav Petkov Reviewed-by: Wanpeng Li Reviewed-by: Viresh Kumar Link: http://lkml.kernel.org/r/1483029949-6925-1-git-send-email-fweisbec@gmail.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/apic.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 5c4fdcfda109..fdb9c46227cc 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -529,18 +529,19 @@ static void lapic_timer_broadcast(const struct cpumask *mask) * The local apic timer can be used for any function which is CPU local. */ static struct clock_event_device lapic_clockevent = { - .name = "lapic", - .features = CLOCK_EVT_FEAT_PERIODIC | - CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_C3STOP - | CLOCK_EVT_FEAT_DUMMY, - .shift = 32, - .set_state_shutdown = lapic_timer_shutdown, - .set_state_periodic = lapic_timer_set_periodic, - .set_state_oneshot = lapic_timer_set_oneshot, - .set_next_event = lapic_next_event, - .broadcast = lapic_timer_broadcast, - .rating = 100, - .irq = -1, + .name = "lapic", + .features = CLOCK_EVT_FEAT_PERIODIC | + CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_C3STOP + | CLOCK_EVT_FEAT_DUMMY, + .shift = 32, + .set_state_shutdown = lapic_timer_shutdown, + .set_state_periodic = lapic_timer_set_periodic, + .set_state_oneshot = lapic_timer_set_oneshot, + .set_state_oneshot_stopped = lapic_timer_shutdown, + .set_next_event = lapic_next_event, + .broadcast = lapic_timer_broadcast, + .rating = 100, + .irq = -1, }; static DEFINE_PER_CPU(struct clock_event_device, lapic_events); -- cgit From 35e6eaa3df55822d0cb1df3bf08e6cb816737131 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Fri, 16 Dec 2016 16:10:01 +0100 Subject: KVM: x86: don't allow kernel irqchip with split irqchip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Split irqchip cannot be created after creating the kernel irqchip, but we forgot to restrict the other way. This is an API change. 
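As context, a minimal userspace sketch of the ordering that is now rejected (an illustration, not part of the patch; the 24-pin value for args[0] and the minimal error handling are assumptions): once KVM_CAP_SPLIT_IRQCHIP has been enabled on a VM, a later KVM_CREATE_IRQCHIP should fail with EEXIST.

    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    int main(void)
    {
            int kvm = open("/dev/kvm", O_RDWR);
            int vm = ioctl(kvm, KVM_CREATE_VM, 0);
            struct kvm_enable_cap cap = {
                    .cap = KVM_CAP_SPLIT_IRQCHIP,
                    .args[0] = 24,  /* IOAPIC pins handled in userspace (illustrative) */
            };

            if (ioctl(vm, KVM_ENABLE_CAP, &cap) < 0)
                    perror("KVM_ENABLE_CAP(KVM_CAP_SPLIT_IRQCHIP)");

            /* With this change the kernel irqchip can no longer be created
             * on top of a split irqchip; expect failure with EEXIST here. */
            if (ioctl(vm, KVM_CREATE_IRQCHIP, 0) < 0)
                    printf("KVM_CREATE_IRQCHIP: %s (expected)\n", strerror(errno));

            return 0;
    }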
Reviewed-by: Paolo Bonzini Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2f22810a7e0c..c72a8d00a1c0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3961,7 +3961,7 @@ long kvm_arch_vm_ioctl(struct file *filp, mutex_lock(&kvm->lock); r = -EEXIST; - if (kvm->arch.vpic) + if (irqchip_in_kernel(kvm)) goto create_irqchip_unlock; r = -EINVAL; if (kvm->created_vcpus) -- cgit From 49776faf93f8074bb4990beac04781a9507d3650 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Fri, 16 Dec 2016 16:10:02 +0100 Subject: KVM: x86: decouple irqchip_in_kernel() and pic_irqchip() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit irqchip_in_kernel() tried to save a bit by reusing pic_irqchip(), but it just complicated the code. Add a separate state for the irqchip mode. Reviewed-by: David Hildenbrand [Used Paolo's version of condition in irqchip_in_kernel().] Signed-off-by: Radim Krčmář --- arch/x86/include/asm/kvm_host.h | 8 +++++++- arch/x86/kvm/irq.h | 15 ++++++++------- arch/x86/kvm/x86.c | 5 +++-- 3 files changed, 18 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a7066dc1a7e9..fc03ab1f6110 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -716,6 +716,12 @@ struct kvm_hv { HV_REFERENCE_TSC_PAGE tsc_ref; }; +enum kvm_irqchip_mode { + KVM_IRQCHIP_NONE, + KVM_IRQCHIP_KERNEL, /* created with KVM_CREATE_IRQCHIP */ + KVM_IRQCHIP_SPLIT, /* created with KVM_CAP_SPLIT_IRQCHIP */ +}; + struct kvm_arch { unsigned int n_used_mmu_pages; unsigned int n_requested_mmu_pages; @@ -788,7 +794,7 @@ struct kvm_arch { u64 disabled_quirks; - bool irqchip_split; + enum kvm_irqchip_mode irqchip_mode; u8 nr_reserved_ioapic_pins; bool disabled_lapic_found; diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 035731eb3897..f4965bc2613c 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -93,18 +93,19 @@ static inline int pic_in_kernel(struct kvm *kvm) static inline int irqchip_split(struct kvm *kvm) { - return kvm->arch.irqchip_split; + return kvm->arch.irqchip_mode == KVM_IRQCHIP_SPLIT; } -static inline int irqchip_in_kernel(struct kvm *kvm) +static inline int irqchip_kernel(struct kvm *kvm) { - struct kvm_pic *vpic = pic_irqchip(kvm); - bool ret; + return kvm->arch.irqchip_mode == KVM_IRQCHIP_KERNEL; +} - ret = (vpic != NULL); - ret |= irqchip_split(kvm); +static inline int irqchip_in_kernel(struct kvm *kvm) +{ + bool ret = kvm->arch.irqchip_mode != KVM_IRQCHIP_NONE; - /* Read vpic before kvm->irq_routing. */ + /* Matches with wmb after initializing kvm->irq_routing. */ smp_rmb(); return ret; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c72a8d00a1c0..0630ab438bd5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3894,7 +3894,7 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, goto split_irqchip_unlock; /* Pairs with irqchip_in_kernel. */ smp_wmb(); - kvm->arch.irqchip_split = true; + kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT; kvm->arch.nr_reserved_ioapic_pins = cap->args[0]; r = 0; split_irqchip_unlock: @@ -3988,8 +3988,9 @@ long kvm_arch_vm_ioctl(struct file *filp, mutex_unlock(&kvm->slots_lock); goto create_irqchip_unlock; } - /* Write kvm->irq_routing before kvm->arch.vpic. 
*/ + /* Write kvm->irq_routing before enabling irqchip_in_kernel. */ smp_wmb(); + kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL; kvm->arch.vpic = vpic; create_irqchip_unlock: mutex_unlock(&kvm->lock); -- cgit From 099413664c71fcf9d0099eba4f8a4dd59653d5a3 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Fri, 16 Dec 2016 16:10:03 +0100 Subject: KVM: x86: make pic setup code look like ioapic setup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We don't treat kvm->arch.vpic specially anymore, so the setup can look like ioapic. This gets a bit more information out of return values. Reviewed-by: Paolo Bonzini Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/i8259.c | 16 +++++++++++----- arch/x86/kvm/irq.h | 4 ++-- arch/x86/kvm/x86.c | 30 +++++++++++++++--------------- 3 files changed, 28 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 7cc2360f1848..73ea24d4f119 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -598,14 +598,14 @@ static const struct kvm_io_device_ops picdev_eclr_ops = { .write = picdev_eclr_write, }; -struct kvm_pic *kvm_create_pic(struct kvm *kvm) +int kvm_pic_init(struct kvm *kvm) { struct kvm_pic *s; int ret; s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); if (!s) - return NULL; + return -ENOMEM; spin_lock_init(&s->lock); s->kvm = kvm; s->pics[0].elcr_mask = 0xf8; @@ -635,7 +635,9 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) mutex_unlock(&kvm->slots_lock); - return s; + kvm->arch.vpic = s; + + return 0; fail_unreg_1: kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &s->dev_slave); @@ -648,13 +650,17 @@ fail_unlock: kfree(s); - return NULL; + return ret; } -void kvm_destroy_pic(struct kvm_pic *vpic) +void kvm_pic_destroy(struct kvm *kvm) { + struct kvm_pic *vpic = kvm->arch.vpic; + kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_master); kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_slave); kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_eclr); + + kvm->arch.vpic = NULL; kfree(vpic); } diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index f4965bc2613c..40d5b2cf6061 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -73,8 +73,8 @@ struct kvm_pic { unsigned long irq_states[PIC_NUM_PINS]; }; -struct kvm_pic *kvm_create_pic(struct kvm *kvm); -void kvm_destroy_pic(struct kvm_pic *vpic); +int kvm_pic_init(struct kvm *kvm); +void kvm_pic_destroy(struct kvm *kvm); int kvm_pic_read_irq(struct kvm *kvm); void kvm_pic_update_irq(struct kvm_pic *s); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0630ab438bd5..05ac71a01f99 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3957,33 +3957,34 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); break; case KVM_CREATE_IRQCHIP: { - struct kvm_pic *vpic; - mutex_lock(&kvm->lock); + r = -EEXIST; if (irqchip_in_kernel(kvm)) goto create_irqchip_unlock; + r = -EINVAL; if (kvm->created_vcpus) goto create_irqchip_unlock; - r = -ENOMEM; - vpic = kvm_create_pic(kvm); - if (vpic) { - r = kvm_ioapic_init(kvm); - if (r) { - mutex_lock(&kvm->slots_lock); - kvm_destroy_pic(vpic); - mutex_unlock(&kvm->slots_lock); - goto create_irqchip_unlock; - } - } else + + r = kvm_pic_init(kvm); + if (r) goto create_irqchip_unlock; + + r = kvm_ioapic_init(kvm); + if (r) { + mutex_lock(&kvm->slots_lock); + kvm_pic_destroy(kvm); + mutex_unlock(&kvm->slots_lock); + goto 
create_irqchip_unlock; + } + r = kvm_setup_default_irq_routing(kvm); if (r) { mutex_lock(&kvm->slots_lock); mutex_lock(&kvm->irq_lock); kvm_ioapic_destroy(kvm); - kvm_destroy_pic(vpic); + kvm_pic_destroy(kvm); mutex_unlock(&kvm->irq_lock); mutex_unlock(&kvm->slots_lock); goto create_irqchip_unlock; @@ -3991,7 +3992,6 @@ long kvm_arch_vm_ioctl(struct file *filp, /* Write kvm->irq_routing before enabling irqchip_in_kernel. */ smp_wmb(); kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL; - kvm->arch.vpic = vpic; create_irqchip_unlock: mutex_unlock(&kvm->lock); break; -- cgit From e5dc48777dcc898210e2f16d80d44718db38cdc3 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Fri, 16 Dec 2016 16:10:04 +0100 Subject: KVM: x86: refactor pic setup in kvm_set_routing_entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Paolo Bonzini Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/irq_comm.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 6c0191615f23..1dfeb185a1e3 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -297,14 +297,12 @@ int kvm_set_routing_entry(struct kvm *kvm, case KVM_IRQ_ROUTING_IRQCHIP: delta = 0; switch (ue->u.irqchip.irqchip) { - case KVM_IRQCHIP_PIC_MASTER: - e->set = kvm_set_pic_irq; - max_pin = PIC_NUM_PINS; - break; case KVM_IRQCHIP_PIC_SLAVE: + delta = 8; + /* fall through */ + case KVM_IRQCHIP_PIC_MASTER: e->set = kvm_set_pic_irq; max_pin = PIC_NUM_PINS; - delta = 8; break; case KVM_IRQCHIP_IOAPIC: max_pin = KVM_IOAPIC_NUM_PINS; -- cgit From 8231f50d9853274ed104aac86b6b6263ca666c4d Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Fri, 16 Dec 2016 16:10:05 +0100 Subject: KVM: x86: prevent setup of invalid routes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The check in kvm_set_pic_irq() and kvm_set_ioapic_irq() was just a temporary measure until the code improved enough for us to do this. This changes APIC in a case when KVM_SET_GSI_ROUTING is called to set up pic and ioapic routes before KVM_CREATE_IRQCHIP. Those rules would get overwritten by KVM_CREATE_IRQCHIP at best, so it is pointless to allow it. Userspaces hopefully noticed that things don't work if they do that and don't do that. Reviewed-by: Paolo Bonzini Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/irq_comm.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 1dfeb185a1e3..2639b8d3dce2 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -41,15 +41,6 @@ static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, bool line_status) { struct kvm_pic *pic = pic_irqchip(kvm); - - /* - * XXX: rejecting pic routes when pic isn't in use would be better, - * but the default routing table is installed while kvm->arch.vpic is - * NULL and KVM_CREATE_IRQCHIP can race with KVM_IRQ_LINE. 
- */ - if (!pic) - return -1; - return kvm_pic_set_irq(pic, e->irqchip.pin, irq_source_id, level); } @@ -58,10 +49,6 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e, bool line_status) { struct kvm_ioapic *ioapic = kvm->arch.vioapic; - - if (!ioapic) - return -1; - return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, irq_source_id, level, line_status); } @@ -301,10 +288,16 @@ int kvm_set_routing_entry(struct kvm *kvm, delta = 8; /* fall through */ case KVM_IRQCHIP_PIC_MASTER: + if (!pic_in_kernel(kvm)) + goto out; + e->set = kvm_set_pic_irq; max_pin = PIC_NUM_PINS; break; case KVM_IRQCHIP_IOAPIC: + if (!ioapic_in_kernel(kvm)) + goto out; + max_pin = KVM_IOAPIC_NUM_PINS; e->set = kvm_set_ioapic_irq; break; -- cgit From 826da32140dada1467f4216410525511393317e8 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Fri, 16 Dec 2016 16:10:06 +0100 Subject: KVM: x86: simplify conditions with split/kernel irqchip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Paolo Bonzini Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/irq_comm.c | 2 +- arch/x86/kvm/x86.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 2639b8d3dce2..b96d3893f121 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -400,7 +400,7 @@ int kvm_setup_empty_irq_routing(struct kvm *kvm) void kvm_arch_post_irq_routing_update(struct kvm *kvm) { - if (ioapic_in_kernel(kvm) || !irqchip_in_kernel(kvm)) + if (!irqchip_split(kvm)) return; kvm_make_scan_ioapic_request(kvm); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 05ac71a01f99..a356d8e12c2f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4027,7 +4027,7 @@ long kvm_arch_vm_ioctl(struct file *filp, } r = -ENXIO; - if (!irqchip_in_kernel(kvm) || irqchip_split(kvm)) + if (!irqchip_kernel(kvm)) goto get_irqchip_out; r = kvm_vm_ioctl_get_irqchip(kvm, chip); if (r) @@ -4051,7 +4051,7 @@ long kvm_arch_vm_ioctl(struct file *filp, } r = -ENXIO; - if (!irqchip_in_kernel(kvm) || irqchip_split(kvm)) + if (!irqchip_kernel(kvm)) goto set_irqchip_out; r = kvm_vm_ioctl_set_irqchip(kvm, chip); if (r) -- cgit From f3414bc77419463c0d81eaa2cea7ee4ccb447c7d Mon Sep 17 00:00:00 2001 From: David Matlack Date: Tue, 20 Dec 2016 15:25:57 -0800 Subject: kvm: x86: export maximum number of mmu_page_hash collisions Report the maximum number of mmu_page_hash collisions as a per-VM stat. This will make it easy to identify problems with the mmu_page_hash in the future. 
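Because the counter is wired into debugfs_entries[] as a VM_STAT, it is also exposed through kvm's debugfs directory. A trivial reader is sketched below; the path assumes debugfs is mounted at /sys/kernel/debug, which is an assumption of the example rather than something this patch guarantees.

    #include <stdio.h>

    int main(void)
    {
            unsigned long long collisions;
            FILE *f = fopen("/sys/kernel/debug/kvm/max_mmu_page_hash_collisions", "r");

            if (!f) {
                    perror("fopen");
                    return 1;
            }
            if (fscanf(f, "%llu", &collisions) == 1)
                    printf("max_mmu_page_hash_collisions: %llu\n", collisions);
            fclose(f);
            return 0;
    }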
Signed-off-by: David Matlack Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu.c | 25 +++++++++++++++++-------- arch/x86/kvm/x86.c | 2 ++ 3 files changed, 20 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index fc03ab1f6110..1bb1ffc0024c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -821,6 +821,7 @@ struct kvm_vm_stat { ulong mmu_unsync; ulong remote_tlb_flush; ulong lpages; + ulong max_mmu_page_hash_collisions; }; struct kvm_vcpu_stat { diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 7012de4a1fed..45ee7ae88239 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1904,17 +1904,17 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, * since it has been deleted from active_mmu_pages but still can be found * at hast list. * - * for_each_gfn_valid_sp() has skipped that kind of pages. + * for_each_valid_sp() has skipped that kind of pages. */ -#define for_each_gfn_valid_sp(_kvm, _sp, _gfn) \ +#define for_each_valid_sp(_kvm, _sp, _gfn) \ hlist_for_each_entry(_sp, \ &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \ - if ((_sp)->gfn != (_gfn) || is_obsolete_sp((_kvm), (_sp)) \ - || (_sp)->role.invalid) {} else + if (is_obsolete_sp((_kvm), (_sp)) || (_sp)->role.invalid) { \ + } else #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \ - for_each_gfn_valid_sp(_kvm, _sp, _gfn) \ - if ((_sp)->role.direct) {} else + for_each_valid_sp(_kvm, _sp, _gfn) \ + if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else /* @sp->gfn should be write-protected at the call site */ static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, @@ -2116,6 +2116,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp; bool need_sync = false; bool flush = false; + int collisions = 0; LIST_HEAD(invalid_list); role = vcpu->arch.mmu.base_role; @@ -2130,7 +2131,12 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; role.quadrant = quadrant; } - for_each_gfn_valid_sp(vcpu->kvm, sp, gfn) { + for_each_valid_sp(vcpu->kvm, sp, gfn) { + if (sp->gfn != gfn) { + collisions++; + continue; + } + if (!need_sync && sp->unsync) need_sync = true; @@ -2153,7 +2159,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, __clear_sp_write_flooding_count(sp); trace_kvm_mmu_get_page(sp, false); - return sp; + goto out; } ++vcpu->kvm->stat.mmu_cache_miss; @@ -2183,6 +2189,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, trace_kvm_mmu_get_page(sp, true); kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); +out: + if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions) + vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions; return sp; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a356d8e12c2f..4aece8b0a4aa 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -190,6 +190,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "mmu_unsync", VM_STAT(mmu_unsync) }, { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, { "largepages", VM_STAT(lpages) }, + { "max_mmu_page_hash_collisions", + VM_STAT(max_mmu_page_hash_collisions) }, { NULL } }; -- cgit From 114df303a7eeae8b50ebf68229b7e647714a9bea Mon Sep 17 00:00:00 2001 From: David Matlack Date: Mon, 19 Dec 2016 13:58:25 -0800 Subject: kvm: x86: reduce collisions in mmu_page_hash 
When using two-dimensional paging, the mmu_page_hash (which provides lookups for existing kvm_mmu_page structs), becomes imbalanced; with too many collisions in buckets 0 and 512. This has been seen to cause mmu_lock to be held for multiple milliseconds in kvm_mmu_get_page on VMs with a large amount of RAM mapped with 4K pages. The current hash function uses the lower 10 bits of gfn to index into mmu_page_hash. When doing shadow paging, gfn is the address of the guest page table being shadow. These tables are 4K-aligned, which makes the low bits of gfn a good hash. However, with two-dimensional paging, no guest page tables are being shadowed, so gfn is the base address that is mapped by the table. Thus page tables (level=1) have a 2MB aligned gfn, page directories (level=2) have a 1GB aligned gfn, etc. This means hashes will only differ in their 10th bit. hash_64() provides a better hash. For example, on a VM with ~200G (99458 direct=1 kvm_mmu_page structs): hash max_mmu_page_hash_collisions -------------------------------------------- low 10 bits 49847 hash_64 105 perfect 97 While we're changing the hash, increase the table size by 4x to better support large VMs (further reduces number of collisions in 200G VM to 29). Note that hash_64() does not provide a good distribution prior to commit ef703f49a6c5 ("Eliminate bad hash multipliers from hash_32() and hash_64()"). Signed-off-by: David Matlack Change-Id: I5aa6b13c834722813c6cca46b8b1ed6f53368ade Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 1bb1ffc0024c..7e594a325158 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -115,7 +115,7 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) #define KVM_PERMILLE_MMU_PAGES 20 #define KVM_MIN_ALLOC_MMU_PAGES 64 -#define KVM_MMU_HASH_SHIFT 10 +#define KVM_MMU_HASH_SHIFT 12 #define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT) #define KVM_MIN_FREE_MMU_PAGES 5 #define KVM_REFILL_PAGES 25 diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 45ee7ae88239..3f9fa39f1469 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -1713,7 +1714,7 @@ static void kvm_mmu_free_page(struct kvm_mmu_page *sp) static unsigned kvm_page_table_hashfn(gfn_t gfn) { - return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1); + return hash_64(gfn, KVM_MMU_HASH_SHIFT); } static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu, -- cgit From 27959a4415a5a00881a7b9353ab9b1274da2ca47 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Tue, 6 Dec 2016 16:46:10 -0800 Subject: kvm: x86: mmu: Use symbolic constants for EPT Violation Exit Qualifications This change adds some symbolic constants for VM Exit Qualifications related to EPT Violations and updates handle_ept_violation() to use these constants instead of hard-coded numbers. 
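A self-contained sketch of the decoding that handle_ept_violation() performs with the new constants; the EPT_VIOLATION_* and PFERR_* values are copied here only so the example builds on its own (in the tree they live in vmx.h and kvm_host.h), and the sample qualification in main() is made up for illustration.

    #include <stdio.h>

    #define EPT_VIOLATION_READ        (1 << 0)
    #define EPT_VIOLATION_WRITE       (1 << 1)
    #define EPT_VIOLATION_INSTR       (1 << 2)
    #define EPT_VIOLATION_READABLE    (1 << 3)
    #define EPT_VIOLATION_WRITABLE    (1 << 4)
    #define EPT_VIOLATION_EXECUTABLE  (1 << 5)

    #define PFERR_PRESENT_MASK        (1u << 0)
    #define PFERR_WRITE_MASK          (1u << 1)
    #define PFERR_USER_MASK           (1u << 2)
    #define PFERR_FETCH_MASK          (1u << 4)

    static unsigned int ept_violation_to_page_fault_error_code(unsigned long qual)
    {
            unsigned int error_code = 0;

            error_code |= (qual & EPT_VIOLATION_READ)  ? PFERR_USER_MASK  : 0;
            error_code |= (qual & EPT_VIOLATION_WRITE) ? PFERR_WRITE_MASK : 0;
            error_code |= (qual & EPT_VIOLATION_INSTR) ? PFERR_FETCH_MASK : 0;
            /* any of the R/W/X-permitted bits set means the EPT entry was present */
            error_code |= (qual & (EPT_VIOLATION_READABLE | EPT_VIOLATION_WRITABLE |
                                   EPT_VIOLATION_EXECUTABLE)) ? PFERR_PRESENT_MASK : 0;
            return error_code;
    }

    int main(void)
    {
            /* Write fault on a present, read-only mapping: bits 1 and 3 set,
             * i.e. qualification 0x0a -> PFERR_WRITE_MASK | PFERR_PRESENT_MASK = 0x3 */
            printf("error_code = 0x%x\n", ept_violation_to_page_fault_error_code(0x0a));
            return 0;
    }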
Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/vmx.h | 16 ++++++++++++++++ arch/x86/kvm/vmx.c | 22 ++++++++++++++-------- 2 files changed, 30 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 2b5b2d4b924e..25a482fb5241 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -499,6 +499,22 @@ struct vmx_msr_entry { #define ENTRY_FAIL_NMI 3 #define ENTRY_FAIL_VMCS_LINK_PTR 4 +/* + * Exit Qualifications for EPT Violations + */ +#define EPT_VIOLATION_READ_BIT 0 +#define EPT_VIOLATION_WRITE_BIT 1 +#define EPT_VIOLATION_INSTR_BIT 2 +#define EPT_VIOLATION_READABLE_BIT 3 +#define EPT_VIOLATION_WRITABLE_BIT 4 +#define EPT_VIOLATION_EXECUTABLE_BIT 5 +#define EPT_VIOLATION_READ (1 << EPT_VIOLATION_READ_BIT) +#define EPT_VIOLATION_WRITE (1 << EPT_VIOLATION_WRITE_BIT) +#define EPT_VIOLATION_INSTR (1 << EPT_VIOLATION_INSTR_BIT) +#define EPT_VIOLATION_READABLE (1 << EPT_VIOLATION_READABLE_BIT) +#define EPT_VIOLATION_WRITABLE (1 << EPT_VIOLATION_WRITABLE_BIT) +#define EPT_VIOLATION_EXECUTABLE (1 << EPT_VIOLATION_EXECUTABLE_BIT) + /* * VM-instruction error numbers */ diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a236decb81e4..81159a3878f4 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6374,14 +6374,20 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); trace_kvm_page_fault(gpa, exit_qualification); - /* it is a read fault? */ - error_code = (exit_qualification << 2) & PFERR_USER_MASK; - /* it is a write fault? */ - error_code |= exit_qualification & PFERR_WRITE_MASK; - /* It is a fetch fault? */ - error_code |= (exit_qualification << 2) & PFERR_FETCH_MASK; - /* ept page table is present? */ - error_code |= (exit_qualification & 0x38) != 0; + /* Is it a read fault? */ + error_code = (exit_qualification & EPT_VIOLATION_READ) + ? PFERR_USER_MASK : 0; + /* Is it a write fault? */ + error_code |= (exit_qualification & EPT_VIOLATION_WRITE) + ? PFERR_WRITE_MASK : 0; + /* Is it a fetch fault? */ + error_code |= (exit_qualification & EPT_VIOLATION_INSTR) + ? PFERR_FETCH_MASK : 0; + /* ept page table entry is present? */ + error_code |= (exit_qualification & + (EPT_VIOLATION_READABLE | EPT_VIOLATION_WRITABLE | + EPT_VIOLATION_EXECUTABLE)) + ? PFERR_PRESENT_MASK : 0; vcpu->arch.exit_qualification = exit_qualification; -- cgit From ea4114bcd3a8c84f0eb0b52e56d348c27ddede2e Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Tue, 6 Dec 2016 16:46:11 -0800 Subject: kvm: x86: mmu: Rename spte_is_locklessly_modifiable() This change renames spte_is_locklessly_modifiable() to spte_can_locklessly_be_made_writable() to distinguish it from other forms of lockless modifications. The full set of lockless modifications is covered by spte_has_volatile_bits(). 
Signed-off-by: Junaid Shahid Reviewed-by: Paolo Bonzini Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3f9fa39f1469..e923f393ac26 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -474,7 +474,7 @@ retry: } #endif -static bool spte_is_locklessly_modifiable(u64 spte) +static bool spte_can_locklessly_be_made_writable(u64 spte) { return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) == (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE); @@ -488,7 +488,7 @@ static bool spte_has_volatile_bits(u64 spte) * also, it can help us to get a stable is_writable_pte() * to ensure tlb flush is not missed. */ - if (spte_is_locklessly_modifiable(spte)) + if (spte_can_locklessly_be_made_writable(spte)) return true; if (!shadow_accessed_mask) @@ -557,7 +557,7 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) * we always atomically update it, see the comments in * spte_has_volatile_bits(). */ - if (spte_is_locklessly_modifiable(old_spte) && + if (spte_can_locklessly_be_made_writable(old_spte) && !is_writable_pte(new_spte)) ret = true; @@ -1213,7 +1213,7 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect) u64 spte = *sptep; if (!is_writable_pte(spte) && - !(pt_protect && spte_is_locklessly_modifiable(spte))) + !(pt_protect && spte_can_locklessly_be_made_writable(spte))) return false; rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep); @@ -2975,7 +2975,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, * Currently, to simplify the code, only the spte write-protected * by dirty-log can be fast fixed. */ - if (!spte_is_locklessly_modifiable(spte)) + if (!spte_can_locklessly_be_made_writable(spte)) goto exit; /* -- cgit From 97dceba29a6acbb28d16c8c5757ae9f4e1e482ea Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Tue, 6 Dec 2016 16:46:12 -0800 Subject: kvm: x86: mmu: Fast Page Fault path retries This change adds retries into the Fast Page Fault path. Without the retries, the code still works, but if a retry does end up being needed, then it will result in a second page fault for the same memory access, which will cause much more overhead compared to just retrying within the original fault. This would be especially useful with the upcoming fast access tracking change, as that would make it more likely for retries to be needed (e.g. due to read and write faults happening on different CPUs at the same time). Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 124 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 73 insertions(+), 51 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e923f393ac26..f6d3505c8d18 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2891,6 +2891,10 @@ static bool page_fault_can_be_fast(u32 error_code) return true; } +/* + * Returns true if the SPTE was fixed successfully. Otherwise, + * someone else modified the SPTE from its original value. + */ static bool fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *sptep, u64 spte) @@ -2917,8 +2921,10 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, * * Compare with set_spte where instead shadow_dirty_mask is set. 
*/ - if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte) - kvm_vcpu_mark_page_dirty(vcpu, gfn); + if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) != spte) + return false; + + kvm_vcpu_mark_page_dirty(vcpu, gfn); return true; } @@ -2933,8 +2939,9 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, { struct kvm_shadow_walk_iterator iterator; struct kvm_mmu_page *sp; - bool ret = false; + bool fault_handled = false; u64 spte = 0ull; + uint retry_count = 0; if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return false; @@ -2947,62 +2954,77 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, if (!is_shadow_present_pte(spte) || iterator.level < level) break; - /* - * If the mapping has been changed, let the vcpu fault on the - * same address again. - */ - if (!is_shadow_present_pte(spte)) { - ret = true; - goto exit; - } + do { + /* + * If the mapping has been changed, let the vcpu fault on the + * same address again. + */ + if (!is_shadow_present_pte(spte)) { + fault_handled = true; + break; + } - sp = page_header(__pa(iterator.sptep)); - if (!is_last_spte(spte, sp->role.level)) - goto exit; + sp = page_header(__pa(iterator.sptep)); + if (!is_last_spte(spte, sp->role.level)) + break; - /* - * Check if it is a spurious fault caused by TLB lazily flushed. - * - * Need not check the access of upper level table entries since - * they are always ACC_ALL. - */ - if (is_writable_pte(spte)) { - ret = true; - goto exit; - } + /* + * Check if it is a spurious fault caused by TLB lazily flushed. + * + * Need not check the access of upper level table entries since + * they are always ACC_ALL. + */ + if (is_writable_pte(spte)) { + fault_handled = true; + break; + } - /* - * Currently, to simplify the code, only the spte write-protected - * by dirty-log can be fast fixed. - */ - if (!spte_can_locklessly_be_made_writable(spte)) - goto exit; + /* + * Currently, to simplify the code, only the spte + * write-protected by dirty-log can be fast fixed. + */ + if (!spte_can_locklessly_be_made_writable(spte)) + break; - /* - * Do not fix write-permission on the large spte since we only dirty - * the first page into the dirty-bitmap in fast_pf_fix_direct_spte() - * that means other pages are missed if its slot is dirty-logged. - * - * Instead, we let the slow page fault path create a normal spte to - * fix the access. - * - * See the comments in kvm_arch_commit_memory_region(). - */ - if (sp->role.level > PT_PAGE_TABLE_LEVEL) - goto exit; + /* + * Do not fix write-permission on the large spte since we only + * dirty the first page into the dirty-bitmap in + * fast_pf_fix_direct_spte() that means other pages are missed + * if its slot is dirty-logged. + * + * Instead, we let the slow page fault path create a normal spte + * to fix the access. + * + * See the comments in kvm_arch_commit_memory_region(). + */ + if (sp->role.level > PT_PAGE_TABLE_LEVEL) + break; + + /* + * Currently, fast page fault only works for direct mapping + * since the gfn is not stable for indirect shadow page. See + * Documentation/virtual/kvm/locking.txt to get more detail. + */ + fault_handled = fast_pf_fix_direct_spte(vcpu, sp, + iterator.sptep, spte); + if (fault_handled) + break; + + if (++retry_count > 4) { + printk_once(KERN_WARNING + "kvm: Fast #PF retrying more than 4 times.\n"); + break; + } + + spte = mmu_spte_get_lockless(iterator.sptep); + + } while (true); - /* - * Currently, fast page fault only works for direct mapping since - * the gfn is not stable for indirect shadow page. 
- * See Documentation/virtual/kvm/locking.txt to get more detail. - */ - ret = fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte); -exit: trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep, - spte, ret); + spte, fault_handled); walk_shadow_page_lockless_end(vcpu); - return ret; + return fault_handled; } static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, -- cgit From 83ef6c8155c0ecb4c1a7e6bfbe425c85f7cb676d Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Tue, 6 Dec 2016 16:46:13 -0800 Subject: kvm: x86: mmu: Refactor accessed/dirty checks in mmu_spte_update/clear This simplifies mmu_spte_update() a little bit. The checks for clearing of accessed and dirty bits are refactored into separate functions, which are used inside both mmu_spte_update() and mmu_spte_clear_track_bits(), as well as kvm_test_age_rmapp(). The new helper functions handle both the case when A/D bits are supported in hardware and the case when they are not. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 66 +++++++++++++++++++++++++----------------------------- 1 file changed, 31 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index f6d3505c8d18..cfef95969335 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -504,14 +504,16 @@ static bool spte_has_volatile_bits(u64 spte) return true; } -static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask) +static bool is_accessed_spte(u64 spte) { - return (old_spte & bit_mask) && !(new_spte & bit_mask); + return shadow_accessed_mask ? spte & shadow_accessed_mask + : true; } -static bool spte_is_bit_changed(u64 old_spte, u64 new_spte, u64 bit_mask) +static bool is_dirty_spte(u64 spte) { - return (old_spte & bit_mask) != (new_spte & bit_mask); + return shadow_dirty_mask ? spte & shadow_dirty_mask + : spte & PT_WRITABLE_MASK; } /* Rules for using mmu_spte_set: @@ -534,17 +536,19 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte) * will find a read-only spte, even though the writable spte * might be cached on a CPU's TLB, the return value indicates this * case. + * + * Returns true if the TLB needs to be flushed */ static bool mmu_spte_update(u64 *sptep, u64 new_spte) { u64 old_spte = *sptep; - bool ret = false; + bool flush = false; WARN_ON(!is_shadow_present_pte(new_spte)); if (!is_shadow_present_pte(old_spte)) { mmu_spte_set(sptep, new_spte); - return ret; + return flush; } if (!spte_has_volatile_bits(old_spte)) @@ -552,6 +556,8 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) else old_spte = __update_clear_spte_slow(sptep, new_spte); + WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte)); + /* * For the spte updated out of mmu-lock is safe, since * we always atomically update it, see the comments in @@ -559,38 +565,31 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) */ if (spte_can_locklessly_be_made_writable(old_spte) && !is_writable_pte(new_spte)) - ret = true; - - if (!shadow_accessed_mask) { - /* - * We don't set page dirty when dropping non-writable spte. - * So do it now if the new spte is becoming non-writable. - */ - if (ret) - kvm_set_pfn_dirty(spte_to_pfn(old_spte)); - return ret; - } + flush = true; /* - * Flush TLB when accessed/dirty bits are changed in the page tables, + * Flush TLB when accessed/dirty states are changed in the page tables, * to guarantee consistency between TLB and page tables. 
*/ - if (spte_is_bit_changed(old_spte, new_spte, - shadow_accessed_mask | shadow_dirty_mask)) - ret = true; - if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask)) + if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) { + flush = true; kvm_set_pfn_accessed(spte_to_pfn(old_spte)); - if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask)) + } + + if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) { + flush = true; kvm_set_pfn_dirty(spte_to_pfn(old_spte)); + } - return ret; + return flush; } /* * Rules for using mmu_spte_clear_track_bits: * It sets the sptep from present to nonpresent, and track the * state bits, it is used to clear the last level sptep. + * Returns non-zero if the PTE was previously valid. */ static int mmu_spte_clear_track_bits(u64 *sptep) { @@ -614,11 +613,12 @@ static int mmu_spte_clear_track_bits(u64 *sptep) */ WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn))); - if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) + if (is_accessed_spte(old_spte)) kvm_set_pfn_accessed(pfn); - if (old_spte & (shadow_dirty_mask ? shadow_dirty_mask : - PT_WRITABLE_MASK)) + + if (is_dirty_spte(old_spte)) kvm_set_pfn_dirty(pfn); + return 1; } @@ -1616,7 +1616,6 @@ static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, { u64 *sptep; struct rmap_iterator iter; - int young = 0; /* * If there's no access bit in the secondary pte set by the @@ -1626,14 +1625,11 @@ static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, if (!shadow_accessed_mask) goto out; - for_each_rmap_spte(rmap_head, &iter, sptep) { - if (*sptep & shadow_accessed_mask) { - young = 1; - break; - } - } + for_each_rmap_spte(rmap_head, &iter, sptep) + if (is_accessed_spte(*sptep)) + return 1; out: - return young; + return 0; } #define RMAP_RECYCLE_THRESHOLD 1000 -- cgit From f39a058d0ea2f58b9c69cfcf7c93184f33302c98 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Tue, 6 Dec 2016 16:46:14 -0800 Subject: kvm: x86: mmu: Introduce a no-tracking version of mmu_spte_update mmu_spte_update() tracks changes in the accessed/dirty state of the SPTE being updated and calls kvm_set_pfn_accessed/dirty appropriately. However, in some cases (e.g. when aging the SPTE), this shouldn't be done. mmu_spte_update_no_track() is introduced for use in such cases. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 40 +++++++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index cfef95969335..b8b5259c8ebb 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -528,27 +528,19 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte) __set_spte(sptep, new_spte); } -/* Rules for using mmu_spte_update: - * Update the state bits, it means the mapped pfn is not changed. - * - * Whenever we overwrite a writable spte with a read-only one we - * should flush remote TLBs. Otherwise rmap_write_protect - * will find a read-only spte, even though the writable spte - * might be cached on a CPU's TLB, the return value indicates this - * case. - * - * Returns true if the TLB needs to be flushed +/* + * Update the SPTE (excluding the PFN), but do not track changes in its + * accessed/dirty status. 
*/ -static bool mmu_spte_update(u64 *sptep, u64 new_spte) +static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte) { u64 old_spte = *sptep; - bool flush = false; WARN_ON(!is_shadow_present_pte(new_spte)); if (!is_shadow_present_pte(old_spte)) { mmu_spte_set(sptep, new_spte); - return flush; + return old_spte; } if (!spte_has_volatile_bits(old_spte)) @@ -558,6 +550,28 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte)); + return old_spte; +} + +/* Rules for using mmu_spte_update: + * Update the state bits, it means the mapped pfn is not changed. + * + * Whenever we overwrite a writable spte with a read-only one we + * should flush remote TLBs. Otherwise rmap_write_protect + * will find a read-only spte, even though the writable spte + * might be cached on a CPU's TLB, the return value indicates this + * case. + * + * Returns true if the TLB needs to be flushed + */ +static bool mmu_spte_update(u64 *sptep, u64 new_spte) +{ + bool flush = false; + u64 old_spte = mmu_spte_update_no_track(sptep, new_spte); + + if (!is_shadow_present_pte(old_spte)) + return false; + /* * For the spte updated out of mmu-lock is safe, since * we always atomically update it, see the comments in -- cgit From 37f0e8fe6b10ee2ab52576caa721ee1282de74a6 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Tue, 6 Dec 2016 16:46:15 -0800 Subject: kvm: x86: mmu: Do not use bit 63 for tracking special SPTEs MMIO SPTEs currently set both bits 62 and 63 to distinguish them as special PTEs. However, bit 63 is used as the SVE bit in Intel EPT PTEs. The SVE bit is ignored for misconfigured PTEs but not necessarily for not-Present PTEs. Since MMIO SPTEs use an EPT misconfiguration, so using bit 63 for them is acceptable. However, the upcoming fast access tracking feature adds another type of special tracking PTE, which uses not-Present PTEs and hence should not set bit 63. In order to use common bits to distinguish both type of special PTEs, we now use only bit 62 as the special bit. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 7 +++++++ arch/x86/include/asm/vmx.h | 9 +++++++-- arch/x86/kvm/vmx.c | 6 +++--- 3 files changed, 17 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7e594a325158..3272a5e4aaad 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -208,6 +208,13 @@ enum { PFERR_WRITE_MASK | \ PFERR_PRESENT_MASK) +/* + * The mask used to denote special SPTEs, which can be either MMIO SPTEs or + * Access Tracking SPTEs. We use bit 62 instead of bit 63 to avoid conflicting + * with the SVE bit in EPT PTEs. 
+ */ +#define SPTE_SPECIAL_MASK (1ULL << 62) + /* apic attention bits */ #define KVM_APIC_CHECK_VAPIC 0 /* diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 25a482fb5241..fc061cbb46e0 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -467,8 +467,13 @@ enum vmcs_field { #define VMX_EPT_WRITABLE_MASK 0x2ull #define VMX_EPT_EXECUTABLE_MASK 0x4ull #define VMX_EPT_IPAT_BIT (1ull << 6) -#define VMX_EPT_ACCESS_BIT (1ull << 8) -#define VMX_EPT_DIRTY_BIT (1ull << 9) +#define VMX_EPT_ACCESS_BIT (1ull << 8) +#define VMX_EPT_DIRTY_BIT (1ull << 9) + +/* The mask to use to trigger an EPT Misconfiguration in order to track MMIO */ +#define VMX_EPT_MISCONFIG_WX_VALUE (VMX_EPT_WRITABLE_MASK | \ + VMX_EPT_EXECUTABLE_MASK) + #define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 81159a3878f4..6f53dedd9b96 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5236,10 +5236,10 @@ static void ept_set_mmio_spte_mask(void) /* * EPT Misconfigurations can be generated if the value of bits 2:0 * of an EPT paging-structure entry is 110b (write/execute). - * Also, magic bits (0x3ull << 62) is set to quickly identify mmio - * spte. + * Also, special bit (62) is set to quickly identify mmio spte. */ - kvm_mmu_set_mmio_spte_mask((0x3ull << 62) | 0x6ull); + kvm_mmu_set_mmio_spte_mask(SPTE_SPECIAL_MASK | + VMX_EPT_MISCONFIG_WX_VALUE); } #define VMX_XSS_EXIT_BITMAP 0 -- cgit From f160c7b7bb322bf079a5bb4dd34c58f17553f193 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Tue, 6 Dec 2016 16:46:16 -0800 Subject: kvm: x86: mmu: Lockless access tracking for Intel CPUs without EPT A bits. This change implements lockless access tracking for Intel CPUs without EPT A bits. This is achieved by marking the PTEs as not-present (but not completely clearing them) when clear_flush_young() is called after marking the pages as accessed. When an EPT Violation is generated as a result of the VM accessing those pages, the PTEs are restored to their original values. 
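A self-contained model of that mark/restore cycle may help before reading the patch. This is a sketch under simplified assumptions: the constant names and the saved-bits shift of 52 are placeholders standing in for the shadow_acc_track_* masks and PT64_SECOND_AVAIL_BITS_SHIFT used below, and the W bit is deliberately not saved, since write access is restored through the dirty-logging fast path.

#include <stdint.h>

#define EPT_R             (1ULL << 0)
#define EPT_W             (1ULL << 1)
#define EPT_X             (1ULL << 2)
#define SPTE_SPECIAL      (1ULL << 62)  /* "special"/tracked marker, as in the patch */
#define SAVED_BITS        (EPT_R | EPT_X)
#define SAVED_BITS_SHIFT  52            /* some ignored high SPTE bits (placeholder) */

/* Park R/X in ignored bits and make the entry non-present (assumes it is not
 * already marked). The guest's next access will take an EPT violation. */
static uint64_t mark_for_access_track(uint64_t spte)
{
        spte |= (spte & SAVED_BITS) << SAVED_BITS_SHIFT;
        spte &= ~(EPT_R | EPT_W | EPT_X);
        return spte | SPTE_SPECIAL;
}

/* Undo the marking on the resulting EPT violation; W is re-granted separately
 * by the write-protection/dirty-logging path when a write actually occurs. */
static uint64_t restore_from_access_track(uint64_t spte)
{
        uint64_t saved = (spte >> SAVED_BITS_SHIFT) & SAVED_BITS;

        spte &= ~(SPTE_SPECIAL | (SAVED_BITS << SAVED_BITS_SHIFT));
        return spte | saved;
}

On the next EPT violation for such an entry, the fast page fault path in the patch recognizes the special marking, restores the saved permissions with a lockless cmpxchg, and the page is treated as accessed again.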
Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 +- arch/x86/include/asm/vmx.h | 9 +- arch/x86/kvm/mmu.c | 279 ++++++++++++++++++++++++++++++---------- arch/x86/kvm/vmx.c | 26 ++-- arch/x86/kvm/x86.c | 2 +- 5 files changed, 239 insertions(+), 80 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3272a5e4aaad..99a71d90b6ae 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1064,7 +1064,8 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu); void kvm_mmu_init_vm(struct kvm *kvm); void kvm_mmu_uninit_vm(struct kvm *kvm); void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask); + u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, + u64 acc_track_mask); void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index fc061cbb46e0..a22a4790f1ac 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -469,11 +469,14 @@ enum vmcs_field { #define VMX_EPT_IPAT_BIT (1ull << 6) #define VMX_EPT_ACCESS_BIT (1ull << 8) #define VMX_EPT_DIRTY_BIT (1ull << 9) +#define VMX_EPT_RWX_MASK (VMX_EPT_READABLE_MASK | \ + VMX_EPT_WRITABLE_MASK | \ + VMX_EPT_EXECUTABLE_MASK) +#define VMX_EPT_MT_MASK (7ull << VMX_EPT_MT_EPTE_SHIFT) /* The mask to use to trigger an EPT Misconfiguration in order to track MMIO */ -#define VMX_EPT_MISCONFIG_WX_VALUE (VMX_EPT_WRITABLE_MASK | \ - VMX_EPT_EXECUTABLE_MASK) - +#define VMX_EPT_MISCONFIG_WX_VALUE (VMX_EPT_WRITABLE_MASK | \ + VMX_EPT_EXECUTABLE_MASK) #define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b8b5259c8ebb..64821ca3a7c3 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -130,6 +131,10 @@ module_param(dbg, bool, 0644); #define ACC_USER_MASK PT_USER_MASK #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) +/* The mask for the R/X bits in EPT PTEs */ +#define PT64_EPT_READABLE_MASK 0x1ull +#define PT64_EPT_EXECUTABLE_MASK 0x4ull + #include #define CREATE_TRACE_POINTS @@ -179,6 +184,25 @@ static u64 __read_mostly shadow_dirty_mask; static u64 __read_mostly shadow_mmio_mask; static u64 __read_mostly shadow_present_mask; +/* + * The mask/value to distinguish a PTE that has been marked not-present for + * access tracking purposes. + * The mask would be either 0 if access tracking is disabled, or + * SPTE_SPECIAL_MASK|VMX_EPT_RWX_MASK if access tracking is enabled. + */ +static u64 __read_mostly shadow_acc_track_mask; +static const u64 shadow_acc_track_value = SPTE_SPECIAL_MASK; + +/* + * The mask/shift to use for saving the original R/X bits when marking the PTE + * as not-present for access tracking purposes. We do not save the W bit as the + * PTEs being access tracked also need to be dirty tracked, so the W bit will be + * restored only when a write is attempted to the page. 
+ */ +static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK | + PT64_EPT_EXECUTABLE_MASK; +static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT; + static void mmu_spte_set(u64 *sptep, u64 spte); static void mmu_free_roots(struct kvm_vcpu *vcpu); @@ -188,6 +212,12 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) } EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); +static inline bool is_access_track_spte(u64 spte) +{ + /* Always false if shadow_acc_track_mask is zero. */ + return (spte & shadow_acc_track_mask) == shadow_acc_track_value; +} + /* * the low bit of the generation number is always presumed to be zero. * This disables mmio caching during memslot updates. The concept is @@ -285,7 +315,8 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) } void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask) + u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, + u64 acc_track_mask) { shadow_user_mask = user_mask; shadow_accessed_mask = accessed_mask; @@ -293,9 +324,23 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, shadow_nx_mask = nx_mask; shadow_x_mask = x_mask; shadow_present_mask = p_mask; + shadow_acc_track_mask = acc_track_mask; + WARN_ON(shadow_accessed_mask != 0 && shadow_acc_track_mask != 0); } EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); +void kvm_mmu_clear_all_pte_masks(void) +{ + shadow_user_mask = 0; + shadow_accessed_mask = 0; + shadow_dirty_mask = 0; + shadow_nx_mask = 0; + shadow_x_mask = 0; + shadow_mmio_mask = 0; + shadow_present_mask = 0; + shadow_acc_track_mask = 0; +} + static int is_cpuid_PSE36(void) { return 1; @@ -308,7 +353,7 @@ static int is_nx(struct kvm_vcpu *vcpu) static int is_shadow_present_pte(u64 pte) { - return (pte & 0xFFFFFFFFull) && !is_mmio_spte(pte); + return (pte != 0) && !is_mmio_spte(pte); } static int is_large_pte(u64 pte) @@ -482,32 +527,32 @@ static bool spte_can_locklessly_be_made_writable(u64 spte) static bool spte_has_volatile_bits(u64 spte) { + if (!is_shadow_present_pte(spte)) + return false; + /* * Always atomically update spte if it can be updated * out of mmu-lock, it can ensure dirty bit is not lost, * also, it can help us to get a stable is_writable_pte() * to ensure tlb flush is not missed. */ - if (spte_can_locklessly_be_made_writable(spte)) + if (spte_can_locklessly_be_made_writable(spte) || + is_access_track_spte(spte)) return true; - if (!shadow_accessed_mask) - return false; - - if (!is_shadow_present_pte(spte)) - return false; - - if ((spte & shadow_accessed_mask) && - (!is_writable_pte(spte) || (spte & shadow_dirty_mask))) - return false; + if (shadow_accessed_mask) { + if ((spte & shadow_accessed_mask) == 0 || + (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0)) + return true; + } - return true; + return false; } static bool is_accessed_spte(u64 spte) { return shadow_accessed_mask ? spte & shadow_accessed_mask - : true; + : !is_access_track_spte(spte); } static bool is_dirty_spte(u64 spte) @@ -651,6 +696,61 @@ static u64 mmu_spte_get_lockless(u64 *sptep) return __get_spte_lockless(sptep); } +static u64 mark_spte_for_access_track(u64 spte) +{ + if (shadow_accessed_mask != 0) + return spte & ~shadow_accessed_mask; + + if (shadow_acc_track_mask == 0 || is_access_track_spte(spte)) + return spte; + + /* + * Verify that the write-protection that we do below will be fixable + * via the fast page fault path. 
Currently, that is always the case, at + * least when using EPT (which is when access tracking would be used). + */ + WARN_ONCE((spte & PT_WRITABLE_MASK) && + !spte_can_locklessly_be_made_writable(spte), + "kvm: Writable SPTE is not locklessly dirty-trackable\n"); + + WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask << + shadow_acc_track_saved_bits_shift), + "kvm: Access Tracking saved bit locations are not zero\n"); + + spte |= (spte & shadow_acc_track_saved_bits_mask) << + shadow_acc_track_saved_bits_shift; + spte &= ~shadow_acc_track_mask; + spte |= shadow_acc_track_value; + + return spte; +} + +/* Returns the Accessed status of the PTE and resets it at the same time. */ +static bool mmu_spte_age(u64 *sptep) +{ + u64 spte = mmu_spte_get_lockless(sptep); + + if (!is_accessed_spte(spte)) + return false; + + if (shadow_accessed_mask) { + clear_bit((ffs(shadow_accessed_mask) - 1), + (unsigned long *)sptep); + } else { + /* + * Capture the dirty status of the page, so that it doesn't get + * lost when the SPTE is marked for access tracking. + */ + if (is_writable_pte(spte)) + kvm_set_pfn_dirty(spte_to_pfn(spte)); + + spte = mark_spte_for_access_track(spte); + mmu_spte_update_no_track(sptep, spte); + } + + return true; +} + static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu) { /* @@ -1435,7 +1535,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, restart: for_each_rmap_spte(rmap_head, &iter, sptep) { rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n", - sptep, *sptep, gfn, level); + sptep, *sptep, gfn, level); need_flush = 1; @@ -1448,7 +1548,8 @@ restart: new_spte &= ~PT_WRITABLE_MASK; new_spte &= ~SPTE_HOST_WRITEABLE; - new_spte &= ~shadow_accessed_mask; + + new_spte = mark_spte_for_access_track(new_spte); mmu_spte_clear_track_bits(sptep); mmu_spte_set(sptep, new_spte); @@ -1610,15 +1711,8 @@ static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct rmap_iterator uninitialized_var(iter); int young = 0; - BUG_ON(!shadow_accessed_mask); - - for_each_rmap_spte(rmap_head, &iter, sptep) { - if (*sptep & shadow_accessed_mask) { - young = 1; - clear_bit((ffs(shadow_accessed_mask) - 1), - (unsigned long *)sptep); - } - } + for_each_rmap_spte(rmap_head, &iter, sptep) + young |= mmu_spte_age(sptep); trace_kvm_age_page(gfn, level, slot, young); return young; @@ -1632,11 +1726,11 @@ static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct rmap_iterator iter; /* - * If there's no access bit in the secondary pte set by the - * hardware it's up to gup-fast/gup to set the access bit in - * the primary pte or in the page structure. + * If there's no access bit in the secondary pte set by the hardware and + * fast access tracking is also not enabled, it's up to gup-fast/gup to + * set the access bit in the primary pte or in the page structure. */ - if (!shadow_accessed_mask) + if (!shadow_accessed_mask && !shadow_acc_track_mask) goto out; for_each_rmap_spte(rmap_head, &iter, sptep) @@ -1671,7 +1765,7 @@ int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) * This has some overhead, but not as much as the cost of swapping * out actively used pages or breaking up actively used hugepages. 
*/ - if (!shadow_accessed_mask) + if (!shadow_accessed_mask && !shadow_acc_track_mask) return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp); @@ -2603,6 +2697,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, spte |= shadow_dirty_mask; } + if (speculative) + spte = mark_spte_for_access_track(spte); + set_pte: if (mmu_spte_update(sptep, spte)) kvm_flush_remote_tlbs(vcpu->kvm); @@ -2656,7 +2753,7 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, pgprintk("%s: setting spte %llx\n", __func__, *sptep); pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n", is_large_pte(*sptep)? "2MB" : "4kB", - *sptep & PT_PRESENT_MASK ?"RW":"R", gfn, + *sptep & PT_WRITABLE_MASK ? "RW" : "R", gfn, *sptep, sptep); if (!was_rmapped && is_large_pte(*sptep)) ++vcpu->kvm->stat.lpages; @@ -2889,16 +2986,28 @@ static bool page_fault_can_be_fast(u32 error_code) if (unlikely(error_code & PFERR_RSVD_MASK)) return false; + /* See if the page fault is due to an NX violation */ + if (unlikely(((error_code & (PFERR_FETCH_MASK | PFERR_PRESENT_MASK)) + == (PFERR_FETCH_MASK | PFERR_PRESENT_MASK)))) + return false; + /* - * #PF can be fast only if the shadow page table is present and it - * is caused by write-protect, that means we just need change the - * W bit of the spte which can be done out of mmu-lock. + * #PF can be fast if: + * 1. The shadow page table entry is not present, which could mean that + * the fault is potentially caused by access tracking (if enabled). + * 2. The shadow page table entry is present and the fault + * is caused by write-protect, that means we just need change the W + * bit of the spte which can be done out of mmu-lock. + * + * However, if access tracking is disabled we know that a non-present + * page must be a genuine page fault where we have to create a new SPTE. + * So, if access tracking is disabled, we return true only for write + * accesses to a present page. */ - if (!(error_code & PFERR_PRESENT_MASK) || - !(error_code & PFERR_WRITE_MASK)) - return false; - return true; + return shadow_acc_track_mask != 0 || + ((error_code & (PFERR_WRITE_MASK | PFERR_PRESENT_MASK)) + == (PFERR_WRITE_MASK | PFERR_PRESENT_MASK)); } /* @@ -2907,17 +3016,26 @@ static bool page_fault_can_be_fast(u32 error_code) */ static bool fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - u64 *sptep, u64 spte) + u64 *sptep, u64 old_spte, + bool remove_write_prot, bool remove_acc_track) { gfn_t gfn; + u64 new_spte = old_spte; WARN_ON(!sp->role.direct); - /* - * The gfn of direct spte is stable since it is calculated - * by sp->gfn. - */ - gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); + if (remove_acc_track) { + u64 saved_bits = (old_spte >> shadow_acc_track_saved_bits_shift) + & shadow_acc_track_saved_bits_mask; + + new_spte &= ~shadow_acc_track_mask; + new_spte &= ~(shadow_acc_track_saved_bits_mask << + shadow_acc_track_saved_bits_shift); + new_spte |= saved_bits; + } + + if (remove_write_prot) + new_spte |= PT_WRITABLE_MASK; /* * Theoretically we could also set dirty bit (and flush TLB) here in @@ -2931,10 +3049,17 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, * * Compare with set_spte where instead shadow_dirty_mask is set. */ - if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) != spte) + if (cmpxchg64(sptep, old_spte, new_spte) != old_spte) return false; - kvm_vcpu_mark_page_dirty(vcpu, gfn); + if (remove_write_prot) { + /* + * The gfn of direct spte is stable since it is + * calculated by sp->gfn. 
+ */ + gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); + kvm_vcpu_mark_page_dirty(vcpu, gfn); + } return true; } @@ -2965,35 +3090,55 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, break; do { - /* - * If the mapping has been changed, let the vcpu fault on the - * same address again. - */ - if (!is_shadow_present_pte(spte)) { - fault_handled = true; - break; - } + bool remove_write_prot = false; + bool remove_acc_track; sp = page_header(__pa(iterator.sptep)); if (!is_last_spte(spte, sp->role.level)) break; /* - * Check if it is a spurious fault caused by TLB lazily flushed. + * Check whether the memory access that caused the fault would + * still cause it if it were to be performed right now. If not, + * then this is a spurious fault caused by TLB lazily flushed, + * or some other CPU has already fixed the PTE after the + * current CPU took the fault. * * Need not check the access of upper level table entries since * they are always ACC_ALL. */ - if (is_writable_pte(spte)) { - fault_handled = true; - break; + + if (error_code & PFERR_FETCH_MASK) { + if ((spte & (shadow_x_mask | shadow_nx_mask)) + == shadow_x_mask) { + fault_handled = true; + break; + } + } else if (error_code & PFERR_WRITE_MASK) { + if (is_writable_pte(spte)) { + fault_handled = true; + break; + } + + /* + * Currently, to simplify the code, write-protection can + * be removed in the fast path only if the SPTE was + * write-protected for dirty-logging. + */ + remove_write_prot = + spte_can_locklessly_be_made_writable(spte); + } else { + /* Fault was on Read access */ + if (spte & PT_PRESENT_MASK) { + fault_handled = true; + break; + } } - /* - * Currently, to simplify the code, only the spte - * write-protected by dirty-log can be fast fixed. - */ - if (!spte_can_locklessly_be_made_writable(spte)) + remove_acc_track = is_access_track_spte(spte); + + /* Verify that the fault can be handled in the fast path */ + if (!remove_acc_track && !remove_write_prot) break; /* @@ -3007,7 +3152,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, * * See the comments in kvm_arch_commit_memory_region(). */ - if (sp->role.level > PT_PAGE_TABLE_LEVEL) + if (sp->role.level > PT_PAGE_TABLE_LEVEL && remove_write_prot) break; /* @@ -3016,7 +3161,9 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, * Documentation/virtual/kvm/locking.txt to get more detail. */ fault_handled = fast_pf_fix_direct_spte(vcpu, sp, - iterator.sptep, spte); + iterator.sptep, spte, + remove_write_prot, + remove_acc_track); if (fault_handled) break; @@ -5105,6 +5252,8 @@ static void mmu_destroy_caches(void) int kvm_mmu_module_init(void) { + kvm_mmu_clear_all_pte_masks(); + pte_list_desc_cache = kmem_cache_create("pte_list_desc", sizeof(struct pte_list_desc), 0, 0, NULL); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6f53dedd9b96..d2fe3a51876c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6578,6 +6578,19 @@ static void wakeup_handler(void) spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); } +void vmx_enable_tdp(void) +{ + kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK, + enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull, + enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull, + 0ull, VMX_EPT_EXECUTABLE_MASK, + cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK, + enable_ept_ad_bits ? 
0ull : SPTE_SPECIAL_MASK | VMX_EPT_RWX_MASK); + + ept_set_mmio_spte_mask(); + kvm_enable_tdp(); +} + static __init int hardware_setup(void) { int r = -ENOMEM, i, msr; @@ -6703,16 +6716,9 @@ static __init int hardware_setup(void) /* SELF-IPI */ vmx_disable_intercept_msr_x2apic(0x83f, MSR_TYPE_W, true); - if (enable_ept) { - kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK, - (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull, - (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull, - 0ull, VMX_EPT_EXECUTABLE_MASK, - cpu_has_vmx_ept_execute_only() ? - 0ull : VMX_EPT_READABLE_MASK); - ept_set_mmio_spte_mask(); - kvm_enable_tdp(); - } else + if (enable_ept) + vmx_enable_tdp(); + else kvm_disable_tdp(); update_ple_window_actual_max(); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4aece8b0a4aa..c3ee5e29ea2a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6025,7 +6025,7 @@ int kvm_arch_init(void *opaque) kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, PT_DIRTY_MASK, PT64_NX_MASK, 0, - PT_PRESENT_MASK); + PT_PRESENT_MASK, 0); kvm_timer_init(); perf_register_guest_info_callbacks(&kvm_guest_cbs); -- cgit From f98a3efb284a7950745d6c95be489193e6d4c657 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Thu, 15 Dec 2016 18:06:45 +0100 Subject: KVM: x86: use delivery to self in hyperv synic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Interrupt to self can be sent without knowing the APIC ID. Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 1572c35b4f1a..08b27e0c7b71 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -305,13 +305,13 @@ static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint) return -ENOENT; memset(&irq, 0, sizeof(irq)); - irq.dest_id = kvm_apic_id(vcpu->arch.apic); + irq.shorthand = APIC_DEST_SELF; irq.dest_mode = APIC_DEST_PHYSICAL; irq.delivery_mode = APIC_DM_FIXED; irq.vector = vector; irq.level = 1; - ret = kvm_irq_delivery_to_apic(vcpu->kvm, NULL, &irq, NULL); + ret = kvm_irq_delivery_to_apic(vcpu->kvm, vcpu->arch.apic, &irq, NULL); trace_kvm_hv_synic_set_irq(vcpu->vcpu_id, sint, irq.vector, ret); return ret; } -- cgit From 6e50043912d9c9c119e3c9c5378869d019df70a9 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Thu, 15 Dec 2016 18:06:46 +0100 Subject: KVM: x86: replace kvm_apic_id with kvm_{x,x2}apic_id MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There were three calls sites: - recalculate_apic_map and kvm_apic_match_physical_addr, where it would only complicate implementation of x2APIC hotplug; - in apic_debug, where it was still somewhat preserved, but keeping the old function just for apic_debug was not worth it Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 31 ++++++++++++++++++++++--------- arch/x86/kvm/lapic.h | 11 ----------- 2 files changed, 22 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 5fe290c1b7d8..7c142f0fe9fd 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -115,6 +115,16 @@ static inline int apic_enabled(struct kvm_lapic *apic) (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \ APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER) +static inline u8 kvm_xapic_id(struct kvm_lapic 
*apic) +{ + return kvm_lapic_get_reg(apic, APIC_ID) >> 24; +} + +static inline u32 kvm_x2apic_id(struct kvm_lapic *apic) +{ + return apic->vcpu->vcpu_id; +} + static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map, u32 dest_id, struct kvm_lapic ***cluster, u16 *mask) { switch (map->mode) { @@ -159,13 +169,13 @@ static void recalculate_apic_map(struct kvm *kvm) struct kvm_apic_map *new, *old = NULL; struct kvm_vcpu *vcpu; int i; - u32 max_id = 255; + u32 max_id = 255; /* enough space for any xAPIC ID */ mutex_lock(&kvm->arch.apic_map_lock); kvm_for_each_vcpu(i, vcpu, kvm) if (kvm_apic_present(vcpu)) - max_id = max(max_id, kvm_apic_id(vcpu->arch.apic)); + max_id = max(max_id, kvm_x2apic_id(vcpu->arch.apic)); new = kvm_kvzalloc(sizeof(struct kvm_apic_map) + sizeof(struct kvm_lapic *) * ((u64)max_id + 1)); @@ -184,12 +194,13 @@ static void recalculate_apic_map(struct kvm *kvm) if (!kvm_apic_present(vcpu)) continue; - aid = kvm_apic_id(apic); - ldr = kvm_lapic_get_reg(apic, APIC_LDR); - + aid = apic_x2apic_mode(apic) ? kvm_x2apic_id(apic) + : kvm_xapic_id(apic); if (aid <= new->max_apic_id) new->phys_map[aid] = apic; + ldr = kvm_lapic_get_reg(apic, APIC_LDR); + if (apic_x2apic_mode(apic)) { new->mode |= KVM_APIC_MODE_X2APIC; } else if (ldr) { @@ -250,6 +261,8 @@ static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id) { u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf)); + WARN_ON_ONCE(id != apic->vcpu->vcpu_id); + kvm_lapic_set_reg(apic, APIC_ID, id); kvm_lapic_set_reg(apic, APIC_LDR, ldr); recalculate_apic_map(apic->vcpu->kvm); @@ -591,9 +604,9 @@ static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda) return true; if (apic_x2apic_mode(apic)) - return mda == kvm_apic_id(apic); + return mda == kvm_x2apic_id(apic); - return mda == SET_APIC_DEST_FIELD(kvm_apic_id(apic)); + return mda == SET_APIC_DEST_FIELD(kvm_xapic_id(apic)); } static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) @@ -1907,9 +1920,9 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.apic_arb_prio = 0; vcpu->arch.apic_attention = 0; - apic_debug("%s: vcpu=%p, id=%d, base_msr=" + apic_debug("%s: vcpu=%p, id=0x%x, base_msr=" "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__, - vcpu, kvm_apic_id(apic), + vcpu, kvm_lapic_get_reg(apic, APIC_ID), vcpu->arch.apic_base, apic->base_address); } diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index e0c80233b3e1..cb16e6fd2330 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -202,17 +202,6 @@ static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu) return lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); } -static inline u32 kvm_apic_id(struct kvm_lapic *apic) -{ - /* To avoid a race between apic_base and following APIC_ID update when - * switching to x2apic_mode, the x2apic mode returns initial x2apic id. - */ - if (apic_x2apic_mode(apic)) - return apic->vcpu->vcpu_id; - - return kvm_lapic_get_reg(apic, APIC_ID) >> 24; -} - bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); void wait_lapic_expire(struct kvm_vcpu *vcpu); -- cgit From b4535b58ae0df8b7cf0fe92a0c23aa3cf862e3cc Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Thu, 15 Dec 2016 18:06:47 +0100 Subject: KVM: x86: make interrupt delivery fast and slow path behave the same MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Slow path tried to prevent IPIs from x2APIC VCPUs from being delivered to xAPIC VCPUs and vice-versa. 
Make slow path behave like fast path, which never distinguished that. Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 7c142f0fe9fd..3ebef53d20a0 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -592,10 +592,8 @@ static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) static bool kvm_apic_broadcast(struct kvm_lapic *apic, u32 mda) { - if (apic_x2apic_mode(apic)) - return mda == X2APIC_BROADCAST; - - return GET_APIC_DEST_FIELD(mda) == APIC_BROADCAST; + return mda == (apic_x2apic_mode(apic) ? + X2APIC_BROADCAST : APIC_BROADCAST); } static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda) @@ -606,7 +604,7 @@ static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda) if (apic_x2apic_mode(apic)) return mda == kvm_x2apic_id(apic); - return mda == SET_APIC_DEST_FIELD(kvm_xapic_id(apic)); + return mda == kvm_xapic_id(apic); } static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) @@ -623,7 +621,6 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) && (logical_id & mda & 0xffff) != 0; logical_id = GET_APIC_LOGICAL_ID(logical_id); - mda = GET_APIC_DEST_FIELD(mda); switch (kvm_lapic_get_reg(apic, APIC_DFR)) { case APIC_DFR_FLAT: @@ -640,9 +637,9 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) /* The KVM local APIC implementation has two quirks: * - * - the xAPIC MDA stores the destination at bits 24-31, while this - * is not true of struct kvm_lapic_irq's dest_id field. This is - * just a quirk in the API and is not problematic. + * - Real hardware delivers interrupts destined to x2APIC ID > 0xff to LAPICs + * in xAPIC mode if the "destination & 0xff" matches its xAPIC ID. + * KVM doesn't do that aliasing. * * - in-kernel IOAPIC messages have to be delivered directly to * x2APIC, because the kernel does not support interrupt remapping. @@ -658,13 +655,12 @@ static u32 kvm_apic_mda(struct kvm_vcpu *vcpu, unsigned int dest_id, struct kvm_lapic *source, struct kvm_lapic *target) { bool ipi = source != NULL; - bool x2apic_mda = apic_x2apic_mode(ipi ? source : target); if (!vcpu->kvm->arch.x2apic_broadcast_quirk_disabled && - !ipi && dest_id == APIC_BROADCAST && x2apic_mda) + !ipi && dest_id == APIC_BROADCAST && apic_x2apic_mode(target)) return X2APIC_BROADCAST; - return x2apic_mda ? dest_id : SET_APIC_DEST_FIELD(dest_id); + return dest_id; } bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, -- cgit From 5bd5db385b3e13c702365574c0b7350c6ea45e84 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Thu, 15 Dec 2016 18:06:48 +0100 Subject: KVM: x86: allow hotplug of VCPU with APIC ID over 0xff MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LAPIC after reset is in xAPIC mode, which poses a problem for hotplug of VCPUs with high APIC ID, because reset VCPU is waiting for INIT/SIPI, but there is no way to uniquely address it using xAPIC. From many possible options, we chose the one that also works on real hardware: accepting interrupts addressed to LAPIC's x2APIC ID even in xAPIC mode. 
KVM intentionally differs from real hardware, because real hardware (Knights Landing) does just "x2apic_id & 0xff" to decide whether to accept the interrupt in xAPIC mode and it can deliver one interrupt to more than one physical destination, e.g. 0x123 to 0x123 and 0x23. Fixes: 682f732ecf73 ("KVM: x86: bump MAX_VCPUS to 288") Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 3ebef53d20a0..7e9ac4606279 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -189,15 +189,26 @@ static void recalculate_apic_map(struct kvm *kvm) struct kvm_lapic *apic = vcpu->arch.apic; struct kvm_lapic **cluster; u16 mask; - u32 ldr, aid; + u32 ldr; + u8 xapic_id; + u32 x2apic_id; if (!kvm_apic_present(vcpu)) continue; - aid = apic_x2apic_mode(apic) ? kvm_x2apic_id(apic) - : kvm_xapic_id(apic); - if (aid <= new->max_apic_id) - new->phys_map[aid] = apic; + xapic_id = kvm_xapic_id(apic); + x2apic_id = kvm_x2apic_id(apic); + + /* Hotplug hack: see kvm_apic_match_physical_addr(), ... */ + if ((apic_x2apic_mode(apic) || x2apic_id > 0xff) && + x2apic_id <= new->max_apic_id) + new->phys_map[x2apic_id] = apic; + /* + * ... xAPIC ID of VCPUs with APIC ID > 0xff will wrap-around, + * prevent them from masking VCPUs with APIC ID <= 0xff. + */ + if (!apic_x2apic_mode(apic) && !new->phys_map[xapic_id]) + new->phys_map[xapic_id] = apic; ldr = kvm_lapic_get_reg(apic, APIC_LDR); @@ -604,6 +615,15 @@ static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda) if (apic_x2apic_mode(apic)) return mda == kvm_x2apic_id(apic); + /* + * Hotplug hack: Make LAPIC in xAPIC mode also accept interrupts as if + * it were in x2APIC mode. Hotplugged VCPUs start in xAPIC mode and + * this allows unique addressing of VCPUs with APIC ID over 0xff. + * The 0xff condition is needed because writeable xAPIC ID. + */ + if (kvm_x2apic_id(apic) > 0xff && mda == kvm_x2apic_id(apic)) + return true; + return mda == kvm_xapic_id(apic); } -- cgit From 0f89b207b04a1a399e19d35293658e3a571da3d7 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Wed, 14 Dec 2016 14:59:23 -0500 Subject: kvm: svm: Use the hardware provided GPA instead of page walk When a guest causes a NPF which requires emulation, KVM sometimes walks the guest page tables to translate the GVA to a GPA. This is unnecessary most of the time on AMD hardware since the hardware provides the GPA in EXITINFO2. The only exception cases involve string operations involving rep or operations that use two memory locations. With rep, the GPA will only be the value of the initial NPF and with dual memory locations we won't know which memory address was translated into EXITINFO2. 
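The rule described above can be summarized in a few lines. This is an illustrative restatement only; the patch below implements it as emulator_can_use_gpa() plus a new TwoMemOp decode flag, and the helper and parameter names here are invented.

#include <stdbool.h>

static bool hw_provided_gpa_is_usable(bool rep_prefixed_string_op,
                                      bool two_memory_operands)
{
        /* EXITINFO2 only reflects the GPA of the initial faulting iteration. */
        if (rep_prefixed_string_op)
                return false;

        /* With two memory operands we cannot tell which one was translated. */
        if (two_memory_operands)
                return false;

        return true;
}

When the check passes, the emulator can skip the guest page table walk and feed the EXITINFO2 GPA straight into the MMIO path, as the x86.c hunk below does.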
Signed-off-by: Tom Lendacky Reviewed-by: Borislav Petkov Signed-off-by: Brijesh Singh Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_emulate.h | 1 + arch/x86/include/asm/kvm_host.h | 3 +++ arch/x86/kvm/emulate.c | 20 +++++++++++++---- arch/x86/kvm/svm.c | 2 ++ arch/x86/kvm/x86.c | 45 +++++++++++++++++++++++++++++--------- 5 files changed, 57 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index e9cd7befcb76..3e8c287090e4 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -441,5 +441,6 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt, int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq); void emulator_invalidate_register_cache(struct x86_emulate_ctxt *ctxt); void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt); +bool emulator_can_use_gpa(struct x86_emulate_ctxt *ctxt); #endif /* _ASM_X86_KVM_X86_EMULATE_H */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 99a71d90b6ae..0419e114f27b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -675,6 +675,9 @@ struct kvm_vcpu_arch { int pending_ioapic_eoi; int pending_external_vector; + + /* GPA available (AMD only) */ + bool gpa_available; }; struct kvm_lpage_info { diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 56628a44668b..2b8349a2b14b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -173,6 +173,7 @@ #define NearBranch ((u64)1 << 52) /* Near branches */ #define No16 ((u64)1 << 53) /* No 16 bit operand */ #define IncSP ((u64)1 << 54) /* SP is incremented before ModRM calc */ +#define TwoMemOp ((u64)1 << 55) /* Instruction has two memory operand */ #define DstXacc (DstAccLo | SrcAccHi | SrcWrite) @@ -4256,7 +4257,7 @@ static const struct opcode group1[] = { }; static const struct opcode group1A[] = { - I(DstMem | SrcNone | Mov | Stack | IncSP, em_pop), N, N, N, N, N, N, N, + I(DstMem | SrcNone | Mov | Stack | IncSP | TwoMemOp, em_pop), N, N, N, N, N, N, N, }; static const struct opcode group2[] = { @@ -4294,7 +4295,7 @@ static const struct opcode group5[] = { I(SrcMemFAddr | ImplicitOps, em_call_far), I(SrcMem | NearBranch, em_jmp_abs), I(SrcMemFAddr | ImplicitOps, em_jmp_far), - I(SrcMem | Stack, em_push), D(Undefined), + I(SrcMem | Stack | TwoMemOp, em_push), D(Undefined), }; static const struct opcode group6[] = { @@ -4514,8 +4515,8 @@ static const struct opcode opcode_table[256] = { /* 0xA0 - 0xA7 */ I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov), - I2bv(SrcSI | DstDI | Mov | String, em_mov), - F2bv(SrcSI | DstDI | String | NoWrite, em_cmp_r), + I2bv(SrcSI | DstDI | Mov | String | TwoMemOp, em_mov), + F2bv(SrcSI | DstDI | String | NoWrite | TwoMemOp, em_cmp_r), /* 0xA8 - 0xAF */ F2bv(DstAcc | SrcImm | NoWrite, em_test), I2bv(SrcAcc | DstDI | Mov | String, em_mov), @@ -5629,3 +5630,14 @@ void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt) { writeback_registers(ctxt); } + +bool emulator_can_use_gpa(struct x86_emulate_ctxt *ctxt) +{ + if (ctxt->rep_prefix && (ctxt->d & String)) + return false; + + if (ctxt->d & TwoMemOp) + return false; + + return true; +} diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 08a4d3ab3455..d0414f054bdf 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4182,6 +4182,8 @@ static int handle_exit(struct kvm_vcpu *vcpu) trace_kvm_exit(exit_code, 
vcpu, KVM_ISA_SVM); + vcpu->arch.gpa_available = (exit_code == SVM_EXIT_NPF); + if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE)) vcpu->arch.cr0 = svm->vmcb->save.cr0; if (npt_enabled) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c3ee5e29ea2a..edff19d1df97 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4462,6 +4462,21 @@ out: } EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system); +static int vcpu_is_mmio_gpa(struct kvm_vcpu *vcpu, unsigned long gva, + gpa_t gpa, bool write) +{ + /* For APIC access vmexit */ + if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) + return 1; + + if (vcpu_match_mmio_gpa(vcpu, gpa)) { + trace_vcpu_match_mmio(gva, gpa, write, true); + return 1; + } + + return 0; +} + static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, gpa_t *gpa, struct x86_exception *exception, bool write) @@ -4488,16 +4503,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, if (*gpa == UNMAPPED_GVA) return -1; - /* For APIC access vmexit */ - if ((*gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) - return 1; - - if (vcpu_match_mmio_gpa(vcpu, *gpa)) { - trace_vcpu_match_mmio(gva, *gpa, write, true); - return 1; - } - - return 0; + return vcpu_is_mmio_gpa(vcpu, gva, *gpa, write); } int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, @@ -4594,6 +4600,22 @@ static int emulator_read_write_onepage(unsigned long addr, void *val, int handled, ret; bool write = ops->write; struct kvm_mmio_fragment *frag; + struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + + /* + * If the exit was due to a NPF we may already have a GPA. + * If the GPA is present, use it to avoid the GVA to GPA table walk. + * Note, this cannot be used on string operations since string + * operation using rep will only have the initial GPA from the NPF + * occurred. + */ + if (vcpu->arch.gpa_available && + emulator_can_use_gpa(ctxt) && + vcpu_is_mmio_gpa(vcpu, addr, exception->address, write) && + (addr & ~PAGE_MASK) == (exception->address & ~PAGE_MASK)) { + gpa = exception->address; + goto mmio; + } ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write); @@ -5610,6 +5632,9 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, } restart: + /* Save the faulting GPA (cr2) in the address field */ + ctxt->exception.address = cr2; + r = x86_emulate_insn(ctxt); if (r == EMULATION_INTERCEPTED) -- cgit From 0f1e261ead16ce09169bf2d223d4c8803576f85e Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sat, 17 Dec 2016 16:05:19 +0100 Subject: KVM: x86: add VCPU stat for KVM_REQ_EVENT processing This statistic can be useful to estimate the cost of an IRQ injection scenario, by comparing it with irq_injections. For example the stat shows that sti;hlt triggers more KVM_REQ_EVENT than sti;nop. 
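As a usage example of the comparison suggested above (hedged: this assumes the usual KVM debugfs layout, where each counter is exposed as a file under /sys/kernel/debug/kvm/; the small program below is not part of the patch):

#include <stdio.h>

static long read_kvm_stat(const char *name)
{
        char path[128];
        long val = -1;
        FILE *f;

        snprintf(path, sizeof(path), "/sys/kernel/debug/kvm/%s", name);
        f = fopen(path, "r");
        if (!f)
                return -1;
        if (fscanf(f, "%ld", &val) != 1)
                val = -1;
        fclose(f);
        return val;
}

int main(void)
{
        long req = read_kvm_stat("req_event");
        long inj = read_kvm_stat("irq_injections");

        if (req < 0 || inj <= 0)
                return 1;

        printf("KVM_REQ_EVENT per injected interrupt: %.2f\n",
               (double)req / inj);
        return 0;
}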
Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 2 ++ 2 files changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0419e114f27b..417502cf42b6 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -861,6 +861,7 @@ struct kvm_vcpu_stat { u64 hypercalls; u64 irq_injections; u64 nmi_injections; + u64 req_event; }; struct x86_instruction_info; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index edff19d1df97..b02af6285887 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -180,6 +180,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, { "irq_injections", VCPU_STAT(irq_injections) }, { "nmi_injections", VCPU_STAT(nmi_injections) }, + { "req_event", VCPU_STAT(req_event) }, { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, { "mmu_pte_write", VM_STAT(mmu_pte_write) }, { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, @@ -6756,6 +6757,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { + ++vcpu->stat.req_event; kvm_apic_accept_events(vcpu); if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { r = 1; -- cgit From eb90f3417a0cc4880e979ccc84e41890d410ea5b Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sun, 18 Dec 2016 14:02:21 +0100 Subject: KVM: vmx: speed up TPR below threshold vmexits Since we're already in VCPU context, all we have to do here is recompute the PPR value. That will in turn generate a KVM_REQ_EVENT if necessary. Reviewed-by: Roman Kagan Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 6 ++++++ arch/x86/kvm/lapic.h | 1 + arch/x86/kvm/vmx.c | 2 +- 3 files changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 7e9ac4606279..6b1d3a76c1d0 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -595,6 +595,12 @@ static void apic_update_ppr(struct kvm_lapic *apic) } } +void kvm_apic_update_ppr(struct kvm_vcpu *vcpu) +{ + apic_update_ppr(vcpu->arch.apic); +} +EXPORT_SYMBOL_GPL(kvm_apic_update_ppr); + static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) { kvm_lapic_set_reg(apic, APIC_TASKPRI, tpr); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index cb16e6fd2330..5b5b1ba644cb 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -73,6 +73,7 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, void __kvm_apic_update_irr(u32 *pir, void *regs); void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir); +void kvm_apic_update_ppr(struct kvm_vcpu *vcpu); int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, struct dest_map *dest_map); int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d2fe3a51876c..94fda2010f5f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6152,7 +6152,7 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu) static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) { - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_apic_update_ppr(vcpu); return 1; } -- cgit From b3c045d33218fe291b04d30e24b6eab0431987e6 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sun, 18 Dec 2016 21:47:54 +0100 Subject: KVM: lapic: remove unnecessary KVM_REQ_EVENT on PPR update PPR needs to be updated whenever on every IRR read because we may have missed TPR writes that 
_increased_ PPR. However, these writes need not generate KVM_REQ_EVENT, because either KVM_REQ_EVENT has been set already in __apic_accept_irq, or we are going to process the interrupt right away. Reviewed-by: Roman Kagan Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 6b1d3a76c1d0..a878e33119a3 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -570,7 +570,15 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu) __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); } -static void apic_update_ppr(struct kvm_lapic *apic) +static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr) +{ + int highest_irr = apic_find_highest_irr(apic); + if (highest_irr == -1 || (highest_irr & 0xF0) <= ppr) + return -1; + return highest_irr; +} + +static bool __apic_update_ppr(struct kvm_lapic *apic, u32 *new_ppr) { u32 tpr, isrv, ppr, old_ppr; int isr; @@ -588,11 +596,19 @@ static void apic_update_ppr(struct kvm_lapic *apic) apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x", apic, ppr, isr, isrv); - if (old_ppr != ppr) { + *new_ppr = ppr; + if (old_ppr != ppr) kvm_lapic_set_reg(apic, APIC_PROCPRI, ppr); - if (ppr < old_ppr) - kvm_make_request(KVM_REQ_EVENT, apic->vcpu); - } + + return ppr < old_ppr; +} + +static void apic_update_ppr(struct kvm_lapic *apic) +{ + u32 ppr; + + if (__apic_update_ppr(apic, &ppr)) + kvm_make_request(KVM_REQ_EVENT, apic->vcpu); } void kvm_apic_update_ppr(struct kvm_vcpu *vcpu) @@ -2056,17 +2072,13 @@ nomem: int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - int highest_irr; + u32 ppr; if (!apic_enabled(apic)) return -1; - apic_update_ppr(apic); - highest_irr = apic_find_highest_irr(apic); - if ((highest_irr == -1) || - ((highest_irr & 0xF0) <= kvm_lapic_get_reg(apic, APIC_PROCPRI))) - return -1; - return highest_irr; + __apic_update_ppr(apic, &ppr); + return apic_has_interrupt_for_ppr(apic, ppr); } int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) -- cgit From 26fbbee5815e9352187ac18f0aa53534f62567ff Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sun, 18 Dec 2016 13:54:58 +0100 Subject: KVM: lapic: do not set KVM_REQ_EVENT unnecessarily on PPR update On PPR update, we set KVM_REQ_EVENT unconditionally anytime PPR is lowered. But we can take into account IRR here already. Reviewed-by: Roman Kagan Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index a878e33119a3..457fb206647d 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -607,7 +607,8 @@ static void apic_update_ppr(struct kvm_lapic *apic) { u32 ppr; - if (__apic_update_ppr(apic, &ppr)) + if (__apic_update_ppr(apic, &ppr) && + apic_has_interrupt_for_ppr(apic, ppr) != -1) kvm_make_request(KVM_REQ_EVENT, apic->vcpu); } -- cgit From 4d82d12b39132e820b9ac4aa058ccc733db98917 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sun, 18 Dec 2016 21:43:41 +0100 Subject: KVM: lapic: do not scan IRR when delivering an interrupt On interrupt delivery the PPR can only grow (except for auto-EOI), so it is impossible that non-auto-EOI interrupt delivery results in KVM_REQ_EVENT. We can therefore use __apic_update_ppr. 
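To see why delivery cannot lower the PPR, here is a minimal model of the priority arithmetic (illustrative only; it restates the standard local-APIC rule that the PPR is the higher of the TPR and the in-service vector's priority class, and the function name model_ppr is made up for this sketch):

/* Illustrative model of the PPR rule, not kernel code. */
static unsigned int model_ppr(unsigned int tpr, unsigned int isrv)
{
	/* The priority class lives in bits 7:4 of both values. */
	if ((tpr & 0xf0) >= (isrv & 0xf0))
		return tpr & 0xff;	/* TPR dominates */
	return isrv & 0xf0;		/* highest in-service vector dominates */
}

/*
 * Delivering a vector sets it in the ISR, so the new ISRV is >= the old one
 * and model_ppr() can only stay the same or grow; that is why the
 * non-auto-EOI path only needs __apic_update_ppr() and no KVM_REQ_EVENT.
 */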
Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 457fb206647d..10a745faa659 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2115,6 +2115,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) { int vector = kvm_apic_has_interrupt(vcpu); struct kvm_lapic *apic = vcpu->arch.apic; + u32 ppr; if (vector == -1) return -1; @@ -2126,13 +2127,23 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) * because the process would deliver it through the IDT. */ - apic_set_isr(vector, apic); - apic_update_ppr(apic); apic_clear_irr(vector, apic); - if (test_bit(vector, vcpu_to_synic(vcpu)->auto_eoi_bitmap)) { - apic_clear_isr(vector, apic); + /* + * For auto-EOI interrupts, there might be another pending + * interrupt above PPR, so check whether to raise another + * KVM_REQ_EVENT. + */ apic_update_ppr(apic); + } else { + /* + * For normal interrupts, PPR has been raised and there cannot + * be a higher-priority pending interrupt---except if there was + * a concurrent interrupt injection, but that would have + * triggered KVM_REQ_EVENT already. + */ + apic_set_isr(vector, apic); + __apic_update_ppr(apic, &ppr); } return vector; -- cgit From 21e7fbe7db2a983c046a05f12419d88c554a0f5a Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Thu, 22 Dec 2016 15:49:55 -0800 Subject: kvm: nVMX: Reorder error checks for emulated VMXON MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Checks on the operand to VMXON are performed after the check for legacy mode operation and the #GP checks, according to the pseudo-code in Intel's SDM. Signed-off-by: Jim Mattson Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 94fda2010f5f..4e691035a32d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -7180,9 +7180,6 @@ static int handle_vmon(struct kvm_vcpu *vcpu) return 1; } - if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL)) - return 1; - if (vmx->nested.vmxon) { nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); return kvm_skip_emulated_instruction(vcpu); @@ -7194,6 +7191,9 @@ static int handle_vmon(struct kvm_vcpu *vcpu) return 1; } + if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL)) + return 1; + if (cpu_has_vmx_msr_bitmap()) { vmx->nested.msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); -- cgit From f1be6cdaf57ce918828b6cff6ff2b4ea87be7f62 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Sat, 7 Jan 2017 14:34:57 +0200 Subject: x86/platform/intel-mid: Make intel_scu_device_register() static There is no need anymore to have intel_scu_device_register() exported. Annotate it with static keyword. While here, rename to intel_scu_ipc_device_register() to use same pattern for all SFI enumerated device register helpers. 
Signed-off-by: Andy Shevchenko Link: http://lkml.kernel.org/r/20170107123457.53033-1-andriy.shevchenko@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/intel-mid.h | 1 - arch/x86/platform/intel-mid/sfi.c | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/intel-mid.h b/arch/x86/include/asm/intel-mid.h index 91ead0cefa76..fe04491130ae 100644 --- a/arch/x86/include/asm/intel-mid.h +++ b/arch/x86/include/asm/intel-mid.h @@ -27,7 +27,6 @@ extern void intel_mid_pwr_power_off(void); extern int intel_mid_pwr_get_lss_id(struct pci_dev *pdev); extern int get_gpio_by_name(const char *name); -extern void intel_scu_device_register(struct platform_device *pdev); extern int __init sfi_parse_mrtc(struct sfi_table_header *table); extern int __init sfi_parse_mtmr(struct sfi_table_header *table); extern int sfi_mrtc_num; diff --git a/arch/x86/platform/intel-mid/sfi.c b/arch/x86/platform/intel-mid/sfi.c index e8f68f652087..ce1303830231 100644 --- a/arch/x86/platform/intel-mid/sfi.c +++ b/arch/x86/platform/intel-mid/sfi.c @@ -226,7 +226,7 @@ int get_gpio_by_name(const char *name) return -EINVAL; } -void __init intel_scu_device_register(struct platform_device *pdev) +static void __init intel_scu_ipc_device_register(struct platform_device *pdev) { if (ipc_next_dev == MAX_IPCDEVS) pr_err("too many SCU IPC devices"); @@ -361,7 +361,7 @@ static void __init sfi_handle_ipc_dev(struct sfi_device_table_entry *pentry, pdev->dev.platform_data = pdata; if (dev->delay) - intel_scu_device_register(pdev); + intel_scu_ipc_device_register(pdev); else platform_device_add(pdev); } -- cgit From fa5b6ec9e5274aeae2326e25995506a953e5f878 Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Tue, 10 Jan 2017 13:35:40 -0800 Subject: lib/Kconfig.debug: Add ARCH_HAS_DEBUG_VIRTUAL DEBUG_VIRTUAL currently depends on DEBUG_KERNEL && X86. arm64 is getting the same support. Rather than add a list of architectures, switch this to ARCH_HAS_DEBUG_VIRTUAL and let architectures select it as appropriate. Acked-by: Ingo Molnar Reviewed-by: Mark Rutland Tested-by: Mark Rutland Suggested-by: Mark Rutland Signed-off-by: Laura Abbott Signed-off-by: Will Deacon --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e487493bbd47..f1d4e8f2131f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -46,6 +46,7 @@ config X86 select ARCH_CLOCKSOURCE_DATA select ARCH_DISCARD_MEMBLOCK select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI + select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FAST_MULTIPLIER -- cgit From 6e03f66c001790d6ca853c68a56c87460bc86467 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 10 Jan 2017 18:43:54 +0200 Subject: locking/jump_labels: Update bug_at() boot message First of all, %*ph specifier allows to dump data in hex format using the pointer to a buffer. This is suitable to use here. Besides that Thomas suggested to move it to critical level and replace __FILE__ by explicit mention of "jumplabel". 
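For reference, a minimal sketch of the %*ph usage this relies on (per Documentation/printk-formats.txt; the buffer contents and function name here are made up):

/* Sketch only: %*ph dumps a small buffer (up to 64 bytes) as hex. */
#include <linux/printk.h>

static void hexdump_example(void)
{
	unsigned char buf[5] = { 0x0f, 0x1f, 0x44, 0x00, 0x00 };

	pr_info("bytes: %*ph\n", 5, buf);	/* length passed as an argument */
	pr_info("bytes: %5ph\n", buf);		/* length fixed in the format, as bug_at() now does */
	/* both print: bytes: 0f 1f 44 00 00 */
}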
Signed-off-by: Andy Shevchenko Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170110164354.47372-1-andriy.shevchenko@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/jump_label.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index fc25f698d792..c37bd0f39c70 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -32,8 +32,7 @@ static void bug_at(unsigned char *ip, int line) * Something went wrong. Crash the box, as something could be * corrupting the kernel. */ - pr_warning("Unexpected op at %pS [%p] (%02x %02x %02x %02x %02x) %s:%d\n", - ip, ip, ip[0], ip[1], ip[2], ip[3], ip[4], __FILE__, line); + pr_crit("jump_label: Fatal kernel bug, unexpected op at %pS [%p] (%5ph) %d\n", ip, ip, ip, line); BUG(); } -- cgit From b8fbe71f7535d4dfeed0bb8d924107dc58d502e2 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 11 Jan 2017 20:28:06 +0800 Subject: crypto: x86/chacha20 - Manually align stack buffer The kernel on x86-64 cannot use gcc attribute align to align to a 16-byte boundary. This patch reverts to the old way of aligning it by hand. Fixes: 9ae433bc79f9 ("crypto: chacha20 - convert generic and...") Signed-off-by: Herbert Xu Reviewed-by: Ard Biesheuvel --- arch/x86/crypto/chacha20_glue.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c index 78f75b07dc25..1e6af1b35f7b 100644 --- a/arch/x86/crypto/chacha20_glue.c +++ b/arch/x86/crypto/chacha20_glue.c @@ -67,10 +67,13 @@ static int chacha20_simd(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm); - u32 state[16] __aligned(CHACHA20_STATE_ALIGN); + u32 *state, state_buf[16 + 2] __aligned(8); struct skcipher_walk walk; int err; + BUILD_BUG_ON(CHACHA20_STATE_ALIGN != 16); + state = PTR_ALIGN(state_buf + 0, CHACHA20_STATE_ALIGN); + if (req->cryptlen <= CHACHA20_BLOCK_SIZE || !may_use_simd()) return crypto_chacha20_crypt(req); -- cgit From a665ece8b471de45bc19af05d52a1eaa5bc06dca Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 12 Jan 2017 13:23:31 +0200 Subject: x86/platform/intel: Remove PMIC GPIO block support Moorestown support was removed by commit: 1a8359e411eb ("x86/mid: Remove Intel Moorestown") Remove this leftover. Signed-off-by: Andy Shevchenko Cc: Darren Hart Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: platform-driver-x86@vger.kernel.org Link: http://lkml.kernel.org/r/20170112112331.93236-1-andriy.shevchenko@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/platform/intel-mid/sfi.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/intel-mid/sfi.c b/arch/x86/platform/intel-mid/sfi.c index ce1303830231..19b43e3a9f0f 100644 --- a/arch/x86/platform/intel-mid/sfi.c +++ b/arch/x86/platform/intel-mid/sfi.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include -- cgit From de1c2540aa4f7796f31acf5432597bb0eb086250 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 13 Jan 2017 18:43:55 +0200 Subject: x86/platform/intel-mid: Enable RTC on Intel Merrifield Intel Merrifield has legacy RTC in contrast to the rest on Intel MID platforms. Set legacy RTC flag explicitly in architecture initialization code and allocate interrupt for it. 
Signed-off-by: Andy Shevchenko Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170113164355.66161-1-andriy.shevchenko@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/platform/intel-mid/mrfld.c | 1 + arch/x86/platform/intel-mid/sfi.c | 14 ++++++++++++++ 2 files changed, 15 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/platform/intel-mid/mrfld.c b/arch/x86/platform/intel-mid/mrfld.c index e0607c77a1bd..ae7bdeb0e507 100644 --- a/arch/x86/platform/intel-mid/mrfld.c +++ b/arch/x86/platform/intel-mid/mrfld.c @@ -91,6 +91,7 @@ static unsigned long __init tangier_calibrate_tsc(void) static void __init tangier_arch_setup(void) { x86_platform.calibrate_tsc = tangier_calibrate_tsc; + x86_platform.legacy.rtc = 1; } /* tangier arch ops */ diff --git a/arch/x86/platform/intel-mid/sfi.c b/arch/x86/platform/intel-mid/sfi.c index 19b43e3a9f0f..e4d4cabbb370 100644 --- a/arch/x86/platform/intel-mid/sfi.c +++ b/arch/x86/platform/intel-mid/sfi.c @@ -41,6 +41,7 @@ #include #include #include +#include #define SFI_SIG_OEM0 "OEM0" #define MAX_IPCDEVS 24 @@ -539,8 +540,21 @@ static int __init sfi_parse_devs(struct sfi_table_header *table) return 0; } +static int __init intel_mid_legacy_rtc_init(void) +{ + struct irq_alloc_info info; + + if (!x86_platform.legacy.rtc) + return -ENODEV; + + ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 1, 0); + return mp_map_gsi_to_irq(RTC_IRQ, IOAPIC_MAP_ALLOC, &info); +} + static int __init intel_mid_platform_init(void) { + intel_mid_legacy_rtc_init(); + sfi_table_parse(SFI_SIG_GPIO, NULL, NULL, sfi_parse_gpio); sfi_table_parse(SFI_SIG_DEVS, NULL, NULL, sfi_parse_devs); return 0; -- cgit From eee5715efd8c268724b14c956de6af5d4931f470 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Fri, 13 Jan 2017 09:21:11 -0600 Subject: x86/platform/UV: Fix panic with missing UVsystab support Fix the panic where KEXEC'd kernel does not have access to EFI runtime mappings. This may cause the extended UVsystab to not be available. The solution is to revert to non-UV mode and continue with limited capabilities. Signed-off-by: Mike Travis Reviewed-by: Russ Anderson Reviewed-by: Alex Thorlton Acked-by: Dimitri Sivanich Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170113152111.118886202@asylum.americas.sgi.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/x2apic_uv_x.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 35690a168cf7..43930787cab9 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -1172,19 +1172,25 @@ static void __init decode_gam_rng_tbl(unsigned long ptr) index, _min_socket, _max_socket, _min_pnode, _max_pnode); } -static void __init decode_uv_systab(void) +static int __init decode_uv_systab(void) { struct uv_systab *st; int i; + if (uv_hub_info->hub_revision < UV4_HUB_REVISION_BASE) + return 0; /* No extended UVsystab required */ + st = uv_systab; - if ((!st || st->revision < UV_SYSTAB_VERSION_UV4) && !is_uv4_hub()) - return; - if (st->revision != UV_SYSTAB_VERSION_UV4_LATEST) { - pr_crit( + if ((!st) || (st->revision < UV_SYSTAB_VERSION_UV4_LATEST)) { + int rev = st ? 
st->revision : 0; + + pr_err( "UV: BIOS UVsystab version(%x) mismatch, expecting(%x)\n", - st->revision, UV_SYSTAB_VERSION_UV4_LATEST); - BUG(); + rev, UV_SYSTAB_VERSION_UV4_LATEST); + pr_err( + "UV: Cannot support UV operations, switching to generic PC\n"); + uv_system_type = UV_NONE; + return -EINVAL; } for (i = 0; st->entry[i].type != UV_SYSTAB_TYPE_UNUSED; i++) { @@ -1205,6 +1211,7 @@ static void __init decode_uv_systab(void) break; } } + return 0; } /* @@ -1373,7 +1380,8 @@ void __init uv_system_init(void) map_low_mmrs(); uv_bios_init(); /* get uv_systab for decoding */ - decode_uv_systab(); + if (decode_uv_systab() < 0) + return; /* UVsystab problem, abort UV init */ build_socket_tables(); build_uv_gr_table(); uv_init_hub_info(&hub_info); -- cgit From 81a71176740624ef3b1bff50c51e7b4aa187353d Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Fri, 13 Jan 2017 09:21:12 -0600 Subject: x86/platform/UV: Fix 2 socket config problem A UV4 chassis with only 2 sockets configured can unexpectedly target the wrong UV hub. Fix the problem by limiting the minimum size of a partition to 4 sockets even if only 2 are configured. Signed-off-by: Mike Travis Reviewed-by: Russ Anderson Acked-by: Dimitri Sivanich Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170113152111.313888353@asylum.americas.sgi.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/x2apic_uv_x.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 43930787cab9..97ea712fc72f 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -56,6 +56,7 @@ static struct { unsigned int socketid_shift; /* aka pnode_shift for UV1/2/3 */ unsigned int pnode_mask; unsigned int gpa_shift; + unsigned int gnode_shift; } uv_cpuid; int uv_min_hub_revision_id; @@ -133,6 +134,7 @@ static int __init early_get_pnodeid(void) break; case UV4_HUB_PART_NUMBER: uv_min_hub_revision_id += UV4_HUB_REVISION_BASE - 1; + uv_cpuid.gnode_shift = 2; /* min partition is 4 sockets */ break; } @@ -1074,8 +1076,10 @@ void __init uv_init_hub_info(struct uv_hub_info_s *hub_info) (1UL << uv_cpuid.gpa_shift) - 1; node_id.v = uv_read_local_mmr(UVH_NODE_ID); + uv_cpuid.gnode_shift = max_t(unsigned int, + uv_cpuid.gnode_shift, mn.n_val); hub_info->gnode_extra = - (node_id.s.node_id & ~((1 << mn.n_val) - 1)) >> 1; + (node_id.s.node_id & ~((1 << uv_cpuid.gnode_shift) - 1)) >> 1; hub_info->gnode_upper = ((unsigned long)hub_info->gnode_extra << mn.m_val); -- cgit From aef591cd3d1ddccb268f64c836d38382007373c1 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 12 Jan 2017 15:27:58 -0500 Subject: locking/spinlocks/x86, paravirt: Remove paravirt_ticketlocks_enabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a follow-up of commit: cfd8983f03c7b2 ("x86, locking/spinlocks: Remove ticket (spin)lock implementation") The static_key structure 'paravirt_ticketlocks_enabled' is now removed as it is no longer used. As a result, the init functions kvm_spinlock_init_jump() and xen_init_spinlocks_jump() are also removed. A simple build and boot test was done to verify it. 
Signed-off-by: Waiman Long Reviewed-by: Boris Ostrovsky Cc: Alok Kataria Cc: Chris Wright Cc: Jeremy Fitzhardinge Cc: Juergen Gross Cc: Linus Torvalds Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rusty Russell Cc: Thomas Gleixner Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: virtualization@lists.linux-foundation.org Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1484252878-1962-1-git-send-email-longman@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/spinlock.h | 3 --- arch/x86/kernel/kvm.c | 14 -------------- arch/x86/kernel/paravirt-spinlocks.c | 3 --- arch/x86/xen/spinlock.c | 19 ------------------- 4 files changed, 39 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index 921bea7a2708..6d391909e864 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -23,9 +23,6 @@ /* How long a lock should spin before we consider blocking */ #define SPIN_THRESHOLD (1 << 15) -extern struct static_key paravirt_ticketlocks_enabled; -static __always_inline bool static_key_false(struct static_key *key); - #include /* diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 36bc66416021..099fcba4981d 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -620,18 +620,4 @@ void __init kvm_spinlock_init(void) } } -static __init int kvm_spinlock_init_jump(void) -{ - if (!kvm_para_available()) - return 0; - if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) - return 0; - - static_key_slow_inc(¶virt_ticketlocks_enabled); - printk(KERN_INFO "KVM setup paravirtual spinlock\n"); - - return 0; -} -early_initcall(kvm_spinlock_init_jump); - #endif /* CONFIG_PARAVIRT_SPINLOCKS */ diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c index 6d4bf812af45..6259327f3454 100644 --- a/arch/x86/kernel/paravirt-spinlocks.c +++ b/arch/x86/kernel/paravirt-spinlocks.c @@ -42,6 +42,3 @@ struct pv_lock_ops pv_lock_ops = { #endif /* SMP */ }; EXPORT_SYMBOL(pv_lock_ops); - -struct static_key paravirt_ticketlocks_enabled = STATIC_KEY_INIT_FALSE; -EXPORT_SYMBOL(paravirt_ticketlocks_enabled); diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index e8a9ea7d7a21..25a7c4302ce7 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -141,25 +141,6 @@ void __init xen_init_spinlocks(void) pv_lock_ops.vcpu_is_preempted = PV_CALLEE_SAVE(xen_vcpu_stolen); } -/* - * While the jump_label init code needs to happend _after_ the jump labels are - * enabled and before SMP is started. Hence we use pre-SMP initcall level - * init. We cannot do it in xen_init_spinlocks as that is done before - * jump labels are activated. - */ -static __init int xen_init_spinlocks_jump(void) -{ - if (!xen_pvspin) - return 0; - - if (!xen_domain()) - return 0; - - static_key_slow_inc(¶virt_ticketlocks_enabled); - return 0; -} -early_initcall(xen_init_spinlocks_jump); - static __init int xen_parse_nopvspin(char *arg) { xen_pvspin = false; -- cgit From 12907fbb1a691807bb0420a27126e15934cb7954 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Dec 2016 11:44:28 +0100 Subject: sched/clock, clocksource: Add optional cs::mark_unstable() method PeterZ reported that we'd fail to mark the TSC unstable when the clocksource watchdog finds it unsuitable. Allow a clocksource to run a custom action when its being marked unstable and hook up the TSC unstable code. 
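The generic half of the hook lives outside this arch/x86-limited log; as a sketch under that assumption (made-up function name, not the actual kernel/time/clocksource.c code), the watchdog path that flags a clocksource unstable now also invokes the optional per-clocksource method:

/* Sketch only: how the watchdog side is expected to use the new callback. */
static void mark_clocksource_unstable(struct clocksource *cs)
{
	cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
	cs->flags |= CLOCK_SOURCE_UNSTABLE;

	if (cs->mark_unstable)		/* optional, may be NULL */
		cs->mark_unstable(cs);	/* e.g. tsc_cs_mark_unstable() in the diff below */
}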
Reported-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index be3a49ee0356..c8174c815d83 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -1106,6 +1106,16 @@ static u64 read_tsc(struct clocksource *cs) return (u64)rdtsc_ordered(); } +static void tsc_cs_mark_unstable(struct clocksource *cs) +{ + if (tsc_unstable) + return; + tsc_unstable = 1; + clear_sched_clock_stable(); + disable_sched_clock_irqtime(); + pr_info("Marking TSC unstable due to clocksource watchdog\n"); +} + /* * .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc() */ @@ -1118,6 +1128,7 @@ static struct clocksource clocksource_tsc = { CLOCK_SOURCE_MUST_VERIFY, .archdata = { .vclock_mode = VCLOCK_TSC }, .resume = tsc_resume, + .mark_unstable = tsc_cs_mark_unstable, }; void mark_tsc_unstable(char *reason) -- cgit From 9e3d6223d2093a8903c8f570a06284453ee59944 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 9 Dec 2016 09:30:11 +0100 Subject: math64, timers: Fix 32bit mul_u64_u32_shr() and friends It turns out that while GCC-4.4 manages to generate 32x32->64 mult instructions for the 32bit mul_u64_u32_shr() code, any GCC after that fails horribly. Fix this by providing an explicit mul_u32_u32() function which can be architcture provided. Reported-by: Chris Metcalf Signed-off-by: Peter Zijlstra (Intel) Acked-by: Chris Metcalf [for tile] Cc: Christopher S. Hall Cc: David Gibson Cc: John Stultz Cc: Laurent Vivier Cc: Liav Rehana Cc: Linus Torvalds Cc: Parit Bhargava Cc: Peter Zijlstra Cc: Richard Cochran Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20161209083011.GD15765@worktop.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- arch/x86/include/asm/div64.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h index ced283ac79df..af95c47d5c9e 100644 --- a/arch/x86/include/asm/div64.h +++ b/arch/x86/include/asm/div64.h @@ -59,6 +59,17 @@ static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder) } #define div_u64_rem div_u64_rem +static inline u64 mul_u32_u32(u32 a, u32 b) +{ + u32 high, low; + + asm ("mull %[b]" : "=a" (low), "=d" (high) + : [a] "a" (a), [b] "rm" (b) ); + + return low | ((u64)high) << 32; +} +#define mul_u32_u32 mul_u32_u32 + #else # include #endif /* CONFIG_X86_32 */ -- cgit From 06b35d93af0a5904aa832f58733be84ddbfe2e04 Mon Sep 17 00:00:00 2001 From: Piotr Luc Date: Tue, 10 Jan 2017 18:34:02 +0100 Subject: x86/cpufeature: Add AVX512_VPOPCNTDQ feature MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Vector population count instructions for dwords and qwords are going to be available in future Intel Xeon & Xeon Phi processors. Bit 14 of CPUID[level:0x07, ECX] indicates that the instructions are supported by a processor. The specification can be found in the Intel Software Developer Manual (SDM) and in the Instruction Set Extensions Programming Reference (ISE). Populate the feature bit and clear it when xsave is disabled. 
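As a consumer-side sketch (not part of the patch; the helper name is made up), the new bit can be checked either via raw CPUID or via the synthesized cpufeature flag:

/* Sketch only: detecting AVX512_VPOPCNTDQ support. */
#include <linux/types.h>
#include <asm/cpufeature.h>
#include <asm/processor.h>

static bool cpu_has_vpopcntdq(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* Raw enumeration: CPUID.(EAX=07H, ECX=0):ECX[bit 14]. */
	cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
	if (!(ecx & (1U << 14)))
		return false;

	/*
	 * In-kernel users should prefer the synthesized flag, since it is
	 * cleared again when xsave support is disabled.
	 */
	return boot_cpu_has(X86_FEATURE_AVX512_VPOPCNTDQ);
}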
Signed-off-by: Piotr Luc Reviewed-by: Borislav Petkov Cc: Paolo Bonzini Cc: kvm@vger.kernel.org Cc: Radim Krčmář Link: http://lkml.kernel.org/r/20170110173403.6010-2-piotr.luc@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/cpufeatures.h | 2 +- arch/x86/kernel/fpu/xstate.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index eafee3161d1c..d9d7136edf05 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -288,6 +288,7 @@ #define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ #define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ #define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ +#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */ #define X86_FEATURE_RDPID (16*32+ 22) /* RDPID instruction */ /* AMD-defined CPU features, CPUID level 0x80000007 (ebx), word 17 */ @@ -320,5 +321,4 @@ #define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */ #define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */ #define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */ - #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 1d7770447b3e..35f7024aace5 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -78,6 +78,7 @@ void fpu__xstate_clear_all_cpu_caps(void) setup_clear_cpu_cap(X86_FEATURE_PKU); setup_clear_cpu_cap(X86_FEATURE_AVX512_4VNNIW); setup_clear_cpu_cap(X86_FEATURE_AVX512_4FMAPS); + setup_clear_cpu_cap(X86_FEATURE_AVX512_VPOPCNTDQ); } /* -- cgit From a17f32270af1e1054bbc8858b0f27226a2c859ba Mon Sep 17 00:00:00 2001 From: Piotr Luc Date: Tue, 10 Jan 2017 18:34:03 +0100 Subject: kvm: x86: Expose Intel VPOPCNTDQ feature to guest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Vector population count instructions for dwords and qwords are to be used in future Intel Xeon & Xeon Phi processors. The bit 14 of CPUID[level:0x07, ECX] indicates that the new instructions are supported by a processor. The spec can be found in the Intel Software Developer Manual (SDM) or in the Instruction Set Extensions Programming Reference (ISE). Signed-off-by: Piotr Luc Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. Peter Anvin Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Radim Krčmář --- arch/x86/kvm/cpuid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index e85f6bd7b9d5..09c2ac741567 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -383,7 +383,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 7.0.ecx*/ const u32 kvm_cpuid_7_0_ecx_x86_features = - F(AVX512VBMI) | F(PKU) | 0 /*OSPKE*/; + F(AVX512VBMI) | F(PKU) | 0 /*OSPKE*/ | F(AVX512_VPOPCNTDQ); /* cpuid 7.0.edx*/ const u32 kvm_cpuid_7_0_edx_x86_features = -- cgit From a9b4f08770b415f30f2fb0f8329a370c8f554aa3 Mon Sep 17 00:00:00 2001 From: Ruslan Ruslichenko Date: Tue, 17 Jan 2017 16:13:52 +0200 Subject: x86/ioapic: Restore IO-APIC irq_chip retrigger callback commit d32932d02e18 removed the irq_retrigger callback from the IO-APIC chip and did not add it to the new IO-APIC-IR irq chip. 
There is no harm because the interrupts are resent in software when the retrigger callback is NULL, but it's less efficient. So restore them. [ tglx: Massaged changelog ] Fixes: d32932d02e18 ("x86/irq: Convert IOAPIC to use hierarchical irqdomain interfaces") Signed-off-by: Ruslan Ruslichenko Cc: xe-linux-external@cisco.com Link: http://lkml.kernel.org/r/1484662432-13580-1-git-send-email-rruslich@cisco.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 945e512a112a..1e35dd06b090 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1875,6 +1875,7 @@ static struct irq_chip ioapic_chip __read_mostly = { .irq_ack = irq_chip_ack_parent, .irq_eoi = ioapic_ack_level, .irq_set_affinity = ioapic_set_affinity, + .irq_retrigger = irq_chip_retrigger_hierarchy, .flags = IRQCHIP_SKIP_SET_WAKE, }; @@ -1886,6 +1887,7 @@ static struct irq_chip ioapic_ir_chip __read_mostly = { .irq_ack = irq_chip_ack_parent, .irq_eoi = ioapic_ir_ack_level, .irq_set_affinity = ioapic_set_affinity, + .irq_retrigger = irq_chip_retrigger_hierarchy, .flags = IRQCHIP_SKIP_SET_WAKE, }; -- cgit From 02cfdc95a0104fa5812d855d1e4ec687312aaa6f Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Wed, 18 Jan 2017 14:30:29 -0800 Subject: sched/x86: Remove unnecessary TBM3 check to update topology Scheduling to the max performance core is enabled by default for Turbo Boost Maxt Technology 3.0 capable platforms. Remove the useless sysctl_sched_itmt_enabled check to update sched topology for adding the prioritized core scheduling flag. Signed-off-by: Tim Chen Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Srinivas Pandruvada Cc: Thomas Gleixner Cc: bp@suse.de Cc: jolsa@redhat.com Cc: linux-acpi@vger.kernel.org Cc: linux-pm@vger.kernel.org Cc: rjw@rjwysocki.net Link: http://lkml.kernel.org/r/1484778629-4404-1-git-send-email-tim.c.chen@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/itmt.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c index cb9c1ed1d391..f73f475d0573 100644 --- a/arch/x86/kernel/itmt.c +++ b/arch/x86/kernel/itmt.c @@ -132,10 +132,8 @@ int sched_set_itmt_support(void) sysctl_sched_itmt_enabled = 1; - if (sysctl_sched_itmt_enabled) { - x86_topology_update = true; - rebuild_sched_domains(); - } + x86_topology_update = true; + rebuild_sched_domains(); mutex_unlock(&itmt_update_mutex); -- cgit From 3f646ed70ccd1c4e5c1263d2922247d28c8e08f0 Mon Sep 17 00:00:00 2001 From: "K. Y. Srinivasan" Date: Wed, 18 Jan 2017 16:45:00 -0700 Subject: Drivers: hv: vmbus: Move the definition of hv_x64_msr_hypercall_contents As part of the effort to separate out architecture specific code, move the definition of hv_x64_msr_hypercall_contents to x86 specific header file. Signed-off-by: K. Y. Srinivasan Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/mshyperv.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index aaf59b7da98a..188ddfdde2b9 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -13,6 +13,18 @@ struct ms_hyperv_info { extern struct ms_hyperv_info ms_hyperv; +/* + * Declare the MSR used to setup pages used to communicate with the hypervisor. 
+ */ +union hv_x64_msr_hypercall_contents { + u64 as_uint64; + struct { + u64 enable:1; + u64 reserved:11; + u64 guest_physical_address:52; + }; +}; + void hyperv_callback_vector(void); #ifdef CONFIG_TRACING #define trace_hyperv_callback_vector hyperv_callback_vector -- cgit From 352c9624242d5836ad8a960826183011367871a4 Mon Sep 17 00:00:00 2001 From: "K. Y. Srinivasan" Date: Wed, 18 Jan 2017 16:45:01 -0700 Subject: Drivers: hv: vmbus: Move the definition of generate_guest_id() As part of the effort to separate out architecture specific code, move the definition of generate_guest_id() to x86 specific header file. Signed-off-by: K. Y. Srinivasan Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/mshyperv.h | 43 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 188ddfdde2b9..15a0c275c82e 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -25,6 +25,49 @@ union hv_x64_msr_hypercall_contents { }; }; +/* + * The guest OS needs to register the guest ID with the hypervisor. + * The guest ID is a 64 bit entity and the structure of this ID is + * specified in the Hyper-V specification: + * + * msdn.microsoft.com/en-us/library/windows/hardware/ff542653%28v=vs.85%29.aspx + * + * While the current guideline does not specify how Linux guest ID(s) + * need to be generated, our plan is to publish the guidelines for + * Linux and other guest operating systems that currently are hosted + * on Hyper-V. The implementation here conforms to this yet + * unpublished guidelines. + * + * + * Bit(s) + * 63 - Indicates if the OS is Open Source or not; 1 is Open Source + * 62:56 - Os Type; Linux is 0x100 + * 55:48 - Distro specific identification + * 47:16 - Linux kernel version number + * 15:0 - Distro specific identification + * + * + */ + +#define HV_LINUX_VENDOR_ID 0x8800 + +/* + * Generate the guest ID based on the guideline described above. + */ + +static inline __u64 generate_guest_id(__u64 d_info1, __u64 kernel_version, + __u64 d_info2) +{ + __u64 guest_id = 0; + + guest_id = (((__u64)HV_LINUX_VENDOR_ID) << 56); + guest_id |= (d_info1 << 48); + guest_id |= (kernel_version << 16); + guest_id |= d_info2; + + return guest_id; +} + void hyperv_callback_vector(void); #ifdef CONFIG_TRACING #define trace_hyperv_callback_vector hyperv_callback_vector -- cgit From 8730046c1498e8fb8c9a124789893944e8ce8220 Mon Sep 17 00:00:00 2001 From: "K. Y. Srinivasan" Date: Wed, 18 Jan 2017 16:45:02 -0700 Subject: Drivers: hv vmbus: Move Hypercall page setup out of common code As part of the effort to separate out architecture specific code, move the hypercall page setup to an architecture specific file. Signed-off-by: K. Y. 
Srinivasan Signed-off-by: Greg Kroah-Hartman --- arch/x86/Kbuild | 3 ++ arch/x86/hyperv/Makefile | 1 + arch/x86/hyperv/hv_init.c | 62 +++++++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/mshyperv.h | 5 ++++ arch/x86/kernel/cpu/mshyperv.c | 7 +++++ 5 files changed, 78 insertions(+) create mode 100644 arch/x86/hyperv/Makefile create mode 100644 arch/x86/hyperv/hv_init.c (limited to 'arch/x86') diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild index eb3abf8ac44e..586b786b3edf 100644 --- a/arch/x86/Kbuild +++ b/arch/x86/Kbuild @@ -7,6 +7,9 @@ obj-$(CONFIG_KVM) += kvm/ # Xen paravirtualization support obj-$(CONFIG_XEN) += xen/ +# Hyper-V paravirtualization support +obj-$(CONFIG_HYPERVISOR_GUEST) += hyperv/ + # lguest paravirtualization support obj-$(CONFIG_LGUEST_GUEST) += lguest/ diff --git a/arch/x86/hyperv/Makefile b/arch/x86/hyperv/Makefile new file mode 100644 index 000000000000..171ae09864d7 --- /dev/null +++ b/arch/x86/hyperv/Makefile @@ -0,0 +1 @@ +obj-y := hv_init.o diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c new file mode 100644 index 000000000000..3206bfda586d --- /dev/null +++ b/arch/x86/hyperv/hv_init.c @@ -0,0 +1,62 @@ +/* + * X86 specific Hyper-V initialization code. + * + * Copyright (C) 2016, Microsoft, Inc. + * + * Author : K. Y. Srinivasan + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + */ + +#include +#include +#include +#include +#include +#include +#include + +void *hv_hypercall_pg; +/* + * This function is to be invoked early in the boot sequence after the + * hypervisor has been detected. + * + * 1. Setup the hypercall page. + */ +void hyperv_init(void) +{ + u64 guest_id; + union hv_x64_msr_hypercall_contents hypercall_msr; + + if (x86_hyper != &x86_hyper_ms_hyperv) + return; + + /* + * Setup the hypercall page and enable hypercalls. + * 1. Register the guest ID + * 2. 
Enable the hypercall and register the hypercall page + */ + guest_id = generate_guest_id(0, LINUX_VERSION_CODE, 0); + wrmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id); + + hv_hypercall_pg = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL_EXEC); + if (hv_hypercall_pg == NULL) { + wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0); + return; + } + + rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + hypercall_msr.enable = 1; + hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg); + wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); +} +EXPORT_SYMBOL_GPL(hv_hypercall_pg); diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 15a0c275c82e..e5f57e15a507 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -80,4 +80,9 @@ void hv_setup_kexec_handler(void (*handler)(void)); void hv_remove_kexec_handler(void); void hv_setup_crash_handler(void (*handler)(struct pt_regs *regs)); void hv_remove_crash_handler(void); + +#if IS_ENABLED(CONFIG_HYPERV) +void hyperv_init(void); +extern void *hv_hypercall_pg; +#endif #endif diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 65e20c97e04b..c5a1e9ba9ae0 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -227,6 +227,13 @@ static void __init ms_hyperv_init_platform(void) */ if (efi_enabled(EFI_BOOT)) x86_platform.get_nmi_reason = hv_get_nmi_reason; + +#if IS_ENABLED(CONFIG_HYPERV) + /* + * Setup the hook to get control post apic initialization. + */ + x86_platform.apic_post_init = hyperv_init; +#endif } const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = { -- cgit From 6ab42a66d2cc10afefea9f9e5d9a5ad5a836d254 Mon Sep 17 00:00:00 2001 From: "K. Y. Srinivasan" Date: Wed, 18 Jan 2017 16:45:03 -0700 Subject: Drivers: hv: vmbus: Move Hypercall invocation code out of common code As part of the effort to separate out architecture specific code, move the hypercall invocation code to an architecture specific file. Signed-off-by: K. Y. Srinivasan Signed-off-by: Greg Kroah-Hartman --- arch/x86/hyperv/hv_init.c | 54 +++++++++++++++++++++++++++++++++++++---- arch/x86/include/asm/mshyperv.h | 1 - 2 files changed, 49 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 3206bfda586d..b5c8e04deacb 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -25,7 +25,7 @@ #include #include -void *hv_hypercall_pg; +static void *hypercall_pg; /* * This function is to be invoked early in the boot sequence after the * hypervisor has been detected. @@ -48,15 +48,59 @@ void hyperv_init(void) guest_id = generate_guest_id(0, LINUX_VERSION_CODE, 0); wrmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id); - hv_hypercall_pg = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL_EXEC); - if (hv_hypercall_pg == NULL) { + hypercall_pg = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL_EXEC); + if (hypercall_pg == NULL) { wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0); return; } rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); hypercall_msr.enable = 1; - hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg); + hypercall_msr.guest_physical_address = vmalloc_to_pfn(hypercall_pg); wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); } -EXPORT_SYMBOL_GPL(hv_hypercall_pg); + +/* + * hv_do_hypercall- Invoke the specified hypercall + */ +u64 hv_do_hypercall(u64 control, void *input, void *output) +{ + u64 input_address = (input) ? virt_to_phys(input) : 0; + u64 output_address = (output) ? 
virt_to_phys(output) : 0; +#ifdef CONFIG_X86_64 + u64 hv_status = 0; + + if (!hypercall_pg) + return (u64)ULLONG_MAX; + + __asm__ __volatile__("mov %0, %%r8" : : "r" (output_address) : "r8"); + __asm__ __volatile__("call *%3" : "=a" (hv_status) : + "c" (control), "d" (input_address), + "m" (hypercall_pg)); + + return hv_status; + +#else + + u32 control_hi = control >> 32; + u32 control_lo = control & 0xFFFFFFFF; + u32 hv_status_hi = 1; + u32 hv_status_lo = 1; + u32 input_address_hi = input_address >> 32; + u32 input_address_lo = input_address & 0xFFFFFFFF; + u32 output_address_hi = output_address >> 32; + u32 output_address_lo = output_address & 0xFFFFFFFF; + + if (!hypercall_pg) + return (u64)ULLONG_MAX; + + __asm__ __volatile__ ("call *%8" : "=d"(hv_status_hi), + "=a"(hv_status_lo) : "d" (control_hi), + "a" (control_lo), "b" (input_address_hi), + "c" (input_address_lo), "D"(output_address_hi), + "S"(output_address_lo), "m" (hypercall_pg)); + + return hv_status_lo | ((u64)hv_status_hi << 32); +#endif /* !x86_64 */ +} +EXPORT_SYMBOL_GPL(hv_do_hypercall); diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index e5f57e15a507..ed8e07399071 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -83,6 +83,5 @@ void hv_remove_crash_handler(void); #if IS_ENABLED(CONFIG_HYPERV) void hyperv_init(void); -extern void *hv_hypercall_pg; #endif #endif -- cgit From acb04058de49458010c44bb35b849d45113fd668 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Jan 2017 14:36:33 +0100 Subject: sched/clock: Fix hotplug crash Mike reported that he could trigger the WARN_ON_ONCE() in set_sched_clock_stable() using hotplug. This exposed a fundamental problem with the interface, we should never mark the TSC stable if we ever find it to be unstable. Therefore set_sched_clock_stable() is a broken interface. The reason it existed is that not having it is a pain, it means all relevant architecture code needs to call clear_sched_clock_stable() where appropriate. Of the three architectures that select HAVE_UNSTABLE_SCHED_CLOCK ia64 and parisc are trivial in that they never called set_sched_clock_stable(), so add an unconditional call to clear_sched_clock_stable() to them. For x86 the story is a lot more involved, and what this patch tries to do is ensure we preserve the status quo. So even is Cyrix or Transmeta have usable TSC they never called set_sched_clock_stable() so they now get an explicit mark unstable. 
Reported-by: Mike Galbraith Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 9881b024b7d7 ("sched/clock: Delay switching sched_clock to stable") Link: http://lkml.kernel.org/r/20170119133633.GB6536@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 6 ++++-- arch/x86/kernel/cpu/centaur.c | 6 ++++-- arch/x86/kernel/cpu/common.c | 3 +++ arch/x86/kernel/cpu/cyrix.c | 2 ++ arch/x86/kernel/cpu/intel.c | 6 ++++-- arch/x86/kernel/cpu/transmeta.c | 3 +++ arch/x86/kernel/kvmclock.c | 2 +- 7 files changed, 21 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 71cae73a5076..1bb253a6ee4d 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -548,8 +548,10 @@ static void early_init_amd(struct cpuinfo_x86 *c) if (c->x86_power & (1 << 8)) { set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); - if (!check_tsc_unstable()) - set_sched_clock_stable(); + if (check_tsc_unstable()) + clear_sched_clock_stable(); + } else { + clear_sched_clock_stable(); } /* Bit 12 of 8000_0007 edx is accumulated power mechanism. */ diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index 1661d8ec9280..2c234a6d94c4 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -1,5 +1,5 @@ -#include -#include + +#include #include #include @@ -104,6 +104,8 @@ static void early_init_centaur(struct cpuinfo_x86 *c) #ifdef CONFIG_X86_64 set_cpu_cap(c, X86_FEATURE_SYSENTER32); #endif + + clear_sched_clock_stable(); } static void init_centaur(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index dc1697ca5191..3457186275a0 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -83,6 +83,7 @@ static void default_init(struct cpuinfo_x86 *c) strcpy(c->x86_model_id, "386"); } #endif + clear_sched_clock_stable(); } static const struct cpu_dev default_cpu = { @@ -1055,6 +1056,8 @@ static void identify_cpu(struct cpuinfo_x86 *c) */ if (this_cpu->c_init) this_cpu->c_init(c); + else + clear_sched_clock_stable(); /* Disable the PN if appropriate */ squash_the_stupid_serial_number(c); diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index bd9dcd6b712d..47416f959a48 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "cpu.h" @@ -183,6 +184,7 @@ static void early_init_cyrix(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_CYRIX_ARR); break; } + clear_sched_clock_stable(); } static void init_cyrix(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index fcd484d2bb03..26eaff4907b2 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -124,8 +124,10 @@ static void early_init_intel(struct cpuinfo_x86 *c) if (c->x86_power & (1 << 8)) { set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); - if (!check_tsc_unstable()) - set_sched_clock_stable(); + if (check_tsc_unstable()) + clear_sched_clock_stable(); + } else { + clear_sched_clock_stable(); } /* Penwell and Cloverview have the TSC which doesn't sleep on S3 */ diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c index 34178564be2a..c1ea5b999839 100644 --- a/arch/x86/kernel/cpu/transmeta.c +++ b/arch/x86/kernel/cpu/transmeta.c @@ 
-1,4 +1,5 @@ #include +#include #include #include #include @@ -14,6 +15,8 @@ static void early_init_transmeta(struct cpuinfo_x86 *c) if (xlvl >= 0x80860001) c->x86_capability[CPUID_8086_0001_EDX] = cpuid_edx(0x80860001); } + + clear_sched_clock_stable(); } static void init_transmeta(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 2a5cafdf8808..542710b99f52 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -107,12 +107,12 @@ static inline void kvm_sched_clock_init(bool stable) { if (!stable) { pv_time_ops.sched_clock = kvm_clock_read; + clear_sched_clock_stable(); return; } kvm_sched_clock_offset = kvm_clock_read(); pv_time_ops.sched_clock = kvm_sched_clock_read; - set_sched_clock_stable(); printk(KERN_INFO "kvm-clock: using sched offset of %llu cycles\n", kvm_sched_clock_offset); -- cgit From 358e96deaed3330a59d9dd6a7e419f4da08d6497 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 19 Jan 2017 21:24:22 +0200 Subject: x86/ioapic: Return suitable error code in mp_map_gsi_to_irq() mp_map_gsi_to_irq() in some cases might return legacy -1, which would be wrongly interpreted as -EPERM. Correct those cases to return proper error code. Signed-off-by: Andy Shevchenko Link: http://lkml.kernel.org/r/20170119192425.189899-2-andriy.shevchenko@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 945e512a112a..f62c38d325da 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1107,12 +1107,12 @@ int mp_map_gsi_to_irq(u32 gsi, unsigned int flags, struct irq_alloc_info *info) ioapic = mp_find_ioapic(gsi); if (ioapic < 0) - return -1; + return -ENODEV; pin = mp_find_ioapic_pin(ioapic, gsi); idx = find_irq_entry(ioapic, pin, mp_INT); if ((flags & IOAPIC_MAP_CHECK) && idx < 0) - return -1; + return -ENODEV; return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags, info); } -- cgit From 910a26f6e952148a0c8815281737aaead640626c Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 19 Jan 2017 21:24:23 +0200 Subject: x86/platform/intel-mid: Allocate RTC interrupt for Merrifield Legacy RTC requires interrupt line 8 to be dedicated for it. On Intel MID platforms the legacy PIC is absent and in order to make RTC work we need to allocate interrupt separately. Current solution brought by commit de1c2540aa4f does it in a wrong place, and since it's done unconditionally for all x86 devices, some of them, e.g. PNP based, might get it wrong because they execute the MID specific code due to x86_platform.legacy.rtc flag being set. Move intel_mid_legacy_rtc_init() to its own module and call it before x86 RTC CMOS initialization. Fixes: de1c2540aa4f ("x86/platform/intel-mid: Enable RTC on Intel Merrifield") Signed-off-by: Andy Shevchenko Cc: "Luis R . 
Rodriguez" Link: http://lkml.kernel.org/r/20170119192425.189899-3-andriy.shevchenko@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/platform/intel-mid/device_libs/Makefile | 1 + .../intel-mid/device_libs/platform_mrfld_rtc.c | 48 ++++++++++++++++++++++ arch/x86/platform/intel-mid/sfi.c | 14 ------- 3 files changed, 49 insertions(+), 14 deletions(-) create mode 100644 arch/x86/platform/intel-mid/device_libs/platform_mrfld_rtc.c (limited to 'arch/x86') diff --git a/arch/x86/platform/intel-mid/device_libs/Makefile b/arch/x86/platform/intel-mid/device_libs/Makefile index d4af7785844e..a7dbec4dce27 100644 --- a/arch/x86/platform/intel-mid/device_libs/Makefile +++ b/arch/x86/platform/intel-mid/device_libs/Makefile @@ -26,4 +26,5 @@ obj-$(subst m,y,$(CONFIG_GPIO_PCA953X)) += platform_pcal9555a.o obj-$(subst m,y,$(CONFIG_GPIO_PCA953X)) += platform_tca6416.o # MISC Devices obj-$(subst m,y,$(CONFIG_KEYBOARD_GPIO)) += platform_gpio_keys.o +obj-$(subst m,y,$(CONFIG_RTC_DRV_CMOS)) += platform_mrfld_rtc.o obj-$(subst m,y,$(CONFIG_INTEL_MID_WATCHDOG)) += platform_mrfld_wdt.o diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_rtc.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_rtc.c new file mode 100644 index 000000000000..3135416df037 --- /dev/null +++ b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_rtc.c @@ -0,0 +1,48 @@ +/* + * Intel Merrifield legacy RTC initialization file + * + * (C) Copyright 2017 Intel Corporation + * + * Author: Andy Shevchenko + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include + +#include +#include +#include +#include +#include + +static int __init mrfld_legacy_rtc_alloc_irq(void) +{ + struct irq_alloc_info info; + int ret; + + if (!x86_platform.legacy.rtc) + return -ENODEV; + + ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 1, 0); + ret = mp_map_gsi_to_irq(RTC_IRQ, IOAPIC_MAP_ALLOC, &info); + if (ret < 0) { + pr_info("Failed to allocate RTC interrupt. 
Disabling RTC\n"); + x86_platform.legacy.rtc = 0; + return ret; + } + + return 0; +} + +static int __init mrfld_legacy_rtc_init(void) +{ + if (intel_mid_identify_cpu() != INTEL_MID_CPU_CHIP_TANGIER) + return -ENODEV; + + return mrfld_legacy_rtc_alloc_irq(); +} +arch_initcall(mrfld_legacy_rtc_init); diff --git a/arch/x86/platform/intel-mid/sfi.c b/arch/x86/platform/intel-mid/sfi.c index e4d4cabbb370..19b43e3a9f0f 100644 --- a/arch/x86/platform/intel-mid/sfi.c +++ b/arch/x86/platform/intel-mid/sfi.c @@ -41,7 +41,6 @@ #include #include #include -#include #define SFI_SIG_OEM0 "OEM0" #define MAX_IPCDEVS 24 @@ -540,21 +539,8 @@ static int __init sfi_parse_devs(struct sfi_table_header *table) return 0; } -static int __init intel_mid_legacy_rtc_init(void) -{ - struct irq_alloc_info info; - - if (!x86_platform.legacy.rtc) - return -ENODEV; - - ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 1, 0); - return mp_map_gsi_to_irq(RTC_IRQ, IOAPIC_MAP_ALLOC, &info); -} - static int __init intel_mid_platform_init(void) { - intel_mid_legacy_rtc_init(); - sfi_table_parse(SFI_SIG_GPIO, NULL, NULL, sfi_parse_gpio); sfi_table_parse(SFI_SIG_DEVS, NULL, NULL, sfi_parse_devs); return 0; -- cgit From 939533955d1f1d51e8e37d7d675646ce9d55534b Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 19 Jan 2017 21:24:24 +0200 Subject: x86/platform/intel-mid: Don't shadow error code of mp_map_gsi_to_irq() When call mp_map_gsi_to_irq() and return its error code do not shadow it. Note that 0 is not an error. Signed-off-by: Andy Shevchenko Link: http://lkml.kernel.org/r/20170119192425.189899-4-andriy.shevchenko@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c index 3f1f1c77d090..8a10a56f2840 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c @@ -28,9 +28,9 @@ static struct platform_device wdt_dev = { static int tangier_probe(struct platform_device *pdev) { - int gsi; struct irq_alloc_info info; struct intel_mid_wdt_pdata *pdata = pdev->dev.platform_data; + int gsi, irq; if (!pdata) return -EINVAL; @@ -38,10 +38,10 @@ static int tangier_probe(struct platform_device *pdev) /* IOAPIC builds identity mapping between GSI and IRQ on MID */ gsi = pdata->irq; ioapic_set_alloc_attr(&info, cpu_to_node(0), 1, 0); - if (mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info) <= 0) { - dev_warn(&pdev->dev, "cannot find interrupt %d in ioapic\n", - gsi); - return -EINVAL; + irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info); + if (irq < 0) { + dev_warn(&pdev->dev, "cannot find interrupt %d in ioapic\n", gsi); + return irq; } return 0; -- cgit From e2e2eabb68dfd00502bf8501b015862eb8b3f392 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 19 Jan 2017 21:24:25 +0200 Subject: x86/platform/intel-mid: Move watchdog registration to arch_initcall() There is no need to choose a random initcall level for certainly architecture dependent code. Move watchdog registration to arch_initcall() from rootfs_initcall(). 
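For context, a rough sketch of where the two initcall levels sit, assuming the standard level layout from include/linux/init.h; arch_initcall() callbacks run considerably earlier than rootfs_initcall() ones:

/*
 * Approximate initcall ordering, earliest first (illustration only):
 *
 *   core_initcall()       level 1
 *   postcore_initcall()   level 2
 *   arch_initcall()       level 3    <- new home of register_mid_wdt()
 *   subsys_initcall()     level 4
 *   fs_initcall()         level 5
 *   rootfs_initcall()     level "rootfs" (after fs, before device)
 *   device_initcall()     level 6
 *   late_initcall()       level 7
 */
arch_initcall(register_mid_wdt);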
Signed-off-by: Andy Shevchenko Link: http://lkml.kernel.org/r/20170119192425.189899-5-andriy.shevchenko@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c index 8a10a56f2840..86edd1e941eb 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c @@ -82,4 +82,4 @@ static int __init register_mid_wdt(void) return 0; } -rootfs_initcall(register_mid_wdt); +arch_initcall(register_mid_wdt); -- cgit From 63ed4e0c67df332681ebfef6eca6852da28d6300 Mon Sep 17 00:00:00 2001 From: "K. Y. Srinivasan" Date: Thu, 19 Jan 2017 11:51:46 -0700 Subject: Drivers: hv: vmbus: Consolidate all Hyper-V specific clocksource code As part of the effort to separate out architecture specific code, consolidate all Hyper-V specific clocksource code to an architecture specific code. Signed-off-by: K. Y. Srinivasan Signed-off-by: Greg Kroah-Hartman --- arch/x86/hyperv/hv_init.c | 105 ++++++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/mshyperv.h | 12 +++++ arch/x86/kernel/cpu/mshyperv.c | 23 --------- 3 files changed, 117 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index b5c8e04deacb..860233af4568 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -24,6 +24,79 @@ #include #include #include +#include + + +#ifdef CONFIG_X86_64 + +static struct ms_hyperv_tsc_page *tsc_pg; + +static u64 read_hv_clock_tsc(struct clocksource *arg) +{ + u64 current_tick; + + if (tsc_pg->tsc_sequence != 0) { + /* + * Use the tsc page to compute the value. + */ + + while (1) { + u64 tmp; + u32 sequence = tsc_pg->tsc_sequence; + u64 cur_tsc; + u64 scale = tsc_pg->tsc_scale; + s64 offset = tsc_pg->tsc_offset; + + rdtscll(cur_tsc); + /* current_tick = ((cur_tsc *scale) >> 64) + offset */ + asm("mulq %3" + : "=d" (current_tick), "=a" (tmp) + : "a" (cur_tsc), "r" (scale)); + + current_tick += offset; + if (tsc_pg->tsc_sequence == sequence) + return current_tick; + + if (tsc_pg->tsc_sequence != 0) + continue; + /* + * Fallback using MSR method. + */ + break; + } + } + rdmsrl(HV_X64_MSR_TIME_REF_COUNT, current_tick); + return current_tick; +} + +static struct clocksource hyperv_cs_tsc = { + .name = "hyperv_clocksource_tsc_page", + .rating = 400, + .read = read_hv_clock_tsc, + .mask = CLOCKSOURCE_MASK(64), + .flags = CLOCK_SOURCE_IS_CONTINUOUS, +}; +#endif + +static u64 read_hv_clock_msr(struct clocksource *arg) +{ + u64 current_tick; + /* + * Read the partition counter to get the current tick count. This count + * is set to 0 when the partition is created and is incremented in + * 100 nanosecond units. + */ + rdmsrl(HV_X64_MSR_TIME_REF_COUNT, current_tick); + return current_tick; +} + +static struct clocksource hyperv_cs_msr = { + .name = "hyperv_clocksource_msr", + .rating = 400, + .read = read_hv_clock_msr, + .mask = CLOCKSOURCE_MASK(64), + .flags = CLOCK_SOURCE_IS_CONTINUOUS, +}; static void *hypercall_pg; /* @@ -31,6 +104,7 @@ static void *hypercall_pg; * hypervisor has been detected. * * 1. Setup the hypercall page. + * 2. Register Hyper-V specific clocksource. 
*/ void hyperv_init(void) { @@ -58,6 +132,37 @@ void hyperv_init(void) hypercall_msr.enable = 1; hypercall_msr.guest_physical_address = vmalloc_to_pfn(hypercall_pg); wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + + /* + * Register Hyper-V specific clocksource. + */ +#ifdef CONFIG_X86_64 + if (ms_hyperv.features & HV_X64_MSR_REFERENCE_TSC_AVAILABLE) { + union hv_x64_msr_hypercall_contents tsc_msr; + + tsc_pg = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL); + if (!tsc_pg) { + clocksource_register_hz(&hyperv_cs_msr, NSEC_PER_SEC/100); + return; + } + + rdmsrl(HV_X64_MSR_REFERENCE_TSC, tsc_msr.as_uint64); + + tsc_msr.enable = 1; + tsc_msr.guest_physical_address = vmalloc_to_pfn(tsc_pg); + + wrmsrl(HV_X64_MSR_REFERENCE_TSC, tsc_msr.as_uint64); + clocksource_register_hz(&hyperv_cs_tsc, NSEC_PER_SEC/100); + return; + } +#endif + /* + * For 32 bit guests just use the MSR based mechanism for reading + * the partition counter. + */ + + if (ms_hyperv.features & HV_X64_MSR_TIME_REF_COUNT_AVAILABLE) + clocksource_register_hz(&hyperv_cs_msr, NSEC_PER_SEC/100); } /* diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index ed8e07399071..adfe8cc9f7e3 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -25,6 +25,18 @@ union hv_x64_msr_hypercall_contents { }; }; +/* + * TSC page layout. + */ + +struct ms_hyperv_tsc_page { + volatile u32 tsc_sequence; + u32 reserved1; + volatile u64 tsc_scale; + volatile s64 tsc_offset; + u64 reserved2[509]; +}; + /* * The guest OS needs to register the guest ID with the hypervisor. * The guest ID is a 64 bit entity and the structure of this ID is diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index c5a1e9ba9ae0..d3705a44971c 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -133,26 +133,6 @@ static uint32_t __init ms_hyperv_platform(void) return 0; } -static u64 read_hv_clock(struct clocksource *arg) -{ - u64 current_tick; - /* - * Read the partition counter to get the current tick count. This count - * is set to 0 when the partition is created and is incremented in - * 100 nanosecond units. - */ - rdmsrl(HV_X64_MSR_TIME_REF_COUNT, current_tick); - return current_tick; -} - -static struct clocksource hyperv_cs = { - .name = "hyperv_clocksource", - .rating = 400, /* use this when running on Hyperv*/ - .read = read_hv_clock, - .mask = CLOCKSOURCE_MASK(64), - .flags = CLOCK_SOURCE_IS_CONTINUOUS, -}; - static unsigned char hv_get_nmi_reason(void) { return 0; @@ -208,9 +188,6 @@ static void __init ms_hyperv_init_platform(void) "hv_nmi_unknown"); #endif - if (ms_hyperv.features & HV_X64_MSR_TIME_REF_COUNT_AVAILABLE) - clocksource_register_hz(&hyperv_cs, NSEC_PER_SEC/100); - #ifdef CONFIG_X86_IO_APIC no_timer_check = 1; #endif -- cgit From 8de8af7e0873c4fdac2205327dff922819e16657 Mon Sep 17 00:00:00 2001 From: "K. Y. Srinivasan" Date: Thu, 19 Jan 2017 11:51:47 -0700 Subject: Drivers: hv: vmbus: Move the extracting of Hypervisor version information As part of the effort to separate out architecture specific code, extract hypervisor version information in an architecture specific file. Signed-off-by: K. Y. 
Srinivasan Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/mshyperv.h | 19 +++++++++++++++++++ arch/x86/kernel/cpu/mshyperv.c | 20 ++++++++++++++++++++ 2 files changed, 39 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index adfe8cc9f7e3..54729e3cba47 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -5,6 +5,25 @@ #include #include +/* + * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent + * is set by CPUID(HVCPUID_VERSION_FEATURES). + */ +enum hv_cpuid_function { + HVCPUID_VERSION_FEATURES = 0x00000001, + HVCPUID_VENDOR_MAXFUNCTION = 0x40000000, + HVCPUID_INTERFACE = 0x40000001, + + /* + * The remaining functions depend on the value of + * HVCPUID_INTERFACE + */ + HVCPUID_VERSION = 0x40000002, + HVCPUID_FEATURES = 0x40000003, + HVCPUID_ENLIGHTENMENT_INFO = 0x40000004, + HVCPUID_IMPLEMENTATION_LIMITS = 0x40000005, +}; + struct ms_hyperv_info { u32 features; u32 misc_features; diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index d3705a44971c..b5375b9497b3 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -160,6 +160,11 @@ static int hv_nmi_unknown(unsigned int val, struct pt_regs *regs) static void __init ms_hyperv_init_platform(void) { + int hv_host_info_eax; + int hv_host_info_ebx; + int hv_host_info_ecx; + int hv_host_info_edx; + /* * Extract the features and hints */ @@ -170,6 +175,21 @@ static void __init ms_hyperv_init_platform(void) pr_info("HyperV: features 0x%x, hints 0x%x\n", ms_hyperv.features, ms_hyperv.hints); + /* + * Extract host information. + */ + if (cpuid_eax(HVCPUID_VENDOR_MAXFUNCTION) >= HVCPUID_VERSION) { + hv_host_info_eax = cpuid_eax(HVCPUID_VERSION); + hv_host_info_ebx = cpuid_ebx(HVCPUID_VERSION); + hv_host_info_ecx = cpuid_ecx(HVCPUID_VERSION); + hv_host_info_edx = cpuid_edx(HVCPUID_VERSION); + + pr_info("Hyper-V Host Build:%d-%d.%d-%d-%d.%d\n", + hv_host_info_eax, hv_host_info_ebx >> 16, + hv_host_info_ebx & 0xFFFF, hv_host_info_ecx, + hv_host_info_edx >> 24, hv_host_info_edx & 0xFFFFFF); + } + #ifdef CONFIG_X86_LOCAL_APIC if (ms_hyperv.features & HV_X64_MSR_APIC_FREQUENCY_AVAILABLE) { /* -- cgit From d058fa7e98ff01a4b4750a2210fc19906db3cbe1 Mon Sep 17 00:00:00 2001 From: "K. Y. Srinivasan" Date: Thu, 19 Jan 2017 11:51:48 -0700 Subject: Drivers: hv: vmbus: Move the crash notification function As part of the effort to separate out architecture specific code, move the crash notification function. Signed-off-by: K. Y. Srinivasan Signed-off-by: Greg Kroah-Hartman --- arch/x86/hyperv/hv_init.c | 26 ++++++++++++++++++++++++++ arch/x86/include/asm/mshyperv.h | 1 + arch/x86/include/uapi/asm/hyperv.h | 8 ++++++++ 3 files changed, 35 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 860233af4568..ce5fc7394814 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -209,3 +209,29 @@ u64 hv_do_hypercall(u64 control, void *input, void *output) #endif /* !x86_64 */ } EXPORT_SYMBOL_GPL(hv_do_hypercall); + +void hyperv_report_panic(struct pt_regs *regs) +{ + static bool panic_reported; + + /* + * We prefer to report panic on 'die' chain as we have proper + * registers to report, but if we miss it (e.g. on BUG()) we need + * to report it on 'panic'. 
+ */ + if (panic_reported) + return; + panic_reported = true; + + wrmsrl(HV_X64_MSR_CRASH_P0, regs->ip); + wrmsrl(HV_X64_MSR_CRASH_P1, regs->ax); + wrmsrl(HV_X64_MSR_CRASH_P2, regs->bx); + wrmsrl(HV_X64_MSR_CRASH_P3, regs->cx); + wrmsrl(HV_X64_MSR_CRASH_P4, regs->dx); + + /* + * Let Hyper-V know there is crash data available + */ + wrmsrl(HV_X64_MSR_CRASH_CTL, HV_CRASH_CTL_CRASH_NOTIFY); +} +EXPORT_SYMBOL_GPL(hyperv_report_panic); diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 54729e3cba47..64e682d88684 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -114,5 +114,6 @@ void hv_remove_crash_handler(void); #if IS_ENABLED(CONFIG_HYPERV) void hyperv_init(void); +void hyperv_report_panic(struct pt_regs *regs); #endif #endif diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h index 9b1a91834ac8..3a20ccf787b8 100644 --- a/arch/x86/include/uapi/asm/hyperv.h +++ b/arch/x86/include/uapi/asm/hyperv.h @@ -73,6 +73,9 @@ */ #define HV_X64_MSR_STAT_PAGES_AVAILABLE (1 << 8) +/* Crash MSR available */ +#define HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE (1 << 10) + /* * Feature identification: EBX indicates which flags were specified at * partition creation. The format is the same as the partition creation @@ -144,6 +147,11 @@ */ #define HV_X64_RELAXED_TIMING_RECOMMENDED (1 << 5) +/* + * Crash notification flag. + */ +#define HV_CRASH_CTL_CRASH_NOTIFY (1ULL << 63) + /* MSR used to identify the guest OS. */ #define HV_X64_MSR_GUEST_OS_ID 0x40000000 -- cgit From 73638cddaad861a5ebb2b119d8b318d4bded8f8d Mon Sep 17 00:00:00 2001 From: "K. Y. Srinivasan" Date: Thu, 19 Jan 2017 11:51:49 -0700 Subject: Drivers: hv: vmbus: Move the check for hypercall page setup As part of the effort to separate out architecture specific code, move the check for detecting if the hypercall page is setup. Signed-off-by: K. Y. Srinivasan Signed-off-by: Greg Kroah-Hartman --- arch/x86/hyperv/hv_init.c | 15 +++++++++++++++ arch/x86/include/asm/mshyperv.h | 1 + 2 files changed, 16 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index ce5fc7394814..d289bc29d282 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -235,3 +235,18 @@ void hyperv_report_panic(struct pt_regs *regs) wrmsrl(HV_X64_MSR_CRASH_CTL, HV_CRASH_CTL_CRASH_NOTIFY); } EXPORT_SYMBOL_GPL(hyperv_report_panic); + +bool hv_is_hypercall_page_setup(void) +{ + union hv_x64_msr_hypercall_contents hypercall_msr; + + /* Check if the hypercall page is setup */ + hypercall_msr.as_uint64 = 0; + rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + + if (!hypercall_msr.enable) + return false; + + return true; +} +EXPORT_SYMBOL_GPL(hv_is_hypercall_page_setup); diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 64e682d88684..c843ef64defe 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -115,5 +115,6 @@ void hv_remove_crash_handler(void); #if IS_ENABLED(CONFIG_HYPERV) void hyperv_init(void); void hyperv_report_panic(struct pt_regs *regs); +bool hv_is_hypercall_page_setup(void); #endif #endif -- cgit From e810e48c0c9a1a1ebb90cfe966bce6dc80ce08e7 Mon Sep 17 00:00:00 2001 From: "K. Y. Srinivasan" Date: Thu, 19 Jan 2017 11:51:50 -0700 Subject: Drivers: hv: vmbus: Move the code to signal end of message As part of the effort to separate out architecture specific code, move the code for signaling end of message. Signed-off-by: K. Y. 
Srinivasan Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/mshyperv.h | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index c843ef64defe..b57b470ac2a7 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -99,6 +99,43 @@ static inline __u64 generate_guest_id(__u64 d_info1, __u64 kernel_version, return guest_id; } + +/* Free the message slot and signal end-of-message if required */ +static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type) +{ + /* + * On crash we're reading some other CPU's message page and we need + * to be careful: this other CPU may already had cleared the header + * and the host may already had delivered some other message there. + * In case we blindly write msg->header.message_type we're going + * to lose it. We can still lose a message of the same type but + * we count on the fact that there can only be one + * CHANNELMSG_UNLOAD_RESPONSE and we don't care about other messages + * on crash. + */ + if (cmpxchg(&msg->header.message_type, old_msg_type, + HVMSG_NONE) != old_msg_type) + return; + + /* + * Make sure the write to MessageType (ie set to + * HVMSG_NONE) happens before we read the + * MessagePending and EOMing. Otherwise, the EOMing + * will not deliver any more messages since there is + * no empty slot + */ + mb(); + + if (msg->header.message_flags.msg_pending) { + /* + * This will cause message queue rescan to + * possibly deliver another msg from the + * hypervisor + */ + wrmsrl(HV_X64_MSR_EOM, 0); + } +} + void hyperv_callback_vector(void); #ifdef CONFIG_TRACING #define trace_hyperv_callback_vector hyperv_callback_vector -- cgit From d5116b4091ecca271c249ede43a49c1245920558 Mon Sep 17 00:00:00 2001 From: "K. Y. Srinivasan" Date: Thu, 19 Jan 2017 11:51:51 -0700 Subject: Drivers: hv: vmbus: Restructure the clockevents code Move the relevant code that programs the hypervisor to an architecture specific file. Signed-off-by: K. Y. Srinivasan Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/mshyperv.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index b57b470ac2a7..a58c201f3412 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -136,6 +136,10 @@ static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type) } } +#define hv_get_current_tick(tick) rdmsrl(HV_X64_MSR_TIME_REF_COUNT, tick) +#define hv_init_timer(timer, tick) wrmsrl(timer, tick) +#define hv_init_timer_config(config, val) wrmsrl(config, val) + void hyperv_callback_vector(void); #ifdef CONFIG_TRACING #define trace_hyperv_callback_vector hyperv_callback_vector -- cgit From 155e4a2f28a59e5344dfa7c5d003161fe59a5bf2 Mon Sep 17 00:00:00 2001 From: "K. Y. Srinivasan" Date: Thu, 19 Jan 2017 11:51:54 -0700 Subject: Drivers: hv: vmbus: Define APIs to manipulate the message page As part of cleaning up architecture specific code, define APIs to manipulate the message page. Signed-off-by: K. Y. 
Srinivasan Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/mshyperv.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index a58c201f3412..1e75141bc123 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -140,6 +140,9 @@ static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type) #define hv_init_timer(timer, tick) wrmsrl(timer, tick) #define hv_init_timer_config(config, val) wrmsrl(config, val) +#define hv_get_simp(val) rdmsrl(HV_X64_MSR_SIMP, val) +#define hv_set_simp(val) wrmsrl(HV_X64_MSR_SIMP, val) + void hyperv_callback_vector(void); #ifdef CONFIG_TRACING #define trace_hyperv_callback_vector hyperv_callback_vector -- cgit From 8e307bf82d76ab02e95a00d132d926f04db6ccab Mon Sep 17 00:00:00 2001 From: "K. Y. Srinivasan" Date: Thu, 19 Jan 2017 11:51:55 -0700 Subject: Drivers: hv: vmbus: Define APIs to manipulate the event page As part of cleaning up architecture specific code, define APIs to manipulate the event page. Signed-off-by: K. Y. Srinivasan Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/mshyperv.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 1e75141bc123..2ea7e16fc678 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -143,6 +143,9 @@ static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type) #define hv_get_simp(val) rdmsrl(HV_X64_MSR_SIMP, val) #define hv_set_simp(val) wrmsrl(HV_X64_MSR_SIMP, val) +#define hv_get_siefp(val) rdmsrl(HV_X64_MSR_SIEFP, val) +#define hv_set_siefp(val) wrmsrl(HV_X64_MSR_SIEFP, val) + void hyperv_callback_vector(void); #ifdef CONFIG_TRACING #define trace_hyperv_callback_vector hyperv_callback_vector -- cgit From 06d1d98a839f196e94cb726008fb2118e430f356 Mon Sep 17 00:00:00 2001 From: "K. Y. Srinivasan" Date: Thu, 19 Jan 2017 11:51:56 -0700 Subject: Drivers: hv: vmbus: Define APIs to manipulate the synthetic interrupt controller As part of cleaning up architecture specific code, define APIs to manipulate the interrupt controller state. Signed-off-by: K. Y. Srinivasan Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/mshyperv.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 2ea7e16fc678..1ea19a57cf4c 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -146,6 +146,9 @@ static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type) #define hv_get_siefp(val) rdmsrl(HV_X64_MSR_SIEFP, val) #define hv_set_siefp(val) wrmsrl(HV_X64_MSR_SIEFP, val) +#define hv_get_synic_state(val) rdmsrl(HV_X64_MSR_SCONTROL, val) +#define hv_set_synic_state(val) wrmsrl(HV_X64_MSR_SCONTROL, val) + void hyperv_callback_vector(void); #ifdef CONFIG_TRACING #define trace_hyperv_callback_vector hyperv_callback_vector -- cgit From 7297ff0ca9db7e2d830841035b95d8b94b529142 Mon Sep 17 00:00:00 2001 From: "K. Y. Srinivasan" Date: Thu, 19 Jan 2017 11:51:57 -0700 Subject: Drivers: hv: vmbus: Define an API to retrieve virtual processor index As part of cleaning up architecture specific code, define an API to retrieve the virtual procesor index. Signed-off-by: K. Y. 
Srinivasan Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/mshyperv.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 1ea19a57cf4c..2d40bfc57e7a 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -149,6 +149,8 @@ static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type) #define hv_get_synic_state(val) rdmsrl(HV_X64_MSR_SCONTROL, val) #define hv_set_synic_state(val) wrmsrl(HV_X64_MSR_SCONTROL, val) +#define hv_get_vp_index(index) rdmsrl(HV_X64_MSR_VP_INDEX, index) + void hyperv_callback_vector(void); #ifdef CONFIG_TRACING #define trace_hyperv_callback_vector hyperv_callback_vector -- cgit From 37e11d5c7052a5ca55ef807731c75218ea341b4c Mon Sep 17 00:00:00 2001 From: "K. Y. Srinivasan" Date: Thu, 19 Jan 2017 11:51:58 -0700 Subject: Drivers: hv: vmbus: Define an APIs to manage interrupt state As part of cleaning up architecture specific code, define APIs to manage interrupt state. Signed-off-by: K. Y. Srinivasan Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/mshyperv.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 2d40bfc57e7a..42505d1158d6 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -151,6 +151,9 @@ static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type) #define hv_get_vp_index(index) rdmsrl(HV_X64_MSR_VP_INDEX, index) +#define hv_get_synint_state(int_num, val) rdmsrl(int_num, val) +#define hv_set_synint_state(int_num, val) wrmsrl(int_num, val) + void hyperv_callback_vector(void); #ifdef CONFIG_TRACING #define trace_hyperv_callback_vector hyperv_callback_vector -- cgit From 0b4c208d443ba2af82b4c70f99ca8df31e9a0020 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Tue, 20 Dec 2016 16:34:50 -0800 Subject: Revert "KVM: nested VMX: disable perf cpuid reporting" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit bc6134942dbbf31c25e9bd7c876be5da81c9e1ce. A CPUID instruction executed in VMX non-root mode always causes a VM-exit, regardless of the leaf being queried. Fixes: bc6134942dbb ("KVM: nested VMX: disable perf cpuid reporting") Signed-off-by: Jim Mattson [The issue solved by bc6134942dbb has been resolved with ff651cb613b4 ("KVM: nVMX: Add nested msr load/restore algorithm").] Signed-off-by: Radim Krčmář --- arch/x86/kvm/cpuid.c | 6 ------ arch/x86/kvm/vmx.c | 2 -- 2 files changed, 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 09c2ac741567..c0e2036217ad 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -861,12 +861,6 @@ void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) if (!best) best = check_cpuid_limit(vcpu, function, index); - /* - * Perfmon not yet supported for L2 guest. 
- */ - if (is_guest_mode(vcpu) && function == 0xa) - best = NULL; - if (best) { *eax = best->eax; *ebx = best->ebx; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4e691035a32d..c7bafa1457e2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8203,8 +8203,6 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) case EXIT_REASON_TASK_SWITCH: return true; case EXIT_REASON_CPUID: - if (kvm_register_read(vcpu, VCPU_REGS_RAX) == 0xa) - return false; return true; case EXIT_REASON_HLT: return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); -- cgit From 4c45c5167c9563b1a2eee3e2fe954621355e4ca8 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 19 Jan 2017 12:47:30 +0100 Subject: x86/timer: Make delay() work during early bootup When a panic happens during bootup, "Rebooting in X seconds.." is shown, but reboot happens immediatelly. It is because panic() uses mdelay() and mdelay() calls __const_udelay() immediately, which does not work while booting. The per_cpu cpu_info.loops_per_jiffy value is not initialized yet, so __const_udelay() actually multiplies the number of loops by zero. This results in __const_udelay() to delay the execution only by a nanosecond or so. So check whether cpu_info.loops_per_jiffy is zero and use loops_per_jiffy in that case. mdelay() will not be so precise without proper calibration, but it works relatively well. Before: [ 0.170039] delaying 100ms [ 0.170828] done After [ 0.214042] delaying 100ms [ 0.313974] done I do not think the added check matters given we are about to spin the processor in the next few hundred cycles. Signed-off-by: Jiri Slaby Reviewed-by: Andy Shevchenko Acked-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170119114730.2670-1-jslaby@suse.cz [ Minor edits. ] Signed-off-by: Ingo Molnar --- arch/x86/lib/delay.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index 073d1f1a620b..a8e91ae89fb3 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -156,13 +156,13 @@ EXPORT_SYMBOL(__delay); inline void __const_udelay(unsigned long xloops) { + unsigned long lpj = this_cpu_read(cpu_info.loops_per_jiffy) ? : loops_per_jiffy; int d0; xloops *= 4; asm("mull %%edx" :"=d" (xloops), "=&a" (d0) - :"1" (xloops), "0" - (this_cpu_read(cpu_info.loops_per_jiffy) * (HZ/4))); + :"1" (xloops), "0" (lpj * (HZ / 4))); __delay(++xloops); } -- cgit From a8d4c8246b290ce97f88752d833804843041ac84 Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Mon, 23 Jan 2017 14:48:23 +0800 Subject: x86/crash: Update the stale comment in reserve_crashkernel() CRASH_KERNEL_ADDR_MAX has been missing for a long time, update it with a more detailed explanation. 
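For illustration, the two reservation modes the updated comment distinguishes; the sizes are arbitrary examples, not values taken from this patch:

	crashkernel=256M        (base chosen below CRASH_ADDR_LOW_MAX, where old kexec-tools expects bzImage)
	crashkernel=512M,high   (reservation may be placed above CRASH_ADDR_LOW_MAX)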
Signed-off-by: Xunlei Pang Cc: Andrew Morton Cc: Baoquan He Cc: Borislav Petkov Cc: Dave Young Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Robert LeBlanc Cc: Thomas Gleixner Cc: kexec@lists.infradead.org Link: http://lkml.kernel.org/r/1485154103-18426-1-git-send-email-xlpang@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 4cfba947d774..eb69b14dbfc8 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -575,7 +575,9 @@ static void __init reserve_crashkernel(void) /* 0 means: find the address automatically */ if (crash_base <= 0) { /* - * kexec want bzImage is below CRASH_KERNEL_ADDR_MAX + * Set CRASH_ADDR_LOW_MAX upper bound for crash memory, + * as old kexec-tools loads bzImage below that, unless + * "crashkernel=size[KMG],high" is specified. */ crash_base = memblock_find_in_range(CRASH_ALIGN, high ? CRASH_ADDR_HIGH_MAX -- cgit From c26665ab5c49ad3e142e0f054ca3204f259ba09c Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 20 Jan 2017 21:29:40 +0100 Subject: x86/microcode/intel: Drop stashed AP patch pointer optimization This was meant to save us the scanning of the microcode containter in the initrd since the first AP had already done that but it can also hurt us: Imagine a single hyperthreaded CPU (Intel(R) Atom(TM) CPU N270, for example) which updates the microcode on the BSP but since the microcode engine is shared between the two threads, the update on CPU1 doesn't happen because it has already happened on CPU0 and we don't find a newer microcode revision on CPU1. Which doesn't set the intel_ucode_patch pointer and at initrd jettisoning time we don't save the microcode patch for later application. Now, when we suspend to RAM, the loaded microcode gets cleared so we need to reload but there's no patch saved in the cache. Removing the optimization fixes this issue and all is fine and dandy. Fixes: 06b8534cb728 ("x86/microcode: Rework microcode loading") Signed-off-by: Borislav Petkov Reviewed-by: Thomas Gleixner Link: http://lkml.kernel.org/r/20170120202955.4091-2-bp@alien8.de Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/microcode/intel.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 3f329b74e040..8325d8a09ab0 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -41,7 +41,7 @@ static const char ucode_path[] = "kernel/x86/microcode/GenuineIntel.bin"; -/* Current microcode patch used in early patching */ +/* Current microcode patch used in early patching on the APs. */ struct microcode_intel *intel_ucode_patch; static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1, @@ -607,12 +607,6 @@ int __init save_microcode_in_initrd_intel(void) struct ucode_cpu_info uci; struct cpio_data cp; - /* - * AP loading didn't find any microcode patch, no need to save anything. - */ - if (!intel_ucode_patch || IS_ERR(intel_ucode_patch)) - return 0; - if (!load_builtin_intel_microcode(&cp)) cp = find_microcode_in_initrd(ucode_path, false); @@ -628,7 +622,6 @@ int __init save_microcode_in_initrd_intel(void) return 0; } - /* * @res_patch, output: a pointer to the patch we found. 
*/ -- cgit From a585df8edabdb47ae25214ebb3a627ca7ce800d3 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 20 Jan 2017 21:29:41 +0100 Subject: x86/MSR: Carve out bare minimum accessors Add __rdmsr() and __wrmsr() which *only* read and write an MSR with exception handling. Those are going to be used in early code, like the microcode loader, which cannot stomach tracing code piggybacking on the MSR operation. While at it, get rid of __native_write_msr_notrace(). Signed-off-by: Borislav Petkov Reviewed-by: Thomas Gleixner Link: http://lkml.kernel.org/r/20170120202955.4091-3-bp@alien8.de Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/apic.h | 2 +- arch/x86/include/asm/msr.h | 51 +++++++++++++++++++++++++++------------------ 2 files changed, 32 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 0c5fbc68e82d..eff8e36aaf72 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -195,7 +195,7 @@ static inline void native_apic_msr_write(u32 reg, u32 v) static inline void native_apic_msr_eoi_write(u32 reg, u32 v) { - wrmsr_notrace(APIC_BASE_MSR + (APIC_EOI >> 4), APIC_EOI_ACK, 0); + __wrmsr(APIC_BASE_MSR + (APIC_EOI >> 4), APIC_EOI_ACK, 0); } static inline u32 native_apic_msr_read(u32 reg) diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index db0b90c3b03e..898dba2e2e2c 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -80,7 +80,14 @@ static inline void do_trace_read_msr(unsigned int msr, u64 val, int failed) {} static inline void do_trace_rdpmc(unsigned int msr, u64 val, int failed) {} #endif -static inline unsigned long long native_read_msr(unsigned int msr) +/* + * __rdmsr() and __wrmsr() are the two primitives which are the bare minimum MSR + * accessors and should not have any tracing or other functionality piggybacking + * on them - those are *purely* for accessing MSRs and nothing more. So don't even + * think of extending them - you will be slapped with a stinking trout or a frozen + * shark will reach you, wherever you are! You've been warned. 
+ */ +static inline unsigned long long notrace __rdmsr(unsigned int msr) { DECLARE_ARGS(val, low, high); @@ -88,11 +95,30 @@ static inline unsigned long long native_read_msr(unsigned int msr) "2:\n" _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_rdmsr_unsafe) : EAX_EDX_RET(val, low, high) : "c" (msr)); - if (msr_tracepoint_active(__tracepoint_read_msr)) - do_trace_read_msr(msr, EAX_EDX_VAL(val, low, high), 0); + return EAX_EDX_VAL(val, low, high); } +static inline void notrace __wrmsr(unsigned int msr, u32 low, u32 high) +{ + asm volatile("1: wrmsr\n" + "2:\n" + _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_wrmsr_unsafe) + : : "c" (msr), "a"(low), "d" (high) : "memory"); +} + +static inline unsigned long long native_read_msr(unsigned int msr) +{ + unsigned long long val; + + val = __rdmsr(msr); + + if (msr_tracepoint_active(__tracepoint_read_msr)) + do_trace_read_msr(msr, val, 0); + + return val; +} + static inline unsigned long long native_read_msr_safe(unsigned int msr, int *err) { @@ -114,31 +140,16 @@ static inline unsigned long long native_read_msr_safe(unsigned int msr, return EAX_EDX_VAL(val, low, high); } -/* Can be uninlined because referenced by paravirt */ -static inline void notrace -__native_write_msr_notrace(unsigned int msr, u32 low, u32 high) -{ - asm volatile("1: wrmsr\n" - "2:\n" - _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_wrmsr_unsafe) - : : "c" (msr), "a"(low), "d" (high) : "memory"); -} - /* Can be uninlined because referenced by paravirt */ static inline void notrace native_write_msr(unsigned int msr, u32 low, u32 high) { - __native_write_msr_notrace(msr, low, high); + __wrmsr(msr, low, high); + if (msr_tracepoint_active(__tracepoint_write_msr)) do_trace_write_msr(msr, ((u64)high << 32 | low), 0); } -static inline void -wrmsr_notrace(unsigned int msr, u32 low, u32 high) -{ - __native_write_msr_notrace(msr, low, high); -} - /* Can be uninlined because referenced by paravirt */ static inline int notrace native_write_msr_safe(unsigned int msr, u32 low, u32 high) -- cgit From 0c12d18ab96e4da9f3e963bc242689bdeaaf2330 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 20 Jan 2017 21:29:42 +0100 Subject: x86/microcode: Convert to bare minimum MSR accessors Having tracepoints to the MSR accessors makes them unsuitable for early microcode loading: think 32-bit before paging is enabled and us chasing pointers to test whether a tracepoint is enabled or not. Results in a reliable triple fault. Convert to the bare ones. 
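A minimal sketch of the intended split between the two accessor families; the call sites below are illustrative and not taken from this series:

	/* Early-boot path (e.g. microcode loader): no tracepoints involved. */
	u64 rev = __rdmsr(MSR_AMD64_PATCH_LEVEL);

	/* Normal runtime path: keeps the read_msr tracepoint. */
	u64 val = native_read_msr(MSR_AMD64_PATCH_LEVEL);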
Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/20170120202955.4091-4-bp@alien8.de Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/microcode.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index 38711df3bcb5..90b22bbdfce9 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -7,18 +7,17 @@ #define native_rdmsr(msr, val1, val2) \ do { \ - u64 __val = native_read_msr((msr)); \ + u64 __val = __rdmsr((msr)); \ (void)((val1) = (u32)__val); \ (void)((val2) = (u32)(__val >> 32)); \ } while (0) #define native_wrmsr(msr, low, high) \ - native_write_msr(msr, low, high) + __wrmsr(msr, low, high) #define native_wrmsrl(msr, val) \ - native_write_msr((msr), \ - (u32)((u64)(val)), \ - (u32)((u64)(val) >> 32)) + __wrmsr((msr), (u32)((u64)(val)), \ + (u32)((u64)(val) >> 32)) struct ucode_patch { struct list_head plist; -- cgit From 1f02ac0682055418753fe30e6433dcb11fd03e1c Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 20 Jan 2017 21:29:43 +0100 Subject: x86/microcode/AMD: Clean up find_equiv_id() No need to have it marked "inline" - let gcc decide. Also, shorten the argument name and simplify while-test. While at it, make it into a proper for-loop and simplify it even more, as tglx suggests. No functionality change. Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/20170120202955.4091-5-bp@alien8.de Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/microcode/amd.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 6a31e2691f3a..5c1509a38048 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -97,20 +97,13 @@ static size_t compute_container_size(u8 *data, u32 total_size) return size; } -static inline u16 find_equiv_id(struct equiv_cpu_entry *equiv_cpu_table, - unsigned int sig) +static u16 find_equiv_id(struct equiv_cpu_entry *equiv_table, u32 sig) { - int i = 0; - - if (!equiv_cpu_table) - return 0; - - while (equiv_cpu_table[i].installed_cpu != 0) { - if (sig == equiv_cpu_table[i].installed_cpu) - return equiv_cpu_table[i].equiv_cpu; - - i++; + for (; equiv_table && equiv_table->installed_cpu; equiv_table++) { + if (sig == equiv_table->installed_cpu) + return equiv_table->equiv_cpu; } + return 0; } -- cgit From ef901dc33dfa9abffe7d954628b5ec7ba7f62aec Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 20 Jan 2017 21:29:44 +0100 Subject: x86/microcode/AMD: Shorten function parameter's name The whole driver calls this "mc", do that here too. No functionality change. 
Signed-off-by: Borislav Petkov Reviewed-by: Thomas Gleixner Link: http://lkml.kernel.org/r/20170120202955.4091-6-bp@alien8.de Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/microcode/amd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 5c1509a38048..1691f41fcfdb 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -187,15 +187,15 @@ find_proper_container(u8 *ucode, size_t size, struct container *ret_cont) return eq_id; } -static int __apply_microcode_amd(struct microcode_amd *mc_amd) +static int __apply_microcode_amd(struct microcode_amd *mc) { u32 rev, dummy; - native_wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code); + native_wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc->hdr.data_code); /* verify patch application was successful */ native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - if (rev != mc_amd->hdr.patch_id) + if (rev != mc->hdr.patch_id) return -1; return 0; -- cgit From f454177f739e92620d84a1f42f91b03007a1cdd0 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 20 Jan 2017 21:29:45 +0100 Subject: x86/microcode/AMD: Extend the container struct Make it into a container descriptor which is being passed around and stores important info like the matching container and the patch for the current CPU. Make it static too. Later patches will use this and thus get rid of a double container parsing. Signed-off-by: Borislav Petkov Reviewed-by: Thomas Gleixner Link: http://lkml.kernel.org/r/20170120202955.4091-7-bp@alien8.de Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/microcode/amd.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 1691f41fcfdb..4e2238345308 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -42,11 +42,15 @@ static struct equiv_cpu_entry *equiv_cpu_table; /* * This points to the current valid container of microcode patches which we will - * save from the initrd/builtin before jettisoning its contents. + * save from the initrd/builtin before jettisoning its contents. @mc is the + * microcode patch we found to match. */ -struct container { - u8 *data; - size_t size; +static struct cont_desc { + struct microcode_amd *mc; + u32 psize; + u16 eq_id; + u8 *data; + size_t size; } cont; static u32 ucode_new_rev; @@ -113,9 +117,9 @@ static u16 find_equiv_id(struct equiv_cpu_entry *equiv_table, u32 sig) * table or 0 if none found. */ static u16 -find_proper_container(u8 *ucode, size_t size, struct container *ret_cont) +find_proper_container(u8 *ucode, size_t size, struct cont_desc *desc) { - struct container ret = { NULL, 0 }; + struct cont_desc ret = { 0 }; u32 eax, ebx, ecx, edx; struct equiv_cpu_entry *eq; int offset, left; @@ -158,7 +162,7 @@ find_proper_container(u8 *ucode, size_t size, struct container *ret_cont) */ left = ret.size - offset; - *ret_cont = ret; + *desc = ret; return eq_id; } @@ -213,11 +217,11 @@ static int __apply_microcode_amd(struct microcode_amd *mc) * Returns true if container found (sets @ret_cont), false otherwise. 
*/ static bool apply_microcode_early_amd(void *ucode, size_t size, bool save_patch, - struct container *ret_cont) + struct cont_desc *desc) { u8 (*patch)[PATCH_MAX_SIZE]; u32 rev, *header, *new_rev; - struct container ret; + struct cont_desc ret; int offset, left; u16 eq_id = 0; u8 *data; @@ -270,8 +274,8 @@ static bool apply_microcode_early_amd(void *ucode, size_t size, bool save_patch, left -= offset; } - if (ret_cont) - *ret_cont = ret; + if (desc) + *desc = ret; return true; } -- cgit From 8801b3fcb5744dc536272ba4ccfd6faca9b46705 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 20 Jan 2017 21:29:46 +0100 Subject: x86/microcode/AMD: Rework container parsing It was pretty clumsy before and the whole work of parsing the microcode containers was spread around the functions wrongly. Clean it up so that there's a main scan_containers() function which iterates over the microcode blob and picks apart the containers glued together. For each container, it calls a parse_container() helper which concentrates on one container only: sanity-checking, parsing, counting microcode patches in there, etc. It makes much more sense now and it is actually very readable. Oh, and we luvz a diffstat removing more crap than adding. Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/20170120202955.4091-8-bp@alien8.de Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/microcode/amd.c | 238 ++++++++++++++++-------------------- 1 file changed, 105 insertions(+), 133 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 4e2238345308..9d5f4f3626f7 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -64,43 +64,6 @@ static u16 this_equiv_id; static const char ucode_path[] __maybe_unused = "kernel/x86/microcode/AuthenticAMD.bin"; -static size_t compute_container_size(u8 *data, u32 total_size) -{ - size_t size = 0; - u32 *header = (u32 *)data; - - if (header[0] != UCODE_MAGIC || - header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */ - header[2] == 0) /* size */ - return size; - - size = header[2] + CONTAINER_HDR_SZ; - total_size -= size; - data += size; - - while (total_size) { - u16 patch_size; - - header = (u32 *)data; - - if (header[0] != UCODE_UCODE_TYPE) - break; - - /* - * Sanity-check patch size. - */ - patch_size = header[1]; - if (patch_size > PATCH_MAX_SIZE) - break; - - size += patch_size + SECTION_HDR_SIZE; - data += patch_size + SECTION_HDR_SIZE; - total_size -= patch_size + SECTION_HDR_SIZE; - } - - return size; -} - static u16 find_equiv_id(struct equiv_cpu_entry *equiv_table, u32 sig) { for (; equiv_table && equiv_table->installed_cpu; equiv_table++) { @@ -115,80 +78,106 @@ static u16 find_equiv_id(struct equiv_cpu_entry *equiv_table, u32 sig) * This scans the ucode blob for the proper container as we can have multiple * containers glued together. Returns the equivalence ID from the equivalence * table or 0 if none found. + * Returns the amount of bytes consumed while scanning. @desc contains all the + * data we're going to use in later stages of the application. 
*/ -static u16 -find_proper_container(u8 *ucode, size_t size, struct cont_desc *desc) +static ssize_t parse_container(u8 *ucode, ssize_t size, struct cont_desc *desc) { - struct cont_desc ret = { 0 }; - u32 eax, ebx, ecx, edx; struct equiv_cpu_entry *eq; - int offset, left; - u16 eq_id = 0; - u32 *header; - u8 *data; + ssize_t orig_size = size; + u32 *hdr = (u32 *)ucode; + u32 eax, ebx, ecx, edx; + u16 eq_id; + u8 *buf; - data = ucode; - left = size; - header = (u32 *)data; + /* Am I looking at an equivalence table header? */ + if (hdr[0] != UCODE_MAGIC || + hdr[1] != UCODE_EQUIV_CPU_TABLE_TYPE || + hdr[2] == 0) { + desc->eq_id = 0; + return CONTAINER_HDR_SZ; + } + buf = ucode; - /* find equiv cpu table */ - if (header[0] != UCODE_MAGIC || - header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */ - header[2] == 0) /* size */ - return eq_id; + eq = (struct equiv_cpu_entry *)(buf + CONTAINER_HDR_SZ); - eax = 0x00000001; + eax = 1; ecx = 0; native_cpuid(&eax, &ebx, &ecx, &edx); - while (left > 0) { - eq = (struct equiv_cpu_entry *)(data + CONTAINER_HDR_SZ); - - ret.data = data; + /* Find the equivalence ID of our CPU in this table: */ + eq_id = find_equiv_id(eq, eax); - /* Advance past the container header */ - offset = header[2] + CONTAINER_HDR_SZ; - data += offset; - left -= offset; + buf += hdr[2] + CONTAINER_HDR_SZ; + size -= hdr[2] + CONTAINER_HDR_SZ; - eq_id = find_equiv_id(eq, eax); - if (eq_id) { - ret.size = compute_container_size(ret.data, left + offset); + /* + * Scan through the rest of the container to find where it ends. We do + * some basic sanity-checking too. + */ + while (size > 0) { + struct microcode_amd *mc; + u32 patch_size; - /* - * truncate how much we need to iterate over in the - * ucode update loop below - */ - left = ret.size - offset; + hdr = (u32 *)buf; - *desc = ret; - return eq_id; - } + if (hdr[0] != UCODE_UCODE_TYPE) + break; - /* - * support multiple container files appended together. if this - * one does not have a matching equivalent cpu entry, we fast - * forward to the next container file. - */ - while (left > 0) { - header = (u32 *)data; + /* Sanity-check patch size. */ + patch_size = hdr[1]; + if (patch_size > PATCH_MAX_SIZE) + break; - if (header[0] == UCODE_MAGIC && - header[1] == UCODE_EQUIV_CPU_TABLE_TYPE) - break; + /* Skip patch section header: */ + buf += SECTION_HDR_SIZE; + size -= SECTION_HDR_SIZE; - offset = header[1] + SECTION_HDR_SIZE; - data += offset; - left -= offset; + mc = (struct microcode_amd *)buf; + if (eq_id == mc->hdr.processor_rev_id) { + desc->psize = patch_size; + desc->mc = mc; } - /* mark where the next microcode container file starts */ - offset = data - (u8 *)ucode; - ucode = data; + buf += patch_size; + size -= patch_size; } - return eq_id; + /* + * If we have found a patch (desc->mc), it means we're looking at the + * container which has a patch for this CPU so return 0 to mean, @ucode + * already points to the proper container. Otherwise, we return the size + * we scanned so that we can advance to the next container in the + * buffer. + */ + if (desc->mc) { + desc->eq_id = eq_id; + desc->data = ucode; + desc->size = orig_size - size; + + return 0; + } + + return orig_size - size; +} + +/* + * Scan the ucode blob for the proper container as we can have multiple + * containers glued together. 
+ */ +static void scan_containers(u8 *ucode, size_t size, struct cont_desc *desc) +{ + ssize_t rem = size; + + while (rem >= 0) { + ssize_t s = parse_container(ucode, rem, desc); + if (!s) + return; + + ucode += s; + rem -= s; + } } static int __apply_microcode_amd(struct microcode_amd *mc) @@ -214,17 +203,16 @@ static int __apply_microcode_amd(struct microcode_amd *mc) * load_microcode_amd() to save equivalent cpu table and microcode patches in * kernel heap memory. * - * Returns true if container found (sets @ret_cont), false otherwise. + * Returns true if container found (sets @desc), false otherwise. */ static bool apply_microcode_early_amd(void *ucode, size_t size, bool save_patch, - struct cont_desc *desc) + struct cont_desc *ret_desc) { + struct cont_desc desc = { 0 }; u8 (*patch)[PATCH_MAX_SIZE]; - u32 rev, *header, *new_rev; - struct cont_desc ret; - int offset, left; - u16 eq_id = 0; - u8 *data; + struct microcode_amd *mc; + u32 rev, *new_rev; + bool ret = false; #ifdef CONFIG_X86_32 new_rev = (u32 *)__pa_nodebug(&ucode_new_rev); @@ -237,47 +225,31 @@ static bool apply_microcode_early_amd(void *ucode, size_t size, bool save_patch, if (check_current_patch_level(&rev, true)) return false; - eq_id = find_proper_container(ucode, size, &ret); - if (!eq_id) - return false; - - this_equiv_id = eq_id; - header = (u32 *)ret.data; - - /* We're pointing to an equiv table, skip over it. */ - data = ret.data + header[2] + CONTAINER_HDR_SZ; - left = ret.size - (header[2] + CONTAINER_HDR_SZ); - - while (left > 0) { - struct microcode_amd *mc; - - header = (u32 *)data; - if (header[0] != UCODE_UCODE_TYPE || /* type */ - header[1] == 0) /* size */ - break; + scan_containers(ucode, size, &desc); + if (!desc.eq_id) + return ret; - mc = (struct microcode_amd *)(data + SECTION_HDR_SIZE); + this_equiv_id = desc.eq_id; - if (eq_id == mc->hdr.processor_rev_id && rev < mc->hdr.patch_id) { + mc = desc.mc; + if (!mc) + return ret; - if (!__apply_microcode_amd(mc)) { - rev = mc->hdr.patch_id; - *new_rev = rev; + if (rev >= mc->hdr.patch_id) + return ret; - if (save_patch) - memcpy(patch, mc, min_t(u32, header[1], PATCH_MAX_SIZE)); - } - } + if (!__apply_microcode_amd(mc)) { + *new_rev = mc->hdr.patch_id; + ret = true; - offset = header[1] + SECTION_HDR_SIZE; - data += offset; - left -= offset; + if (save_patch) + memcpy(patch, mc, min_t(u32, desc.psize, PATCH_MAX_SIZE)); } - if (desc) - *desc = ret; + if (ret_desc) + *ret_desc = desc; - return true; + return ret; } static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family) @@ -396,6 +368,7 @@ reget: } if (!apply_microcode_early_amd(cp.data, cp.size, false, &cont)) { + cont.data = NULL; cont.size = -1; return; } @@ -434,7 +407,6 @@ int __init save_microcode_in_initrd_amd(unsigned int fam) { enum ucode_state ret; int retval = 0; - u16 eq_id; if (!cont.data) { if (IS_ENABLED(CONFIG_X86_32) && (cont.size != -1)) { @@ -450,8 +422,8 @@ int __init save_microcode_in_initrd_amd(unsigned int fam) return -EINVAL; } - eq_id = find_proper_container(cp.data, cp.size, &cont); - if (!eq_id) { + scan_containers(cp.data, cp.size, &cont); + if (!cont.eq_id) { cont.size = -1; return -EINVAL; } -- cgit From 309aac77768c0c689eac4900ca8a6e9ef470035c Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 20 Jan 2017 21:29:47 +0100 Subject: x86/microcode: Decrease CPUID use Get CPUID(1).EAX value once per CPU and propagate value into the callers instead of conveniently calling it every time. 
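As a reminder of what the callers do with the propagated value, a simplified sketch of the family decode applied to CPUID(1).EAX, mirroring the x86_family() helper used above; the function name here is made up for illustration:

static unsigned int family_from_cpuid_1_eax(unsigned int sig)
{
	unsigned int fam = (sig >> 8) & 0xf;	/* base family field */

	if (fam == 0xf)
		fam += (sig >> 20) & 0xff;	/* add the extended family field */

	return fam;	/* e.g. >= 0x10 selects the AMD loader paths above */
}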
Signed-off-by: Borislav Petkov Reviewed-by: Thomas Gleixner Link: http://lkml.kernel.org/r/20170120202955.4091-9-bp@alien8.de Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/microcode/amd.c | 52 +++++++++++++++++------------------- arch/x86/kernel/cpu/microcode/core.c | 38 ++++++++++---------------- 2 files changed, 38 insertions(+), 52 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 9d5f4f3626f7..9fb398e64b0a 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -47,6 +47,7 @@ static struct equiv_cpu_entry *equiv_cpu_table; */ static struct cont_desc { struct microcode_amd *mc; + u32 cpuid_1_eax; u32 psize; u16 eq_id; u8 *data; @@ -86,7 +87,6 @@ static ssize_t parse_container(u8 *ucode, ssize_t size, struct cont_desc *desc) struct equiv_cpu_entry *eq; ssize_t orig_size = size; u32 *hdr = (u32 *)ucode; - u32 eax, ebx, ecx, edx; u16 eq_id; u8 *buf; @@ -102,12 +102,8 @@ static ssize_t parse_container(u8 *ucode, ssize_t size, struct cont_desc *desc) eq = (struct equiv_cpu_entry *)(buf + CONTAINER_HDR_SZ); - eax = 1; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); - /* Find the equivalence ID of our CPU in this table: */ - eq_id = find_equiv_id(eq, eax); + eq_id = find_equiv_id(eq, desc->cpuid_1_eax); buf += hdr[2] + CONTAINER_HDR_SZ; size -= hdr[2] + CONTAINER_HDR_SZ; @@ -205,8 +201,9 @@ static int __apply_microcode_amd(struct microcode_amd *mc) * * Returns true if container found (sets @desc), false otherwise. */ -static bool apply_microcode_early_amd(void *ucode, size_t size, bool save_patch, - struct cont_desc *ret_desc) +static bool +apply_microcode_early_amd(u32 cpuid_1_eax, void *ucode, size_t size, + bool save_patch, struct cont_desc *ret_desc) { struct cont_desc desc = { 0 }; u8 (*patch)[PATCH_MAX_SIZE]; @@ -225,6 +222,8 @@ static bool apply_microcode_early_amd(void *ucode, size_t size, bool save_patch, if (check_current_patch_level(&rev, true)) return false; + desc.cpuid_1_eax = cpuid_1_eax; + scan_containers(ucode, size, &desc); if (!desc.eq_id) return ret; @@ -267,10 +266,9 @@ static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family) #endif } -void __init load_ucode_amd_bsp(unsigned int family) +void __init load_ucode_amd_bsp(unsigned int cpuid_1_eax) { struct ucode_cpu_info *uci; - u32 eax, ebx, ecx, edx; struct cpio_data cp; const char *path; bool use_pa; @@ -285,19 +283,16 @@ void __init load_ucode_amd_bsp(unsigned int family) use_pa = false; } - if (!get_builtin_microcode(&cp, family)) + if (!get_builtin_microcode(&cp, x86_family(cpuid_1_eax))) cp = find_microcode_in_initrd(path, use_pa); if (!(cp.data && cp.size)) return; - /* Get BSP's CPUID.EAX(1), needed in load_microcode_amd() */ - eax = 1; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); - uci->cpu_sig.sig = eax; + /* Needed in load_microcode_amd() */ + uci->cpu_sig.sig = cpuid_1_eax; - apply_microcode_early_amd(cp.data, cp.size, true, NULL); + apply_microcode_early_amd(cpuid_1_eax, cp.data, cp.size, true, NULL); } #ifdef CONFIG_X86_32 @@ -308,7 +303,7 @@ void __init load_ucode_amd_bsp(unsigned int family) * In save_microcode_in_initrd_amd() BSP's patch is copied to amd_ucode_patch, * which is used upon resume from suspend. 
*/ -void load_ucode_amd_ap(unsigned int family) +void load_ucode_amd_ap(unsigned int cpuid_1_eax) { struct microcode_amd *mc; struct cpio_data cp; @@ -319,7 +314,7 @@ void load_ucode_amd_ap(unsigned int family) return; } - if (!get_builtin_microcode(&cp, family)) + if (!get_builtin_microcode(&cp, x86_family(cpuid_1_eax))) cp = find_microcode_in_initrd((const char *)__pa_nodebug(ucode_path), true); if (!(cp.data && cp.size)) @@ -329,14 +324,14 @@ void load_ucode_amd_ap(unsigned int family) * This would set amd_ucode_patch above so that the following APs can * use it directly instead of going down this path again. */ - apply_microcode_early_amd(cp.data, cp.size, true, NULL); + apply_microcode_early_amd(cpuid_1_eax, cp.data, cp.size, true, NULL); } #else -void load_ucode_amd_ap(unsigned int family) +void load_ucode_amd_ap(unsigned int cpuid_1_eax) { struct equiv_cpu_entry *eq; struct microcode_amd *mc; - u32 rev, eax; + u32 rev; u16 eq_id; /* 64-bit runs with paging enabled, thus early==false. */ @@ -351,7 +346,7 @@ void load_ucode_amd_ap(unsigned int family) return; reget: - if (!get_builtin_microcode(&cp, family)) { + if (!get_builtin_microcode(&cp, x86_family(cpuid_1_eax))) { #ifdef CONFIG_BLK_DEV_INITRD cp = find_cpio_data(ucode_path, (void *)initrd_start, initrd_end - initrd_start, NULL); @@ -367,17 +362,16 @@ reget: } } - if (!apply_microcode_early_amd(cp.data, cp.size, false, &cont)) { + if (!apply_microcode_early_amd(cpuid_1_eax, cp.data, cp.size, false, &cont)) { cont.data = NULL; cont.size = -1; return; } } - eax = cpuid_eax(0x00000001); eq = (struct equiv_cpu_entry *)(cont.data + CONTAINER_HDR_SZ); - eq_id = find_equiv_id(eq, eax); + eq_id = find_equiv_id(eq, cpuid_1_eax); if (!eq_id) return; @@ -403,7 +397,7 @@ reget: static enum ucode_state load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size); -int __init save_microcode_in_initrd_amd(unsigned int fam) +int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax) { enum ucode_state ret; int retval = 0; @@ -422,6 +416,8 @@ int __init save_microcode_in_initrd_amd(unsigned int fam) return -EINVAL; } + cont.cpuid_1_eax = cpuid_1_eax; + scan_containers(cp.data, cp.size, &cont); if (!cont.eq_id) { cont.size = -1; @@ -432,7 +428,7 @@ int __init save_microcode_in_initrd_amd(unsigned int fam) return -EINVAL; } - ret = load_microcode_amd(smp_processor_id(), fam, cont.data, cont.size); + ret = load_microcode_amd(smp_processor_id(), x86_family(cpuid_1_eax), cont.data, cont.size); if (ret != UCODE_OK) retval = -EINVAL; diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 2af69d27da62..437996c9be67 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -64,10 +64,6 @@ static DEFINE_MUTEX(microcode_mutex); struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; -/* - * Operations that are run on a target cpu: - */ - struct cpu_info_ctx { struct cpu_signature *cpu_sig; int err; @@ -76,7 +72,6 @@ struct cpu_info_ctx { static bool __init check_loader_disabled_bsp(void) { static const char *__dis_opt_str = "dis_ucode_ldr"; - u32 a, b, c, d; #ifdef CONFIG_X86_32 const char *cmdline = (const char *)__pa_nodebug(boot_command_line); @@ -92,16 +87,12 @@ static bool __init check_loader_disabled_bsp(void) if (!have_cpuid_p()) return *res; - a = 1; - c = 0; - native_cpuid(&a, &b, &c, &d); - /* * CPUID(1).ECX[31]: reserved for hypervisor use. 
This is still not * completely accurate as xen pv guests don't see that CPUID bit set but * that's good enough as they don't land on the BSP path anyway. */ - if (c & BIT(31)) + if (native_cpuid_ecx(1) & BIT(31)) return *res; if (cmdline_find_option_bool(cmdline, option) <= 0) @@ -131,23 +122,22 @@ bool get_builtin_firmware(struct cpio_data *cd, const char *name) void __init load_ucode_bsp(void) { - int vendor; - unsigned int family; + unsigned int vendor, cpuid_1_eax; if (check_loader_disabled_bsp()) return; - vendor = x86_cpuid_vendor(); - family = x86_cpuid_family(); + vendor = x86_cpuid_vendor(); + cpuid_1_eax = native_cpuid_eax(1); switch (vendor) { case X86_VENDOR_INTEL: - if (family >= 6) + if (x86_family(cpuid_1_eax) >= 6) load_ucode_intel_bsp(); break; case X86_VENDOR_AMD: - if (family >= 0x10) - load_ucode_amd_bsp(family); + if (x86_family(cpuid_1_eax) >= 0x10) + load_ucode_amd_bsp(cpuid_1_eax); break; default: break; @@ -165,22 +155,22 @@ static bool check_loader_disabled_ap(void) void load_ucode_ap(void) { - int vendor, family; + unsigned int vendor, cpuid_1_eax; if (check_loader_disabled_ap()) return; - vendor = x86_cpuid_vendor(); - family = x86_cpuid_family(); + vendor = x86_cpuid_vendor(); + cpuid_1_eax = native_cpuid_eax(1); switch (vendor) { case X86_VENDOR_INTEL: - if (family >= 6) + if (x86_family(cpuid_1_eax) >= 6) load_ucode_intel_ap(); break; case X86_VENDOR_AMD: - if (family >= 0x10) - load_ucode_amd_ap(family); + if (x86_family(cpuid_1_eax) >= 0x10) + load_ucode_amd_ap(cpuid_1_eax); break; default: break; @@ -198,7 +188,7 @@ static int __init save_microcode_in_initrd(void) break; case X86_VENDOR_AMD: if (c->x86 >= 0x10) - return save_microcode_in_initrd_amd(c->x86); + return save_microcode_in_initrd_amd(cpuid_eax(1)); break; default: break; -- cgit From 3da9b41794590022d09caa345aaa7c812ac22971 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 20 Jan 2017 21:29:48 +0100 Subject: x86/microcode/AMD: Get rid of global this_equiv_id We have a container which we update/prepare each time before applying a microcode patch instead of using a global. Signed-off-by: Borislav Petkov Reviewed-by: Thomas Gleixner Link: http://lkml.kernel.org/r/20170120202955.4091-10-bp@alien8.de Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/microcode/amd.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 9fb398e64b0a..4cbfe70cf458 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -56,7 +56,6 @@ static struct cont_desc { static u32 ucode_new_rev; static u8 amd_ucode_patch[PATCH_MAX_SIZE]; -static u16 this_equiv_id; /* * Microcode patch container file is prepended to the initrd in cpio @@ -228,8 +227,6 @@ apply_microcode_early_amd(u32 cpuid_1_eax, void *ucode, size_t size, if (!desc.eq_id) return ret; - this_equiv_id = desc.eq_id; - mc = desc.mc; if (!mc) return ret; @@ -375,7 +372,7 @@ reget: if (!eq_id) return; - if (eq_id == this_equiv_id) { + if (eq_id == cont.eq_id) { mc = (struct microcode_amd *)amd_ucode_patch; if (mc && rev < mc->hdr.patch_id) { -- cgit From 8cc26e0b4c49246564f773edbbefa3d5dc91d56e Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 20 Jan 2017 21:29:49 +0100 Subject: x86/microcode/AMD: Use find_microcode_in_initrd() Use the generic helper instead of semi-open-coding the procedure. 
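For readers skimming the log, the "semi-open-coding" being replaced boils down to the following before/after, distilled from the hunks below (surrounding error handling trimmed; this is just the shape, not a drop-in):

	/* Before: open-coded initrd scan, hidden behind CONFIG_BLK_DEV_INITRD */
	struct cpio_data cp = { NULL, 0, "" };
#ifdef CONFIG_BLK_DEV_INITRD
	cp = find_cpio_data(ucode_path, (void *)initrd_start,
			    initrd_end - initrd_start, NULL);
#endif

	/* After: one call to the generic helper; the second argument is use_pa,
	 * false here because these callers run with paging enabled. */
	struct cpio_data cp = find_microcode_in_initrd(ucode_path, false);

	if (!(cp.data && cp.size))
		return -EINVAL;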
Signed-off-by: Borislav Petkov Reviewed-by: Thomas Gleixner Link: http://lkml.kernel.org/r/20170120202955.4091-11-bp@alien8.de Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/microcode/amd.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 4cbfe70cf458..7727f278de58 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -337,17 +337,15 @@ void load_ucode_amd_ap(unsigned int cpuid_1_eax) /* First AP hasn't cached it yet, go through the blob. */ if (!cont.data) { - struct cpio_data cp = { NULL, 0, "" }; + struct cpio_data cp; if (cont.size == -1) return; reget: if (!get_builtin_microcode(&cp, x86_family(cpuid_1_eax))) { -#ifdef CONFIG_BLK_DEV_INITRD - cp = find_cpio_data(ucode_path, (void *)initrd_start, - initrd_end - initrd_start, NULL); -#endif + cp = find_microcode_in_initrd(ucode_path, false); + if (!(cp.data && cp.size)) { /* * Mark it so that other APs do not scan again @@ -401,13 +399,9 @@ int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax) if (!cont.data) { if (IS_ENABLED(CONFIG_X86_32) && (cont.size != -1)) { - struct cpio_data cp = { NULL, 0, "" }; - -#ifdef CONFIG_BLK_DEV_INITRD - cp = find_cpio_data(ucode_path, (void *)initrd_start, - initrd_end - initrd_start, NULL); -#endif + struct cpio_data cp; + cp = find_microcode_in_initrd(ucode_path, false); if (!(cp.data && cp.size)) { cont.size = -1; return -EINVAL; -- cgit From 7a93a40be23e5557934d773cc89b7b3627c08097 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 20 Jan 2017 21:29:50 +0100 Subject: x86/microcode: Remove local vendor variable Use x86_cpuid_vendor() directly. No functionality change. Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/20170120202955.4091-12-bp@alien8.de Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/microcode/core.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 437996c9be67..dc54518299c4 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -122,15 +122,14 @@ bool get_builtin_firmware(struct cpio_data *cd, const char *name) void __init load_ucode_bsp(void) { - unsigned int vendor, cpuid_1_eax; + unsigned int cpuid_1_eax; if (check_loader_disabled_bsp()) return; - vendor = x86_cpuid_vendor(); cpuid_1_eax = native_cpuid_eax(1); - switch (vendor) { + switch (x86_cpuid_vendor()) { case X86_VENDOR_INTEL: if (x86_family(cpuid_1_eax) >= 6) load_ucode_intel_bsp(); @@ -155,15 +154,14 @@ static bool check_loader_disabled_ap(void) void load_ucode_ap(void) { - unsigned int vendor, cpuid_1_eax; + unsigned int cpuid_1_eax; if (check_loader_disabled_ap()) return; - vendor = x86_cpuid_vendor(); cpuid_1_eax = native_cpuid_eax(1); - switch (vendor) { + switch (x86_cpuid_vendor()) { case X86_VENDOR_INTEL: if (x86_family(cpuid_1_eax) >= 6) load_ucode_intel_ap(); -- cgit From f3ad136d6ef966c8ba9090770c2bfe7e85f18471 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 20 Jan 2017 21:29:51 +0100 Subject: x86/microcode/AMD: Check patch level only on the BSP Check final patch levels for AMD only on the BSP. This way, we decide early and only once whether to continue loading or to leave the loader disabled on such systems. Simplify a lot. 
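Roughly, "decide early and only once" ends up looking like the sketch below, condensed from the core.c hunk further down; final_levels[] is the small table of known-final AMD patch levels which this patch moves into core.c:

static bool amd_check_current_patch_level(void)
{
	u32 lvl, dummy, i;
	u32 *levels;

	/* Read the currently applied patch level once, on the BSP */
	native_rdmsr(MSR_AMD64_PATCH_LEVEL, lvl, dummy);

	/* 32-bit runs this before paging is enabled, hence __pa_nodebug() */
	if (IS_ENABLED(CONFIG_X86_32))
		levels = (u32 *)__pa_nodebug(&final_levels);
	else
		levels = final_levels;

	for (i = 0; levels[i]; i++)
		if (lvl == levels[i])
			return true;	/* final level: leave the loader disabled */

	return false;
}

	/* ... with a single call site in check_loader_disabled_bsp(): */
	if (x86_cpuid_vendor() == X86_VENDOR_AMD &&
	    amd_check_current_patch_level())
		return *res;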
Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/20170120202955.4091-13-bp@alien8.de Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/microcode_amd.h | 2 - arch/x86/kernel/cpu/microcode/amd.c | 78 +++++------------------------------- arch/x86/kernel/cpu/microcode/core.c | 41 +++++++++++++++++++ 3 files changed, 52 insertions(+), 69 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/microcode_amd.h b/arch/x86/include/asm/microcode_amd.h index 3e3e20be829a..3d57009e168b 100644 --- a/arch/x86/include/asm/microcode_amd.h +++ b/arch/x86/include/asm/microcode_amd.h @@ -54,6 +54,4 @@ static inline int __init save_microcode_in_initrd_amd(unsigned int family) { return -EINVAL; } void reload_ucode_amd(void) {} #endif - -extern bool check_current_patch_level(u32 *rev, bool early); #endif /* _ASM_X86_MICROCODE_AMD_H */ diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 7727f278de58..61743476c25b 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -207,7 +207,7 @@ apply_microcode_early_amd(u32 cpuid_1_eax, void *ucode, size_t size, struct cont_desc desc = { 0 }; u8 (*patch)[PATCH_MAX_SIZE]; struct microcode_amd *mc; - u32 rev, *new_rev; + u32 rev, dummy, *new_rev; bool ret = false; #ifdef CONFIG_X86_32 @@ -218,9 +218,6 @@ apply_microcode_early_amd(u32 cpuid_1_eax, void *ucode, size_t size, patch = &amd_ucode_patch; #endif - if (check_current_patch_level(&rev, true)) - return false; - desc.cpuid_1_eax = cpuid_1_eax; scan_containers(ucode, size, &desc); @@ -231,6 +228,7 @@ apply_microcode_early_amd(u32 cpuid_1_eax, void *ucode, size_t size, if (!mc) return ret; + native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); if (rev >= mc->hdr.patch_id) return ret; @@ -328,13 +326,8 @@ void load_ucode_amd_ap(unsigned int cpuid_1_eax) { struct equiv_cpu_entry *eq; struct microcode_amd *mc; - u32 rev; u16 eq_id; - /* 64-bit runs with paging enabled, thus early==false. */ - if (check_current_patch_level(&rev, false)) - return; - /* First AP hasn't cached it yet, go through the blob. */ if (!cont.data) { struct cpio_data cp; @@ -371,6 +364,10 @@ reget: return; if (eq_id == cont.eq_id) { + u32 rev, dummy; + + native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); + mc = (struct microcode_amd *)amd_ucode_patch; if (mc && rev < mc->hdr.patch_id) { @@ -436,19 +433,14 @@ int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax) void reload_ucode_amd(void) { struct microcode_amd *mc; - u32 rev; - - /* - * early==false because this is a syscore ->resume path and by - * that time paging is long enabled. - */ - if (check_current_patch_level(&rev, false)) - return; + u32 rev, dummy; mc = (struct microcode_amd *)amd_ucode_patch; if (!mc) return; + rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); + if (rev < mc->hdr.patch_id) { if (!__apply_microcode_amd(mc)) { ucode_new_rev = mc->hdr.patch_id; @@ -586,60 +578,13 @@ static unsigned int verify_patch_size(u8 family, u32 patch_size, return patch_size; } -/* - * Those patch levels cannot be updated to newer ones and thus should be final. - */ -static u32 final_levels[] = { - 0x01000098, - 0x0100009f, - 0x010000af, - 0, /* T-101 terminator */ -}; - -/* - * Check the current patch level on this CPU. - * - * @rev: Use it to return the patch level. It is set to 0 in the case of - * error. 
- * - * Returns: - * - true: if update should stop - * - false: otherwise - */ -bool check_current_patch_level(u32 *rev, bool early) -{ - u32 lvl, dummy, i; - bool ret = false; - u32 *levels; - - native_rdmsr(MSR_AMD64_PATCH_LEVEL, lvl, dummy); - - if (IS_ENABLED(CONFIG_X86_32) && early) - levels = (u32 *)__pa_nodebug(&final_levels); - else - levels = final_levels; - - for (i = 0; levels[i]; i++) { - if (lvl == levels[i]) { - lvl = 0; - ret = true; - break; - } - } - - if (rev) - *rev = lvl; - - return ret; -} - static int apply_microcode_amd(int cpu) { struct cpuinfo_x86 *c = &cpu_data(cpu); struct microcode_amd *mc_amd; struct ucode_cpu_info *uci; struct ucode_patch *p; - u32 rev; + u32 rev, dummy; BUG_ON(raw_smp_processor_id() != cpu); @@ -652,8 +597,7 @@ static int apply_microcode_amd(int cpu) mc_amd = p->data; uci->mc = p->data; - if (check_current_patch_level(&rev, false)) - return -1; + rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); /* need to apply patch? */ if (rev >= mc_amd->hdr.patch_id) { diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index dc54518299c4..3b74d2f315d3 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -69,6 +69,42 @@ struct cpu_info_ctx { int err; }; +/* + * Those patch levels cannot be updated to newer ones and thus should be final. + */ +static u32 final_levels[] = { + 0x01000098, + 0x0100009f, + 0x010000af, + 0, /* T-101 terminator */ +}; + +/* + * Check the current patch level on this CPU. + * + * Returns: + * - true: if update should stop + * - false: otherwise + */ +static bool amd_check_current_patch_level(void) +{ + u32 lvl, dummy, i; + u32 *levels; + + native_rdmsr(MSR_AMD64_PATCH_LEVEL, lvl, dummy); + + if (IS_ENABLED(CONFIG_X86_32)) + levels = (u32 *)__pa_nodebug(&final_levels); + else + levels = final_levels; + + for (i = 0; levels[i]; i++) { + if (lvl == levels[i]) + return true; + } + return false; +} + static bool __init check_loader_disabled_bsp(void) { static const char *__dis_opt_str = "dis_ucode_ldr"; @@ -95,6 +131,11 @@ static bool __init check_loader_disabled_bsp(void) if (native_cpuid_ecx(1) & BIT(31)) return *res; + if (x86_cpuid_vendor() == X86_VENDOR_AMD) { + if (amd_check_current_patch_level()) + return *res; + } + if (cmdline_find_option_bool(cmdline, option) <= 0) *res = false; -- cgit From e71bb4ec073901ad50bfa86fed74fce7ac3210fe Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 20 Jan 2017 21:29:52 +0100 Subject: x86/microcode/AMD: Unify load_ucode_amd_ap() Use a version for both bitness by adding a helper which does the actual container finding and parsing which can be used on any CPU - BSP or AP. Streamlines the paths more. 
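In shape, the unification amounts to something like this (simplified from the amd.c hunks below; only the BSP entry point is shown, the AP one differs in how it caches and applies the patch):

/* One helper does the container lookup for any CPU, BSP or AP: */
void __load_ucode_amd(unsigned int cpuid_1_eax, struct cpio_data *ret)
{
	struct cpio_data cp;
	const char *path;	/* ucode_path, via __pa_nodebug() on 32-bit */
	bool use_pa;		/* true on 32-bit, where paging is still off */

	/* path/use_pa come from the same 32- vs 64-bit check as before (elided);
	 * the real helper also stashes cpuid_1_eax in uci->cpu_sig.sig. */

	/* builtin firmware first, then the initrd */
	if (!get_builtin_microcode(&cp, x86_family(cpuid_1_eax)))
		cp = find_microcode_in_initrd(path, use_pa);

	*ret = cp;
}

void __init load_ucode_amd_bsp(unsigned int cpuid_1_eax)
{
	struct cpio_data cp = { };

	__load_ucode_amd(cpuid_1_eax, &cp);
	if (!(cp.data && cp.size))
		return;

	apply_microcode_early_amd(cpuid_1_eax, cp.data, cp.size, true, NULL);
}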
Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/20170120202955.4091-14-bp@alien8.de Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/microcode/amd.c | 81 ++++++++++++++----------------------- 1 file changed, 31 insertions(+), 50 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 61743476c25b..fe9e865480af 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -261,7 +261,7 @@ static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family) #endif } -void __init load_ucode_amd_bsp(unsigned int cpuid_1_eax) +void __load_ucode_amd(unsigned int cpuid_1_eax, struct cpio_data *ret) { struct ucode_cpu_info *uci; struct cpio_data cp; @@ -281,89 +281,71 @@ void __init load_ucode_amd_bsp(unsigned int cpuid_1_eax) if (!get_builtin_microcode(&cp, x86_family(cpuid_1_eax))) cp = find_microcode_in_initrd(path, use_pa); - if (!(cp.data && cp.size)) - return; - /* Needed in load_microcode_amd() */ uci->cpu_sig.sig = cpuid_1_eax; - apply_microcode_early_amd(cpuid_1_eax, cp.data, cp.size, true, NULL); + *ret = cp; } -#ifdef CONFIG_X86_32 -/* - * On 32-bit, since AP's early load occurs before paging is turned on, we - * cannot traverse cpu_equiv_table and microcode_cache in kernel heap memory. - * So during cold boot, AP will apply_ucode_in_initrd() just like the BSP. - * In save_microcode_in_initrd_amd() BSP's patch is copied to amd_ucode_patch, - * which is used upon resume from suspend. - */ -void load_ucode_amd_ap(unsigned int cpuid_1_eax) +void __init load_ucode_amd_bsp(unsigned int cpuid_1_eax) { - struct microcode_amd *mc; - struct cpio_data cp; + struct cpio_data cp = { }; - mc = (struct microcode_amd *)__pa_nodebug(amd_ucode_patch); - if (mc->hdr.patch_id && mc->hdr.processor_rev_id) { - __apply_microcode_amd(mc); - return; - } - - if (!get_builtin_microcode(&cp, x86_family(cpuid_1_eax))) - cp = find_microcode_in_initrd((const char *)__pa_nodebug(ucode_path), true); + __load_ucode_amd(cpuid_1_eax, &cp); if (!(cp.data && cp.size)) return; - /* - * This would set amd_ucode_patch above so that the following APs can - * use it directly instead of going down this path again. - */ apply_microcode_early_amd(cpuid_1_eax, cp.data, cp.size, true, NULL); } -#else + void load_ucode_amd_ap(unsigned int cpuid_1_eax) { struct equiv_cpu_entry *eq; struct microcode_amd *mc; + struct cont_desc *desc; u16 eq_id; + if (IS_ENABLED(CONFIG_X86_32)) { + mc = (struct microcode_amd *)__pa_nodebug(amd_ucode_patch); + desc = (struct cont_desc *)__pa_nodebug(&cont); + } else { + mc = (struct microcode_amd *)amd_ucode_patch; + desc = &cont; + } + /* First AP hasn't cached it yet, go through the blob. */ - if (!cont.data) { - struct cpio_data cp; + if (!desc->data) { + struct cpio_data cp = { }; - if (cont.size == -1) + if (desc->size == -1) return; reget: - if (!get_builtin_microcode(&cp, x86_family(cpuid_1_eax))) { - cp = find_microcode_in_initrd(ucode_path, false); - - if (!(cp.data && cp.size)) { - /* - * Mark it so that other APs do not scan again - * for no real reason and slow down boot - * needlessly. - */ - cont.size = -1; - return; - } + __load_ucode_amd(cpuid_1_eax, &cp); + if (!(cp.data && cp.size)) { + /* + * Mark it so that other APs do not scan again for no + * real reason and slow down boot needlessly. 
+ */ + desc->size = -1; + return; } - if (!apply_microcode_early_amd(cpuid_1_eax, cp.data, cp.size, false, &cont)) { - cont.data = NULL; - cont.size = -1; + if (!apply_microcode_early_amd(cpuid_1_eax, cp.data, cp.size, false, desc)) { + desc->data = NULL; + desc->size = -1; return; } } - eq = (struct equiv_cpu_entry *)(cont.data + CONTAINER_HDR_SZ); + eq = (struct equiv_cpu_entry *)(desc->data + CONTAINER_HDR_SZ); eq_id = find_equiv_id(eq, cpuid_1_eax); if (!eq_id) return; - if (eq_id == cont.eq_id) { + if (eq_id == desc->eq_id) { u32 rev, dummy; native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); @@ -384,7 +366,6 @@ reget: goto reget; } } -#endif /* CONFIG_X86_32 */ static enum ucode_state load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size); -- cgit From 72edfe950b36308353e27cdc02f334431239938a Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 20 Jan 2017 21:29:53 +0100 Subject: x86/microcode/AMD: Simplify saving from initrd No need to use the previously stashed info in the container - simply go ahead and parse the initrd once more. It simplifies and streamlines the code a whole lot. Signed-off-by: Borislav Petkov Reviewed-by: Thomas Gleixner Link: http://lkml.kernel.org/r/20170120202955.4091-15-bp@alien8.de Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/microcode/amd.c | 43 +++++++++++-------------------------- 1 file changed, 13 insertions(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index fe9e865480af..2a194e384207 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -372,43 +372,26 @@ load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size); int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax) { + struct cont_desc desc = { 0 }; enum ucode_state ret; - int retval = 0; - - if (!cont.data) { - if (IS_ENABLED(CONFIG_X86_32) && (cont.size != -1)) { - struct cpio_data cp; - - cp = find_microcode_in_initrd(ucode_path, false); - if (!(cp.data && cp.size)) { - cont.size = -1; - return -EINVAL; - } + struct cpio_data cp; - cont.cpuid_1_eax = cpuid_1_eax; + cp = find_microcode_in_initrd(ucode_path, false); + if (!(cp.data && cp.size)) + return -EINVAL; - scan_containers(cp.data, cp.size, &cont); - if (!cont.eq_id) { - cont.size = -1; - return -EINVAL; - } + desc.cpuid_1_eax = cpuid_1_eax; - } else - return -EINVAL; - } + scan_containers(cp.data, cp.size, &desc); + if (!desc.eq_id) + return -EINVAL; - ret = load_microcode_amd(smp_processor_id(), x86_family(cpuid_1_eax), cont.data, cont.size); + ret = load_microcode_amd(smp_processor_id(), x86_family(cpuid_1_eax), + desc.data, desc.size); if (ret != UCODE_OK) - retval = -EINVAL; - - /* - * This will be freed any msec now, stash patches for the current - * family and switch to patch cache for cpu hotplug, etc later. - */ - cont.data = NULL; - cont.size = 0; + return -EINVAL; - return retval; + return 0; } void reload_ucode_amd(void) -- cgit From 69f5f983001f6d097aac774a9e917f44657f3367 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 20 Jan 2017 21:29:54 +0100 Subject: x86/microcode/AMD: Remove AP scanning optimization The idea was to not scan the microcode blob on each AP (Application Processor) during boot and thus save us some milliseconds. However, on architectures where the microcode engine is shared between threads, this doesn't work. Here's why: The microcode on CPU0, i.e., the first thread, gets updated. 
The second thread, i.e., CPU1, i.e., the first AP walks into load_ucode_amd_ap(), sees that there's no container cached and goes and scans for the proper blob. It finds it and as a last step of apply_microcode_early_amd(), it tries to apply the patch but that core has already the updated microcode revision which it has received through CPU0's update. So it returns false and we do desc->size = -1 to prevent other APs from scanning. However, the next AP, CPU2, has a different microcode engine which hasn't been updated yet. The desc->size == -1 test prevents it from scanning the blob anew and we fail to update it. The fix is much more straight-forward than it looks: the BSP (BootStrapping Processor), i.e., CPU0, caches the microcode patch in amd_ucode_patch. We use that on the AP and try to apply it. In the 99.9999% of cases where we have homogeneous cores - *not* mixed-steppings - the application will be successful and we're good to go. In the remaining small set of systems, we will simply rescan the blob and find (or not, if none present) the proper patch and apply it then. Signed-off-by: Borislav Petkov Reviewed-by: Thomas Gleixner Link: http://lkml.kernel.org/r/20170120202955.4091-16-bp@alien8.de Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/microcode/amd.c | 78 +++++++++---------------------------- 1 file changed, 18 insertions(+), 60 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 2a194e384207..5e1b57747c2f 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -45,14 +45,14 @@ static struct equiv_cpu_entry *equiv_cpu_table; * save from the initrd/builtin before jettisoning its contents. @mc is the * microcode patch we found to match. */ -static struct cont_desc { +struct cont_desc { struct microcode_amd *mc; u32 cpuid_1_eax; u32 psize; u16 eq_id; u8 *data; size_t size; -} cont; +}; static u32 ucode_new_rev; static u8 amd_ucode_patch[PATCH_MAX_SIZE]; @@ -201,8 +201,7 @@ static int __apply_microcode_amd(struct microcode_amd *mc) * Returns true if container found (sets @desc), false otherwise. 
*/ static bool -apply_microcode_early_amd(u32 cpuid_1_eax, void *ucode, size_t size, - bool save_patch, struct cont_desc *ret_desc) +apply_microcode_early_amd(u32 cpuid_1_eax, void *ucode, size_t size, bool save_patch) { struct cont_desc desc = { 0 }; u8 (*patch)[PATCH_MAX_SIZE]; @@ -240,9 +239,6 @@ apply_microcode_early_amd(u32 cpuid_1_eax, void *ucode, size_t size, memcpy(patch, mc, min_t(u32, desc.psize, PATCH_MAX_SIZE)); } - if (ret_desc) - *ret_desc = desc; - return ret; } @@ -292,79 +288,41 @@ void __init load_ucode_amd_bsp(unsigned int cpuid_1_eax) struct cpio_data cp = { }; __load_ucode_amd(cpuid_1_eax, &cp); - if (!(cp.data && cp.size)) return; - apply_microcode_early_amd(cpuid_1_eax, cp.data, cp.size, true, NULL); + apply_microcode_early_amd(cpuid_1_eax, cp.data, cp.size, true); } void load_ucode_amd_ap(unsigned int cpuid_1_eax) { - struct equiv_cpu_entry *eq; struct microcode_amd *mc; - struct cont_desc *desc; - u16 eq_id; + struct cpio_data cp; + u32 *new_rev, rev, dummy; if (IS_ENABLED(CONFIG_X86_32)) { - mc = (struct microcode_amd *)__pa_nodebug(amd_ucode_patch); - desc = (struct cont_desc *)__pa_nodebug(&cont); + mc = (struct microcode_amd *)__pa_nodebug(amd_ucode_patch); + new_rev = (u32 *)__pa_nodebug(&ucode_new_rev); } else { - mc = (struct microcode_amd *)amd_ucode_patch; - desc = &cont; + mc = (struct microcode_amd *)amd_ucode_patch; + new_rev = &ucode_new_rev; } - /* First AP hasn't cached it yet, go through the blob. */ - if (!desc->data) { - struct cpio_data cp = { }; - - if (desc->size == -1) - return; - -reget: - __load_ucode_amd(cpuid_1_eax, &cp); - if (!(cp.data && cp.size)) { - /* - * Mark it so that other APs do not scan again for no - * real reason and slow down boot needlessly. - */ - desc->size = -1; - return; - } + native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - if (!apply_microcode_early_amd(cpuid_1_eax, cp.data, cp.size, false, desc)) { - desc->data = NULL; - desc->size = -1; + /* Check whether we have saved a new patch already: */ + if (*new_rev && rev < mc->hdr.patch_id) { + if (!__apply_microcode_amd(mc)) { + *new_rev = mc->hdr.patch_id; return; } } - eq = (struct equiv_cpu_entry *)(desc->data + CONTAINER_HDR_SZ); - - eq_id = find_equiv_id(eq, cpuid_1_eax); - if (!eq_id) + __load_ucode_amd(cpuid_1_eax, &cp); + if (!(cp.data && cp.size)) return; - if (eq_id == desc->eq_id) { - u32 rev, dummy; - - native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - - mc = (struct microcode_amd *)amd_ucode_patch; - - if (mc && rev < mc->hdr.patch_id) { - if (!__apply_microcode_amd(mc)) - ucode_new_rev = mc->hdr.patch_id; - } - - } else { - - /* - * AP has a different equivalence ID than BSP, looks like - * mixed-steppings silicon so go through the ucode blob anew. - */ - goto reget; - } + apply_microcode_early_amd(cpuid_1_eax, cp.data, cp.size, false); } static enum ucode_state -- cgit From da0aa3dde05108e180eecd76534c55f43ea4b9c8 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 20 Jan 2017 21:29:55 +0100 Subject: x86/microcode/AMD: Remove struct cont_desc.eq_id The equivalence ID was needed outside of the container scanning logic but now, after this has been cleaned up, not anymore. Now, cont_desc.mc is used to denote whether the container we're looking at has the proper microcode patch for this CPU or not. 
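In other words, the descriptor and its users reduce to roughly the following (see the hunk below):

struct cont_desc {
	struct microcode_amd *mc;	/* non-NULL <=> matching patch found */
	u32 cpuid_1_eax;
	u32 psize;
	u8 *data;
	size_t size;
};

	/* callers now key off desc.mc instead of a separate eq_id, e.g. in
	 * save_microcode_in_initrd_amd(): */
	scan_containers(cp.data, cp.size, &desc);
	if (!desc.mc)
		return -EINVAL;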
Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/20170120202955.4091-17-bp@alien8.de Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/microcode/amd.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 5e1b57747c2f..7889ae492af0 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -49,7 +49,6 @@ struct cont_desc { struct microcode_amd *mc; u32 cpuid_1_eax; u32 psize; - u16 eq_id; u8 *data; size_t size; }; @@ -92,10 +91,8 @@ static ssize_t parse_container(u8 *ucode, ssize_t size, struct cont_desc *desc) /* Am I looking at an equivalence table header? */ if (hdr[0] != UCODE_MAGIC || hdr[1] != UCODE_EQUIV_CPU_TABLE_TYPE || - hdr[2] == 0) { - desc->eq_id = 0; + hdr[2] == 0) return CONTAINER_HDR_SZ; - } buf = ucode; @@ -147,9 +144,8 @@ static ssize_t parse_container(u8 *ucode, ssize_t size, struct cont_desc *desc) * buffer. */ if (desc->mc) { - desc->eq_id = eq_id; - desc->data = ucode; - desc->size = orig_size - size; + desc->data = ucode; + desc->size = orig_size - size; return 0; } @@ -220,8 +216,6 @@ apply_microcode_early_amd(u32 cpuid_1_eax, void *ucode, size_t size, bool save_p desc.cpuid_1_eax = cpuid_1_eax; scan_containers(ucode, size, &desc); - if (!desc.eq_id) - return ret; mc = desc.mc; if (!mc) @@ -341,7 +335,7 @@ int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax) desc.cpuid_1_eax = cpuid_1_eax; scan_containers(cp.data, cp.size, &desc); - if (!desc.eq_id) + if (!desc.mc) return -EINVAL; ret = load_microcode_amd(smp_processor_id(), x86_family(cpuid_1_eax), -- cgit From 4c833368f0bf748d4147bf301b1f95bc8eccb3c0 Mon Sep 17 00:00:00 2001 From: Kevin Hao Date: Sun, 22 Jan 2017 16:50:23 +0800 Subject: x86/fpu: Set the xcomp_bv when we fake up a XSAVES area I got the following calltrace on a Apollo Lake SoC with 32-bit kernel: WARNING: CPU: 2 PID: 261 at arch/x86/include/asm/fpu/internal.h:363 fpu__restore+0x1f5/0x260 [...] Hardware name: Intel Corp. Broxton P/NOTEBOOK, BIOS APLIRVPA.X64.0138.B35.1608091058 08/09/2016 Call Trace: dump_stack() __warn() ? fpu__restore() warn_slowpath_null() fpu__restore() __fpu__restore_sig() fpu__restore_sig() restore_sigcontext.isra.9() sys_sigreturn() do_int80_syscall_32() entry_INT80_32() The reason is that a #GP occurs when executing XRSTORS. The root cause is that we forget to set the xcomp_bv when we fake up the XSAVES area in the copyin_to_xsaves() function. Signed-off-by: Kevin Hao Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/1485075023-30161-1-git-send-email-haokexin@gmail.com Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/fpu/xstate.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 1d7770447b3e..e287b9075527 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1070,6 +1070,7 @@ int copyin_to_xsaves(const void *kbuf, const void __user *ubuf, * Add back in the features that came in from userspace: */ xsave->header.xfeatures |= xfeatures; + xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xsave->header.xfeatures; return 0; } -- cgit From 587d531b8f67ebe62f8326849a7a685a03cbc904 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 19 Jan 2017 22:28:05 +0100 Subject: crypto: x86/crc32c - fix %progbits -> @progbits %progbits form is used on ARM (where @ is a comment char). x86 consistently uses @progbits everywhere else. Signed-off-by: Denys Vlasenko CC: Herbert Xu CC: Josh Poimboeuf CC: Xiaodong Liu CC: Megha Dey CC: George Spelvin CC: linux-crypto@vger.kernel.org CC: x86@kernel.org CC: linux-kernel@vger.kernel.org Reviewed-by: Josh Poimboeuf Signed-off-by: Herbert Xu --- arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S index dc05f010ca9b..7a7de27c6f41 100644 --- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S @@ -312,7 +312,7 @@ do_return: ret ENDPROC(crc_pcl) -.section .rodata, "a", %progbits +.section .rodata, "a", @progbits ################################################################ ## jump table Table is 129 entries x 2 bytes each ################################################################ -- cgit From e183914af00e15eb41ae666d44e323bfa154be13 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 19 Jan 2017 22:33:04 +0100 Subject: crypto: x86 - make constants readonly, allow linker to merge them A lot of asm-optimized routines in arch/x86/crypto/ keep its constants in .data. This is wrong, they should be on .rodata. Mnay of these constants are the same in different modules. For example, 128-bit shuffle mask 0x000102030405060708090A0B0C0D0E0F exists in at least half a dozen places. There is a way to let linker merge them and use just one copy. The rules are as follows: mergeable objects of different sizes should not share sections. You can't put them all in one .rodata section, they will lose "mergeability". GCC puts its mergeable constants in ".rodata.cstSIZE" sections, or ".rodata.cstSIZE." if -fdata-sections is used. This patch does the same: .section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16 It is important that all data in such section consists of 16-byte elements, not larger ones, and there are no implicit use of one element from another. 
When this is not the case, use non-mergeable section: .section .rodata[.VAR_NAME], "a", @progbits This reduces .data by ~15 kbytes: text data bss dec hex filename 11097415 2705840 2630712 16433967 fac32f vmlinux-prev.o 11112095 2690672 2630712 16433479 fac147 vmlinux.o Merged objects are visible in System.map: ffffffff81a28810 r POLY ffffffff81a28810 r POLY ffffffff81a28820 r TWOONE ffffffff81a28820 r TWOONE ffffffff81a28830 r PSHUFFLE_BYTE_FLIP_MASK <- merged regardless of ffffffff81a28830 r SHUF_MASK <------------- the name difference ffffffff81a28830 r SHUF_MASK ffffffff81a28830 r SHUF_MASK .. ffffffff81a28d00 r K512 <- merged three identical 640-byte tables ffffffff81a28d00 r K512 ffffffff81a28d00 r K512 Use of object names in section name suffixes is not strictly necessary, but might help if someday link stage will use garbage collection to eliminate unused sections (ld --gc-sections). Signed-off-by: Denys Vlasenko CC: Herbert Xu CC: Josh Poimboeuf CC: Xiaodong Liu CC: Megha Dey CC: linux-crypto@vger.kernel.org CC: x86@kernel.org CC: linux-kernel@vger.kernel.org Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_asm.S | 37 +++++++++++++++++----- arch/x86/crypto/aesni-intel_avx-x86_64.S | 32 ++++++++++++++----- arch/x86/crypto/camellia-aesni-avx-asm_64.S | 5 ++- arch/x86/crypto/camellia-aesni-avx2-asm_64.S | 12 +++++-- arch/x86/crypto/cast5-avx-x86_64-asm_64.S | 14 ++++++-- arch/x86/crypto/cast6-avx-x86_64-asm_64.S | 12 +++++-- arch/x86/crypto/chacha20-avx2-x86_64.S | 9 ++++-- arch/x86/crypto/chacha20-ssse3-x86_64.S | 7 ++-- arch/x86/crypto/crct10dif-pcl-asm_64.S | 14 ++++++-- arch/x86/crypto/des3_ede-asm_64.S | 2 +- arch/x86/crypto/ghash-clmulni-intel_asm.S | 3 +- arch/x86/crypto/poly1305-avx2-x86_64.S | 6 ++-- arch/x86/crypto/poly1305-sse2-x86_64.S | 6 ++-- arch/x86/crypto/serpent-avx-x86_64-asm_64.S | 5 +-- arch/x86/crypto/serpent-avx2-asm_64.S | 9 ++++-- arch/x86/crypto/sha1-mb/sha1_mb_mgr_flush_avx2.S | 6 ++-- arch/x86/crypto/sha1-mb/sha1_mb_mgr_submit_avx2.S | 3 +- arch/x86/crypto/sha1-mb/sha1_x8_avx2.S | 15 +++++++-- arch/x86/crypto/sha1_ni_asm.S | 8 +++-- arch/x86/crypto/sha256-avx-asm.S | 9 +++++- arch/x86/crypto/sha256-avx2-asm.S | 9 +++++- .../crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S | 6 ++-- .../crypto/sha256-mb/sha256_mb_mgr_submit_avx2.S | 3 +- arch/x86/crypto/sha256-mb/sha256_x8_avx2.S | 7 +++- arch/x86/crypto/sha256-ssse3-asm.S | 8 ++++- arch/x86/crypto/sha256_ni_asm.S | 4 ++- arch/x86/crypto/sha512-avx-asm.S | 9 ++++-- arch/x86/crypto/sha512-avx2-asm.S | 10 ++++-- .../crypto/sha512-mb/sha512_mb_mgr_flush_avx2.S | 10 ++++-- .../crypto/sha512-mb/sha512_mb_mgr_submit_avx2.S | 4 ++- arch/x86/crypto/sha512-mb/sha512_x4_avx2.S | 4 ++- arch/x86/crypto/sha512-ssse3-asm.S | 9 ++++-- arch/x86/crypto/twofish-avx-x86_64-asm_64.S | 6 ++-- 33 files changed, 229 insertions(+), 74 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index 383a6f84a060..3c465184ff8a 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -46,28 +46,49 @@ #ifdef __x86_64__ -.data +# constants in mergeable sections, linker can reorder and merge +.section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16 .align 16 .Lgf128mul_x_ble_mask: .octa 0x00000000000000010000000000000087 +.section .rodata.cst16.POLY, "aM", @progbits, 16 +.align 16 POLY: .octa 0xC2000000000000000000000000000001 +.section .rodata.cst16.TWOONE, "aM", @progbits, 16 +.align 16 TWOONE: .octa 
0x00000001000000000000000000000001 -# order of these constants should not change. -# more specifically, ALL_F should follow SHIFT_MASK, -# and ZERO should follow ALL_F - +.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16 +.align 16 SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F +.section .rodata.cst16.MASK1, "aM", @progbits, 16 +.align 16 MASK1: .octa 0x0000000000000000ffffffffffffffff +.section .rodata.cst16.MASK2, "aM", @progbits, 16 +.align 16 MASK2: .octa 0xffffffffffffffff0000000000000000 -SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 -ALL_F: .octa 0xffffffffffffffffffffffffffffffff -ZERO: .octa 0x00000000000000000000000000000000 +.section .rodata.cst16.ONE, "aM", @progbits, 16 +.align 16 ONE: .octa 0x00000000000000000000000000000001 +.section .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16 +.align 16 F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0 +.section .rodata.cst16.dec, "aM", @progbits, 16 +.align 16 dec: .octa 0x1 +.section .rodata.cst16.enc, "aM", @progbits, 16 +.align 16 enc: .octa 0x2 +# order of these constants should not change. +# more specifically, ALL_F should follow SHIFT_MASK, +# and zero should follow ALL_F +.section .rodata, "a", @progbits +.align 16 +SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 +ALL_F: .octa 0xffffffffffffffffffffffffffffffff + .octa 0x00000000000000000000000000000000 + .text diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S index 522ab68d1c88..d664382c6e56 100644 --- a/arch/x86/crypto/aesni-intel_avx-x86_64.S +++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S @@ -122,23 +122,39 @@ #include #include -.data +# constants in mergeable sections, linker can reorder and merge +.section .rodata.cst16.POLY, "aM", @progbits, 16 .align 16 - POLY: .octa 0xC2000000000000000000000000000001 + +.section .rodata.cst16.POLY2, "aM", @progbits, 16 +.align 16 POLY2: .octa 0xC20000000000000000000001C2000000 -TWOONE: .octa 0x00000001000000000000000000000001 -# order of these constants should not change. -# more specifically, ALL_F should follow SHIFT_MASK, and ZERO should follow ALL_F +.section .rodata.cst16.TWOONE, "aM", @progbits, 16 +.align 16 +TWOONE: .octa 0x00000001000000000000000000000001 +.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16 +.align 16 SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F -SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 -ALL_F: .octa 0xffffffffffffffffffffffffffffffff -ZERO: .octa 0x00000000000000000000000000000000 + +.section .rodata.cst16.ONE, "aM", @progbits, 16 +.align 16 ONE: .octa 0x00000000000000000000000000000001 + +.section .rodata.cst16.ONEf, "aM", @progbits, 16 +.align 16 ONEf: .octa 0x01000000000000000000000000000000 +# order of these constants should not change. 
+# more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F +.section .rodata, "a", @progbits +.align 16 +SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 +ALL_F: .octa 0xffffffffffffffffffffffffffffffff + .octa 0x00000000000000000000000000000000 + .text diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S index aa9e8bd163f6..f7c495e2863c 100644 --- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S @@ -571,7 +571,9 @@ ENDPROC(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) vmovdqu y6, 14 * 16(rio); \ vmovdqu y7, 15 * 16(rio); -.data + +/* NB: section is mergeable, all elements must be aligned 16-byte blocks */ +.section .rodata.cst16, "aM", @progbits, 16 .align 16 #define SHUFB_BYTES(idx) \ @@ -711,6 +713,7 @@ ENDPROC(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03 /* 4-bit mask */ +.section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4 .align 4 .L0f0f0f0f: .long 0x0f0f0f0f diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S index 16186c18656d..eee5b3982cfd 100644 --- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S @@ -610,20 +610,25 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) vmovdqu y6, 14 * 32(rio); \ vmovdqu y7, 15 * 32(rio); -.data -.align 32 +.section .rodata.cst32.shufb_16x16b, "aM", @progbits, 32 +.align 32 #define SHUFB_BYTES(idx) \ 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx) - .Lshufb_16x16b: .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3) .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3) +.section .rodata.cst32.pack_bswap, "aM", @progbits, 32 +.align 32 .Lpack_bswap: .long 0x00010203, 0x04050607, 0x80808080, 0x80808080 .long 0x00010203, 0x04050607, 0x80808080, 0x80808080 +/* NB: section is mergeable, all elements must be aligned 16-byte blocks */ +.section .rodata.cst16, "aM", @progbits, 16 +.align 16 + /* For CTR-mode IV byteswap */ .Lbswap128_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 @@ -750,6 +755,7 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03 +.section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4 .align 4 /* 4-bit mask */ .L0f0f0f0f: diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S index 14fa1966bf01..b4a8806234ea 100644 --- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S @@ -195,19 +195,29 @@ vpshufb rmask, x0, x0; \ vpshufb rmask, x1, x1; -.data - +.section .rodata.cst16.bswap_mask, "aM", @progbits, 16 .align 16 .Lbswap_mask: .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 +.section .rodata.cst16.bswap128_mask, "aM", @progbits, 16 +.align 16 .Lbswap128_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 +.section .rodata.cst16.bswap_iv_mask, "aM", @progbits, 16 +.align 16 .Lbswap_iv_mask: .byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0 + +.section .rodata.cst4.16_mask, "aM", @progbits, 4 +.align 4 .L16_mask: .byte 16, 16, 16, 16 +.section .rodata.cst4.32_mask, "aM", @progbits, 4 +.align 4 .L32_mask: .byte 32, 0, 0, 0 +.section .rodata.cst4.first_mask, "aM", @progbits, 4 +.align 4 .Lfirst_mask: .byte 0x1f, 0, 0, 0 diff --git 
a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S index c419389889cd..952d3156a933 100644 --- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S @@ -225,8 +225,7 @@ vpshufb rmask, x2, x2; \ vpshufb rmask, x3, x3; -.data - +.section .rodata.cst16, "aM", @progbits, 16 .align 16 .Lxts_gf128mul_and_shl1_mask: .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 @@ -244,10 +243,19 @@ .byte 12, 13, 14, 15, 8, 9, 10, 11, 7, 6, 5, 4, 3, 2, 1, 0 .Lrkr_dec_QBAR_QBAR_QBAR_QBAR: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + +.section .rodata.cst4.L16_mask, "aM", @progbits, 4 +.align 4 .L16_mask: .byte 16, 16, 16, 16 + +.section .rodata.cst4.L32_mask, "aM", @progbits, 4 +.align 4 .L32_mask: .byte 32, 0, 0, 0 + +.section .rodata.cst4.first_mask, "aM", @progbits, 4 +.align 4 .Lfirst_mask: .byte 0x1f, 0, 0, 0 diff --git a/arch/x86/crypto/chacha20-avx2-x86_64.S b/arch/x86/crypto/chacha20-avx2-x86_64.S index 16694e625f77..3a2dc3dc6cac 100644 --- a/arch/x86/crypto/chacha20-avx2-x86_64.S +++ b/arch/x86/crypto/chacha20-avx2-x86_64.S @@ -11,13 +11,18 @@ #include -.data +.section .rodata.cst32.ROT8, "aM", @progbits, 32 .align 32 - ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003 .octa 0x0e0d0c0f0a09080b0605040702010003 + +.section .rodata.cst32.ROT16, "aM", @progbits, 32 +.align 32 ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302 .octa 0x0d0c0f0e09080b0a0504070601000302 + +.section .rodata.cst32.CTRINC, "aM", @progbits, 32 +.align 32 CTRINC: .octa 0x00000003000000020000000100000000 .octa 0x00000007000000060000000500000004 diff --git a/arch/x86/crypto/chacha20-ssse3-x86_64.S b/arch/x86/crypto/chacha20-ssse3-x86_64.S index 3a33124e9112..3f511a7d73b8 100644 --- a/arch/x86/crypto/chacha20-ssse3-x86_64.S +++ b/arch/x86/crypto/chacha20-ssse3-x86_64.S @@ -11,11 +11,14 @@ #include -.data +.section .rodata.cst16.ROT8, "aM", @progbits, 16 .align 16 - ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003 +.section .rodata.cst16.ROT16, "aM", @progbits, 16 +.align 16 ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302 +.section .rodata.cst16.CTRINC, "aM", @progbits, 16 +.align 16 CTRINC: .octa 0x00000003000000020000000100000000 .text diff --git a/arch/x86/crypto/crct10dif-pcl-asm_64.S b/arch/x86/crypto/crct10dif-pcl-asm_64.S index 35e97569d05f..de04d3e98d8d 100644 --- a/arch/x86/crypto/crct10dif-pcl-asm_64.S +++ b/arch/x86/crypto/crct10dif-pcl-asm_64.S @@ -554,12 +554,11 @@ _only_less_than_2: ENDPROC(crc_t10dif_pcl) -.data - +.section .rodata, "a", @progbits +.align 16 # precomputed constants # these constants are precomputed from the poly: # 0x8bb70000 (0x8bb7 scaled to 32 bits) -.align 16 # Q = 0x18BB70000 # rk1 = 2^(32*3) mod Q << 32 # rk2 = 2^(32*5) mod Q << 32 @@ -613,14 +612,23 @@ rk20: +.section .rodata.cst16.mask1, "aM", @progbits, 16 +.align 16 mask1: .octa 0x80808080808080808080808080808080 + +.section .rodata.cst16.mask2, "aM", @progbits, 16 +.align 16 mask2: .octa 0x00000000FFFFFFFFFFFFFFFFFFFFFFFF +.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16 +.align 16 SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F +.section .rodata.cst32.pshufb_shf_table, "aM", @progbits, 32 +.align 32 pshufb_shf_table: # use these values for shift constants for the pshufb instruction # different alignments result in values as shown: diff --git a/arch/x86/crypto/des3_ede-asm_64.S b/arch/x86/crypto/des3_ede-asm_64.S index 038f6ae87c5e..f3e91647ca27 100644 --- a/arch/x86/crypto/des3_ede-asm_64.S +++ b/arch/x86/crypto/des3_ede-asm_64.S @@ -537,7 
+537,7 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way) ret; ENDPROC(des3_ede_x86_64_crypt_blk_3way) -.data +.section .rodata, "a", @progbits .align 16 .L_s1: .quad 0x0010100001010400, 0x0000000000000000 diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S index eed55c8cca4f..f94375a8dcd1 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_asm.S +++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S @@ -20,8 +20,7 @@ #include #include -.data - +.section .rodata.cst16.bswap_mask, "aM", @progbits, 16 .align 16 .Lbswap_mask: .octa 0x000102030405060708090a0b0c0d0e0f diff --git a/arch/x86/crypto/poly1305-avx2-x86_64.S b/arch/x86/crypto/poly1305-avx2-x86_64.S index eff2f414e22b..3b6e70d085da 100644 --- a/arch/x86/crypto/poly1305-avx2-x86_64.S +++ b/arch/x86/crypto/poly1305-avx2-x86_64.S @@ -11,11 +11,13 @@ #include -.data +.section .rodata.cst32.ANMASK, "aM", @progbits, 32 .align 32 - ANMASK: .octa 0x0000000003ffffff0000000003ffffff .octa 0x0000000003ffffff0000000003ffffff + +.section .rodata.cst32.ORMASK, "aM", @progbits, 32 +.align 32 ORMASK: .octa 0x00000000010000000000000001000000 .octa 0x00000000010000000000000001000000 diff --git a/arch/x86/crypto/poly1305-sse2-x86_64.S b/arch/x86/crypto/poly1305-sse2-x86_64.S index 338c748054ed..c88c670cb5fc 100644 --- a/arch/x86/crypto/poly1305-sse2-x86_64.S +++ b/arch/x86/crypto/poly1305-sse2-x86_64.S @@ -11,10 +11,12 @@ #include -.data +.section .rodata.cst16.ANMASK, "aM", @progbits, 16 .align 16 - ANMASK: .octa 0x0000000003ffffff0000000003ffffff + +.section .rodata.cst16.ORMASK, "aM", @progbits, 16 +.align 16 ORMASK: .octa 0x00000000010000000000000001000000 .text diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S index 8be571808342..2925077f8c6a 100644 --- a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S @@ -29,11 +29,12 @@ .file "serpent-avx-x86_64-asm_64.S" -.data +.section .rodata.cst16.bswap128_mask, "aM", @progbits, 16 .align 16 - .Lbswap128_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 +.section .rodata.cst16.xts_gf128mul_and_shl1_mask, "aM", @progbits, 16 +.align 16 .Lxts_gf128mul_and_shl1_mask: .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 diff --git a/arch/x86/crypto/serpent-avx2-asm_64.S b/arch/x86/crypto/serpent-avx2-asm_64.S index 97c48add33ed..d67888f2a52a 100644 --- a/arch/x86/crypto/serpent-avx2-asm_64.S +++ b/arch/x86/crypto/serpent-avx2-asm_64.S @@ -20,13 +20,18 @@ .file "serpent-avx2-asm_64.S" -.data +.section .rodata.cst16.bswap128_mask, "aM", @progbits, 16 .align 16 - .Lbswap128_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + +.section .rodata.cst16.xts_gf128mul_and_shl1_mask_0, "aM", @progbits, 16 +.align 16 .Lxts_gf128mul_and_shl1_mask_0: .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 + +.section .rodata.cst16.xts_gf128mul_and_shl1_mask_1, "aM", @progbits, 16 +.align 16 .Lxts_gf128mul_and_shl1_mask_1: .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0 diff --git a/arch/x86/crypto/sha1-mb/sha1_mb_mgr_flush_avx2.S b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_flush_avx2.S index 96df6a39d7e2..93b945597ecf 100644 --- a/arch/x86/crypto/sha1-mb/sha1_mb_mgr_flush_avx2.S +++ b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_flush_avx2.S @@ -281,11 +281,13 @@ ENTRY(sha1_mb_mgr_get_comp_job_avx2) ret ENDPROC(sha1_mb_mgr_get_comp_job_avx2) -.data - +.section .rodata.cst16.clear_low_nibble, "aM", @progbits, 16 .align 16 clear_low_nibble: .octa 
0x000000000000000000000000FFFFFFF0 + +.section .rodata.cst8, "aM", @progbits, 8 +.align 8 one: .quad 1 two: diff --git a/arch/x86/crypto/sha1-mb/sha1_mb_mgr_submit_avx2.S b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_submit_avx2.S index 63a0d9c8e31f..7a93b1c0d69a 100644 --- a/arch/x86/crypto/sha1-mb/sha1_mb_mgr_submit_avx2.S +++ b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_submit_avx2.S @@ -203,8 +203,7 @@ return_null: ENDPROC(sha1_mb_mgr_submit_avx2) -.data - +.section .rodata.cst16.clear_low_nibble, "aM", @progbits, 16 .align 16 clear_low_nibble: .octa 0x000000000000000000000000FFFFFFF0 diff --git a/arch/x86/crypto/sha1-mb/sha1_x8_avx2.S b/arch/x86/crypto/sha1-mb/sha1_x8_avx2.S index c9dae1cd2919..20f77aa633de 100644 --- a/arch/x86/crypto/sha1-mb/sha1_x8_avx2.S +++ b/arch/x86/crypto/sha1-mb/sha1_x8_avx2.S @@ -461,21 +461,32 @@ lloop: ENDPROC(sha1_x8_avx2) -.data - +.section .rodata.cst32.K00_19, "aM", @progbits, 32 .align 32 K00_19: .octa 0x5A8279995A8279995A8279995A827999 .octa 0x5A8279995A8279995A8279995A827999 + +.section .rodata.cst32.K20_39, "aM", @progbits, 32 +.align 32 K20_39: .octa 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1 .octa 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1 + +.section .rodata.cst32.K40_59, "aM", @progbits, 32 +.align 32 K40_59: .octa 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC .octa 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC + +.section .rodata.cst32.K60_79, "aM", @progbits, 32 +.align 32 K60_79: .octa 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6 .octa 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6 + +.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32 +.align 32 PSHUFFLE_BYTE_FLIP_MASK: .octa 0x0c0d0e0f08090a0b0405060700010203 .octa 0x0c0d0e0f08090a0b0405060700010203 diff --git a/arch/x86/crypto/sha1_ni_asm.S b/arch/x86/crypto/sha1_ni_asm.S index 874a651b9e7d..ebbdba72ae07 100644 --- a/arch/x86/crypto/sha1_ni_asm.S +++ b/arch/x86/crypto/sha1_ni_asm.S @@ -293,10 +293,12 @@ ENTRY(sha1_ni_transform) ret ENDPROC(sha1_ni_transform) -.data - -.align 64 +.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 +.align 16 PSHUFFLE_BYTE_FLIP_MASK: .octa 0x000102030405060708090a0b0c0d0e0f + +.section .rodata.cst16.UPPER_WORD_MASK, "aM", @progbits, 16 +.align 16 UPPER_WORD_MASK: .octa 0xFFFFFFFF000000000000000000000000 diff --git a/arch/x86/crypto/sha256-avx-asm.S b/arch/x86/crypto/sha256-avx-asm.S index 92b3b5d75ba9..e08888a1a5f2 100644 --- a/arch/x86/crypto/sha256-avx-asm.S +++ b/arch/x86/crypto/sha256-avx-asm.S @@ -463,7 +463,7 @@ done_hash: ret ENDPROC(sha256_transform_avx) -.data +.section .rodata.cst256.K256, "aM", @progbits, 256 .align 64 K256: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 @@ -483,14 +483,21 @@ K256: .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 +.align 16 PSHUFFLE_BYTE_FLIP_MASK: .octa 0x0c0d0e0f08090a0b0405060700010203 +.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16 +.align 16 # shuffle xBxA -> 00BA _SHUF_00BA: .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100 +.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16 +.align 16 # shuffle xDxC -> DC00 _SHUF_DC00: .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF + #endif diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S index 570ec5ec62d7..89c8f09787d2 100644 --- a/arch/x86/crypto/sha256-avx2-asm.S +++ b/arch/x86/crypto/sha256-avx2-asm.S @@ -723,7 +723,7 @@ done_hash: ret ENDPROC(sha256_transform_rorx) -.data +.section .rodata.cst512.K256, "aM", @progbits, 512 .align 64 K256: .long 
0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 @@ -759,14 +759,21 @@ K256: .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32 +.align 32 PSHUFFLE_BYTE_FLIP_MASK: .octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203 # shuffle xBxA -> 00BA +.section .rodata.cst32._SHUF_00BA, "aM", @progbits, 32 +.align 32 _SHUF_00BA: .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100 # shuffle xDxC -> DC00 +.section .rodata.cst32._SHUF_DC00, "aM", @progbits, 32 +.align 32 _SHUF_DC00: .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF + #endif diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S index a78a0694ddef..8fe6338bcc84 100644 --- a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S +++ b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S @@ -284,11 +284,13 @@ ENTRY(sha256_mb_mgr_get_comp_job_avx2) ret ENDPROC(sha256_mb_mgr_get_comp_job_avx2) -.data - +.section .rodata.cst16.clear_low_nibble, "aM", @progbits, 16 .align 16 clear_low_nibble: .octa 0x000000000000000000000000FFFFFFF0 + +.section .rodata.cst8, "aM", @progbits, 8 +.align 8 one: .quad 1 two: diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_submit_avx2.S b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_submit_avx2.S index 7ea670e25acc..b36ae7454084 100644 --- a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_submit_avx2.S +++ b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_submit_avx2.S @@ -208,8 +208,7 @@ return_null: ENDPROC(sha256_mb_mgr_submit_avx2) -.data - +.section .rodata.cst16.clear_low_nibble, "aM", @progbits, 16 .align 16 clear_low_nibble: .octa 0x000000000000000000000000FFFFFFF0 diff --git a/arch/x86/crypto/sha256-mb/sha256_x8_avx2.S b/arch/x86/crypto/sha256-mb/sha256_x8_avx2.S index aa21aea4c722..1687c80c5995 100644 --- a/arch/x86/crypto/sha256-mb/sha256_x8_avx2.S +++ b/arch/x86/crypto/sha256-mb/sha256_x8_avx2.S @@ -437,7 +437,8 @@ Lrounds_16_xx: ret ENDPROC(sha256_x8_avx2) -.data + +.section .rodata.K256_8, "a", @progbits .align 64 K256_8: .octa 0x428a2f98428a2f98428a2f98428a2f98 @@ -568,10 +569,14 @@ K256_8: .octa 0xbef9a3f7bef9a3f7bef9a3f7bef9a3f7 .octa 0xc67178f2c67178f2c67178f2c67178f2 .octa 0xc67178f2c67178f2c67178f2c67178f2 + +.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32 +.align 32 PSHUFFLE_BYTE_FLIP_MASK: .octa 0x0c0d0e0f08090a0b0405060700010203 .octa 0x0c0d0e0f08090a0b0405060700010203 +.section .rodata.cst256.K256, "aM", @progbits, 256 .align 64 .global K256 K256: diff --git a/arch/x86/crypto/sha256-ssse3-asm.S b/arch/x86/crypto/sha256-ssse3-asm.S index 2cedc44e8121..39b83c93e7fd 100644 --- a/arch/x86/crypto/sha256-ssse3-asm.S +++ b/arch/x86/crypto/sha256-ssse3-asm.S @@ -474,7 +474,7 @@ done_hash: ret ENDPROC(sha256_transform_ssse3) -.data +.section .rodata.cst256.K256, "aM", @progbits, 256 .align 64 K256: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 @@ -494,13 +494,19 @@ K256: .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 +.align 16 PSHUFFLE_BYTE_FLIP_MASK: .octa 0x0c0d0e0f08090a0b0405060700010203 +.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16 +.align 16 # shuffle xBxA -> 00BA _SHUF_00BA: .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100 +.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16 +.align 16 # shuffle xDxC -> DC00 
_SHUF_DC00: .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF diff --git a/arch/x86/crypto/sha256_ni_asm.S b/arch/x86/crypto/sha256_ni_asm.S index 748cdf21a938..fb58f58ecfbc 100644 --- a/arch/x86/crypto/sha256_ni_asm.S +++ b/arch/x86/crypto/sha256_ni_asm.S @@ -329,7 +329,7 @@ ENTRY(sha256_ni_transform) ret ENDPROC(sha256_ni_transform) -.data +.section .rodata.cst256.K256, "aM", @progbits, 256 .align 64 K256: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 @@ -349,5 +349,7 @@ K256: .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 +.align 16 PSHUFFLE_BYTE_FLIP_MASK: .octa 0x0c0d0e0f08090a0b0405060700010203 diff --git a/arch/x86/crypto/sha512-avx-asm.S b/arch/x86/crypto/sha512-avx-asm.S index 565274d6a641..39235fefe6f7 100644 --- a/arch/x86/crypto/sha512-avx-asm.S +++ b/arch/x86/crypto/sha512-avx-asm.S @@ -370,14 +370,17 @@ ENDPROC(sha512_transform_avx) ######################################################################## ### Binary Data -.data - +.section .rodata.cst16.XMM_QWORD_BSWAP, "aM", @progbits, 16 .align 16 - # Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. XMM_QWORD_BSWAP: .octa 0x08090a0b0c0d0e0f0001020304050607 +# Mergeable 640-byte rodata section. This allows linker to merge the table +# with other, exactly the same 640-byte fragment of another rodata section +# (if such section exists). +.section .rodata.cst640.K512, "aM", @progbits, 640 +.align 64 # K[t] used in SHA512 hashing K512: .quad 0x428a2f98d728ae22,0x7137449123ef65cd diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S index 1f20b35d8573..7f5f6c6ec72e 100644 --- a/arch/x86/crypto/sha512-avx2-asm.S +++ b/arch/x86/crypto/sha512-avx2-asm.S @@ -684,8 +684,11 @@ ENDPROC(sha512_transform_rorx) ######################################################################## ### Binary Data -.data +# Mergeable 640-byte rodata section. This allows linker to merge the table +# with other, exactly the same 640-byte fragment of another rodata section +# (if such section exists). +.section .rodata.cst640.K512, "aM", @progbits, 640 .align 64 # K[t] used in SHA512 hashing K512: @@ -730,14 +733,17 @@ K512: .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 +.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32 .align 32 - # Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. 
PSHUFFLE_BYTE_FLIP_MASK: .octa 0x08090a0b0c0d0e0f0001020304050607 .octa 0x18191a1b1c1d1e1f1011121314151617 +.section .rodata.cst32.MASK_YMM_LO, "aM", @progbits, 32 +.align 32 MASK_YMM_LO: .octa 0x00000000000000000000000000000000 .octa 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF + #endif diff --git a/arch/x86/crypto/sha512-mb/sha512_mb_mgr_flush_avx2.S b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_flush_avx2.S index 3ddba19a0db6..7c629caebc05 100644 --- a/arch/x86/crypto/sha512-mb/sha512_mb_mgr_flush_avx2.S +++ b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_flush_avx2.S @@ -280,12 +280,18 @@ ENTRY(sha512_mb_mgr_get_comp_job_avx2) pop %rbx ret ENDPROC(sha512_mb_mgr_get_comp_job_avx2) -.data -.align 16 +.section .rodata.cst8.one, "aM", @progbits, 8 +.align 8 one: .quad 1 + +.section .rodata.cst8.two, "aM", @progbits, 8 +.align 8 two: .quad 2 + +.section .rodata.cst8.three, "aM", @progbits, 8 +.align 8 three: .quad 3 diff --git a/arch/x86/crypto/sha512-mb/sha512_mb_mgr_submit_avx2.S b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_submit_avx2.S index 815f07bdd1f8..4ba709ba78e5 100644 --- a/arch/x86/crypto/sha512-mb/sha512_mb_mgr_submit_avx2.S +++ b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_submit_avx2.S @@ -209,8 +209,9 @@ return_null: xor job_rax, job_rax jmp return ENDPROC(sha512_mb_mgr_submit_avx2) -.data +/* UNUSED? +.section .rodata.cst16, "aM", @progbits, 16 .align 16 H0: .int 0x6a09e667 H1: .int 0xbb67ae85 @@ -220,3 +221,4 @@ H4: .int 0x510e527f H5: .int 0x9b05688c H6: .int 0x1f83d9ab H7: .int 0x5be0cd19 +*/ diff --git a/arch/x86/crypto/sha512-mb/sha512_x4_avx2.S b/arch/x86/crypto/sha512-mb/sha512_x4_avx2.S index 31ab1eff6413..e22e907643a6 100644 --- a/arch/x86/crypto/sha512-mb/sha512_x4_avx2.S +++ b/arch/x86/crypto/sha512-mb/sha512_x4_avx2.S @@ -361,7 +361,7 @@ Lrounds_16_xx: ret ENDPROC(sha512_x4_avx2) -.data +.section .rodata.K512_4, "a", @progbits .align 64 K512_4: .octa 0x428a2f98d728ae22428a2f98d728ae22,\ @@ -525,5 +525,7 @@ K512_4: .octa 0x6c44198c4a4758176c44198c4a475817,\ 0x6c44198c4a4758176c44198c4a475817 +.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32 +.align 32 PSHUFFLE_BYTE_FLIP_MASK: .octa 0x08090a0b0c0d0e0f0001020304050607 .octa 0x18191a1b1c1d1e1f1011121314151617 diff --git a/arch/x86/crypto/sha512-ssse3-asm.S b/arch/x86/crypto/sha512-ssse3-asm.S index e610e29cbc81..66bbd9058a90 100644 --- a/arch/x86/crypto/sha512-ssse3-asm.S +++ b/arch/x86/crypto/sha512-ssse3-asm.S @@ -369,14 +369,17 @@ ENDPROC(sha512_transform_ssse3) ######################################################################## ### Binary Data -.data - +.section .rodata.cst16.XMM_QWORD_BSWAP, "aM", @progbits, 16 .align 16 - # Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. XMM_QWORD_BSWAP: .octa 0x08090a0b0c0d0e0f0001020304050607 +# Mergeable 640-byte rodata section. This allows linker to merge the table +# with other, exactly the same 640-byte fragment of another rodata section +# (if such section exists). 
+.section .rodata.cst640.K512, "aM", @progbits, 640 +.align 64 # K[t] used in SHA512 hashing K512: .quad 0x428a2f98d728ae22,0x7137449123ef65cd diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S index dc66273e610d..b3f49d286348 100644 --- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S @@ -29,11 +29,13 @@ .file "twofish-avx-x86_64-asm_64.S" -.data +.section .rodata.cst16.bswap128_mask, "aM", @progbits, 16 .align 16 - .Lbswap128_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + +.section .rodata.cst16.xts_gf128mul_and_shl1_mask, "aM", @progbits, 16 +.align 16 .Lxts_gf128mul_and_shl1_mask: .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 -- cgit From dffba9a31c7769be3231c420d4b364c92ba3f1ac Mon Sep 17 00:00:00 2001 From: Yu-cheng Yu Date: Mon, 23 Jan 2017 14:54:44 -0800 Subject: x86/fpu/xstate: Fix xcomp_bv in XSAVES header The compacted-format XSAVES area is determined at boot time and never changed after. The field xsave.header.xcomp_bv indicates which components are in the fixed XSAVES format. In fpstate_init() we did not set xcomp_bv to reflect the XSAVES format since at the time there is no valid data. However, after we do copy_init_fpstate_to_fpregs() in fpu__clear(), as in commit: b22cbe404a9c x86/fpu: Fix invalid FPU ptrace state after execve() and when __fpu_restore_sig() does fpu__restore() for a COMPAT-mode app, a #GP occurs. This can be easily triggered by doing valgrind on a COMPAT-mode "Hello World," as reported by Joakim Tjernlund and others: https://bugzilla.kernel.org/show_bug.cgi?id=190061 Fix it by setting xcomp_bv correctly. This patch also moves the xcomp_bv initialization to the proper place, which was in copyin_to_xsaves() as of: 4c833368f0bf x86/fpu: Set the xcomp_bv when we fake up a XSAVES area which fixed the bug too, but it's more efficient and cleaner to initialize things once per boot, not for every signal handling operation. Reported-by: Kevin Hao Reported-by: Joakim Tjernlund Signed-off-by: Yu-cheng Yu Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Ravi V. Shankar Cc: Thomas Gleixner Cc: haokexin@gmail.com Link: http://lkml.kernel.org/r/1485212084-4418-1-git-send-email-yu-cheng.yu@intel.com [ Combined it with 4c833368f0bf. ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/core.c | 4 +++- arch/x86/kernel/fpu/xstate.c | 1 - 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index e4e97a5355ce..de7234401275 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -183,7 +184,8 @@ void fpstate_init(union fpregs_state *state) * it will #GP. Make sure it is replaced after the memset(). 
*/ if (static_cpu_has(X86_FEATURE_XSAVES)) - state->xsave.header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT; + state->xsave.header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | + xfeatures_mask; if (static_cpu_has(X86_FEATURE_FXSR)) fpstate_init_fxstate(&state->fxsave); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index e287b9075527..1d7770447b3e 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1070,7 +1070,6 @@ int copyin_to_xsaves(const void *kbuf, const void __user *ubuf, * Add back in the features that came in from userspace: */ xsave->header.xfeatures |= xfeatures; - xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xsave->header.xfeatures; return 0; } -- cgit From d4b2ac63b0eae461fc10c9791084be24724ef57a Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 23 Jan 2017 19:35:06 +0100 Subject: x86/ras/inject: Make it depend on X86_LOCAL_APIC=y MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ... and get rid of the annoying: arch/x86/kernel/cpu/mcheck/mce-inject.c:97:13: warning: ‘mce_irq_ipi’ defined but not used [-Wunused-function] when doing randconfig builds. Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: Yazen Ghannam Cc: linux-edac Link: http://lkml.kernel.org/r/20170123183514.13356-2-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- arch/x86/kernel/cpu/mcheck/mce-inject.c | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e487493bbd47..7b6fd68b4715 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1070,7 +1070,7 @@ config X86_MCE_THRESHOLD def_bool y config X86_MCE_INJECT - depends on X86_MCE + depends on X86_MCE && X86_LOCAL_APIC tristate "Machine check injector support" ---help--- Provide support for injecting machine checks for testing purposes. diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 517619ea6498..99165b206df3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -152,7 +152,6 @@ static void raise_mce(struct mce *m) if (context == MCJ_CTX_RANDOM) return; -#ifdef CONFIG_X86_LOCAL_APIC if (m->inject_flags & (MCJ_IRQ_BROADCAST | MCJ_NMI_BROADCAST)) { unsigned long start; int cpu; @@ -192,9 +191,7 @@ static void raise_mce(struct mce *m) raise_local(); put_cpu(); put_online_cpus(); - } else -#endif - { + } else { preempt_disable(); raise_local(); preempt_enable(); -- cgit From 9b052ea4ced0fa1ad30a2eafe86984a16297e6f1 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 23 Jan 2017 19:35:07 +0100 Subject: x86/ras/therm_throt: Do not log a fake MCE for thermal events We log a fake bank 128 MCE to note that we're handling a CPU thermal event. However, this confuses people into thinking that their hardware generates MCEs. Hijacking MCA for logging thermal events is a gross misuse anyway and it shouldn't have been done in the first place. And besides we have other means for dealing with thermal events which are much more suitable. So let's kill the MCE logging part. 
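[ Illustration, not part of the original patch: one of the "more
  suitable means" that remain is the set of per-CPU throttle counters
  therm_throt.c already exposes in sysfs, so nothing needs to fish
  thermal events out of /dev/mcelog.  Minimal user-space reader,
  assuming the conventional thermal_throttle sysfs layout:

    #include <stdio.h>

    int main(void)
    {
            const char *path =
                    "/sys/devices/system/cpu/cpu0/thermal_throttle/core_throttle_count";
            unsigned long long count;
            FILE *f = fopen(path, "r");

            if (!f)
                    return 1;       /* counter absent on this CPU/kernel */
            if (fscanf(f, "%llu", &count) == 1)
                    printf("CPU0 core throttle events: %llu\n", count);
            fclose(f);
            return 0;
    }
]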
Signed-off-by: Borislav Petkov Acked-by: Ashok Raj Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: Yazen Ghannam Cc: linux-edac Link: http://lkml.kernel.org/r/20170105213846.GA12024@gmail.com Link: http://lkml.kernel.org/r/20170123183514.13356-3-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mce.h | 6 ------ arch/x86/kernel/cpu/mcheck/mce.c | 25 ------------------------- arch/x86/kernel/cpu/mcheck/therm_throt.c | 30 +++++++++++------------------- 3 files changed, 11 insertions(+), 50 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 5132f2a6c0a2..a09ed05725c2 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -97,10 +97,6 @@ #define MCE_OVERFLOW 0 /* bit 0 in flags means overflow */ -/* Software defined banks */ -#define MCE_EXTENDED_BANK 128 -#define MCE_THERMAL_BANK (MCE_EXTENDED_BANK + 0) - #define MCE_LOG_LEN 32 #define MCE_LOG_SIGNATURE "MACHINECHECK" @@ -306,8 +302,6 @@ extern void (*deferred_error_int_vector)(void); void intel_init_thermal(struct cpuinfo_x86 *c); -void mce_log_therm_throt_event(__u64 status); - /* Interrupt Handler for core thermal thresholds */ extern int (*platform_thermal_notify)(__u64 msr_val); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 00ef43233e03..6eef6fde0f02 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1331,31 +1331,6 @@ static void mce_process_work(struct work_struct *dummy) mce_gen_pool_process(); } -#ifdef CONFIG_X86_MCE_INTEL -/*** - * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog - * @cpu: The CPU on which the event occurred. - * @status: Event status information - * - * This function should be called by the thermal interrupt after the - * event has been processed and the decision was made to log the event - * further. - * - * The status parameter will be saved to the 'status' field of 'struct mce' - * and historically has been the register value of the - * MSR_IA32_THERMAL_STATUS (Intel) msr. - */ -void mce_log_therm_throt_event(__u64 status) -{ - struct mce m; - - mce_setup(&m); - m.bank = MCE_THERMAL_BANK; - m.status = status; - mce_log(&m); -} -#endif /* CONFIG_X86_MCE_INTEL */ - /* * Periodic polling timer for "silent" machine check errors. If the * poller finds an MCE, poll 2x faster. When the poller finds no more diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 465aca8be009..85469f84c921 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -6,7 +6,7 @@ * * Maintains a counter in /sys that keeps track of the number of thermal * events, such that the user knows how bad the thermal problem might be - * (since the logging to syslog and mcelog is rate limited). + * (since the logging to syslog is rate limited). * * Author: Dmitriy Zavin (dmitriyz@google.com) * @@ -141,13 +141,8 @@ static struct attribute_group thermal_attr_group = { * IRQ has been acknowledged. * * It will take care of rate limiting and printing messages to the syslog. - * - * Returns: 0 : Event should NOT be further logged, i.e. still in - * "timeout" from previous log message. - * 1 : Event should be logged further, and a message has been - * printed to the syslog. 
*/ -static int therm_throt_process(bool new_event, int event, int level) +static void therm_throt_process(bool new_event, int event, int level) { struct _thermal_state *state; unsigned int this_cpu = smp_processor_id(); @@ -162,16 +157,16 @@ static int therm_throt_process(bool new_event, int event, int level) else if (event == POWER_LIMIT_EVENT) state = &pstate->core_power_limit; else - return 0; + return; } else if (level == PACKAGE_LEVEL) { if (event == THERMAL_THROTTLING_EVENT) state = &pstate->package_throttle; else if (event == POWER_LIMIT_EVENT) state = &pstate->package_power_limit; else - return 0; + return; } else - return 0; + return; old_event = state->new_event; state->new_event = new_event; @@ -181,7 +176,7 @@ static int therm_throt_process(bool new_event, int event, int level) if (time_before64(now, state->next_check) && state->count != state->last_count) - return 0; + return; state->next_check = now + CHECK_INTERVAL; state->last_count = state->count; @@ -193,16 +188,14 @@ static int therm_throt_process(bool new_event, int event, int level) this_cpu, level == CORE_LEVEL ? "Core" : "Package", state->count); - return 1; + return; } if (old_event) { if (event == THERMAL_THROTTLING_EVENT) pr_info("CPU%d: %s temperature/speed normal\n", this_cpu, level == CORE_LEVEL ? "Core" : "Package"); - return 1; + return; } - - return 0; } static int thresh_event_valid(int level, int event) @@ -365,10 +358,9 @@ static void intel_thermal_interrupt(void) /* Check for violation of core thermal thresholds*/ notify_thresholds(msr_val); - if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT, - THERMAL_THROTTLING_EVENT, - CORE_LEVEL) != 0) - mce_log_therm_throt_event(msr_val); + therm_throt_process(msr_val & THERM_STATUS_PROCHOT, + THERMAL_THROTTLING_EVENT, + CORE_LEVEL); if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable) therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, -- cgit From 0b737a9c2af85cc8295f9308d9250f9111bbf94d Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 23 Jan 2017 19:35:08 +0100 Subject: x86/ras/amd: Make sysfs names of banks more user-friendly Currently, we append the MCA_IPID[InstanceId] to the bank name to create the sysfs filename. The InstanceId field uniquely identifies a bank instance but it doesn't look very nice for most banks. Replace the InstanceId with a simpler, ascending (0, 1, ..) value. Only use this in the sysfs name when there is more than 1 instance. Otherwise, just use the bank's name as the sysfs name. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1484322741-41884-3-git-send-email-Yazen.Ghannam@amd.com Link: http://lkml.kernel.org/r/20170123183514.13356-4-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mce.h | 5 +++-- arch/x86/kernel/cpu/mcheck/mce_amd.c | 6 +++++- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index a09ed05725c2..528f6ec897cb 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -356,12 +356,13 @@ struct smca_hwid { unsigned int bank_type; /* Use with smca_bank_types for easy indexing. */ u32 hwid_mcatype; /* (hwid,mcatype) tuple */ u32 xec_bitmap; /* Bitmap of valid ExtErrorCodes; current max is 21. */ + u8 count; /* Number of instances. */ }; struct smca_bank { struct smca_hwid *hwid; - /* Instance ID */ - u32 id; + u32 id; /* Value of MCA_IPID[InstanceId]. 
*/ + u8 sysfs_id; /* Value used for sysfs name. */ }; extern struct smca_bank smca_banks[MAX_NR_BANKS]; diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index a5fd137417a2..776379e4a39c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -192,6 +192,7 @@ static void get_smca_bank_info(unsigned int bank) smca_banks[bank].hwid = s_hwid; smca_banks[bank].id = instance_id; + smca_banks[bank].sysfs_id = s_hwid->count++; break; } } @@ -1064,9 +1065,12 @@ static const char *get_name(unsigned int bank, struct threshold_block *b) return NULL; } + if (smca_banks[bank].hwid->count == 1) + return smca_get_name(bank_type); + snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN, "%s_%x", smca_get_name(bank_type), - smca_banks[bank].id); + smca_banks[bank].sysfs_id); return buf_mcatype; } -- cgit From 669c00f09935fc7a22297eadee04536af141595b Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 23 Jan 2017 19:35:09 +0100 Subject: x86/ras: Flip the TSC-adding logic Add the TSC value to the MCE record only when the MCE being logged is precise, i.e., it is logged as an exception or an MCE-related interrupt. So it doesn't look particularly easy to do without touching/changing a bunch of places. That's why I'm trying tricks first. For example, the mce-apei.c case I'm addressing by setting ->tsc only for errors of panic severity. The idea there is, that, panic errors will have raised an #MC and not polled. And then instead of propagating a flag to mce_setup(), it seems easier/less code to set ->tsc depending on the call sites, i.e., are we polling or are we preparing an MCE record in an exception handler/thresholding interrupt. Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: Yazen Ghannam Cc: linux-edac Link: http://lkml.kernel.org/r/20170123183514.13356-5-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce-apei.c | 5 ++++- arch/x86/kernel/cpu/mcheck/mce.c | 12 +++--------- arch/x86/kernel/cpu/mcheck/mce_amd.c | 3 ++- 3 files changed, 9 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c index 83f1a98d37db..2eee85379689 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-apei.c +++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c @@ -52,8 +52,11 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) if (severity >= GHES_SEV_RECOVERABLE) m.status |= MCI_STATUS_UC; - if (severity >= GHES_SEV_PANIC) + + if (severity >= GHES_SEV_PANIC) { m.status |= MCI_STATUS_PCC; + m.tsc = rdtsc(); + } m.addr = mem_err->physical_addr; mce_log(&m); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 6eef6fde0f02..ca15a7e1f97d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -128,7 +128,6 @@ void mce_setup(struct mce *m) { memset(m, 0, sizeof(struct mce)); m->cpu = m->extcpu = smp_processor_id(); - m->tsc = rdtsc(); /* We hope get_seconds stays lockless */ m->time = get_seconds(); m->cpuvendor = boot_cpu_data.x86_vendor; @@ -710,14 +709,8 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) mce_gather_info(&m, NULL); - /* - * m.tsc was set in mce_setup(). Clear it if not requested. - * - * FIXME: Propagate @flags to mce_gather_info/mce_setup() to avoid - * that dance. 
- */ - if (!(flags & MCP_TIMESTAMP)) - m.tsc = 0; + if (flags & MCP_TIMESTAMP) + m.tsc = rdtsc(); for (i = 0; i < mca_cfg.banks; i++) { if (!mce_banks[i].ctl || !test_bit(i, *b)) @@ -1156,6 +1149,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) goto out; mce_gather_info(&m, regs); + m.tsc = rdtsc(); final = this_cpu_ptr(&mces_seen); *final = m; diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 776379e4a39c..9e5427df3243 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -778,7 +778,8 @@ __log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc) mce_setup(&m); m.status = status; - m.bank = bank; + m.bank = bank; + m.tsc = rdtsc(); if (threshold_err) m.misc = misc; -- cgit From bd43f60a260c83cbc9befd7d710a3f2bfd3b2dd2 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 23 Jan 2017 19:35:10 +0100 Subject: x86/ras/amd/inj: Change dependency Change dependency to mce.c as we're using mce_inject_log() now to stick an MCE into the MCA subsystem. Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: Yazen Ghannam Cc: linux-edac Link: http://lkml.kernel.org/r/20170123183514.13356-6-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/ras/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/ras/Kconfig b/arch/x86/ras/Kconfig index d957d5f21a86..0bc60a308730 100644 --- a/arch/x86/ras/Kconfig +++ b/arch/x86/ras/Kconfig @@ -1,6 +1,6 @@ config MCE_AMD_INJ tristate "Simple MCE injection interface for AMD processors" - depends on RAS && EDAC_DECODE_MCE && DEBUG_FS && AMD_NB + depends on RAS && X86_MCE && DEBUG_FS && AMD_NB default n help This is a simple debugfs interface to inject MCEs and test different -- cgit From cff4c0391a692cf9b89932c62a7f879fb3637148 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 23 Jan 2017 19:35:13 +0100 Subject: x86/ras: Get rid of mce_process_work() Make mce_gen_pool_process() the workqueue function directly and save us an indirection. 
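[ Illustration, not part of the original patch: a workqueue callback
  must have the void (*)(struct work_struct *) signature, which is the
  only reason the trivial wrapper existed.  Once the worker accepts
  (and ignores) the work_struct argument it can be wired up directly.
  Kernel-style sketch, all names invented:

    #include <linux/workqueue.h>

    static void process_pending_events(struct work_struct *unused)
    {
            /* drain whatever list the real code maintains */
    }

    /* callback used directly -- no trampoline function needed */
    static DECLARE_WORK(event_work, process_pending_events);

    static void kick_event_processing(void)
    {
            schedule_work(&event_work);
    }
]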
Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: Yazen Ghannam Cc: linux-edac Link: http://lkml.kernel.org/r/20170123183514.13356-9-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce-genpool.c | 2 +- arch/x86/kernel/cpu/mcheck/mce-internal.h | 2 +- arch/x86/kernel/cpu/mcheck/mce.c | 12 +----------- 3 files changed, 3 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce-genpool.c b/arch/x86/kernel/cpu/mcheck/mce-genpool.c index 93d824ec3120..1e5a50c11d3c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-genpool.c +++ b/arch/x86/kernel/cpu/mcheck/mce-genpool.c @@ -72,7 +72,7 @@ struct llist_node *mce_gen_pool_prepare_records(void) return new_head.first; } -void mce_gen_pool_process(void) +void mce_gen_pool_process(struct work_struct *__unused) { struct llist_node *head; struct mce_evt_llist *node, *tmp; diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index cd74a3f00aea..903043e6a62b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -31,7 +31,7 @@ struct mce_evt_llist { struct mce mce; }; -void mce_gen_pool_process(void); +void mce_gen_pool_process(struct work_struct *__unused); bool mce_gen_pool_empty(void); int mce_gen_pool_add(struct mce *mce); int mce_gen_pool_init(void); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index ca15a7e1f97d..0fef5406f0eb 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1315,16 +1315,6 @@ int memory_failure(unsigned long pfn, int vector, int flags) } #endif -/* - * Action optional processing happens here (picking up - * from the list of faulting pages that do_machine_check() - * placed into the genpool). - */ -static void mce_process_work(struct work_struct *dummy) -{ - mce_gen_pool_process(); -} - /* * Periodic polling timer for "silent" machine check errors. If the * poller finds an MCE, poll 2x faster. When the poller finds no more @@ -2165,7 +2155,7 @@ int __init mcheck_init(void) mce_register_decode_chain(&mce_default_nb); mcheck_vendor_init_severity(); - INIT_WORK(&mce_work, mce_process_work); + INIT_WORK(&mce_work, mce_gen_pool_process); init_irq_work(&mce_irq_work, mce_irq_work_cb); return 0; -- cgit From 9026cc82b632ed1a859935c82ed8ad65f27f2781 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 23 Jan 2017 19:35:14 +0100 Subject: x86/ras, EDAC, acpi: Assign MCE notifier handlers a priority Assign all notifiers on the MCE decode chain a priority so that they get called in the correct order. 
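[ Illustration, not part of the original patch: (atomic) notifier
  chains invoke callbacks in descending ->priority order, so giving
  every block an explicit priority makes the call order independent of
  registration order.  Sketch with invented names:

    #include <linux/kernel.h>
    #include <linux/notifier.h>

    static int decode_first(struct notifier_block *nb, unsigned long val,
                            void *data)
    {
            return NOTIFY_OK;       /* highest priority: runs first */
    }

    static int decode_last(struct notifier_block *nb, unsigned long val,
                           void *data)
    {
            return NOTIFY_DONE;     /* priority 0: runs last */
    }

    static struct notifier_block first_nb = {
            .notifier_call  = decode_first,
            .priority       = INT_MAX,
    };

    static struct notifier_block last_nb = {
            .notifier_call  = decode_last,
            .priority       = 0,
    };

    static ATOMIC_NOTIFIER_HEAD(example_chain);

    static void example_register(void)
    {
            /* registration order no longer matters; priority decides */
            atomic_notifier_chain_register(&example_chain, &last_nb);
            atomic_notifier_chain_register(&example_chain, &first_nb);
    }
]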
Suggested-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Tony Luck Cc: Yazen Ghannam Cc: linux-edac Link: http://lkml.kernel.org/r/20170123183514.13356-10-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mce.h | 9 +++++++++ arch/x86/kernel/cpu/mcheck/mce.c | 8 +++----- 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 528f6ec897cb..e63873683d4a 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -189,6 +189,15 @@ extern struct mce_vendor_flags mce_flags; extern struct mca_config mca_cfg; extern struct mca_msr_regs msr_ops; + +enum mce_notifier_prios { + MCE_PRIO_SRAO = INT_MAX, + MCE_PRIO_EXTLOG = INT_MAX - 1, + MCE_PRIO_NFIT = INT_MAX - 2, + MCE_PRIO_EDAC = INT_MAX - 3, + MCE_PRIO_LOWEST = 0, +}; + extern void mce_register_decode_chain(struct notifier_block *nb); extern void mce_unregister_decode_chain(struct notifier_block *nb); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 0fef5406f0eb..e39bbc0e7c8b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -216,9 +216,7 @@ void mce_register_decode_chain(struct notifier_block *nb) { atomic_inc(&num_notifiers); - /* Ensure SRAO notifier has the highest priority in the decode chain. */ - if (nb != &mce_srao_nb && nb->priority == INT_MAX) - nb->priority -= 1; + WARN_ON(nb->priority > MCE_PRIO_LOWEST && nb->priority < MCE_PRIO_EDAC); atomic_notifier_chain_register(&x86_mce_decoder_chain, nb); } @@ -582,7 +580,7 @@ static int srao_decode_notifier(struct notifier_block *nb, unsigned long val, } static struct notifier_block mce_srao_nb = { .notifier_call = srao_decode_notifier, - .priority = INT_MAX, + .priority = MCE_PRIO_SRAO, }; static int mce_default_notifier(struct notifier_block *nb, unsigned long val, @@ -608,7 +606,7 @@ static int mce_default_notifier(struct notifier_block *nb, unsigned long val, static struct notifier_block mce_default_nb = { .notifier_call = mce_default_notifier, /* lowest prio, we want it to run last. */ - .priority = 0, + .priority = MCE_PRIO_LOWEST, }; /* -- cgit From 5299709d0a87342dadc1fc9850484fadeb488bf8 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 20 Jan 2017 13:04:01 -0800 Subject: treewide: Constify most dma_map_ops structures Most dma_map_ops structures are never modified. Constify these structures such that these can be write-protected. 
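[ Illustration, not part of the original patch: the point of the const
  qualifier is that a statically initialized, never-modified ops table
  can be emitted into .rodata, which the kernel can map read-only, so a
  stray write to a method pointer faults instead of silently
  redirecting operations.  Tiny sketch with invented names:

    struct example_ops {
            int (*map)(void *buf, unsigned long len);
    };

    static int example_map(void *buf, unsigned long len)
    {
            return 0;
    }

    /* const + static initializer: the compiler places this in .rodata */
    static const struct example_ops example_dma_ops = {
            .map    = example_map,
    };
]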
This patch has been generated as follows: git grep -l 'struct dma_map_ops' | xargs -d\\n sed -i \ -e 's/struct dma_map_ops/const struct dma_map_ops/g' \ -e 's/const struct dma_map_ops {/struct dma_map_ops {/g' \ -e 's/^const struct dma_map_ops;$/struct dma_map_ops;/' \ -e 's/const const struct dma_map_ops /const struct dma_map_ops /g'; sed -i -e 's/const \(struct dma_map_ops intel_dma_ops\)/\1/' \ $(git grep -l 'struct dma_map_ops intel_dma_ops'); sed -i -e 's/const \(struct dma_map_ops dma_iommu_ops\)/\1/' \ $(git grep -l 'struct dma_map_ops' | grep ^arch/powerpc); sed -i -e '/^struct vmd_dev {$/,/^};$/ s/const \(struct dma_map_ops[[:blank:]]dma_ops;\)/\1/' \ -e '/^static void vmd_setup_dma_ops/,/^}$/ s/const \(struct dma_map_ops \*dest\)/\1/' \ -e 's/const \(struct dma_map_ops \*dest = \&vmd->dma_ops\)/\1/' \ drivers/pci/host/*.c sed -i -e '/^void __init pci_iommu_alloc(void)$/,/^}$/ s/dma_ops->/intel_dma_ops./' arch/ia64/kernel/pci-dma.c sed -i -e 's/static const struct dma_map_ops sn_dma_ops/static struct dma_map_ops sn_dma_ops/' arch/ia64/sn/pci/pci_dma.c sed -i -e 's/(const struct dma_map_ops \*)//' drivers/misc/mic/bus/vop_bus.c Signed-off-by: Bart Van Assche Reviewed-by: Christoph Hellwig Cc: Benjamin Herrenschmidt Cc: Boris Ostrovsky Cc: David Woodhouse Cc: Juergen Gross Cc: H. Peter Anvin Cc: Ingo Molnar Cc: linux-arch@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: Russell King Cc: x86@kernel.org Signed-off-by: Doug Ledford --- arch/x86/include/asm/device.h | 4 ++-- arch/x86/include/asm/dma-mapping.h | 4 ++-- arch/x86/include/asm/iommu.h | 2 +- arch/x86/kernel/amd_gart_64.c | 2 +- arch/x86/kernel/pci-calgary_64.c | 2 +- arch/x86/kernel/pci-dma.c | 4 ++-- arch/x86/kernel/pci-nommu.c | 2 +- arch/x86/kernel/pci-swiotlb.c | 2 +- arch/x86/pci/sta2x11-fixup.c | 2 +- arch/x86/xen/pci-swiotlb-xen.c | 2 +- 10 files changed, 13 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h index 684ed6c3aa67..b2d0b4ced7e3 100644 --- a/arch/x86/include/asm/device.h +++ b/arch/x86/include/asm/device.h @@ -3,7 +3,7 @@ struct dev_archdata { #ifdef CONFIG_X86_DEV_DMA_OPS - struct dma_map_ops *dma_ops; + const struct dma_map_ops *dma_ops; #endif #if defined(CONFIG_INTEL_IOMMU) || defined(CONFIG_AMD_IOMMU) void *iommu; /* hook for IOMMU specific extension */ @@ -13,7 +13,7 @@ struct dev_archdata { #if defined(CONFIG_X86_DEV_DMA_OPS) && defined(CONFIG_PCI_DOMAINS) struct dma_domain { struct list_head node; - struct dma_map_ops *dma_ops; + const struct dma_map_ops *dma_ops; int domain_nr; }; void add_dma_domain(struct dma_domain *domain); diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 44461626830e..5e4772886a1e 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -25,9 +25,9 @@ extern int iommu_merge; extern struct device x86_dma_fallback_dev; extern int panic_on_overflow; -extern struct dma_map_ops *dma_ops; +extern const struct dma_map_ops *dma_ops; -static inline struct dma_map_ops *get_dma_ops(struct device *dev) +static inline const struct dma_map_ops *get_dma_ops(struct device *dev) { #ifndef CONFIG_X86_DEV_DMA_OPS return dma_ops; diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h index 345c99cef152..793869879464 100644 --- a/arch/x86/include/asm/iommu.h +++ b/arch/x86/include/asm/iommu.h @@ -1,7 +1,7 @@ #ifndef _ASM_X86_IOMMU_H #define _ASM_X86_IOMMU_H -extern struct dma_map_ops nommu_dma_ops; +extern const struct 
dma_map_ops nommu_dma_ops; extern int force_iommu, no_iommu; extern int iommu_detected; extern int iommu_pass_through; diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index 63ff468a7986..82dfe32faaf4 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -695,7 +695,7 @@ static __init int init_amd_gatt(struct agp_kern_info *info) return -1; } -static struct dma_map_ops gart_dma_ops = { +static const struct dma_map_ops gart_dma_ops = { .map_sg = gart_map_sg, .unmap_sg = gart_unmap_sg, .map_page = gart_map_page, diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 5d400ba1349d..17f180148c80 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -478,7 +478,7 @@ static void calgary_free_coherent(struct device *dev, size_t size, free_pages((unsigned long)vaddr, get_order(size)); } -static struct dma_map_ops calgary_dma_ops = { +static const struct dma_map_ops calgary_dma_ops = { .alloc = calgary_alloc_coherent, .free = calgary_free_coherent, .map_sg = calgary_map_sg, diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index d30c37750765..76f4c039baae 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -17,7 +17,7 @@ static int forbid_dac __read_mostly; -struct dma_map_ops *dma_ops = &nommu_dma_ops; +const struct dma_map_ops *dma_ops = &nommu_dma_ops; EXPORT_SYMBOL(dma_ops); static int iommu_sac_force __read_mostly; @@ -214,7 +214,7 @@ early_param("iommu", iommu_setup); int dma_supported(struct device *dev, u64 mask) { - struct dma_map_ops *ops = get_dma_ops(dev); + const struct dma_map_ops *ops = get_dma_ops(dev); #ifdef CONFIG_PCI if (mask > 0xffffffff && forbid_dac > 0) { diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index 00e71ce396a8..a88952ef371c 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -88,7 +88,7 @@ static void nommu_sync_sg_for_device(struct device *dev, flush_write_buffers(); } -struct dma_map_ops nommu_dma_ops = { +const struct dma_map_ops nommu_dma_ops = { .alloc = dma_generic_alloc_coherent, .free = dma_generic_free_coherent, .map_sg = nommu_map_sg, diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 410efb2c7b80..1e23577e17cf 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -45,7 +45,7 @@ void x86_swiotlb_free_coherent(struct device *dev, size_t size, dma_generic_free_coherent(dev, size, vaddr, dma_addr, attrs); } -static struct dma_map_ops swiotlb_dma_ops = { +static const struct dma_map_ops swiotlb_dma_ops = { .mapping_error = swiotlb_dma_mapping_error, .alloc = x86_swiotlb_alloc_coherent, .free = x86_swiotlb_free_coherent, diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c index 052c1cb76305..aa3828823170 100644 --- a/arch/x86/pci/sta2x11-fixup.c +++ b/arch/x86/pci/sta2x11-fixup.c @@ -179,7 +179,7 @@ static void *sta2x11_swiotlb_alloc_coherent(struct device *dev, } /* We have our own dma_ops: the same as swiotlb but from alloc (above) */ -static struct dma_map_ops sta2x11_dma_ops = { +static const struct dma_map_ops sta2x11_dma_ops = { .alloc = sta2x11_swiotlb_alloc_coherent, .free = x86_swiotlb_free_coherent, .map_page = swiotlb_map_page, diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c index a0b36a9d5df1..42b08f8fc2ca 100644 --- a/arch/x86/xen/pci-swiotlb-xen.c +++ b/arch/x86/xen/pci-swiotlb-xen.c @@ -18,7 +18,7 @@ int xen_swiotlb __read_mostly; 
-static struct dma_map_ops xen_swiotlb_dma_ops = { +static const struct dma_map_ops xen_swiotlb_dma_ops = { .alloc = xen_swiotlb_alloc_coherent, .free = xen_swiotlb_free_coherent, .sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu, -- cgit From 5657933dbb6e25feaf5d8df8c88f96cdade693a3 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 20 Jan 2017 13:04:02 -0800 Subject: treewide: Move dma_ops from struct dev_archdata into struct device Some but not all architectures provide set_dma_ops(). Move dma_ops from struct dev_archdata into struct device such that it becomes possible on all architectures to configure dma_ops per device. Signed-off-by: Bart Van Assche Acked-by: Greg Kroah-Hartman Cc: Benjamin Herrenschmidt Cc: Boris Ostrovsky Cc: David Woodhouse Cc: Juergen Gross Cc: H. Peter Anvin Cc: Ingo Molnar Cc: linux-arch@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: Russell King Cc: x86@kernel.org Signed-off-by: Doug Ledford --- arch/x86/include/asm/device.h | 3 --- arch/x86/include/asm/dma-mapping.h | 4 ++-- arch/x86/kernel/pci-calgary_64.c | 4 ++-- arch/x86/pci/common.c | 2 +- arch/x86/pci/sta2x11-fixup.c | 8 ++++---- 5 files changed, 9 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h index b2d0b4ced7e3..1b3ef26e77df 100644 --- a/arch/x86/include/asm/device.h +++ b/arch/x86/include/asm/device.h @@ -2,9 +2,6 @@ #define _ASM_X86_DEVICE_H struct dev_archdata { -#ifdef CONFIG_X86_DEV_DMA_OPS - const struct dma_map_ops *dma_ops; -#endif #if defined(CONFIG_INTEL_IOMMU) || defined(CONFIG_AMD_IOMMU) void *iommu; /* hook for IOMMU specific extension */ #endif diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 5e4772886a1e..94b5b96966cb 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -32,10 +32,10 @@ static inline const struct dma_map_ops *get_dma_ops(struct device *dev) #ifndef CONFIG_X86_DEV_DMA_OPS return dma_ops; #else - if (unlikely(!dev) || !dev->archdata.dma_ops) + if (unlikely(!dev) || !dev->dma_ops) return dma_ops; else - return dev->archdata.dma_ops; + return dev->dma_ops; #endif } diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 17f180148c80..5070320780c6 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -1177,7 +1177,7 @@ static int __init calgary_init(void) tbl = find_iommu_table(&dev->dev); if (translation_enabled(tbl)) - dev->dev.archdata.dma_ops = &calgary_dma_ops; + dev->dev.dma_ops = &calgary_dma_ops; } return ret; @@ -1201,7 +1201,7 @@ error: calgary_disable_translation(dev); calgary_free_bus(dev); pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */ - dev->dev.archdata.dma_ops = NULL; + dev->dev.dma_ops = NULL; } while (1); return ret; diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index a4fdfa7dcc1b..0cb52ae0a8f0 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -667,7 +667,7 @@ static void set_dma_domain_ops(struct pci_dev *pdev) spin_lock(&dma_domain_list_lock); list_for_each_entry(domain, &dma_domain_list, node) { if (pci_domain_nr(pdev->bus) == domain->domain_nr) { - pdev->dev.archdata.dma_ops = domain->dma_ops; + pdev->dev.dma_ops = domain->dma_ops; break; } } diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c index aa3828823170..ec008e800b45 100644 --- a/arch/x86/pci/sta2x11-fixup.c +++ b/arch/x86/pci/sta2x11-fixup.c @@ -203,7 +203,7 @@ static void 
sta2x11_setup_pdev(struct pci_dev *pdev) return; pci_set_consistent_dma_mask(pdev, STA2X11_AMBA_SIZE - 1); pci_set_dma_mask(pdev, STA2X11_AMBA_SIZE - 1); - pdev->dev.archdata.dma_ops = &sta2x11_dma_ops; + pdev->dev.dma_ops = &sta2x11_dma_ops; /* We must enable all devices as master, for audio DMA to work */ pci_set_master(pdev); @@ -223,7 +223,7 @@ bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) { struct sta2x11_mapping *map; - if (dev->archdata.dma_ops != &sta2x11_dma_ops) { + if (dev->dma_ops != &sta2x11_dma_ops) { if (!dev->dma_mask) return false; return addr + size - 1 <= *dev->dma_mask; @@ -247,7 +247,7 @@ bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) */ dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) { - if (dev->archdata.dma_ops != &sta2x11_dma_ops) + if (dev->dma_ops != &sta2x11_dma_ops) return paddr; return p2a(paddr, to_pci_dev(dev)); } @@ -259,7 +259,7 @@ dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) */ phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr) { - if (dev->archdata.dma_ops != &sta2x11_dma_ops) + if (dev->dma_ops != &sta2x11_dma_ops) return daddr; return a2p(daddr, to_pci_dev(dev)); } -- cgit From 815dd18788fe0d41899f51b91d0560279cf16b0d Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 20 Jan 2017 13:04:04 -0800 Subject: treewide: Consolidate get_dma_ops() implementations Introduce a new architecture-specific get_arch_dma_ops() function that takes a struct bus_type * argument. Add get_dma_ops() in . Signed-off-by: Bart Van Assche Cc: Benjamin Herrenschmidt Cc: Boris Ostrovsky Cc: David Woodhouse Cc: Juergen Gross Cc: H. Peter Anvin Cc: Ingo Molnar Cc: linux-arch@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: Russell King Cc: x86@kernel.org Signed-off-by: Doug Ledford --- arch/x86/include/asm/dma-mapping.h | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 94b5b96966cb..08a0838b83fb 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -27,16 +27,9 @@ extern int panic_on_overflow; extern const struct dma_map_ops *dma_ops; -static inline const struct dma_map_ops *get_dma_ops(struct device *dev) +static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) { -#ifndef CONFIG_X86_DEV_DMA_OPS return dma_ops; -#else - if (unlikely(!dev) || !dev->dma_ops) - return dma_ops; - else - return dev->dma_ops; -#endif } bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp); -- cgit From a5828ed3d03d38399159dc17a98cefde3109a66b Mon Sep 17 00:00:00 2001 From: Yu-cheng Yu Date: Tue, 24 Jan 2017 10:25:46 -0800 Subject: x86/fpu/xstate: Move XSAVES state init to a function Make XSTATE init similar to existing code; move it to a separate function. There is no functionality change. Signed-off-by: Yu-cheng Yu Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Ravi V. Shankar Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1485282346-15437-1-git-send-email-yu-cheng.yu@intel.com [ Minor cleanliness edits. 
] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 10 ++++++++++ arch/x86/kernel/fpu/core.c | 9 +-------- 2 files changed, 11 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index d4a684997497..255645f60ca2 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -87,6 +87,16 @@ extern void fpstate_init_soft(struct swregs_state *soft); #else static inline void fpstate_init_soft(struct swregs_state *soft) {} #endif + +static inline void fpstate_init_xstate(struct xregs_state *xsave) +{ + /* + * XRSTORS requires these bits set in xcomp_bv, or it will + * trigger #GP: + */ + xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xfeatures_mask; +} + static inline void fpstate_init_fxstate(struct fxregs_state *fx) { fx->cwd = 0x37f; diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index de7234401275..e1114f070c2d 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include @@ -179,14 +178,8 @@ void fpstate_init(union fpregs_state *state) memset(state, 0, fpu_kernel_xstate_size); - /* - * XRSTORS requires that this bit is set in xcomp_bv, or - * it will #GP. Make sure it is replaced after the memset(). - */ if (static_cpu_has(X86_FEATURE_XSAVES)) - state->xsave.header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | - xfeatures_mask; - + fpstate_init_xstate(&state->xsave); if (static_cpu_has(X86_FEATURE_FXSR)) fpstate_init_fxstate(&state->fxsave); else -- cgit From 78d1b296843a719935277b0d24d1358416ed0204 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 18 Jan 2017 11:15:37 -0800 Subject: x86/cpu: Add X86_FEATURE_CPUID Add a synthetic CPUID flag denoting whether the CPU sports the CPUID instruction or not. This will come useful later when accomodating CPUID-less CPUs. Signed-off-by: Borislav Petkov [ Slightly prettified. ] Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Fenghua Yu Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Matthew Whitehead Cc: Oleg Nesterov Cc: One Thousand Gnomes Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/dcb355adae3ab812c79397056a61c212f1a0c7cc.1484705016.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeatures.h | 2 +- arch/x86/kernel/cpu/common.c | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index eafee3161d1c..a0f4ff25669e 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -100,7 +100,7 @@ #define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* cpu topology enum extensions */ #define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */ #define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */ -/* free, was #define X86_FEATURE_CLFLUSH_MONITOR ( 3*32+25) * "" clflush reqd with monitor */ +#define X86_FEATURE_CPUID ( 3*32+25) /* CPU has CPUID instruction itself */ #define X86_FEATURE_EXTD_APICID ( 3*32+26) /* has extended APICID (8 bits) */ #define X86_FEATURE_AMD_DCM ( 3*32+27) /* multi-node processor */ #define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 9bab7a8a4293..c3328d0e13f0 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -801,14 +801,12 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) memset(&c->x86_capability, 0, sizeof c->x86_capability); c->extended_cpuid_level = 0; - if (!have_cpuid_p()) - identify_cpu_without_cpuid(c); - /* cyrix could have cpuid enabled via c_identify()*/ if (have_cpuid_p()) { cpu_detect(c); get_cpu_vendor(c); get_cpu_cap(c); + setup_force_cpu_cap(X86_FEATURE_CPUID); if (this_cpu->c_early_init) this_cpu->c_early_init(c); @@ -818,6 +816,9 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) if (this_cpu->c_bsp_init) this_cpu->c_bsp_init(c); + } else { + identify_cpu_without_cpuid(c); + setup_clear_cpu_cap(X86_FEATURE_CPUID); } setup_force_cpu_cap(X86_FEATURE_ALWAYS); -- cgit From 8bf1ebca215c262e48c15a4a15f175991776f57f Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 18 Jan 2017 11:15:38 -0800 Subject: x86/cpu: Factor out application of forced CPU caps There are multiple call sites that apply forced CPU caps. Factor them into a helper. Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Fenghua Yu Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Matthew Whitehead Cc: Oleg Nesterov Cc: One Thousand Gnomes Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/623ff7555488122143e4417de09b18be2085ad06.1484705016.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c3328d0e13f0..2ea16e05de43 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -655,6 +655,16 @@ void cpu_detect(struct cpuinfo_x86 *c) } } +static void apply_forced_caps(struct cpuinfo_x86 *c) +{ + int i; + + for (i = 0; i < NCAPINTS; i++) { + c->x86_capability[i] &= ~cpu_caps_cleared[i]; + c->x86_capability[i] |= cpu_caps_set[i]; + } +} + void get_cpu_cap(struct cpuinfo_x86 *c) { u32 eax, ebx, ecx, edx; @@ -1035,10 +1045,7 @@ static void identify_cpu(struct cpuinfo_x86 *c) this_cpu->c_identify(c); /* Clear/Set all flags overridden by options, after probe */ - for (i = 0; i < NCAPINTS; i++) { - c->x86_capability[i] &= ~cpu_caps_cleared[i]; - c->x86_capability[i] |= cpu_caps_set[i]; - } + apply_forced_caps(c); #ifdef CONFIG_X86_64 c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); @@ -1097,10 +1104,7 @@ static void identify_cpu(struct cpuinfo_x86 *c) * Clear/Set all flags overridden by options, need do it * before following smp all cpus cap AND. */ - for (i = 0; i < NCAPINTS; i++) { - c->x86_capability[i] &= ~cpu_caps_cleared[i]; - c->x86_capability[i] |= cpu_caps_set[i]; - } + apply_forced_caps(c); /* * On SMP, boot_cpu_data holds the common feature set between -- cgit From 60d3450167433f2d099ce2869dc52dd9e7dc9b29 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 18 Jan 2017 11:15:39 -0800 Subject: x86/cpu: Re-apply forced caps every time CPU caps are re-read Calling get_cpu_cap() will reset a bunch of CPU features. This will cause the system to lose track of force-set and force-cleared features in the words that are reset until the end of CPU initialization. This can cause X86_FEATURE_FPU, for example, to change back and forth during boot and potentially confuse CPU setup. To minimize the chance of confusion, re-apply forced caps every time get_cpu_cap() is called. Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Matthew Whitehead Cc: Oleg Nesterov Cc: One Thousand Gnomes Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/c817eb373d2c67c2c81413a70fc9b845fa34a37e.1484705016.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 2ea16e05de43..d09b5eefaddf 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -758,6 +758,13 @@ void get_cpu_cap(struct cpuinfo_x86 *c) c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a); init_scattered_cpuid_features(c); + + /* + * Clear/Set all flags overridden by options, after probe. + * This needs to happen each time we re-probe, which may happen + * several times during CPU initialization. 
+ */ + apply_forced_caps(c); } static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) -- cgit From 9170fb409437b246078bcad3f1481d32dfe2ca28 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 18 Jan 2017 11:15:40 -0800 Subject: x86/fpu: Fix "x86/fpu: Legacy x87 FPU detected" message That message isn't at all clear -- what does "Legacy x87" even mean? Clarify it. If there's no FPU, say: x86/fpu: No FPU detected If there's an FPU that doesn't have XSAVE, say: x86/fpu: x87 FPU will use FSAVE|FXSAVE Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Matthew Whitehead Cc: Oleg Nesterov Cc: One Thousand Gnomes Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/bb839385e18e27bca23fe8666dfdad8170473045.1484705016.git.luto@kernel.org [ Small tweaks to the messages. ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 1d7770447b3e..96002f19c113 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -705,8 +705,14 @@ void __init fpu__init_system_xstate(void) WARN_ON_FPU(!on_boot_cpu); on_boot_cpu = 0; + if (!boot_cpu_has(X86_FEATURE_FPU)) { + pr_info("x86/fpu: No FPU detected\n"); + return; + } + if (!boot_cpu_has(X86_FEATURE_XSAVE)) { - pr_info("x86/fpu: Legacy x87 FPU detected.\n"); + pr_info("x86/fpu: x87 FPU will use %s\n", + boot_cpu_has(X86_FEATURE_FXSR) ? "FXSAVE" : "FSAVE"); return; } -- cgit From 37ac78b67b3384d1ced5424d5a13ee146041bda3 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 18 Jan 2017 11:15:41 -0800 Subject: x86/fpu: Fix CPUID-less FPU detection The old code didn't work at all because it adjusted the current caps instead of the forced caps. Anything it did would be undone later during CPU identification. Fix that and, while we're at it, improve the logging and don't bother running it if CPUID is available. Reported-by: Matthew Whitehead Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: One Thousand Gnomes Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/f1134e30cafa73c4e2e68119e9741793622cfd15.1484705016.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/init.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 60dece392b3a..8b526c5fc306 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -48,13 +48,7 @@ void fpu__init_cpu(void) fpu__init_cpu_xstate(); } -/* - * The earliest FPU detection code. 
- * - * Set the X86_FEATURE_FPU CPU-capability bit based on - * trying to execute an actual sequence of FPU instructions: - */ -static void fpu__init_system_early_generic(struct cpuinfo_x86 *c) +static bool fpu__probe_without_cpuid(void) { unsigned long cr0; u16 fsw, fcw; @@ -65,14 +59,21 @@ static void fpu__init_system_early_generic(struct cpuinfo_x86 *c) cr0 &= ~(X86_CR0_TS | X86_CR0_EM); write_cr0(cr0); - if (!test_bit(X86_FEATURE_FPU, (unsigned long *)cpu_caps_cleared)) { - asm volatile("fninit ; fnstsw %0 ; fnstcw %1" - : "+m" (fsw), "+m" (fcw)); + asm volatile("fninit ; fnstsw %0 ; fnstcw %1" : "+m" (fsw), "+m" (fcw)); + + pr_info("x86/fpu: Probing for FPU: FSW=0x%04hx FCW=0x%04hx\n", fsw, fcw); - if (fsw == 0 && (fcw & 0x103f) == 0x003f) - set_cpu_cap(c, X86_FEATURE_FPU); + return fsw == 0 && (fcw & 0x103f) == 0x003f; +} + +static void fpu__init_system_early_generic(struct cpuinfo_x86 *c) +{ + if (!boot_cpu_has(X86_FEATURE_CPUID) && + !test_bit(X86_FEATURE_FPU, (unsigned long *)cpu_caps_cleared)) { + if (fpu__probe_without_cpuid()) + setup_force_cpu_cap(X86_FEATURE_FPU); else - clear_cpu_cap(c, X86_FEATURE_FPU); + setup_clear_cpu_cap(X86_FEATURE_FPU); } #ifndef CONFIG_MATH_EMULATION -- cgit From 9729017f844431ab2800519297d8d1b0ecbc420d Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 18 Jan 2017 11:15:42 -0800 Subject: x86/fpu: Fix the "Giving up, no FPU found" test We would never print "Giving up, no FPU found" because X86_FEATURE_FPU was in REQUIRED_MASK on non-FPU-emulating builds, so the boot_cpu_has() test didn't do anything. Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Matthew Whitehead Cc: Oleg Nesterov Cc: One Thousand Gnomes Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/1499077fa76f0f84b8ea28e37d3fa70beca4e310.1484705016.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 8b526c5fc306..19bdd1bf8160 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -77,7 +77,7 @@ static void fpu__init_system_early_generic(struct cpuinfo_x86 *c) } #ifndef CONFIG_MATH_EMULATION - if (!boot_cpu_has(X86_FEATURE_FPU)) { + if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_FPU)) { pr_emerg("x86/fpu: Giving up, no FPU found and no math emulation present\n"); for (;;) asm volatile("hlt"); -- cgit From 80a7581f38c0b2e83dc883a2125340b90b5635ec Mon Sep 17 00:00:00 2001 From: Irina Tirdea Date: Mon, 23 Jan 2017 12:07:43 -0600 Subject: arch/x86/platform/atom: Move pmc_atom to drivers/platform/x86 The pmc_atom driver does not contain any architecture specific code. It only enables the SoC Power Management Controller driver for BayTrail and CherryTrail platforms. Move the pmc_atom driver from arch/x86/platform/atom to drivers/platform/x86. 
Also clean-up and reorder include files by alphabetical order in pmc_atom.h Signed-off-by: Irina Tirdea Signed-off-by: Pierre-Louis Bossart Acked-by: Thomas Gleixner Acked-by: Andy Shevchenko Signed-off-by: Stephen Boyd --- arch/x86/Kconfig | 4 - arch/x86/include/asm/pmc_atom.h | 158 ------------- arch/x86/platform/atom/Makefile | 1 - arch/x86/platform/atom/pmc_atom.c | 460 -------------------------------------- 4 files changed, 623 deletions(-) delete mode 100644 arch/x86/include/asm/pmc_atom.h delete mode 100644 arch/x86/platform/atom/pmc_atom.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e487493bbd47..7b4f1789f386 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2789,10 +2789,6 @@ config X86_DMA_REMAP bool depends on STA2X11 -config PMC_ATOM - def_bool y - depends on PCI - source "net/Kconfig" source "drivers/Kconfig" diff --git a/arch/x86/include/asm/pmc_atom.h b/arch/x86/include/asm/pmc_atom.h deleted file mode 100644 index aa8744c77c6d..000000000000 --- a/arch/x86/include/asm/pmc_atom.h +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Intel Atom SOC Power Management Controller Header File - * Copyright (c) 2014, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - */ - -#ifndef PMC_ATOM_H -#define PMC_ATOM_H - -/* ValleyView Power Control Unit PCI Device ID */ -#define PCI_DEVICE_ID_VLV_PMC 0x0F1C -/* CherryTrail Power Control Unit PCI Device ID */ -#define PCI_DEVICE_ID_CHT_PMC 0x229C - -/* PMC Memory mapped IO registers */ -#define PMC_BASE_ADDR_OFFSET 0x44 -#define PMC_BASE_ADDR_MASK 0xFFFFFE00 -#define PMC_MMIO_REG_LEN 0x100 -#define PMC_REG_BIT_WIDTH 32 - -/* BIOS uses FUNC_DIS to disable specific function */ -#define PMC_FUNC_DIS 0x34 -#define PMC_FUNC_DIS_2 0x38 - -/* CHT specific bits in FUNC_DIS2 register */ -#define BIT_FD_GMM BIT(3) -#define BIT_FD_ISH BIT(4) - -/* S0ix wake event control */ -#define PMC_S0IX_WAKE_EN 0x3C - -#define BIT_LPC_CLOCK_RUN BIT(4) -#define BIT_SHARED_IRQ_GPSC BIT(5) -#define BIT_ORED_DEDICATED_IRQ_GPSS BIT(18) -#define BIT_ORED_DEDICATED_IRQ_GPSC BIT(19) -#define BIT_SHARED_IRQ_GPSS BIT(20) - -#define PMC_WAKE_EN_SETTING ~(BIT_LPC_CLOCK_RUN | \ - BIT_SHARED_IRQ_GPSC | \ - BIT_ORED_DEDICATED_IRQ_GPSS | \ - BIT_ORED_DEDICATED_IRQ_GPSC | \ - BIT_SHARED_IRQ_GPSS) - -/* The timers acumulate time spent in sleep state */ -#define PMC_S0IR_TMR 0x80 -#define PMC_S0I1_TMR 0x84 -#define PMC_S0I2_TMR 0x88 -#define PMC_S0I3_TMR 0x8C -#define PMC_S0_TMR 0x90 -/* Sleep state counter is in units of of 32us */ -#define PMC_TMR_SHIFT 5 - -/* Power status of power islands */ -#define PMC_PSS 0x98 - -#define PMC_PSS_BIT_GBE BIT(0) -#define PMC_PSS_BIT_SATA BIT(1) -#define PMC_PSS_BIT_HDA BIT(2) -#define PMC_PSS_BIT_SEC BIT(3) -#define PMC_PSS_BIT_PCIE BIT(4) -#define PMC_PSS_BIT_LPSS BIT(5) -#define PMC_PSS_BIT_LPE BIT(6) -#define PMC_PSS_BIT_DFX BIT(7) -#define PMC_PSS_BIT_USH_CTRL BIT(8) -#define PMC_PSS_BIT_USH_SUS BIT(9) -#define PMC_PSS_BIT_USH_VCCS BIT(10) -#define PMC_PSS_BIT_USH_VCCA BIT(11) -#define PMC_PSS_BIT_OTG_CTRL BIT(12) -#define PMC_PSS_BIT_OTG_VCCS BIT(13) -#define PMC_PSS_BIT_OTG_VCCA_CLK 
BIT(14) -#define PMC_PSS_BIT_OTG_VCCA BIT(15) -#define PMC_PSS_BIT_USB BIT(16) -#define PMC_PSS_BIT_USB_SUS BIT(17) - -/* CHT specific bits in PSS register */ -#define PMC_PSS_BIT_CHT_UFS BIT(7) -#define PMC_PSS_BIT_CHT_UXD BIT(11) -#define PMC_PSS_BIT_CHT_UXD_FD BIT(12) -#define PMC_PSS_BIT_CHT_UX_ENG BIT(15) -#define PMC_PSS_BIT_CHT_USB_SUS BIT(16) -#define PMC_PSS_BIT_CHT_GMM BIT(17) -#define PMC_PSS_BIT_CHT_ISH BIT(18) -#define PMC_PSS_BIT_CHT_DFX_MASTER BIT(26) -#define PMC_PSS_BIT_CHT_DFX_CLUSTER1 BIT(27) -#define PMC_PSS_BIT_CHT_DFX_CLUSTER2 BIT(28) -#define PMC_PSS_BIT_CHT_DFX_CLUSTER3 BIT(29) -#define PMC_PSS_BIT_CHT_DFX_CLUSTER4 BIT(30) -#define PMC_PSS_BIT_CHT_DFX_CLUSTER5 BIT(31) - -/* These registers reflect D3 status of functions */ -#define PMC_D3_STS_0 0xA0 - -#define BIT_LPSS1_F0_DMA BIT(0) -#define BIT_LPSS1_F1_PWM1 BIT(1) -#define BIT_LPSS1_F2_PWM2 BIT(2) -#define BIT_LPSS1_F3_HSUART1 BIT(3) -#define BIT_LPSS1_F4_HSUART2 BIT(4) -#define BIT_LPSS1_F5_SPI BIT(5) -#define BIT_LPSS1_F6_XXX BIT(6) -#define BIT_LPSS1_F7_XXX BIT(7) -#define BIT_SCC_EMMC BIT(8) -#define BIT_SCC_SDIO BIT(9) -#define BIT_SCC_SDCARD BIT(10) -#define BIT_SCC_MIPI BIT(11) -#define BIT_HDA BIT(12) -#define BIT_LPE BIT(13) -#define BIT_OTG BIT(14) -#define BIT_USH BIT(15) -#define BIT_GBE BIT(16) -#define BIT_SATA BIT(17) -#define BIT_USB_EHCI BIT(18) -#define BIT_SEC BIT(19) -#define BIT_PCIE_PORT0 BIT(20) -#define BIT_PCIE_PORT1 BIT(21) -#define BIT_PCIE_PORT2 BIT(22) -#define BIT_PCIE_PORT3 BIT(23) -#define BIT_LPSS2_F0_DMA BIT(24) -#define BIT_LPSS2_F1_I2C1 BIT(25) -#define BIT_LPSS2_F2_I2C2 BIT(26) -#define BIT_LPSS2_F3_I2C3 BIT(27) -#define BIT_LPSS2_F4_I2C4 BIT(28) -#define BIT_LPSS2_F5_I2C5 BIT(29) -#define BIT_LPSS2_F6_I2C6 BIT(30) -#define BIT_LPSS2_F7_I2C7 BIT(31) - -#define PMC_D3_STS_1 0xA4 -#define BIT_SMB BIT(0) -#define BIT_OTG_SS_PHY BIT(1) -#define BIT_USH_SS_PHY BIT(2) -#define BIT_DFX BIT(3) - -/* CHT specific bits in PMC_D3_STS_1 register */ -#define BIT_STS_GMM BIT(1) -#define BIT_STS_ISH BIT(2) - -/* PMC I/O Registers */ -#define ACPI_BASE_ADDR_OFFSET 0x40 -#define ACPI_BASE_ADDR_MASK 0xFFFFFE00 -#define ACPI_MMIO_REG_LEN 0x100 - -#define PM1_CNT 0x4 -#define SLEEP_TYPE_MASK 0xFFFFECFF -#define SLEEP_TYPE_S5 0x1C00 -#define SLEEP_ENABLE 0x2000 - -extern int pmc_atom_read(int offset, u32 *value); -extern int pmc_atom_write(int offset, u32 value); - -#endif /* PMC_ATOM_H */ diff --git a/arch/x86/platform/atom/Makefile b/arch/x86/platform/atom/Makefile index 40983f5b0858..57be88fa34bb 100644 --- a/arch/x86/platform/atom/Makefile +++ b/arch/x86/platform/atom/Makefile @@ -1,2 +1 @@ -obj-$(CONFIG_PMC_ATOM) += pmc_atom.o obj-$(CONFIG_PUNIT_ATOM_DEBUG) += punit_atom_debug.o diff --git a/arch/x86/platform/atom/pmc_atom.c b/arch/x86/platform/atom/pmc_atom.c deleted file mode 100644 index 964ff4fc61f9..000000000000 --- a/arch/x86/platform/atom/pmc_atom.c +++ /dev/null @@ -1,460 +0,0 @@ -/* - * Intel Atom SOC Power Management Controller Driver - * Copyright (c) 2014, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. 
- * - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include - -#include - -struct pmc_bit_map { - const char *name; - u32 bit_mask; -}; - -struct pmc_reg_map { - const struct pmc_bit_map *d3_sts_0; - const struct pmc_bit_map *d3_sts_1; - const struct pmc_bit_map *func_dis; - const struct pmc_bit_map *func_dis_2; - const struct pmc_bit_map *pss; -}; - -struct pmc_dev { - u32 base_addr; - void __iomem *regmap; - const struct pmc_reg_map *map; -#ifdef CONFIG_DEBUG_FS - struct dentry *dbgfs_dir; -#endif /* CONFIG_DEBUG_FS */ - bool init; -}; - -static struct pmc_dev pmc_device; -static u32 acpi_base_addr; - -static const struct pmc_bit_map d3_sts_0_map[] = { - {"LPSS1_F0_DMA", BIT_LPSS1_F0_DMA}, - {"LPSS1_F1_PWM1", BIT_LPSS1_F1_PWM1}, - {"LPSS1_F2_PWM2", BIT_LPSS1_F2_PWM2}, - {"LPSS1_F3_HSUART1", BIT_LPSS1_F3_HSUART1}, - {"LPSS1_F4_HSUART2", BIT_LPSS1_F4_HSUART2}, - {"LPSS1_F5_SPI", BIT_LPSS1_F5_SPI}, - {"LPSS1_F6_Reserved", BIT_LPSS1_F6_XXX}, - {"LPSS1_F7_Reserved", BIT_LPSS1_F7_XXX}, - {"SCC_EMMC", BIT_SCC_EMMC}, - {"SCC_SDIO", BIT_SCC_SDIO}, - {"SCC_SDCARD", BIT_SCC_SDCARD}, - {"SCC_MIPI", BIT_SCC_MIPI}, - {"HDA", BIT_HDA}, - {"LPE", BIT_LPE}, - {"OTG", BIT_OTG}, - {"USH", BIT_USH}, - {"GBE", BIT_GBE}, - {"SATA", BIT_SATA}, - {"USB_EHCI", BIT_USB_EHCI}, - {"SEC", BIT_SEC}, - {"PCIE_PORT0", BIT_PCIE_PORT0}, - {"PCIE_PORT1", BIT_PCIE_PORT1}, - {"PCIE_PORT2", BIT_PCIE_PORT2}, - {"PCIE_PORT3", BIT_PCIE_PORT3}, - {"LPSS2_F0_DMA", BIT_LPSS2_F0_DMA}, - {"LPSS2_F1_I2C1", BIT_LPSS2_F1_I2C1}, - {"LPSS2_F2_I2C2", BIT_LPSS2_F2_I2C2}, - {"LPSS2_F3_I2C3", BIT_LPSS2_F3_I2C3}, - {"LPSS2_F3_I2C4", BIT_LPSS2_F4_I2C4}, - {"LPSS2_F5_I2C5", BIT_LPSS2_F5_I2C5}, - {"LPSS2_F6_I2C6", BIT_LPSS2_F6_I2C6}, - {"LPSS2_F7_I2C7", BIT_LPSS2_F7_I2C7}, - {}, -}; - -static struct pmc_bit_map byt_d3_sts_1_map[] = { - {"SMB", BIT_SMB}, - {"OTG_SS_PHY", BIT_OTG_SS_PHY}, - {"USH_SS_PHY", BIT_USH_SS_PHY}, - {"DFX", BIT_DFX}, - {}, -}; - -static struct pmc_bit_map cht_d3_sts_1_map[] = { - {"SMB", BIT_SMB}, - {"GMM", BIT_STS_GMM}, - {"ISH", BIT_STS_ISH}, - {}, -}; - -static struct pmc_bit_map cht_func_dis_2_map[] = { - {"SMB", BIT_SMB}, - {"GMM", BIT_FD_GMM}, - {"ISH", BIT_FD_ISH}, - {}, -}; - -static const struct pmc_bit_map byt_pss_map[] = { - {"GBE", PMC_PSS_BIT_GBE}, - {"SATA", PMC_PSS_BIT_SATA}, - {"HDA", PMC_PSS_BIT_HDA}, - {"SEC", PMC_PSS_BIT_SEC}, - {"PCIE", PMC_PSS_BIT_PCIE}, - {"LPSS", PMC_PSS_BIT_LPSS}, - {"LPE", PMC_PSS_BIT_LPE}, - {"DFX", PMC_PSS_BIT_DFX}, - {"USH_CTRL", PMC_PSS_BIT_USH_CTRL}, - {"USH_SUS", PMC_PSS_BIT_USH_SUS}, - {"USH_VCCS", PMC_PSS_BIT_USH_VCCS}, - {"USH_VCCA", PMC_PSS_BIT_USH_VCCA}, - {"OTG_CTRL", PMC_PSS_BIT_OTG_CTRL}, - {"OTG_VCCS", PMC_PSS_BIT_OTG_VCCS}, - {"OTG_VCCA_CLK", PMC_PSS_BIT_OTG_VCCA_CLK}, - {"OTG_VCCA", PMC_PSS_BIT_OTG_VCCA}, - {"USB", PMC_PSS_BIT_USB}, - {"USB_SUS", PMC_PSS_BIT_USB_SUS}, - {}, -}; - -static const struct pmc_bit_map cht_pss_map[] = { - {"SATA", PMC_PSS_BIT_SATA}, - {"HDA", PMC_PSS_BIT_HDA}, - {"SEC", PMC_PSS_BIT_SEC}, - {"PCIE", PMC_PSS_BIT_PCIE}, - {"LPSS", PMC_PSS_BIT_LPSS}, - {"LPE", PMC_PSS_BIT_LPE}, - {"UFS", PMC_PSS_BIT_CHT_UFS}, - {"UXD", PMC_PSS_BIT_CHT_UXD}, - {"UXD_FD", PMC_PSS_BIT_CHT_UXD_FD}, - {"UX_ENG", PMC_PSS_BIT_CHT_UX_ENG}, - {"USB_SUS", PMC_PSS_BIT_CHT_USB_SUS}, - {"GMM", PMC_PSS_BIT_CHT_GMM}, - {"ISH", PMC_PSS_BIT_CHT_ISH}, - {"DFX_MASTER", PMC_PSS_BIT_CHT_DFX_MASTER}, - {"DFX_CLUSTER1", PMC_PSS_BIT_CHT_DFX_CLUSTER1}, - {"DFX_CLUSTER2", PMC_PSS_BIT_CHT_DFX_CLUSTER2}, - {"DFX_CLUSTER3", 
PMC_PSS_BIT_CHT_DFX_CLUSTER3}, - {"DFX_CLUSTER4", PMC_PSS_BIT_CHT_DFX_CLUSTER4}, - {"DFX_CLUSTER5", PMC_PSS_BIT_CHT_DFX_CLUSTER5}, - {}, -}; - -static const struct pmc_reg_map byt_reg_map = { - .d3_sts_0 = d3_sts_0_map, - .d3_sts_1 = byt_d3_sts_1_map, - .func_dis = d3_sts_0_map, - .func_dis_2 = byt_d3_sts_1_map, - .pss = byt_pss_map, -}; - -static const struct pmc_reg_map cht_reg_map = { - .d3_sts_0 = d3_sts_0_map, - .d3_sts_1 = cht_d3_sts_1_map, - .func_dis = d3_sts_0_map, - .func_dis_2 = cht_func_dis_2_map, - .pss = cht_pss_map, -}; - -static inline u32 pmc_reg_read(struct pmc_dev *pmc, int reg_offset) -{ - return readl(pmc->regmap + reg_offset); -} - -static inline void pmc_reg_write(struct pmc_dev *pmc, int reg_offset, u32 val) -{ - writel(val, pmc->regmap + reg_offset); -} - -int pmc_atom_read(int offset, u32 *value) -{ - struct pmc_dev *pmc = &pmc_device; - - if (!pmc->init) - return -ENODEV; - - *value = pmc_reg_read(pmc, offset); - return 0; -} -EXPORT_SYMBOL_GPL(pmc_atom_read); - -int pmc_atom_write(int offset, u32 value) -{ - struct pmc_dev *pmc = &pmc_device; - - if (!pmc->init) - return -ENODEV; - - pmc_reg_write(pmc, offset, value); - return 0; -} -EXPORT_SYMBOL_GPL(pmc_atom_write); - -static void pmc_power_off(void) -{ - u16 pm1_cnt_port; - u32 pm1_cnt_value; - - pr_info("Preparing to enter system sleep state S5\n"); - - pm1_cnt_port = acpi_base_addr + PM1_CNT; - - pm1_cnt_value = inl(pm1_cnt_port); - pm1_cnt_value &= SLEEP_TYPE_MASK; - pm1_cnt_value |= SLEEP_TYPE_S5; - pm1_cnt_value |= SLEEP_ENABLE; - - outl(pm1_cnt_value, pm1_cnt_port); -} - -static void pmc_hw_reg_setup(struct pmc_dev *pmc) -{ - /* - * Disable PMC S0IX_WAKE_EN events coming from: - * - LPC clock run - * - GPIO_SUS ored dedicated IRQs - * - GPIO_SCORE ored dedicated IRQs - * - GPIO_SUS shared IRQ - * - GPIO_SCORE shared IRQ - */ - pmc_reg_write(pmc, PMC_S0IX_WAKE_EN, (u32)PMC_WAKE_EN_SETTING); -} - -#ifdef CONFIG_DEBUG_FS -static void pmc_dev_state_print(struct seq_file *s, int reg_index, - u32 sts, const struct pmc_bit_map *sts_map, - u32 fd, const struct pmc_bit_map *fd_map) -{ - int offset = PMC_REG_BIT_WIDTH * reg_index; - int index; - - for (index = 0; sts_map[index].name; index++) { - seq_printf(s, "Dev: %-2d - %-32s\tState: %s [%s]\n", - offset + index, sts_map[index].name, - fd_map[index].bit_mask & fd ? "Disabled" : "Enabled ", - sts_map[index].bit_mask & sts ? 
"D3" : "D0"); - } -} - -static int pmc_dev_state_show(struct seq_file *s, void *unused) -{ - struct pmc_dev *pmc = s->private; - const struct pmc_reg_map *m = pmc->map; - u32 func_dis, func_dis_2; - u32 d3_sts_0, d3_sts_1; - - func_dis = pmc_reg_read(pmc, PMC_FUNC_DIS); - func_dis_2 = pmc_reg_read(pmc, PMC_FUNC_DIS_2); - d3_sts_0 = pmc_reg_read(pmc, PMC_D3_STS_0); - d3_sts_1 = pmc_reg_read(pmc, PMC_D3_STS_1); - - /* Low part */ - pmc_dev_state_print(s, 0, d3_sts_0, m->d3_sts_0, func_dis, m->func_dis); - - /* High part */ - pmc_dev_state_print(s, 1, d3_sts_1, m->d3_sts_1, func_dis_2, m->func_dis_2); - - return 0; -} - -static int pmc_dev_state_open(struct inode *inode, struct file *file) -{ - return single_open(file, pmc_dev_state_show, inode->i_private); -} - -static const struct file_operations pmc_dev_state_ops = { - .open = pmc_dev_state_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int pmc_pss_state_show(struct seq_file *s, void *unused) -{ - struct pmc_dev *pmc = s->private; - const struct pmc_bit_map *map = pmc->map->pss; - u32 pss = pmc_reg_read(pmc, PMC_PSS); - int index; - - for (index = 0; map[index].name; index++) { - seq_printf(s, "Island: %-2d - %-32s\tState: %s\n", - index, map[index].name, - map[index].bit_mask & pss ? "Off" : "On"); - } - return 0; -} - -static int pmc_pss_state_open(struct inode *inode, struct file *file) -{ - return single_open(file, pmc_pss_state_show, inode->i_private); -} - -static const struct file_operations pmc_pss_state_ops = { - .open = pmc_pss_state_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int pmc_sleep_tmr_show(struct seq_file *s, void *unused) -{ - struct pmc_dev *pmc = s->private; - u64 s0ir_tmr, s0i1_tmr, s0i2_tmr, s0i3_tmr, s0_tmr; - - s0ir_tmr = (u64)pmc_reg_read(pmc, PMC_S0IR_TMR) << PMC_TMR_SHIFT; - s0i1_tmr = (u64)pmc_reg_read(pmc, PMC_S0I1_TMR) << PMC_TMR_SHIFT; - s0i2_tmr = (u64)pmc_reg_read(pmc, PMC_S0I2_TMR) << PMC_TMR_SHIFT; - s0i3_tmr = (u64)pmc_reg_read(pmc, PMC_S0I3_TMR) << PMC_TMR_SHIFT; - s0_tmr = (u64)pmc_reg_read(pmc, PMC_S0_TMR) << PMC_TMR_SHIFT; - - seq_printf(s, "S0IR Residency:\t%lldus\n", s0ir_tmr); - seq_printf(s, "S0I1 Residency:\t%lldus\n", s0i1_tmr); - seq_printf(s, "S0I2 Residency:\t%lldus\n", s0i2_tmr); - seq_printf(s, "S0I3 Residency:\t%lldus\n", s0i3_tmr); - seq_printf(s, "S0 Residency:\t%lldus\n", s0_tmr); - return 0; -} - -static int pmc_sleep_tmr_open(struct inode *inode, struct file *file) -{ - return single_open(file, pmc_sleep_tmr_show, inode->i_private); -} - -static const struct file_operations pmc_sleep_tmr_ops = { - .open = pmc_sleep_tmr_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static void pmc_dbgfs_unregister(struct pmc_dev *pmc) -{ - debugfs_remove_recursive(pmc->dbgfs_dir); -} - -static int pmc_dbgfs_register(struct pmc_dev *pmc) -{ - struct dentry *dir, *f; - - dir = debugfs_create_dir("pmc_atom", NULL); - if (!dir) - return -ENOMEM; - - pmc->dbgfs_dir = dir; - - f = debugfs_create_file("dev_state", S_IFREG | S_IRUGO, - dir, pmc, &pmc_dev_state_ops); - if (!f) - goto err; - - f = debugfs_create_file("pss_state", S_IFREG | S_IRUGO, - dir, pmc, &pmc_pss_state_ops); - if (!f) - goto err; - - f = debugfs_create_file("sleep_state", S_IFREG | S_IRUGO, - dir, pmc, &pmc_sleep_tmr_ops); - if (!f) - goto err; - - return 0; -err: - pmc_dbgfs_unregister(pmc); - return -ENODEV; -} -#else -static int pmc_dbgfs_register(struct pmc_dev *pmc) -{ - return 0; -} -#endif /* 
CONFIG_DEBUG_FS */ - -static int pmc_setup_dev(struct pci_dev *pdev, const struct pci_device_id *ent) -{ - struct pmc_dev *pmc = &pmc_device; - const struct pmc_reg_map *map = (struct pmc_reg_map *)ent->driver_data; - int ret; - - /* Obtain ACPI base address */ - pci_read_config_dword(pdev, ACPI_BASE_ADDR_OFFSET, &acpi_base_addr); - acpi_base_addr &= ACPI_BASE_ADDR_MASK; - - /* Install power off function */ - if (acpi_base_addr != 0 && pm_power_off == NULL) - pm_power_off = pmc_power_off; - - pci_read_config_dword(pdev, PMC_BASE_ADDR_OFFSET, &pmc->base_addr); - pmc->base_addr &= PMC_BASE_ADDR_MASK; - - pmc->regmap = ioremap_nocache(pmc->base_addr, PMC_MMIO_REG_LEN); - if (!pmc->regmap) { - dev_err(&pdev->dev, "error: ioremap failed\n"); - return -ENOMEM; - } - - pmc->map = map; - - /* PMC hardware registers setup */ - pmc_hw_reg_setup(pmc); - - ret = pmc_dbgfs_register(pmc); - if (ret) - dev_warn(&pdev->dev, "debugfs register failed\n"); - - pmc->init = true; - return ret; -} - -/* - * Data for PCI driver interface - * - * used by pci_match_id() call below. - */ -static const struct pci_device_id pmc_pci_ids[] = { - { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_VLV_PMC), (kernel_ulong_t)&byt_reg_map }, - { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_CHT_PMC), (kernel_ulong_t)&cht_reg_map }, - { 0, }, -}; - -static int __init pmc_atom_init(void) -{ - struct pci_dev *pdev = NULL; - const struct pci_device_id *ent; - - /* We look for our device - PCU PMC - * we assume that there is max. one device. - * - * We can't use plain pci_driver mechanism, - * as the device is really a multiple function device, - * main driver that binds to the pci_device is lpc_ich - * and have to find & bind to the device this way. - */ - for_each_pci_dev(pdev) { - ent = pci_match_id(pmc_pci_ids, pdev); - if (ent) - return pmc_setup_dev(pdev, ent); - } - /* Device not found. */ - return -ENODEV; -} - -device_initcall(pmc_atom_init); - -/* -MODULE_AUTHOR("Aubrey Li "); -MODULE_DESCRIPTION("Intel Atom SOC Power Management Controller Interface"); -MODULE_LICENSE("GPL v2"); -*/ -- cgit From 2dc8ffad8c53e65f85d1a9ece2721463d729054a Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Mon, 12 Dec 2016 15:28:05 -0800 Subject: ACPI / idle: small formatting fixes A quick cleanup with scripts/checkpatch.pl -f . Signed-off-by: Nick Desaulniers Acked-by: Pavel Machek Signed-off-by: Rafael J. 
Wysocki --- arch/x86/kernel/acpi/cstate.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index af15f4444330..8233a630280f 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -12,7 +12,6 @@ #include #include -#include #include #include @@ -89,7 +88,8 @@ static long acpi_processor_ffh_cstate_probe_cpu(void *_cx) retval = 0; /* If the HW does not support any sub-states in this C-state */ if (num_cstate_subtype == 0) { - pr_warn(FW_BUG "ACPI MWAIT C-state 0x%x not supported by HW (0x%x)\n", cx->address, edx_part); + pr_warn(FW_BUG "ACPI MWAIT C-state 0x%x not supported by HW (0x%x)\n", + cx->address, edx_part); retval = -1; goto out; } @@ -104,8 +104,8 @@ static long acpi_processor_ffh_cstate_probe_cpu(void *_cx) if (!mwait_supported[cstate_type]) { mwait_supported[cstate_type] = 1; printk(KERN_DEBUG - "Monitor-Mwait will be used to enter C-%d " - "state\n", cx->type); + "Monitor-Mwait will be used to enter C-%d state\n", + cx->type); } snprintf(cx->desc, ACPI_CX_DESC_LEN, "ACPI FFH INTEL MWAIT 0x%x", @@ -166,6 +166,7 @@ EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_enter); static int __init ffh_cstate_init(void) { struct cpuinfo_x86 *c = &boot_cpu_data; + if (c->x86_vendor != X86_VENDOR_INTEL) return -1; -- cgit From ab22a4733fe919d22bc2957680506ed17e40941e Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 21 Dec 2016 20:29:28 -0800 Subject: kvm: x86: mmu: Rename EPT_VIOLATION_READ/WRITE/INSTR constants Rename the EPT_VIOLATION_READ/WRITE/INSTR constants to EPT_VIOLATION_ACC_READ/WRITE/INSTR to more clearly indicate that these signify the type of the memory access as opposed to the permissions granted by the PTE. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/vmx.h | 12 ++++++------ arch/x86/kvm/vmx.c | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index a22a4790f1ac..cc54b7026567 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -510,15 +510,15 @@ struct vmx_msr_entry { /* * Exit Qualifications for EPT Violations */ -#define EPT_VIOLATION_READ_BIT 0 -#define EPT_VIOLATION_WRITE_BIT 1 -#define EPT_VIOLATION_INSTR_BIT 2 +#define EPT_VIOLATION_ACC_READ_BIT 0 +#define EPT_VIOLATION_ACC_WRITE_BIT 1 +#define EPT_VIOLATION_ACC_INSTR_BIT 2 #define EPT_VIOLATION_READABLE_BIT 3 #define EPT_VIOLATION_WRITABLE_BIT 4 #define EPT_VIOLATION_EXECUTABLE_BIT 5 -#define EPT_VIOLATION_READ (1 << EPT_VIOLATION_READ_BIT) -#define EPT_VIOLATION_WRITE (1 << EPT_VIOLATION_WRITE_BIT) -#define EPT_VIOLATION_INSTR (1 << EPT_VIOLATION_INSTR_BIT) +#define EPT_VIOLATION_ACC_READ (1 << EPT_VIOLATION_ACC_READ_BIT) +#define EPT_VIOLATION_ACC_WRITE (1 << EPT_VIOLATION_ACC_WRITE_BIT) +#define EPT_VIOLATION_ACC_INSTR (1 << EPT_VIOLATION_ACC_INSTR_BIT) #define EPT_VIOLATION_READABLE (1 << EPT_VIOLATION_READABLE_BIT) #define EPT_VIOLATION_WRITABLE (1 << EPT_VIOLATION_WRITABLE_BIT) #define EPT_VIOLATION_EXECUTABLE (1 << EPT_VIOLATION_EXECUTABLE_BIT) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c7bafa1457e2..81c301def1af 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6375,13 +6375,13 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) trace_kvm_page_fault(gpa, exit_qualification); /* Is it a read fault? 
*/ - error_code = (exit_qualification & EPT_VIOLATION_READ) + error_code = (exit_qualification & EPT_VIOLATION_ACC_READ) ? PFERR_USER_MASK : 0; /* Is it a write fault? */ - error_code |= (exit_qualification & EPT_VIOLATION_WRITE) + error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE) ? PFERR_WRITE_MASK : 0; /* Is it a fetch fault? */ - error_code |= (exit_qualification & EPT_VIOLATION_INSTR) + error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR) ? PFERR_FETCH_MASK : 0; /* ept page table entry is present? */ error_code |= (exit_qualification & -- cgit From 312b616b30d87581b88d3db54c14ed89610cc97b Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 21 Dec 2016 20:29:29 -0800 Subject: kvm: x86: mmu: Set SPTE_SPECIAL_MASK within mmu.c Instead of the caller including the SPTE_SPECIAL_MASK in the masks being supplied to kvm_mmu_set_mmio_spte_mask() and kvm_mmu_set_mask_ptes(), those functions now themselves include the SPTE_SPECIAL_MASK. Note that bit 63 is now reset in the default MMIO mask. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 5 ++++- arch/x86/kvm/vmx.c | 6 ++---- arch/x86/kvm/x86.c | 3 --- 3 files changed, 6 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 64821ca3a7c3..e3312e22e8db 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -208,7 +208,7 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu); void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) { - shadow_mmio_mask = mmio_mask; + shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK; } EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); @@ -318,6 +318,9 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, u64 acc_track_mask) { + if (acc_track_mask != 0) + acc_track_mask |= SPTE_SPECIAL_MASK; + shadow_user_mask = user_mask; shadow_accessed_mask = accessed_mask; shadow_dirty_mask = dirty_mask; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 81c301def1af..d850d5d36182 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5236,10 +5236,8 @@ static void ept_set_mmio_spte_mask(void) /* * EPT Misconfigurations can be generated if the value of bits 2:0 * of an EPT paging-structure entry is 110b (write/execute). - * Also, special bit (62) is set to quickly identify mmio spte. */ - kvm_mmu_set_mmio_spte_mask(SPTE_SPECIAL_MASK | - VMX_EPT_MISCONFIG_WX_VALUE); + kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE); } #define VMX_XSS_EXIT_BITMAP 0 @@ -6585,7 +6583,7 @@ void vmx_enable_tdp(void) enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull, 0ull, VMX_EPT_EXECUTABLE_MASK, cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK, - enable_ept_ad_bits ? 0ull : SPTE_SPECIAL_MASK | VMX_EPT_RWX_MASK); + enable_ept_ad_bits ? 0ull : VMX_EPT_RWX_MASK); ept_set_mmio_spte_mask(); kvm_enable_tdp(); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6e2c71ea0627..4fd4d4f35caf 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5952,9 +5952,6 @@ static void kvm_set_mmio_spte_mask(void) /* Mask the reserved physical address bits. */ mask = rsvd_bits(maxphyaddr, 51); - /* Bit 62 is always reserved for 32bit host. */ - mask |= 0x3ull << 62; - /* Set the present bit. */ mask |= 1ull; -- cgit From 20d65236d01cdbe14a88f0e2c0f985669f8c41fc Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 21 Dec 2016 20:29:31 -0800 Subject: kvm: x86: mmu: Update comment in mark_spte_for_access_track Reword the comment to hopefully make it more clear. 
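The access-tracking scheme this comment refers to works by stashing the PTE's original permission bits in spare high SPTE bits and clearing them so the next access faults; the fast page fault path can later put them back (see restore_acc_track_spte() in a later hunk). A rough standalone sketch of that idea, with bit positions invented purely for illustration:

#include <stdint.h>

/*
 * Illustrative only: the save/restore idea behind access-tracking SPTEs.
 * Bit positions here are made up for the sketch; the real code keeps the
 * saved permission bits elsewhere.
 */
#define RWX_MASK	0x7ull			/* pretend R/W/X live in bits 2:0  */
#define SAVED_SHIFT	52			/* pretend save area is bits 54:52 */
#define ACC_TRACK_FLAG	(1ull << 62)		/* pretend "access tracked" marker */

static uint64_t mark_for_access_track(uint64_t spte)
{
	uint64_t saved = spte & RWX_MASK;

	spte &= ~RWX_MASK;			/* next access will fault      */
	spte |= saved << SAVED_SHIFT;		/* remember the original perms */
	return spte | ACC_TRACK_FLAG;
}

static uint64_t restore_access_track(uint64_t spte)
{
	uint64_t saved = (spte >> SAVED_SHIFT) & RWX_MASK;

	spte &= ~(ACC_TRACK_FLAG | (RWX_MASK << SAVED_SHIFT));
	return spte | saved;			/* original perms back in place */
}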
Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e3312e22e8db..e13041ac7cdf 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -708,9 +708,9 @@ static u64 mark_spte_for_access_track(u64 spte) return spte; /* - * Verify that the write-protection that we do below will be fixable - * via the fast page fault path. Currently, that is always the case, at - * least when using EPT (which is when access tracking would be used). + * Making an Access Tracking PTE will result in removal of write access + * from the PTE. So, verify that we will be able to restore the write + * access in the fast page fault path later on. */ WARN_ONCE((spte & PT_WRITABLE_MASK) && !spte_can_locklessly_be_made_writable(spte), -- cgit From d162f30a7cebe9731fd331419b3a14089d0b41e3 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 21 Dec 2016 20:29:30 -0800 Subject: kvm: x86: mmu: Move pgtbl walk inside retry loop in fast_page_fault Redo the page table walk in fast_page_fault when retrying so that we are working on the latest PTE even if the hierarchy changes. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e13041ac7cdf..437d16274701 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3088,14 +3088,16 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, return false; walk_shadow_page_lockless_begin(vcpu); - for_each_shadow_entry_lockless(vcpu, gva, iterator, spte) - if (!is_shadow_present_pte(spte) || iterator.level < level) - break; do { bool remove_write_prot = false; bool remove_acc_track; + for_each_shadow_entry_lockless(vcpu, gva, iterator, spte) + if (!is_shadow_present_pte(spte) || + iterator.level < level) + break; + sp = page_header(__pa(iterator.sptep)); if (!is_last_spte(spte, sp->role.level)) break; @@ -3176,8 +3178,6 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, break; } - spte = mmu_spte_get_lockless(iterator.sptep); - } while (true); trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep, -- cgit From d3e328f2cb01f6f09259a5810baae3edf5416076 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 21 Dec 2016 20:29:32 -0800 Subject: kvm: x86: mmu: Verify that restored PTE has needed perms in fast page fault Before fast page fault restores an access track PTE back to a regular PTE, it now also verifies that the restored PTE would grant the necessary permissions for the faulting access to succeed. If not, it falls back to the slow page fault path. 
Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 127 ++++++++++++++++++++++++++++------------------------- 1 file changed, 68 insertions(+), 59 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 437d16274701..2fd7586aad4d 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -373,6 +373,11 @@ static int is_last_spte(u64 pte, int level) return 0; } +static bool is_executable_pte(u64 spte) +{ + return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask; +} + static kvm_pfn_t spte_to_pfn(u64 pte) { return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; @@ -728,6 +733,23 @@ static u64 mark_spte_for_access_track(u64 spte) return spte; } +/* Restore an acc-track PTE back to a regular PTE */ +static u64 restore_acc_track_spte(u64 spte) +{ + u64 new_spte = spte; + u64 saved_bits = (spte >> shadow_acc_track_saved_bits_shift) + & shadow_acc_track_saved_bits_mask; + + WARN_ON_ONCE(!is_access_track_spte(spte)); + + new_spte &= ~shadow_acc_track_mask; + new_spte &= ~(shadow_acc_track_saved_bits_mask << + shadow_acc_track_saved_bits_shift); + new_spte |= saved_bits; + + return new_spte; +} + /* Returns the Accessed status of the PTE and resets it at the same time. */ static bool mmu_spte_age(u64 *sptep) { @@ -3019,27 +3041,12 @@ static bool page_fault_can_be_fast(u32 error_code) */ static bool fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - u64 *sptep, u64 old_spte, - bool remove_write_prot, bool remove_acc_track) + u64 *sptep, u64 old_spte, u64 new_spte) { gfn_t gfn; - u64 new_spte = old_spte; WARN_ON(!sp->role.direct); - if (remove_acc_track) { - u64 saved_bits = (old_spte >> shadow_acc_track_saved_bits_shift) - & shadow_acc_track_saved_bits_mask; - - new_spte &= ~shadow_acc_track_mask; - new_spte &= ~(shadow_acc_track_saved_bits_mask << - shadow_acc_track_saved_bits_shift); - new_spte |= saved_bits; - } - - if (remove_write_prot) - new_spte |= PT_WRITABLE_MASK; - /* * Theoretically we could also set dirty bit (and flush TLB) here in * order to eliminate unnecessary PML logging. See comments in @@ -3055,7 +3062,7 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, if (cmpxchg64(sptep, old_spte, new_spte) != old_spte) return false; - if (remove_write_prot) { + if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) { /* * The gfn of direct spte is stable since it is * calculated by sp->gfn. @@ -3067,6 +3074,18 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, return true; } +static bool is_access_allowed(u32 fault_err_code, u64 spte) +{ + if (fault_err_code & PFERR_FETCH_MASK) + return is_executable_pte(spte); + + if (fault_err_code & PFERR_WRITE_MASK) + return is_writable_pte(spte); + + /* Fault was on Read access */ + return spte & PT_PRESENT_MASK; +} + /* * Return value: * - true: let the vcpu to access on the same address again. @@ -3090,8 +3109,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, walk_shadow_page_lockless_begin(vcpu); do { - bool remove_write_prot = false; - bool remove_acc_track; + u64 new_spte; for_each_shadow_entry_lockless(vcpu, gva, iterator, spte) if (!is_shadow_present_pte(spte) || @@ -3112,52 +3130,44 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, * Need not check the access of upper level table entries since * they are always ACC_ALL. 
*/ + if (is_access_allowed(error_code, spte)) { + fault_handled = true; + break; + } - if (error_code & PFERR_FETCH_MASK) { - if ((spte & (shadow_x_mask | shadow_nx_mask)) - == shadow_x_mask) { - fault_handled = true; - break; - } - } else if (error_code & PFERR_WRITE_MASK) { - if (is_writable_pte(spte)) { - fault_handled = true; - break; - } + new_spte = spte; + + if (is_access_track_spte(spte)) + new_spte = restore_acc_track_spte(new_spte); + + /* + * Currently, to simplify the code, write-protection can + * be removed in the fast path only if the SPTE was + * write-protected for dirty-logging or access tracking. + */ + if ((error_code & PFERR_WRITE_MASK) && + spte_can_locklessly_be_made_writable(spte)) + { + new_spte |= PT_WRITABLE_MASK; /* - * Currently, to simplify the code, write-protection can - * be removed in the fast path only if the SPTE was - * write-protected for dirty-logging. + * Do not fix write-permission on the large spte. Since + * we only dirty the first page into the dirty-bitmap in + * fast_pf_fix_direct_spte(), other pages are missed + * if its slot has dirty logging enabled. + * + * Instead, we let the slow page fault path create a + * normal spte to fix the access. + * + * See the comments in kvm_arch_commit_memory_region(). */ - remove_write_prot = - spte_can_locklessly_be_made_writable(spte); - } else { - /* Fault was on Read access */ - if (spte & PT_PRESENT_MASK) { - fault_handled = true; + if (sp->role.level > PT_PAGE_TABLE_LEVEL) break; - } } - remove_acc_track = is_access_track_spte(spte); - /* Verify that the fault can be handled in the fast path */ - if (!remove_acc_track && !remove_write_prot) - break; - - /* - * Do not fix write-permission on the large spte since we only - * dirty the first page into the dirty-bitmap in - * fast_pf_fix_direct_spte() that means other pages are missed - * if its slot is dirty-logged. - * - * Instead, we let the slow page fault path create a normal spte - * to fix the access. - * - * See the comments in kvm_arch_commit_memory_region(). - */ - if (sp->role.level > PT_PAGE_TABLE_LEVEL && remove_write_prot) + if (new_spte == spte || + !is_access_allowed(error_code, new_spte)) break; /* @@ -3167,8 +3177,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, */ fault_handled = fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte, - remove_write_prot, - remove_acc_track); + new_spte); if (fault_handled) break; -- cgit From bf29bddf0417a4783da3b24e8c9e017ac649326f Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Fri, 27 Jan 2017 22:25:52 +0000 Subject: x86/efi: Always map the first physical page into the EFI pagetables Commit: 129766708 ("x86/efi: Only map RAM into EFI page tables if in mixed-mode") stopped creating 1:1 mappings for all RAM, when running in native 64-bit mode. It turns out though that there are 64-bit EFI implementations in the wild (this particular problem has been reported on a Lenovo Yoga 710-11IKB), which still make use of the first physical page for their own private use, even though they explicitly mark it EFI_CONVENTIONAL_MEMORY in the memory map. In case there is no mapping for this particular frame in the EFI pagetables, as soon as firmware tries to make use of it, a triple fault occurs and the system reboots (in case of the Yoga 710-11IKB this is very early during bootup). Fix that by always mapping the first page of physical memory into the EFI pagetables. 
We're free to hand this page to the BIOS, as trim_bios_range() will reserve the first page and isolate it away from memory allocators anyway. Note that just reverting 129766708 alone is not enough on v4.9-rc1+ to fix the regression on affected hardware, as this commit: ab72a27da ("x86/efi: Consolidate region mapping logic") later made the first physical frame not to be mapped anyway. Reported-by: Hanka Pavlikova Signed-off-by: Jiri Kosina Signed-off-by: Matt Fleming Cc: Ard Biesheuvel Cc: Borislav Petkov Cc: Borislav Petkov Cc: Laura Abbott Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vojtech Pavlik Cc: Waiman Long Cc: linux-efi@vger.kernel.org Cc: stable@kernel.org # v4.8+ Fixes: 129766708 ("x86/efi: Only map RAM into EFI page tables if in mixed-mode") Link: http://lkml.kernel.org/r/20170127222552.22336-1-matt@codeblueprint.co.uk [ Tidied up the changelog and the comment. ] Signed-off-by: Ingo Molnar --- arch/x86/platform/efi/efi_64.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 319148bd4b05..2f25a363068c 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -268,6 +268,22 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) efi_scratch.use_pgd = true; + /* + * Certain firmware versions are way too sentimential and still believe + * they are exclusive and unquestionable owners of the first physical page, + * even though they explicitly mark it as EFI_CONVENTIONAL_MEMORY + * (but then write-access it later during SetVirtualAddressMap()). + * + * Create a 1:1 mapping for this page, to avoid triple faults during early + * boot with such firmware. We are free to hand this page to the BIOS, + * as trim_bios_range() will reserve the first page and isolate it away + * from memory allocators anyway. + */ + if (kernel_map_pages_in_pgd(pgd, 0x0, 0x0, 1, _PAGE_RW)) { + pr_err("Failed to create 1:1 mapping for the first page!\n"); + return 1; + } + /* * When making calls to the firmware everything needs to be 1:1 * mapped and addressable with 32-bit pointers. Map the kernel -- cgit From f58576666ccdcfb9cf7cae8669dffe1eed844f88 Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Fri, 27 Jan 2017 16:17:52 -0700 Subject: x86/mm: Improve documentation for low-level device I/O functions Add kerneldoc comments for memcpy_{to,from}io() and memset_io(). The existing documentation for ioremap() was distant from the definition, causing kernel-doc to miss it; move it appropriately. Signed-off-by: Jonathan Corbet Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170127161752.0b95e95b@lwn.net Signed-off-by: Ingo Molnar --- arch/x86/include/asm/io.h | 46 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index d34bd370074b..7afb0e2f07f4 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -164,6 +164,17 @@ static inline unsigned int isa_virt_to_bus(volatile void *address) #define virt_to_bus virt_to_phys #define bus_to_virt phys_to_virt +/* + * The default ioremap() behavior is non-cached; if you need something + * else, you probably want one of the following. 
+ */ +extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size); +extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size); +#define ioremap_uc ioremap_uc + +extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size); +extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, unsigned long prot_val); + /** * ioremap - map bus memory into CPU space * @offset: bus address of the memory @@ -178,17 +189,6 @@ static inline unsigned int isa_virt_to_bus(volatile void *address) * If the area you are trying to map is a PCI BAR you should have a * look at pci_iomap(). */ -extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size); -extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size); -#define ioremap_uc ioremap_uc - -extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size); -extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, - unsigned long prot_val); - -/* - * The default ioremap() behavior is non-cached: - */ static inline void __iomem *ioremap(resource_size_t offset, unsigned long size) { return ioremap_nocache(offset, size); @@ -207,18 +207,42 @@ extern void set_iounmap_nonlazy(void); */ #define xlate_dev_kmem_ptr(p) p +/** + * memset_io Set a range of I/O memory to a constant value + * @addr: The beginning of the I/O-memory range to set + * @val: The value to set the memory to + * @count: The number of bytes to set + * + * Set a range of I/O memory to a given value. + */ static inline void memset_io(volatile void __iomem *addr, unsigned char val, size_t count) { memset((void __force *)addr, val, count); } +/** + * memcpy_fromio Copy a block of data from I/O memory + * @dst: The (RAM) destination for the copy + * @src: The (I/O memory) source for the data + * @count: The number of bytes to copy + * + * Copy a block of data from I/O memory. + */ static inline void memcpy_fromio(void *dst, const volatile void __iomem *src, size_t count) { memcpy(dst, (const void __force *)src, count); } +/** + * memcpy_toio Copy a block of data into I/O memory + * @dst: The (I/O memory) destination for the copy + * @src: The (RAM) source for the data + * @count: The number of bytes to copy + * + * Copy a block of data to I/O memory. + */ static inline void memcpy_toio(volatile void __iomem *dst, const void *src, size_t count) { -- cgit From cc272163ea554a97dac180fa8dd6cd54c2810bd1 Mon Sep 17 00:00:00 2001 From: Mohit Gambhir Date: Thu, 26 Jan 2017 13:12:27 -0500 Subject: x86/xen: Fix APIC id mismatch warning on Intel This patch fixes the following warning message seen when booting the kernel as Dom0 with Xen on Intel machines. [0.003000] [Firmware Bug]: CPU1: APIC id mismatch. Firmware: 0 APIC: 1] The code generating the warning in validate_apic_and_package_id() matches cpu_data(cpu).apicid (initialized in init_intel()-> detect_extended_topology() using cpuid) against the apicid returned from xen_apic_read(). Now, xen_apic_read() makes a hypercall to retrieve apicid for the boot cpu but returns 0 otherwise. Hence the warning gets thrown for all but the boot cpu. The idea behind xen_apic_read() returning 0 for apicid is that the guests (even Dom0) should not need to know what physical processor their vcpus are running on. This is because we currently do not have topology information in Xen and also because xen allows more vcpus than physical processors. 
However, boot cpu's apicid is required for loading xen-acpi-processor driver on AMD machines. Look at following patch for details: commit 558daa289a40 ("xen/apic: Return the APIC ID (and version) for CPU 0.") So to get rid of the warning, this patch modifies xen_cpu_present_to_apicid() to return cpu_data(cpu).apicid instead of calling xen_apic_read(). The warning is not seen on AMD machines because init_amd() populates cpu_data(cpu).apicid by calling hard_smp_processor_id()->xen_apic_read() as opposed to using apicid from cpuid as is done on Intel machines. Signed-off-by: Mohit Gambhir Reviewed-by: Juergen Gross Signed-off-by: Boris Ostrovsky --- arch/x86/xen/apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index 44c88ad1841a..bcea81f36fc5 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -145,7 +145,7 @@ static void xen_silent_inquire(int apicid) static int xen_cpu_present_to_apicid(int cpu) { if (cpu_present(cpu)) - return xen_get_apic_id(xen_apic_read(APIC_ID)); + return cpu_data(cpu).apicid; else return BAD_APICID; } -- cgit From 24c2503255d35c269b67162c397a1a1c1e02f6ce Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 25 Jan 2017 21:00:48 +0100 Subject: x86/microcode: Do not access the initrd after it has been freed When we look for microcode blobs, we first try builtin and if that doesn't succeed, we fallback to the initrd supplied to the kernel. However, at some point doing boot, that initrd gets jettisoned and we shouldn't access it anymore. But we do, as the below KASAN report shows. That's because find_microcode_in_initrd() doesn't check whether the initrd is still valid or not. So do that. ================================================================== BUG: KASAN: use-after-free in find_cpio_data Read of size 1 by task swapper/1/0 page:ffffea0000db9d40 count:0 mapcount:0 mapping: (null) index:0x1 flags: 0x100000000000000() raw: 0100000000000000 0000000000000000 0000000000000001 00000000ffffffff raw: dead000000000100 dead000000000200 0000000000000000 0000000000000000 page dumped because: kasan: bad access detected CPU: 1 PID: 0 Comm: swapper/1 Tainted: G W 4.10.0-rc5-debug-00075-g2dbde22 #3 Hardware name: Dell Inc. XPS 13 9360/0839Y6, BIOS 1.2.3 12/01/2016 Call Trace: dump_stack ? _atomic_dec_and_lock ? __dump_page kasan_report_error ? pointer ? find_cpio_data __asan_report_load1_noabort ? find_cpio_data find_cpio_data ? vsprintf ? dump_stack ? get_ucode_user ? print_usage_bug find_microcode_in_initrd __load_ucode_intel ? collect_cpu_info_early ? debug_check_no_locks_freed load_ucode_intel_ap ? collect_cpu_info ? trace_hardirqs_on ? flat_send_IPI_mask_allbutself load_ucode_ap ? get_builtin_firmware ? flush_tlb_func ? do_raw_spin_trylock ? cpumask_weight cpu_init ? trace_hardirqs_off ? play_dead_common ? native_play_dead ? hlt_play_dead ? syscall_init ? arch_cpu_idle_dead ? 
do_idle start_secondary start_cpu Memory state around the buggy address: ffff880036e74f00: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ffff880036e74f80: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff >ffff880036e75000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ^ ffff880036e75080: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ffff880036e75100: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ================================================================== Reported-by: Andrey Ryabinin Tested-by: Andrey Ryabinin Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170126165833.evjemhbqzaepirxo@pd.tnic Signed-off-by: Ingo Molnar --- arch/x86/include/asm/microcode.h | 1 + arch/x86/kernel/cpu/microcode/amd.c | 5 +++-- arch/x86/kernel/cpu/microcode/core.c | 22 +++++++++++++++++----- 3 files changed, 21 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index 38711df3bcb5..2266f864b747 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -140,6 +140,7 @@ extern void __init load_ucode_bsp(void); extern void load_ucode_ap(void); void reload_early_microcode(void); extern bool get_builtin_firmware(struct cpio_data *cd, const char *name); +extern bool initrd_gone; #else static inline int __init microcode_init(void) { return 0; }; static inline void __init load_ucode_bsp(void) { } diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 6a31e2691f3a..079e81733a58 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -384,8 +384,9 @@ void load_ucode_amd_ap(unsigned int family) reget: if (!get_builtin_microcode(&cp, family)) { #ifdef CONFIG_BLK_DEV_INITRD - cp = find_cpio_data(ucode_path, (void *)initrd_start, - initrd_end - initrd_start, NULL); + if (!initrd_gone) + cp = find_cpio_data(ucode_path, (void *)initrd_start, + initrd_end - initrd_start, NULL); #endif if (!(cp.data && cp.size)) { /* diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 2af69d27da62..73102d932760 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -46,6 +46,8 @@ static struct microcode_ops *microcode_ops; static bool dis_ucode_ldr = true; +bool initrd_gone; + LIST_HEAD(microcode_cache); /* @@ -190,21 +192,24 @@ void load_ucode_ap(void) static int __init save_microcode_in_initrd(void) { struct cpuinfo_x86 *c = &boot_cpu_data; + int ret = -EINVAL; switch (c->x86_vendor) { case X86_VENDOR_INTEL: if (c->x86 >= 6) - return save_microcode_in_initrd_intel(); + ret = save_microcode_in_initrd_intel(); break; case X86_VENDOR_AMD: if (c->x86 >= 0x10) - return save_microcode_in_initrd_amd(c->x86); + ret = save_microcode_in_initrd_amd(c->x86); break; default: break; } - return -EINVAL; + initrd_gone = true; + + return ret; } struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa) @@ -247,9 +252,16 @@ struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa) * has the virtual address of the beginning of the initrd. It also * possibly relocates the ramdisk. In either case, initrd_start contains * the updated address so use that instead. + * + * initrd_gone is for the hotplug case where we've thrown out initrd + * already. 
*/ - if (!use_pa && initrd_start) - start = initrd_start; + if (!use_pa) { + if (initrd_gone) + return (struct cpio_data){ NULL, 0, "" }; + if (initrd_start) + start = initrd_start; + } return find_cpio_data(path, (void *)start, size, NULL); #else /* !CONFIG_BLK_DEV_INITRD */ -- cgit From a83f4c00dd6a646ac3c7604ee255d732fc5e0e0b Mon Sep 17 00:00:00 2001 From: Janakarajan Natarajan Date: Mon, 16 Jan 2017 17:36:21 -0600 Subject: perf/x86/amd/uncore: Rename 'L2' to 'LLC' This patch renames L2 counters to LLC counters. In AMD Family17h processors, L3 cache counter is supported. Since older families have at most L2 counters, last level cache (LLC) indicates L2/L3 based on the family. Signed-off-by: Janakarajan Natarajan Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Suravee Suthikulpanit Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/5d8cd8736d8d578354597a548e64ff16210c319b.1484598705.git.Janakarajan.Natarajan@amd.com Signed-off-by: Ingo Molnar --- arch/x86/events/amd/uncore.c | 98 ++++++++++++++++++++++---------------------- 1 file changed, 49 insertions(+), 49 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c index a0b1bdb3ad42..a53bfbe02c7c 100644 --- a/arch/x86/events/amd/uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -25,7 +25,7 @@ #define MAX_COUNTERS NUM_COUNTERS_NB #define RDPMC_BASE_NB 6 -#define RDPMC_BASE_L2 10 +#define RDPMC_BASE_LLC 10 #define COUNTER_SHIFT 16 @@ -45,30 +45,30 @@ struct amd_uncore { }; static struct amd_uncore * __percpu *amd_uncore_nb; -static struct amd_uncore * __percpu *amd_uncore_l2; +static struct amd_uncore * __percpu *amd_uncore_llc; static struct pmu amd_nb_pmu; -static struct pmu amd_l2_pmu; +static struct pmu amd_llc_pmu; static cpumask_t amd_nb_active_mask; -static cpumask_t amd_l2_active_mask; +static cpumask_t amd_llc_active_mask; static bool is_nb_event(struct perf_event *event) { return event->pmu->type == amd_nb_pmu.type; } -static bool is_l2_event(struct perf_event *event) +static bool is_llc_event(struct perf_event *event) { - return event->pmu->type == amd_l2_pmu.type; + return event->pmu->type == amd_llc_pmu.type; } static struct amd_uncore *event_to_amd_uncore(struct perf_event *event) { if (is_nb_event(event) && amd_uncore_nb) return *per_cpu_ptr(amd_uncore_nb, event->cpu); - else if (is_l2_event(event) && amd_uncore_l2) - return *per_cpu_ptr(amd_uncore_l2, event->cpu); + else if (is_llc_event(event) && amd_uncore_llc) + return *per_cpu_ptr(amd_uncore_llc, event->cpu); return NULL; } @@ -183,16 +183,16 @@ static int amd_uncore_event_init(struct perf_event *event) return -ENOENT; /* - * NB and L2 counters (MSRs) are shared across all cores that share the - * same NB / L2 cache. Interrupts can be directed to a single target - * core, however, event counts generated by processes running on other - * cores cannot be masked out. So we do not support sampling and - * per-thread events. + * NB and Last level cache counters (MSRs) are shared across all cores + * that share the same NB / Last level cache. Interrupts can be directed + * to a single target core, however, event counts generated by processes + * running on other cores cannot be masked out. So we do not support + * sampling and per-thread events. 
*/ if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK) return -EINVAL; - /* NB and L2 counters do not have usr/os/guest/host bits */ + /* NB and Last level cache counters do not have usr/os/guest/host bits */ if (event->attr.exclude_user || event->attr.exclude_kernel || event->attr.exclude_host || event->attr.exclude_guest) return -EINVAL; @@ -226,8 +226,8 @@ static ssize_t amd_uncore_attr_show_cpumask(struct device *dev, if (pmu->type == amd_nb_pmu.type) active_mask = &amd_nb_active_mask; - else if (pmu->type == amd_l2_pmu.type) - active_mask = &amd_l2_active_mask; + else if (pmu->type == amd_llc_pmu.type) + active_mask = &amd_llc_active_mask; else return 0; @@ -276,7 +276,7 @@ static struct pmu amd_nb_pmu = { .read = amd_uncore_read, }; -static struct pmu amd_l2_pmu = { +static struct pmu amd_llc_pmu = { .task_ctx_nr = perf_invalid_context, .attr_groups = amd_uncore_attr_groups, .name = "amd_l2", @@ -296,7 +296,7 @@ static struct amd_uncore *amd_uncore_alloc(unsigned int cpu) static int amd_uncore_cpu_up_prepare(unsigned int cpu) { - struct amd_uncore *uncore_nb = NULL, *uncore_l2; + struct amd_uncore *uncore_nb = NULL, *uncore_llc; if (amd_uncore_nb) { uncore_nb = amd_uncore_alloc(cpu); @@ -312,18 +312,18 @@ static int amd_uncore_cpu_up_prepare(unsigned int cpu) *per_cpu_ptr(amd_uncore_nb, cpu) = uncore_nb; } - if (amd_uncore_l2) { - uncore_l2 = amd_uncore_alloc(cpu); - if (!uncore_l2) + if (amd_uncore_llc) { + uncore_llc = amd_uncore_alloc(cpu); + if (!uncore_llc) goto fail; - uncore_l2->cpu = cpu; - uncore_l2->num_counters = NUM_COUNTERS_L2; - uncore_l2->rdpmc_base = RDPMC_BASE_L2; - uncore_l2->msr_base = MSR_F16H_L2I_PERF_CTL; - uncore_l2->active_mask = &amd_l2_active_mask; - uncore_l2->pmu = &amd_l2_pmu; - uncore_l2->id = -1; - *per_cpu_ptr(amd_uncore_l2, cpu) = uncore_l2; + uncore_llc->cpu = cpu; + uncore_llc->num_counters = NUM_COUNTERS_L2; + uncore_llc->rdpmc_base = RDPMC_BASE_LLC; + uncore_llc->msr_base = MSR_F16H_L2I_PERF_CTL; + uncore_llc->active_mask = &amd_llc_active_mask; + uncore_llc->pmu = &amd_llc_pmu; + uncore_llc->id = -1; + *per_cpu_ptr(amd_uncore_llc, cpu) = uncore_llc; } return 0; @@ -376,17 +376,17 @@ static int amd_uncore_cpu_starting(unsigned int cpu) *per_cpu_ptr(amd_uncore_nb, cpu) = uncore; } - if (amd_uncore_l2) { + if (amd_uncore_llc) { unsigned int apicid = cpu_data(cpu).apicid; unsigned int nshared; - uncore = *per_cpu_ptr(amd_uncore_l2, cpu); + uncore = *per_cpu_ptr(amd_uncore_llc, cpu); cpuid_count(0x8000001d, 2, &eax, &ebx, &ecx, &edx); nshared = ((eax >> 14) & 0xfff) + 1; uncore->id = apicid - (apicid % nshared); - uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_l2); - *per_cpu_ptr(amd_uncore_l2, cpu) = uncore; + uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_llc); + *per_cpu_ptr(amd_uncore_llc, cpu) = uncore; } return 0; @@ -419,8 +419,8 @@ static int amd_uncore_cpu_online(unsigned int cpu) if (amd_uncore_nb) uncore_online(cpu, amd_uncore_nb); - if (amd_uncore_l2) - uncore_online(cpu, amd_uncore_l2); + if (amd_uncore_llc) + uncore_online(cpu, amd_uncore_llc); return 0; } @@ -456,8 +456,8 @@ static int amd_uncore_cpu_down_prepare(unsigned int cpu) if (amd_uncore_nb) uncore_down_prepare(cpu, amd_uncore_nb); - if (amd_uncore_l2) - uncore_down_prepare(cpu, amd_uncore_l2); + if (amd_uncore_llc) + uncore_down_prepare(cpu, amd_uncore_llc); return 0; } @@ -479,8 +479,8 @@ static int amd_uncore_cpu_dead(unsigned int cpu) if (amd_uncore_nb) uncore_dead(cpu, amd_uncore_nb); - if (amd_uncore_l2) - uncore_dead(cpu, 
amd_uncore_l2); + if (amd_uncore_llc) + uncore_dead(cpu, amd_uncore_llc); return 0; } @@ -510,16 +510,16 @@ static int __init amd_uncore_init(void) } if (boot_cpu_has(X86_FEATURE_PERFCTR_L2)) { - amd_uncore_l2 = alloc_percpu(struct amd_uncore *); - if (!amd_uncore_l2) { + amd_uncore_llc = alloc_percpu(struct amd_uncore *); + if (!amd_uncore_llc) { ret = -ENOMEM; - goto fail_l2; + goto fail_llc; } - ret = perf_pmu_register(&amd_l2_pmu, amd_l2_pmu.name, -1); + ret = perf_pmu_register(&amd_llc_pmu, amd_llc_pmu.name, -1); if (ret) - goto fail_l2; + goto fail_llc; - pr_info("perf: AMD L2I counters detected\n"); + pr_info("perf: AMD LLC counters detected\n"); ret = 0; } @@ -529,7 +529,7 @@ static int __init amd_uncore_init(void) if (cpuhp_setup_state(CPUHP_PERF_X86_AMD_UNCORE_PREP, "perf/x86/amd/uncore:prepare", amd_uncore_cpu_up_prepare, amd_uncore_cpu_dead)) - goto fail_l2; + goto fail_llc; if (cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING, "perf/x86/amd/uncore:starting", @@ -546,11 +546,11 @@ fail_start: cpuhp_remove_state(CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING); fail_prep: cpuhp_remove_state(CPUHP_PERF_X86_AMD_UNCORE_PREP); -fail_l2: +fail_llc: if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) perf_pmu_unregister(&amd_nb_pmu); - if (amd_uncore_l2) - free_percpu(amd_uncore_l2); + if (amd_uncore_llc) + free_percpu(amd_uncore_llc); fail_nb: if (amd_uncore_nb) free_percpu(amd_uncore_nb); -- cgit From bc1daef6b5da574bca0a2ec7f9b4d0c5fe0c7d11 Mon Sep 17 00:00:00 2001 From: Janakarajan Natarajan Date: Mon, 16 Jan 2017 17:36:22 -0600 Subject: perf/x86/amd/uncore: Update the number of uncore counters This patch updates the AMD uncore driver to support AMD Family17h processors. In Family17h, there are two extra last level cache counters. The maximum available counters is increased and the number of counters for each uncore type is now based on the family. 
Signed-off-by: Janakarajan Natarajan Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Suravee Suthikulpanit Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/799f9c5be8963cc209d9169a08f4a2643b748dc7.1484598705.git.Janakarajan.Natarajan@amd.com Signed-off-by: Ingo Molnar --- arch/x86/events/amd/uncore.c | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c index a53bfbe02c7c..e6a2eb54c4a4 100644 --- a/arch/x86/events/amd/uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -22,13 +22,17 @@ #define NUM_COUNTERS_NB 4 #define NUM_COUNTERS_L2 4 -#define MAX_COUNTERS NUM_COUNTERS_NB +#define NUM_COUNTERS_L3 6 +#define MAX_COUNTERS 6 #define RDPMC_BASE_NB 6 #define RDPMC_BASE_LLC 10 #define COUNTER_SHIFT 16 +static int num_counters_llc; +static int num_counters_nb; + static HLIST_HEAD(uncore_unused_list); struct amd_uncore { @@ -303,7 +307,7 @@ static int amd_uncore_cpu_up_prepare(unsigned int cpu) if (!uncore_nb) goto fail; uncore_nb->cpu = cpu; - uncore_nb->num_counters = NUM_COUNTERS_NB; + uncore_nb->num_counters = num_counters_nb; uncore_nb->rdpmc_base = RDPMC_BASE_NB; uncore_nb->msr_base = MSR_F15H_NB_PERF_CTL; uncore_nb->active_mask = &amd_nb_active_mask; @@ -317,7 +321,7 @@ static int amd_uncore_cpu_up_prepare(unsigned int cpu) if (!uncore_llc) goto fail; uncore_llc->cpu = cpu; - uncore_llc->num_counters = NUM_COUNTERS_L2; + uncore_llc->num_counters = num_counters_llc; uncore_llc->rdpmc_base = RDPMC_BASE_LLC; uncore_llc->msr_base = MSR_F16H_L2I_PERF_CTL; uncore_llc->active_mask = &amd_llc_active_mask; @@ -492,6 +496,27 @@ static int __init amd_uncore_init(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) goto fail_nodev; + switch(boot_cpu_data.x86) { + case 23: + /* Family 17h: */ + num_counters_nb = NUM_COUNTERS_NB; + num_counters_llc = NUM_COUNTERS_L3; + break; + case 22: + /* Family 16h - may change: */ + num_counters_nb = NUM_COUNTERS_NB; + num_counters_llc = NUM_COUNTERS_L2; + break; + default: + /* + * All prior families have the same number of + * NorthBridge and Last Level Cache counters + */ + num_counters_nb = NUM_COUNTERS_NB; + num_counters_llc = NUM_COUNTERS_L2; + break; + } + if (!boot_cpu_has(X86_FEATURE_TOPOEXT)) goto fail_nodev; -- cgit From da6adaea2b7ef658c61a557c28508668eac29fe1 Mon Sep 17 00:00:00 2001 From: Janakarajan Natarajan Date: Mon, 16 Jan 2017 17:36:23 -0600 Subject: perf/x86/amd/uncore: Update sysfs attributes for Family17h processors This patch updates the sysfs attributes for AMD Family17h processors. In Family17h, the event bit position is changed for both the NorthBridge and Last level cache counters. The sysfs attributes are assigned based on the family and the type of the counter. 
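The new format strings describe where the event select and unit mask sit inside perf_event_attr.config. As an illustration of the legacy NB/L2 layout (event select in config bits 0-7 and 32-35, unit mask in bits 8-15), a hypothetical helper that packs a raw config value might look like this; the Family 17h DF format additionally uses bits 59-60:

#include <stdint.h>

/*
 * Illustrative only: pack an event select and unit mask into the bit
 * ranges advertised by the "config:0-7,32-35" / "config:8-15" format
 * strings. Hypothetical helper, not kernel code.
 */
static uint64_t amd_uncore_config(uint16_t event, uint8_t umask)
{
	uint64_t config;

	config  = (uint64_t)(event & 0xff);		/* event[7:0]  -> config[7:0]   */
	config |= (uint64_t)umask << 8;			/* umask[7:0]  -> config[15:8]  */
	config |= (uint64_t)((event >> 8) & 0xf) << 32;	/* event[11:8] -> config[35:32] */

	return config;
}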
Signed-off-by: Janakarajan Natarajan Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Suravee Suthikulpanit Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/617570ed3634e804991f95db62c3cf3856a9d2a7.1484598705.git.Janakarajan.Natarajan@amd.com Signed-off-by: Ingo Molnar --- arch/x86/events/amd/uncore.c | 77 ++++++++++++++++++++++++++++++++------------ 1 file changed, 56 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c index e6a2eb54c4a4..4d1f7f2d9aff 100644 --- a/arch/x86/events/amd/uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -248,30 +248,47 @@ static struct attribute_group amd_uncore_attr_group = { .attrs = amd_uncore_attrs, }; -PMU_FORMAT_ATTR(event, "config:0-7,32-35"); -PMU_FORMAT_ATTR(umask, "config:8-15"); - -static struct attribute *amd_uncore_format_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - NULL, -}; - -static struct attribute_group amd_uncore_format_group = { - .name = "format", - .attrs = amd_uncore_format_attr, +/* + * Similar to PMU_FORMAT_ATTR but allowing for format_attr to be assigned based + * on family + */ +#define AMD_FORMAT_ATTR(_dev, _name, _format) \ +static ssize_t \ +_dev##_show##_name(struct device *dev, \ + struct device_attribute *attr, \ + char *page) \ +{ \ + BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ + return sprintf(page, _format "\n"); \ +} \ +static struct device_attribute format_attr_##_dev##_name = __ATTR_RO(_dev); + +/* Used for each uncore counter type */ +#define AMD_ATTRIBUTE(_name) \ +static struct attribute *amd_uncore_format_attr_##_name[] = { \ + &format_attr_event_##_name.attr, \ + &format_attr_umask.attr, \ + NULL, \ +}; \ +static struct attribute_group amd_uncore_format_group_##_name = { \ + .name = "format", \ + .attrs = amd_uncore_format_attr_##_name, \ +}; \ +static const struct attribute_group *amd_uncore_attr_groups_##_name[] = { \ + &amd_uncore_attr_group, \ + &amd_uncore_format_group_##_name, \ + NULL, \ }; -static const struct attribute_group *amd_uncore_attr_groups[] = { - &amd_uncore_attr_group, - &amd_uncore_format_group, - NULL, -}; +AMD_FORMAT_ATTR(event, , "config:0-7,32-35"); +AMD_FORMAT_ATTR(umask, , "config:8-15"); +AMD_FORMAT_ATTR(event, _df, "config:0-7,32-35,59-60"); +AMD_FORMAT_ATTR(event, _l3, "config:0-7"); +AMD_ATTRIBUTE(df); +AMD_ATTRIBUTE(l3); static struct pmu amd_nb_pmu = { .task_ctx_nr = perf_invalid_context, - .attr_groups = amd_uncore_attr_groups, - .name = "amd_nb", .event_init = amd_uncore_event_init, .add = amd_uncore_add, .del = amd_uncore_del, @@ -282,8 +299,6 @@ static struct pmu amd_nb_pmu = { static struct pmu amd_llc_pmu = { .task_ctx_nr = perf_invalid_context, - .attr_groups = amd_uncore_attr_groups, - .name = "amd_l2", .event_init = amd_uncore_event_init, .add = amd_uncore_add, .del = amd_uncore_del, @@ -501,11 +516,25 @@ static int __init amd_uncore_init(void) /* Family 17h: */ num_counters_nb = NUM_COUNTERS_NB; num_counters_llc = NUM_COUNTERS_L3; + /* + * For Family17h, the NorthBridge counters are + * re-purposed as Data Fabric counters. Also, support is + * added for L3 counters. The pmus are exported based on + * family as either L2 or L3 and NB or DF. 
+ */ + amd_nb_pmu.name = "amd_df"; + amd_llc_pmu.name = "amd_l3"; + format_attr_event_df.show = &event_show_df; + format_attr_event_l3.show = &event_show_l3; break; case 22: /* Family 16h - may change: */ num_counters_nb = NUM_COUNTERS_NB; num_counters_llc = NUM_COUNTERS_L2; + amd_nb_pmu.name = "amd_nb"; + amd_llc_pmu.name = "amd_l2"; + format_attr_event_df = format_attr_event; + format_attr_event_l3 = format_attr_event; break; default: /* @@ -514,8 +543,14 @@ static int __init amd_uncore_init(void) */ num_counters_nb = NUM_COUNTERS_NB; num_counters_llc = NUM_COUNTERS_L2; + amd_nb_pmu.name = "amd_nb"; + amd_llc_pmu.name = "amd_l2"; + format_attr_event_df = format_attr_event; + format_attr_event_l3 = format_attr_event; break; } + amd_nb_pmu.attr_groups = amd_uncore_attr_groups_df; + amd_llc_pmu.attr_groups = amd_uncore_attr_groups_l3; if (!boot_cpu_has(X86_FEATURE_TOPOEXT)) goto fail_nodev; -- cgit From 612f0c0b859ee99f800dc88ad470d938d90ad111 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 26 Jan 2017 09:08:19 +0100 Subject: perf/x86/events: Add an AMD-specific Makefile Move the AMD pieces from the generic Makefile so that $ make arch/x86/events/amd/.s can work too. Otherwise you get: $ make arch/x86/events/amd/ibs.s scripts/Makefile.build:44: arch/x86/events/amd/Makefile: No such file or directory make[1]: *** No rule to make target 'arch/x86/events/amd/Makefile'. Stop. Makefile:1636: recipe for target 'arch/x86/events/amd/ibs.s' failed make: *** [arch/x86/events/amd/ibs.s] Error 2 Signed-off-by: Borislav Petkov Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/20170126080819.417-1-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/events/Makefile | 13 +++---------- arch/x86/events/amd/Makefile | 7 +++++++ 2 files changed, 10 insertions(+), 10 deletions(-) create mode 100644 arch/x86/events/amd/Makefile (limited to 'arch/x86') diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index 1d392c39fe56..b8ccdb5c9244 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -1,11 +1,4 @@ -obj-y += core.o - -obj-$(CONFIG_CPU_SUP_AMD) += amd/core.o amd/uncore.o -obj-$(CONFIG_PERF_EVENTS_AMD_POWER) += amd/power.o -obj-$(CONFIG_X86_LOCAL_APIC) += amd/ibs.o msr.o -ifdef CONFIG_AMD_IOMMU -obj-$(CONFIG_CPU_SUP_AMD) += amd/iommu.o -endif - -obj-$(CONFIG_CPU_SUP_INTEL) += msr.o +obj-y += core.o +obj-y += amd/ +obj-$(CONFIG_X86_LOCAL_APIC) += msr.o obj-$(CONFIG_CPU_SUP_INTEL) += intel/ diff --git a/arch/x86/events/amd/Makefile b/arch/x86/events/amd/Makefile new file mode 100644 index 000000000000..b1da46f396e0 --- /dev/null +++ b/arch/x86/events/amd/Makefile @@ -0,0 +1,7 @@ +obj-$(CONFIG_CPU_SUP_AMD) += core.o uncore.o +obj-$(CONFIG_PERF_EVENTS_AMD_POWER) += power.o +obj-$(CONFIG_X86_LOCAL_APIC) += ibs.o +ifdef CONFIG_AMD_IOMMU +obj-$(CONFIG_CPU_SUP_AMD) += iommu.o +endif + -- cgit From 459fbe00693449fade2d1bc802791b081c94edcf Mon Sep 17 00:00:00 2001 From: John Ogness Date: Mon, 30 Jan 2017 09:41:21 +0100 Subject: x86/mm/cpa: Avoid wbinvd() for PREEMPT Although wbinvd() is faster than flushing many individual pages, it blocks the memory bus for "long" periods of time (>100us), thus directly causing unusually large latencies on all CPUs, regardless of any CPU isolation features that may be active. 
This is an unpriviledged operatation as it is exposed to user space via the graphics subsystem. For 1024 pages, flushing those pages individually can take up to 2200us, but the task remains fully preemptible during that time. Signed-off-by: John Ogness Acked-by: Peter Zijlstra (Intel) Cc: Sebastian Siewior Cc: linux-rt-users Signed-off-by: Thomas Gleixner --- arch/x86/mm/pageattr.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 5a287e523eab..28d42130243c 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -214,7 +214,20 @@ static void cpa_flush_array(unsigned long *start, int numpages, int cache, int in_flags, struct page **pages) { unsigned int i, level; +#ifdef CONFIG_PREEMPT + /* + * Avoid wbinvd() because it causes latencies on all CPUs, + * regardless of any CPU isolation that may be in effect. + * + * This should be extended for CAT enabled systems independent of + * PREEMPT because wbinvd() does not respect the CAT partitions and + * this is exposed to unpriviledged users through the graphics + * subsystem. + */ + unsigned long do_wbinvd = 0; +#else unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */ +#endif BUG_ON(irqs_disabled()); -- cgit From 3ad38ceb2769f08d0abd132331a7a6130536a36c Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 30 Jan 2017 16:37:11 -0800 Subject: x86/mm: Remove CONFIG_DEBUG_NX_TEST CONFIG_DEBUG_NX_TEST has been broken since CONFIG_DEBUG_SET_MODULE_RONX=y was added in v2.6.37 via: 84e1c6bb38eb ("x86: Add RO/NX protection for loadable kernel modules") since the exception table was then made read-only. Additionally, the manually constructed extables were never fixed when relative extables were introduced in v3.5 via: 706276543b69 ("x86, extable: Switch to relative exception table entries") However, relative extables won't work for test_nx.c, since test instruction memory areas may be more than INT_MAX away from an executable fixup (e.g. stack and heap too far away from executable memory with the fixup). Since clearly no one has been using this code for a while now, and similar tests exist in LKDTM, this should just be removed entirely. Signed-off-by: Kees Cook Cc: Andy Lutomirski Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jinbum Park Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170131003711.GA74048@beast Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.debug | 8 --- arch/x86/kernel/Makefile | 1 - arch/x86/kernel/test_nx.c | 173 ---------------------------------------------- 3 files changed, 182 deletions(-) delete mode 100644 arch/x86/kernel/test_nx.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 67eec55093a5..783099f2ac72 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -120,14 +120,6 @@ config DEBUG_SET_MODULE_RONX against certain classes of kernel exploits. If in doubt, say "N". -config DEBUG_NX_TEST - tristate "Testcase for the NX non-executable stack feature" - depends on DEBUG_KERNEL && m - ---help--- - This option enables a testcase for the CPU NX capability - and the software setup of this feature. 
- If in doubt, say "N" - config DOUBLEFAULT default y bool "Enable doublefault exception handler" if EXPERT diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 581386c7e429..bdcdb3b3a219 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -101,7 +101,6 @@ obj-$(CONFIG_APB_TIMER) += apb_timer.o obj-$(CONFIG_AMD_NB) += amd_nb.o obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o -obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o diff --git a/arch/x86/kernel/test_nx.c b/arch/x86/kernel/test_nx.c deleted file mode 100644 index a3b875c9e6af..000000000000 --- a/arch/x86/kernel/test_nx.c +++ /dev/null @@ -1,173 +0,0 @@ -/* - * test_nx.c: functional test for NX functionality - * - * (C) Copyright 2008 Intel Corporation - * Author: Arjan van de Ven - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. - */ -#include -#include -#include - -#include -#include - -extern int rodata_test_data; - -/* - * This file checks 4 things: - * 1) Check if the stack is not executable - * 2) Check if kmalloc memory is not executable - * 3) Check if the .rodata section is not executable - * 4) Check if the .data section of a module is not executable - * - * To do this, the test code tries to execute memory in stack/kmalloc/etc, - * and then checks if the expected trap happens. - * - * Sadly, this implies having a dynamic exception handling table entry. - * ... which can be done (and will make Rusty cry)... but it can only - * be done in a stand-alone module with only 1 entry total. - * (otherwise we'd have to sort and that's just too messy) - */ - - - -/* - * We want to set up an exception handling point on our stack, - * which means a variable value. This function is rather dirty - * and walks the exception table of the module, looking for a magic - * marker and replaces it with a specific function. - */ -static void fudze_exception_table(void *marker, void *new) -{ - struct module *mod = THIS_MODULE; - struct exception_table_entry *extable; - - /* - * Note: This module has only 1 exception table entry, - * so searching and sorting is not needed. If that changes, - * this would be the place to search and re-sort the exception - * table. - */ - if (mod->num_exentries > 1) { - printk(KERN_ERR "test_nx: too many exception table entries!\n"); - printk(KERN_ERR "test_nx: test results are not reliable.\n"); - return; - } - extable = (struct exception_table_entry *)mod->extable; - extable[0].insn = (unsigned long)new; -} - - -/* - * exception tables get their symbols translated so we need - * to use a fake function to put in there, which we can then - * replace at runtime. - */ -void foo_label(void); - -/* - * returns 0 for not-executable, negative for executable - * - * Note: we cannot allow this function to be inlined, because - * that would give us more than 1 exception table entry. - * This in turn would break the assumptions above. 
- */ -static noinline int test_address(void *address) -{ - unsigned long result; - - /* Set up an exception table entry for our address */ - fudze_exception_table(&foo_label, address); - result = 1; - asm volatile( - "foo_label:\n" - "0: call *%[fake_code]\n" - "1:\n" - ".section .fixup,\"ax\"\n" - "2: mov %[zero], %[rslt]\n" - " ret\n" - ".previous\n" - _ASM_EXTABLE(0b,2b) - : [rslt] "=r" (result) - : [fake_code] "r" (address), [zero] "r" (0UL), "0" (result) - ); - /* change the exception table back for the next round */ - fudze_exception_table(address, &foo_label); - - if (result) - return -ENODEV; - return 0; -} - -static unsigned char test_data = 0xC3; /* 0xC3 is the opcode for "ret" */ - -static int test_NX(void) -{ - int ret = 0; - /* 0xC3 is the opcode for "ret" */ - char stackcode[] = {0xC3, 0x90, 0 }; - char *heap; - - test_data = 0xC3; - - printk(KERN_INFO "Testing NX protection\n"); - - /* Test 1: check if the stack is not executable */ - if (test_address(&stackcode)) { - printk(KERN_ERR "test_nx: stack was executable\n"); - ret = -ENODEV; - } - - - /* Test 2: Check if the heap is executable */ - heap = kmalloc(64, GFP_KERNEL); - if (!heap) - return -ENOMEM; - heap[0] = 0xC3; /* opcode for "ret" */ - - if (test_address(heap)) { - printk(KERN_ERR "test_nx: heap was executable\n"); - ret = -ENODEV; - } - kfree(heap); - - /* - * The following 2 tests currently fail, this needs to get fixed - * Until then, don't run them to avoid too many people getting scared - * by the error message - */ - - /* Test 3: Check if the .rodata section is executable */ - if (rodata_test_data != 0xC3) { - printk(KERN_ERR "test_nx: .rodata marker has invalid value\n"); - ret = -ENODEV; - } else if (test_address(&rodata_test_data)) { - printk(KERN_ERR "test_nx: .rodata section is executable\n"); - ret = -ENODEV; - } - -#if 0 - /* Test 4: Check if the .data section of a module is executable */ - if (test_address(&test_data)) { - printk(KERN_ERR "test_nx: .data section is executable\n"); - ret = -ENODEV; - } - -#endif - return ret; -} - -static void test_exit(void) -{ -} - -module_init(test_NX); -module_exit(test_exit); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("Testcase for the NX infrastructure"); -MODULE_AUTHOR("Arjan van de Ven "); -- cgit From d6f3609d2b4c6d0eec01f398cb685e50da3e6013 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Sat, 28 Jan 2017 12:37:14 -0700 Subject: Drivers: hv: restore hypervcall page cleanup before kexec We need to cleanup the hypercall page before doing kexec/kdump or the new kernel may crash if it tries to use it. Reuse the now-empty hv_cleanup function renaming it to hyperv_cleanup and moving to the arch specific code. Fixes: 8730046c1498 ("Drivers: hv vmbus: Move Hypercall page setup out of common code") Signed-off-by: Vitaly Kuznetsov Signed-off-by: K. Y. Srinivasan Signed-off-by: Greg Kroah-Hartman --- arch/x86/hyperv/hv_init.c | 16 ++++++++++++++++ arch/x86/include/asm/mshyperv.h | 1 + 2 files changed, 17 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index d289bc29d282..d6b018b86c42 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -165,6 +165,22 @@ void hyperv_init(void) clocksource_register_hz(&hyperv_cs_msr, NSEC_PER_SEC/100); } +/* + * This routine is called before kexec/kdump, it does the required cleanup. 
+ */ +void hyperv_cleanup(void) +{ + union hv_x64_msr_hypercall_contents hypercall_msr; + + /* Reset our OS id */ + wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0); + + /* Reset the hypercall page */ + hypercall_msr.as_uint64 = 0; + wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); +} +EXPORT_SYMBOL_GPL(hyperv_cleanup); + /* * hv_do_hypercall- Invoke the specified hypercall */ diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 42505d1158d6..f8dc3700de67 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -171,5 +171,6 @@ void hv_remove_crash_handler(void); void hyperv_init(void); void hyperv_report_panic(struct pt_regs *regs); bool hv_is_hypercall_page_setup(void); +void hyperv_cleanup(void); #endif #endif -- cgit From 5647dbf8f0807a35421bd0232247b02413ef2cab Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Sat, 28 Jan 2017 12:37:15 -0700 Subject: Drivers: hv: restore TSC page cleanup before kexec We need to cleanup the TSC page before doing kexec/kdump or the new kernel may crash if it tries to use it. Fixes: 63ed4e0c67df ("Drivers: hv: vmbus: Consolidate all Hyper-V specific clocksource code") Signed-off-by: Vitaly Kuznetsov Signed-off-by: K. Y. Srinivasan Signed-off-by: Greg Kroah-Hartman --- arch/x86/hyperv/hv_init.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index d6b018b86c42..b371d0e984a9 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -178,6 +178,10 @@ void hyperv_cleanup(void) /* Reset the hypercall page */ hypercall_msr.as_uint64 = 0; wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + + /* Reset the TSC page */ + hypercall_msr.as_uint64 = 0; + wrmsrl(HV_X64_MSR_REFERENCE_TSC, hypercall_msr.as_uint64); } EXPORT_SYMBOL_GPL(hyperv_cleanup); -- cgit From aaaec6fc755447a1d056765b11b24d8ff2b81366 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 31 Jan 2017 19:03:21 +0100 Subject: x86/irq: Make irq activate operations symmetric The recent commit which prevents double activation of interrupts unearthed interesting code in x86. The code (ab)uses irq_domain_activate_irq() to reconfigure an already activated interrupt. That trips over the prevention code now. Fix it by deactivating the interrupt before activating the new configuration. 
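At each affected reconfiguration site the change below turns the call into a symmetric deactivate/activate pair, conceptually:

	/* Tear down the currently active configuration first ... */
	irq_domain_deactivate_irq(irq_data);
	/* ... then activate the new one, so the domain code never sees
	 * a second activation of an already activated interrupt. */
	irq_domain_activate_irq(irq_data);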
Fixes: 08d85f3ea99f1 "irqdomain: Avoid activating interrupts more than once" Reported-and-tested-by: Mike Galbraith Reported-and-tested-by: Borislav Petkov Signed-off-by: Thomas Gleixner Cc: Andrey Ryabinin Cc: Marc Zyngier Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1701311901580.3457@nanos --- arch/x86/kernel/apic/io_apic.c | 2 ++ arch/x86/kernel/hpet.c | 1 + 2 files changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 1e35dd06b090..52f352b063fd 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2117,6 +2117,7 @@ static inline void __init check_timer(void) if (idx != -1 && irq_trigger(idx)) unmask_ioapic_irq(irq_get_chip_data(0)); } + irq_domain_deactivate_irq(irq_data); irq_domain_activate_irq(irq_data); if (timer_irq_works()) { if (disable_timer_pin_1 > 0) @@ -2138,6 +2139,7 @@ static inline void __init check_timer(void) * legacy devices should be connected to IO APIC #0 */ replace_pin_at_irq_node(data, node, apic1, pin1, apic2, pin2); + irq_domain_deactivate_irq(irq_data); irq_domain_activate_irq(irq_data); legacy_pic->unmask(0); if (timer_irq_works()) { diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 85e87b46c318..dc6ba5bda9fc 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -352,6 +352,7 @@ static int hpet_resume(struct clock_event_device *evt, int timer) } else { struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); + irq_domain_deactivate_irq(irq_get_irq_data(hdev->irq)); irq_domain_activate_irq(irq_get_irq_data(hdev->irq)); disable_irq(hdev->irq); irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu)); -- cgit From 0becc0ae5b42828785b589f686725ff5bc3b9b25 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 31 Jan 2017 09:37:34 +0100 Subject: x86/mce: Make timer handling more robust Erik reported that on a preproduction hardware a CMCI storm triggers the BUG_ON in add_timer_on(). The reason is that the per CPU MCE timer is started by the CMCI logic before the MCE CPU hotplug callback starts the timer with add_timer_on(). So the timer is already queued which triggers the BUG. Using add_timer_on() is pretty pointless in this code because the timer is strictlty per CPU, initialized as pinned and all operations which arm the timer happen on the CPU to which the timer belongs. Simplify the whole machinery by using mod_timer() instead of add_timer_on() which avoids the problem because mod_timer() can handle already queued timers. Use __start_timer() everywhere so the earliest armed expiry time is preserved. 
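The crucial difference between the two timer APIs, in a nutshell (illustrative only):

	/* add_timer_on() hits the BUG_ON() if the timer is already queued */
	add_timer_on(t, smp_processor_id());

	/* mod_timer() (re)arms the timer whether it is queued or not */
	mod_timer(t, round_jiffies(jiffies + interval));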
Reported-by: Erik Veijola Tested-by: Borislav Petkov Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Tony Luck Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1701310936080.3457@nanos Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/mcheck/mce.c | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 00ef43233e03..537c6647d84c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1373,20 +1373,15 @@ static unsigned long mce_adjust_timer_default(unsigned long interval) static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default; -static void __restart_timer(struct timer_list *t, unsigned long interval) +static void __start_timer(struct timer_list *t, unsigned long interval) { unsigned long when = jiffies + interval; unsigned long flags; local_irq_save(flags); - if (timer_pending(t)) { - if (time_before(when, t->expires)) - mod_timer(t, when); - } else { - t->expires = round_jiffies(when); - add_timer_on(t, smp_processor_id()); - } + if (!timer_pending(t) || time_before(when, t->expires)) + mod_timer(t, round_jiffies(when)); local_irq_restore(flags); } @@ -1421,7 +1416,7 @@ static void mce_timer_fn(unsigned long data) done: __this_cpu_write(mce_next_interval, iv); - __restart_timer(t, iv); + __start_timer(t, iv); } /* @@ -1432,7 +1427,7 @@ void mce_timer_kick(unsigned long interval) struct timer_list *t = this_cpu_ptr(&mce_timer); unsigned long iv = __this_cpu_read(mce_next_interval); - __restart_timer(t, interval); + __start_timer(t, interval); if (interval < iv) __this_cpu_write(mce_next_interval, interval); @@ -1779,17 +1774,15 @@ static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c) } } -static void mce_start_timer(unsigned int cpu, struct timer_list *t) +static void mce_start_timer(struct timer_list *t) { unsigned long iv = check_interval * HZ; if (mca_cfg.ignore_ce || !iv) return; - per_cpu(mce_next_interval, cpu) = iv; - - t->expires = round_jiffies(jiffies + iv); - add_timer_on(t, cpu); + this_cpu_write(mce_next_interval, iv); + __start_timer(t, iv); } static void __mcheck_cpu_setup_timer(void) @@ -1806,7 +1799,7 @@ static void __mcheck_cpu_init_timer(void) unsigned int cpu = smp_processor_id(); setup_pinned_timer(t, mce_timer_fn, cpu); - mce_start_timer(cpu, t); + mce_start_timer(t); } /* Handle unconfigured int18 (should never happen) */ @@ -2566,7 +2559,7 @@ static int mce_cpu_dead(unsigned int cpu) static int mce_cpu_online(unsigned int cpu) { - struct timer_list *t = &per_cpu(mce_timer, cpu); + struct timer_list *t = this_cpu_ptr(&mce_timer); int ret; mce_device_create(cpu); @@ -2577,13 +2570,13 @@ static int mce_cpu_online(unsigned int cpu) return ret; } mce_reenable_cpu(); - mce_start_timer(cpu, t); + mce_start_timer(t); return 0; } static int mce_cpu_pre_down(unsigned int cpu) { - struct timer_list *t = &per_cpu(mce_timer, cpu); + struct timer_list *t = this_cpu_ptr(&mce_timer); mce_disable_cpu(); del_timer_sync(t); -- cgit From dd86e373e09fb16b83e8adf5c48c421a4ca76468 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 31 Jan 2017 23:58:38 +0100 Subject: perf/x86/intel/rapl: Make package handling more robust The package management code in RAPL relies on package mapping being available before a CPU is started. 
This changed with: 9d85eb9119f4 ("x86/smpboot: Make logical package management more robust") because the ACPI/BIOS information turned out to be unreliable, but that left RAPL in broken state. This was not noticed because on a regular boot all CPUs are online before RAPL is initialized. A possible fix would be to reintroduce the mess which allocates a package data structure in CPU prepare and when it turns out to already exist in starting throw it away later in the CPU online callback. But that's a horrible hack and not required at all because RAPL becomes functional for perf only in the CPU online callback. That's correct because user space is not yet informed about the CPU being onlined, so nothing caan rely on RAPL being available on that particular CPU. Move the allocation to the CPU online callback and simplify the hotplug handling. At this point the package mapping is established and correct. This also adds a missing check for available package data in the event_init() function. Reported-by: Yasuaki Ishimatsu Signed-off-by: Thomas Gleixner Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Stephane Eranian Cc: Vince Weaver Fixes: 9d85eb9119f4 ("x86/smpboot: Make logical package management more robust") Link: http://lkml.kernel.org/r/20170131230141.212593966@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/events/intel/rapl.c | 60 +++++++++++++++++++------------------------- 1 file changed, 26 insertions(+), 34 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 17c3564d087a..22ef4f72cf32 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -161,7 +161,13 @@ static u64 rapl_timer_ms; static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu) { - return rapl_pmus->pmus[topology_logical_package_id(cpu)]; + unsigned int pkgid = topology_logical_package_id(cpu); + + /* + * The unsigned check also catches the '-1' return value for non + * existent mappings in the topology map. + */ + return pkgid < rapl_pmus->maxpkg ? rapl_pmus->pmus[pkgid] : NULL; } static inline u64 rapl_read_counter(struct perf_event *event) @@ -402,6 +408,8 @@ static int rapl_pmu_event_init(struct perf_event *event) /* must be done before validate_group */ pmu = cpu_to_rapl_pmu(event->cpu); + if (!pmu) + return -EINVAL; event->cpu = pmu->cpu; event->pmu_private = pmu; event->hw.event_base = msr; @@ -585,6 +593,20 @@ static int rapl_cpu_online(unsigned int cpu) struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); int target; + if (!pmu) { + pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); + if (!pmu) + return -ENOMEM; + + raw_spin_lock_init(&pmu->lock); + INIT_LIST_HEAD(&pmu->active_list); + pmu->pmu = &rapl_pmus->pmu; + pmu->timer_interval = ms_to_ktime(rapl_timer_ms); + rapl_hrtimer_init(pmu); + + rapl_pmus->pmus[topology_logical_package_id(cpu)] = pmu; + } + /* * Check if there is an online cpu in the package which collects rapl * events already. 
@@ -598,27 +620,6 @@ static int rapl_cpu_online(unsigned int cpu) return 0; } -static int rapl_cpu_prepare(unsigned int cpu) -{ - struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); - - if (pmu) - return 0; - - pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); - if (!pmu) - return -ENOMEM; - - raw_spin_lock_init(&pmu->lock); - INIT_LIST_HEAD(&pmu->active_list); - pmu->pmu = &rapl_pmus->pmu; - pmu->timer_interval = ms_to_ktime(rapl_timer_ms); - pmu->cpu = -1; - rapl_hrtimer_init(pmu); - rapl_pmus->pmus[topology_logical_package_id(cpu)] = pmu; - return 0; -} - static int rapl_check_hw_unit(bool apply_quirk) { u64 msr_rapl_power_unit_bits; @@ -803,29 +804,21 @@ static int __init rapl_pmu_init(void) /* * Install callbacks. Core will call them for each online cpu. */ - - ret = cpuhp_setup_state(CPUHP_PERF_X86_RAPL_PREP, "perf/x86/rapl:prepare", - rapl_cpu_prepare, NULL); - if (ret) - goto out; - ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_RAPL_ONLINE, "perf/x86/rapl:online", rapl_cpu_online, rapl_cpu_offline); if (ret) - goto out1; + goto out; ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1); if (ret) - goto out2; + goto out1; rapl_advertise(); return 0; -out2: - cpuhp_remove_state(CPUHP_AP_PERF_X86_RAPL_ONLINE); out1: - cpuhp_remove_state(CPUHP_PERF_X86_RAPL_PREP); + cpuhp_remove_state(CPUHP_AP_PERF_X86_RAPL_ONLINE); out: pr_warn("Initialization failed (%d), disabled\n", ret); cleanup_rapl_pmus(); @@ -836,7 +829,6 @@ module_init(rapl_pmu_init); static void __exit intel_rapl_exit(void) { cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_RAPL_ONLINE); - cpuhp_remove_state_nocalls(CPUHP_PERF_X86_RAPL_PREP); perf_pmu_unregister(&rapl_pmus->pmu); cleanup_rapl_pmus(); } -- cgit From 1aa6cfd33df492939b0be15ebdbcff1f8ae5ddb6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 31 Jan 2017 23:58:39 +0100 Subject: perf/x86/intel/uncore: Clean up hotplug conversion fallout The recent conversion to the hotplug state machine kept two mechanisms from the original code: 1) The first_init logic which adds the number of online CPUs in a package to the refcount. That's wrong because the callbacks are executed for all online CPUs. Remove it so the refcounting is correct. 2) The on_each_cpu() call to undo box->init() in the error handling path. That's bogus because when the prepare callback fails no box has been initialized yet. Remove it. 
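With both gone, the starting/dying callbacks rely solely on the per-package box refcount, conceptually:

	/* starting: the first CPU in a package initializes the shared box */
	if (atomic_inc_return(&box->refcnt) == 1)
		uncore_box_init(box);

	/* dying: the last CPU leaving the package tears it down */
	if (atomic_dec_return(&box->refcnt) == 0)
		uncore_box_exit(box);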
Signed-off-by: Thomas Gleixner Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Stephane Eranian Cc: Vince Weaver Cc: Yasuaki Ishimatsu Fixes: 1a246b9f58c6 ("perf/x86/intel/uncore: Convert to hotplug state machine") Link: http://lkml.kernel.org/r/20170131230141.298032324@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/events/intel/uncore.c | 44 ++++-------------------------------------- 1 file changed, 4 insertions(+), 40 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 8c4ccdc3a3f3..56c5235dcc29 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -764,30 +764,6 @@ static void uncore_pmu_unregister(struct intel_uncore_pmu *pmu) pmu->registered = false; } -static void __uncore_exit_boxes(struct intel_uncore_type *type, int cpu) -{ - struct intel_uncore_pmu *pmu = type->pmus; - struct intel_uncore_box *box; - int i, pkg; - - if (pmu) { - pkg = topology_physical_package_id(cpu); - for (i = 0; i < type->num_boxes; i++, pmu++) { - box = pmu->boxes[pkg]; - if (box) - uncore_box_exit(box); - } - } -} - -static void uncore_exit_boxes(void *dummy) -{ - struct intel_uncore_type **types; - - for (types = uncore_msr_uncores; *types; types++) - __uncore_exit_boxes(*types++, smp_processor_id()); -} - static void uncore_free_boxes(struct intel_uncore_pmu *pmu) { int pkg; @@ -1078,22 +1054,12 @@ static int uncore_cpu_dying(unsigned int cpu) return 0; } -static int first_init; - static int uncore_cpu_starting(unsigned int cpu) { struct intel_uncore_type *type, **types = uncore_msr_uncores; struct intel_uncore_pmu *pmu; struct intel_uncore_box *box; - int i, pkg, ncpus = 1; - - if (first_init) { - /* - * On init we get the number of online cpus in the package - * and set refcount for all of them. - */ - ncpus = cpumask_weight(topology_core_cpumask(cpu)); - } + int i, pkg; pkg = topology_logical_package_id(cpu); for (; *types; types++) { @@ -1104,7 +1070,7 @@ static int uncore_cpu_starting(unsigned int cpu) if (!box) continue; /* The first cpu on a package activates the box */ - if (atomic_add_return(ncpus, &box->refcnt) == ncpus) + if (atomic_inc_return(&box->refcnt) == 1) uncore_box_init(box); } } @@ -1408,19 +1374,17 @@ static int __init intel_uncore_init(void) "perf/x86/intel/uncore:prepare", uncore_cpu_prepare, NULL); } - first_init = 1; + cpuhp_setup_state(CPUHP_AP_PERF_X86_UNCORE_STARTING, "perf/x86/uncore:starting", uncore_cpu_starting, uncore_cpu_dying); - first_init = 0; + cpuhp_setup_state(CPUHP_AP_PERF_X86_UNCORE_ONLINE, "perf/x86/uncore:online", uncore_event_cpu_online, uncore_event_cpu_offline); return 0; err: - /* Undo box->init_box() */ - on_each_cpu_mask(&uncore_cpu_mask, uncore_exit_boxes, NULL, 1); uncore_types_exit(uncore_msr_uncores); uncore_pci_exit(); return ret; -- cgit From fff4b87e594ad3d2e4f51e8d3d86a6f9d3d8b654 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 31 Jan 2017 23:58:40 +0100 Subject: perf/x86/intel/uncore: Make package handling more robust The package management code in uncore relies on package mapping being available before a CPU is started. This changed with: 9d85eb9119f4 ("x86/smpboot: Make logical package management more robust") because the ACPI/BIOS information turned out to be unreliable, but that left uncore in broken state. This was not noticed because on a regular boot all CPUs are online before uncore is initialized. 
Move the allocation to the CPU online callback and simplify the hotplug handling. At this point the package mapping is established and correct. Signed-off-by: Thomas Gleixner Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Stephane Eranian Cc: Vince Weaver Cc: Yasuaki Ishimatsu Fixes: 9d85eb9119f4 ("x86/smpboot: Make logical package management more robust") Link: http://lkml.kernel.org/r/20170131230141.377156255@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/events/intel/uncore.c | 196 +++++++++++++++++++---------------------- 1 file changed, 91 insertions(+), 105 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 56c5235dcc29..1ab45976474d 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -100,7 +100,13 @@ ssize_t uncore_event_show(struct kobject *kobj, struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu) { - return pmu->boxes[topology_logical_package_id(cpu)]; + unsigned int pkgid = topology_logical_package_id(cpu); + + /* + * The unsigned check also catches the '-1' return value for non + * existent mappings in the topology map. + */ + return pkgid < max_packages ? pmu->boxes[pkgid] : NULL; } u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event) @@ -1034,76 +1040,6 @@ static void uncore_pci_exit(void) } } -static int uncore_cpu_dying(unsigned int cpu) -{ - struct intel_uncore_type *type, **types = uncore_msr_uncores; - struct intel_uncore_pmu *pmu; - struct intel_uncore_box *box; - int i, pkg; - - pkg = topology_logical_package_id(cpu); - for (; *types; types++) { - type = *types; - pmu = type->pmus; - for (i = 0; i < type->num_boxes; i++, pmu++) { - box = pmu->boxes[pkg]; - if (box && atomic_dec_return(&box->refcnt) == 0) - uncore_box_exit(box); - } - } - return 0; -} - -static int uncore_cpu_starting(unsigned int cpu) -{ - struct intel_uncore_type *type, **types = uncore_msr_uncores; - struct intel_uncore_pmu *pmu; - struct intel_uncore_box *box; - int i, pkg; - - pkg = topology_logical_package_id(cpu); - for (; *types; types++) { - type = *types; - pmu = type->pmus; - for (i = 0; i < type->num_boxes; i++, pmu++) { - box = pmu->boxes[pkg]; - if (!box) - continue; - /* The first cpu on a package activates the box */ - if (atomic_inc_return(&box->refcnt) == 1) - uncore_box_init(box); - } - } - - return 0; -} - -static int uncore_cpu_prepare(unsigned int cpu) -{ - struct intel_uncore_type *type, **types = uncore_msr_uncores; - struct intel_uncore_pmu *pmu; - struct intel_uncore_box *box; - int i, pkg; - - pkg = topology_logical_package_id(cpu); - for (; *types; types++) { - type = *types; - pmu = type->pmus; - for (i = 0; i < type->num_boxes; i++, pmu++) { - if (pmu->boxes[pkg]) - continue; - /* First cpu of a package allocates the box */ - box = uncore_alloc_box(type, cpu_to_node(cpu)); - if (!box) - return -ENOMEM; - box->pmu = pmu; - box->pkgid = pkg; - pmu->boxes[pkg] = box; - } - } - return 0; -} - static void uncore_change_type_ctx(struct intel_uncore_type *type, int old_cpu, int new_cpu) { @@ -1143,12 +1079,14 @@ static void uncore_change_context(struct intel_uncore_type **uncores, static int uncore_event_cpu_offline(unsigned int cpu) { - int target; + struct intel_uncore_type *type, **types = uncore_msr_uncores; + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + int i, pkg, target; /* Check if exiting cpu is used for 
collecting uncore events */ if (!cpumask_test_and_clear_cpu(cpu, &uncore_cpu_mask)) - return 0; - + goto unref; /* Find a new cpu to collect uncore events */ target = cpumask_any_but(topology_core_cpumask(cpu), cpu); @@ -1160,12 +1098,82 @@ static int uncore_event_cpu_offline(unsigned int cpu) uncore_change_context(uncore_msr_uncores, cpu, target); uncore_change_context(uncore_pci_uncores, cpu, target); + +unref: + /* Clear the references */ + pkg = topology_logical_package_id(cpu); + for (; *types; types++) { + type = *types; + pmu = type->pmus; + for (i = 0; i < type->num_boxes; i++, pmu++) { + box = pmu->boxes[pkg]; + if (box && atomic_dec_return(&box->refcnt) == 0) + uncore_box_exit(box); + } + } return 0; } +static int allocate_boxes(struct intel_uncore_type **types, + unsigned int pkg, unsigned int cpu) +{ + struct intel_uncore_box *box, *tmp; + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + LIST_HEAD(allocated); + int i; + + /* Try to allocate all required boxes */ + for (; *types; types++) { + type = *types; + pmu = type->pmus; + for (i = 0; i < type->num_boxes; i++, pmu++) { + if (pmu->boxes[pkg]) + continue; + box = uncore_alloc_box(type, cpu_to_node(cpu)); + if (!box) + goto cleanup; + box->pmu = pmu; + box->pkgid = pkg; + list_add(&box->active_list, &allocated); + } + } + /* Install them in the pmus */ + list_for_each_entry_safe(box, tmp, &allocated, active_list) { + list_del_init(&box->active_list); + box->pmu->boxes[pkg] = box; + } + return 0; + +cleanup: + list_for_each_entry_safe(box, tmp, &allocated, active_list) { + list_del_init(&box->active_list); + kfree(box); + } + return -ENOMEM; +} + static int uncore_event_cpu_online(unsigned int cpu) { - int target; + struct intel_uncore_type *type, **types = uncore_msr_uncores; + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + int i, ret, pkg, target; + + pkg = topology_logical_package_id(cpu); + ret = allocate_boxes(types, pkg, cpu); + if (ret) + return ret; + + for (; *types; types++) { + type = *types; + pmu = type->pmus; + for (i = 0; i < type->num_boxes; i++, pmu++) { + box = pmu->boxes[pkg]; + if (!box && atomic_inc_return(&box->refcnt) == 1) + uncore_box_init(box); + } + } /* * Check if there is an online cpu in the package @@ -1355,33 +1363,13 @@ static int __init intel_uncore_init(void) if (cret && pret) return -ENODEV; - /* - * Install callbacks. Core will call them for each online cpu. - * - * The first online cpu of each package allocates and takes - * the refcounts for all other online cpus in that package. - * If msrs are not enabled no allocation is required and - * uncore_cpu_prepare() is not called for each online cpu. 
- */ - if (!cret) { - ret = cpuhp_setup_state(CPUHP_PERF_X86_UNCORE_PREP, - "perf/x86/intel/uncore:prepare", - uncore_cpu_prepare, NULL); - if (ret) - goto err; - } else { - cpuhp_setup_state_nocalls(CPUHP_PERF_X86_UNCORE_PREP, - "perf/x86/intel/uncore:prepare", - uncore_cpu_prepare, NULL); - } - - cpuhp_setup_state(CPUHP_AP_PERF_X86_UNCORE_STARTING, - "perf/x86/uncore:starting", - uncore_cpu_starting, uncore_cpu_dying); - - cpuhp_setup_state(CPUHP_AP_PERF_X86_UNCORE_ONLINE, - "perf/x86/uncore:online", - uncore_event_cpu_online, uncore_event_cpu_offline); + /* Install hotplug callbacks to setup the targets for each package */ + ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_UNCORE_ONLINE, + "perf/x86/intel/uncore:online", + uncore_event_cpu_online, + uncore_event_cpu_offline); + if (ret) + goto err; return 0; err: @@ -1393,9 +1381,7 @@ module_init(intel_uncore_init); static void __exit intel_uncore_exit(void) { - cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_UNCORE_ONLINE); - cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_UNCORE_STARTING); - cpuhp_remove_state_nocalls(CPUHP_PERF_X86_UNCORE_PREP); + cpuhp_remove_state(CPUHP_AP_PERF_X86_UNCORE_ONLINE); uncore_types_exit(uncore_msr_uncores); uncore_pci_exit(); } -- cgit From 2bd79f30eea1a7c3082c930a91370bb68435b86d Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Tue, 31 Jan 2017 13:21:33 +0000 Subject: efi: Deduplicate efi_file_size() / _read() / _close() There's one ARM, one x86_32 and one x86_64 version which can be folded into a single shared version by masking their differences with the shiny new efi_call_proto() macro. No functional change intended. Signed-off-by: Lukas Wunner Signed-off-by: Matt Fleming Signed-off-by: Ard Biesheuvel Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/1485868902-20401-2-git-send-email-ard.biesheuvel@linaro.org Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/eboot.c | 148 --------------------------------------- 1 file changed, 148 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index ff01c8fc76f7..f1cf284d631e 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -38,154 +38,6 @@ static void setup_boot_services##bits(struct efi_config *c) \ BOOT_SERVICES(32); BOOT_SERVICES(64); -void efi_char16_printk(efi_system_table_t *, efi_char16_t *); - -static efi_status_t -__file_size32(void *__fh, efi_char16_t *filename_16, - void **handle, u64 *file_sz) -{ - efi_file_handle_32_t *h, *fh = __fh; - efi_file_info_t *info; - efi_status_t status; - efi_guid_t info_guid = EFI_FILE_INFO_ID; - u32 info_sz; - - status = efi_early->call((unsigned long)fh->open, fh, &h, filename_16, - EFI_FILE_MODE_READ, (u64)0); - if (status != EFI_SUCCESS) { - efi_printk(sys_table, "Failed to open file: "); - efi_char16_printk(sys_table, filename_16); - efi_printk(sys_table, "\n"); - return status; - } - - *handle = h; - - info_sz = 0; - status = efi_early->call((unsigned long)h->get_info, h, &info_guid, - &info_sz, NULL); - if (status != EFI_BUFFER_TOO_SMALL) { - efi_printk(sys_table, "Failed to get file info size\n"); - return status; - } - -grow: - status = efi_call_early(allocate_pool, EFI_LOADER_DATA, - info_sz, (void **)&info); - if (status != EFI_SUCCESS) { - efi_printk(sys_table, "Failed to alloc mem for file info\n"); - return status; - } - - status = efi_early->call((unsigned long)h->get_info, h, &info_guid, - &info_sz, info); - if (status == 
EFI_BUFFER_TOO_SMALL) { - efi_call_early(free_pool, info); - goto grow; - } - - *file_sz = info->file_size; - efi_call_early(free_pool, info); - - if (status != EFI_SUCCESS) - efi_printk(sys_table, "Failed to get initrd info\n"); - - return status; -} - -static efi_status_t -__file_size64(void *__fh, efi_char16_t *filename_16, - void **handle, u64 *file_sz) -{ - efi_file_handle_64_t *h, *fh = __fh; - efi_file_info_t *info; - efi_status_t status; - efi_guid_t info_guid = EFI_FILE_INFO_ID; - u64 info_sz; - - status = efi_early->call((unsigned long)fh->open, fh, &h, filename_16, - EFI_FILE_MODE_READ, (u64)0); - if (status != EFI_SUCCESS) { - efi_printk(sys_table, "Failed to open file: "); - efi_char16_printk(sys_table, filename_16); - efi_printk(sys_table, "\n"); - return status; - } - - *handle = h; - - info_sz = 0; - status = efi_early->call((unsigned long)h->get_info, h, &info_guid, - &info_sz, NULL); - if (status != EFI_BUFFER_TOO_SMALL) { - efi_printk(sys_table, "Failed to get file info size\n"); - return status; - } - -grow: - status = efi_call_early(allocate_pool, EFI_LOADER_DATA, - info_sz, (void **)&info); - if (status != EFI_SUCCESS) { - efi_printk(sys_table, "Failed to alloc mem for file info\n"); - return status; - } - - status = efi_early->call((unsigned long)h->get_info, h, &info_guid, - &info_sz, info); - if (status == EFI_BUFFER_TOO_SMALL) { - efi_call_early(free_pool, info); - goto grow; - } - - *file_sz = info->file_size; - efi_call_early(free_pool, info); - - if (status != EFI_SUCCESS) - efi_printk(sys_table, "Failed to get initrd info\n"); - - return status; -} -efi_status_t -efi_file_size(efi_system_table_t *sys_table, void *__fh, - efi_char16_t *filename_16, void **handle, u64 *file_sz) -{ - if (efi_early->is64) - return __file_size64(__fh, filename_16, handle, file_sz); - - return __file_size32(__fh, filename_16, handle, file_sz); -} - -efi_status_t -efi_file_read(void *handle, unsigned long *size, void *addr) -{ - unsigned long func; - - if (efi_early->is64) { - efi_file_handle_64_t *fh = handle; - - func = (unsigned long)fh->read; - return efi_early->call(func, handle, size, addr); - } else { - efi_file_handle_32_t *fh = handle; - - func = (unsigned long)fh->read; - return efi_early->call(func, handle, size, addr); - } -} - -efi_status_t efi_file_close(void *handle) -{ - if (efi_early->is64) { - efi_file_handle_64_t *fh = handle; - - return efi_early->call((unsigned long)fh->close, handle); - } else { - efi_file_handle_32_t *fh = handle; - - return efi_early->call((unsigned long)fh->close, handle); - } -} - static inline efi_status_t __open_volume32(void *__image, void **__fh) { efi_file_io_interface_t *io; -- cgit From db4545d9a7881db0a7e18599e6cd1adbcb93db33 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Tue, 31 Jan 2017 13:21:34 +0000 Subject: x86/efi: Deduplicate efi_char16_printk() Eliminate the separate 32-bit and 64x- bit code paths by way of the shiny new efi_call_proto() macro. No functional change intended. 
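Conceptually, efi_call_proto() does what the removed open-coded version did: select the 64-bit or 32-bit layout of the protocol structure based on efi_early->is64, fetch the requested member (here output_string) and dispatch through efi_early->call(). A heavily simplified sketch of that idea, not the actual macro body (out32/out64 are illustrative names):

	if (efi_early->is64)
		efi_early->call((unsigned long)out64->output_string, out64, str);
	else
		efi_early->call((unsigned long)out32->output_string, out32, str);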
Signed-off-by: Lukas Wunner Signed-off-by: Matt Fleming Signed-off-by: Ard Biesheuvel Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/1485868902-20401-3-git-send-email-ard.biesheuvel@linaro.org Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/eboot.c | 26 ++------------------------ 1 file changed, 2 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index f1cf284d631e..6d3aeabbce68 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -101,30 +101,8 @@ efi_open_volume(efi_system_table_t *sys_table, void *__image, void **__fh) void efi_char16_printk(efi_system_table_t *table, efi_char16_t *str) { - unsigned long output_string; - size_t offset; - - if (efi_early->is64) { - struct efi_simple_text_output_protocol_64 *out; - u64 *func; - - offset = offsetof(typeof(*out), output_string); - output_string = efi_early->text_output + offset; - out = (typeof(out))(unsigned long)efi_early->text_output; - func = (u64 *)output_string; - - efi_early->call(*func, out, str); - } else { - struct efi_simple_text_output_protocol_32 *out; - u32 *func; - - offset = offsetof(typeof(*out), output_string); - output_string = efi_early->text_output + offset; - out = (typeof(out))(unsigned long)efi_early->text_output; - func = (u32 *)output_string; - - efi_early->call(*func, out, str); - } + efi_call_proto(efi_simple_text_output_protocol, output_string, + efi_early->text_output, str); } static efi_status_t -- cgit From 18141e89a76c58101860486fd9cc0999da2eed43 Mon Sep 17 00:00:00 2001 From: Sai Praneeth Date: Tue, 31 Jan 2017 13:21:37 +0000 Subject: x86/efi: Add support for EFI_MEMORY_ATTRIBUTES_TABLE UEFI v2.6 introduces EFI_MEMORY_ATTRIBUTES_TABLE which describes memory protections that may be applied to the EFI Runtime code and data regions by the kernel. This enables the kernel to map these regions more strictly thereby increasing security. Presently, the only valid bits for the attribute field of a memory descriptor are EFI_MEMORY_RO and EFI_MEMORY_XP, hence use these bits to update the mappings in efi_pgd. The UEFI specification recommends to use this feature instead of EFI_PROPERTIES_TABLE and hence while updating EFI mappings we first check for EFI_MEMORY_ATTRIBUTES_TABLE and if it's present we update the mappings according to this table and hence disregarding EFI_PROPERTIES_TABLE even if it's published by the firmware. We consider EFI_PROPERTIES_TABLE only when EFI_MEMORY_ATTRIBUTES_TABLE is absent. 
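The mapping from descriptor attributes to page protection bits, as the new efi_update_mem_attr() helper below implements it, is simply:

	unsigned long pf = 0;

	if (md->attribute & EFI_MEMORY_XP)	/* region must not be executable */
		pf |= _PAGE_NX;
	if (!(md->attribute & EFI_MEMORY_RO))	/* writable unless marked read-only */
		pf |= _PAGE_RW;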
Signed-off-by: Sai Praneeth Prakhya Signed-off-by: Matt Fleming Signed-off-by: Ard Biesheuvel Cc: Borislav Petkov Cc: Fenghua Yu Cc: Lee, Chun-Yi Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Ravi Shankar Cc: Ricardo Neri Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/1485868902-20401-6-git-send-email-ard.biesheuvel@linaro.org Signed-off-by: Ingo Molnar --- arch/x86/platform/efi/efi_64.c | 64 +++++++++++++++++++++++++++++++++++------- 1 file changed, 54 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 2f25a363068c..a4695da42d77 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -414,10 +414,44 @@ void __init parse_efi_setup(u64 phys_addr, u32 data_len) efi_setup = phys_addr + sizeof(struct setup_data); } -void __init efi_runtime_update_mappings(void) +static int __init efi_update_mappings(efi_memory_desc_t *md, unsigned long pf) { unsigned long pfn; pgd_t *pgd = efi_pgd; + int err1, err2; + + /* Update the 1:1 mapping */ + pfn = md->phys_addr >> PAGE_SHIFT; + err1 = kernel_map_pages_in_pgd(pgd, pfn, md->phys_addr, md->num_pages, pf); + if (err1) { + pr_err("Error while updating 1:1 mapping PA 0x%llx -> VA 0x%llx!\n", + md->phys_addr, md->virt_addr); + } + + err2 = kernel_map_pages_in_pgd(pgd, pfn, md->virt_addr, md->num_pages, pf); + if (err2) { + pr_err("Error while updating VA mapping PA 0x%llx -> VA 0x%llx!\n", + md->phys_addr, md->virt_addr); + } + + return err1 || err2; +} + +static int __init efi_update_mem_attr(struct mm_struct *mm, efi_memory_desc_t *md) +{ + unsigned long pf = 0; + + if (md->attribute & EFI_MEMORY_XP) + pf |= _PAGE_NX; + + if (!(md->attribute & EFI_MEMORY_RO)) + pf |= _PAGE_RW; + + return efi_update_mappings(md, pf); +} + +void __init efi_runtime_update_mappings(void) +{ efi_memory_desc_t *md; if (efi_enabled(EFI_OLD_MEMMAP)) { @@ -426,6 +460,24 @@ void __init efi_runtime_update_mappings(void) return; } + /* + * Use the EFI Memory Attribute Table for mapping permissions if it + * exists, since it is intended to supersede EFI_PROPERTIES_TABLE. + */ + if (efi_enabled(EFI_MEM_ATTR)) { + efi_memattr_apply_permissions(NULL, efi_update_mem_attr); + return; + } + + /* + * EFI_MEMORY_ATTRIBUTES_TABLE is intended to replace + * EFI_PROPERTIES_TABLE. So, use EFI_PROPERTIES_TABLE to update + * permissions only if EFI_MEMORY_ATTRIBUTES_TABLE is not + * published by the firmware. Even if we find a buggy implementation of + * EFI_MEMORY_ATTRIBUTES_TABLE, don't fall back to + * EFI_PROPERTIES_TABLE, because of the same reason. + */ + if (!efi_enabled(EFI_NX_PE_DATA)) return; @@ -446,15 +498,7 @@ void __init efi_runtime_update_mappings(void) (md->type != EFI_RUNTIME_SERVICES_CODE)) pf |= _PAGE_RW; - /* Update the 1:1 mapping */ - pfn = md->phys_addr >> PAGE_SHIFT; - if (kernel_map_pages_in_pgd(pgd, pfn, md->phys_addr, md->num_pages, pf)) - pr_warn("Error mapping PA 0x%llx -> VA 0x%llx!\n", - md->phys_addr, md->virt_addr); - - if (kernel_map_pages_in_pgd(pgd, pfn, md->virt_addr, md->num_pages, pf)) - pr_warn("Error mapping PA 0x%llx -> VA 0x%llx!\n", - md->phys_addr, md->virt_addr); + efi_update_mappings(md, pf); } } -- cgit From 7b0a911478c74ca02581d496f732c10e811e894f Mon Sep 17 00:00:00 2001 From: Dave Young Date: Tue, 31 Jan 2017 13:21:40 +0000 Subject: efi/x86: Move the EFI BGRT init code to early init code Before invoking the arch specific handler, efi_mem_reserve() reserves the given memory region through memblock. 
efi_bgrt_init() will call efi_mem_reserve() after mm_init(), at which time memblock is dead and should not be used anymore. The EFI BGRT code depends on ACPI initialization to get the BGRT ACPI table, so move parsing of the BGRT table to ACPI early boot code to ensure that efi_mem_reserve() in EFI BGRT code still use memblock safely. Tested-by: Bhupesh Sharma Signed-off-by: Dave Young Signed-off-by: Ard Biesheuvel Cc: Len Brown Cc: Linus Torvalds Cc: Matt Fleming Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Thomas Gleixner Cc: linux-acpi@vger.kernel.org Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/1485868902-20401-9-git-send-email-ard.biesheuvel@linaro.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 9 ++++++ arch/x86/platform/efi/efi-bgrt.c | 59 +++++++++++++++++----------------------- arch/x86/platform/efi/efi.c | 5 ---- 3 files changed, 34 insertions(+), 39 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 64422f850e95..7ff007ed899d 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -1557,6 +1558,12 @@ int __init early_acpi_boot_init(void) return 0; } +static int __init acpi_parse_bgrt(struct acpi_table_header *table) +{ + efi_bgrt_init(table); + return 0; +} + int __init acpi_boot_init(void) { /* those are executed after early-quirks are executed */ @@ -1581,6 +1588,8 @@ int __init acpi_boot_init(void) acpi_process_madt(); acpi_table_parse(ACPI_SIG_HPET, acpi_parse_hpet); + if (IS_ENABLED(CONFIG_ACPI_BGRT)) + acpi_table_parse(ACPI_SIG_BGRT, acpi_parse_bgrt); if (!acpi_noirq) x86_init.pci.init = pci_acpi_init; diff --git a/arch/x86/platform/efi/efi-bgrt.c b/arch/x86/platform/efi/efi-bgrt.c index 6aad870e8962..04ca8764f0c0 100644 --- a/arch/x86/platform/efi/efi-bgrt.c +++ b/arch/x86/platform/efi/efi-bgrt.c @@ -19,8 +19,7 @@ #include #include -struct acpi_table_bgrt *bgrt_tab; -void *__initdata bgrt_image; +struct acpi_table_bgrt bgrt_tab; size_t __initdata bgrt_image_size; struct bmp_header { @@ -28,66 +27,58 @@ struct bmp_header { u32 size; } __packed; -void __init efi_bgrt_init(void) +void __init efi_bgrt_init(struct acpi_table_header *table) { - acpi_status status; void *image; struct bmp_header bmp_header; + struct acpi_table_bgrt *bgrt = &bgrt_tab; if (acpi_disabled) return; - status = acpi_get_table("BGRT", 0, - (struct acpi_table_header **)&bgrt_tab); - if (ACPI_FAILURE(status)) - return; - - if (bgrt_tab->header.length < sizeof(*bgrt_tab)) { + if (table->length < sizeof(bgrt_tab)) { pr_notice("Ignoring BGRT: invalid length %u (expected %zu)\n", - bgrt_tab->header.length, sizeof(*bgrt_tab)); + table->length, sizeof(bgrt_tab)); return; } - if (bgrt_tab->version != 1) { + *bgrt = *(struct acpi_table_bgrt *)table; + if (bgrt->version != 1) { pr_notice("Ignoring BGRT: invalid version %u (expected 1)\n", - bgrt_tab->version); - return; + bgrt->version); + goto out; } - if (bgrt_tab->status & 0xfe) { + if (bgrt->status & 0xfe) { pr_notice("Ignoring BGRT: reserved status bits are non-zero %u\n", - bgrt_tab->status); - return; + bgrt->status); + goto out; } - if (bgrt_tab->image_type != 0) { + if (bgrt->image_type != 0) { pr_notice("Ignoring BGRT: invalid image type %u (expected 0)\n", - bgrt_tab->image_type); - return; + bgrt->image_type); + goto out; } - if (!bgrt_tab->image_address) { + if (!bgrt->image_address) { pr_notice("Ignoring BGRT: null image address\n"); - return; + goto out; } - 
image = memremap(bgrt_tab->image_address, sizeof(bmp_header), MEMREMAP_WB); + image = early_memremap(bgrt->image_address, sizeof(bmp_header)); if (!image) { pr_notice("Ignoring BGRT: failed to map image header memory\n"); - return; + goto out; } memcpy(&bmp_header, image, sizeof(bmp_header)); - memunmap(image); + early_memunmap(image, sizeof(bmp_header)); if (bmp_header.id != 0x4d42) { pr_notice("Ignoring BGRT: Incorrect BMP magic number 0x%x (expected 0x4d42)\n", bmp_header.id); - return; + goto out; } bgrt_image_size = bmp_header.size; + efi_mem_reserve(bgrt->image_address, bgrt_image_size); - bgrt_image = memremap(bgrt_tab->image_address, bmp_header.size, MEMREMAP_WB); - if (!bgrt_image) { - pr_notice("Ignoring BGRT: failed to map image memory\n"); - bgrt_image = NULL; - return; - } - - efi_mem_reserve(bgrt_tab->image_address, bgrt_image_size); + return; +out: + memset(bgrt, 0, sizeof(bgrt_tab)); } diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 274dfc481849..0d4becfc5145 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -542,11 +542,6 @@ void __init efi_init(void) efi_print_memmap(); } -void __init efi_late_init(void) -{ - efi_bgrt_init(); -} - void __init efi_set_executable(efi_memory_desc_t *md, bool executable) { u64 addr, npages; -- cgit From 22c091d02a5422d2825a4fb1af71e5a62f9e4d0f Mon Sep 17 00:00:00 2001 From: Dave Young Date: Tue, 31 Jan 2017 13:21:41 +0000 Subject: efi/x86: Add debug code to print cooked memmap It is not obvious if the reserved boot area are added correctly, add a efi_print_memmap() call to print the new memmap. Tested-by: Nicolai Stange Signed-off-by: Dave Young Signed-off-by: Ard Biesheuvel Reviewed-by: Nicolai Stange Cc: Linus Torvalds Cc: Matt Fleming Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/1485868902-20401-10-git-send-email-ard.biesheuvel@linaro.org Signed-off-by: Ingo Molnar --- arch/x86/platform/efi/efi.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 0d4becfc5145..565dff3c9a12 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -955,6 +955,11 @@ static void __init __efi_enter_virtual_mode(void) return; } + if (efi_enabled(EFI_DBG)) { + pr_info("EFI runtime memory map:\n"); + efi_print_memmap(); + } + BUG_ON(!efi.systab); if (efi_setup_page_tables(pa, 1 << pg_shift)) { -- cgit From a1cecf2ba78e0a6de00ff99df34b662728535aa5 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 31 Jan 2017 04:09:22 +0100 Subject: sched/cputime: Introduce special task_cputime_t() API to return old-typed cputime This API returns a task's cputime in cputime_t in order to ease the conversion of cputime internals to use nsecs units instead. Blindly converting all cputime readers to use this API now will later let us convert more smoothly and step by step all these places to use the new nsec based cputime. 
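The next two patches in this series show what the transition looks like for one caller (apm_cpu_idle()): first the read is switched to the compatibility accessor, then to the nsec-based one:

	/* step 1: keep cputime_t locally, read via the transition helper */
	cputime_t utime, stime;
	task_cputime_t(current, &utime, &stime);

	/* step 2 (follow-up patch): switch to native nsec accounting */
	u64 utime, stime;
	task_cputime(current, &utime, &stime);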
Signed-off-by: Frederic Weisbecker Cc: Benjamin Herrenschmidt Cc: Fenghua Yu Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Rik van Riel Cc: Stanislaw Gruszka Cc: Thomas Gleixner Cc: Tony Luck Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1485832191-26889-7-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apm_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 45d44c173cf9..89c84fcdd3c0 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -913,7 +913,7 @@ static int apm_cpu_idle(struct cpuidle_device *dev, unsigned int bucket; recalc: - task_cputime(current, &utime, &stime); + task_cputime_t(current, &utime, &stime); if (jiffies_since_last_check > IDLE_CALC_LIMIT) { use_apm_idle = 0; } else if (jiffies_since_last_check > idle_period) { -- cgit From 5613fda9a503cd6137b120298902a34a1386b2c1 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 31 Jan 2017 04:09:23 +0100 Subject: sched/cputime: Convert task/group cputime to nsecs Now that most cputime readers use the transition API which return the task cputime in old style cputime_t, we can safely store the cputime in nsecs. This will eventually make cputime statistics less opaque and more granular. Back and forth convertions between cputime_t and nsecs in order to deal with cputime_t random granularity won't be needed anymore. Signed-off-by: Frederic Weisbecker Cc: Benjamin Herrenschmidt Cc: Fenghua Yu Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Rik van Riel Cc: Stanislaw Gruszka Cc: Thomas Gleixner Cc: Tony Luck Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1485832191-26889-8-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kvm/hyperv.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 1572c35b4f1a..2ecd7dab4631 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -964,10 +964,11 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, /* Calculate cpu time spent by current task in 100ns units */ static u64 current_task_runtime_100ns(void) { - cputime_t utime, stime; + u64 utime, stime; task_cputime_adjusted(current, &utime, &stime); - return div_u64(cputime_to_nsecs(utime + stime), 100); + + return div_u64(utime + stime, 100); } static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) -- cgit From f7dcd63de44219fb5e9a36fc2c0ca23ddd79d01c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 31 Jan 2017 04:09:25 +0100 Subject: x86: Convert obsolete cputime type to nsecs Use the new nsec based cputime accessors as part of the whole cputime conversion from cputime_t to nsecs. 
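The call-site pattern is mechanical, as the apm_32.c hunk below shows; roughly, code of the form

	cputime_t stime, utime;

	task_cputime_t(current, &utime, &stime);
	idle_percentage = cputime_to_jiffies(stime - last_stime);

becomes

	u64 stime, utime;

	task_cputime(current, &utime, &stime);		/* values are nsecs now */
	idle_percentage = nsecs_to_jiffies(stime - last_stime);
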
Signed-off-by: Frederic Weisbecker Cc: Benjamin Herrenschmidt Cc: Fenghua Yu Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Rik van Riel Cc: Stanislaw Gruszka Cc: Thomas Gleixner Cc: Tony Luck Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1485832191-26889-10-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apm_32.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 89c84fcdd3c0..4a7080c84a5a 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -905,21 +905,21 @@ static int apm_cpu_idle(struct cpuidle_device *dev, { static int use_apm_idle; /* = 0 */ static unsigned int last_jiffies; /* = 0 */ - static unsigned int last_stime; /* = 0 */ - cputime_t stime, utime; + static u64 last_stime; /* = 0 */ + u64 stime, utime; int apm_idle_done = 0; unsigned int jiffies_since_last_check = jiffies - last_jiffies; unsigned int bucket; recalc: - task_cputime_t(current, &utime, &stime); + task_cputime(current, &utime, &stime); if (jiffies_since_last_check > IDLE_CALC_LIMIT) { use_apm_idle = 0; } else if (jiffies_since_last_check > idle_period) { unsigned int idle_percentage; - idle_percentage = cputime_to_jiffies(stime - last_stime); + idle_percentage = nsecs_to_jiffies(stime - last_stime); idle_percentage *= 100; idle_percentage /= jiffies_since_last_check; use_apm_idle = (idle_percentage > idle_threshold); -- cgit From b672592f022152155fde7db99aafbcf04a2c3ba5 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 31 Jan 2017 04:09:51 +0100 Subject: sched/cputime: Remove generic asm headers cputime_t is now only used by two architectures: * powerpc (when CONFIG_VIRT_CPU_ACCOUNTING_NATIVE=y) * s390 And since the core doesn't use it anymore, we don't need any arch support from the others. So we can remove their stub implementations. A final cleanup would be to provide an efficient pure arch implementation of cputime_to_nsec() for s390 and powerpc and finally remove include/linux/cputime.h . Signed-off-by: Frederic Weisbecker Cc: Benjamin Herrenschmidt Cc: Fenghua Yu Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Rik van Riel Cc: Stanislaw Gruszka Cc: Thomas Gleixner Cc: Tony Luck Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1485832191-26889-36-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/Kbuild | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index 2b892e2313a9..5d6a53fd7521 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -7,7 +7,6 @@ generated-y += unistd_64_x32.h generated-y += xen-hypercalls.h generic-y += clkdev.h -generic-y += cputime.h generic-y += dma-contiguous.h generic-y += early_ioremap.h generic-y += mcs_spinlock.h -- cgit From 7243e10689fd17a3e151f41216569295cefa2958 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 14 Jan 2017 09:26:12 +0100 Subject: x86/platform/UV: Clean up the UV APIC code Make it more readable. 
Acked-by: Thomas Gleixner Cc: Dimitri Sivanich Cc: Linus Torvalds Cc: Mike Travis Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170114082612.GA27842@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/x2apic_uv_x.c | 500 ++++++++++++++++++------------------- 1 file changed, 244 insertions(+), 256 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 97ea712fc72f..656994ac4677 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -41,15 +41,13 @@ DEFINE_PER_CPU(int, x2apic_extra_bits); -#define PR_DEVEL(fmt, args...) pr_devel("%s: " fmt, __func__, args) +static enum uv_system_type uv_system_type; +static u64 gru_start_paddr, gru_end_paddr; +static u64 gru_dist_base, gru_first_node_paddr = -1LL, gru_last_node_paddr; +static u64 gru_dist_lmask, gru_dist_umask; +static union uvh_apicid uvh_apicid; -static enum uv_system_type uv_system_type; -static u64 gru_start_paddr, gru_end_paddr; -static u64 gru_dist_base, gru_first_node_paddr = -1LL, gru_last_node_paddr; -static u64 gru_dist_lmask, gru_dist_umask; -static union uvh_apicid uvh_apicid; - -/* info derived from CPUID */ +/* Information derived from CPUID: */ static struct { unsigned int apicid_shift; unsigned int apicid_mask; @@ -61,21 +59,25 @@ static struct { int uv_min_hub_revision_id; EXPORT_SYMBOL_GPL(uv_min_hub_revision_id); + unsigned int uv_apicid_hibits; EXPORT_SYMBOL_GPL(uv_apicid_hibits); static struct apic apic_x2apic_uv_x; static struct uv_hub_info_s uv_hub_info_node0; -/* Set this to use hardware error handler instead of kernel panic */ +/* Set this to use hardware error handler instead of kernel panic: */ static int disable_uv_undefined_panic = 1; + unsigned long uv_undefined(char *str) { if (likely(!disable_uv_undefined_panic)) panic("UV: error: undefined MMR: %s\n", str); else pr_crit("UV: error: undefined MMR: %s\n", str); - return ~0ul; /* cause a machine fault */ + + /* Cause a machine fault: */ + return ~0ul; } EXPORT_SYMBOL(uv_undefined); @@ -86,18 +88,19 @@ static unsigned long __init uv_early_read_mmr(unsigned long addr) mmr = early_ioremap(UV_LOCAL_MMR_BASE | addr, sizeof(*mmr)); val = *mmr; early_iounmap(mmr, sizeof(*mmr)); + return val; } static inline bool is_GRU_range(u64 start, u64 end) { if (gru_dist_base) { - u64 su = start & gru_dist_umask; /* upper (incl pnode) bits */ - u64 sl = start & gru_dist_lmask; /* base offset bits */ + u64 su = start & gru_dist_umask; /* Upper (incl pnode) bits */ + u64 sl = start & gru_dist_lmask; /* Base offset bits */ u64 eu = end & gru_dist_umask; u64 el = end & gru_dist_lmask; - /* Must reside completely within a single GRU range */ + /* Must reside completely within a single GRU range: */ return (sl == gru_dist_base && el == gru_dist_base && su >= gru_first_node_paddr && su <= gru_last_node_paddr && @@ -141,7 +144,7 @@ static int __init early_get_pnodeid(void) uv_hub_info->hub_revision = uv_min_hub_revision_id; uv_cpuid.pnode_mask = (1 << m_n_config.s.n_skt) - 1; pnode = (node_id.s.node_id >> 1) & uv_cpuid.pnode_mask; - uv_cpuid.gpa_shift = 46; /* default unless changed */ + uv_cpuid.gpa_shift = 46; /* Default unless changed */ pr_info("UV: rev:%d part#:%x nodeid:%04x n_skt:%d pnmsk:%x pn:%x\n", node_id.s.revision, node_id.s.part_number, node_id.s.node_id, @@ -149,11 +152,12 @@ static int __init early_get_pnodeid(void) return pnode; } -/* [copied from arch/x86/kernel/cpu/topology.c:detect_extended_topology()] */ -#define SMT_LEVEL 0 /* leaf 0xb SMT 
level */ -#define INVALID_TYPE 0 /* leaf 0xb sub-leaf types */ -#define SMT_TYPE 1 -#define CORE_TYPE 2 +/* [Copied from arch/x86/kernel/cpu/topology.c:detect_extended_topology()] */ + +#define SMT_LEVEL 0 /* Leaf 0xb SMT level */ +#define INVALID_TYPE 0 /* Leaf 0xb sub-leaf types */ +#define SMT_TYPE 1 +#define CORE_TYPE 2 #define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff) #define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f) @@ -167,11 +171,13 @@ static void set_x2apic_bits(void) pr_info("UV: CPU does not have CPUID.11\n"); return; } + cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx); if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE)) { pr_info("UV: CPUID.11 not implemented\n"); return; } + sid_shift = BITS_SHIFT_NEXT_LEVEL(eax); sub_index = 1; do { @@ -182,8 +188,9 @@ static void set_x2apic_bits(void) } sub_index++; } while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE); - uv_cpuid.apicid_shift = 0; - uv_cpuid.apicid_mask = (~(-1 << sid_shift)); + + uv_cpuid.apicid_shift = 0; + uv_cpuid.apicid_mask = (~(-1 << sid_shift)); uv_cpuid.socketid_shift = sid_shift; } @@ -194,10 +201,8 @@ static void __init early_get_apic_socketid_shift(void) set_x2apic_bits(); - pr_info("UV: apicid_shift:%d apicid_mask:0x%x\n", - uv_cpuid.apicid_shift, uv_cpuid.apicid_mask); - pr_info("UV: socketid_shift:%d pnode_mask:0x%x\n", - uv_cpuid.socketid_shift, uv_cpuid.pnode_mask); + pr_info("UV: apicid_shift:%d apicid_mask:0x%x\n", uv_cpuid.apicid_shift, uv_cpuid.apicid_mask); + pr_info("UV: socketid_shift:%d pnode_mask:0x%x\n", uv_cpuid.socketid_shift, uv_cpuid.pnode_mask); } /* @@ -210,10 +215,8 @@ static void __init uv_set_apicid_hibit(void) union uv1h_lb_target_physical_apic_id_mask_u apicid_mask; if (is_uv1_hub()) { - apicid_mask.v = - uv_early_read_mmr(UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK); - uv_apicid_hibits = - apicid_mask.s1.bit_enables & UV_APICID_HIBIT_MASK; + apicid_mask.v = uv_early_read_mmr(UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK); + uv_apicid_hibits = apicid_mask.s1.bit_enables & UV_APICID_HIBIT_MASK; } } @@ -230,12 +233,12 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 0; } - /* Setup early hub type field in uv_hub_info for Node 0 */ + /* Set up early hub type field in uv_hub_info for Node 0 */ uv_cpu_info->p_uv_hub_info = &uv_hub_info_node0; /* * Determine UV arch type. 
- * SGI: UV100/1000 + * SGI: UV100/1000 * SGI2: UV2000/3000 * SGI3: UV300 (truncated to 4 chars because of different varieties) * SGI4: UV400 (truncated to 4 chars because of different varieties) @@ -251,31 +254,32 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) pnodeid = early_get_pnodeid(); early_get_apic_socketid_shift(); - x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; + + x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; x86_platform.nmi_init = uv_nmi_init; - if (!strcmp(oem_table_id, "UVX")) { /* most common */ + if (!strcmp(oem_table_id, "UVX")) { + /* This is the most common hardware variant: */ uv_system_type = UV_X2APIC; uv_apic = 0; - } else if (!strcmp(oem_table_id, "UVH")) { /* only UV1 systems */ + } else if (!strcmp(oem_table_id, "UVH")) { + /* Only UV1 systems: */ uv_system_type = UV_NON_UNIQUE_APIC; - __this_cpu_write(x2apic_extra_bits, - pnodeid << uvh_apicid.s.pnode_shift); + __this_cpu_write(x2apic_extra_bits, pnodeid << uvh_apicid.s.pnode_shift); uv_set_apicid_hibit(); uv_apic = 1; - } else if (!strcmp(oem_table_id, "UVL")) { /* only used for */ - uv_system_type = UV_LEGACY_APIC; /* very small systems */ + } else if (!strcmp(oem_table_id, "UVL")) { + /* Only used for very small systems: */ + uv_system_type = UV_LEGACY_APIC; uv_apic = 0; } else { goto badbios; } - pr_info("UV: OEM IDs %s/%s, System/HUB Types %d/%d, uv_apic %d\n", - oem_id, oem_table_id, uv_system_type, - uv_min_hub_revision_id, uv_apic); + pr_info("UV: OEM IDs %s/%s, System/HUB Types %d/%d, uv_apic %d\n", oem_id, oem_table_id, uv_system_type, uv_min_hub_revision_id, uv_apic); return uv_apic; @@ -308,16 +312,18 @@ EXPORT_SYMBOL_GPL(uv_possible_blades); unsigned long sn_rtc_cycles_per_second; EXPORT_SYMBOL(sn_rtc_cycles_per_second); -/* the following values are used for the per node hub info struct */ -static __initdata unsigned short *_node_to_pnode; -static __initdata unsigned short _min_socket, _max_socket; -static __initdata unsigned short _min_pnode, _max_pnode, _gr_table_len; -static __initdata struct uv_gam_range_entry *uv_gre_table; -static __initdata struct uv_gam_parameters *uv_gp_table; -static __initdata unsigned short *_socket_to_node; -static __initdata unsigned short *_socket_to_pnode; -static __initdata unsigned short *_pnode_to_socket; -static __initdata struct uv_gam_range_s *_gr_table; +/* The following values are used for the per node hub info struct */ +static __initdata unsigned short *_node_to_pnode; +static __initdata unsigned short _min_socket, _max_socket; +static __initdata unsigned short _min_pnode, _max_pnode, _gr_table_len; +static __initdata struct uv_gam_range_entry *uv_gre_table; +static __initdata struct uv_gam_parameters *uv_gp_table; +static __initdata unsigned short *_socket_to_node; +static __initdata unsigned short *_socket_to_pnode; +static __initdata unsigned short *_pnode_to_socket; + +static __initdata struct uv_gam_range_s *_gr_table; + #define SOCK_EMPTY ((unsigned short)~0) extern int uv_hub_info_version(void) @@ -326,7 +332,7 @@ extern int uv_hub_info_version(void) } EXPORT_SYMBOL(uv_hub_info_version); -/* Build GAM range lookup table */ +/* Build GAM range lookup table: */ static __init void build_uv_gr_table(void) { struct uv_gam_range_entry *gre = uv_gre_table; @@ -344,25 +350,24 @@ static __init void build_uv_gr_table(void) for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) { if (gre->type == UV_GAM_RANGE_TYPE_HOLE) { - if (!ram_limit) { /* mark hole between ram/non-ram */ + if (!ram_limit) 
{ + /* Mark hole between RAM/non-RAM: */ ram_limit = last_limit; last_limit = gre->limit; lsid++; continue; } last_limit = gre->limit; - pr_info("UV: extra hole in GAM RE table @%d\n", - (int)(gre - uv_gre_table)); + pr_info("UV: extra hole in GAM RE table @%d\n", (int)(gre - uv_gre_table)); continue; } if (_max_socket < gre->sockid) { - pr_err("UV: GAM table sockid(%d) too large(>%d) @%d\n", - gre->sockid, _max_socket, - (int)(gre - uv_gre_table)); + pr_err("UV: GAM table sockid(%d) too large(>%d) @%d\n", gre->sockid, _max_socket, (int)(gre - uv_gre_table)); continue; } sid = gre->sockid - _min_socket; - if (lsid < sid) { /* new range */ + if (lsid < sid) { + /* New range: */ grt = &_gr_table[indx]; grt->base = lindx; grt->nasid = gre->nasid; @@ -371,27 +376,32 @@ static __init void build_uv_gr_table(void) lindx = indx++; continue; } - if (lsid == sid && !ram_limit) { /* update range */ - if (grt->limit == last_limit) { /* .. if contiguous */ + /* Update range: */ + if (lsid == sid && !ram_limit) { + /* .. if contiguous: */ + if (grt->limit == last_limit) { grt->limit = last_limit = gre->limit; continue; } } - if (!ram_limit) { /* non-contiguous ram range */ + /* Non-contiguous RAM range: */ + if (!ram_limit) { grt++; grt->base = lindx; grt->nasid = gre->nasid; grt->limit = last_limit = gre->limit; continue; } - grt++; /* non-contiguous/non-ram */ - grt->base = grt - _gr_table; /* base is this entry */ + /* Non-contiguous/non-RAM: */ + grt++; + /* base is this entry */ + grt->base = grt - _gr_table; grt->nasid = gre->nasid; grt->limit = last_limit = gre->limit; lsid++; } - /* shorten table if possible */ + /* Shorten table if possible */ grt++; i = grt - _gr_table; if (i < _gr_table_len) { @@ -405,16 +415,15 @@ static __init void build_uv_gr_table(void) } } - /* display resultant gam range table */ + /* Display resultant GAM range table: */ for (i = 0, grt = _gr_table; i < _gr_table_len; i++, grt++) { + unsigned long start, end; int gb = grt->base; - unsigned long start = gb < 0 ? 0 : - (unsigned long)_gr_table[gb].limit << UV_GAM_RANGE_SHFT; - unsigned long end = - (unsigned long)grt->limit << UV_GAM_RANGE_SHFT; - pr_info("UV: GAM Range %2d %04x 0x%013lx-0x%013lx (%d)\n", - i, grt->nasid, start, end, gb); + start = gb < 0 ? 
0 : (unsigned long)_gr_table[gb].limit << UV_GAM_RANGE_SHFT; + end = (unsigned long)grt->limit << UV_GAM_RANGE_SHFT; + + pr_info("UV: GAM Range %2d %04x 0x%013lx-0x%013lx (%d)\n", i, grt->nasid, start, end, gb); } } @@ -425,16 +434,19 @@ static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) pnode = uv_apicid_to_pnode(phys_apicid); phys_apicid |= uv_apicid_hibits; + val = (1UL << UVH_IPI_INT_SEND_SHFT) | (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | APIC_DM_INIT; + uv_write_global_mmr64(pnode, UVH_IPI_INT, val); val = (1UL << UVH_IPI_INT_SEND_SHFT) | (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | APIC_DM_STARTUP; + uv_write_global_mmr64(pnode, UVH_IPI_INT, val); return 0; @@ -568,7 +580,7 @@ static struct apic apic_x2apic_uv_x __ro_after_init = { .apic_id_registered = uv_apic_id_registered, .irq_delivery_mode = dest_Fixed, - .irq_dest_mode = 0, /* physical */ + .irq_dest_mode = 0, /* Physical */ .target_cpus = online_target_cpus, .disable_esr = 0, @@ -629,23 +641,22 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) switch (i) { case 0: m_redirect = UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR; - m_overlay = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR; + m_overlay = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR; break; case 1: m_redirect = UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR; - m_overlay = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR; + m_overlay = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR; break; case 2: m_redirect = UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR; - m_overlay = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR; + m_overlay = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR; break; } alias.v = uv_read_local_mmr(m_overlay); if (alias.s.enable && alias.s.base == 0) { *size = (1UL << alias.s.m_alias); redirect.v = uv_read_local_mmr(m_redirect); - *base = (unsigned long)redirect.s.dest_base - << DEST_SHIFT; + *base = (unsigned long)redirect.s.dest_base << DEST_SHIFT; return; } } @@ -654,8 +665,7 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) enum map_type {map_wb, map_uc}; -static __init void map_high(char *id, unsigned long base, int pshift, - int bshift, int max_pnode, enum map_type map_type) +static __init void map_high(char *id, unsigned long base, int pshift, int bshift, int max_pnode, enum map_type map_type) { unsigned long bytes, paddr; @@ -680,16 +690,19 @@ static __init void map_gru_distributed(unsigned long c) int nid; gru.v = c; - /* only base bits 42:28 relevant in dist mode */ + + /* Only base bits 42:28 relevant in dist mode */ gru_dist_base = gru.v & 0x000007fff0000000UL; if (!gru_dist_base) { pr_info("UV: Map GRU_DIST base address NULL\n"); return; } + bytes = 1UL << UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT; gru_dist_lmask = ((1UL << uv_hub_info->m_val) - 1) & ~(bytes - 1); gru_dist_umask = ~((1UL << uv_hub_info->m_val) - 1); gru_dist_base &= gru_dist_lmask; /* Clear bits above M */ + for_each_online_node(nid) { paddr = ((u64)uv_node_to_pnode(nid) << uv_hub_info->m_val) | gru_dist_base; @@ -697,11 +710,12 @@ static __init void map_gru_distributed(unsigned long c) gru_first_node_paddr = min(paddr, gru_first_node_paddr); gru_last_node_paddr = max(paddr, gru_last_node_paddr); } + /* Save upper (63:M) bits of address only for is_GRU_range */ gru_first_node_paddr &= gru_dist_umask; gru_last_node_paddr &= gru_dist_umask; - pr_debug("UV: Map GRU_DIST base 0x%016llx 0x%016llx - 0x%016llx\n", - gru_dist_base, 
gru_first_node_paddr, gru_last_node_paddr); + + pr_debug("UV: Map GRU_DIST base 0x%016llx 0x%016llx - 0x%016llx\n", gru_dist_base, gru_first_node_paddr, gru_last_node_paddr); } static __init void map_gru_high(int max_pnode) @@ -721,6 +735,7 @@ static __init void map_gru_high(int max_pnode) map_gru_distributed(gru.v); return; } + base = (gru.v & mask) >> shift; map_high("GRU", base, shift, shift, max_pnode, map_wb); gru_start_paddr = ((u64)base << shift); @@ -774,8 +789,8 @@ static __init void map_mmioh_high_uv3(int index, int min_pnode, int max_pnode) id = mmiohs[index].id; overlay.v = uv_read_local_mmr(mmiohs[index].overlay); - pr_info("UV: %s overlay 0x%lx base:0x%x m_io:%d\n", - id, overlay.v, overlay.s3.base, overlay.s3.m_io); + + pr_info("UV: %s overlay 0x%lx base:0x%x m_io:%d\n", id, overlay.v, overlay.s3.base, overlay.s3.m_io); if (!overlay.s3.enable) { pr_info("UV: %s disabled\n", id); return; @@ -786,7 +801,8 @@ static __init void map_mmioh_high_uv3(int index, int min_pnode, int max_pnode) m_io = overlay.s3.m_io; mmr = mmiohs[index].redirect; n = UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH; - min_pnode *= 2; /* convert to NASID */ + /* Convert to NASID: */ + min_pnode *= 2; max_pnode *= 2; max_io = lnasid = fi = li = -1; @@ -795,16 +811,18 @@ static __init void map_mmioh_high_uv3(int index, int min_pnode, int max_pnode) redirect.v = uv_read_local_mmr(mmr + i * 8); nasid = redirect.s3.nasid; + /* Invalid NASID: */ if (nasid < min_pnode || max_pnode < nasid) - nasid = -1; /* invalid NASID */ + nasid = -1; if (nasid == lnasid) { li = i; - if (i != n-1) /* last entry check */ + /* Last entry check: */ + if (i != n-1) continue; } - /* check if we have a cached (or last) redirect to print */ + /* Check if we have a cached (or last) redirect to print: */ if (lnasid != -1 || (i == n-1 && nasid != -1)) { unsigned long addr1, addr2; int f, l; @@ -816,12 +834,9 @@ static __init void map_mmioh_high_uv3(int index, int min_pnode, int max_pnode) f = fi; l = li; } - addr1 = (base << shift) + - f * (1ULL << m_io); - addr2 = (base << shift) + - (l + 1) * (1ULL << m_io); - pr_info("UV: %s[%03d..%03d] NASID 0x%04x ADDR 0x%016lx - 0x%016lx\n", - id, fi, li, lnasid, addr1, addr2); + addr1 = (base << shift) + f * (1ULL << m_io); + addr2 = (base << shift) + (l + 1) * (1ULL << m_io); + pr_info("UV: %s[%03d..%03d] NASID 0x%04x ADDR 0x%016lx - 0x%016lx\n", id, fi, li, lnasid, addr1, addr2); if (max_io < l) max_io = l; } @@ -829,8 +844,7 @@ static __init void map_mmioh_high_uv3(int index, int min_pnode, int max_pnode) lnasid = nasid; } - pr_info("UV: %s base:0x%lx shift:%d M_IO:%d MAX_IO:%d\n", - id, base, shift, m_io, max_io); + pr_info("UV: %s base:0x%lx shift:%d M_IO:%d MAX_IO:%d\n", id, base, shift, m_io, max_io); if (max_io >= 0) map_high(id, base, shift, m_io, max_io, map_uc); @@ -843,36 +857,35 @@ static __init void map_mmioh_high(int min_pnode, int max_pnode) int shift, enable, m_io, n_io; if (is_uv3_hub() || is_uv4_hub()) { - /* Map both MMIOH Regions */ + /* Map both MMIOH regions: */ map_mmioh_high_uv3(0, min_pnode, max_pnode); map_mmioh_high_uv3(1, min_pnode, max_pnode); return; } if (is_uv1_hub()) { - mmr = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR; - shift = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT; - mmioh.v = uv_read_local_mmr(mmr); - enable = !!mmioh.s1.enable; - base = mmioh.s1.base; - m_io = mmioh.s1.m_io; - n_io = mmioh.s1.n_io; + mmr = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR; + shift = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT; + mmioh.v = uv_read_local_mmr(mmr); + enable = 
!!mmioh.s1.enable; + base = mmioh.s1.base; + m_io = mmioh.s1.m_io; + n_io = mmioh.s1.n_io; } else if (is_uv2_hub()) { - mmr = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR; - shift = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT; - mmioh.v = uv_read_local_mmr(mmr); - enable = !!mmioh.s2.enable; - base = mmioh.s2.base; - m_io = mmioh.s2.m_io; - n_io = mmioh.s2.n_io; - } else + mmr = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR; + shift = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT; + mmioh.v = uv_read_local_mmr(mmr); + enable = !!mmioh.s2.enable; + base = mmioh.s2.base; + m_io = mmioh.s2.m_io; + n_io = mmioh.s2.n_io; + } else { return; + } if (enable) { max_pnode &= (1 << n_io) - 1; - pr_info( - "UV: base:0x%lx shift:%d N_IO:%d M_IO:%d max_pnode:0x%x\n", - base, shift, m_io, n_io, max_pnode); + pr_info("UV: base:0x%lx shift:%d N_IO:%d M_IO:%d max_pnode:0x%x\n", base, shift, m_io, n_io, max_pnode); map_high("MMIOH", base, shift, m_io, max_pnode, map_uc); } else { pr_info("UV: MMIOH disabled\n"); @@ -890,16 +903,16 @@ static __init void uv_rtc_init(void) long status; u64 ticks_per_sec; - status = uv_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK, - &ticks_per_sec); + status = uv_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK, &ticks_per_sec); + if (status != BIOS_STATUS_SUCCESS || ticks_per_sec < 100000) { - printk(KERN_WARNING - "unable to determine platform RTC clock frequency, " - "guessing.\n"); - /* BIOS gives wrong value for clock freq. so guess */ + pr_warn("UV: unable to determine platform RTC clock frequency, guessing.\n"); + + /* BIOS gives wrong value for clock frequency, so guess: */ sn_rtc_cycles_per_second = 1000000000000UL / 30000UL; - } else + } else { sn_rtc_cycles_per_second = ticks_per_sec; + } } /* @@ -910,19 +923,19 @@ static void uv_heartbeat(unsigned long ignored) struct timer_list *timer = &uv_scir_info->timer; unsigned char bits = uv_scir_info->state; - /* flip heartbeat bit */ + /* Flip heartbeat bit: */ bits ^= SCIR_CPU_HEARTBEAT; - /* is this cpu idle? */ + /* Is this CPU idle? 
*/ if (idle_cpu(raw_smp_processor_id())) bits &= ~SCIR_CPU_ACTIVITY; else bits |= SCIR_CPU_ACTIVITY; - /* update system controller interface reg */ + /* Update system controller interface reg: */ uv_set_scir_bits(bits); - /* enable next timer period */ + /* Enable next timer period: */ mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL); } @@ -937,7 +950,7 @@ static int uv_heartbeat_enable(unsigned int cpu) add_timer_on(timer, cpu); uv_cpu_scir_info(cpu)->enabled = 1; - /* also ensure that boot cpu is enabled */ + /* Also ensure that boot CPU is enabled: */ cpu = 0; } return 0; @@ -970,9 +983,11 @@ static __init int uv_init_heartbeat(void) { int cpu; - if (is_uv_system()) + if (is_uv_system()) { for_each_online_cpu(cpu) uv_heartbeat_enable(cpu); + } + return 0; } @@ -981,14 +996,10 @@ late_initcall(uv_init_heartbeat); #endif /* !CONFIG_HOTPLUG_CPU */ /* Direct Legacy VGA I/O traffic to designated IOH */ -int uv_set_vga_state(struct pci_dev *pdev, bool decode, - unsigned int command_bits, u32 flags) +int uv_set_vga_state(struct pci_dev *pdev, bool decode, unsigned int command_bits, u32 flags) { int domain, bus, rc; - PR_DEVEL("devfn %x decode %d cmd %x flags %d\n", - pdev->devfn, decode, command_bits, flags); - if (!(flags & PCI_VGA_STATE_CHANGE_BRIDGE)) return 0; @@ -999,13 +1010,12 @@ int uv_set_vga_state(struct pci_dev *pdev, bool decode, bus = pdev->bus->number; rc = uv_bios_set_legacy_vga_target(decode, domain, bus); - PR_DEVEL("vga decode %d %x:%x, rc: %d\n", decode, domain, bus, rc); return rc; } /* - * Called on each cpu to initialize the per_cpu UV data area. + * Called on each CPU to initialize the per_cpu UV data area. * FIXME: hotplug not supported yet */ void uv_cpu_init(void) @@ -1032,92 +1042,79 @@ static void get_mn(struct mn *mnp) union uvh_rh_gam_config_mmr_u m_n_config; union uv3h_gr0_gam_gr_config_u m_gr_config; - m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR); - mnp->n_val = m_n_config.s.n_skt; + /* Make sure the whole structure is well initialized: */ + memset(mnp, 0, sizeof(*mnp)); + + m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR); + mnp->n_val = m_n_config.s.n_skt; + if (is_uv4_hub()) { - mnp->m_val = 0; - mnp->n_lshift = 0; + mnp->m_val = 0; + mnp->n_lshift = 0; } else if (is_uv3_hub()) { - mnp->m_val = m_n_config.s3.m_skt; - m_gr_config.v = uv_read_local_mmr(UV3H_GR0_GAM_GR_CONFIG); - mnp->n_lshift = m_gr_config.s3.m_skt; + mnp->m_val = m_n_config.s3.m_skt; + m_gr_config.v = uv_read_local_mmr(UV3H_GR0_GAM_GR_CONFIG); + mnp->n_lshift = m_gr_config.s3.m_skt; } else if (is_uv2_hub()) { - mnp->m_val = m_n_config.s2.m_skt; - mnp->n_lshift = mnp->m_val == 40 ? 40 : 39; + mnp->m_val = m_n_config.s2.m_skt; + mnp->n_lshift = mnp->m_val == 40 ? 40 : 39; } else if (is_uv1_hub()) { - mnp->m_val = m_n_config.s1.m_skt; - mnp->n_lshift = mnp->m_val; + mnp->m_val = m_n_config.s1.m_skt; + mnp->n_lshift = mnp->m_val; } mnp->m_shift = mnp->m_val ? 64 - mnp->m_val : 0; } -void __init uv_init_hub_info(struct uv_hub_info_s *hub_info) +void __init uv_init_hub_info(struct uv_hub_info_s *hi) { - struct mn mn = {0}; /* avoid unitialized warnings */ union uvh_node_id_u node_id; + struct mn mn; get_mn(&mn); - hub_info->m_val = mn.m_val; - hub_info->n_val = mn.n_val; - hub_info->m_shift = mn.m_shift; - hub_info->n_lshift = mn.n_lshift ? 
mn.n_lshift : 0; - - hub_info->hub_revision = uv_hub_info->hub_revision; - hub_info->pnode_mask = uv_cpuid.pnode_mask; - hub_info->min_pnode = _min_pnode; - hub_info->min_socket = _min_socket; - hub_info->pnode_to_socket = _pnode_to_socket; - hub_info->socket_to_node = _socket_to_node; - hub_info->socket_to_pnode = _socket_to_pnode; - hub_info->gr_table_len = _gr_table_len; - hub_info->gr_table = _gr_table; - hub_info->gpa_mask = mn.m_val ? + hi->gpa_mask = mn.m_val ? (1UL << (mn.m_val + mn.n_val)) - 1 : (1UL << uv_cpuid.gpa_shift) - 1; - node_id.v = uv_read_local_mmr(UVH_NODE_ID); - uv_cpuid.gnode_shift = max_t(unsigned int, - uv_cpuid.gnode_shift, mn.n_val); - hub_info->gnode_extra = - (node_id.s.node_id & ~((1 << uv_cpuid.gnode_shift) - 1)) >> 1; - - hub_info->gnode_upper = - ((unsigned long)hub_info->gnode_extra << mn.m_val); + hi->m_val = mn.m_val; + hi->n_val = mn.n_val; + hi->m_shift = mn.m_shift; + hi->n_lshift = mn.n_lshift ? mn.n_lshift : 0; + hi->hub_revision = uv_hub_info->hub_revision; + hi->pnode_mask = uv_cpuid.pnode_mask; + hi->min_pnode = _min_pnode; + hi->min_socket = _min_socket; + hi->pnode_to_socket = _pnode_to_socket; + hi->socket_to_node = _socket_to_node; + hi->socket_to_pnode = _socket_to_pnode; + hi->gr_table_len = _gr_table_len; + hi->gr_table = _gr_table; + + node_id.v = uv_read_local_mmr(UVH_NODE_ID); + uv_cpuid.gnode_shift = max_t(unsigned int, uv_cpuid.gnode_shift, mn.n_val); + hi->gnode_extra = (node_id.s.node_id & ~((1 << uv_cpuid.gnode_shift) - 1)) >> 1; + hi->gnode_upper = (unsigned long)hi->gnode_extra << mn.m_val; if (uv_gp_table) { - hub_info->global_mmr_base = uv_gp_table->mmr_base; - hub_info->global_mmr_shift = uv_gp_table->mmr_shift; - hub_info->global_gru_base = uv_gp_table->gru_base; - hub_info->global_gru_shift = uv_gp_table->gru_shift; - hub_info->gpa_shift = uv_gp_table->gpa_shift; - hub_info->gpa_mask = (1UL << hub_info->gpa_shift) - 1; + hi->global_mmr_base = uv_gp_table->mmr_base; + hi->global_mmr_shift = uv_gp_table->mmr_shift; + hi->global_gru_base = uv_gp_table->gru_base; + hi->global_gru_shift = uv_gp_table->gru_shift; + hi->gpa_shift = uv_gp_table->gpa_shift; + hi->gpa_mask = (1UL << hi->gpa_shift) - 1; } else { - hub_info->global_mmr_base = - uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & - ~UV_MMR_ENABLE; - hub_info->global_mmr_shift = _UV_GLOBAL_MMR64_PNODE_SHIFT; + hi->global_mmr_base = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & ~UV_MMR_ENABLE; + hi->global_mmr_shift = _UV_GLOBAL_MMR64_PNODE_SHIFT; } - get_lowmem_redirect( - &hub_info->lowmem_remap_base, &hub_info->lowmem_remap_top); - - hub_info->apic_pnode_shift = uv_cpuid.socketid_shift; + get_lowmem_redirect(&hi->lowmem_remap_base, &hi->lowmem_remap_top); - /* show system specific info */ - pr_info("UV: N:%d M:%d m_shift:%d n_lshift:%d\n", - hub_info->n_val, hub_info->m_val, - hub_info->m_shift, hub_info->n_lshift); + hi->apic_pnode_shift = uv_cpuid.socketid_shift; - pr_info("UV: gpa_mask/shift:0x%lx/%d pnode_mask:0x%x apic_pns:%d\n", - hub_info->gpa_mask, hub_info->gpa_shift, - hub_info->pnode_mask, hub_info->apic_pnode_shift); - - pr_info("UV: mmr_base/shift:0x%lx/%ld gru_base/shift:0x%lx/%ld\n", - hub_info->global_mmr_base, hub_info->global_mmr_shift, - hub_info->global_gru_base, hub_info->global_gru_shift); - - pr_info("UV: gnode_upper:0x%lx gnode_extra:0x%x\n", - hub_info->gnode_upper, hub_info->gnode_extra); + /* Show system specific info: */ + pr_info("UV: N:%d M:%d m_shift:%d n_lshift:%d\n", hi->n_val, hi->m_val, hi->m_shift, hi->n_lshift); + 
pr_info("UV: gpa_mask/shift:0x%lx/%d pnode_mask:0x%x apic_pns:%d\n", hi->gpa_mask, hi->gpa_shift, hi->pnode_mask, hi->apic_pnode_shift); + pr_info("UV: mmr_base/shift:0x%lx/%ld gru_base/shift:0x%lx/%ld\n", hi->global_mmr_base, hi->global_mmr_shift, hi->global_gru_base, hi->global_gru_shift); + pr_info("UV: gnode_upper:0x%lx gnode_extra:0x%x\n", hi->gnode_upper, hi->gnode_extra); } static void __init decode_gam_params(unsigned long ptr) @@ -1143,12 +1140,9 @@ static void __init decode_gam_rng_tbl(unsigned long ptr) for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) { if (!index) { pr_info("UV: GAM Range Table...\n"); - pr_info("UV: # %20s %14s %5s %4s %5s %3s %2s\n", - "Range", "", "Size", "Type", "NASID", - "SID", "PN"); + pr_info("UV: # %20s %14s %5s %4s %5s %3s %2s\n", "Range", "", "Size", "Type", "NASID", "SID", "PN"); } - pr_info( - "UV: %2d: 0x%014lx-0x%014lx %5luG %3d %04x %02x %02x\n", + pr_info("UV: %2d: 0x%014lx-0x%014lx %5luG %3d %04x %02x %02x\n", index++, (unsigned long)lgre << UV_GAM_RANGE_SHFT, (unsigned long)gre->limit << UV_GAM_RANGE_SHFT, @@ -1166,14 +1160,13 @@ static void __init decode_gam_rng_tbl(unsigned long ptr) if (pnode_max < gre->pnode) pnode_max = gre->pnode; } - _min_socket = sock_min; - _max_socket = sock_max; - _min_pnode = pnode_min; - _max_pnode = pnode_max; - _gr_table_len = index; - pr_info( - "UV: GRT: %d entries, sockets(min:%x,max:%x) pnodes(min:%x,max:%x)\n", - index, _min_socket, _max_socket, _min_pnode, _max_pnode); + _min_socket = sock_min; + _max_socket = sock_max; + _min_pnode = pnode_min; + _max_pnode = pnode_max; + _gr_table_len = index; + + pr_info("UV: GRT: %d entries, sockets(min:%x,max:%x) pnodes(min:%x,max:%x)\n", index, _min_socket, _max_socket, _min_pnode, _max_pnode); } static int __init decode_uv_systab(void) @@ -1188,12 +1181,10 @@ static int __init decode_uv_systab(void) if ((!st) || (st->revision < UV_SYSTAB_VERSION_UV4_LATEST)) { int rev = st ? st->revision : 0; - pr_err( - "UV: BIOS UVsystab version(%x) mismatch, expecting(%x)\n", - rev, UV_SYSTAB_VERSION_UV4_LATEST); - pr_err( - "UV: Cannot support UV operations, switching to generic PC\n"); + pr_err("UV: BIOS UVsystab version(%x) mismatch, expecting(%x)\n", rev, UV_SYSTAB_VERSION_UV4_LATEST); + pr_err("UV: Cannot support UV operations, switching to generic PC\n"); uv_system_type = UV_NONE; + return -EINVAL; } @@ -1219,7 +1210,7 @@ static int __init decode_uv_systab(void) } /* - * Setup physical blade translations from UVH_NODE_PRESENT_TABLE + * Set up physical blade translations from UVH_NODE_PRESENT_TABLE * .. NB: UVH_NODE_PRESENT_TABLE is going away, * .. 
being replaced by GAM Range Table */ @@ -1255,14 +1246,13 @@ static void __init build_socket_tables(void) if (!gre) { if (is_uv1_hub() || is_uv2_hub() || is_uv3_hub()) { pr_info("UV: No UVsystab socket table, ignoring\n"); - return; /* not required */ + return; } - pr_crit( - "UV: Error: UVsystab address translations not available!\n"); + pr_crit("UV: Error: UVsystab address translations not available!\n"); BUG(); } - /* build socket id -> node id, pnode */ + /* Build socket id -> node id, pnode */ num = maxsock - minsock + 1; bytes = num * sizeof(_socket_to_node[0]); _socket_to_node = kmalloc(bytes, GFP_KERNEL); @@ -1279,27 +1269,27 @@ static void __init build_socket_tables(void) for (i = 0; i < nump; i++) _pnode_to_socket[i] = SOCK_EMPTY; - /* fill in pnode/node/addr conversion list values */ + /* Fill in pnode/node/addr conversion list values: */ pr_info("UV: GAM Building socket/pnode conversion tables\n"); for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) { if (gre->type == UV_GAM_RANGE_TYPE_HOLE) continue; i = gre->sockid - minsock; + /* Duplicate: */ if (_socket_to_pnode[i] != SOCK_EMPTY) - continue; /* duplicate */ + continue; _socket_to_pnode[i] = gre->pnode; i = gre->pnode - minpnode; _pnode_to_socket[i] = gre->sockid; - pr_info( - "UV: sid:%02x type:%d nasid:%04x pn:%02x pn2s:%2x\n", + pr_info("UV: sid:%02x type:%d nasid:%04x pn:%02x pn2s:%2x\n", gre->sockid, gre->type, gre->nasid, _socket_to_pnode[gre->sockid - minsock], _pnode_to_socket[gre->pnode - minpnode]); } - /* Set socket -> node values */ + /* Set socket -> node values: */ lnid = -1; for_each_present_cpu(cpu) { int nid = cpu_to_node(cpu); @@ -1315,7 +1305,7 @@ static void __init build_socket_tables(void) sockid, apicid, nid); } - /* Setup physical blade to pnode translation from GAM Range Table */ + /* Set up physical blade to pnode translation from GAM Range Table: */ bytes = num_possible_nodes() * sizeof(_node_to_pnode[0]); _node_to_pnode = kmalloc(bytes, GFP_KERNEL); BUG_ON(!_node_to_pnode); @@ -1325,8 +1315,7 @@ static void __init build_socket_tables(void) for (sockid = minsock; sockid <= maxsock; sockid++) { if (lnid == _socket_to_node[sockid - minsock]) { - _node_to_pnode[lnid] = - _socket_to_pnode[sockid - minsock]; + _node_to_pnode[lnid] = _socket_to_pnode[sockid - minsock]; break; } } @@ -1343,8 +1332,7 @@ static void __init build_socket_tables(void) pr_info("UV: Checking socket->node/pnode for identity maps\n"); if (minsock == 0) { for (i = 0; i < num; i++) - if (_socket_to_node[i] == SOCK_EMPTY || - i != _socket_to_node[i]) + if (_socket_to_node[i] == SOCK_EMPTY || i != _socket_to_node[i]) break; if (i >= num) { kfree(_socket_to_node); @@ -1383,9 +1371,13 @@ void __init uv_system_init(void) map_low_mmrs(); - uv_bios_init(); /* get uv_systab for decoding */ + /* Get uv_systab for decoding: */ + uv_bios_init(); + + /* If there's an UVsystab problem then abort UV init: */ if (decode_uv_systab() < 0) - return; /* UVsystab problem, abort UV init */ + return; + build_socket_tables(); build_uv_gr_table(); uv_init_hub_info(&hub_info); @@ -1393,14 +1385,10 @@ void __init uv_system_init(void) if (!_node_to_pnode) boot_init_possible_blades(&hub_info); - /* uv_num_possible_blades() is really the hub count */ - pr_info("UV: Found %d hubs, %d nodes, %d cpus\n", - uv_num_possible_blades(), - num_possible_nodes(), - num_possible_cpus()); + /* uv_num_possible_blades() is really the hub count: */ + pr_info("UV: Found %d hubs, %d nodes, %d CPUs\n", uv_num_possible_blades(), num_possible_nodes(), num_possible_cpus()); - 
uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, &sn_coherency_id, - &sn_region_size, &system_serial_number); + uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, &sn_coherency_id, &sn_region_size, &system_serial_number); hub_info.coherency_domain_number = sn_coherency_id; uv_rtc_init(); @@ -1413,33 +1401,31 @@ void __init uv_system_init(void) struct uv_hub_info_s *new_hub; if (__uv_hub_info_list[nodeid]) { - pr_err("UV: Node %d UV HUB already initialized!?\n", - nodeid); + pr_err("UV: Node %d UV HUB already initialized!?\n", nodeid); BUG(); } /* Allocate new per hub info list */ - new_hub = (nodeid == 0) ? - &uv_hub_info_node0 : - kzalloc_node(bytes, GFP_KERNEL, nodeid); + new_hub = (nodeid == 0) ? &uv_hub_info_node0 : kzalloc_node(bytes, GFP_KERNEL, nodeid); BUG_ON(!new_hub); __uv_hub_info_list[nodeid] = new_hub; new_hub = uv_hub_info_list(nodeid); BUG_ON(!new_hub); *new_hub = hub_info; - /* Use information from GAM table if available */ + /* Use information from GAM table if available: */ if (_node_to_pnode) new_hub->pnode = _node_to_pnode[nodeid]; - else /* Fill in during cpu loop */ + else /* Or fill in during CPU loop: */ new_hub->pnode = 0xffff; + new_hub->numa_blade_id = uv_node_to_blade_id(nodeid); new_hub->memory_nid = -1; new_hub->nr_possible_cpus = 0; new_hub->nr_online_cpus = 0; } - /* Initialize per cpu info */ + /* Initialize per CPU info: */ for_each_possible_cpu(cpu) { int apicid = per_cpu(x86_cpu_to_apicid, cpu); int numa_node_id; @@ -1450,22 +1436,24 @@ void __init uv_system_init(void) pnode = uv_apicid_to_pnode(apicid); uv_cpu_info_per(cpu)->p_uv_hub_info = uv_hub_info_list(nodeid); - uv_cpu_info_per(cpu)->blade_cpu_id = - uv_cpu_hub_info(cpu)->nr_possible_cpus++; + uv_cpu_info_per(cpu)->blade_cpu_id = uv_cpu_hub_info(cpu)->nr_possible_cpus++; if (uv_cpu_hub_info(cpu)->memory_nid == -1) uv_cpu_hub_info(cpu)->memory_nid = cpu_to_node(cpu); - if (nodeid != numa_node_id && /* init memoryless node */ + + /* Init memoryless node: */ + if (nodeid != numa_node_id && uv_hub_info_list(numa_node_id)->pnode == 0xffff) uv_hub_info_list(numa_node_id)->pnode = pnode; else if (uv_cpu_hub_info(cpu)->pnode == 0xffff) uv_cpu_hub_info(cpu)->pnode = pnode; + uv_cpu_scir_info(cpu)->offset = uv_scir_offset(apicid); } for_each_node(nodeid) { unsigned short pnode = uv_hub_info_list(nodeid)->pnode; - /* Add pnode info for pre-GAM list nodes without cpus */ + /* Add pnode info for pre-GAM list nodes without CPUs: */ if (pnode == 0xffff) { unsigned long paddr; @@ -1491,12 +1479,12 @@ void __init uv_system_init(void) uv_scir_register_cpu_notifier(); proc_mkdir("sgi_uv", NULL); - /* register Legacy VGA I/O redirection handler */ + /* Register Legacy VGA I/O redirection handler: */ pci_register_set_vga_state(uv_set_vga_state); /* * For a kdump kernel the reset must be BOOT_ACPI, not BOOT_EFI, as - * EFI is not enabled in the kdump kernel. + * EFI is not enabled in the kdump kernel: */ if (is_kdump_kernel()) reboot_type = BOOT_ACPI; -- cgit From 74862b03b46a852662c1a30c859b985261ff5d5c Mon Sep 17 00:00:00 2001 From: "travis@sgi.com" Date: Wed, 25 Jan 2017 10:35:18 -0600 Subject: x86/platform/UV: Add Support for UV4 Hubless systems Add recognition and support for UV4 hubless systems. 
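Condensed from the patch below, the detection keys off the ACPI OEM ID, and the new uv_system_init() entry point only runs the hub setup when a hub is actually present; a paraphrased sketch:

	if (!strncmp(oem_id, "NSGI", 4))
		uv_hubless_system = true;	/* UV4 without a hub */
	...
	if (likely(!is_uv_system() && !is_uv_hubless()))
		return;				/* not a UV platform at all */

	if (is_uv_system())
		uv_system_init_hub();		/* hubbed systems keep the full setup */
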
Signed-off-by: Mike Travis Reviewed-by: Russ Anderson Acked-by: Thomas Gleixner Acked-by: Dimitri Sivanich Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170125163517.398537358@asylum.americas.sgi.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv.h | 2 ++ arch/x86/kernel/apic/x2apic_uv_x.c | 30 ++++++++++++++++++++++++++++-- 2 files changed, 30 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h index 062921ef34e9..6686820feae9 100644 --- a/arch/x86/include/asm/uv/uv.h +++ b/arch/x86/include/asm/uv/uv.h @@ -10,6 +10,7 @@ struct mm_struct; extern enum uv_system_type get_uv_system_type(void); extern int is_uv_system(void); +extern int is_uv_hubless(void); extern void uv_cpu_init(void); extern void uv_nmi_init(void); extern void uv_system_init(void); @@ -23,6 +24,7 @@ extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, static inline enum uv_system_type get_uv_system_type(void) { return UV_NONE; } static inline int is_uv_system(void) { return 0; } +static inline int is_uv_hubless(void) { return 0; } static inline void uv_cpu_init(void) { } static inline void uv_system_init(void) { } static inline const struct cpumask * diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 656994ac4677..d02cc7e65e4d 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -42,6 +42,7 @@ DEFINE_PER_CPU(int, x2apic_extra_bits); static enum uv_system_type uv_system_type; +static bool uv_hubless_system; static u64 gru_start_paddr, gru_end_paddr; static u64 gru_dist_base, gru_first_node_paddr = -1LL, gru_last_node_paddr; static u64 gru_dist_lmask, gru_dist_umask; @@ -225,8 +226,14 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) int pnodeid; int uv_apic; - if (strncmp(oem_id, "SGI", 3) != 0) + if (strncmp(oem_id, "SGI", 3) != 0) { + if (strncmp(oem_id, "NSGI", 4) == 0) { + uv_hubless_system = true; + pr_info("UV: OEM IDs %s/%s, HUBLESS\n", + oem_id, oem_table_id); + } return 0; + } if (numa_off) { pr_err("UV: NUMA is off, disabling UV support\n"); @@ -300,6 +307,12 @@ int is_uv_system(void) } EXPORT_SYMBOL_GPL(is_uv_system); +int is_uv_hubless(void) +{ + return uv_hubless_system; +} +EXPORT_SYMBOL_GPL(is_uv_hubless); + void **__uv_hub_info_list; EXPORT_SYMBOL_GPL(__uv_hub_info_list); @@ -1353,7 +1366,7 @@ static void __init build_socket_tables(void) } } -void __init uv_system_init(void) +static void __init uv_system_init_hub(void) { struct uv_hub_info_s hub_info = {0}; int bytes, cpu, nodeid; @@ -1490,4 +1503,17 @@ void __init uv_system_init(void) reboot_type = BOOT_ACPI; } +/* + * There is a small amount of UV specific code needed to initialize a + * UV system that does not have a "UV HUB" (referred to as "hubless"). + */ +void __init uv_system_init(void) +{ + if (likely(!is_uv_system() && !is_uv_hubless())) + return; + + if (is_uv_system()) + uv_system_init_hub(); +} + apic_driver(apic_x2apic_uv_x); -- cgit From abdf1df6bc0416ec19b841e92b497ca55b23454c Mon Sep 17 00:00:00 2001 From: "travis@sgi.com" Date: Wed, 25 Jan 2017 10:35:19 -0600 Subject: x86/platform/UV: Add Support for UV4 Hubless NMIs Merge new UV Hubless NMI support into existing UV NMI handler. 
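The core of the merge is a small dispatch, condensed here from the patch below: hubbed nodes keep the MMR check, while on hubless systems only the node that owns the PCH polls the GPP_D_0 status bit and the other nodes wait:

	static int uv_test_nmi(struct uv_hub_nmi_s *hub_nmi)
	{
		if (hub_nmi->hub_present)
			return uv_nmi_test_mmr(hub_nmi);	/* classic hub MMR check */

		if (hub_nmi->pch_owner)
			return uv_nmi_test_hubless(hub_nmi);	/* PCH GPP_D_0 status check */

		return -1;					/* another node owns the PCH */
	}
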
Signed-off-by: Mike Travis Reviewed-by: Russ Anderson Acked-by: Thomas Gleixner Acked-by: Dimitri Sivanich Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170125163517.585269837@asylum.americas.sgi.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_hub.h | 3 + arch/x86/kernel/apic/x2apic_uv_x.c | 2 + arch/x86/platform/uv/uv_nmi.c | 193 ++++++++++++++++++++++++++++++++----- 3 files changed, 176 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index 097b80c989c4..72e8300b1e8a 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -772,6 +772,7 @@ static inline int uv_num_possible_blades(void) /* Per Hub NMI support */ extern void uv_nmi_setup(void); +extern void uv_nmi_setup_hubless(void); /* BMC sets a bit this MMR non-zero before sending an NMI */ #define UVH_NMI_MMR UVH_SCRATCH5 @@ -799,6 +800,8 @@ struct uv_hub_nmi_s { atomic_t read_mmr_count; /* count of MMR reads */ atomic_t nmi_count; /* count of true UV NMIs */ unsigned long nmi_value; /* last value read from NMI MMR */ + bool hub_present; /* false means UV hubless system */ + bool pch_owner; /* indicates this hub owns PCH */ }; struct uv_cpu_nmi_s { diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index d02cc7e65e4d..e9f8f8cdd570 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -1514,6 +1514,8 @@ void __init uv_system_init(void) if (is_uv_system()) uv_system_init_hub(); + else + uv_nmi_setup_hubless(); } apic_driver(apic_x2apic_uv_x); diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c index 8410e7d0a5b5..df7b092941fe 100644 --- a/arch/x86/platform/uv/uv_nmi.c +++ b/arch/x86/platform/uv/uv_nmi.c @@ -67,6 +67,18 @@ static struct uv_hub_nmi_s **uv_hub_nmi_list; DEFINE_PER_CPU(struct uv_cpu_nmi_s, uv_cpu_nmi); EXPORT_PER_CPU_SYMBOL_GPL(uv_cpu_nmi); +/* UV hubless values */ +#define NMI_CONTROL_PORT 0x70 +#define NMI_DUMMY_PORT 0x71 +#define GPI_NMI_STS_GPP_D_0 0x164 +#define GPI_NMI_ENA_GPP_D_0 0x174 +#define STS_GPP_D_0_MASK 0x1 +#define PAD_CFG_DW0_GPP_D_0 0x4c0 +#define GPIROUTNMI (1ul << 17) +#define PCH_PCR_GPIO_1_BASE 0xfdae0000ul +#define PCH_PCR_GPIO_ADDRESS(offset) (int *)((u64)(pch_base) | (u64)(offset)) + +static u64 *pch_base; static unsigned long nmi_mmr; static unsigned long nmi_mmr_clear; static unsigned long nmi_mmr_pending; @@ -144,6 +156,19 @@ module_param_named(wait_count, uv_nmi_wait_count, int, 0644); static int uv_nmi_retry_count = 500; module_param_named(retry_count, uv_nmi_retry_count, int, 0644); +static bool uv_pch_intr_enable = true; +static bool uv_pch_intr_now_enabled; +module_param_named(pch_intr_enable, uv_pch_intr_enable, bool, 0644); + +static int uv_nmi_debug; +module_param_named(debug, uv_nmi_debug, int, 0644); + +#define nmi_debug(fmt, ...) 
\ + do { \ + if (uv_nmi_debug) \ + pr_info(fmt, ##__VA_ARGS__); \ + } while (0) + /* * Valid NMI Actions: * "dump" - dump process stack for each cpu @@ -191,6 +216,77 @@ static inline void uv_local_mmr_clear_nmi(void) uv_write_local_mmr(nmi_mmr_clear, nmi_mmr_pending); } +/* + * UV hubless NMI handler functions + */ +static inline void uv_reassert_nmi(void) +{ + /* (from arch/x86/include/asm/mach_traps.h) */ + outb(0x8f, NMI_CONTROL_PORT); + inb(NMI_DUMMY_PORT); /* dummy read */ + outb(0x0f, NMI_CONTROL_PORT); + inb(NMI_DUMMY_PORT); /* dummy read */ +} + +static void uv_init_hubless_pch_io(int offset, int mask, int data) +{ + int *addr = PCH_PCR_GPIO_ADDRESS(offset); + int readd = readl(addr); + + if (mask) { /* OR in new data */ + int writed = (readd & ~mask) | data; + + nmi_debug("UV:PCH: %p = %x & %x | %x (%x)\n", + addr, readd, ~mask, data, writed); + writel(writed, addr); + } else if (readd & data) { /* clear status bit */ + nmi_debug("UV:PCH: %p = %x\n", addr, data); + writel(data, addr); + } + + (void)readl(addr); /* flush write data */ +} + +static void uv_nmi_setup_hubless_intr(void) +{ + uv_pch_intr_now_enabled = uv_pch_intr_enable; + + uv_init_hubless_pch_io( + PAD_CFG_DW0_GPP_D_0, GPIROUTNMI, + uv_pch_intr_now_enabled ? GPIROUTNMI : 0); + + nmi_debug("UV:NMI: GPP_D_0 interrupt %s\n", + uv_pch_intr_now_enabled ? "enabled" : "disabled"); +} + +static int uv_nmi_test_hubless(struct uv_hub_nmi_s *hub_nmi) +{ + int *pstat = PCH_PCR_GPIO_ADDRESS(GPI_NMI_STS_GPP_D_0); + int status = *pstat; + + hub_nmi->nmi_value = status; + atomic_inc(&hub_nmi->read_mmr_count); + + if (!(status & STS_GPP_D_0_MASK)) /* Not a UV external NMI */ + return 0; + + *pstat = STS_GPP_D_0_MASK; /* Is a UV NMI: clear GPP_D_0 status */ + (void)*pstat; /* flush write */ + + return 1; +} + +static int uv_test_nmi(struct uv_hub_nmi_s *hub_nmi) +{ + if (hub_nmi->hub_present) + return uv_nmi_test_mmr(hub_nmi); + + if (hub_nmi->pch_owner) /* Only PCH owner can check status */ + return uv_nmi_test_hubless(hub_nmi); + + return -1; +} + /* * If first cpu in on this hub, set hub_nmi "in_nmi" and "owner" values and * return true. If first cpu in on the system, set global "in_nmi" flag. 
@@ -214,6 +310,7 @@ static int uv_check_nmi(struct uv_hub_nmi_s *hub_nmi) { int cpu = smp_processor_id(); int nmi = 0; + int nmi_detected = 0; local64_inc(&uv_nmi_count); this_cpu_inc(uv_cpu_nmi.queries); @@ -224,20 +321,26 @@ static int uv_check_nmi(struct uv_hub_nmi_s *hub_nmi) break; if (raw_spin_trylock(&hub_nmi->nmi_lock)) { + nmi_detected = uv_test_nmi(hub_nmi); - /* check hub MMR NMI flag */ - if (uv_nmi_test_mmr(hub_nmi)) { + /* check flag for UV external NMI */ + if (nmi_detected > 0) { uv_set_in_nmi(cpu, hub_nmi); nmi = 1; break; } - /* MMR NMI flag is clear */ + /* A non-PCH node in a hubless system waits for NMI */ + else if (nmi_detected < 0) + goto slave_wait; + + /* MMR/PCH NMI flag is clear */ raw_spin_unlock(&hub_nmi->nmi_lock); } else { - /* wait a moment for the hub nmi locker to set flag */ - cpu_relax(); + + /* Wait a moment for the HUB NMI locker to set flag */ +slave_wait: cpu_relax(); udelay(uv_nmi_slave_delay); /* re-check hub in_nmi flag */ @@ -246,13 +349,20 @@ static int uv_check_nmi(struct uv_hub_nmi_s *hub_nmi) break; } - /* check if this BMC missed setting the MMR NMI flag */ + /* + * Check if this BMC missed setting the MMR NMI flag (or) + * UV hubless system where only PCH owner can check flag + */ if (!nmi) { nmi = atomic_read(&uv_in_nmi); if (nmi) uv_set_in_nmi(cpu, hub_nmi); } + /* If we're holding the hub lock, release it now */ + if (nmi_detected < 0) + raw_spin_unlock(&hub_nmi->nmi_lock); + } while (0); if (!nmi) @@ -269,7 +379,10 @@ static inline void uv_clear_nmi(int cpu) if (cpu == atomic_read(&hub_nmi->cpu_owner)) { atomic_set(&hub_nmi->cpu_owner, -1); atomic_set(&hub_nmi->in_nmi, 0); - uv_local_mmr_clear_nmi(); + if (hub_nmi->hub_present) + uv_local_mmr_clear_nmi(); + else + uv_reassert_nmi(); raw_spin_unlock(&hub_nmi->nmi_lock); } } @@ -297,11 +410,12 @@ static void uv_nmi_cleanup_mask(void) } } -/* Loop waiting as cpus enter nmi handler */ +/* Loop waiting as cpus enter NMI handler */ static int uv_nmi_wait_cpus(int first) { int i, j, k, n = num_online_cpus(); int last_k = 0, waiting = 0; + int cpu = smp_processor_id(); if (first) { cpumask_copy(uv_nmi_cpu_mask, cpu_online_mask); @@ -310,6 +424,12 @@ static int uv_nmi_wait_cpus(int first) k = n - cpumask_weight(uv_nmi_cpu_mask); } + /* PCH NMI causes only one cpu to respond */ + if (first && uv_pch_intr_now_enabled) { + cpumask_clear_cpu(cpu, uv_nmi_cpu_mask); + return n - k - 1; + } + udelay(uv_nmi_initial_delay); for (i = 0; i < uv_nmi_retry_count; i++) { int loop_delay = uv_nmi_loop_delay; @@ -358,7 +478,7 @@ static void uv_nmi_wait(int master) break; /* if not all made it in, send IPI NMI to them */ - pr_alert("UV: Sending NMI IPI to %d non-responding CPUs: %*pbl\n", + pr_alert("UV: Sending NMI IPI to %d CPUs: %*pbl\n", cpumask_weight(uv_nmi_cpu_mask), cpumask_pr_args(uv_nmi_cpu_mask)); @@ -538,7 +658,7 @@ static inline int uv_nmi_kdb_reason(void) #else /* !CONFIG_KGDB_KDB */ static inline int uv_nmi_kdb_reason(void) { - /* Insure user is expecting to attach gdb remote */ + /* Ensure user is expecting to attach gdb remote */ if (uv_nmi_action_is("kgdb")) return 0; @@ -626,15 +746,18 @@ int uv_handle_nmi(unsigned int reason, struct pt_regs *regs) /* Pause as all cpus enter the NMI handler */ uv_nmi_wait(master); - /* Dump state of each cpu */ - if (uv_nmi_action_is("ips") || uv_nmi_action_is("dump")) + /* Process actions other than "kdump": */ + if (uv_nmi_action_is("ips") || uv_nmi_action_is("dump")) { uv_nmi_dump_state(cpu, regs, master); - - /* Call KGDB/KDB if enabled */ - else if 
(uv_nmi_action_is("kdb") || uv_nmi_action_is("kgdb")) + } else if (uv_nmi_action_is("kdb") || uv_nmi_action_is("kgdb")) { uv_call_kgdb_kdb(cpu, regs, master); + } else { + if (master) + pr_alert("UV: unknown NMI action: %s\n", uv_nmi_action); + uv_nmi_sync_exit(master); + } - /* Clear per_cpu "in nmi" flag */ + /* Clear per_cpu "in_nmi" flag */ this_cpu_write(uv_cpu_nmi.state, UV_NMI_STATE_OUT); /* Clear MMR NMI flag on each hub */ @@ -648,6 +771,7 @@ int uv_handle_nmi(unsigned int reason, struct pt_regs *regs) atomic_set(&uv_nmi_cpu, -1); atomic_set(&uv_in_nmi, 0); atomic_set(&uv_nmi_kexec_failed, 0); + atomic_set(&uv_nmi_slave_continue, SLAVE_CLEAR); } uv_nmi_touch_watchdogs(); @@ -697,28 +821,53 @@ void uv_nmi_init(void) apic_write(APIC_LVT1, value); } -void uv_nmi_setup(void) +/* Setup HUB NMI info */ +void __init uv_nmi_setup_common(bool hubbed) { int size = sizeof(void *) * (1 << NODES_SHIFT); - int cpu, nid; + int cpu; - /* Setup hub nmi info */ - uv_nmi_setup_mmrs(); uv_hub_nmi_list = kzalloc(size, GFP_KERNEL); - pr_info("UV: NMI hub list @ 0x%p (%d)\n", uv_hub_nmi_list, size); + nmi_debug("UV: NMI hub list @ 0x%p (%d)\n", uv_hub_nmi_list, size); BUG_ON(!uv_hub_nmi_list); size = sizeof(struct uv_hub_nmi_s); for_each_present_cpu(cpu) { - nid = cpu_to_node(cpu); + int nid = cpu_to_node(cpu); if (uv_hub_nmi_list[nid] == NULL) { uv_hub_nmi_list[nid] = kzalloc_node(size, GFP_KERNEL, nid); BUG_ON(!uv_hub_nmi_list[nid]); raw_spin_lock_init(&(uv_hub_nmi_list[nid]->nmi_lock)); atomic_set(&uv_hub_nmi_list[nid]->cpu_owner, -1); + uv_hub_nmi_list[nid]->hub_present = hubbed; + uv_hub_nmi_list[nid]->pch_owner = (nid == 0); } uv_hub_nmi_per(cpu) = uv_hub_nmi_list[nid]; } BUG_ON(!alloc_cpumask_var(&uv_nmi_cpu_mask, GFP_KERNEL)); +} + +/* Setup for UV Hub systems */ +void __init uv_nmi_setup(void) +{ + uv_nmi_setup_mmrs(); + uv_nmi_setup_common(true); + uv_register_nmi_notifier(); + pr_info("UV: Hub NMI enabled\n"); +} + +/* Setup for UV Hubless systems */ +void __init uv_nmi_setup_hubless(void) +{ + uv_nmi_setup_common(false); + pch_base = xlate_dev_mem_ptr(PCH_PCR_GPIO_1_BASE); + nmi_debug("UV: PCH base:%p from 0x%lx, GPP_D_0\n", + pch_base, PCH_PCR_GPIO_1_BASE); + uv_init_hubless_pch_io(GPI_NMI_ENA_GPP_D_0, + STS_GPP_D_0_MASK, STS_GPP_D_0_MASK); + uv_nmi_setup_hubless_intr(); + /* Ensure NMI enabled in Processor Interface Reg: */ + uv_reassert_nmi(); uv_register_nmi_notifier(); + pr_info("UV: Hubless NMI enabled\n"); } -- cgit From 278c9b099b2fc0cc0a51de95a1dcefcf54ca2183 Mon Sep 17 00:00:00 2001 From: "travis@sgi.com" Date: Wed, 25 Jan 2017 10:35:20 -0600 Subject: x86/platform/UV: Add basic CPU NMI health check Add a low impact health check triggered by the system NMI command that essentially checks which CPUs are responding to external NMI's. 
Signed-off-by: Mike Travis Reviewed-by: Russ Anderson Reviewed-by: Alex Thorlton Acked-by: Thomas Gleixner Acked-by: Dimitri Sivanich Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170125163517.756690240@asylum.americas.sgi.com Signed-off-by: Ingo Molnar --- arch/x86/platform/uv/uv_nmi.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c index df7b092941fe..8a4aa5b3c11a 100644 --- a/arch/x86/platform/uv/uv_nmi.c +++ b/arch/x86/platform/uv/uv_nmi.c @@ -176,6 +176,7 @@ module_param_named(debug, uv_nmi_debug, int, 0644); * "kdump" - do crash dump * "kdb" - enter KDB (default) * "kgdb" - enter KGDB + * "health" - check if CPUs respond to NMI */ static char uv_nmi_action[8] = "kdb"; module_param_string(action, uv_nmi_action, sizeof(uv_nmi_action), 0644); @@ -571,6 +572,22 @@ static void uv_nmi_sync_exit(int master) } } +/* Current "health" check is to check which CPU's are responsive */ +static void uv_nmi_action_health(int cpu, struct pt_regs *regs, int master) +{ + if (master) { + int in = atomic_read(&uv_nmi_cpus_in_nmi); + int out = num_online_cpus() - in; + + pr_alert("UV: NMI CPU health check (non-responding:%d)\n", out); + atomic_set(&uv_nmi_slave_continue, SLAVE_EXIT); + } else { + while (!atomic_read(&uv_nmi_slave_continue)) + cpu_relax(); + } + uv_nmi_sync_exit(master); +} + /* Walk through cpu list and dump state of each */ static void uv_nmi_dump_state(int cpu, struct pt_regs *regs, int master) { @@ -747,7 +764,9 @@ int uv_handle_nmi(unsigned int reason, struct pt_regs *regs) uv_nmi_wait(master); /* Process actions other than "kdump": */ - if (uv_nmi_action_is("ips") || uv_nmi_action_is("dump")) { + if (uv_nmi_action_is("health")) { + uv_nmi_action_health(cpu, regs, master); + } else if (uv_nmi_action_is("ips") || uv_nmi_action_is("dump")) { uv_nmi_dump_state(cpu, regs, master); } else if (uv_nmi_action_is("kdb") || uv_nmi_action_is("kgdb")) { uv_call_kgdb_kdb(cpu, regs, master); -- cgit From f550e4692749a909d3f5453ef11b4c8ab2071070 Mon Sep 17 00:00:00 2001 From: "travis@sgi.com" Date: Wed, 25 Jan 2017 10:35:21 -0600 Subject: x86/platform/UV: Verify NMI action is valid, default is standard Verify that the NMI action being set is valid. The default NMI action changes from the non-standard 'kdb' to the more standard 'dump'. 
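The validation follows the usual custom module-parameter pattern: a struct kernel_param_ops whose set() handler accepts only strings found in a table of known actions and returns -EINVAL otherwise, so an invalid write leaves the previous action in place. A minimal sketch of that pattern (generic illustration; is_valid_action() is a hypothetical stand-in for the table lookup done in the real patch below, which also strips a trailing '\n' from the input):

	static int param_set_action(const char *val, const struct kernel_param *kp)
	{
		if (!is_valid_action(val))		/* hypothetical table lookup */
			return -EINVAL;			/* reject, keep the old action */

		strscpy(uv_nmi_action, val, sizeof(uv_nmi_action));
		return 0;
	}

	static const struct kernel_param_ops param_ops_action = {
		.get	= param_get_action,
		.set	= param_set_action,
	};

At runtime the action is then typically selected with "uv_nmi.action=dump" on the kernel command line or by writing to /sys/module/uv_nmi/parameters/action.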
Signed-off-by: Mike Travis Reviewed-by: Russ Anderson Reviewed-by: Alex Thorlton Acked-by: Thomas Gleixner Acked-by: Dimitri Sivanich Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170125163517.922751779@asylum.americas.sgi.com Signed-off-by: Ingo Molnar --- arch/x86/platform/uv/uv_nmi.c | 69 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 58 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c index 8a4aa5b3c11a..c10e00b2b2ee 100644 --- a/arch/x86/platform/uv/uv_nmi.c +++ b/arch/x86/platform/uv/uv_nmi.c @@ -169,17 +169,64 @@ module_param_named(debug, uv_nmi_debug, int, 0644); pr_info(fmt, ##__VA_ARGS__); \ } while (0) -/* - * Valid NMI Actions: - * "dump" - dump process stack for each cpu - * "ips" - dump IP info for each cpu - * "kdump" - do crash dump - * "kdb" - enter KDB (default) - * "kgdb" - enter KGDB - * "health" - check if CPUs respond to NMI - */ -static char uv_nmi_action[8] = "kdb"; -module_param_string(action, uv_nmi_action, sizeof(uv_nmi_action), 0644); +/* Valid NMI Actions */ +#define ACTION_LEN 16 +static struct nmi_action { + char *action; + char *desc; +} valid_acts[] = { + { "kdump", "do kernel crash dump" }, + { "dump", "dump process stack for each cpu" }, + { "ips", "dump Inst Ptr info for each cpu" }, + { "kdb", "enter KDB (needs kgdboc= assignment)" }, + { "kgdb", "enter KGDB (needs gdb target remote)" }, + { "health", "check if CPUs respond to NMI" }, +}; +typedef char action_t[ACTION_LEN]; +static action_t uv_nmi_action = { "dump" }; + +static int param_get_action(char *buffer, const struct kernel_param *kp) +{ + return sprintf(buffer, "%s\n", uv_nmi_action); +} + +static int param_set_action(const char *val, const struct kernel_param *kp) +{ + int i; + int n = ARRAY_SIZE(valid_acts); + char arg[ACTION_LEN], *p; + + /* (remove possible '\n') */ + strncpy(arg, val, ACTION_LEN - 1); + arg[ACTION_LEN - 1] = '\0'; + p = strchr(arg, '\n'); + if (p) + *p = '\0'; + + for (i = 0; i < n; i++) + if (!strcmp(arg, valid_acts[i].action)) + break; + + if (i < n) { + strcpy(uv_nmi_action, arg); + pr_info("UV: New NMI action:%s\n", uv_nmi_action); + return 0; + } + + pr_err("UV: Invalid NMI action:%s, valid actions are:\n", arg); + for (i = 0; i < n; i++) + pr_err("UV: %-8s - %s\n", + valid_acts[i].action, valid_acts[i].desc); + return -EINVAL; +} + +static const struct kernel_param_ops param_ops_action = { + .get = param_get_action, + .set = param_set_action, +}; +#define param_check_action(name, p) __param_check(name, p, action_t) + +module_param_named(action, uv_nmi_action, action, 0644); static inline bool uv_nmi_action_is(const char *action) { -- cgit From 56e17ca2c5ed31f5812ed8e0694e7ef880068cfd Mon Sep 17 00:00:00 2001 From: "travis@sgi.com" Date: Wed, 25 Jan 2017 10:35:22 -0600 Subject: x86/platform/UV: Initialize PCH GPP_D_0 NMI Pin to be NMI source The initialize PCH NMI I/O function is separate and may be moved to BIOS for security reasons. This function detects whether the PCH NMI config has already been done and if not, it will then initialize the PCH here. 
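The "already configured" detection is a single read of the GPP_D_0 pad-ownership register; only if it reads zero is the register table applied. Condensed from the patch below:

	if (*PCH_PCR_GPIO_ADDRESS(PAD_OWN_GPP_D_0) != 0) {
		pr_info("UV: Hubless NMI already configured\n");
		return;					/* BIOS/earlier stage owns the pin */
	}

	for (i = 0; i < ARRAY_SIZE(init_nmi); i++)	/* apply the GPP_D_0 register table */
		uv_init_hubless_pch_io(init_nmi[i].offset,
				       init_nmi[i].mask,
				       init_nmi[i].data);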
Signed-off-by: Mike Travis Reviewed-by: Russ Anderson Acked-by: Thomas Gleixner Acked-by: Dimitri Sivanich Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170125163518.089387859@asylum.americas.sgi.com Signed-off-by: Ingo Molnar --- arch/x86/platform/uv/uv_nmi.c | 127 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 127 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c index c10e00b2b2ee..6a71b087da98 100644 --- a/arch/x86/platform/uv/uv_nmi.c +++ b/arch/x86/platform/uv/uv_nmi.c @@ -70,6 +70,7 @@ EXPORT_PER_CPU_SYMBOL_GPL(uv_cpu_nmi); /* UV hubless values */ #define NMI_CONTROL_PORT 0x70 #define NMI_DUMMY_PORT 0x71 +#define PAD_OWN_GPP_D_0 0x2c #define GPI_NMI_STS_GPP_D_0 0x164 #define GPI_NMI_ENA_GPP_D_0 0x174 #define STS_GPP_D_0_MASK 0x1 @@ -160,6 +161,9 @@ static bool uv_pch_intr_enable = true; static bool uv_pch_intr_now_enabled; module_param_named(pch_intr_enable, uv_pch_intr_enable, bool, 0644); +static bool uv_pch_init_enable = true; +module_param_named(pch_init_enable, uv_pch_init_enable, bool, 0644); + static int uv_nmi_debug; module_param_named(debug, uv_nmi_debug, int, 0644); @@ -307,6 +311,127 @@ static void uv_nmi_setup_hubless_intr(void) uv_pch_intr_now_enabled ? "enabled" : "disabled"); } +static struct init_nmi { + unsigned int offset; + unsigned int mask; + unsigned int data; +} init_nmi[] = { + { /* HOSTSW_OWN_GPP_D_0 */ + .offset = 0x84, + .mask = 0x1, + .data = 0x0, /* ACPI Mode */ + }, + +/* clear status */ + { /* GPI_INT_STS_GPP_D_0 */ + .offset = 0x104, + .mask = 0x0, + .data = 0x1, /* Clear Status */ + }, + { /* GPI_GPE_STS_GPP_D_0 */ + .offset = 0x124, + .mask = 0x0, + .data = 0x1, /* Clear Status */ + }, + { /* GPI_SMI_STS_GPP_D_0 */ + .offset = 0x144, + .mask = 0x0, + .data = 0x1, /* Clear Status */ + }, + { /* GPI_NMI_STS_GPP_D_0 */ + .offset = 0x164, + .mask = 0x0, + .data = 0x1, /* Clear Status */ + }, + +/* disable interrupts */ + { /* GPI_INT_EN_GPP_D_0 */ + .offset = 0x114, + .mask = 0x1, + .data = 0x0, /* disable interrupt generation */ + }, + { /* GPI_GPE_EN_GPP_D_0 */ + .offset = 0x134, + .mask = 0x1, + .data = 0x0, /* disable interrupt generation */ + }, + { /* GPI_SMI_EN_GPP_D_0 */ + .offset = 0x154, + .mask = 0x1, + .data = 0x0, /* disable interrupt generation */ + }, + { /* GPI_NMI_EN_GPP_D_0 */ + .offset = 0x174, + .mask = 0x1, + .data = 0x0, /* disable interrupt generation */ + }, + +/* setup GPP_D_0 Pad Config */ + { /* PAD_CFG_DW0_GPP_D_0 */ + .offset = 0x4c0, + .mask = 0xffffffff, + .data = 0x82020100, +/* + * 31:30 Pad Reset Config (PADRSTCFG): = 2h # PLTRST# (default) + * + * 29 RX Pad State Select (RXPADSTSEL): = 0 # Raw RX pad state directly + * from RX buffer (default) + * + * 28 RX Raw Override to '1' (RXRAW1): = 0 # No Override + * + * 26:25 RX Level/Edge Configuration (RXEVCFG): + * = 0h # Level + * = 1h # Edge + * + * 23 RX Invert (RXINV): = 0 # No Inversion (signal active high) + * + * 20 GPIO Input Route IOxAPIC (GPIROUTIOXAPIC): + * = 0 # Routing does not cause peripheral IRQ... + * # (we want an NMI not an IRQ) + * + * 19 GPIO Input Route SCI (GPIROUTSCI): = 0 # Routing does not cause SCI. + * 18 GPIO Input Route SMI (GPIROUTSMI): = 0 # Routing does not cause SMI. + * 17 GPIO Input Route NMI (GPIROUTNMI): = 1 # Routing can cause NMI. + * + * 11:10 Pad Mode (PMODE1/0): = 0h = GPIO control the Pad. 
+ * 9 GPIO RX Disable (GPIORXDIS): + * = 0 # Enable the input buffer (active low enable) + * + * 8 GPIO TX Disable (GPIOTXDIS): + * = 1 # Disable the output buffer; i.e. Hi-Z + * + * 1 GPIO RX State (GPIORXSTATE): This is the current internal RX pad state.. + * 0 GPIO TX State (GPIOTXSTATE): + * = 0 # (Leave at default) + */ + }, + +/* Pad Config DW1 */ + { /* PAD_CFG_DW1_GPP_D_0 */ + .offset = 0x4c4, + .mask = 0x3c00, + .data = 0, /* Termination = none (default) */ + }, +}; + +static void uv_init_hubless_pch_d0(void) +{ + int i, read; + + read = *PCH_PCR_GPIO_ADDRESS(PAD_OWN_GPP_D_0); + if (read != 0) { + pr_info("UV: Hubless NMI already configured\n"); + return; + } + + nmi_debug("UV: Initializing UV Hubless NMI on PCH\n"); + for (i = 0; i < ARRAY_SIZE(init_nmi); i++) { + uv_init_hubless_pch_io(init_nmi[i].offset, + init_nmi[i].mask, + init_nmi[i].data); + } +} + static int uv_nmi_test_hubless(struct uv_hub_nmi_s *hub_nmi) { int *pstat = PCH_PCR_GPIO_ADDRESS(GPI_NMI_STS_GPP_D_0); @@ -929,6 +1054,8 @@ void __init uv_nmi_setup_hubless(void) pch_base = xlate_dev_mem_ptr(PCH_PCR_GPIO_1_BASE); nmi_debug("UV: PCH base:%p from 0x%lx, GPP_D_0\n", pch_base, PCH_PCR_GPIO_1_BASE); + if (uv_pch_init_enable) + uv_init_hubless_pch_d0(); uv_init_hubless_pch_io(GPI_NMI_ENA_GPP_D_0, STS_GPP_D_0_MASK, STS_GPP_D_0_MASK); uv_nmi_setup_hubless_intr(); -- cgit From 9ec808a0225aabab59fb2932b70784b087ac0f58 Mon Sep 17 00:00:00 2001 From: "travis@sgi.com" Date: Wed, 25 Jan 2017 10:35:23 -0600 Subject: x86/platform/UV: Ensure uv_system_init is called when necessary Move the check to whether this is a UV system that needs initialization from is_uv_system() to the internal uv_system_init() function. This is because on a UV system without a HUB the is_uv_system() returns false. But we still need some specific UV system initialization. See the uv_system_init() for change to a quick check if UV is applicable. This change should not increase overhead since is_uv_system() also called into this same area. Signed-off-by: Mike Travis Reviewed-by: Russ Anderson Acked-by: Thomas Gleixner Acked-by: Dimitri Sivanich Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170125163518.256403963@asylum.americas.sgi.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 46732dc3b73c..386c7f713c2a 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1341,8 +1341,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) pr_info("CPU0: "); print_cpu_info(&cpu_data(0)); - if (is_uv_system()) - uv_system_init(); + uv_system_init(); set_mtrr_aps_delayed_init(); -- cgit From 1e74016370ec3d552a7f5df18bb2b0f1c80b5a9f Mon Sep 17 00:00:00 2001 From: "travis@sgi.com" Date: Wed, 25 Jan 2017 10:35:24 -0600 Subject: x86/platform/UV: Clean up the NMI code to match current coding style Update UV NMI to current coding style. 
Signed-off-by: Mike Travis Acked-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Russ Anderson Link: http://lkml.kernel.org/r/20170125163518.419094259@asylum.americas.sgi.com Signed-off-by: Ingo Molnar --- arch/x86/platform/uv/uv_nmi.c | 74 +++++++++++++++++++++---------------------- 1 file changed, 37 insertions(+), 37 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c index 6a71b087da98..0ecd7bf7d2d3 100644 --- a/arch/x86/platform/uv/uv_nmi.c +++ b/arch/x86/platform/uv/uv_nmi.c @@ -45,8 +45,8 @@ * * Handle system-wide NMI events generated by the global 'power nmi' command. * - * Basic operation is to field the NMI interrupt on each cpu and wait - * until all cpus have arrived into the nmi handler. If some cpus do not + * Basic operation is to field the NMI interrupt on each CPU and wait + * until all CPU's have arrived into the nmi handler. If some CPU's do not * make it into the handler, try and force them in with the IPI(NMI) signal. * * We also have to lessen UV Hub MMR accesses as much as possible as this @@ -56,7 +56,7 @@ * To do this we register our primary NMI notifier on the NMI_UNKNOWN * chain. This reduces the number of false NMI calls when the perf * tools are running which generate an enormous number of NMIs per - * second (~4M/s for 1024 cpu threads). Our secondary NMI handler is + * second (~4M/s for 1024 CPU threads). Our secondary NMI handler is * very short as it only checks that if it has been "pinged" with the * IPI(NMI) signal as mentioned above, and does not read the UV Hub's MMR. * @@ -113,7 +113,7 @@ static int param_get_local64(char *buffer, const struct kernel_param *kp) static int param_set_local64(const char *val, const struct kernel_param *kp) { - /* clear on any write */ + /* Clear on any write */ local64_set((local64_t *)kp->arg, 0); return 0; } @@ -322,7 +322,7 @@ static struct init_nmi { .data = 0x0, /* ACPI Mode */ }, -/* clear status */ +/* Clear status: */ { /* GPI_INT_STS_GPP_D_0 */ .offset = 0x104, .mask = 0x0, @@ -344,29 +344,29 @@ static struct init_nmi { .data = 0x1, /* Clear Status */ }, -/* disable interrupts */ +/* Disable interrupts: */ { /* GPI_INT_EN_GPP_D_0 */ .offset = 0x114, .mask = 0x1, - .data = 0x0, /* disable interrupt generation */ + .data = 0x0, /* Disable interrupt generation */ }, { /* GPI_GPE_EN_GPP_D_0 */ .offset = 0x134, .mask = 0x1, - .data = 0x0, /* disable interrupt generation */ + .data = 0x0, /* Disable interrupt generation */ }, { /* GPI_SMI_EN_GPP_D_0 */ .offset = 0x154, .mask = 0x1, - .data = 0x0, /* disable interrupt generation */ + .data = 0x0, /* Disable interrupt generation */ }, { /* GPI_NMI_EN_GPP_D_0 */ .offset = 0x174, .mask = 0x1, - .data = 0x0, /* disable interrupt generation */ + .data = 0x0, /* Disable interrupt generation */ }, -/* setup GPP_D_0 Pad Config */ +/* Setup GPP_D_0 Pad Config: */ { /* PAD_CFG_DW0_GPP_D_0 */ .offset = 0x4c0, .mask = 0xffffffff, @@ -444,7 +444,7 @@ static int uv_nmi_test_hubless(struct uv_hub_nmi_s *hub_nmi) return 0; *pstat = STS_GPP_D_0_MASK; /* Is a UV NMI: clear GPP_D_0 status */ - (void)*pstat; /* flush write */ + (void)*pstat; /* Flush write */ return 1; } @@ -461,8 +461,8 @@ static int uv_test_nmi(struct uv_hub_nmi_s *hub_nmi) } /* - * If first cpu in on this hub, set hub_nmi "in_nmi" and "owner" values and - * return true. If first cpu in on the system, set global "in_nmi" flag. + * If first CPU in on this hub, set hub_nmi "in_nmi" and "owner" values and + * return true. 
If first CPU in on the system, set global "in_nmi" flag. */ static int uv_set_in_nmi(int cpu, struct uv_hub_nmi_s *hub_nmi) { @@ -496,7 +496,7 @@ static int uv_check_nmi(struct uv_hub_nmi_s *hub_nmi) if (raw_spin_trylock(&hub_nmi->nmi_lock)) { nmi_detected = uv_test_nmi(hub_nmi); - /* check flag for UV external NMI */ + /* Check flag for UV external NMI */ if (nmi_detected > 0) { uv_set_in_nmi(cpu, hub_nmi); nmi = 1; @@ -516,7 +516,7 @@ static int uv_check_nmi(struct uv_hub_nmi_s *hub_nmi) slave_wait: cpu_relax(); udelay(uv_nmi_slave_delay); - /* re-check hub in_nmi flag */ + /* Re-check hub in_nmi flag */ nmi = atomic_read(&hub_nmi->in_nmi); if (nmi) break; @@ -560,7 +560,7 @@ static inline void uv_clear_nmi(int cpu) } } -/* Ping non-responding cpus attemping to force them into the NMI handler */ +/* Ping non-responding CPU's attemping to force them into the NMI handler */ static void uv_nmi_nr_cpus_ping(void) { int cpu; @@ -571,7 +571,7 @@ static void uv_nmi_nr_cpus_ping(void) apic->send_IPI_mask(uv_nmi_cpu_mask, APIC_DM_NMI); } -/* Clean up flags for cpus that ignored both NMI and ping */ +/* Clean up flags for CPU's that ignored both NMI and ping */ static void uv_nmi_cleanup_mask(void) { int cpu; @@ -583,7 +583,7 @@ static void uv_nmi_cleanup_mask(void) } } -/* Loop waiting as cpus enter NMI handler */ +/* Loop waiting as CPU's enter NMI handler */ static int uv_nmi_wait_cpus(int first) { int i, j, k, n = num_online_cpus(); @@ -597,7 +597,7 @@ static int uv_nmi_wait_cpus(int first) k = n - cpumask_weight(uv_nmi_cpu_mask); } - /* PCH NMI causes only one cpu to respond */ + /* PCH NMI causes only one CPU to respond */ if (first && uv_pch_intr_now_enabled) { cpumask_clear_cpu(cpu, uv_nmi_cpu_mask); return n - k - 1; @@ -618,13 +618,13 @@ static int uv_nmi_wait_cpus(int first) k = n; break; } - if (last_k != k) { /* abort if no new cpus coming in */ + if (last_k != k) { /* abort if no new CPU's coming in */ last_k = k; waiting = 0; } else if (++waiting > uv_nmi_wait_count) break; - /* extend delay if waiting only for cpu 0 */ + /* Extend delay if waiting only for CPU 0: */ if (waiting && (n - k) == 1 && cpumask_test_cpu(0, uv_nmi_cpu_mask)) loop_delay *= 100; @@ -635,29 +635,29 @@ static int uv_nmi_wait_cpus(int first) return n - k; } -/* Wait until all slave cpus have entered UV NMI handler */ +/* Wait until all slave CPU's have entered UV NMI handler */ static void uv_nmi_wait(int master) { - /* indicate this cpu is in */ + /* Indicate this CPU is in: */ this_cpu_write(uv_cpu_nmi.state, UV_NMI_STATE_IN); - /* if not the first cpu in (the master), then we are a slave cpu */ + /* If not the first CPU in (the master), then we are a slave CPU */ if (!master) return; do { - /* wait for all other cpus to gather here */ + /* Wait for all other CPU's to gather here */ if (!uv_nmi_wait_cpus(1)) break; - /* if not all made it in, send IPI NMI to them */ + /* If not all made it in, send IPI NMI to them */ pr_alert("UV: Sending NMI IPI to %d CPUs: %*pbl\n", cpumask_weight(uv_nmi_cpu_mask), cpumask_pr_args(uv_nmi_cpu_mask)); uv_nmi_nr_cpus_ping(); - /* if all cpus are in, then done */ + /* If all CPU's are in, then done */ if (!uv_nmi_wait_cpus(0)) break; @@ -709,7 +709,7 @@ static void uv_nmi_dump_state_cpu(int cpu, struct pt_regs *regs) this_cpu_write(uv_cpu_nmi.state, UV_NMI_STATE_DUMP_DONE); } -/* Trigger a slave cpu to dump it's state */ +/* Trigger a slave CPU to dump it's state */ static void uv_nmi_trigger_dump(int cpu) { int retry = uv_nmi_trigger_delay; @@ -730,7 +730,7 @@ static void 
uv_nmi_trigger_dump(int cpu) uv_cpu_nmi_per(cpu).state = UV_NMI_STATE_DUMP_DONE; } -/* Wait until all cpus ready to exit */ +/* Wait until all CPU's ready to exit */ static void uv_nmi_sync_exit(int master) { atomic_dec(&uv_nmi_cpus_in_nmi); @@ -760,7 +760,7 @@ static void uv_nmi_action_health(int cpu, struct pt_regs *regs, int master) uv_nmi_sync_exit(master); } -/* Walk through cpu list and dump state of each */ +/* Walk through CPU list and dump state of each */ static void uv_nmi_dump_state(int cpu, struct pt_regs *regs, int master) { if (master) { @@ -872,7 +872,7 @@ static void uv_call_kgdb_kdb(int cpu, struct pt_regs *regs, int master) if (reason < 0) return; - /* call KGDB NMI handler as MASTER */ + /* Call KGDB NMI handler as MASTER */ ret = kgdb_nmicallin(cpu, X86_TRAP_NMI, regs, reason, &uv_nmi_slave_continue); if (ret) { @@ -880,7 +880,7 @@ static void uv_call_kgdb_kdb(int cpu, struct pt_regs *regs, int master) atomic_set(&uv_nmi_slave_continue, SLAVE_EXIT); } } else { - /* wait for KGDB signal that it's ready for slaves to enter */ + /* Wait for KGDB signal that it's ready for slaves to enter */ int sig; do { @@ -888,7 +888,7 @@ static void uv_call_kgdb_kdb(int cpu, struct pt_regs *regs, int master) sig = atomic_read(&uv_nmi_slave_continue); } while (!sig); - /* call KGDB as slave */ + /* Call KGDB as slave */ if (sig == SLAVE_CONTINUE) kgdb_nmicallback(cpu, regs); } @@ -932,7 +932,7 @@ int uv_handle_nmi(unsigned int reason, struct pt_regs *regs) strncpy(uv_nmi_action, "dump", strlen(uv_nmi_action)); } - /* Pause as all cpus enter the NMI handler */ + /* Pause as all CPU's enter the NMI handler */ uv_nmi_wait(master); /* Process actions other than "kdump": */ @@ -972,7 +972,7 @@ int uv_handle_nmi(unsigned int reason, struct pt_regs *regs) } /* - * NMI handler for pulling in CPUs when perf events are grabbing our NMI + * NMI handler for pulling in CPU's when perf events are grabbing our NMI */ static int uv_handle_nmi_ping(unsigned int reason, struct pt_regs *regs) { @@ -1005,7 +1005,7 @@ void uv_nmi_init(void) unsigned int value; /* - * Unmask NMI on all cpus + * Unmask NMI on all CPU's */ value = apic_read(APIC_LVT1) | APIC_DM_NMI; value &= ~APIC_LVT_MASKED; -- cgit From 5443624bedd0d23e112d5f2a919435182875bce9 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Fri, 27 Jan 2017 17:16:43 +0200 Subject: perf/x86/intel/pt: Add format strings for PTWRITE and power event tracing Commit: 8ee83b2ab3 ("perf/x86/intel/pt: Add support for PTWRITE and power event tracing") forgot to add format strings to the PT driver. So one could enable these features by setting corresponding bits in the event config, but not by their mnemonic names. This patch adds the format strings. 
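With the format strings registered, the new config bits show up under the PT PMU's sysfs format directory and can be requested by mnemonic instead of by raw config value (illustrative paths and command, assuming a PTWRITE/power-event capable CPU and a matching perf tool):

	/sys/bus/event_source/devices/intel_pt/format/pwr_evt	-> "config:4"
	/sys/bus/event_source/devices/intel_pt/format/ptw	-> "config:12"

	perf record -e intel_pt/ptw,fup_on_ptw/u -- ./workload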
Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: vince@deater.net Fixes: 8ee83b2ab3 ("perf/x86/intel/pt: Add support for PTWRITE...") Link: http://lkml.kernel.org/r/20170127151644.8585-2-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/pt.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 1c1b9fe705c8..5900471ee508 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -99,18 +99,24 @@ static struct attribute_group pt_cap_group = { }; PMU_FORMAT_ATTR(cyc, "config:1" ); +PMU_FORMAT_ATTR(pwr_evt, "config:4" ); +PMU_FORMAT_ATTR(fup_on_ptw, "config:5" ); PMU_FORMAT_ATTR(mtc, "config:9" ); PMU_FORMAT_ATTR(tsc, "config:10" ); PMU_FORMAT_ATTR(noretcomp, "config:11" ); +PMU_FORMAT_ATTR(ptw, "config:12" ); PMU_FORMAT_ATTR(mtc_period, "config:14-17" ); PMU_FORMAT_ATTR(cyc_thresh, "config:19-22" ); PMU_FORMAT_ATTR(psb_period, "config:24-27" ); static struct attribute *pt_formats_attr[] = { &format_attr_cyc.attr, + &format_attr_pwr_evt.attr, + &format_attr_fup_on_ptw.attr, &format_attr_mtc.attr, &format_attr_tsc.attr, &format_attr_noretcomp.attr, + &format_attr_ptw.attr, &format_attr_mtc_period.attr, &format_attr_cyc_thresh.attr, &format_attr_psb_period.attr, -- cgit From c26819900036f5b91608051a0fc7c76f6b4ffc7b Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 1 Feb 2017 22:17:39 +0800 Subject: crypto: aesni - Fix failure when pcbc module is absent MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When aesni is built as a module together with pcbc, the pcbc module must be present for aesni to load. However, the pcbc module may not be present for reasons such as its absence on initramfs. This patch allows the aesni to function even if the pcbc module is enabled but not present. 
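In other words, the second group of SIMD wrappers becomes best-effort: if one of them cannot be instantiated (for instance because pcbc(aes) is not available in the initramfs), its slot is simply left empty instead of failing the whole aesni init, and the cleanup path frees only the wrappers that were actually created. Condensed from the patch below:

	simd = simd_skcipher_create_compat(algname, drvname, basename);
	if (IS_ERR(simd))
		continue;			/* optional wrapper: skip, don't fail init */
	aesni_simd_skciphers2[i].simd = simd;

	/* Teardown: free only the slots that were populated. */
	for (i = 0; i < ARRAY_SIZE(aesni_simd_skciphers2); i++)
		if (aesni_simd_skciphers2[i].simd)
			simd_skcipher_free(aesni_simd_skciphers2[i].simd);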
Reported-by: Arkadiusz Miśkiewicz Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_glue.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 6ef688a1ef3e..7ff1b0c86a8e 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -1085,9 +1085,9 @@ static void aesni_free_simds(void) aesni_simd_skciphers[i]; i++) simd_skcipher_free(aesni_simd_skciphers[i]); - for (i = 0; i < ARRAY_SIZE(aesni_simd_skciphers2) && - aesni_simd_skciphers2[i].simd; i++) - simd_skcipher_free(aesni_simd_skciphers2[i].simd); + for (i = 0; i < ARRAY_SIZE(aesni_simd_skciphers2); i++) + if (aesni_simd_skciphers2[i].simd) + simd_skcipher_free(aesni_simd_skciphers2[i].simd); } static int __init aesni_init(void) @@ -1168,7 +1168,7 @@ static int __init aesni_init(void) simd = simd_skcipher_create_compat(algname, drvname, basename); err = PTR_ERR(simd); if (IS_ERR(simd)) - goto unregister_simds; + continue; aesni_simd_skciphers2[i].simd = simd; } -- cgit From 00c87e9a70a17b355b81c36adedf05e84f54e10d Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Wed, 1 Feb 2017 14:19:53 +0100 Subject: KVM: x86: do not save guest-unsupported XSAVE state MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Saving unsupported state prevents migration when the new host does not support a XSAVE feature of the original host, even if the feature is not exposed to the guest. We've masked host features with guest-visible features before, with 4344ee981e21 ("KVM: x86: only copy XSAVE state for the supported features") and dropped it when implementing XSAVES. Do it again. Fixes: df1daba7d1cb ("KVM: x86: support XSAVES usage in the host") Cc: stable@vger.kernel.org Reviewed-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d153be8929a6..e52c9088660f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3182,6 +3182,7 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) memcpy(dest, xsave, XSAVE_HDR_OFFSET); /* Set XSTATE_BV */ + xstate_bv &= vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FPSSE; *(u64 *)(dest + XSAVE_HDR_OFFSET) = xstate_bv; /* -- cgit From ae47eda905e61ef6ba0b6f79b967c9de15ca4f8e Mon Sep 17 00:00:00 2001 From: Grzegorz Andrejczuk Date: Fri, 20 Jan 2017 14:22:33 +0100 Subject: x86/msr: Add MSR_MISC_FEATURE_ENABLES and RING3MWAIT bit Define new MSR MISC_FEATURE_ENABLES (0x140). On supported CPUs if bit 1 of this MSR is set, then calling MONITOR and MWAIT instructions outside of ring 0 will not cause invalid-opcode exception. The MSR MISC_FEATURE_ENABLES is not yet documented in the SDM. Here is the relevant documentation: Hex Dec Name Scope 140H 320 MISC_FEATURE_ENABLES Thread 0 Reserved 1 If set to 1, the MONITOR and MWAIT instructions do not cause invalid-opcode exceptions when executed with CPL > 0 or in virtual-8086 mode. If MWAIT is executed when CPL > 0 or in virtual-8086 mode, and if EAX indicates a C-state other than C0 or C1, the instruction operates as if EAX indicated the C-state C1. 
63:2 Reserved Signed-off-by: Grzegorz Andrejczuk Cc: Piotr.Luc@intel.com Cc: dave.hansen@linux.intel.com Link: http://lkml.kernel.org/r/1484918557-15481-2-git-send-email-grzegorz.andrejczuk@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/msr-index.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 710273c617b8..00293a94ffaf 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -543,6 +543,11 @@ #define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE_BIT 39 #define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE_BIT) +/* MISC_FEATURE_ENABLES non-architectural features */ +#define MSR_MISC_FEATURE_ENABLES 0x00000140 + +#define MSR_MISC_FEATURE_ENABLES_RING3MWAIT_BIT 1 + #define MSR_IA32_TSC_DEADLINE 0x000006E0 /* P4/Xeon+ specific */ -- cgit From 0274f9551eff55dbd63b5f5f3efe30fe5d4c801c Mon Sep 17 00:00:00 2001 From: Grzegorz Andrejczuk Date: Fri, 20 Jan 2017 14:22:34 +0100 Subject: x86/elf: Add HWCAP2 to expose ring 3 MONITOR/MWAIT Introduce ELF_HWCAP2 variable for x86 and reserve its bit 0 to expose the ring 3 MONITOR/MWAIT. HWCAP variables contain bitmasks which can be used by userspace applications to detect which instruction sets are supported by CPU. On x86 architecture information about CPU capabilities can be checked via CPUID instructions, unfortunately presence of ring 3 MONITOR/MWAIT feature cannot be checked this way. ELF_HWCAP cannot be used as well, because on x86 it is set to CPUID[1].EDX which means that all bits are reserved there. HWCAP2 approach was chosen because it reuses existing solution present in other architectures, so only minor modifications are required to the kernel and userspace applications. When ELF_HWCAP2 is defined kernel maps it to AT_HWCAP2 during the start of the application. This way the ring 3 MONITOR/MWAIT feature can be detected using getauxval() API in a simple and fast manner. ELF_HWCAP2 type is u32 to be consistent with x86 ELF_HWCAP type. Signed-off-by: Grzegorz Andrejczuk Cc: Piotr.Luc@intel.com Cc: dave.hansen@linux.intel.com Link: http://lkml.kernel.org/r/1484918557-15481-3-git-send-email-grzegorz.andrejczuk@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/elf.h | 9 +++++++++ arch/x86/include/uapi/asm/hwcap2.h | 7 +++++++ arch/x86/kernel/cpu/common.c | 3 +++ 3 files changed, 19 insertions(+) create mode 100644 arch/x86/include/uapi/asm/hwcap2.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index e7f155c3045e..9d49c18b5ea9 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -258,6 +258,15 @@ extern int force_personality32; #define ELF_HWCAP (boot_cpu_data.x86_capability[CPUID_1_EDX]) +extern u32 elf_hwcap2; + +/* + * HWCAP2 supplies mask with kernel enabled CPU features, so that + * the application can discover that it can safely use them. + * The bits are defined in uapi/asm/hwcap2.h. + */ +#define ELF_HWCAP2 (elf_hwcap2) + /* This yields a string that ld.so will use to load implementation specific libraries for optimization. This is more specific in intent than poking at uname or /proc/cpuinfo. 
diff --git a/arch/x86/include/uapi/asm/hwcap2.h b/arch/x86/include/uapi/asm/hwcap2.h new file mode 100644 index 000000000000..0bd2be5c7617 --- /dev/null +++ b/arch/x86/include/uapi/asm/hwcap2.h @@ -0,0 +1,7 @@ +#ifndef _ASM_X86_HWCAP2_H +#define _ASM_X86_HWCAP2_H + +/* MONITOR/MWAIT enabled in Ring 3 */ +#define HWCAP2_RING3MWAIT (1 << 0) + +#endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 9bab7a8a4293..f879429cfcaa 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -51,6 +52,8 @@ #include "cpu.h" +u32 elf_hwcap2 __read_mostly; + /* all of these masks are initialized in setup_cpu_local_masks() */ cpumask_var_t cpu_initialized_mask; cpumask_var_t cpu_callout_mask; -- cgit From 1d12d0ef0194ccc4dcebed3d96bb2301b26fc3ee Mon Sep 17 00:00:00 2001 From: Grzegorz Andrejczuk Date: Fri, 20 Jan 2017 14:22:35 +0100 Subject: x86/cpufeature: Add RING3MWAIT to CPU features Add software-defined CPUID bit for the non-architectural ring 3 MONITOR/MWAIT feature. Signed-off-by: Grzegorz Andrejczuk Cc: Piotr.Luc@intel.com Cc: dave.hansen@linux.intel.com Link: http://lkml.kernel.org/r/1484918557-15481-4-git-send-email-grzegorz.andrejczuk@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/cpufeatures.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index d9d7136edf05..56e5184514c6 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -100,7 +100,7 @@ #define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* cpu topology enum extensions */ #define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */ #define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */ -/* free, was #define X86_FEATURE_CLFLUSH_MONITOR ( 3*32+25) * "" clflush reqd with monitor */ +#define X86_FEATURE_RING3MWAIT ( 3*32+25) /* ring 3 MONITOR/MWAIT */ #define X86_FEATURE_EXTD_APICID ( 3*32+26) /* has extended APICID (8 bits) */ #define X86_FEATURE_AMD_DCM ( 3*32+27) /* multi-node processor */ #define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */ -- cgit From e16fd002afe2b16d828bbf738b8a81a185fe9272 Mon Sep 17 00:00:00 2001 From: Grzegorz Andrejczuk Date: Fri, 20 Jan 2017 14:22:36 +0100 Subject: x86/cpufeature: Enable RING3MWAIT for Knights Landing Enable ring 3 MONITOR/MWAIT for Intel Xeon Phi x200 codenamed Knights Landing. Presence of this feature cannot be detected automatically (by reading any other MSR) therefore it is required to explicitly check for the family and model of the CPU before attempting to enable it. 
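Once the feature is enabled this way, user space can detect it through the HWCAP2 mechanism added earlier in this series. A minimal test program (hypothetical example, assuming kernel headers that ship asm/hwcap2.h and a libc that defines AT_HWCAP2):

	#include <stdio.h>
	#include <sys/auxv.h>		/* getauxval(), AT_HWCAP2 */
	#include <asm/hwcap2.h>		/* HWCAP2_RING3MWAIT */

	int main(void)
	{
		unsigned long hwcap2 = getauxval(AT_HWCAP2);

		if (hwcap2 & HWCAP2_RING3MWAIT)
			printf("ring 3 MONITOR/MWAIT: available\n");
		else
			printf("ring 3 MONITOR/MWAIT: not available\n");

		return 0;
	}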
Signed-off-by: Grzegorz Andrejczuk Cc: Piotr.Luc@intel.com Cc: dave.hansen@linux.intel.com Link: http://lkml.kernel.org/r/1484918557-15481-5-git-send-email-grzegorz.andrejczuk@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/intel.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 203f860d2ab3..da2401a4b0f4 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -15,6 +15,8 @@ #include #include #include +#include +#include #ifdef CONFIG_X86_64 #include @@ -62,6 +64,39 @@ void check_mpx_erratum(struct cpuinfo_x86 *c) } } +static bool ring3mwait_disabled __read_mostly; + +static int __init ring3mwait_disable(char *__unused) +{ + ring3mwait_disabled = true; + return 0; +} +__setup("ring3mwait=disable", ring3mwait_disable); + +static void probe_xeon_phi_r3mwait(struct cpuinfo_x86 *c) +{ + /* + * Ring 3 MONITOR/MWAIT feature cannot be detected without + * cpu model and family comparison. + */ + if (c->x86 != 6 || c->x86_model != INTEL_FAM6_XEON_PHI_KNL) + return; + + if (ring3mwait_disabled) { + msr_clear_bit(MSR_MISC_FEATURE_ENABLES, + MSR_MISC_FEATURE_ENABLES_RING3MWAIT_BIT); + return; + } + + msr_set_bit(MSR_MISC_FEATURE_ENABLES, + MSR_MISC_FEATURE_ENABLES_RING3MWAIT_BIT); + + set_cpu_cap(c, X86_FEATURE_RING3MWAIT); + + if (c == &boot_cpu_data) + ELF_HWCAP2 |= HWCAP2_RING3MWAIT; +} + static void early_init_intel(struct cpuinfo_x86 *c) { u64 misc_enable; @@ -560,6 +595,8 @@ static void init_intel(struct cpuinfo_x86 *c) detect_vmx_virtcap(c); init_intel_energy_perf(c); + + probe_xeon_phi_r3mwait(c); } #ifdef CONFIG_X86_32 -- cgit From 68dee8e2f2cacc54d038394e70d22411dee89da2 Mon Sep 17 00:00:00 2001 From: Nikola Pajkovsky Date: Tue, 15 Nov 2016 09:47:49 +0100 Subject: x86/pci-calgary: Fix iommu_free() comparison of unsigned expression >= 0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 8fd524b355da ("x86: Kill bad_dma_address variable") has killed bad_dma_address variable and used instead of macro DMA_ERROR_CODE which is always zero. Since dma_addr is unsigned, the statement dma_addr >= DMA_ERROR_CODE is always true, and not needed. arch/x86/kernel/pci-calgary_64.c: In function ‘iommu_free’: arch/x86/kernel/pci-calgary_64.c:299:2: warning: comparison of unsigned expression >= 0 is always true [-Wtype-limits] if (unlikely((dma_addr >= DMA_ERROR_CODE) && (dma_addr < badend))) { Fixes: 8fd524b355da ("x86: Kill bad_dma_address variable") Signed-off-by: Nikola Pajkovsky Cc: iommu@lists.linux-foundation.org Cc: Jon Mason Cc: Muli Ben-Yehuda Link: http://lkml.kernel.org/r/7612c0f9dd7c1290407dbf8e809def922006920b.1479161177.git.npajkovsky@suse.cz Signed-off-by: Thomas Gleixner --- arch/x86/kernel/pci-calgary_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 5d400ba1349d..d47517941bbc 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -296,7 +296,7 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, /* were we called with bad_dma_address? 
*/ badend = DMA_ERROR_CODE + (EMERGENCY_PAGES * PAGE_SIZE); - if (unlikely((dma_addr >= DMA_ERROR_CODE) && (dma_addr < badend))) { + if (unlikely(dma_addr < badend)) { WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA " "address 0x%Lx\n", dma_addr); return; -- cgit From 07d495dae20717b00881798ef812f7aa53ca0eb3 Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Mon, 28 Nov 2016 13:50:57 +0600 Subject: x86/traps: Get rid of unnecessary preempt_disable/preempt_enable_no_resched Exception handlers which may run on IST stack call ist_enter() at the start of execution and ist_exit() in the end. ist_enter() disables preemption unconditionally and ist_exit() enables it. So the extra preempt_disable/enable() pairs nested inside the ist_enter/exit() regions are pointless and can be removed. Signed-off-by: Alexander Kuleshov Cc: Tony Luck Cc: Jianyu Zhan Cc: Paul Gortmaker Cc: Andy Lutomirski Cc: Borislav Petkov Link: http://lkml.kernel.org/r/20161128075057.7724-1-kuleshovmail@gmail.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/traps.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index bf0c6d049080..1dc86ee60a03 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -563,11 +563,9 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) * as we may switch to the interrupt stack. */ debug_stack_usage_inc(); - preempt_disable(); cond_local_irq_enable(regs); do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL); cond_local_irq_disable(regs); - preempt_enable_no_resched(); debug_stack_usage_dec(); exit: ist_exit(regs); @@ -742,14 +740,12 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) debug_stack_usage_inc(); /* It's safe to allow irq's after DR6 has been saved */ - preempt_disable(); cond_local_irq_enable(regs); if (v8086_mode(regs)) { handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, X86_TRAP_DB); cond_local_irq_disable(regs); - preempt_enable_no_resched(); debug_stack_usage_dec(); goto exit; } @@ -769,7 +765,6 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp) send_sigtrap(tsk, regs, error_code, si_code); cond_local_irq_disable(regs); - preempt_enable_no_resched(); debug_stack_usage_dec(); exit: -- cgit From 1013fe32a63d1139b1b32049ea46c0c462738d8b Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Mon, 19 Dec 2016 22:35:57 +0800 Subject: x86/mm/pat: Use rb_entry() To make the code clearer, use rb_entry() instead of open coding it Signed-off-by: Geliang Tang Cc: Masahiro Yamada Cc: Toshi Kani Cc: Paul Gortmaker Link: http://lkml.kernel.org/r/974a91cd4ed2d04c92e4faa4765077e38f248d6b.1482157956.git.geliangtang@gmail.com Signed-off-by: Thomas Gleixner --- arch/x86/mm/pat_rbtree.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c index 159b52ccd600..d76485b22824 100644 --- a/arch/x86/mm/pat_rbtree.c +++ b/arch/x86/mm/pat_rbtree.c @@ -47,7 +47,7 @@ static u64 get_subtree_max_end(struct rb_node *node) { u64 ret = 0; if (node) { - struct memtype *data = container_of(node, struct memtype, rb); + struct memtype *data = rb_entry(node, struct memtype, rb); ret = data->subtree_max_end; } return ret; @@ -79,7 +79,7 @@ static struct memtype *memtype_rb_lowest_match(struct rb_root *root, struct memtype *last_lower = NULL; while (node) { - struct 
memtype *data = container_of(node, struct memtype, rb); + struct memtype *data = rb_entry(node, struct memtype, rb); if (get_subtree_max_end(node->rb_left) > start) { /* Lowest overlap if any must be on left side */ @@ -121,7 +121,7 @@ static struct memtype *memtype_rb_match(struct rb_root *root, node = rb_next(&match->rb); if (node) - match = container_of(node, struct memtype, rb); + match = rb_entry(node, struct memtype, rb); else match = NULL; } @@ -150,7 +150,7 @@ static int memtype_rb_check_conflict(struct rb_root *root, node = rb_next(&match->rb); while (node) { - match = container_of(node, struct memtype, rb); + match = rb_entry(node, struct memtype, rb); if (match->start >= end) /* Checked all possible matches */ goto success; @@ -181,7 +181,7 @@ static void memtype_rb_insert(struct rb_root *root, struct memtype *newdata) struct rb_node *parent = NULL; while (*node) { - struct memtype *data = container_of(*node, struct memtype, rb); + struct memtype *data = rb_entry(*node, struct memtype, rb); parent = *node; if (data->subtree_max_end < newdata->end) @@ -270,7 +270,7 @@ int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos) } if (node) { /* pos == i */ - struct memtype *this = container_of(node, struct memtype, rb); + struct memtype *this = rb_entry(node, struct memtype, rb); *out = *this; return 0; } else { -- cgit From 4d8bb00604b182b62e7786bae0e58e0befeeff85 Mon Sep 17 00:00:00 2001 From: Piotr Luc Date: Fri, 20 Jan 2017 14:22:37 +0100 Subject: x86/cpufeature: Enable RING3MWAIT for Knights Mill Enable ring 3 MONITOR/MWAIT for Intel Xeon Phi codenamed Knights Mill. We can't guarantee that this (KNM) will be the last CPU model that needs this hack. But, we do recognize that this is far from optimal, and there is an effort to ensure we don't keep doing extending this hack forever. Signed-off-by: Piotr Luc Cc: Piotr.Luc@intel.com Cc: dave.hansen@linux.intel.com Link: http://lkml.kernel.org/r/1484918557-15481-6-git-send-email-grzegorz.andrejczuk@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/intel.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index da2401a4b0f4..a4c4ff9b27e4 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -79,8 +79,15 @@ static void probe_xeon_phi_r3mwait(struct cpuinfo_x86 *c) * Ring 3 MONITOR/MWAIT feature cannot be detected without * cpu model and family comparison. */ - if (c->x86 != 6 || c->x86_model != INTEL_FAM6_XEON_PHI_KNL) + if (c->x86 != 6) return; + switch (c->x86_model) { + case INTEL_FAM6_XEON_PHI_KNL: + case INTEL_FAM6_XEON_PHI_KNM: + break; + default: + return; + } if (ring3mwait_disabled) { msr_clear_bit(MSR_MISC_FEATURE_ENABLES, -- cgit From 79a8b9aa388b0620cc1d525d7c0f0d9a8a85e08e Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 5 Feb 2017 11:50:21 +0100 Subject: x86/CPU/AMD: Bring back Compute Unit ID Commit: a33d331761bc ("x86/CPU/AMD: Fix Bulldozer topology") restored the initial approach we had with the Fam15h topology of enumerating CU (Compute Unit) threads as cores. And this is still correct - they're beefier than HT threads but still have some shared functionality. Our current approach has a problem with the Mad Max Steam game, for example. Yves Dionne reported a certain "choppiness" while playing on v4.9.5. 
That problem stems most likely from the fact that the CU threads share resources within one CU and when we schedule to a thread of a different compute unit, this incurs latency due to migrating the working set to a different CU through the caches. When the thread siblings mask mirrors that aspect of the CUs and threads, the scheduler pays attention to it and tries to schedule within one CU first. Which takes care of the latency, of course. Reported-by: Yves Dionne Signed-off-by: Borislav Petkov Cc: # 4.9 Cc: Brice Goglin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yazen Ghannam Link: http://lkml.kernel.org/r/20170205105022.8705-1-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 1 + arch/x86/kernel/cpu/amd.c | 9 ++++++++- arch/x86/kernel/cpu/common.c | 1 + arch/x86/kernel/smpboot.c | 12 +++++++++--- 4 files changed, 19 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 1be64da0384e..e6cfe7ba2d65 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -104,6 +104,7 @@ struct cpuinfo_x86 { __u8 x86_phys_bits; /* CPUID returned core id bits: */ __u8 x86_coreid_bits; + __u8 cu_id; /* Max extended CPUID function supported: */ __u32 extended_cpuid_level; /* Maximum supported CPUID level, -1=no CPUID: */ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 1d3167269a67..20dc44d1e6be 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -309,8 +309,15 @@ static void amd_get_topology(struct cpuinfo_x86 *c) /* get information required for multi-node processors */ if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { + u32 eax, ebx, ecx, edx; - node_id = cpuid_ecx(0x8000001e) & 7; + cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); + + node_id = ecx & 0xff; + smp_num_siblings = ((ebx >> 8) & 0xff) + 1; + + if (c->x86 == 0x15) + c->cu_id = ebx & 0xff; /* * We may have multiple LLCs if L3 caches exist, so check if we diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 9bab7a8a4293..ede03e849a8b 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1015,6 +1015,7 @@ static void identify_cpu(struct cpuinfo_x86 *c) c->x86_model_id[0] = '\0'; /* Unset */ c->x86_max_cores = 1; c->x86_coreid_bits = 0; + c->cu_id = 0xff; #ifdef CONFIG_X86_64 c->x86_clflush_size = 64; c->x86_phys_bits = 36; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 46732dc3b73c..99b920d0e516 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -433,9 +433,15 @@ static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) int cpu1 = c->cpu_index, cpu2 = o->cpu_index; if (c->phys_proc_id == o->phys_proc_id && - per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2) && - c->cpu_core_id == o->cpu_core_id) - return topology_sane(c, o, "smt"); + per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2)) { + if (c->cpu_core_id == o->cpu_core_id) + return topology_sane(c, o, "smt"); + + if ((c->cu_id != 0xff) && + (o->cu_id != 0xff) && + (c->cu_id == o->cu_id)) + return topology_sane(c, o, "smt"); + } } else if (c->phys_proc_id == o->phys_proc_id && c->cpu_core_id == o->cpu_core_id) { -- cgit From 08b259631b5a1d912af4832847b5642f377d9101 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Sun, 5 Feb 2017 11:50:22 +0100 Subject: x86/CPU/AMD: Fix Zen SMT topology After: a33d331761bc ("x86/CPU/AMD: Fix Bulldozer topology") our SMT scheduling topology for Fam17h 
systems is broken, because the ThreadId is included in the ApicId when SMT is enabled. So, without further decoding cpu_core_id is unique for each thread rather than the same for threads on the same core. This didn't affect systems with SMT disabled. Make cpu_core_id be what it is defined to be. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Cc: # 4.9 Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170205105022.8705-2-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 20dc44d1e6be..2b4cf04239b6 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -319,6 +319,13 @@ static void amd_get_topology(struct cpuinfo_x86 *c) if (c->x86 == 0x15) c->cu_id = ebx & 0xff; + if (c->x86 >= 0x17) { + c->cpu_core_id = ebx & 0xff; + + if (smp_num_siblings > 1) + c->x86_max_cores /= smp_num_siblings; + } + /* * We may have multiple LLCs if L3 caches exist, so check if we * have an L3 cache by looking at the L3 cache CPUID leaf. -- cgit From b6263178b8dbd9fe70d55f136c2a1da39309520e Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 6 Feb 2017 18:55:43 +0900 Subject: kprobes/x86: Use hlist_for_each_entry() instead of hlist_for_each_entry_safe() Use hlist_for_each_entry() in the first loop in the kretprobe trampoline_handler() function, because it doesn't change the hlist. Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/148637493309.19245.12546866092052500584.stgit@devbox Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index eb3509338ae0..520b8dfe1640 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -745,7 +745,7 @@ __visible __used void *trampoline_handler(struct pt_regs *regs) * will be the real return address, and all the rest will * point to kretprobe_trampoline. */ - hlist_for_each_entry_safe(ri, tmp, head, hlist) { + hlist_for_each_entry(ri, head, hlist) { if (ri->task != current) /* another task is sharing our hash bucket */ continue; -- cgit From 5773ebfee729acf93b330664eab4c8d77edc2193 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Mon, 6 Feb 2017 17:43:49 +0100 Subject: x86/kconfig: Remove misleading note regarding hibernation and KASLR There used to be a restriction with KASLR and hibernation, but this is no longer true, and since commit: 65fe935dd238 ("x86/KASLR, x86/power: Remove x86 hibernation restrictions") the parameter "kaslr" does no longer exist. Signed-off-by: Niklas Cassel Cc: Kees Cook Cc: Linus Torvalds Cc: Niklas Cassel Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1486399429-23078-1-git-send-email-niklass@axis.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e487493bbd47..4e6dbca03aed 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1994,10 +1994,6 @@ config RANDOMIZE_BASE theoretically possible, but the implementations are further limited due to memory layouts. - If CONFIG_HIBERNATE is also enabled, KASLR is disabled at boot - time. 
To enable it, boot with "kaslr" on the kernel command - line (which will also disable hibernation). - If unsure, say N. # Relocation on x86 needs some additional build support -- cgit From 543113d2f4b5dd40d46a95502effe86b845dfe34 Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Tue, 7 Feb 2017 12:44:48 +0800 Subject: x86/apic: Fix a typo in a comment line s/bringin /bringing Signed-off-by: Dou Liyang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: trivial@kernel.org Link: http://lkml.kernel.org/r/1486442688-24690-1-git-send-email-douly.fnst@cn.fujitsu.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index fdb9c46227cc..8567c851172c 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1246,7 +1246,7 @@ static void lapic_setup_esr(void) /** * setup_local_APIC - setup the local APIC * - * Used to setup local APIC while initializing BSP or bringin up APs. + * Used to setup local APIC while initializing BSP or bringing up APs. * Always called with preemption disabled. */ void setup_local_APIC(void) -- cgit From a2cd2f3f29f26782b7484b32e2af172e29313717 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 6 Feb 2017 11:22:40 +0000 Subject: x86/efi: Allow invocation of arbitrary runtime services Provide the ability to perform mixed-mode runtime service calls for x86 in the same way the following commit provided the ability to invoke for boot services: 0a637ee61247bd ("x86/efi: Allow invocation of arbitrary boot services") Suggested-by: Lukas Wunner Signed-off-by: David Howells Signed-off-by: Ard Biesheuvel Cc: Linus Torvalds Cc: Matt Fleming Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/1486380166-31868-2-git-send-email-ard.biesheuvel@linaro.org Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/eboot.c | 1 + arch/x86/boot/compressed/head_32.S | 6 +++--- arch/x86/boot/compressed/head_64.S | 8 ++++---- arch/x86/include/asm/efi.h | 5 +++++ 4 files changed, 13 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index 6d3aeabbce68..f99978db6b6f 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -32,6 +32,7 @@ static void setup_boot_services##bits(struct efi_config *c) \ \ table = (typeof(table))sys_table; \ \ + c->runtime_services = table->runtime; \ c->boot_services = table->boottime; \ c->text_output = table->con_out; \ } diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index fd0b6a272dd5..d85b9625e836 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -82,7 +82,7 @@ ENTRY(efi_pe_entry) /* Relocate efi_config->call() */ leal efi32_config(%esi), %eax - add %esi, 32(%eax) + add %esi, 40(%eax) pushl %eax call make_boot_params @@ -108,7 +108,7 @@ ENTRY(efi32_stub_entry) /* Relocate efi_config->call() */ leal efi32_config(%esi), %eax - add %esi, 32(%eax) + add %esi, 40(%eax) pushl %eax 2: call efi_main @@ -264,7 +264,7 @@ relocated: #ifdef CONFIG_EFI_STUB .data efi32_config: - .fill 4,8,0 + .fill 5,8,0 .long efi_call_phys .long 0 .byte 0 diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 4d85e600db78..d2ae1f821e0c 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -264,7 
+264,7 @@ ENTRY(efi_pe_entry) /* * Relocate efi_config->call(). */ - addq %rbp, efi64_config+32(%rip) + addq %rbp, efi64_config+40(%rip) movq %rax, %rdi call make_boot_params @@ -284,7 +284,7 @@ handover_entry: * Relocate efi_config->call(). */ movq efi_config(%rip), %rax - addq %rbp, 32(%rax) + addq %rbp, 40(%rax) 2: movq efi_config(%rip), %rdi call efi_main @@ -456,14 +456,14 @@ efi_config: #ifdef CONFIG_EFI_MIXED .global efi32_config efi32_config: - .fill 4,8,0 + .fill 5,8,0 .quad efi64_thunk .byte 0 #endif .global efi64_config efi64_config: - .fill 4,8,0 + .fill 5,8,0 .quad efi_call .byte 1 #endif /* CONFIG_EFI_STUB */ diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index e99675b9c861..2f77bcefe6b4 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -191,6 +191,7 @@ static inline efi_status_t efi_thunk_set_virtual_address_map( struct efi_config { u64 image_handle; u64 table; + u64 runtime_services; u64 boot_services; u64 text_output; efi_status_t (*call)(unsigned long, ...); @@ -226,6 +227,10 @@ static inline bool efi_is_64bit(void) #define __efi_call_early(f, ...) \ __efi_early()->call((unsigned long)f, __VA_ARGS__); +#define efi_call_runtime(f, ...) \ + __efi_early()->call(efi_table_attr(efi_runtime_services, f, \ + __efi_early()->runtime_services), __VA_ARGS__) + extern bool efi_reboot_required(void); #else -- cgit From de8cb458625c164bb3f93c4e415e479afce8fa9d Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 6 Feb 2017 11:22:43 +0000 Subject: efi: Get and store the secure boot status Get the firmware's secure-boot status in the kernel boot wrapper and stash it somewhere that the main kernel image can find. The efi_get_secureboot() function is extracted from the ARM stub and (a) generalised so that it can be called from x86 and (b) made to use efi_call_runtime() so that it can be run in mixed-mode. For x86, it is stored in boot_params and can be overridden by the boot loader or kexec. This allows secure-boot mode to be passed on to a new kernel. Suggested-by: Lukas Wunner Signed-off-by: David Howells Signed-off-by: Ard Biesheuvel Cc: Linus Torvalds Cc: Matt Fleming Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/1486380166-31868-5-git-send-email-ard.biesheuvel@linaro.org [ Small readability edits. ] Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/eboot.c | 7 +++++++ arch/x86/include/uapi/asm/bootparam.h | 3 ++- arch/x86/kernel/asm-offsets.c | 1 + 3 files changed, 10 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index f99978db6b6f..801c7a158e55 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -988,6 +988,13 @@ struct boot_params *efi_main(struct efi_config *c, else setup_boot_services32(efi_early); + /* + * If the boot loader gave us a value for secure_boot then we use that, + * otherwise we ask the BIOS. 
+ */ + if (boot_params->secure_boot == efi_secureboot_mode_unset) + boot_params->secure_boot = efi_get_secureboot(sys_table); + setup_graphics(boot_params); setup_efi_pci(boot_params); diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index b10bf319ed20..5138dacf8bb8 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -135,7 +135,8 @@ struct boot_params { __u8 eddbuf_entries; /* 0x1e9 */ __u8 edd_mbr_sig_buf_entries; /* 0x1ea */ __u8 kbd_status; /* 0x1eb */ - __u8 _pad5[3]; /* 0x1ec */ + __u8 secure_boot; /* 0x1ec */ + __u8 _pad5[2]; /* 0x1ed */ /* * The sentinel is set to a nonzero value (0xff) in header.S. * diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index c62e015b126c..de827d6ac8c2 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -81,6 +81,7 @@ void common(void) { BLANK(); OFFSET(BP_scratch, boot_params, scratch); + OFFSET(BP_secure_boot, boot_params, secure_boot); OFFSET(BP_loadflags, boot_params, hdr.loadflags); OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); OFFSET(BP_version, boot_params, hdr.version); -- cgit From 9661b332041dab63ba2e5222b40a9f916c1368a9 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 6 Feb 2017 11:22:45 +0000 Subject: efi: Print the secure boot status in x86 setup_arch() Print the secure boot status in the x86 setup_arch() function, but otherwise do nothing more for now. More functionality will be added later, but this at least allows for testing. Signed-off-by: David Howells [ Use efi_enabled() instead of IS_ENABLED(CONFIG_EFI). ] Signed-off-by: Ard Biesheuvel Cc: Linus Torvalds Cc: Matt Fleming Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/1486380166-31868-7-git-send-email-ard.biesheuvel@linaro.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 4cfba947d774..69780edf0dde 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1176,6 +1176,20 @@ void __init setup_arch(char **cmdline_p) /* Allocate bigger log buffer */ setup_log_buf(1); + if (efi_enabled(EFI_BOOT)) { + switch (boot_params.secure_boot) { + case efi_secureboot_mode_disabled: + pr_info("Secure boot disabled\n"); + break; + case efi_secureboot_mode_enabled: + pr_info("Secure boot enabled\n"); + break; + default: + pr_info("Secure boot could not be determined\n"); + break; + } + } + reserve_initrd(); acpi_table_upgrade(); -- cgit From febf2407418a2d6c042fcd77b206040449cb9a70 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 6 Feb 2017 18:01:51 +0100 Subject: x86/ACPI: keep x86_cpu_to_acpiid mapping valid on CPU hotplug We may or may not have all possible CPUs in MADT on boot but in any case we're overwriting x86_cpu_to_acpiid mapping with U32_MAX when acpi_register_lapic() is called again on the CPU hotplug path: acpi_processor_hotadd_init() -> acpi_map_cpu() -> acpi_register_lapic() As we have the required acpi_id information in acpi_processor_hotadd_init() propagate it to acpi_map_cpu() to always keep x86_cpu_to_acpiid mapping valid. Reported-by: Andrew Jones Signed-off-by: Vitaly Kuznetsov Signed-off-by: Rafael J. 
Wysocki --- arch/x86/kernel/acpi/boot.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 64422f850e95..04bc5f3f9aa1 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -723,11 +723,12 @@ int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) return 0; } -int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, int *pcpu) +int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, u32 acpi_id, + int *pcpu) { int cpu; - cpu = acpi_register_lapic(physid, U32_MAX, ACPI_MADT_ENABLED); + cpu = acpi_register_lapic(physid, acpi_id, ACPI_MADT_ENABLED); if (cpu < 0) { pr_info(PREFIX "Unable to map lapic to logical cpu number\n"); return cpu; -- cgit From 5a7670ee23f2c07a639c263b70140eaf1da9f68f Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Fri, 3 Feb 2017 16:57:22 -0500 Subject: x86/boot/32: Convert the 32-bit pgtable setup code from assembly to C The new Xen PVH entry point requires page tables to be setup by the kernel since it is entered with paging disabled. Pull the common code out of head_32.S so that mk_early_pgtbl_32() can be invoked from both the new Xen entry point and the existing startup_32() code. Convert resulting common code to C. Signed-off-by: Boris Ostrovsky Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: matt@codeblueprint.co.uk Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1481215471-9639-1-git-send-email-boris.ostrovsky@oracle.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pgtable_32.h | 32 ++++++++++ arch/x86/kernel/head32.c | 62 +++++++++++++++++++ arch/x86/kernel/head_32.S | 121 +++----------------------------------- 3 files changed, 101 insertions(+), 114 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index b6c0b404898a..fbc73360aea0 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -27,6 +27,7 @@ struct vm_area_struct; extern pgd_t swapper_pg_dir[1024]; extern pgd_t initial_page_table[1024]; +extern pmd_t initial_pg_pmd[]; static inline void pgtable_cache_init(void) { } static inline void check_pgt_cache(void) { } @@ -75,4 +76,35 @@ do { \ #define kern_addr_valid(kaddr) (0) #endif +/* + * This is how much memory in addition to the memory covered up to + * and including _end we need mapped initially. + * We need: + * (KERNEL_IMAGE_SIZE/4096) / 1024 pages (worst case, non PAE) + * (KERNEL_IMAGE_SIZE/4096) / 512 + 4 pages (worst case for PAE) + * + * Modulo rounding, each megabyte assigned here requires a kilobyte of + * memory, which is currently unreclaimed. + * + * This should be a multiple of a page. + * + * KERNEL_IMAGE_SIZE should be greater than pa(_end) + * and small than max_low_pfn, otherwise will waste some page table entries + */ +#if PTRS_PER_PMD > 1 +#define PAGE_TABLE_SIZE(pages) (((pages) / PTRS_PER_PMD) + PTRS_PER_PGD) +#else +#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD) +#endif + +/* + * Number of possible pages in the lowmem region. + * + * We shift 2 by 31 instead of 1 by 32 to the left in order to avoid a + * gas warning about overflowing shift count when gas has been compiled + * with only a host target support using a 32-bit type for internal + * representation. 
+ */ +#define LOWMEM_PAGES ((((2<<31) - __PAGE_OFFSET) >> PAGE_SHIFT)) + #endif /* _ASM_X86_PGTABLE_32_H */ diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index f16c55bfc090..e5fb436a6548 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -49,3 +49,65 @@ asmlinkage __visible void __init i386_start_kernel(void) start_kernel(); } + +/* + * Initialize page tables. This creates a PDE and a set of page + * tables, which are located immediately beyond __brk_base. The variable + * _brk_end is set up to point to the first "safe" location. + * Mappings are created both at virtual address 0 (identity mapping) + * and PAGE_OFFSET for up to _end. + * + * In PAE mode initial_page_table is statically defined to contain + * enough entries to cover the VMSPLIT option (that is the top 1, 2 or 3 + * entries). The identity mapping is handled by pointing two PGD entries + * to the first kernel PMD. Note the upper half of each PMD or PTE are + * always zero at this stage. + */ +void __init mk_early_pgtbl_32(void) +{ +#ifdef __pa +#undef __pa +#endif +#define __pa(x) ((unsigned long)(x) - PAGE_OFFSET) + pte_t pte, *ptep; + int i; + unsigned long *ptr; + /* Enough space to fit pagetables for the low memory linear map */ + const unsigned long limit = __pa(_end) + + (PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT); +#ifdef CONFIG_X86_PAE + pmd_t pl2, *pl2p = (pmd_t *)__pa(initial_pg_pmd); +#define SET_PL2(pl2, val) { (pl2).pmd = (val); } +#else + pgd_t pl2, *pl2p = (pgd_t *)__pa(initial_page_table); +#define SET_PL2(pl2, val) { (pl2).pgd = (val); } +#endif + + ptep = (pte_t *)__pa(__brk_base); + pte.pte = PTE_IDENT_ATTR; + + while ((pte.pte & PTE_PFN_MASK) < limit) { + + SET_PL2(pl2, (unsigned long)ptep | PDE_IDENT_ATTR); + *pl2p = pl2; +#ifndef CONFIG_X86_PAE + /* Kernel PDE entry */ + *(pl2p + ((PAGE_OFFSET >> PGDIR_SHIFT))) = pl2; +#endif + for (i = 0; i < PTRS_PER_PTE; i++) { + *ptep = pte; + pte.pte += PAGE_SIZE; + ptep++; + } + + pl2p++; + } + + ptr = (unsigned long *)__pa(&max_pfn_mapped); + /* Can't use pte_pfn() since it's a call with CONFIG_PARAVIRT */ + *ptr = (pte.pte & PTE_PFN_MASK) >> PAGE_SHIFT; + + ptr = (unsigned long *)__pa(&_brk_end); + *ptr = (unsigned long)ptep + PAGE_OFFSET; +} + diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 4e8577d03372..1f85ee8f9439 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -24,6 +24,7 @@ #include #include #include +#include /* Physical address */ #define pa(X) ((X) - __PAGE_OFFSET) @@ -41,43 +42,9 @@ #define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability #define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id -/* - * This is how much memory in addition to the memory covered up to - * and including _end we need mapped initially. - * We need: - * (KERNEL_IMAGE_SIZE/4096) / 1024 pages (worst case, non PAE) - * (KERNEL_IMAGE_SIZE/4096) / 512 + 4 pages (worst case for PAE) - * - * Modulo rounding, each megabyte assigned here requires a kilobyte of - * memory, which is currently unreclaimed. - * - * This should be a multiple of a page. - * - * KERNEL_IMAGE_SIZE should be greater than pa(_end) - * and small than max_low_pfn, otherwise will waste some page table entries - */ - -#if PTRS_PER_PMD > 1 -#define PAGE_TABLE_SIZE(pages) (((pages) / PTRS_PER_PMD) + PTRS_PER_PGD) -#else -#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD) -#endif #define SIZEOF_PTREGS 17*4 -/* - * Number of possible pages in the lowmem region. 
- * - * We shift 2 by 31 instead of 1 by 32 to the left in order to avoid a - * gas warning about overflowing shift count when gas has been compiled - * with only a host target support using a 32-bit type for internal - * representation. - */ -LOWMEM_PAGES = (((2<<31) - __PAGE_OFFSET) >> PAGE_SHIFT) - -/* Enough space to fit pagetables for the low memory linear map */ -MAPPING_BEYOND_END = PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT - /* * Worst-case size of the kernel mapping we need to make: * a relocatable kernel can live anywhere in lowmem, so we need to be able @@ -160,90 +127,15 @@ ENTRY(startup_32) call load_ucode_bsp #endif -/* - * Initialize page tables. This creates a PDE and a set of page - * tables, which are located immediately beyond __brk_base. The variable - * _brk_end is set up to point to the first "safe" location. - * Mappings are created both at virtual address 0 (identity mapping) - * and PAGE_OFFSET for up to _end. - */ -#ifdef CONFIG_X86_PAE - - /* - * In PAE mode initial_page_table is statically defined to contain - * enough entries to cover the VMSPLIT option (that is the top 1, 2 or 3 - * entries). The identity mapping is handled by pointing two PGD entries - * to the first kernel PMD. - * - * Note the upper half of each PMD or PTE are always zero at this stage. - */ - -#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */ - - xorl %ebx,%ebx /* %ebx is kept at zero */ - - movl $pa(__brk_base), %edi - movl $pa(initial_pg_pmd), %edx - movl $PTE_IDENT_ATTR, %eax -10: - leal PDE_IDENT_ATTR(%edi),%ecx /* Create PMD entry */ - movl %ecx,(%edx) /* Store PMD entry */ - /* Upper half already zero */ - addl $8,%edx - movl $512,%ecx -11: - stosl - xchgl %eax,%ebx - stosl - xchgl %eax,%ebx - addl $0x1000,%eax - loop 11b - - /* - * End condition: we must map up to the end + MAPPING_BEYOND_END. - */ - movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp - cmpl %ebp,%eax - jb 10b -1: - addl $__PAGE_OFFSET, %edi - movl %edi, pa(_brk_end) - shrl $12, %eax - movl %eax, pa(max_pfn_mapped) + /* Create early pagetables. */ + call mk_early_pgtbl_32 /* Do early initialization of the fixmap area */ movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax +#ifdef CONFIG_X86_PAE +#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */ movl %eax,pa(initial_pg_pmd+0x1000*KPMDS-8) -#else /* Not PAE */ - -page_pde_offset = (__PAGE_OFFSET >> 20); - - movl $pa(__brk_base), %edi - movl $pa(initial_page_table), %edx - movl $PTE_IDENT_ATTR, %eax -10: - leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */ - movl %ecx,(%edx) /* Store identity PDE entry */ - movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */ - addl $4,%edx - movl $1024, %ecx -11: - stosl - addl $0x1000,%eax - loop 11b - /* - * End condition: we must map up to the end + MAPPING_BEYOND_END. 
- */ - movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp - cmpl %ebp,%eax - jb 10b - addl $__PAGE_OFFSET, %edi - movl %edi, pa(_brk_end) - shrl $12, %eax - movl %eax, pa(max_pfn_mapped) - - /* Do early initialization of the fixmap area */ - movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax +#else movl %eax,pa(initial_page_table+0xffc) #endif @@ -666,6 +558,7 @@ ENTRY(setup_once_ref) __PAGE_ALIGNED_BSS .align PAGE_SIZE #ifdef CONFIG_X86_PAE +.globl initial_pg_pmd initial_pg_pmd: .fill 1024*KPMDS,4,0 #else -- cgit From 063334f30543597430f172bd7690d21e3590e148 Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Fri, 3 Feb 2017 16:57:22 -0500 Subject: xen/x86: Remove PVH support We are replacing existing PVH guests with new implementation. We are keeping xen_pvh_domain() macro (for now set to zero) because when we introduce new PVH implementation later in this series we will reuse current PVH-specific code (xen_pvh_gnttab_setup()), and that code is conditioned by 'if (xen_pvh_domain())'. (We will also need a noop xen_pvh_domain() for !CONFIG_XEN_PVH). Signed-off-by: Boris Ostrovsky Reviewed-by: Juergen Gross Reviewed-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 140 +++++++---------------------------------------- arch/x86/xen/mmu.c | 21 +------ arch/x86/xen/setup.c | 37 +------------ arch/x86/xen/smp.c | 78 +++++++++----------------- arch/x86/xen/smp.h | 8 --- arch/x86/xen/xen-head.S | 62 ++------------------- arch/x86/xen/xen-ops.h | 1 - 7 files changed, 53 insertions(+), 294 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 51ef95232725..828f1b226f56 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1138,10 +1138,11 @@ void xen_setup_vcpu_info_placement(void) xen_vcpu_setup(cpu); } - /* xen_vcpu_setup managed to place the vcpu_info within the - * percpu area for all cpus, so make use of it. Note that for - * PVH we want to use native IRQ mechanism. */ - if (have_vcpu_info_placement && !xen_pvh_domain()) { + /* + * xen_vcpu_setup managed to place the vcpu_info within the + * percpu area for all cpus, so make use of it. + */ + if (have_vcpu_info_placement) { pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct); pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct); pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct); @@ -1413,49 +1414,9 @@ static void __init xen_boot_params_init_edd(void) * Set up the GDT and segment registers for -fstack-protector. Until * we do this, we have to be careful not to call any stack-protected * function, which is most of the kernel. - * - * Note, that it is __ref because the only caller of this after init - * is PVH which is not going to use xen_load_gdt_boot or other - * __init functions. */ -static void __ref xen_setup_gdt(int cpu) +static void xen_setup_gdt(int cpu) { - if (xen_feature(XENFEAT_auto_translated_physmap)) { -#ifdef CONFIG_X86_64 - unsigned long dummy; - - load_percpu_segment(cpu); /* We need to access per-cpu area */ - switch_to_new_gdt(cpu); /* GDT and GS set */ - - /* We are switching of the Xen provided GDT to our HVM mode - * GDT. The new GDT has __KERNEL_CS with CS.L = 1 - * and we are jumping to reload it. - */ - asm volatile ("pushq %0\n" - "leaq 1f(%%rip),%0\n" - "pushq %0\n" - "lretq\n" - "1:\n" - : "=&r" (dummy) : "0" (__KERNEL_CS)); - - /* - * While not needed, we also set the %es, %ds, and %fs - * to zero. We don't care about %ss as it is NULL. 
- * Strictly speaking this is not needed as Xen zeros those - * out (and also MSR_FS_BASE, MSR_GS_BASE, MSR_KERNEL_GS_BASE) - * - * Linux zeros them in cpu_init() and in secondary_startup_64 - * (for BSP). - */ - loadsegment(es, 0); - loadsegment(ds, 0); - loadsegment(fs, 0); -#else - /* PVH: TODO Implement. */ - BUG(); -#endif - return; /* PVH does not need any PV GDT ops. */ - } pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry_boot; pv_cpu_ops.load_gdt = xen_load_gdt_boot; @@ -1466,59 +1427,6 @@ static void __ref xen_setup_gdt(int cpu) pv_cpu_ops.load_gdt = xen_load_gdt; } -#ifdef CONFIG_XEN_PVH -/* - * A PV guest starts with default flags that are not set for PVH, set them - * here asap. - */ -static void xen_pvh_set_cr_flags(int cpu) -{ - - /* Some of these are setup in 'secondary_startup_64'. The others: - * X86_CR0_TS, X86_CR0_PE, X86_CR0_ET are set by Xen for HVM guests - * (which PVH shared codepaths), while X86_CR0_PG is for PVH. */ - write_cr0(read_cr0() | X86_CR0_MP | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM); - - if (!cpu) - return; - /* - * For BSP, PSE PGE are set in probe_page_size_mask(), for APs - * set them here. For all, OSFXSR OSXMMEXCPT are set in fpu__init_cpu(). - */ - if (boot_cpu_has(X86_FEATURE_PSE)) - cr4_set_bits_and_update_boot(X86_CR4_PSE); - - if (boot_cpu_has(X86_FEATURE_PGE)) - cr4_set_bits_and_update_boot(X86_CR4_PGE); -} - -/* - * Note, that it is ref - because the only caller of this after init - * is PVH which is not going to use xen_load_gdt_boot or other - * __init functions. - */ -void __ref xen_pvh_secondary_vcpu_init(int cpu) -{ - xen_setup_gdt(cpu); - xen_pvh_set_cr_flags(cpu); -} - -static void __init xen_pvh_early_guest_init(void) -{ - if (!xen_feature(XENFEAT_auto_translated_physmap)) - return; - - BUG_ON(!xen_feature(XENFEAT_hvm_callback_vector)); - - xen_pvh_early_cpu_init(0, false); - xen_pvh_set_cr_flags(0); - -#ifdef CONFIG_X86_32 - BUG(); /* PVH: Implement proper support. */ -#endif -} -#endif /* CONFIG_XEN_PVH */ - static void __init xen_dom0_set_legacy_features(void) { x86_platform.legacy.rtc = 1; @@ -1555,24 +1463,17 @@ asmlinkage __visible void __init xen_start_kernel(void) xen_domain_type = XEN_PV_DOMAIN; xen_setup_features(); -#ifdef CONFIG_XEN_PVH - xen_pvh_early_guest_init(); -#endif + xen_setup_machphys_mapping(); /* Install Xen paravirt ops */ pv_info = xen_info; pv_init_ops = xen_init_ops; - if (!xen_pvh_domain()) { - pv_cpu_ops = xen_cpu_ops; + pv_cpu_ops = xen_cpu_ops; - x86_platform.get_nmi_reason = xen_get_nmi_reason; - } + x86_platform.get_nmi_reason = xen_get_nmi_reason; - if (xen_feature(XENFEAT_auto_translated_physmap)) - x86_init.resources.memory_setup = xen_auto_xlated_memory_setup; - else - x86_init.resources.memory_setup = xen_memory_setup; + x86_init.resources.memory_setup = xen_memory_setup; x86_init.oem.arch_setup = xen_arch_setup; x86_init.oem.banner = xen_banner; @@ -1665,18 +1566,15 @@ asmlinkage __visible void __init xen_start_kernel(void) /* set the limit of our address space */ xen_reserve_top(); - /* PVH: runs at default kernel iopl of 0 */ - if (!xen_pvh_domain()) { - /* - * We used to do this in xen_arch_setup, but that is too late - * on AMD were early_cpu_init (run before ->arch_setup()) calls - * early_amd_init which pokes 0xcf8 port. 
- */ - set_iopl.iopl = 1; - rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); - if (rc != 0) - xen_raw_printk("physdev_op failed %d\n", rc); - } + /* + * We used to do this in xen_arch_setup, but that is too late + * on AMD were early_cpu_init (run before ->arch_setup()) calls + * early_amd_init which pokes 0xcf8 port. + */ + set_iopl.iopl = 1; + rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); + if (rc != 0) + xen_raw_printk("physdev_op failed %d\n", rc); #ifdef CONFIG_X86_32 /* set up basic CPUID stuff */ diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 7d5afdb417cc..f6740b5b1738 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1792,10 +1792,6 @@ static void __init set_page_prot_flags(void *addr, pgprot_t prot, unsigned long pfn = __pa(addr) >> PAGE_SHIFT; pte_t pte = pfn_pte(pfn, prot); - /* For PVH no need to set R/O or R/W to pin them or unpin them. */ - if (xen_feature(XENFEAT_auto_translated_physmap)) - return; - if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, flags)) BUG(); } @@ -1902,8 +1898,7 @@ static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end, * level2_ident_pgt, and level2_kernel_pgt. This means that only the * kernel has a physical mapping to start with - but that's enough to * get __va working. We need to fill in the rest of the physical - * mapping once some sort of allocator has been set up. NOTE: for - * PVH, the page tables are native. + * mapping once some sort of allocator has been set up. */ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) { @@ -2812,16 +2807,6 @@ static int do_remap_gfn(struct vm_area_struct *vma, BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO))); - if (xen_feature(XENFEAT_auto_translated_physmap)) { -#ifdef CONFIG_XEN_PVH - /* We need to update the local page tables and the xen HAP */ - return xen_xlate_remap_gfn_array(vma, addr, gfn, nr, err_ptr, - prot, domid, pages); -#else - return -EINVAL; -#endif - } - rmd.mfn = gfn; rmd.prot = prot; /* We use the err_ptr to indicate if there we are doing a contiguous @@ -2915,10 +2900,6 @@ int xen_unmap_domain_gfn_range(struct vm_area_struct *vma, if (!pages || !xen_feature(XENFEAT_auto_translated_physmap)) return 0; -#ifdef CONFIG_XEN_PVH - return xen_xlate_unmap_gfn_range(vma, numpgs, pages); -#else return -EINVAL; -#endif } EXPORT_SYMBOL_GPL(xen_unmap_domain_gfn_range); diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index f3f7b41116f7..a8c306cf8868 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -914,39 +914,6 @@ char * __init xen_memory_setup(void) return "Xen"; } -/* - * Machine specific memory setup for auto-translated guests. - */ -char * __init xen_auto_xlated_memory_setup(void) -{ - struct xen_memory_map memmap; - int i; - int rc; - - memmap.nr_entries = ARRAY_SIZE(xen_e820_map); - set_xen_guest_handle(memmap.buffer, xen_e820_map); - - rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); - if (rc < 0) - panic("No memory map (%d)\n", rc); - - xen_e820_map_entries = memmap.nr_entries; - - sanitize_e820_map(xen_e820_map, ARRAY_SIZE(xen_e820_map), - &xen_e820_map_entries); - - for (i = 0; i < xen_e820_map_entries; i++) - e820_add_region(xen_e820_map[i].addr, xen_e820_map[i].size, - xen_e820_map[i].type); - - /* Remove p2m info, it is not needed. */ - xen_start_info->mfn_list = 0; - xen_start_info->first_p2m_pfn = 0; - xen_start_info->nr_p2m_frames = 0; - - return "Xen"; -} - /* * Set the bit indicating "nosegneg" library variants should be used. 
* We only need to bother in pure 32-bit mode; compat 32-bit processes @@ -1032,8 +999,8 @@ void __init xen_pvmmu_arch_setup(void) void __init xen_arch_setup(void) { xen_panic_handler_init(); - if (!xen_feature(XENFEAT_auto_translated_physmap)) - xen_pvmmu_arch_setup(); + + xen_pvmmu_arch_setup(); #ifdef CONFIG_ACPI if (!(xen_start_info->flags & SIF_INITDOMAIN)) { diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 311acad7dad2..0dee6f59ea82 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -99,18 +99,8 @@ static void cpu_bringup(void) local_irq_enable(); } -/* - * Note: cpu parameter is only relevant for PVH. The reason for passing it - * is we can't do smp_processor_id until the percpu segments are loaded, for - * which we need the cpu number! So we pass it in rdi as first parameter. - */ -asmlinkage __visible void cpu_bringup_and_idle(int cpu) +asmlinkage __visible void cpu_bringup_and_idle(void) { -#ifdef CONFIG_XEN_PVH - if (xen_feature(XENFEAT_auto_translated_physmap) && - xen_feature(XENFEAT_supervisor_mode_kernel)) - xen_pvh_secondary_vcpu_init(cpu); -#endif cpu_bringup(); cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } @@ -404,61 +394,47 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) gdt = get_cpu_gdt_table(cpu); #ifdef CONFIG_X86_32 - /* Note: PVH is not yet supported on x86_32. */ ctxt->user_regs.fs = __KERNEL_PERCPU; ctxt->user_regs.gs = __KERNEL_STACK_CANARY; #endif memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt)); - if (!xen_feature(XENFEAT_auto_translated_physmap)) { - ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; - ctxt->flags = VGCF_IN_KERNEL; - ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ - ctxt->user_regs.ds = __USER_DS; - ctxt->user_regs.es = __USER_DS; - ctxt->user_regs.ss = __KERNEL_DS; + ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; + ctxt->flags = VGCF_IN_KERNEL; + ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ + ctxt->user_regs.ds = __USER_DS; + ctxt->user_regs.es = __USER_DS; + ctxt->user_regs.ss = __KERNEL_DS; - xen_copy_trap_info(ctxt->trap_ctxt); + xen_copy_trap_info(ctxt->trap_ctxt); - ctxt->ldt_ents = 0; + ctxt->ldt_ents = 0; - BUG_ON((unsigned long)gdt & ~PAGE_MASK); + BUG_ON((unsigned long)gdt & ~PAGE_MASK); - gdt_mfn = arbitrary_virt_to_mfn(gdt); - make_lowmem_page_readonly(gdt); - make_lowmem_page_readonly(mfn_to_virt(gdt_mfn)); + gdt_mfn = arbitrary_virt_to_mfn(gdt); + make_lowmem_page_readonly(gdt); + make_lowmem_page_readonly(mfn_to_virt(gdt_mfn)); - ctxt->gdt_frames[0] = gdt_mfn; - ctxt->gdt_ents = GDT_ENTRIES; + ctxt->gdt_frames[0] = gdt_mfn; + ctxt->gdt_ents = GDT_ENTRIES; - ctxt->kernel_ss = __KERNEL_DS; - ctxt->kernel_sp = idle->thread.sp0; + ctxt->kernel_ss = __KERNEL_DS; + ctxt->kernel_sp = idle->thread.sp0; #ifdef CONFIG_X86_32 - ctxt->event_callback_cs = __KERNEL_CS; - ctxt->failsafe_callback_cs = __KERNEL_CS; + ctxt->event_callback_cs = __KERNEL_CS; + ctxt->failsafe_callback_cs = __KERNEL_CS; #else - ctxt->gs_base_kernel = per_cpu_offset(cpu); -#endif - ctxt->event_callback_eip = - (unsigned long)xen_hypervisor_callback; - ctxt->failsafe_callback_eip = - (unsigned long)xen_failsafe_callback; - ctxt->user_regs.cs = __KERNEL_CS; - per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); - } -#ifdef CONFIG_XEN_PVH - else { - /* - * The vcpu comes on kernel page tables which have the NX pte - * bit set. This means before DS/SS is touched, NX in - * EFER must be set. Hence the following assembly glue code. 
- */ - ctxt->user_regs.eip = (unsigned long)xen_pvh_early_cpu_init; - ctxt->user_regs.rdi = cpu; - ctxt->user_regs.rsi = true; /* entry == true */ - } + ctxt->gs_base_kernel = per_cpu_offset(cpu); #endif + ctxt->event_callback_eip = + (unsigned long)xen_hypervisor_callback; + ctxt->failsafe_callback_eip = + (unsigned long)xen_failsafe_callback; + ctxt->user_regs.cs = __KERNEL_CS; + per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); + ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_gfn(swapper_pg_dir)); if (HYPERVISOR_vcpu_op(VCPUOP_initialise, xen_vcpu_nr(cpu), ctxt)) diff --git a/arch/x86/xen/smp.h b/arch/x86/xen/smp.h index c5c16dc4f694..9beef333584a 100644 --- a/arch/x86/xen/smp.h +++ b/arch/x86/xen/smp.h @@ -21,12 +21,4 @@ static inline int xen_smp_intr_init(unsigned int cpu) static inline void xen_smp_intr_free(unsigned int cpu) {} #endif /* CONFIG_SMP */ -#ifdef CONFIG_XEN_PVH -extern void xen_pvh_early_cpu_init(int cpu, bool entry); -#else -static inline void xen_pvh_early_cpu_init(int cpu, bool entry) -{ -} -#endif - #endif diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 7f8d8abf4c1a..37794e42b67d 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -16,25 +16,6 @@ #include #include -#ifdef CONFIG_XEN_PVH -#define PVH_FEATURES_STR "|writable_descriptor_tables|auto_translated_physmap|supervisor_mode_kernel" -/* Note the lack of 'hvm_callback_vector'. Older hypervisor will - * balk at this being part of XEN_ELFNOTE_FEATURES, so we put it in - * XEN_ELFNOTE_SUPPORTED_FEATURES which older hypervisors will ignore. - */ -#define PVH_FEATURES ((1 << XENFEAT_writable_page_tables) | \ - (1 << XENFEAT_auto_translated_physmap) | \ - (1 << XENFEAT_supervisor_mode_kernel) | \ - (1 << XENFEAT_hvm_callback_vector)) -/* The XENFEAT_writable_page_tables is not stricly necessary as we set that - * up regardless whether this CONFIG option is enabled or not, but it - * clarifies what the right flags need to be. - */ -#else -#define PVH_FEATURES_STR "" -#define PVH_FEATURES (0) -#endif - __INIT ENTRY(startup_xen) cld @@ -54,41 +35,6 @@ ENTRY(startup_xen) __FINIT -#ifdef CONFIG_XEN_PVH -/* - * xen_pvh_early_cpu_init() - early PVH VCPU initialization - * @cpu: this cpu number (%rdi) - * @entry: true if this is a secondary vcpu coming up on this entry - * point, false if this is the boot CPU being initialized for - * the first time (%rsi) - * - * Note: This is called as a function on the boot CPU, and is the entry point - * on the secondary CPU. - */ -ENTRY(xen_pvh_early_cpu_init) - mov %rsi, %r11 - - /* Gather features to see if NX implemented. 
*/ - mov $0x80000001, %eax - cpuid - mov %edx, %esi - - mov $MSR_EFER, %ecx - rdmsr - bts $_EFER_SCE, %eax - - bt $20, %esi - jnc 1f /* No NX, skip setting it */ - bts $_EFER_NX, %eax -1: wrmsr -#ifdef CONFIG_SMP - cmp $0, %r11b - jne cpu_bringup_and_idle -#endif - ret - -#endif /* CONFIG_XEN_PVH */ - .pushsection .text .balign PAGE_SIZE ENTRY(hypercall_page) @@ -114,10 +60,10 @@ ENTRY(hypercall_page) #endif ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen) ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page) - ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .ascii "!writable_page_tables|pae_pgdir_above_4gb"; .asciz PVH_FEATURES_STR) - ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES, .long (PVH_FEATURES) | - (1 << XENFEAT_writable_page_tables) | - (1 << XENFEAT_dom0)) + ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, + .ascii "!writable_page_tables|pae_pgdir_above_4gb") + ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES, + .long (1 << XENFEAT_writable_page_tables) | (1 << XENFEAT_dom0)) ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index ac0a2b0f9e62..f6a41c41ebc7 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -146,5 +146,4 @@ __visible void xen_adjust_exception_frame(void); extern int xen_panic_handler_init(void); -void xen_pvh_secondary_vcpu_init(int cpu); #endif /* XEN_OPS_H */ -- cgit From 7243b93345f7f8de260e8f5b4670803e64fcbb00 Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Sun, 5 Feb 2017 19:50:52 -0500 Subject: xen/pvh: Bootstrap PVH guest Start PVH guest at XEN_ELFNOTE_PHYS32_ENTRY address. Setup hypercall page, initialize boot_params, enable early page tables. Since this stub is executed before kernel entry point we cannot use variables in .bss which is cleared by kernel. We explicitly place variables that are initialized here into .data. While adjusting xen_hvm_init_shared_info() make it use cpuid_e?x() instead of cpuid() (wherever possible). Signed-off-by: Boris Ostrovsky Reviewed-by: Juergen Gross --- arch/x86/xen/Kconfig | 2 +- arch/x86/xen/Makefile | 1 + arch/x86/xen/enlighten.c | 124 +++++++++++++++++++++++++++++++++--- arch/x86/xen/xen-pvh.S | 161 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 277 insertions(+), 11 deletions(-) create mode 100644 arch/x86/xen/xen-pvh.S (limited to 'arch/x86') diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index c7b15f3e2cf3..76b6dbd627df 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -53,5 +53,5 @@ config XEN_DEBUG_FS config XEN_PVH bool "Support for running as a PVH guest" - depends on X86_64 && XEN && XEN_PVHVM + depends on XEN && XEN_PVHVM && ACPI def_bool n diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index e47e52787d32..cb0164aee156 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -23,3 +23,4 @@ obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o obj-$(CONFIG_XEN_DOM0) += vga.o obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o obj-$(CONFIG_XEN_EFI) += efi.o +obj-$(CONFIG_XEN_PVH) += xen-pvh.o diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 828f1b226f56..d2144f7c8fab 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -176,6 +177,20 @@ struct tls_descs { */ static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc); +#ifdef CONFIG_XEN_PVH +/* + * PVH variables. 
+ * + * xen_pvh and pvh_bootparams need to live in data segment since they + * are used after startup_{32|64}, which clear .bss, are invoked. + */ +bool xen_pvh __attribute__((section(".data"))) = 0; +struct boot_params pvh_bootparams __attribute__((section(".data"))); + +struct hvm_start_info pvh_start_info; +unsigned int pvh_start_info_sz = sizeof(pvh_start_info); +#endif + static void clamp_max_cpus(void) { #ifdef CONFIG_SMP @@ -1656,6 +1671,90 @@ asmlinkage __visible void __init xen_start_kernel(void) #endif } +#ifdef CONFIG_XEN_PVH +static void __init init_pvh_bootparams(void) +{ + struct xen_memory_map memmap; + unsigned int i; + int rc; + + memset(&pvh_bootparams, 0, sizeof(pvh_bootparams)); + + memmap.nr_entries = ARRAY_SIZE(pvh_bootparams.e820_map); + set_xen_guest_handle(memmap.buffer, pvh_bootparams.e820_map); + rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); + if (rc) { + xen_raw_printk("XENMEM_memory_map failed (%d)\n", rc); + BUG(); + } + + if (memmap.nr_entries < E820MAX - 1) { + pvh_bootparams.e820_map[memmap.nr_entries].addr = + ISA_START_ADDRESS; + pvh_bootparams.e820_map[memmap.nr_entries].size = + ISA_END_ADDRESS - ISA_START_ADDRESS; + pvh_bootparams.e820_map[memmap.nr_entries].type = + E820_RESERVED; + memmap.nr_entries++; + } else + xen_raw_printk("Warning: Can fit ISA range into e820\n"); + + sanitize_e820_map(pvh_bootparams.e820_map, + ARRAY_SIZE(pvh_bootparams.e820_map), + &memmap.nr_entries); + + pvh_bootparams.e820_entries = memmap.nr_entries; + for (i = 0; i < pvh_bootparams.e820_entries; i++) + e820_add_region(pvh_bootparams.e820_map[i].addr, + pvh_bootparams.e820_map[i].size, + pvh_bootparams.e820_map[i].type); + + pvh_bootparams.hdr.cmd_line_ptr = + pvh_start_info.cmdline_paddr; + + /* The first module is always ramdisk. */ + if (pvh_start_info.nr_modules) { + struct hvm_modlist_entry *modaddr = + __va(pvh_start_info.modlist_paddr); + pvh_bootparams.hdr.ramdisk_image = modaddr->paddr; + pvh_bootparams.hdr.ramdisk_size = modaddr->size; + } + + /* + * See Documentation/x86/boot.txt. + * + * Version 2.12 supports Xen entry point but we will use default x86/PC + * environment (i.e. hardware_subarch 0). + */ + pvh_bootparams.hdr.version = 0x212; + pvh_bootparams.hdr.type_of_loader = (9 << 4) | 0; /* Xen loader */ +} + +/* + * This routine (and those that it might call) should not use + * anything that lives in .bss since that segment will be cleared later. + */ +void __init xen_prepare_pvh(void) +{ + u32 msr; + u64 pfn; + + if (pvh_start_info.magic != XEN_HVM_START_MAGIC_VALUE) { + xen_raw_printk("Error: Unexpected magic value (0x%08x)\n", + pvh_start_info.magic); + BUG(); + } + + xen_pvh = 1; + + msr = cpuid_ebx(xen_cpuid_base() + 2); + pfn = __pa(hypercall_page); + wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); + + init_pvh_bootparams(); +} +#endif + void __ref xen_hvm_init_shared_info(void) { int cpu; @@ -1695,20 +1794,29 @@ void __ref xen_hvm_init_shared_info(void) static void __init init_hvm_pv_info(void) { int major, minor; - uint32_t eax, ebx, ecx, edx, pages, msr, base; - u64 pfn; + uint32_t eax, ebx, ecx, edx, base; base = xen_cpuid_base(); - cpuid(base + 1, &eax, &ebx, &ecx, &edx); + eax = cpuid_eax(base + 1); major = eax >> 16; minor = eax & 0xffff; printk(KERN_INFO "Xen version %d.%d.\n", major, minor); - cpuid(base + 2, &pages, &msr, &ecx, &edx); + xen_domain_type = XEN_HVM_DOMAIN; - pfn = __pa(hypercall_page); - wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); + /* PVH set up hypercall page in xen_prepare_pvh(). 
*/ + if (xen_pvh_domain()) + pv_info.name = "Xen PVH"; + else { + u64 pfn; + uint32_t msr; + + pv_info.name = "Xen HVM"; + msr = cpuid_ebx(base + 2); + pfn = __pa(hypercall_page); + wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); + } xen_setup_features(); @@ -1717,10 +1825,6 @@ static void __init init_hvm_pv_info(void) this_cpu_write(xen_vcpu_id, ebx); else this_cpu_write(xen_vcpu_id, smp_processor_id()); - - pv_info.name = "Xen HVM"; - - xen_domain_type = XEN_HVM_DOMAIN; } #endif diff --git a/arch/x86/xen/xen-pvh.S b/arch/x86/xen/xen-pvh.S new file mode 100644 index 000000000000..5e246716d58f --- /dev/null +++ b/arch/x86/xen/xen-pvh.S @@ -0,0 +1,161 @@ +/* + * Copyright C 2016, Oracle and/or its affiliates. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see . + */ + + .code32 + .text +#define _pa(x) ((x) - __START_KERNEL_map) + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + __HEAD + +/* + * Entry point for PVH guests. + * + * Xen ABI specifies the following register state when we come here: + * + * - `ebx`: contains the physical memory address where the loader has placed + * the boot start info structure. + * - `cr0`: bit 0 (PE) must be set. All the other writeable bits are cleared. + * - `cr4`: all bits are cleared. + * - `cs `: must be a 32-bit read/execute code segment with a base of ‘0’ + * and a limit of ‘0xFFFFFFFF’. The selector value is unspecified. + * - `ds`, `es`: must be a 32-bit read/write data segment with a base of + * ‘0’ and a limit of ‘0xFFFFFFFF’. The selector values are all + * unspecified. + * - `tr`: must be a 32-bit TSS (active) with a base of '0' and a limit + * of '0x67'. + * - `eflags`: bit 17 (VM) must be cleared. Bit 9 (IF) must be cleared. + * Bit 8 (TF) must be cleared. Other bits are all unspecified. + * + * All other processor registers and flag bits are unspecified. The OS is in + * charge of setting up it's own stack, GDT and IDT. + */ + +ENTRY(pvh_start_xen) + cld + + lgdt (_pa(gdt)) + + mov $(__BOOT_DS),%eax + mov %eax,%ds + mov %eax,%es + mov %eax,%ss + + /* Stash hvm_start_info. */ + mov $_pa(pvh_start_info), %edi + mov %ebx, %esi + mov _pa(pvh_start_info_sz), %ecx + shr $2,%ecx + rep + movsl + + mov $_pa(early_stack_end), %esp + + /* Enable PAE mode. */ + mov %cr4, %eax + orl $X86_CR4_PAE, %eax + mov %eax, %cr4 + +#ifdef CONFIG_X86_64 + /* Enable Long mode. */ + mov $MSR_EFER, %ecx + rdmsr + btsl $_EFER_LME, %eax + wrmsr + + /* Enable pre-constructed page tables. */ + mov $_pa(init_level4_pgt), %eax + mov %eax, %cr3 + mov $(X86_CR0_PG | X86_CR0_PE), %eax + mov %eax, %cr0 + + /* Jump to 64-bit mode. */ + ljmp $__KERNEL_CS, $_pa(1f) + + /* 64-bit entry point. */ + .code64 +1: + call xen_prepare_pvh + + /* startup_64 expects boot_params in %rsi. 
*/ + mov $_pa(pvh_bootparams), %rsi + mov $_pa(startup_64), %rax + jmp *%rax + +#else /* CONFIG_X86_64 */ + + call mk_early_pgtbl_32 + + mov $_pa(initial_page_table), %eax + mov %eax, %cr3 + + mov %cr0, %eax + or $(X86_CR0_PG | X86_CR0_PE), %eax + mov %eax, %cr0 + + ljmp $__BOOT_CS, $1f +1: + call xen_prepare_pvh + mov $_pa(pvh_bootparams), %esi + + /* startup_32 doesn't expect paging and PAE to be on. */ + ljmp $__BOOT_CS, $_pa(2f) +2: + mov %cr0, %eax + and $~X86_CR0_PG, %eax + mov %eax, %cr0 + mov %cr4, %eax + and $~X86_CR4_PAE, %eax + mov %eax, %cr4 + + ljmp $__BOOT_CS, $_pa(startup_32) +#endif +END(pvh_start_xen) + + .section ".init.data","aw" + .balign 8 +gdt: + .word gdt_end - gdt_start + .long _pa(gdt_start) + .word 0 +gdt_start: + .quad 0x0000000000000000 /* NULL descriptor */ + .quad 0x0000000000000000 /* reserved */ +#ifdef CONFIG_X86_64 + .quad GDT_ENTRY(0xa09a, 0, 0xfffff) /* __KERNEL_CS */ +#else + .quad GDT_ENTRY(0xc09a, 0, 0xfffff) /* __KERNEL_CS */ +#endif + .quad GDT_ENTRY(0xc092, 0, 0xfffff) /* __KERNEL_DS */ +gdt_end: + + .balign 4 +early_stack: + .fill 256, 1, 0 +early_stack_end: + + ELFNOTE(Xen, XEN_ELFNOTE_PHYS32_ENTRY, + _ASM_PTR (pvh_start_xen - __START_KERNEL_map)) -- cgit From 5adad168e586cb381633f45d181bb729b04393a5 Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Sun, 5 Feb 2017 19:50:58 -0500 Subject: xen/pvh: Make sure we don't use ACPI_IRQ_MODEL_PIC for SCI Since we are not using PIC and (at least currently) don't have IOAPIC we want to make sure that acpi_irq_model doesn't stay set to ACPI_IRQ_MODEL_PIC (which is the default value). If we allowed it to stay then acpi_os_install_interrupt_handler() would try (and fail) to request_irq() for PIC. Instead we set the model to ACPI_IRQ_MODEL_PLATFORM which will prevent this from happening. Signed-off-by: Boris Ostrovsky Reviewed-by: Juergen Gross --- arch/x86/xen/enlighten.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index d2144f7c8fab..6d406f3465bc 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1672,6 +1672,16 @@ asmlinkage __visible void __init xen_start_kernel(void) } #ifdef CONFIG_XEN_PVH + +static void xen_pvh_arch_setup(void) +{ +#ifdef CONFIG_ACPI + /* Make sure we don't fall back to (default) ACPI_IRQ_MODEL_PIC. */ + if (nr_ioapics == 0) + acpi_irq_model = ACPI_IRQ_MODEL_PLATFORM; +#endif +} + static void __init init_pvh_bootparams(void) { struct xen_memory_map memmap; @@ -1752,6 +1762,8 @@ void __init xen_prepare_pvh(void) wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); init_pvh_bootparams(); + + x86_init.oem.arch_setup = xen_pvh_arch_setup; } #endif -- cgit From bcc57df281d93dfa502c824e9f73e0191c3f7c34 Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Mon, 6 Feb 2017 10:57:15 -0500 Subject: xen/pvh: PVH guests always have PV devices Signed-off-by: Boris Ostrovsky Reviewed-by: Juergen Gross Reviewed-by: Konrad Rzeszutek Wilk --- arch/x86/xen/platform-pci-unplug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c index 90d1b83cf35f..33a783c77d96 100644 --- a/arch/x86/xen/platform-pci-unplug.c +++ b/arch/x86/xen/platform-pci-unplug.c @@ -73,8 +73,8 @@ bool xen_has_pv_devices(void) if (!xen_domain()) return false; - /* PV domains always have them. */ - if (xen_pv_domain()) + /* PV and PVH domains always have them. 
*/ + if (xen_pv_domain() || xen_pvh_domain()) return true; /* And user has xen_platform_pci=0 set in guest config as -- cgit From 7a1c44ebc5ac2e2c28d95b0da6060728c334e7e4 Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Mon, 6 Feb 2017 10:58:06 -0500 Subject: xen/pvh: Use Xen's emergency_restart op for PVH guests Using native_machine_emergency_restart (called during reboot) will lead PVH guests to machine_real_restart() where we try to use real_mode_header which is not initialized. Signed-off-by: Boris Ostrovsky Reviewed-by: Juergen Gross --- arch/x86/xen/enlighten.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 6d406f3465bc..ec1d5c46e58f 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1924,6 +1924,9 @@ static void __init xen_hvm_guest_init(void) x86_init.irqs.intr_init = xen_init_IRQ; xen_hvm_init_time_ops(); xen_hvm_init_mmu_ops(); + + if (xen_pvh_domain()) + machine_ops.emergency_restart = xen_emergency_restart; #ifdef CONFIG_KEXEC_CORE machine_ops.shutdown = xen_hvm_shutdown; machine_ops.crash_shutdown = xen_hvm_crash_shutdown; -- cgit From 42cf014d38d8822cce63703a467e00f65d000952 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 25 Jan 2017 11:58:57 +0100 Subject: KVM: nVMX: kmap() can't fail kmap() can't fail, therefore it will always return a valid pointer. Let's just get rid of the unnecessary checks. Signed-off-by: David Hildenbrand Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d850d5d36182..693e4203b666 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4973,10 +4973,6 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) return 0; vapic_page = kmap(vmx->nested.virtual_apic_page); - if (!vapic_page) { - WARN_ON(1); - return -ENOMEM; - } __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page); kunmap(vmx->nested.virtual_apic_page); @@ -9738,11 +9734,6 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, return false; } msr_bitmap_l1 = (unsigned long *)kmap(page); - if (!msr_bitmap_l1) { - nested_release_page_clean(page); - WARN_ON(1); - return false; - } memset(msr_bitmap_l0, 0xff, PAGE_SIZE); -- cgit From 6342c50ad12e8ce0736e722184a7dbdea4a3477f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 25 Jan 2017 11:58:58 +0100 Subject: KVM: nVMX: vmx_complete_nested_posted_interrupt() can't fail vmx_complete_nested_posted_interrupt() can't fail, let's turn it into a void function. 
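For reference (an illustration, not part of either patch): both this change and the preceding kmap() cleanup rest on the same observation, namely that kmap() has no error path, so checking its return value for NULL is dead code. A sketch that mirrors the x86 implementation in arch/x86/mm/highmem_32.c:

#include <linux/highmem.h>

/* Illustrative sketch only; behaviourally the same as the real kmap(). */
void *kmap_sketch(struct page *page)
{
        might_sleep();
        if (!PageHighMem(page))
                return page_address(page); /* lowmem: direct mapping, never NULL */
        return kmap_high(page);            /* highmem: sleeps for a pkmap slot, never NULL */
}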
Signed-off-by: David Hildenbrand Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 693e4203b666..7c3e42623090 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4953,7 +4953,7 @@ static bool vmx_get_enable_apicv(void) return enable_apicv; } -static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) +static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); int max_irr; @@ -4964,13 +4964,13 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) vmx->nested.pi_pending) { vmx->nested.pi_pending = false; if (!pi_test_and_clear_on(vmx->nested.pi_desc)) - return 0; + return; max_irr = find_last_bit( (unsigned long *)vmx->nested.pi_desc->pir, 256); if (max_irr == 256) - return 0; + return; vapic_page = kmap(vmx->nested.virtual_apic_page); __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page); @@ -4983,7 +4983,6 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) vmcs_write16(GUEST_INTR_STATUS, status); } } - return 0; } static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu) @@ -10695,7 +10694,8 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) return 0; } - return vmx_complete_nested_posted_interrupt(vcpu); + vmx_complete_nested_posted_interrupt(vcpu); + return 0; } static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) -- cgit From 55dd00a73a518281bc846dc5de1a718349431eb2 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 24 Jan 2017 15:09:39 -0200 Subject: KVM: x86: add KVM_HC_CLOCK_PAIRING hypercall Add a hypercall to retrieve the host realtime clock and the TSC value used to calculate that clock read. Used to implement clock synchronization between host and guest. 
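For illustration, a guest-side caller could look roughly like the sketch below. This is not part of the (host-only) patch: kvm_hypercall2() and the KVM_HC_CLOCK_PAIRING number live in the kvm_para.h headers outside this arch/x86-limited log, and the result buffer must sit in directly mapped guest memory so that __pa() yields its guest-physical address.

#include <linux/kvm_para.h>

/* Sketch: ask the host for a paired (CLOCK_REALTIME, TSC) reading. */
static struct kvm_clock_pairing clock_pair;

static int example_read_host_walltime(void)
{
        long ret;

        ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, __pa(&clock_pair),
                             KVM_CLOCK_PAIRING_WALLCLOCK);
        if (ret)
                return -EOPNOTSUPP;     /* old host, or host clock not TSC based */

        /*
         * clock_pair.sec/.nsec now hold the host realtime clock, and
         * clock_pair.tsc the guest-visible TSC value corresponding to that
         * reading (already scaled through this vCPU's TSC offset).
         */
        return 0;
}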
Signed-off-by: Marcelo Tosatti Signed-off-by: Paolo Bonzini --- arch/x86/include/uapi/asm/kvm_para.h | 9 +++++ arch/x86/kvm/x86.c | 66 ++++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 1421a6585126..cff0bb6556f8 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -50,6 +50,15 @@ struct kvm_steal_time { __u32 pad[11]; }; +#define KVM_CLOCK_PAIRING_WALLCLOCK 0 +struct kvm_clock_pairing { + __s64 sec; + __s64 nsec; + __u64 tsc; + __u32 flags; + __u32 pad[9]; +}; + #define KVM_STEAL_ALIGNMENT_BITS 5 #define KVM_STEAL_VALID_BITS ((-1ULL << (KVM_STEAL_ALIGNMENT_BITS + 1))) #define KVM_STEAL_RESERVED_MASK (((1 << KVM_STEAL_ALIGNMENT_BITS) - 1 ) << 1) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4fd4d4f35caf..09e5d31dac98 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1142,6 +1142,7 @@ struct pvclock_gtod_data { u64 boot_ns; u64 nsec_base; + u64 wall_time_sec; }; static struct pvclock_gtod_data pvclock_gtod_data; @@ -1165,6 +1166,8 @@ static void update_pvclock_gtod(struct timekeeper *tk) vdata->boot_ns = boot_ns; vdata->nsec_base = tk->tkr_mono.xtime_nsec; + vdata->wall_time_sec = tk->xtime_sec; + write_seqcount_end(&vdata->seq); } #endif @@ -1626,6 +1629,28 @@ static int do_monotonic_boot(s64 *t, u64 *cycle_now) return mode; } +static int do_realtime(struct timespec *ts, u64 *cycle_now) +{ + struct pvclock_gtod_data *gtod = &pvclock_gtod_data; + unsigned long seq; + int mode; + u64 ns; + + do { + seq = read_seqcount_begin(>od->seq); + mode = gtod->clock.vclock_mode; + ts->tv_sec = gtod->wall_time_sec; + ns = gtod->nsec_base; + ns += vgettsc(cycle_now); + ns >>= gtod->clock.shift; + } while (unlikely(read_seqcount_retry(>od->seq, seq))); + + ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); + ts->tv_nsec = ns; + + return mode; +} + /* returns true if host is using tsc clocksource */ static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *cycle_now) { @@ -1635,6 +1660,17 @@ static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *cycle_now) return do_monotonic_boot(kernel_ns, cycle_now) == VCLOCK_TSC; } + +/* returns true if host is using tsc clocksource */ +static bool kvm_get_walltime_and_clockread(struct timespec *ts, + u64 *cycle_now) +{ + /* checked again under seqlock below */ + if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC) + return false; + + return do_realtime(ts, cycle_now) == VCLOCK_TSC; +} #endif /* @@ -6112,6 +6148,33 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_halt); +static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr, + unsigned long clock_type) +{ + struct kvm_clock_pairing clock_pairing; + struct timespec ts; + cycle_t cycle; + int ret; + + if (clock_type != KVM_CLOCK_PAIRING_WALLCLOCK) + return -KVM_EOPNOTSUPP; + + if (kvm_get_walltime_and_clockread(&ts, &cycle) == false) + return -KVM_EOPNOTSUPP; + + clock_pairing.sec = ts.tv_sec; + clock_pairing.nsec = ts.tv_nsec; + clock_pairing.tsc = kvm_read_l1_tsc(vcpu, cycle); + clock_pairing.flags = 0; + + ret = 0; + if (kvm_write_guest(vcpu->kvm, paddr, &clock_pairing, + sizeof(struct kvm_clock_pairing))) + ret = -KVM_EFAULT; + + return ret; +} + /* * kvm_pv_kick_cpu_op: Kick a vcpu. 
* @@ -6176,6 +6239,9 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1); ret = 0; break; + case KVM_HC_CLOCK_PAIRING: + ret = kvm_pv_clock_pairing(vcpu, a0, a1); + break; default: ret = -KVM_ENOSYS; break; -- cgit From ad21fc4faa2a1f919bac1073b885df9310dbc581 Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Mon, 6 Feb 2017 16:31:57 -0800 Subject: arch: Move CONFIG_DEBUG_RODATA and CONFIG_SET_MODULE_RONX to be common There are multiple architectures that support CONFIG_DEBUG_RODATA and CONFIG_SET_MODULE_RONX. These options also now have the ability to be turned off at runtime. Move these to an architecture independent location and make these options def_bool y for almost all of those arches. Signed-off-by: Laura Abbott Acked-by: Ingo Molnar Acked-by: Heiko Carstens Signed-off-by: Kees Cook --- arch/x86/Kconfig | 5 ++--- arch/x86/Kconfig.debug | 11 ----------- 2 files changed, 2 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e487493bbd47..13e1bf4b0fe5 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -54,6 +54,8 @@ config X86 select ARCH_HAS_MMIO_FLUSH select ARCH_HAS_PMEM_API if X86_64 select ARCH_HAS_SG_CHAIN + select ARCH_HAS_STRICT_KERNEL_RWX + select ARCH_HAS_STRICT_MODULE_RWX select ARCH_HAS_UBSAN_SANITIZE_ALL select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI @@ -309,9 +311,6 @@ config ARCH_SUPPORTS_UPROBES config FIX_EARLYCON_MEM def_bool y -config DEBUG_RODATA - def_bool y - config PGTABLE_LEVELS int default 4 if X86_64 diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 67eec55093a5..69cdd0b2176b 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -109,17 +109,6 @@ config DEBUG_WX If in doubt, say "Y". -config DEBUG_SET_MODULE_RONX - bool "Set loadable kernel module data as NX and text as RO" - depends on MODULES - ---help--- - This option helps catch unintended modifications to loadable - kernel module's text and read-only data. It also prevents execution - of module data. Such protection may interfere with run-time code - patching and dynamic kernel tracing - and they might also protect - against certain classes of kernel exploits. - If in doubt, say "N". - config DEBUG_NX_TEST tristate "Testcase for the NX non-executable stack feature" depends on DEBUG_KERNEL && m -- cgit From 80fbd89cbd07287a7013006c14ddec923b7a4ff6 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 8 Feb 2017 10:57:24 +0100 Subject: KVM: x86: fix compilation Fix rebase breakage from commit 55dd00a73a51 ("KVM: x86: add KVM_HC_CLOCK_PAIRING hypercall", 2017-01-24), courtesy of the "I could have sworn I had pushed the right branch" department. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 09e5d31dac98..96dd7dd13ee6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6153,7 +6153,7 @@ static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr, { struct kvm_clock_pairing clock_pairing; struct timespec ts; - cycle_t cycle; + u64 cycle; int ret; if (clock_type != KVM_CLOCK_PAIRING_WALLCLOCK) -- cgit From f4066c2bc4d0de4e5dcbff21dae41e89fe8f38c0 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 24 Jan 2017 15:09:41 -0200 Subject: kvmclock: export kvmclock clocksource and data pointers To be used by KVM PTP driver. 
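As a hedged sketch of the intended consumer (the PTP driver itself lives outside arch/x86 and is not shown in this log; the helper names below are assumed from asm/pvclock.h and linux/timekeeping.h of this kernel and should be treated as illustrative): exporting 'kvm_clock' lets such a driver tell get_device_system_crosststamp() which clocksource its counter value refers to, while pvclock_pvti_cpu0_va() supplies the per-cpu data needed to convert the raw TSC returned by KVM_HC_CLOCK_PAIRING into kvm-clock cycles.

#include <linux/smp.h>
#include <linux/timekeeping.h>
#include <asm/kvmclock.h>
#include <asm/pvclock.h>

/*
 * Sketch: convert a (sec, nsec, tsc) triple obtained via the hypercall from
 * the previous patch into the cross-timestamp form the timekeeping core
 * expects.  The pvclock version/retry protocol is omitted for brevity.
 */
static void example_fill_crossts(s64 sec, s64 nsec, u64 tsc,
                                 ktime_t *device_time,
                                 struct system_counterval_t *system_counter)
{
        struct pvclock_vcpu_time_info *src;
        struct timespec64 ts = { .tv_sec = sec, .tv_nsec = nsec };

        *device_time = timespec64_to_ktime(ts);

        /* Per-cpu pvclock data; caller keeps preemption disabled. */
        src = &pvclock_pvti_cpu0_va()[smp_processor_id()].pvti;
        system_counter->cycles = __pvclock_read_cycles(src, tsc);
        system_counter->cs = &kvm_clock;
}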
Signed-off-by: Marcelo Tosatti Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvmclock.h | 6 ++++++ arch/x86/kernel/kvmclock.c | 5 ++++- 2 files changed, 10 insertions(+), 1 deletion(-) create mode 100644 arch/x86/include/asm/kvmclock.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvmclock.h b/arch/x86/include/asm/kvmclock.h new file mode 100644 index 000000000000..f260bef63591 --- /dev/null +++ b/arch/x86/include/asm/kvmclock.h @@ -0,0 +1,6 @@ +#ifndef _ASM_X86_KVM_CLOCK_H +#define _ASM_X86_KVM_CLOCK_H + +extern struct clocksource kvm_clock; + +#endif /* _ASM_X86_KVM_CLOCK_H */ diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 2a5cafdf8808..995fa260a6da 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -28,6 +28,7 @@ #include #include +#include static int kvmclock __ro_after_init = 1; static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; @@ -49,6 +50,7 @@ struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void) { return hv_clock; } +EXPORT_SYMBOL_GPL(pvclock_pvti_cpu0_va); /* * The wallclock is the time of day when we booted. Since then, some time may @@ -174,13 +176,14 @@ bool kvm_check_and_clear_guest_paused(void) return ret; } -static struct clocksource kvm_clock = { +struct clocksource kvm_clock = { .name = "kvm-clock", .read = kvm_clock_get_cycles, .rating = 400, .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; +EXPORT_SYMBOL_GPL(kvm_clock); int kvm_register_clock(char *txt) { -- cgit From d966564fcdc19e13eb6ba1fbe6b8101070339c3d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 8 Feb 2017 18:08:29 -0800 Subject: Revert "x86/ioapic: Restore IO-APIC irq_chip retrigger callback" This reverts commit 020eb3daaba2857b32c4cf4c82f503d6a00a67de. Gabriel C reports that it causes his machine to not boot, and we haven't tracked down the reason for it yet. Since the bug it fixes has been around for a longish time, we're better off reverting the fix for now. Gabriel says: "It hangs early and freezes with a lot RCU warnings. I bisected it down to : > Ruslan Ruslichenko (1): > x86/ioapic: Restore IO-APIC irq_chip retrigger callback Reverting this one fixes the problem for me.. The box is a PRIMERGY TX200 S5 , 2 socket , 2 x E5520 CPU(s) installed" and Ruslan and Thomas are currently stumped. 
Reported-and-bisected-by: Gabriel C Cc: Ruslan Ruslichenko Cc: Thomas Gleixner Cc: stable@kernel.org # for the backport of the original commit Signed-off-by: Linus Torvalds --- arch/x86/kernel/apic/io_apic.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 52f352b063fd..bd6b8c270c24 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1875,7 +1875,6 @@ static struct irq_chip ioapic_chip __read_mostly = { .irq_ack = irq_chip_ack_parent, .irq_eoi = ioapic_ack_level, .irq_set_affinity = ioapic_set_affinity, - .irq_retrigger = irq_chip_retrigger_hierarchy, .flags = IRQCHIP_SKIP_SET_WAKE, }; @@ -1887,7 +1886,6 @@ static struct irq_chip ioapic_ir_chip __read_mostly = { .irq_ack = irq_chip_ack_parent, .irq_eoi = ioapic_ir_ack_level, .irq_set_affinity = ioapic_set_affinity, - .irq_retrigger = irq_chip_retrigger_hierarchy, .flags = IRQCHIP_SKIP_SET_WAKE, }; -- cgit From 8ef81a9a453f9048c1683e40b540a4221986a2d1 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 9 Feb 2017 16:10:42 +0100 Subject: KVM: x86: hide KVM_HC_CLOCK_PAIRING on 32 bit The newly added hypercall doesn't work on x86-32: arch/x86/kvm/x86.c: In function 'kvm_pv_clock_pairing': arch/x86/kvm/x86.c:6163:6: error: implicit declaration of function 'kvm_get_walltime_and_clockread';did you mean 'kvm_get_time_scale'? [-Werror=implicit-function-declaration] This adds an #ifdef around it, matching the one around the related functions that are also only implemented on 64-bit systems. Fixes: 55dd00a73a51 ("KVM: x86: add KVM_HC_CLOCK_PAIRING hypercall") Signed-off-by: Arnd Bergmann Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 500008f800dc..2f64e5d0ae53 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6148,6 +6148,7 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_halt); +#ifdef CONFIG_X86_64 static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr, unsigned long clock_type) { @@ -6174,6 +6175,7 @@ static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr, return ret; } +#endif /* * kvm_pv_kick_cpu_op: Kick a vcpu. @@ -6239,9 +6241,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1); ret = 0; break; +#ifdef CONFIG_X86_64 case KVM_HC_CLOCK_PAIRING: ret = kvm_pv_clock_pairing(vcpu, a0, a1); break; +#endif default: ret = -KVM_ENOSYS; break; -- cgit From f2e04214ef7f7e49d1e06109ad1b2718155dab25 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 9 Feb 2017 16:08:41 +0100 Subject: x86/tsc: Avoid the large time jump when sanitizing TSC ADJUST Olof reported that on a machine which has a BIOS wreckaged TSC the timestamps in dmesg are making a large jump because the TSC value is jumping forward after resetting the TSC ADJUST register to a sane value. This can be avoided by calling the TSC ADJUST saniziting function before initializing the per cpu sched clock machinery. That takes the offset into account and avoid the time jump. What cannot be avoided is that the 'Firmware Bug' warnings on the secondary CPUs are printed with the large time offsets because it would be too much effort and ugly hackery to print those warnings into a buffer and emit them after the adjustemt on the starting CPUs. It's a firmware bug and should be fixed in firmware. 
The weird timestamps are collateral damage and just illustrate the sillyness of the BIOS folks: [ 0.397445] smp: Bringing up secondary CPUs ... [ 0.402100] x86: Booting SMP configuration: [ 0.406343] .... node #0, CPUs: #1 [1265776479.930667] [Firmware Bug]: TSC ADJUST differs: Reference CPU0: -2978888639075328 CPU1: -2978888639183101 [1265776479.944664] TSC ADJUST synchronize: Reference CPU0: 0 CPU1: -2978888639183101 [ 0.508119] #2 [1265776480.032346] [Firmware Bug]: TSC ADJUST differs: Reference CPU0: -2978888639075328 CPU2: -2978888639183677 [1265776480.044192] TSC ADJUST synchronize: Reference CPU0: 0 CPU2: -2978888639183677 [ 0.607643] #3 [1265776480.131874] [Firmware Bug]: TSC ADJUST differs: Reference CPU0: -2978888639075328 CPU3: -2978888639184530 [1265776480.143720] TSC ADJUST synchronize: Reference CPU0: 0 CPU3: -2978888639184530 [ 0.707108] smp: Brought up 1 node, 4 CPUs [ 0.711271] smpboot: Total of 4 processors activated (21698.88 BogoMIPS) Reported-by: Olof Johansson Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170209151231.411460506@linutronix.de Signed-off-by: Thomas Gleixner --- arch/x86/kernel/tsc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index e41af597aed8..37e7cf544e51 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -1356,6 +1356,9 @@ void __init tsc_init(void) (unsigned long)cpu_khz / 1000, (unsigned long)cpu_khz % 1000); + /* Sanitize TSC ADJUST before cyc2ns gets initialized */ + tsc_store_and_check_tsc_adjust(true); + /* * Secondary CPUs do not run through tsc_init(), so set up * all the scale factors for all CPUs, assuming the same @@ -1386,8 +1389,6 @@ void __init tsc_init(void) if (unsynchronized_tsc()) mark_tsc_unstable("TSCs unsynchronized"); - else - tsc_store_and_check_tsc_adjust(true); check_system_tsc_reliable(); -- cgit From 5f2e71e71410ecb858cfec184ba092adaca61626 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 9 Feb 2017 16:08:42 +0100 Subject: x86/tsc: Make the TSC ADJUST sanitizing work for tsc_reliable When the TSC is marked reliable then the synchronization check is skipped, but that also skips the TSC ADJUST sanitizing code. So on a machine with a wreckaged BIOS the TSC deviation between CPUs might go unnoticed. Let the TSC adjust sanitizing code run unconditionally and just skip the expensive synchronization checks when TSC is marked reliable. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Olof Johansson Link: http://lkml.kernel.org/r/20170209151231.491189912@linutronix.de Signed-off-by: Thomas Gleixner --- arch/x86/kernel/tsc_sync.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index d0db011051a5..728f75378475 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -286,13 +286,6 @@ void check_tsc_sync_source(int cpu) if (unsynchronized_tsc()) return; - if (tsc_clocksource_reliable) { - if (cpu == (nr_cpu_ids-1) || system_state != SYSTEM_BOOTING) - pr_info( - "Skipped synchronization checks as TSC is reliable.\n"); - return; - } - /* * Set the maximum number of test runs to * 1 if the CPU does not provide the TSC_ADJUST MSR @@ -380,14 +373,19 @@ void check_tsc_sync_target(void) int cpus = 2; /* Also aborts if there is no TSC. 
*/ - if (unsynchronized_tsc() || tsc_clocksource_reliable) + if (unsynchronized_tsc()) return; /* * Store, verify and sanitize the TSC adjust register. If * successful skip the test. + * + * The test is also skipped when the TSC is marked reliable. This + * is true for SoCs which have no fallback clocksource. On these + * SoCs the TSC is frequency synchronized, but still the TSC ADJUST + * register might have been wreckaged by the BIOS.. */ - if (tsc_store_and_check_tsc_adjust(false)) { + if (tsc_store_and_check_tsc_adjust(false) || tsc_clocksource_reliable) { atomic_inc(&skip_test); return; } -- cgit From 146fbb766934dc003fcbf755b519acef683576bf Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 10 Feb 2017 12:54:05 +0300 Subject: x86/mm/ptdump: Fix soft lockup in page table walker CONFIG_KASAN=y needs a lot of virtual memory mapped for its shadow. In that case ptdump_walk_pgd_level_core() takes a lot of time to walk across all page tables and doing this without a rescheduling causes soft lockups: NMI watchdog: BUG: soft lockup - CPU#3 stuck for 23s! [swapper/0:1] ... Call Trace: ptdump_walk_pgd_level_core+0x40c/0x550 ptdump_walk_pgd_level_checkwx+0x17/0x20 mark_rodata_ro+0x13b/0x150 kernel_init+0x2f/0x120 ret_from_fork+0x2c/0x40 I guess that this issue might arise even without KASAN on huge machines with several terabytes of RAM. Stick cond_resched() in pgd loop to fix this. Reported-by: Tobias Regnery Signed-off-by: Andrey Ryabinin Cc: kasan-dev@googlegroups.com Cc: Alexander Potapenko Cc: "Paul E . McKenney" Cc: Dmitry Vyukov Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20170210095405.31802-1-aryabinin@virtuozzo.com Signed-off-by: Thomas Gleixner --- arch/x86/mm/dump_pagetables.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index ea9c49adaa1f..8aa6bea1cd6c 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -406,6 +407,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, } else note_page(m, &st, __pgprot(0), 1); + cond_resched(); start++; } -- cgit From 9b06e1018abc65585b07c75c5b3f406dbabe7005 Mon Sep 17 00:00:00 2001 From: "K. Y. Srinivasan" Date: Sat, 4 Feb 2017 08:46:23 -0700 Subject: Drivers: hv: Fix the bug in generating the guest ID Fix the bug in the generation of the guest ID. Without this fix the host side telemetry code is broken. Signed-off-by: K. Y. Srinivasan Fixes: 352c9624242d ("Drivers: hv: vmbus: Move the definition of generate_guest_id()") Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/mshyperv.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index f8dc3700de67..56407c6d2397 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -80,7 +80,7 @@ struct ms_hyperv_tsc_page { * */ -#define HV_LINUX_VENDOR_ID 0x8800 +#define HV_LINUX_VENDOR_ID 0x8100 /* * Generate the guest ID based on the guideline described above. 
@@ -91,7 +91,7 @@ static inline __u64 generate_guest_id(__u64 d_info1, __u64 kernel_version, { __u64 guest_id = 0; - guest_id = (((__u64)HV_LINUX_VENDOR_ID) << 56); + guest_id = (((__u64)HV_LINUX_VENDOR_ID) << 48); guest_id |= (d_info1 << 48); guest_id |= (kernel_version << 16); guest_id |= d_info2; -- cgit From dee863b571b0a76e9c549ee99e8782bb4bc6502b Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Sat, 4 Feb 2017 09:57:13 -0700 Subject: hv: export current Hyper-V clocksource As a preparation to implementing Hyper-V PTP device supporting .getcrosststamp we need to export a reference to the current Hyper-V clocksource in use (MSR or TSC page). Signed-off-by: Vitaly Kuznetsov Signed-off-by: K. Y. Srinivasan Signed-off-by: Greg Kroah-Hartman --- arch/x86/hyperv/hv_init.c | 13 +++++++++---- arch/x86/include/asm/mshyperv.h | 3 +++ 2 files changed, 12 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index b371d0e984a9..c224b7df4d21 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -99,6 +99,9 @@ static struct clocksource hyperv_cs_msr = { }; static void *hypercall_pg; +struct clocksource *hyperv_cs; +EXPORT_SYMBOL_GPL(hyperv_cs); + /* * This function is to be invoked early in the boot sequence after the * hypervisor has been detected. @@ -141,10 +144,10 @@ void hyperv_init(void) union hv_x64_msr_hypercall_contents tsc_msr; tsc_pg = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL); - if (!tsc_pg) { - clocksource_register_hz(&hyperv_cs_msr, NSEC_PER_SEC/100); - return; - } + if (!tsc_pg) + goto register_msr_cs; + + hyperv_cs = &hyperv_cs_tsc; rdmsrl(HV_X64_MSR_REFERENCE_TSC, tsc_msr.as_uint64); @@ -161,6 +164,8 @@ void hyperv_init(void) * the partition counter. */ +register_msr_cs: + hyperv_cs = &hyperv_cs_msr; if (ms_hyperv.features & HV_X64_MSR_TIME_REF_COUNT_AVAILABLE) clocksource_register_hz(&hyperv_cs_msr, NSEC_PER_SEC/100); } diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 56407c6d2397..7c9c895432a9 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -3,6 +3,7 @@ #include #include +#include #include /* @@ -168,6 +169,8 @@ void hv_setup_crash_handler(void (*handler)(struct pt_regs *regs)); void hv_remove_crash_handler(void); #if IS_ENABLED(CONFIG_HYPERV) +extern struct clocksource *hyperv_cs; + void hyperv_init(void); void hyperv_report_panic(struct pt_regs *regs); bool hv_is_hypercall_page_setup(void); -- cgit From 372b1e91343e657a7cc5e2e2bcecd5140ac28119 Mon Sep 17 00:00:00 2001 From: "K. Y. Srinivasan" Date: Wed, 8 Feb 2017 18:30:56 -0700 Subject: drivers: hv: Turn off write permission on the hypercall page The hypercall page only needs to be executable but currently it is setup to be writable as well. Fix the issue. Signed-off-by: K. Y. 
Srinivasan Cc: Acked-by: Kees Cook Reported-by: Stephen Hemminger Tested-by: Stephen Hemminger Signed-off-by: Greg Kroah-Hartman --- arch/x86/hyperv/hv_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index c224b7df4d21..db64baf0e500 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -125,7 +125,7 @@ void hyperv_init(void) guest_id = generate_guest_id(0, LINUX_VERSION_CODE, 0); wrmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id); - hypercall_pg = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL_EXEC); + hypercall_pg = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL_RX); if (hypercall_pg == NULL) { wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0); return; -- cgit From 699c4cec238731a4c466f73fe6e9e45ab6f49a41 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 8 Feb 2017 18:17:44 +0100 Subject: PCI/MSI: Remove pci_msi_domain_{alloc,free}_irqs() Just call the msi_* version directly instead of having trivial wrappers for one or two callsites. Signed-off-by: Christoph Hellwig Signed-off-by: Bjorn Helgaas Reviewed-by: Thomas Gleixner --- arch/x86/kernel/apic/msi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 015bbf30e3e3..c61aec7e65f4 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -82,7 +82,7 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) if (domain == NULL) return -ENOSYS; - return pci_msi_domain_alloc_irqs(domain, dev, nvec, type); + return msi_domain_alloc_irqs(domain, &dev->dev, nvec); } void native_teardown_msi_irq(unsigned int irq) -- cgit From c459bd7beda0295ea67db0ce2004a49addb2f765 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Wed, 1 Feb 2017 10:45:02 -0800 Subject: crypto: sha512-mb - Protect sha512 mb ctx mgr access The flusher and regular multi-buffer computation via mcryptd may race with another. Add here a lock and turn off interrupt to to access multi-buffer computation state cstate->mgr before a round of computation. This should prevent the flusher code jumping in. Signed-off-by: Tim Chen Signed-off-by: Herbert Xu --- arch/x86/crypto/sha512-mb/sha512_mb.c | 64 +++++++++++++++++++++++------------ 1 file changed, 42 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/sha512-mb/sha512_mb.c b/arch/x86/crypto/sha512-mb/sha512_mb.c index 9c1bb6d58141..2dd3674b5a1e 100644 --- a/arch/x86/crypto/sha512-mb/sha512_mb.c +++ b/arch/x86/crypto/sha512-mb/sha512_mb.c @@ -221,7 +221,7 @@ static struct sha512_hash_ctx *sha512_ctx_mgr_resubmit } static struct sha512_hash_ctx - *sha512_ctx_mgr_get_comp_ctx(struct sha512_ctx_mgr *mgr) + *sha512_ctx_mgr_get_comp_ctx(struct mcryptd_alg_cstate *cstate) { /* * If get_comp_job returns NULL, there are no jobs complete. @@ -233,11 +233,17 @@ static struct sha512_hash_ctx * Otherwise, all jobs currently being managed by the hash_ctx_mgr * still need processing. 
*/ + struct sha512_ctx_mgr *mgr; struct sha512_hash_ctx *ctx; + unsigned long flags; + mgr = cstate->mgr; + spin_lock_irqsave(&cstate->work_lock, flags); ctx = (struct sha512_hash_ctx *) sha512_job_mgr_get_comp_job(&mgr->mgr); - return sha512_ctx_mgr_resubmit(mgr, ctx); + ctx = sha512_ctx_mgr_resubmit(mgr, ctx); + spin_unlock_irqrestore(&cstate->work_lock, flags); + return ctx; } static void sha512_ctx_mgr_init(struct sha512_ctx_mgr *mgr) @@ -246,12 +252,17 @@ static void sha512_ctx_mgr_init(struct sha512_ctx_mgr *mgr) } static struct sha512_hash_ctx - *sha512_ctx_mgr_submit(struct sha512_ctx_mgr *mgr, + *sha512_ctx_mgr_submit(struct mcryptd_alg_cstate *cstate, struct sha512_hash_ctx *ctx, const void *buffer, uint32_t len, int flags) { + struct sha512_ctx_mgr *mgr; + unsigned long irqflags; + + mgr = cstate->mgr; + spin_lock_irqsave(&cstate->work_lock, irqflags); if (flags & (~HASH_ENTIRE)) { /* * User should not pass anything other than FIRST, UPDATE, or @@ -351,20 +362,26 @@ static struct sha512_hash_ctx } } - return sha512_ctx_mgr_resubmit(mgr, ctx); + ctx = sha512_ctx_mgr_resubmit(mgr, ctx); + spin_unlock_irqrestore(&cstate->work_lock, irqflags); + return ctx; } -static struct sha512_hash_ctx *sha512_ctx_mgr_flush(struct sha512_ctx_mgr *mgr) +static struct sha512_hash_ctx *sha512_ctx_mgr_flush(struct mcryptd_alg_cstate *cstate) { + struct sha512_ctx_mgr *mgr; struct sha512_hash_ctx *ctx; + unsigned long flags; + mgr = cstate->mgr; + spin_lock_irqsave(&cstate->work_lock, flags); while (1) { ctx = (struct sha512_hash_ctx *) sha512_job_mgr_flush(&mgr->mgr); /* If flush returned 0, there are no more jobs in flight. */ if (!ctx) - return NULL; + break; /* * If flush returned a job, resubmit the job to finish @@ -378,8 +395,10 @@ static struct sha512_hash_ctx *sha512_ctx_mgr_flush(struct sha512_ctx_mgr *mgr) * the sha512_ctx_mgr still need processing. Loop. 
*/ if (ctx) - return ctx; + break; } + spin_unlock_irqrestore(&cstate->work_lock, flags); + return ctx; } static int sha512_mb_init(struct ahash_request *areq) @@ -439,11 +458,11 @@ static int sha_finish_walk(struct mcryptd_hash_request_ctx **ret_rctx, sha_ctx = (struct sha512_hash_ctx *) ahash_request_ctx(&rctx->areq); kernel_fpu_begin(); - sha_ctx = sha512_ctx_mgr_submit(cstate->mgr, sha_ctx, + sha_ctx = sha512_ctx_mgr_submit(cstate, sha_ctx, rctx->walk.data, nbytes, flag); if (!sha_ctx) { if (flush) - sha_ctx = sha512_ctx_mgr_flush(cstate->mgr); + sha_ctx = sha512_ctx_mgr_flush(cstate); } kernel_fpu_end(); if (sha_ctx) @@ -471,11 +490,12 @@ static int sha_complete_job(struct mcryptd_hash_request_ctx *rctx, struct sha512_hash_ctx *sha_ctx; struct mcryptd_hash_request_ctx *req_ctx; int ret; + unsigned long flags; /* remove from work list */ - spin_lock(&cstate->work_lock); + spin_lock_irqsave(&cstate->work_lock, flags); list_del(&rctx->waiter); - spin_unlock(&cstate->work_lock); + spin_unlock_irqrestore(&cstate->work_lock, flags); if (irqs_disabled()) rctx->complete(&req->base, err); @@ -486,14 +506,14 @@ static int sha_complete_job(struct mcryptd_hash_request_ctx *rctx, } /* check to see if there are other jobs that are done */ - sha_ctx = sha512_ctx_mgr_get_comp_ctx(cstate->mgr); + sha_ctx = sha512_ctx_mgr_get_comp_ctx(cstate); while (sha_ctx) { req_ctx = cast_hash_to_mcryptd_ctx(sha_ctx); ret = sha_finish_walk(&req_ctx, cstate, false); if (req_ctx) { - spin_lock(&cstate->work_lock); + spin_lock_irqsave(&cstate->work_lock, flags); list_del(&req_ctx->waiter); - spin_unlock(&cstate->work_lock); + spin_unlock_irqrestore(&cstate->work_lock, flags); req = cast_mcryptd_ctx_to_req(req_ctx); if (irqs_disabled()) @@ -504,7 +524,7 @@ static int sha_complete_job(struct mcryptd_hash_request_ctx *rctx, local_bh_enable(); } } - sha_ctx = sha512_ctx_mgr_get_comp_ctx(cstate->mgr); + sha_ctx = sha512_ctx_mgr_get_comp_ctx(cstate); } return 0; @@ -515,6 +535,7 @@ static void sha512_mb_add_list(struct mcryptd_hash_request_ctx *rctx, { unsigned long next_flush; unsigned long delay = usecs_to_jiffies(FLUSH_INTERVAL); + unsigned long flags; /* initialize tag */ rctx->tag.arrival = jiffies; /* tag the arrival time */ @@ -522,9 +543,9 @@ static void sha512_mb_add_list(struct mcryptd_hash_request_ctx *rctx, next_flush = rctx->tag.arrival + delay; rctx->tag.expire = next_flush; - spin_lock(&cstate->work_lock); + spin_lock_irqsave(&cstate->work_lock, flags); list_add_tail(&rctx->waiter, &cstate->work_list); - spin_unlock(&cstate->work_lock); + spin_unlock_irqrestore(&cstate->work_lock, flags); mcryptd_arm_flusher(cstate, delay); } @@ -565,7 +586,7 @@ static int sha512_mb_update(struct ahash_request *areq) sha_ctx = (struct sha512_hash_ctx *) ahash_request_ctx(areq); sha512_mb_add_list(rctx, cstate); kernel_fpu_begin(); - sha_ctx = sha512_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data, + sha_ctx = sha512_ctx_mgr_submit(cstate, sha_ctx, rctx->walk.data, nbytes, HASH_UPDATE); kernel_fpu_end(); @@ -628,7 +649,7 @@ static int sha512_mb_finup(struct ahash_request *areq) sha512_mb_add_list(rctx, cstate); kernel_fpu_begin(); - sha_ctx = sha512_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data, + sha_ctx = sha512_ctx_mgr_submit(cstate, sha_ctx, rctx->walk.data, nbytes, flag); kernel_fpu_end(); @@ -677,8 +698,7 @@ static int sha512_mb_final(struct ahash_request *areq) /* flag HASH_FINAL and 0 data size */ sha512_mb_add_list(rctx, cstate); kernel_fpu_begin(); - sha_ctx = sha512_ctx_mgr_submit(cstate->mgr, sha_ctx, 
&data, 0, - HASH_LAST); + sha_ctx = sha512_ctx_mgr_submit(cstate, sha_ctx, &data, 0, HASH_LAST); kernel_fpu_end(); /* check if anything is returned */ @@ -940,7 +960,7 @@ static unsigned long sha512_mb_flusher(struct mcryptd_alg_cstate *cstate) break; kernel_fpu_begin(); sha_ctx = (struct sha512_hash_ctx *) - sha512_ctx_mgr_flush(cstate->mgr); + sha512_ctx_mgr_flush(cstate); kernel_fpu_end(); if (!sha_ctx) { pr_err("sha512_mb error: nothing got flushed for" -- cgit From f2029b1e47b607619d1dd2cb0bbb77f64ec6b7c2 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Fri, 10 Feb 2017 11:38:37 -0800 Subject: perf/x86/intel: Add Kaby Lake support Add Kaby Lake mobile and desktop models for RAPL, CSTATE and UNCORE matching Skylake. Signed-off-by: Srinivas Pandruvada Cc: peterz@infradead.org Cc: kan.liang@intel.com Cc: bigeasy@linutronix.de Cc: dave.hansen@linux.intel.com Cc: piotr.luc@intel.com Cc: davidcc@google.com Cc: bp@suse.de Link: http://lkml.kernel.org/r/1486755517-17812-1-git-send-email-srinivas.pandruvada@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/events/intel/cstate.c | 3 +++ arch/x86/events/intel/rapl.c | 3 +++ arch/x86/events/intel/uncore.c | 2 ++ 3 files changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 1076c9a77292..aff4b5b69d40 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -541,6 +541,9 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_MOBILE, snb_cstates), X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_DESKTOP, snb_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE_MOBILE, snb_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE_DESKTOP, snb_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_XEON_PHI_KNL, knl_cstates), X86_CSTATES_MODEL(INTEL_FAM6_XEON_PHI_KNM, knl_cstates), { }, diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 22ef4f72cf32..22054ca49026 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -771,6 +771,9 @@ static const struct x86_cpu_id rapl_cpu_match[] __initconst = { X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_DESKTOP, skl_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_X, hsx_rapl_init), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_KABYLAKE_MOBILE, skl_rapl_init), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_KABYLAKE_DESKTOP, skl_rapl_init), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT, hsw_rapl_init), {}, }; diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 1ab45976474d..758c1aa5009d 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1328,6 +1328,8 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE_DESKTOP,skl_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE_MOBILE, skl_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE_X, skx_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_KABYLAKE_MOBILE, skl_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_KABYLAKE_DESKTOP, skl_uncore_init), {}, }; -- cgit From 3ba5b5ea7dc3a10ef50819b43a9f8de2705f4eec Mon Sep 17 00:00:00 2001 From: "Kirill A. 
Shutemov" Date: Mon, 13 Feb 2017 15:52:28 +0300 Subject: x86/vm86: Fix unused variable warning if THP is disabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GCC complains about unused variable 'vma' in mark_screen_rdonly() if THP is disabled: arch/x86/kernel/vm86_32.c: In function ‘mark_screen_rdonly’: arch/x86/kernel/vm86_32.c:180:26: warning: unused variable ‘vma’ [-Wunused-variable] struct vm_area_struct *vma = find_vma(mm, 0xA0000); That's silly. pmd_trans_huge() resolves to 0 when THP is disabled, so the whole block should be eliminated. Moving the variable declaration outside the if() block shuts GCC up. Reported-by: Jérémy Lefaure Signed-off-by: Kirill A. Shutemov Tested-by: Borislav Petkov Cc: Carlos O'Donell Link: http://lkml.kernel.org/r/20170213125228.63645-1-kirill.shutemov@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/vm86_32.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index ec5d7545e6dc..0442d98367ae 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -160,11 +160,12 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) static void mark_screen_rdonly(struct mm_struct *mm) { + struct vm_area_struct *vma; + spinlock_t *ptl; pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *pte; - spinlock_t *ptl; int i; down_write(&mm->mmap_sem); @@ -177,7 +178,7 @@ static void mark_screen_rdonly(struct mm_struct *mm) pmd = pmd_offset(pud, 0xA0000); if (pmd_trans_huge(*pmd)) { - struct vm_area_struct *vma = find_vma(mm, 0xA0000); + vma = find_vma(mm, 0xA0000); split_huge_pmd(vma, pmd, 0xA0000); } if (pmd_none_or_clear_bad(pmd)) -- cgit From ab520be8cd5d56867fc95cfbc34b90880faf1f9d Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Mon, 13 Feb 2017 17:03:23 +0000 Subject: xen/privcmd: Add IOCTL_PRIVCMD_DM_OP Recently a new dm_op[1] hypercall was added to Xen to provide a mechanism for restricting device emulators (such as QEMU) to a limited set of hypervisor operations, and being able to audit those operations in the kernel of the domain in which they run. This patch adds IOCTL_PRIVCMD_DM_OP as gateway for __HYPERVISOR_dm_op. NOTE: There is no requirement for user-space code to bounce data through locked memory buffers (as with IOCTL_PRIVCMD_HYPERCALL) since privcmd has enough information to lock the original buffers directly. 
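[Illustration, not part of the patch: a rough user-space sketch of driving the new ioctl. The privcmd_dm_op/privcmd_dm_op_buf layouts and the header path are assumptions based on the accompanying privcmd uapi change, which sits outside the arch/x86 diff below; the authoritative definitions live in include/uapi/xen/privcmd.h.]

#include <stddef.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <xen/privcmd.h>	/* assumed install path of the kernel uapi header */

/*
 * Issue a single dm_op buffer against @domid through an open /dev/xen/privcmd
 * fd. privcmd locks the user buffer itself, so no special allocation or
 * bouncing is needed.
 */
static int dm_op_one(int privcmd_fd, uint16_t domid, void *op, size_t op_size)
{
	struct privcmd_dm_op_buf buf = {
		.uptr = op,
		.size = op_size,
	};
	struct privcmd_dm_op dm_op = {
		.dom   = domid,
		.num   = 1,
		.ubufs = &buf,
	};

	return ioctl(privcmd_fd, IOCTL_PRIVCMD_DM_OP, &dm_op);
}

The contents of the op buffer itself follow the hypervisor's dm_op interface referenced in [1] below.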
[1] http://xenbits.xen.org/gitweb/?p=xen.git;a=commit;h=524a98c2 Signed-off-by: Paul Durrant Acked-by: Stefano Stabellini Signed-off-by: Boris Ostrovsky --- arch/x86/include/asm/xen/hypercall.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index a12a047184ee..f6d20f6cca12 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -472,6 +472,13 @@ HYPERVISOR_xenpmu_op(unsigned int op, void *arg) return _hypercall2(int, xenpmu_op, op, arg); } +static inline int +HYPERVISOR_dm_op( + domid_t dom, unsigned int nr_bufs, void *bufs) +{ + return _hypercall3(int, dm_op, dom, nr_bufs, bufs); +} + static inline void MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set) { -- cgit From db1c056cee59f4a7670c3bd9ee1657468cf0b4c4 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 8 Dec 2016 15:31:41 +0100 Subject: kvm: vmx: Use the hardware provided GPA instead of page walk As in the SVM patch, the guest physical address is passed by VMX to x86_emulate_instruction already, so mark the GPA as available in vcpu->arch. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7c3e42623090..0828b02b5af2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6382,6 +6382,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) EPT_VIOLATION_EXECUTABLE)) ? PFERR_PRESENT_MASK : 0; + vcpu->arch.gpa_available = true; vcpu->arch.exit_qualification = exit_qualification; return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); @@ -6399,6 +6400,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) } ret = handle_mmio_page_fault(vcpu, gpa, true); + vcpu->arch.gpa_available = true; if (likely(ret == RET_MMIO_PF_EMULATE)) return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) == EMULATE_DONE; @@ -8517,6 +8519,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) u32 vectoring_info = vmx->idt_vectoring_info; trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX); + vcpu->arch.gpa_available = false; /* * Flush logged GPAs PML buffer, this will make dirty_bitmap more -- cgit From 967235d320329e4a7a2bd1a36b04293063e985ae Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 19 Dec 2016 14:03:45 +0100 Subject: KVM: vmx: clear pending interrupts on KVM_SET_LAPIC Pending interrupts might be in the PI descriptor when the LAPIC is restored from an external state; we do not want them to be injected. 
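[For context, not part of the patch: "restored from an external state" refers to the KVM_SET_LAPIC ioctl path a VMM takes when loading a saved or migrated vCPU. A minimal sketch of that user-space side, assuming a vCPU fd obtained via KVM_CREATE_VCPU:]

#include <err.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static void restore_lapic(int vcpu_fd, const struct kvm_lapic_state *saved)
{
	/* @saved was captured on the source with KVM_GET_LAPIC. */
	if (ioctl(vcpu_fd, KVM_SET_LAPIC, saved) < 0)
		err(1, "KVM_SET_LAPIC");
}

Stale bits left in the hardware posted-interrupt descriptor from before this restore must not be merged into the freshly loaded IRR, which is what the new vmx_apicv_post_state_restore() hook in the diff below clears.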
Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 3 +-- arch/x86/kvm/vmx.c | 9 +++++++++ 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 33b799fd3a6e..8ddd0ed03880 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2204,8 +2204,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) 1 : count_vectors(apic->regs + APIC_ISR); apic->highest_isr_cache = -1; if (vcpu->arch.apicv_active) { - if (kvm_x86_ops->apicv_post_state_restore) - kvm_x86_ops->apicv_post_state_restore(vcpu); + kvm_x86_ops->apicv_post_state_restore(vcpu); kvm_x86_ops->hwapic_irr_update(vcpu, apic_find_highest_irr(apic)); kvm_x86_ops->hwapic_isr_update(vcpu, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 0828b02b5af2..8d2e0cc8e83e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8749,6 +8749,14 @@ static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); } +static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + pi_clear_on(&vmx->pi_desc); + memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir)); +} + static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) { u32 exit_intr_info; @@ -11574,6 +11582,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .get_enable_apicv = vmx_get_enable_apicv, .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl, .load_eoi_exitmap = vmx_load_eoi_exitmap, + .apicv_post_state_restore = vmx_apicv_post_state_restore, .hwapic_irr_update = vmx_hwapic_irr_update, .hwapic_isr_update = vmx_hwapic_isr_update, .sync_pir_to_irr = vmx_sync_pir_to_irr, -- cgit From 0ad3bed6c5ec6dbb093a26802c85088a85fb9757 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 19 Dec 2016 15:23:54 +0100 Subject: kvm: nVMX: move nested events check to kvm_vcpu_running vcpu_run calls kvm_vcpu_running, not kvm_arch_vcpu_runnable, and the former does not call check_nested_events. Once KVM_REQ_EVENT is removed from the APICv interrupt injection path, however, this would leave no place to trigger a vmexit from L2 to L1, causing a missed interrupt delivery while in guest mode. This is caught by the "ack interrupt on exit" test in vmx.flat. [This does not change the calls to check_nested_events in inject_pending_event. That is material for a separate cleanup.] 
Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2f64e5d0ae53..204793f0f0e2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7023,6 +7023,9 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu) { + if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) + kvm_x86_ops->check_nested_events(vcpu, false); + return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && !vcpu->arch.apf.halted); } @@ -8389,9 +8392,6 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { - if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) - kvm_x86_ops->check_nested_events(vcpu, false); - return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu); } -- cgit From 810e6defcca4d05275aa15c2872c0a4949178fcb Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 19 Dec 2016 13:05:46 +0100 Subject: KVM: x86: preparatory changes for APICv cleanups Add return value to __kvm_apic_update_irr/kvm_apic_update_irr. Move vmx_sync_pir_to_irr around. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 25 +++++++++++++++++-------- arch/x86/kvm/lapic.h | 4 ++-- arch/x86/kvm/vmx.c | 32 ++++++++++++++++---------------- 3 files changed, 35 insertions(+), 26 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 8ddd0ed03880..120afc2bcfd3 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -341,7 +341,7 @@ static int find_highest_vector(void *bitmap) vec >= 0; vec -= APIC_VECTORS_PER_REG) { reg = bitmap + REG_POS(vec); if (*reg) - return fls(*reg) - 1 + vec; + return __fls(*reg) + vec; } return -1; @@ -361,27 +361,36 @@ static u8 count_vectors(void *bitmap) return count; } -void __kvm_apic_update_irr(u32 *pir, void *regs) +int __kvm_apic_update_irr(u32 *pir, void *regs) { - u32 i, pir_val; + u32 i, vec; + u32 pir_val, irr_val; + int max_irr = -1; - for (i = 0; i <= 7; i++) { + for (i = vec = 0; i <= 7; i++, vec += 32) { pir_val = READ_ONCE(pir[i]); + irr_val = *((u32 *)(regs + APIC_IRR + i * 0x10)); if (pir_val) { - pir_val = xchg(&pir[i], 0); - *((u32 *)(regs + APIC_IRR + i * 0x10)) |= pir_val; + irr_val |= xchg(&pir[i], 0); + *((u32 *)(regs + APIC_IRR + i * 0x10)) = irr_val; } + if (irr_val) + max_irr = __fls(irr_val) + vec; } + + return max_irr; } EXPORT_SYMBOL_GPL(__kvm_apic_update_irr); -void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir) +int kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir) { struct kvm_lapic *apic = vcpu->arch.apic; + int max_irr; - __kvm_apic_update_irr(pir, apic->regs); + max_irr = __kvm_apic_update_irr(pir, apic->regs); kvm_make_request(KVM_REQ_EVENT, vcpu); + return max_irr; } EXPORT_SYMBOL_GPL(kvm_apic_update_irr); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 05abd837b78a..bcbe811f3b97 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -71,8 +71,8 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int short_hand, unsigned int dest, int dest_mode); -void __kvm_apic_update_irr(u32 *pir, void *regs); -void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir); +int __kvm_apic_update_irr(u32 *pir, void *regs); +int kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir); void kvm_apic_update_ppr(struct kvm_vcpu *vcpu); int 
kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, struct dest_map *dest_map); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 8d2e0cc8e83e..4ac9b484e244 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5057,22 +5057,6 @@ static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) kvm_vcpu_kick(vcpu); } -static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (!pi_test_on(&vmx->pi_desc)) - return; - - pi_clear_on(&vmx->pi_desc); - /* - * IOMMU can write to PIR.ON, so the barrier matters even on UP. - * But on x86 this is just a compiler barrier anyway. - */ - smp_mb__after_atomic(); - kvm_apic_update_irr(vcpu, vmx->pi_desc.pir); -} - /* * Set up the vmcs's constant host-state fields, i.e., host-state fields that * will not change in the lifetime of the guest. @@ -8738,6 +8722,22 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) } } +static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!pi_test_on(&vmx->pi_desc)) + return; + + pi_clear_on(&vmx->pi_desc); + /* + * IOMMU can write to PIR.ON, so the barrier matters even on UP. + * But on x86 this is just a compiler barrier anyway. + */ + smp_mb__after_atomic(); + kvm_apic_update_irr(vcpu, vmx->pi_desc.pir); +} + static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) { if (!kvm_vcpu_apicv_active(vcpu)) -- cgit From 3d92789f69162ee5689f3766e5f50bb46b7e1d97 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 19 Dec 2016 13:29:03 +0100 Subject: KVM: vmx: move sync_pir_to_irr from apic_find_highest_irr to callers Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 7 ++++--- arch/x86/kvm/x86.c | 4 +++- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 120afc2bcfd3..8af6db9b64aa 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -410,8 +410,6 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic) if (!apic->irr_pending) return -1; - if (apic->vcpu->arch.apicv_active) - kvm_x86_ops->sync_pir_to_irr(apic->vcpu); result = apic_search_irr(apic); ASSERT(result == -1 || result >= 16); @@ -581,7 +579,10 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu) static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr) { - int highest_irr = apic_find_highest_irr(apic); + int highest_irr; + if (apic->vcpu->arch.apicv_active) + kvm_x86_ops->sync_pir_to_irr(apic->vcpu); + highest_irr = apic_find_highest_irr(apic); if (highest_irr == -1 || (highest_irr & 0xF0) <= ppr) return -1; return highest_irr; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 204793f0f0e2..8f80da161e80 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6822,9 +6822,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) * Update architecture specific hints for APIC * virtual interrupt delivery. */ - if (vcpu->arch.apicv_active) + if (vcpu->arch.apicv_active) { + kvm_x86_ops->sync_pir_to_irr(vcpu); kvm_x86_ops->hwapic_irr_update(vcpu, kvm_lapic_find_highest_irr(vcpu)); + } } if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { -- cgit From 76dfafd536730ef9b9d99b1cf596916d52be76d1 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 19 Dec 2016 17:17:11 +0100 Subject: KVM: x86: do not scan IRR twice on APICv vmentry Calls to apic_find_highest_irr are scanning IRR twice, once in vmx_sync_pir_from_irr and once in apic_search_irr. 
Change sync_pir_from_irr to get the new maximum IRR from kvm_apic_update_irr; now that it does the computation, it can also do the RVI write. In order to avoid complications in svm.c, make the callback optional. Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/lapic.c | 8 +++++--- arch/x86/kvm/svm.c | 6 ------ arch/x86/kvm/vmx.c | 31 +++++++++++++++++++------------ arch/x86/kvm/x86.c | 9 +++------ 5 files changed, 28 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 417502cf42b6..e4f13e714bcf 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -969,7 +969,7 @@ struct kvm_x86_ops { void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set); void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa); void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector); - void (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); + int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*get_tdp_level)(void); u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 8af6db9b64aa..7ed2400b2777 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -515,6 +515,7 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) */ return apic_find_highest_irr(vcpu->arch.apic); } +EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr); static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, int vector, int level, int trig_mode, @@ -580,9 +581,10 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu) static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr) { int highest_irr; - if (apic->vcpu->arch.apicv_active) - kvm_x86_ops->sync_pir_to_irr(apic->vcpu); - highest_irr = apic_find_highest_irr(apic); + if (kvm_x86_ops->sync_pir_to_irr && apic->vcpu->arch.apicv_active) + highest_irr = kvm_x86_ops->sync_pir_to_irr(apic->vcpu); + else + highest_irr = apic_find_highest_irr(apic); if (highest_irr == -1 || (highest_irr & 0xF0) <= ppr) return -1; return highest_irr; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index d0414f054bdf..13cd06220b19 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4359,11 +4359,6 @@ static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) return; } -static void svm_sync_pir_to_irr(struct kvm_vcpu *vcpu) -{ - return; -} - static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec) { kvm_lapic_set_irr(vec, vcpu->arch.apic); @@ -5373,7 +5368,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .get_enable_apicv = svm_get_enable_apicv, .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl, .load_eoi_exitmap = svm_load_eoi_exitmap, - .sync_pir_to_irr = svm_sync_pir_to_irr, .hwapic_irr_update = svm_hwapic_irr_update, .hwapic_isr_update = svm_hwapic_isr_update, .apicv_post_state_restore = avic_post_state_restore, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4ac9b484e244..d03cb62b70d2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6649,8 +6649,10 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_ple()) ple_gap = 0; - if (!cpu_has_vmx_apicv()) + if (!cpu_has_vmx_apicv()) { enable_apicv = 0; + kvm_x86_ops->sync_pir_to_irr = NULL; + } if (cpu_has_vmx_tsc_scaling()) { kvm_has_tsc_control = true; @@ -8722,20 +8724,25 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) } } 
-static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) +static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + int max_irr; - if (!pi_test_on(&vmx->pi_desc)) - return; - - pi_clear_on(&vmx->pi_desc); - /* - * IOMMU can write to PIR.ON, so the barrier matters even on UP. - * But on x86 this is just a compiler barrier anyway. - */ - smp_mb__after_atomic(); - kvm_apic_update_irr(vcpu, vmx->pi_desc.pir); + WARN_ON(!vcpu->arch.apicv_active); + if (pi_test_on(&vmx->pi_desc)) { + pi_clear_on(&vmx->pi_desc); + /* + * IOMMU can write to PIR.ON, so the barrier matters even on UP. + * But on x86 this is just a compiler barrier anyway. + */ + smp_mb__after_atomic(); + max_irr = kvm_apic_update_irr(vcpu, vmx->pi_desc.pir); + } else { + max_irr = kvm_lapic_find_highest_irr(vcpu); + } + vmx_hwapic_irr_update(vcpu, max_irr); + return max_irr; } static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8f80da161e80..75b0f30d75ee 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2909,7 +2909,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { - if (vcpu->arch.apicv_active) + if (kvm_x86_ops->sync_pir_to_irr && vcpu->arch.apicv_active) kvm_x86_ops->sync_pir_to_irr(vcpu); return kvm_apic_get_state(vcpu, s); @@ -6659,7 +6659,7 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) if (irqchip_split(vcpu->kvm)) kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors); else { - if (vcpu->arch.apicv_active) + if (kvm_x86_ops->sync_pir_to_irr && vcpu->arch.apicv_active) kvm_x86_ops->sync_pir_to_irr(vcpu); kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors); } @@ -6822,11 +6822,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) * Update architecture specific hints for APIC * virtual interrupt delivery. */ - if (vcpu->arch.apicv_active) { + if (kvm_x86_ops->sync_pir_to_irr && vcpu->arch.apicv_active) kvm_x86_ops->sync_pir_to_irr(vcpu); - kvm_x86_ops->hwapic_irr_update(vcpu, - kvm_lapic_find_highest_irr(vcpu)); - } } if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { -- cgit From b95234c840045b7c72380fd14c59416af28fcb02 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 19 Dec 2016 13:57:33 +0100 Subject: kvm: x86: do not use KVM_REQ_EVENT for APICv interrupt injection Since bf9f6ac8d749 ("KVM: Update Posted-Interrupts Descriptor when vCPU is blocked", 2015-09-18) the posted interrupt descriptor is checked unconditionally for PIR.ON. Therefore we don't need KVM_REQ_EVENT to trigger the scan and, if NMIs or SMIs are not involved, we can avoid the complicated event injection path. Calling kvm_vcpu_kick if PIR.ON=1 is also useless, though it has been there since APICv was introduced. However, without the KVM_REQ_EVENT safety net KVM needs to be much more careful about races between vmx_deliver_posted_interrupt and vcpu_enter_guest. First, the IPI for posted interrupts may be issued between setting vcpu->mode = IN_GUEST_MODE and disabling interrupts. If that happens, kvm_trigger_posted_interrupt returns true, but smp_kvm_posted_intr_ipi doesn't do anything about it. The guest is entered with PIR.ON, but the posted interrupt IPI has not been sent and the interrupt is only delivered to the guest on the next vmentry (if any). To fix this, disable interrupts before setting vcpu->mode. 
This ensures that the IPI is delayed until the guest enters non-root mode; it is then trapped by the processor causing the interrupt to be injected. Second, the IPI may be issued between kvm_x86_ops->sync_pir_to_irr(vcpu) and vcpu->mode = IN_GUEST_MODE. In this case, kvm_vcpu_kick is called but it (correctly) doesn't do anything because it sees vcpu->mode == OUTSIDE_GUEST_MODE. Again, the guest is entered with PIR.ON but no posted interrupt IPI is pending; this time, the fix for this is to move the RVI update after IN_GUEST_MODE. Both issues were mostly masked by the liberal usage of KVM_REQ_EVENT, though the second could actually happen with VT-d posted interrupts. In both race scenarios KVM_REQ_EVENT would cancel guest entry, resulting in another vmentry which would inject the interrupt. This saves about 300 cycles on the self_ipi_* tests of vmexit.flat. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 11 ++++------- arch/x86/kvm/vmx.c | 8 +++++--- arch/x86/kvm/x86.c | 44 +++++++++++++++++++++++++------------------- 3 files changed, 34 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 7ed2400b2777..9fa5b8164961 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -385,12 +385,8 @@ EXPORT_SYMBOL_GPL(__kvm_apic_update_irr); int kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir) { struct kvm_lapic *apic = vcpu->arch.apic; - int max_irr; - max_irr = __kvm_apic_update_irr(pir, apic->regs); - - kvm_make_request(KVM_REQ_EVENT, vcpu); - return max_irr; + return __kvm_apic_update_irr(pir, apic->regs); } EXPORT_SYMBOL_GPL(kvm_apic_update_irr); @@ -423,9 +419,10 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) vcpu = apic->vcpu; if (unlikely(vcpu->arch.apicv_active)) { - /* try to update RVI */ + /* need to update RVI */ apic_clear_vector(vec, apic->regs + APIC_IRR); - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_x86_ops->hwapic_irr_update(vcpu, + apic_find_highest_irr(apic)); } else { apic->irr_pending = false; apic_clear_vector(vec, apic->regs + APIC_IRR); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d03cb62b70d2..fd8cd50e9dc6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5051,9 +5051,11 @@ static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) if (pi_test_and_set_pir(vector, &vmx->pi_desc)) return; - r = pi_test_and_set_on(&vmx->pi_desc); - kvm_make_request(KVM_REQ_EVENT, vcpu); - if (r || !kvm_vcpu_trigger_posted_interrupt(vcpu)) + /* If a previous notification has sent the IPI, nothing to do. */ + if (pi_test_and_set_on(&vmx->pi_desc)) + return; + + if (!kvm_vcpu_trigger_posted_interrupt(vcpu)) kvm_vcpu_kick(vcpu); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 75b0f30d75ee..63a89a51dcc9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6813,19 +6813,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_hv_process_stimers(vcpu); } - /* - * KVM_REQ_EVENT is not set when posted interrupts are set by - * VT-d hardware, so we have to update RVI unconditionally. - */ - if (kvm_lapic_enabled(vcpu)) { - /* - * Update architecture specific hints for APIC - * virtual interrupt delivery. 
- */ - if (kvm_x86_ops->sync_pir_to_irr && vcpu->arch.apicv_active) - kvm_x86_ops->sync_pir_to_irr(vcpu); - } - if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { ++vcpu->stat.req_event; kvm_apic_accept_events(vcpu); @@ -6870,20 +6857,39 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_x86_ops->prepare_guest_switch(vcpu); if (vcpu->fpu_active) kvm_load_guest_fpu(vcpu); + + /* + * Disable IRQs before setting IN_GUEST_MODE. Posted interrupt + * IPI are then delayed after guest entry, which ensures that they + * result in virtual interrupt delivery. + */ + local_irq_disable(); vcpu->mode = IN_GUEST_MODE; srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); /* - * We should set ->mode before check ->requests, - * Please see the comment in kvm_make_all_cpus_request. - * This also orders the write to mode from any reads - * to the page tables done while the VCPU is running. - * Please see the comment in kvm_flush_remote_tlbs. + * 1) We should set ->mode before checking ->requests. Please see + * the comment in kvm_make_all_cpus_request. + * + * 2) For APICv, we should set ->mode before checking PIR.ON. This + * pairs with the memory barrier implicit in pi_test_and_set_on + * (see vmx_deliver_posted_interrupt). + * + * 3) This also orders the write to mode from any reads to the page + * tables done while the VCPU is running. Please see the comment + * in kvm_flush_remote_tlbs. */ smp_mb__after_srcu_read_unlock(); - local_irq_disable(); + /* + * This handles the case where a posted interrupt was + * notified with kvm_vcpu_kick. + */ + if (kvm_lapic_enabled(vcpu)) { + if (kvm_x86_ops->sync_pir_to_irr && vcpu->arch.apicv_active) + kvm_x86_ops->sync_pir_to_irr(vcpu); + } if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests || need_resched() || signal_pending(current)) { -- cgit From cf8b84f48a5936f558a7a415f1d2f42161bf73eb Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Wed, 30 Nov 2016 12:03:42 -0800 Subject: kvm: nVMX: Prepare for checkpointing L2 state Split prepare_vmcs12 into two parts: the part that stores the current L2 guest state and the part that sets up the exit information fields. The former will be used when checkpointing the vCPU's VMX state. Modify prepare_vmcs02 so that it can construct a vmcs02 midway through L2 execution, using the checkpointed L2 guest state saved into the cached vmcs12 above. Signed-off-by: Jim Mattson [Rebasing: add from_vmentry argument to prepare_vmcs02 instead of using vmx->nested.nested_run_pending, because it is no longer 1 at the point prepare_vmcs02 is called. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 77 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 48 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index fd8cd50e9dc6..4b4b59b34d44 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -10040,7 +10040,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne * is assigned to entry_failure_code on failure. 
*/ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, - unsigned long *entry_failure_code) + bool from_vmentry, unsigned long *entry_failure_code) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 exec_control; @@ -10083,21 +10083,26 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); - if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { + if (from_vmentry && + (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); } else { kvm_set_dr(vcpu, 7, vcpu->arch.dr7); vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl); } - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, - vmcs12->vm_entry_intr_info_field); - vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, - vmcs12->vm_entry_exception_error_code); - vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, - vmcs12->vm_entry_instruction_len); - vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, - vmcs12->guest_interruptibility_info); + if (from_vmentry) { + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, + vmcs12->vm_entry_intr_info_field); + vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, + vmcs12->vm_entry_exception_error_code); + vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, + vmcs12->vm_entry_instruction_len); + vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, + vmcs12->guest_interruptibility_info); + } else { + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); + } vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); vmx_set_rflags(vcpu, vmcs12->guest_rflags); vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, @@ -10290,16 +10295,18 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, ~VM_ENTRY_IA32E_MODE) | (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE)); - if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) { + if (from_vmentry && + (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) { vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); vcpu->arch.pat = vmcs12->guest_ia32_pat; - } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) + } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); - + } set_cr4_guest_host_mask(vmx); - if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) + if (from_vmentry && + vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) @@ -10351,7 +10358,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmx_set_cr4(vcpu, vmcs12->guest_cr4); vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12)); - if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) + if (from_vmentry && + (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) vcpu->arch.efer = vmcs12->guest_ia32_efer; else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) vcpu->arch.efer |= (EFER_LMA | EFER_LME); @@ -10561,7 +10569,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) vmx_segment_cache_clear(vmx); - if (prepare_vmcs02(vcpu, vmcs12, &exit_qualification)) { + if (prepare_vmcs02(vcpu, vmcs12, true, &exit_qualification)) { leave_guest_mode(vcpu); vmx_load_vmcs01(vcpu); nested_vmx_entry_failure(vcpu, vmcs12, @@ -10733,21 +10741,13 @@ static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) } /* - * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits - * and we want to 
prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), - * and this function updates it to reflect the changes to the guest state while - * L2 was running (and perhaps made some exits which were handled directly by L0 - * without going back to L1), and to reflect the exit reason. - * Note that we do not have to copy here all VMCS fields, just those that - * could have changed by the L2 guest or the exit - i.e., the guest-state and - * exit-information fields only. Other fields are modified by L1 with VMWRITE, - * which already writes to vmcs12 directly. + * Update the guest state fields of vmcs12 to reflect changes that + * occurred while L2 was running. (The "IA-32e mode guest" bit of the + * VM-entry controls is also updated, since this is really a guest + * state bit.) */ -static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, - u32 exit_reason, u32 exit_intr_info, - unsigned long exit_qualification) +static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { - /* update guest state fields: */ vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); @@ -10853,6 +10853,25 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); if (nested_cpu_has_xsaves(vmcs12)) vmcs12->xss_exit_bitmap = vmcs_read64(XSS_EXIT_BITMAP); +} + +/* + * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits + * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), + * and this function updates it to reflect the changes to the guest state while + * L2 was running (and perhaps made some exits which were handled directly by L0 + * without going back to L1), and to reflect the exit reason. + * Note that we do not have to copy here all VMCS fields, just those that + * could have changed by the L2 guest or the exit - i.e., the guest-state and + * exit-information fields only. Other fields are modified by L1 with VMWRITE, + * which already writes to vmcs12 directly. + */ +static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, + u32 exit_reason, u32 exit_intr_info, + unsigned long exit_qualification) +{ + /* update guest state fields: */ + sync_vmcs12(vcpu, vmcs12); /* update exit information fields: */ -- cgit From e29acc55bfc3afcfdf9ea3468f1217d08f0b2d2b Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Wed, 30 Nov 2016 12:03:43 -0800 Subject: kvm: nVMX: Refactor handle_vmon() Handle_vmon is split into two parts: the part that handles the VMXON instruction, and the part that modifies the vcpu state to transition from legacy mode to VMX operation. The latter will be used when restoring the checkpointed state of a vCPU that was in VMX operation when a snapshot was taken. 
Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 93 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 52 insertions(+), 41 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4b4b59b34d44..33cb8d2b9653 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -7124,6 +7124,53 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, return 0; } +static int enter_vmx_operation(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmcs *shadow_vmcs; + + if (cpu_has_vmx_msr_bitmap()) { + vmx->nested.msr_bitmap = + (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx->nested.msr_bitmap) + goto out_msr_bitmap; + } + + vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL); + if (!vmx->nested.cached_vmcs12) + goto out_cached_vmcs12; + + if (enable_shadow_vmcs) { + shadow_vmcs = alloc_vmcs(); + if (!shadow_vmcs) + goto out_shadow_vmcs; + /* mark vmcs as shadow */ + shadow_vmcs->revision_id |= (1u << 31); + /* init shadow vmcs */ + vmcs_clear(shadow_vmcs); + vmx->vmcs01.shadow_vmcs = shadow_vmcs; + } + + INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool)); + vmx->nested.vmcs02_num = 0; + + hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_PINNED); + vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; + + vmx->nested.vmxon = true; + return 0; + +out_shadow_vmcs: + kfree(vmx->nested.cached_vmcs12); + +out_cached_vmcs12: + free_page((unsigned long)vmx->nested.msr_bitmap); + +out_msr_bitmap: + return -ENOMEM; +} + /* * Emulate the VMXON instruction. * Currently, we just remember that VMX is active, and do not save or even @@ -7134,9 +7181,9 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, */ static int handle_vmon(struct kvm_vcpu *vcpu) { + int ret; struct kvm_segment cs; struct vcpu_vmx *vmx = to_vmx(vcpu); - struct vmcs *shadow_vmcs; const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; @@ -7176,49 +7223,13 @@ static int handle_vmon(struct kvm_vcpu *vcpu) if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL)) return 1; - - if (cpu_has_vmx_msr_bitmap()) { - vmx->nested.msr_bitmap = - (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx->nested.msr_bitmap) - goto out_msr_bitmap; - } - - vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL); - if (!vmx->nested.cached_vmcs12) - goto out_cached_vmcs12; - - if (enable_shadow_vmcs) { - shadow_vmcs = alloc_vmcs(); - if (!shadow_vmcs) - goto out_shadow_vmcs; - /* mark vmcs as shadow */ - shadow_vmcs->revision_id |= (1u << 31); - /* init shadow vmcs */ - vmcs_clear(shadow_vmcs); - vmx->vmcs01.shadow_vmcs = shadow_vmcs; - } - - INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool)); - vmx->nested.vmcs02_num = 0; - - hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL_PINNED); - vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; - - vmx->nested.vmxon = true; + + ret = enter_vmx_operation(vcpu); + if (ret) + return ret; nested_vmx_succeed(vcpu); return kvm_skip_emulated_instruction(vcpu); - -out_shadow_vmcs: - kfree(vmx->nested.cached_vmcs12); - -out_cached_vmcs12: - free_page((unsigned long)vmx->nested.msr_bitmap); - -out_msr_bitmap: - return -ENOMEM; } /* -- cgit From a8bc284eb70f58a4fd0f4a70d816cca28ca01973 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Wed, 30 Nov 2016 12:03:44 -0800 Subject: kvm: nVMX: Refactor handle_vmptrld() Handle_vmptrld is split into 
two parts: the part that handles the VMPTRLD instruction, and the part that establishes the current VMCS pointer. The latter will be used when restoring the checkpointed state of a vCPU that had a valid VMCS pointer when a snapshot was taken. Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 33cb8d2b9653..ef9affcabdeb 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -7678,6 +7678,18 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } +static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) +{ + vmx->nested.current_vmptr = vmptr; + if (enable_shadow_vmcs) { + vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, + SECONDARY_EXEC_SHADOW_VMCS); + vmcs_write64(VMCS_LINK_POINTER, + __pa(vmx->vmcs01.shadow_vmcs)); + vmx->nested.sync_shadow_vmcs = true; + } +} + /* Emulate the VMPTRLD instruction */ static int handle_vmptrld(struct kvm_vcpu *vcpu) { @@ -7708,7 +7720,6 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) } nested_release_vmcs12(vmx); - vmx->nested.current_vmptr = vmptr; vmx->nested.current_vmcs12 = new_vmcs12; vmx->nested.current_vmcs12_page = page; /* @@ -7717,14 +7728,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) */ memcpy(vmx->nested.cached_vmcs12, vmx->nested.current_vmcs12, VMCS12_SIZE); - - if (enable_shadow_vmcs) { - vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, - SECONDARY_EXEC_SHADOW_VMCS); - vmcs_write64(VMCS_LINK_POINTER, - __pa(vmx->vmcs01.shadow_vmcs)); - vmx->nested.sync_shadow_vmcs = true; - } + set_current_vmptr(vmx, vmptr); } nested_vmx_succeed(vcpu); -- cgit From 6beb7bd52e482c213dfa6c1c88e27a579df5bf4d Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Wed, 30 Nov 2016 12:03:45 -0800 Subject: kvm: nVMX: Refactor nested_get_vmcs12_pages() Perform the checks on vmcs12 state early, but defer the gpa->hpa lookups until after prepare_vmcs02. Later, when we restore the checkpointed state of a vCPU in guest mode, we will not be able to do the gpa->hpa lookups when the restore is done. Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 138 ++++++++++++++++++++++++++--------------------------- 1 file changed, 69 insertions(+), 69 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ef9affcabdeb..650f34336fad 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -9626,17 +9626,16 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, kvm_inject_page_fault(vcpu, fault); } -static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, +static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12); + +static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct vcpu_vmx *vmx = to_vmx(vcpu); - int maxphyaddr = cpuid_maxphyaddr(vcpu); + u64 hpa; if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { - if (!PAGE_ALIGNED(vmcs12->apic_access_addr) || - vmcs12->apic_access_addr >> maxphyaddr) - return false; - /* * Translate L1 physical address to host physical * address for vmcs02. 
Keep the page pinned, so this @@ -9647,59 +9646,80 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, nested_release_page(vmx->nested.apic_access_page); vmx->nested.apic_access_page = nested_get_page(vcpu, vmcs12->apic_access_addr); + /* + * If translation failed, no matter: This feature asks + * to exit when accessing the given address, and if it + * can never be accessed, this feature won't do + * anything anyway. + */ + if (vmx->nested.apic_access_page) { + hpa = page_to_phys(vmx->nested.apic_access_page); + vmcs_write64(APIC_ACCESS_ADDR, hpa); + } else { + vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); + } + } else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) && + cpu_need_virtualize_apic_accesses(&vmx->vcpu)) { + vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); + kvm_vcpu_reload_apic_access_page(vcpu); } if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { - if (!PAGE_ALIGNED(vmcs12->virtual_apic_page_addr) || - vmcs12->virtual_apic_page_addr >> maxphyaddr) - return false; - if (vmx->nested.virtual_apic_page) /* shouldn't happen */ nested_release_page(vmx->nested.virtual_apic_page); vmx->nested.virtual_apic_page = nested_get_page(vcpu, vmcs12->virtual_apic_page_addr); /* - * Failing the vm entry is _not_ what the processor does - * but it's basically the only possibility we have. - * We could still enter the guest if CR8 load exits are - * enabled, CR8 store exits are enabled, and virtualize APIC - * access is disabled; in this case the processor would never - * use the TPR shadow and we could simply clear the bit from - * the execution control. But such a configuration is useless, - * so let's keep the code simple. + * If translation failed, VM entry will fail because + * prepare_vmcs02 set VIRTUAL_APIC_PAGE_ADDR to -1ull. + * Failing the vm entry is _not_ what the processor + * does but it's basically the only possibility we + * have. We could still enter the guest if CR8 load + * exits are enabled, CR8 store exits are enabled, and + * virtualize APIC access is disabled; in this case + * the processor would never use the TPR shadow and we + * could simply clear the bit from the execution + * control. But such a configuration is useless, so + * let's keep the code simple. 
*/ - if (!vmx->nested.virtual_apic_page) - return false; + if (vmx->nested.virtual_apic_page) { + hpa = page_to_phys(vmx->nested.virtual_apic_page); + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa); + } } if (nested_cpu_has_posted_intr(vmcs12)) { - if (!IS_ALIGNED(vmcs12->posted_intr_desc_addr, 64) || - vmcs12->posted_intr_desc_addr >> maxphyaddr) - return false; - if (vmx->nested.pi_desc_page) { /* shouldn't happen */ kunmap(vmx->nested.pi_desc_page); nested_release_page(vmx->nested.pi_desc_page); } vmx->nested.pi_desc_page = nested_get_page(vcpu, vmcs12->posted_intr_desc_addr); - if (!vmx->nested.pi_desc_page) - return false; - vmx->nested.pi_desc = (struct pi_desc *)kmap(vmx->nested.pi_desc_page); if (!vmx->nested.pi_desc) { nested_release_page_clean(vmx->nested.pi_desc_page); - return false; + return; } vmx->nested.pi_desc = (struct pi_desc *)((void *)vmx->nested.pi_desc + (unsigned long)(vmcs12->posted_intr_desc_addr & (PAGE_SIZE - 1))); + vmcs_write64(POSTED_INTR_DESC_ADDR, + page_to_phys(vmx->nested.pi_desc_page) + + (unsigned long)(vmcs12->posted_intr_desc_addr & + (PAGE_SIZE - 1))); } - - return true; + if (cpu_has_vmx_msr_bitmap() && + nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS) && + nested_vmx_merge_msr_bitmap(vcpu, vmcs12)) + ; + else + vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, + CPU_BASED_USE_MSR_BITMAPS); } static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) @@ -10146,12 +10166,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; vmx->nested.pi_pending = false; vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); - vmcs_write64(POSTED_INTR_DESC_ADDR, - page_to_phys(vmx->nested.pi_desc_page) + - (unsigned long)(vmcs12->posted_intr_desc_addr & - (PAGE_SIZE - 1))); - } else + } else { exec_control &= ~PIN_BASED_POSTED_INTR; + } vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control); @@ -10196,26 +10213,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) exec_control |= vmcs12->secondary_vm_exec_control; - if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) { - /* - * If translation failed, no matter: This feature asks - * to exit when accessing the given address, and if it - * can never be accessed, this feature won't do - * anything anyway. - */ - if (!vmx->nested.apic_access_page) - exec_control &= - ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; - else - vmcs_write64(APIC_ACCESS_ADDR, - page_to_phys(vmx->nested.apic_access_page)); - } else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) && - cpu_need_virtualize_apic_accesses(&vmx->vcpu)) { - exec_control |= - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; - kvm_vcpu_reload_apic_access_page(vcpu); - } - if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) { vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0); @@ -10230,6 +10227,15 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, } nested_ept_enabled = (exec_control & SECONDARY_EXEC_ENABLE_EPT) != 0; + + /* + * Write an illegal value to APIC_ACCESS_ADDR. Later, + * nested_get_vmcs12_pages will either fix it up or + * remove the VM execution control. 
+ */ + if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) + vmcs_write64(APIC_ACCESS_ADDR, -1ull); + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); } @@ -10266,19 +10272,16 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, exec_control &= ~CPU_BASED_TPR_SHADOW; exec_control |= vmcs12->cpu_based_vm_exec_control; + /* + * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR. Later, if + * nested_get_vmcs12_pages can't fix it up, the illegal value + * will result in a VM entry failure. + */ if (exec_control & CPU_BASED_TPR_SHADOW) { - vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, - page_to_phys(vmx->nested.virtual_apic_page)); + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull); vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); } - if (cpu_has_vmx_msr_bitmap() && - exec_control & CPU_BASED_USE_MSR_BITMAPS && - nested_vmx_merge_msr_bitmap(vcpu, vmcs12)) - ; /* MSR_BITMAP will be set by following vmx_set_efer. */ - else - exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; - /* * Merging of IO bitmap not currently supported. * Rather, exit every time. @@ -10456,11 +10459,6 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) goto out; } - if (!nested_get_vmcs12_pages(vcpu, vmcs12)) { - nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); - goto out; - } - if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); goto out; @@ -10592,6 +10590,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) return 1; } + nested_get_vmcs12_pages(vcpu, vmcs12); + msr_entry_idx = nested_vmx_load_msr(vcpu, vmcs12->vm_entry_msr_load_addr, vmcs12->vm_entry_msr_load_count); -- cgit From ca0bde28f2ed66c2229ecfb7f4bfa0defa3da4b5 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Wed, 30 Nov 2016 12:03:46 -0800 Subject: kvm: nVMX: Split VMCS checks from nested_vmx_run() The checks performed on the contents of the vmcs12 are extracted from nested_vmx_run so that they can be used to validate a vmcs12 that has been restored from a checkpoint. Signed-off-by: Jim Mattson [Change prepare_vmcs02 and nested_vmx_load_cr3's last argument to u32, to match check_vmentry_postreqs. Update comments for singlestep handling. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 194 ++++++++++++++++++++++++++++------------------------- 1 file changed, 103 insertions(+), 91 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 650f34336fad..71df7411959f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -10035,7 +10035,7 @@ static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val) * is assigned to entry_failure_code on failure. */ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept, - unsigned long *entry_failure_code) + u32 *entry_failure_code) { if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) { if (!nested_cr3_valid(vcpu, cr3)) { @@ -10075,7 +10075,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne * is assigned to entry_failure_code on failure. 
*/ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, - bool from_vmentry, unsigned long *entry_failure_code) + bool from_vmentry, u32 *entry_failure_code) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 exec_control; @@ -10411,68 +10411,22 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, return 0; } -/* - * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 - * for running an L2 nested guest. - */ -static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) +static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { - struct vmcs12 *vmcs12; struct vcpu_vmx *vmx = to_vmx(vcpu); - int cpu; - struct loaded_vmcs *vmcs02; - bool ia32e; - u32 msr_entry_idx; - unsigned long exit_qualification; - - if (!nested_vmx_check_permission(vcpu)) - return 1; - - if (!nested_vmx_check_vmcs12(vcpu)) - goto out; - - vmcs12 = get_vmcs12(vcpu); - - if (enable_shadow_vmcs) - copy_shadow_to_vmcs12(vmx); - - /* - * The nested entry process starts with enforcing various prerequisites - * on vmcs12 as required by the Intel SDM, and act appropriately when - * they fail: As the SDM explains, some conditions should cause the - * instruction to fail, while others will cause the instruction to seem - * to succeed, but return an EXIT_REASON_INVALID_STATE. - * To speed up the normal (success) code path, we should avoid checking - * for misconfigurations which will anyway be caught by the processor - * when using the merged vmcs02. - */ - if (vmcs12->launch_state == launch) { - nested_vmx_failValid(vcpu, - launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS - : VMXERR_VMRESUME_NONLAUNCHED_VMCS); - goto out; - } if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && - vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) { - nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); - goto out; - } + vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) { - nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); - goto out; - } + if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - if (nested_vmx_check_apicv_controls(vcpu, vmcs12)) { - nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); - goto out; - } + if (nested_vmx_check_apicv_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) { - nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); - goto out; - } + if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, vmx->nested.nested_vmx_procbased_ctls_low, @@ -10489,28 +10443,30 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) !vmx_control_verify(vmcs12->vm_entry_controls, vmx->nested.nested_vmx_entry_ctls_low, vmx->nested.nested_vmx_entry_ctls_high)) - { - nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); - goto out; - } + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; if (!nested_host_cr0_valid(vcpu, vmcs12->host_cr0) || !nested_host_cr4_valid(vcpu, vmcs12->host_cr4) || - !nested_cr3_valid(vcpu, vmcs12->host_cr3)) { - nested_vmx_failValid(vcpu, - VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); - goto out; - } + !nested_cr3_valid(vcpu, vmcs12->host_cr3)) + return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD; + + return 0; +} + +static int check_vmentry_postreqs(struct kvm_vcpu 
*vcpu, struct vmcs12 *vmcs12, + u32 *exit_qual) +{ + bool ia32e; + + *exit_qual = ENTRY_FAIL_DEFAULT; if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) || - !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)) { - nested_vmx_entry_failure(vcpu, vmcs12, - EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); + !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)) return 1; - } - if (vmcs12->vmcs_link_pointer != -1ull) { - nested_vmx_entry_failure(vcpu, vmcs12, - EXIT_REASON_INVALID_STATE, ENTRY_FAIL_VMCS_LINK_PTR); + + if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS) && + vmcs12->vmcs_link_pointer != -1ull) { + *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR; return 1; } @@ -10523,16 +10479,14 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to * CR0.PG) is 1. */ - if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) { + if (to_vmx(vcpu)->nested.nested_run_pending && + (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) { ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0; if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) || ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) || ((vmcs12->guest_cr0 & X86_CR0_PG) && - ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))) { - nested_vmx_entry_failure(vcpu, vmcs12, - EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); + ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))) return 1; - } } /* @@ -10546,11 +10500,75 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0; if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) || ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) || - ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)) { - nested_vmx_entry_failure(vcpu, vmcs12, - EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); + ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)) return 1; - } + } + + return 0; +} + +/* + * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 + * for running an L2 nested guest. + */ +static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) +{ + struct vmcs12 *vmcs12; + struct vcpu_vmx *vmx = to_vmx(vcpu); + int cpu; + struct loaded_vmcs *vmcs02; + u32 msr_entry_idx; + u32 exit_qual; + int ret; + + if (!nested_vmx_check_permission(vcpu)) + return 1; + + if (!nested_vmx_check_vmcs12(vcpu)) + goto out; + + vmcs12 = get_vmcs12(vcpu); + + if (enable_shadow_vmcs) + copy_shadow_to_vmcs12(vmx); + + /* + * The nested entry process starts with enforcing various prerequisites + * on vmcs12 as required by the Intel SDM, and act appropriately when + * they fail: As the SDM explains, some conditions should cause the + * instruction to fail, while others will cause the instruction to seem + * to succeed, but return an EXIT_REASON_INVALID_STATE. + * To speed up the normal (success) code path, we should avoid checking + * for misconfigurations which will anyway be caught by the processor + * when using the merged vmcs02. + */ + if (vmcs12->launch_state == launch) { + nested_vmx_failValid(vcpu, + launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS + : VMXERR_VMRESUME_NONLAUNCHED_VMCS); + goto out; + } + + ret = check_vmentry_prereqs(vcpu, vmcs12); + if (ret) { + nested_vmx_failValid(vcpu, ret); + goto out; + } + + /* + * After this point, the trap flag no longer triggers a singlestep trap + * on the vm entry instructions; don't call kvm_skip_emulated_instruction. + * This is not 100% correct; for performance reasons, we delegate most + * of the checks on host state to the processor. 
If those fail, + * the singlestep trap is missed. + */ + skip_emulated_instruction(vcpu); + + ret = check_vmentry_postreqs(vcpu, vmcs12, &exit_qual); + if (ret) { + nested_vmx_entry_failure(vcpu, vmcs12, + EXIT_REASON_INVALID_STATE, exit_qual); + return 1; } /* @@ -10562,12 +10580,6 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) if (!vmcs02) return -ENOMEM; - /* - * After this point, the trap flag no longer triggers a singlestep trap - * on the vm entry instructions. Don't call - * kvm_skip_emulated_instruction. - */ - skip_emulated_instruction(vcpu); enter_guest_mode(vcpu); if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) @@ -10582,11 +10594,11 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) vmx_segment_cache_clear(vmx); - if (prepare_vmcs02(vcpu, vmcs12, true, &exit_qualification)) { + if (prepare_vmcs02(vcpu, vmcs12, true, &exit_qual)) { leave_guest_mode(vcpu); vmx_load_vmcs01(vcpu); nested_vmx_entry_failure(vcpu, vmcs12, - EXIT_REASON_INVALID_STATE, exit_qualification); + EXIT_REASON_INVALID_STATE, exit_qual); return 1; } @@ -10937,7 +10949,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct kvm_segment seg; - unsigned long entry_failure_code; + u32 entry_failure_code; if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) vcpu->arch.efer = vmcs12->host_ia32_efer; -- cgit From 858e25c06fb0bc4b39b2f01c14c990348e3a9b67 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Wed, 30 Nov 2016 12:03:47 -0800 Subject: kvm: nVMX: Refactor nested_vmx_run() Nested_vmx_run is split into two parts: the part that handles the VMLAUNCH/VMRESUME instruction, and the part that modifies the vcpu state to transition from VMX root mode to VMX non-root mode. The latter will be used when restoring the checkpointed state of a vCPU that was in VMX operation when a snapshot was taken. 
Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 111 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 62 insertions(+), 49 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 71df7411959f..bca60665d55d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -10507,6 +10507,65 @@ static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, return 0; } +static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + struct loaded_vmcs *vmcs02; + int cpu; + u32 msr_entry_idx; + u32 exit_qual; + + vmcs02 = nested_get_current_vmcs02(vmx); + if (!vmcs02) + return -ENOMEM; + + enter_guest_mode(vcpu); + + if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) + vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); + + cpu = get_cpu(); + vmx->loaded_vmcs = vmcs02; + vmx_vcpu_put(vcpu); + vmx_vcpu_load(vcpu, cpu); + vcpu->cpu = cpu; + put_cpu(); + + vmx_segment_cache_clear(vmx); + + if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual)) { + leave_guest_mode(vcpu); + vmx_load_vmcs01(vcpu); + nested_vmx_entry_failure(vcpu, vmcs12, + EXIT_REASON_INVALID_STATE, exit_qual); + return 1; + } + + nested_get_vmcs12_pages(vcpu, vmcs12); + + msr_entry_idx = nested_vmx_load_msr(vcpu, + vmcs12->vm_entry_msr_load_addr, + vmcs12->vm_entry_msr_load_count); + if (msr_entry_idx) { + leave_guest_mode(vcpu); + vmx_load_vmcs01(vcpu); + nested_vmx_entry_failure(vcpu, vmcs12, + EXIT_REASON_MSR_LOAD_FAIL, msr_entry_idx); + return 1; + } + + vmcs12->launch_state = 1; + + /* + * Note no nested_vmx_succeed or nested_vmx_fail here. At this point + * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet + * returned as far as L1 is concerned. It will only return (and set + * the success flag) when L2 exits (see nested_vmx_vmexit()). + */ + return 0; +} + /* * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 * for running an L2 nested guest. @@ -10515,9 +10574,6 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) { struct vmcs12 *vmcs12; struct vcpu_vmx *vmx = to_vmx(vcpu); - int cpu; - struct loaded_vmcs *vmcs02; - u32 msr_entry_idx; u32 exit_qual; int ret; @@ -10576,58 +10632,15 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) * the nested entry. 
*/ - vmcs02 = nested_get_current_vmcs02(vmx); - if (!vmcs02) - return -ENOMEM; - - enter_guest_mode(vcpu); - - if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) - vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); - - cpu = get_cpu(); - vmx->loaded_vmcs = vmcs02; - vmx_vcpu_put(vcpu); - vmx_vcpu_load(vcpu, cpu); - vcpu->cpu = cpu; - put_cpu(); - - vmx_segment_cache_clear(vmx); - - if (prepare_vmcs02(vcpu, vmcs12, true, &exit_qual)) { - leave_guest_mode(vcpu); - vmx_load_vmcs01(vcpu); - nested_vmx_entry_failure(vcpu, vmcs12, - EXIT_REASON_INVALID_STATE, exit_qual); - return 1; - } - - nested_get_vmcs12_pages(vcpu, vmcs12); - - msr_entry_idx = nested_vmx_load_msr(vcpu, - vmcs12->vm_entry_msr_load_addr, - vmcs12->vm_entry_msr_load_count); - if (msr_entry_idx) { - leave_guest_mode(vcpu); - vmx_load_vmcs01(vcpu); - nested_vmx_entry_failure(vcpu, vmcs12, - EXIT_REASON_MSR_LOAD_FAIL, msr_entry_idx); - return 1; - } - - vmcs12->launch_state = 1; + ret = enter_vmx_non_root_mode(vcpu, true); + if (ret) + return ret; if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) return kvm_vcpu_halt(vcpu); vmx->nested.nested_run_pending = 1; - /* - * Note no nested_vmx_succeed or nested_vmx_fail here. At this point - * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet - * returned as far as L1 is concerned. It will only return (and set - * the success flag) when L2 exits (see nested_vmx_vmexit()). - */ return 1; out: -- cgit From 681bcea802f2bf0a28cdf0fd0f813de7bb5cd3c7 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 24 Jan 2017 22:21:16 +0100 Subject: KVM: svm: inititalize hash table structures directly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The hashtable and guarding spinlock are global data structures, we can inititalize them statically. Signed-off-by: David Hildenbrand Message-Id: <20170124212116.4568-1-david@redhat.com> Reviewed-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 13cd06220b19..4e5905a1ce70 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -971,8 +971,8 @@ static void svm_disable_lbrv(struct vcpu_svm *svm) * a particular vCPU. */ #define SVM_VM_DATA_HASH_BITS 8 -DECLARE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS); -static spinlock_t svm_vm_data_hash_lock; +static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS); +static DEFINE_SPINLOCK(svm_vm_data_hash_lock); /* Note: * This function is called from IOMMU driver to notify @@ -1077,8 +1077,6 @@ static __init int svm_hardware_setup(void) } else { pr_info("AVIC enabled\n"); - hash_init(svm_vm_data_hash); - spin_lock_init(&svm_vm_data_hash_lock); amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); } } -- cgit From 47512cfd0d7a8bd6ab71d01cd89fca19eb2093eb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 15 Feb 2017 11:11:50 +0100 Subject: x86/platform/goldfish: Prevent unconditional loading The goldfish platform code registers the platform device unconditionally which causes havoc in several ways if the goldfish_pdev_bus driver is enabled: - Access to the hardcoded physical memory region, which is either not available or contains stuff which is completely unrelated. 
- Prevents the interrupt of the serial port from being requested - In case of a spurious interrupt it goes into an infinite loop in the interrupt handler of the pdev_bus driver (which needs to be fixed separately). Add a 'goldfish' command line option to make the registration opt-in when the platform is compiled in. I'm seriously grumpy about this engineering trainwreck, which has seven SOBs from Intel developers for 50 lines of code. And none of them figured out that this is broken. Impressive fail! Fixes: ddd70cf93d78 ("goldfish: platform device for x86") Reported-by: Gabriel C Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Acked-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- arch/x86/platform/goldfish/goldfish.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/goldfish/goldfish.c b/arch/x86/platform/goldfish/goldfish.c index 1693107a518e..0d17c0aafeb1 100644 --- a/arch/x86/platform/goldfish/goldfish.c +++ b/arch/x86/platform/goldfish/goldfish.c @@ -42,10 +42,22 @@ static struct resource goldfish_pdev_bus_resources[] = { } }; +static bool goldfish_enable __initdata; + +static int __init goldfish_setup(char *str) +{ + goldfish_enable = true; + return 0; +} +__setup("goldfish", goldfish_setup); + static int __init goldfish_init(void) { + if (!goldfish_enable) + return -ENODEV; + platform_device_register_simple("goldfish_pdev_bus", -1, - goldfish_pdev_bus_resources, 2); + goldfish_pdev_bus_resources, 2); return 0; } device_initcall(goldfish_init); -- cgit From d48085f0716f195ee7432de2dd110e2093c40fd5 Mon Sep 17 00:00:00 2001 From: "travis@sgi.com" Date: Tue, 14 Feb 2017 18:11:29 -0600 Subject: x86/platform/UV/NMI: Fix unnecessary kABI breakage The addition of support for UV Hubless systems unnecessarily broke the kABI for a symbol that is not used by external kernel modules. Remove the symbol from the EXPORT list. Signed-off-by: Mike Travis Reviewed-by: Russ Anderson Link: http://lkml.kernel.org/r/20170215001129.068078379@asylum.americas.sgi.com Signed-off-by: Thomas Gleixner --- arch/x86/platform/uv/uv_nmi.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c index 0ecd7bf7d2d3..9743d0ccfec6 100644 --- a/arch/x86/platform/uv/uv_nmi.c +++ b/arch/x86/platform/uv/uv_nmi.c @@ -65,7 +65,6 @@ static struct uv_hub_nmi_s **uv_hub_nmi_list; DEFINE_PER_CPU(struct uv_cpu_nmi_s, uv_cpu_nmi); -EXPORT_PER_CPU_SYMBOL_GPL(uv_cpu_nmi); /* UV hubless values */ #define NMI_CONTROL_PORT 0x70 -- cgit From 3bba73b1b7a88d88c3ea16b7914c13d475e4a87b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 16 Feb 2017 15:12:04 +0100 Subject: x86/cpufeature: Move RING3MWAIT feature to avoid conflicts The original feature bit is used in a different branch already. Move it to scattered bits. 
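Relocating the bit between feature words is transparent to consumers; assuming the usual cpufeature helpers, code testing the flag stays the same (the consumer function below is made up for illustration):

	if (boot_cpu_has(X86_FEATURE_RING3MWAIT))
		enable_ring3mwait();	/* hypothetical consumer */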
Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/cpufeatures.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 56e5184514c6..43c4ea9cd907 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -100,7 +100,7 @@ #define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* cpu topology enum extensions */ #define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */ #define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */ -#define X86_FEATURE_RING3MWAIT ( 3*32+25) /* ring 3 MONITOR/MWAIT */ +/* free, was #define X86_FEATURE_CLFLUSH_MONITOR ( 3*32+25) * "" clflush reqd with monitor */ #define X86_FEATURE_EXTD_APICID ( 3*32+26) /* has extended APICID (8 bits) */ #define X86_FEATURE_AMD_DCM ( 3*32+27) /* multi-node processor */ #define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */ @@ -186,7 +186,7 @@ * * Reuse free bits when adding new feature flags! */ - +#define X86_FEATURE_RING3MWAIT ( 7*32+ 0) /* Ring 3 MONITOR/MWAIT */ #define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ #define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ #define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */ -- cgit From 47c0152e0f8bd325869417e0aaff032e62bcf6f2 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 19 Dec 2016 11:44:07 +0100 Subject: KVM: VMX: use vmcs_set/clear_bits for CPU-based execution controls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 30 ++++++++---------------------- 1 file changed, 8 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index bca60665d55d..0e0b5d09597e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5459,26 +5459,20 @@ static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu) static void enable_irq_window(struct kvm_vcpu *vcpu) { - u32 cpu_based_vm_exec_control; - - cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); - cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, + CPU_BASED_VIRTUAL_INTR_PENDING); } static void enable_nmi_window(struct kvm_vcpu *vcpu) { - u32 cpu_based_vm_exec_control; - if (!cpu_has_virtual_nmis() || vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { enable_irq_window(vcpu); return; } - cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); - cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, + CPU_BASED_VIRTUAL_NMI_PENDING); } static void vmx_inject_irq(struct kvm_vcpu *vcpu) @@ -6137,12 +6131,8 @@ static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) static int handle_interrupt_window(struct kvm_vcpu *vcpu) { - u32 cpu_based_vm_exec_control; - - /* clear pending irq */ - cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); - cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, + CPU_BASED_VIRTUAL_INTR_PENDING); kvm_make_request(KVM_REQ_EVENT, vcpu); @@ -6408,12 +6398,8 @@ static int 
handle_ept_misconfig(struct kvm_vcpu *vcpu) static int handle_nmi_window(struct kvm_vcpu *vcpu) { - u32 cpu_based_vm_exec_control; - - /* clear pending NMI */ - cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); - cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, + CPU_BASED_VIRTUAL_NMI_PENDING); ++vcpu->stat.nmi_window_exits; kvm_make_request(KVM_REQ_EVENT, vcpu); -- cgit From bbd6411513aa8ef3ea02abab61318daf87c1af1e Mon Sep 17 00:00:00 2001 From: "Cao, Lei" Date: Fri, 3 Feb 2017 20:04:35 +0000 Subject: KVM: Support vCPU-based gfn->hva cache MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Provide versions of struct gfn_to_hva_cache functions that take vcpu as a parameter instead of struct kvm. The existing functions are not needed anymore, so delete them. This allows dirty pages to be logged in the vcpu dirty ring, instead of the global dirty ring, for ring-based dirty memory tracking. Signed-off-by: Lei Cao Message-Id: Reviewed-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 22 ++++++++++------------ arch/x86/kvm/x86.c | 41 ++++++++++++++++++++--------------------- 2 files changed, 30 insertions(+), 33 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 9fa5b8164961..bad6a25067bc 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -529,16 +529,14 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val) { - - return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &val, - sizeof(val)); + return kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.pv_eoi.data, &val, + sizeof(val)); } static int pv_eoi_get_user(struct kvm_vcpu *vcpu, u8 *val) { - - return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, val, - sizeof(*val)); + return kvm_vcpu_read_guest_cached(vcpu, &vcpu->arch.pv_eoi.data, val, + sizeof(*val)); } static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu) @@ -2287,8 +2285,8 @@ void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) return; - if (kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data, - sizeof(u32))) + if (kvm_vcpu_read_guest_cached(vcpu, &vcpu->arch.apic->vapic_cache, &data, + sizeof(u32))) return; apic_set_tpr(vcpu->arch.apic, data & 0xff); @@ -2340,14 +2338,14 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu) max_isr = 0; data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24); - kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data, - sizeof(u32)); + kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.apic->vapic_cache, &data, + sizeof(u32)); } int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr) { if (vapic_addr) { - if (kvm_gfn_to_hva_cache_init(vcpu->kvm, + if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.apic->vapic_cache, vapic_addr, sizeof(u32))) return -EINVAL; @@ -2441,7 +2439,7 @@ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data) vcpu->arch.pv_eoi.msr_val = data; if (!pv_eoi_enabled(vcpu)) return 0; - return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data, + return kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.pv_eoi.data, addr, sizeof(u8)); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 63a89a51dcc9..0aa8db229e0a 100644 --- 
a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1811,7 +1811,7 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v) struct kvm_vcpu_arch *vcpu = &v->arch; struct pvclock_vcpu_time_info guest_hv_clock; - if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time, + if (unlikely(kvm_vcpu_read_guest_cached(v, &vcpu->pv_time, &guest_hv_clock, sizeof(guest_hv_clock)))) return; @@ -1832,9 +1832,9 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v) BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0); vcpu->hv_clock.version = guest_hv_clock.version + 1; - kvm_write_guest_cached(v->kvm, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock.version)); + kvm_vcpu_write_guest_cached(v, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock.version)); smp_wmb(); @@ -1848,16 +1848,16 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v) trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock); - kvm_write_guest_cached(v->kvm, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock)); + kvm_vcpu_write_guest_cached(v, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock)); smp_wmb(); vcpu->hv_clock.version++; - kvm_write_guest_cached(v->kvm, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock.version)); + kvm_vcpu_write_guest_cached(v, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock.version)); } static int kvm_guest_time_update(struct kvm_vcpu *v) @@ -2090,7 +2090,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) return 0; } - if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa, + if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.apf.data, gpa, sizeof(u32))) return 1; @@ -2109,7 +2109,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu) if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; - if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, + if (unlikely(kvm_vcpu_read_guest_cached(vcpu, &vcpu->arch.st.stime, &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) return; @@ -2120,7 +2120,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu) vcpu->arch.st.steal.version += 1; - kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, + kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime, &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); smp_wmb(); @@ -2129,14 +2129,14 @@ static void record_steal_time(struct kvm_vcpu *vcpu) vcpu->arch.st.last_steal; vcpu->arch.st.last_steal = current->sched_info.run_delay; - kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, + kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime, &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); smp_wmb(); vcpu->arch.st.steal.version += 1; - kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, + kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime, &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); } @@ -2241,7 +2241,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!(data & 1)) break; - if (kvm_gfn_to_hva_cache_init(vcpu->kvm, + if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.pv_time, data & ~1ULL, sizeof(struct pvclock_vcpu_time_info))) vcpu->arch.pv_time_enabled = false; @@ -2262,7 +2262,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (data & KVM_STEAL_RESERVED_MASK) return 1; - if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime, + if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.st.stime, data & KVM_STEAL_VALID_BITS, sizeof(struct kvm_steal_time))) return 1; @@ -2875,7 +2875,7 @@ static void 
kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) vcpu->arch.st.steal.preempted = 1; - kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime, + kvm_vcpu_write_guest_offset_cached(vcpu, &vcpu->arch.st.stime, &vcpu->arch.st.steal.preempted, offsetof(struct kvm_steal_time, preempted), sizeof(vcpu->arch.st.steal.preempted)); @@ -8533,9 +8533,8 @@ static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) static int apf_put_user(struct kvm_vcpu *vcpu, u32 val) { - - return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val, - sizeof(val)); + return kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.apf.data, &val, + sizeof(val)); } void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, -- cgit From 243b72aae28ca1032284028323bb81c9235b15c9 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Tue, 14 Feb 2017 13:08:38 +0300 Subject: x86/mm/ptdump: Optimize check for W+X mappings for CONFIG_KASAN=y Enabling both DEBUG_WX=y and KASAN=y options significantly increases boot time (dozens of seconds at least). KASAN fills kernel page tables with repeated values to map several TBs of the virtual memory to the single kasan_zero_page: kasan_zero_pud -> kasan_zero_pmd-> kasan_zero_pte-> kasan_zero_page So, the page table walker used to find W+X mapping check the same kasan_zero_p?d page table entries a lot more than once. With patch pud walker will skip the pud if it has the same value as the previous one . Skipping done iff we search for W+X mappings, so this optimization won't affect the page table dump via debugfs. This dropped time spend in W+X check from ~30 sec to reasonable 0.1 sec: Before: [ 4.579991] Freeing unused kernel memory: 1000K [ 35.257523] x86/mm: Checked W+X mappings: passed, no W+X pages found. After: [ 5.138756] Freeing unused kernel memory: 1000K [ 5.266496] x86/mm: Checked W+X mappings: passed, no W+X pages found. Signed-off-by: Andrey Ryabinin Reviewed-by: Alexander Potapenko Cc: Mark Rutland Cc: kasan-dev@googlegroups.com Cc: Tobias Regnery Cc: Dmitry Vyukov Link: http://lkml.kernel.org/r/20170214100839.17186-1-aryabinin@virtuozzo.com Signed-off-by: Thomas Gleixner --- arch/x86/mm/dump_pagetables.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 8aa6bea1cd6c..08135341798c 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -327,18 +327,31 @@ static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, #if PTRS_PER_PUD > 1 +/* + * This is an optimization for CONFIG_DEBUG_WX=y + CONFIG_KASAN=y + * KASAN fills page tables with the same values. Since there is no + * point in checking page table more than once we just skip repeated + * entries. This saves us dozens of seconds during boot. 
+ */ +static bool pud_already_checked(pud_t *prev_pud, pud_t *pud, bool checkwx) +{ + return checkwx && prev_pud && (pud_val(*prev_pud) == pud_val(*pud)); +} + static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr, unsigned long P) { int i; pud_t *start; pgprotval_t prot; + pud_t *prev_pud = NULL; start = (pud_t *) pgd_page_vaddr(addr); for (i = 0; i < PTRS_PER_PUD; i++) { st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT); - if (!pud_none(*start)) { + if (!pud_none(*start) && + !pud_already_checked(prev_pud, start, st->check_wx)) { if (pud_large(*start) || !pud_present(*start)) { prot = pud_flags(*start); note_page(m, st, __pgprot(prot), 2); @@ -349,6 +362,7 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr, } else note_page(m, st, __pgprot(0), 2); + prev_pud = start; start++; } } -- cgit From 025205f8f30c6ab52b69bf34fb359ac80360fefd Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Tue, 14 Feb 2017 13:08:39 +0300 Subject: x86/mm/ptdump: Add address marker for KASAN shadow region Annotate the KASAN shadow with address markers in page table dump output: $ cat /sys/kernel/debug/kernel_page_tables ... ---[ Vmemmap ]--- 0xffffea0000000000-0xffffea0003000000 48M RW PSE GLB NX pmd 0xffffea0003000000-0xffffea0004000000 16M pmd 0xffffea0004000000-0xffffea0005000000 16M RW PSE GLB NX pmd 0xffffea0005000000-0xffffea0040000000 944M pmd 0xffffea0040000000-0xffffea8000000000 511G pud 0xffffea8000000000-0xffffec0000000000 1536G pgd ---[ KASAN shadow ]--- 0xffffec0000000000-0xffffed0000000000 1T ro GLB NX pte 0xffffed0000000000-0xffffed0018000000 384M RW PSE GLB NX pmd 0xffffed0018000000-0xffffed0020000000 128M pmd 0xffffed0020000000-0xffffed0028200000 130M RW PSE GLB NX pmd 0xffffed0028200000-0xffffed0040000000 382M pmd 0xffffed0040000000-0xffffed8000000000 511G pud 0xffffed8000000000-0xfffff50000000000 7680G pgd 0xfffff50000000000-0xfffffbfff0000000 7339776M ro GLB NX pte 0xfffffbfff0000000-0xfffffbfff0200000 2M pmd 0xfffffbfff0200000-0xfffffbfff0a00000 8M RW PSE GLB NX pmd 0xfffffbfff0a00000-0xfffffbffffe00000 244M pmd 0xfffffbffffe00000-0xfffffc0000000000 2M ro GLB NX pte ---[ KASAN shadow end ]--- 0xfffffc0000000000-0xffffff0000000000 3T pgd ---[ ESPfix Area ]--- ... 
Signed-off-by: Andrey Ryabinin Reviewed-by: Alexander Potapenko Cc: Mark Rutland Cc: kasan-dev@googlegroups.com Cc: Tobias Regnery Cc: Andrey Ryabinin Cc: Dmitry Vyukov Link: http://lkml.kernel.org/r/20170214100839.17186-2-aryabinin@virtuozzo.com Signed-off-by: Thomas Gleixner --- arch/x86/mm/dump_pagetables.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 08135341798c..58b5bee7ea27 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -18,6 +18,7 @@ #include #include +#include #include /* @@ -51,6 +52,10 @@ enum address_markers_idx { LOW_KERNEL_NR, VMALLOC_START_NR, VMEMMAP_START_NR, +#ifdef CONFIG_KASAN + KASAN_SHADOW_START_NR, + KASAN_SHADOW_END_NR, +#endif # ifdef CONFIG_X86_ESPFIX64 ESPFIX_START_NR, # endif @@ -76,6 +81,10 @@ static struct addr_marker address_markers[] = { { 0/* PAGE_OFFSET */, "Low Kernel Mapping" }, { 0/* VMALLOC_START */, "vmalloc() Area" }, { 0/* VMEMMAP_START */, "Vmemmap" }, +#ifdef CONFIG_KASAN + { KASAN_SHADOW_START, "KASAN shadow" }, + { KASAN_SHADOW_END, "KASAN shadow end" }, +#endif # ifdef CONFIG_X86_ESPFIX64 { ESPFIX_BASE_ADDR, "ESPfix Area", 16 }, # endif -- cgit From 460df4c1fc7c00829050c08d6368dc6e6beef307 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 8 Feb 2017 11:50:15 +0100 Subject: KVM: race-free exit from KVM_RUN without POSIX signals MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The purpose of the KVM_SET_SIGNAL_MASK API is to let userspace "kick" a VCPU out of KVM_RUN through a POSIX signal. A signal is attached to a dummy signal handler; by blocking the signal outside KVM_RUN and unblocking it inside, this possible race is closed: VCPU thread service thread -------------------------------------------------------------- check flag set flag raise signal (signal handler does nothing) KVM_RUN However, one issue with KVM_SET_SIGNAL_MASK is that it has to take tsk->sighand->siglock on every KVM_RUN. This lock is often on a remote NUMA node, because it is on the node of a thread's creator. Taking this lock can be very expensive if there are many userspace exits (as is the case for SMP Windows VMs without Hyper-V reference time counter). 
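For illustration, a hedged sketch of the userspace side of the alternative described next; it relies on the immediate_exit field this series adds to struct kvm_run, and the fd and mapping variables are assumptions:

	#include <errno.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	static struct kvm_run *run;	/* mmap()ed from the vCPU fd elsewhere */

	/* installed (e.g. via sigaction) for the signal the service thread sends */
	static void kick_handler(int sig)
	{
		run->immediate_exit = 1;
	}

	static int run_vcpu_once(int vcpu_fd)
	{
		int r = ioctl(vcpu_fd, KVM_RUN, 0);

		if (r < 0 && errno == EINTR) {
			/*
			 * Either the signal arrived while KVM_RUN was running,
			 * or it arrived first and immediate_exit made the ioctl
			 * return at once; both cases look the same here, which
			 * is what closes the race.
			 */
			run->immediate_exit = 0;
			return 0;	/* service the kick, then call KVM_RUN again */
		}
		return r;		/* 0: inspect run->exit_reason as usual */
	}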
As an alternative, we can put the flag directly in kvm_run so that KVM can see it: VCPU thread service thread -------------------------------------------------------------- raise signal signal handler set run->immediate_exit KVM_RUN check run->immediate_exit Reviewed-by: Radim Krčmář Reviewed-by: David Hildenbrand Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0aa8db229e0a..8d3047c8cce7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2672,6 +2672,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_DISABLE_QUIRKS: case KVM_CAP_SET_BOOT_CPU_ID: case KVM_CAP_SPLIT_IRQCHIP: + case KVM_CAP_IMMEDIATE_EXIT: #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT case KVM_CAP_ASSIGN_DEV_IRQ: case KVM_CAP_PCI_2_3: @@ -7202,7 +7203,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } else WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed); - r = vcpu_run(vcpu); + if (kvm_run->immediate_exit) + r = -EINTR; + else + r = vcpu_run(vcpu); out: post_kvm_run_save(vcpu); -- cgit From bd7e5b0899a429445cc6e3037c13f8b5ae3be903 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 3 Feb 2017 21:18:52 -0800 Subject: KVM: x86: remove code for lazy FPU handling The FPU is always active now when running KVM. Reviewed-by: David Matlack Reviewed-by: Bandan Das Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 -- arch/x86/kvm/cpuid.c | 2 - arch/x86/kvm/svm.c | 43 ++------------- arch/x86/kvm/vmx.c | 112 ++++++---------------------------------- arch/x86/kvm/x86.c | 7 +-- 5 files changed, 19 insertions(+), 148 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e4f13e714bcf..74ef58c8ff53 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -55,7 +55,6 @@ #define KVM_REQ_TRIPLE_FAULT 10 #define KVM_REQ_MMU_SYNC 11 #define KVM_REQ_CLOCK_UPDATE 12 -#define KVM_REQ_DEACTIVATE_FPU 13 #define KVM_REQ_EVENT 14 #define KVM_REQ_APF_HALT 15 #define KVM_REQ_STEAL_UPDATE 16 @@ -936,8 +935,6 @@ struct kvm_x86_ops { unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); u32 (*get_pkru)(struct kvm_vcpu *vcpu); - void (*fpu_activate)(struct kvm_vcpu *vcpu); - void (*fpu_deactivate)(struct kvm_vcpu *vcpu); void (*tlb_flush)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index c0e2036217ad..1d155cc56629 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -123,8 +123,6 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) if (best && (best->eax & (F(XSAVES) | F(XSAVEC)))) best->ebx = xstate_required_size(vcpu->arch.xcr0, true); - kvm_x86_ops->fpu_activate(vcpu); - /* * The existing code assumes virtual address is 48-bit in the canonical * address checks; exit if it is ever changed. 
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 4e5905a1ce70..d1efe2c62b3f 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1157,7 +1157,6 @@ static void init_vmcb(struct vcpu_svm *svm) struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb_save_area *save = &svm->vmcb->save; - svm->vcpu.fpu_active = 1; svm->vcpu.arch.hflags = 0; set_cr_intercept(svm, INTERCEPT_CR0_READ); @@ -1899,15 +1898,12 @@ static void update_cr0_intercept(struct vcpu_svm *svm) ulong gcr0 = svm->vcpu.arch.cr0; u64 *hcr0 = &svm->vmcb->save.cr0; - if (!svm->vcpu.fpu_active) - *hcr0 |= SVM_CR0_SELECTIVE_MASK; - else - *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK) - | (gcr0 & SVM_CR0_SELECTIVE_MASK); + *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK) + | (gcr0 & SVM_CR0_SELECTIVE_MASK); mark_dirty(svm->vmcb, VMCB_CR); - if (gcr0 == *hcr0 && svm->vcpu.fpu_active) { + if (gcr0 == *hcr0) { clr_cr_intercept(svm, INTERCEPT_CR0_READ); clr_cr_intercept(svm, INTERCEPT_CR0_WRITE); } else { @@ -1938,8 +1934,6 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if (!npt_enabled) cr0 |= X86_CR0_PG | X86_CR0_WP; - if (!vcpu->fpu_active) - cr0 |= X86_CR0_TS; /* * re-enable caching here because the QEMU bios * does not do it - this results in some delay at @@ -2158,22 +2152,6 @@ static int ac_interception(struct vcpu_svm *svm) return 1; } -static void svm_fpu_activate(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - clr_exception_intercept(svm, NM_VECTOR); - - svm->vcpu.fpu_active = 1; - update_cr0_intercept(svm); -} - -static int nm_interception(struct vcpu_svm *svm) -{ - svm_fpu_activate(&svm->vcpu); - return 1; -} - static bool is_erratum_383(void) { int err, i; @@ -2571,9 +2549,6 @@ static int nested_svm_exit_special(struct vcpu_svm *svm) if (!npt_enabled && svm->apf_reason == 0) return NESTED_EXIT_HOST; break; - case SVM_EXIT_EXCP_BASE + NM_VECTOR: - nm_interception(svm); - break; default: break; } @@ -4018,7 +3993,6 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, - [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception, [SVM_EXIT_EXCP_BASE + AC_VECTOR] = ac_interception, [SVM_EXIT_INTR] = intr_interception, @@ -5072,14 +5046,6 @@ static bool svm_has_wbinvd_exit(void) return true; } -static void svm_fpu_deactivate(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - set_exception_intercept(svm, NM_VECTOR); - update_cr0_intercept(svm); -} - #define PRE_EX(exit) { .exit_code = (exit), \ .stage = X86_ICPT_PRE_EXCEPT, } #define POST_EX(exit) { .exit_code = (exit), \ @@ -5340,9 +5306,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .get_pkru = svm_get_pkru, - .fpu_activate = svm_fpu_activate, - .fpu_deactivate = svm_fpu_deactivate, - .tlb_flush = svm_flush_tlb, .run = svm_vcpu_run, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 0e0b5d09597e..9856b73a21ad 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1856,7 +1856,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) u32 eb; eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) | - (1u << NM_VECTOR) | (1u << DB_VECTOR) | (1u << AC_VECTOR); + (1u << DB_VECTOR) | (1u << AC_VECTOR); if ((vcpu->guest_debug & (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) @@ -1865,8 +1865,6 @@ static 
void update_exception_bitmap(struct kvm_vcpu *vcpu) eb = ~0; if (enable_ept) eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ - if (vcpu->fpu_active) - eb &= ~(1u << NM_VECTOR); /* When we are running a nested L2 guest and L1 specified for it a * certain exception bitmap, we must trap the same exceptions and pass @@ -2340,25 +2338,6 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu) } } -static void vmx_fpu_activate(struct kvm_vcpu *vcpu) -{ - ulong cr0; - - if (vcpu->fpu_active) - return; - vcpu->fpu_active = 1; - cr0 = vmcs_readl(GUEST_CR0); - cr0 &= ~(X86_CR0_TS | X86_CR0_MP); - cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP); - vmcs_writel(GUEST_CR0, cr0); - update_exception_bitmap(vcpu); - vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; - if (is_guest_mode(vcpu)) - vcpu->arch.cr0_guest_owned_bits &= - ~get_vmcs12(vcpu)->cr0_guest_host_mask; - vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); -} - static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); /* @@ -2377,33 +2356,6 @@ static inline unsigned long nested_read_cr4(struct vmcs12 *fields) (fields->cr4_read_shadow & fields->cr4_guest_host_mask); } -static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) -{ - /* Note that there is no vcpu->fpu_active = 0 here. The caller must - * set this *before* calling this function. - */ - vmx_decache_cr0_guest_bits(vcpu); - vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP); - update_exception_bitmap(vcpu); - vcpu->arch.cr0_guest_owned_bits = 0; - vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); - if (is_guest_mode(vcpu)) { - /* - * L1's specified read shadow might not contain the TS bit, - * so now that we turned on shadowing of this bit, we need to - * set this bit of the shadow. Like in nested_vmx_run we need - * nested_read_cr0(vmcs12), but vmcs12->guest_cr0 is not yet - * up-to-date here because we just decached cr0.TS (and we'll - * only update vmcs12->guest_cr0 on nested exit). 
- */ - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - vmcs12->guest_cr0 = (vmcs12->guest_cr0 & ~X86_CR0_TS) | - (vcpu->arch.cr0 & X86_CR0_TS); - vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); - } else - vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); -} - static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) { unsigned long rflags, save_rflags; @@ -4232,9 +4184,6 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if (enable_ept) ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); - if (!vcpu->fpu_active) - hw_cr0 |= X86_CR0_TS | X86_CR0_MP; - vmcs_writel(CR0_READ_SHADOW, cr0); vmcs_writel(GUEST_CR0, hw_cr0); vcpu->arch.cr0 = cr0; @@ -5321,7 +5270,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) /* 22.2.1, 20.8.1 */ vm_entry_controls_init(vmx, vmcs_config.vmentry_ctrl); - vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); + vmx->vcpu.arch.cr0_guest_owned_bits = X86_CR0_TS; + vmcs_writel(CR0_GUEST_HOST_MASK, ~X86_CR0_TS); + set_cr4_guest_host_mask(vmx); if (vmx_xsaves_supported()) @@ -5425,7 +5376,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmx_set_cr0(vcpu, cr0); /* enter rmode */ vmx_set_cr4(vcpu, 0); vmx_set_efer(vcpu, 0); - vmx_fpu_activate(vcpu); + update_exception_bitmap(vcpu); vpid_sync_context(vmx->vpid); @@ -5698,11 +5649,6 @@ static int handle_exception(struct kvm_vcpu *vcpu) if (is_nmi(intr_info)) return 1; /* already handled by vmx_vcpu_run() */ - if (is_no_device(intr_info)) { - vmx_fpu_activate(vcpu); - return 1; - } - if (is_invalid_opcode(intr_info)) { if (is_guest_mode(vcpu)) { kvm_queue_exception(vcpu, UD_VECTOR); @@ -5892,22 +5838,6 @@ static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) return kvm_set_cr4(vcpu, val); } -/* called to set cr0 as appropriate for clts instruction exit. */ -static void handle_clts(struct kvm_vcpu *vcpu) -{ - if (is_guest_mode(vcpu)) { - /* - * We get here when L2 did CLTS, and L1 didn't shadow CR0.TS - * but we did (!fpu_active). We need to keep GUEST_CR0.TS on, - * just pretend it's off (also in arch.cr0 for fpu_activate). - */ - vmcs_writel(CR0_READ_SHADOW, - vmcs_readl(CR0_READ_SHADOW) & ~X86_CR0_TS); - vcpu->arch.cr0 &= ~X86_CR0_TS; - } else - vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); -} - static int handle_cr(struct kvm_vcpu *vcpu) { unsigned long exit_qualification, val; @@ -5953,9 +5883,9 @@ static int handle_cr(struct kvm_vcpu *vcpu) } break; case 2: /* clts */ - handle_clts(vcpu); + WARN_ONCE(1, "Guest should always own CR0.TS"); + vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); - vmx_fpu_activate(vcpu); return kvm_skip_emulated_instruction(vcpu); case 1: /*mov from cr*/ switch (cr) { @@ -10349,8 +10279,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, } /* - * This sets GUEST_CR0 to vmcs12->guest_cr0, with possibly a modified - * TS bit (for lazy fpu) and bits which we consider mandatory enabled. + * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those + * bits which we consider mandatory enabled. 
* The CR0_READ_SHADOW is what L2 should have expected to read given * the specifications by L1; It's not enough to take * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we we @@ -10963,24 +10893,15 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vmx_set_rflags(vcpu, X86_EFLAGS_FIXED); /* * Note that calling vmx_set_cr0 is important, even if cr0 hasn't - * actually changed, because it depends on the current state of - * fpu_active (which may have changed). - * Note that vmx_set_cr0 refers to efer set above. + * actually changed, because vmx_set_cr0 refers to efer set above. + * + * CR0_GUEST_HOST_MASK is already set in the original vmcs01 + * (KVM doesn't change it); */ + vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; vmx_set_cr0(vcpu, vmcs12->host_cr0); - /* - * If we did fpu_activate()/fpu_deactivate() during L2's run, we need - * to apply the same changes to L1's vmcs. We just set cr0 correctly, - * but we also need to update cr0_guest_host_mask and exception_bitmap. - */ - update_exception_bitmap(vcpu); - vcpu->arch.cr0_guest_owned_bits = (vcpu->fpu_active ? X86_CR0_TS : 0); - vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); - /* - * Note that CR4_GUEST_HOST_MASK is already set in the original vmcs01 - * (KVM doesn't change it)- no reason to call set_cr4_guest_host_mask(); - */ + /* Same as above - no reason to call set_cr4_guest_host_mask(). */ vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); kvm_set_cr4(vcpu, vmcs12->host_cr4); @@ -11609,9 +11530,6 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .get_pkru = vmx_get_pkru, - .fpu_activate = vmx_fpu_activate, - .fpu_deactivate = vmx_fpu_deactivate, - .tlb_flush = vmx_flush_tlb, .run = vmx_vcpu_run, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8d3047c8cce7..c48404017e4f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6751,10 +6751,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) r = 0; goto out; } - if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) { - vcpu->fpu_active = 0; - kvm_x86_ops->fpu_deactivate(vcpu); - } if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) { /* Page is swapped out. Do synthetic halt */ vcpu->arch.apf.halted = true; @@ -6856,8 +6852,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) preempt_disable(); kvm_x86_ops->prepare_guest_switch(vcpu); - if (vcpu->fpu_active) - kvm_load_guest_fpu(vcpu); + kvm_load_guest_fpu(vcpu); /* * Disable IRQs before setting IN_GUEST_MODE. Posted interrupt -- cgit From 9383191da4e40360a5d880fbe6bb03911c61621b Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 16 Feb 2017 22:24:49 +0100 Subject: bpf: remove stubs for cBPF from arch code Remove the dummy bpf_jit_compile() stubs for eBPF JITs and make that a single __weak function in the core that can be overridden similarly to the eBPF one. Also remove stale pr_err() mentions of bpf_jit_compile. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- arch/x86/net/bpf_jit_comp.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index bb660e53cbd6..26123d0ae13a 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1067,13 +1067,13 @@ common_load: ilen = prog - temp; if (ilen > BPF_MAX_INSN_SIZE) { - pr_err("bpf_jit_compile fatal insn size error\n"); + pr_err("bpf_jit: fatal insn size error\n"); return -EFAULT; } if (image) { if (unlikely(proglen + ilen > oldproglen)) { - pr_err("bpf_jit_compile fatal error\n"); + pr_err("bpf_jit: fatal error\n"); return -EFAULT; } memcpy(image + proglen, temp, ilen); @@ -1085,10 +1085,6 @@ common_load: return proglen; } -void bpf_jit_compile(struct bpf_prog *prog) -{ -} - struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) { struct bpf_binary_header *header = NULL; -- cgit From 74451e66d516c55e309e8d89a4a1e7596e46aacd Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 16 Feb 2017 22:24:50 +0100 Subject: bpf: make jited programs visible in traces Long standing issue with JITed programs is that stack traces from function tracing check whether a given address is kernel code through {__,}kernel_text_address(), which checks for code in core kernel, modules and dynamically allocated ftrace trampolines. But what is still missing is BPF JITed programs (interpreted programs are not an issue as __bpf_prog_run() will be attributed to them), thus when a stack trace is triggered, the code walking the stack won't see any of the JITed ones. The same for address correlation done from user space via reading /proc/kallsyms. This is read by tools like perf, but the latter is also useful for permanent live tracing with eBPF itself in combination with stack maps when other eBPF types are part of the callchain. See offwaketime example on dumping stack from a map. This work tries to tackle that issue by making the addresses and symbols known to the kernel. The lookup from *kernel_text_address() is implemented through a latched RB tree that can be read under RCU in fast-path that is also shared for symbol/size/offset lookup for a specific given address in kallsyms. The slow-path iteration through all symbols in the seq file done via RCU list, which holds a tiny fraction of all exported ksyms, usually below 0.1 percent. Function symbols are exported as bpf_prog_, in order to aide debugging and attribution. This facility is currently enabled for root-only when bpf_jit_kallsyms is set to 1, and disabled if hardening is active in any mode. The rationale behind this is that still a lot of systems ship with world read permissions on kallsyms thus addresses should not get suddenly exposed for them. If that situation gets much better in future, we always have the option to change the default on this. Likewise, unprivileged programs are not allowed to add entries there either, but that is less of a concern as most such programs types relevant in this context are for root-only anyway. If enabled, call graphs and stack traces will then show a correct attribution; one example is illustrated below, where the trace is now visible in tooling such as perf script --kallsyms=/proc/kallsyms and friends. 
Before: 7fff8166889d bpf_clone_redirect+0x80007f0020ed (/lib/modules/4.9.0-rc8+/build/vmlinux) f5d80 __sendmsg_nocancel+0xffff006451f1a007 (/usr/lib64/libc-2.18.so) After: 7fff816688b7 bpf_clone_redirect+0x80007f002107 (/lib/modules/4.9.0-rc8+/build/vmlinux) 7fffa0575728 bpf_prog_33c45a467c9e061a+0x8000600020fb (/lib/modules/4.9.0-rc8+/build/vmlinux) 7fffa07ef1fc cls_bpf_classify+0x8000600020dc (/lib/modules/4.9.0-rc8+/build/vmlinux) 7fff81678b68 tc_classify+0x80007f002078 (/lib/modules/4.9.0-rc8+/build/vmlinux) 7fff8164d40b __netif_receive_skb_core+0x80007f0025fb (/lib/modules/4.9.0-rc8+/build/vmlinux) 7fff8164d718 __netif_receive_skb+0x80007f002018 (/lib/modules/4.9.0-rc8+/build/vmlinux) 7fff8164e565 process_backlog+0x80007f002095 (/lib/modules/4.9.0-rc8+/build/vmlinux) 7fff8164dc71 net_rx_action+0x80007f002231 (/lib/modules/4.9.0-rc8+/build/vmlinux) 7fff81767461 __softirqentry_text_start+0x80007f0020d1 (/lib/modules/4.9.0-rc8+/build/vmlinux) 7fff817658ac do_softirq_own_stack+0x80007f00201c (/lib/modules/4.9.0-rc8+/build/vmlinux) 7fff810a2c20 do_softirq+0x80007f002050 (/lib/modules/4.9.0-rc8+/build/vmlinux) 7fff810a2cb5 __local_bh_enable_ip+0x80007f002085 (/lib/modules/4.9.0-rc8+/build/vmlinux) 7fff8168d452 ip_finish_output2+0x80007f002152 (/lib/modules/4.9.0-rc8+/build/vmlinux) 7fff8168ea3d ip_finish_output+0x80007f00217d (/lib/modules/4.9.0-rc8+/build/vmlinux) 7fff8168f2af ip_output+0x80007f00203f (/lib/modules/4.9.0-rc8+/build/vmlinux) [...] 7fff81005854 do_syscall_64+0x80007f002054 (/lib/modules/4.9.0-rc8+/build/vmlinux) 7fff817649eb return_from_SYSCALL_64+0x80007f002000 (/lib/modules/4.9.0-rc8+/build/vmlinux) f5d80 __sendmsg_nocancel+0xffff01c484812007 (/usr/lib64/libc-2.18.so) Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Cc: linux-kernel@vger.kernel.org Signed-off-by: David S. Miller --- arch/x86/net/bpf_jit_comp.c | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 26123d0ae13a..18a62e208826 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1180,18 +1180,3 @@ out: tmp : orig_prog); return prog; } - -void bpf_jit_free(struct bpf_prog *fp) -{ - unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK; - struct bpf_binary_header *header = (void *)addr; - - if (!fp->jited) - goto free_filter; - - set_memory_rw(addr, header->pages); - bpf_jit_binary_free(header); - -free_filter: - bpf_prog_unlock_free(fp); -} -- cgit From 06ce521af9558814b8606c0476c54497cf83a653 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 24 Jan 2017 11:56:21 +0100 Subject: kvm: fix page struct leak in handle_vmon handle_vmon gets a reference on VMXON region page, but does not release it. Release the reference. Found by syzkaller; based on a patch by Dmitry. 
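The invariant the fix below restores is simple: once nested_get_page() has returned a page, every exit path must unmap and drop it, including the early bail-out when the VMCS revision does not match. A condensed sketch of that shape (bail() is a stand-in for the nested_vmx_failInvalid() plus kvm_skip_emulated_instruction() sequence, not a real function):

	page = nested_get_page(vcpu, vmptr);
	if (page == NULL)
		return bail();			/* no reference was taken, nothing to drop */

	ok = (*(u32 *)kmap(page) == VMCS12_REVISION);
	kunmap(page);
	nested_release_page_clean(page);	/* always drop the reference... */
	if (!ok)
		return bail();			/* ...including on the mismatch path that used to leak */
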
Reported-by: Dmitry Vyukov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 9856b73a21ad..d13073c841ff 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6996,13 +6996,18 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, } page = nested_get_page(vcpu, vmptr); - if (page == NULL || - *(u32 *)kmap(page) != VMCS12_REVISION) { + if (page == NULL) { nested_vmx_failInvalid(vcpu); + return kvm_skip_emulated_instruction(vcpu); + } + if (*(u32 *)kmap(page) != VMCS12_REVISION) { kunmap(page); + nested_release_page_clean(page); + nested_vmx_failInvalid(vcpu); return kvm_skip_emulated_instruction(vcpu); } kunmap(page); + nested_release_page_clean(page); vmx->nested.vmxon_ptr = vmptr; break; case EXIT_REASON_VMCLEAR: -- cgit From 4f53ab14285802b298261f8b52af322039d1dfd0 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 20 Feb 2017 08:56:09 -0800 Subject: x86/asm: Define the kernel TSS limit in a macro Rather than open-coding the kernel TSS limit in set_tss_desc(), make it a real macro near the TSS layout definition. This is purely a cleanup. Cc: Thomas Garnier Cc: Jim Mattson Signed-off-by: Andy Lutomirski Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/desc.h | 10 +--------- arch/x86/include/asm/processor.h | 10 ++++++++++ 2 files changed, 11 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 12080d87da3b..2e781bcc5e12 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -177,16 +177,8 @@ static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr) struct desc_struct *d = get_cpu_gdt_table(cpu); tss_desc tss; - /* - * sizeof(unsigned long) coming from an extra "long" at the end - * of the iobitmap. See tss_struct definition in processor.h - * - * -1? seg base+limit should be pointing to the address of the - * last valid byte - */ set_tssldt_descriptor(&tss, (unsigned long)addr, DESC_TSS, - IO_BITMAP_OFFSET + IO_BITMAP_BYTES + - sizeof(unsigned long) - 1); + __KERNEL_TSS_LIMIT); write_gdt_entry(d, entry, &tss, DESC_TSS); } diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 1be64da0384e..f8f1b7537abe 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -341,6 +341,16 @@ struct tss_struct { DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss); +/* + * sizeof(unsigned long) coming from an extra "long" at the end + * of the iobitmap. + * + * -1? seg base+limit should be pointing to the address of the + * last valid byte + */ +#define __KERNEL_TSS_LIMIT \ + (IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1) + #ifdef CONFIG_X86_32 DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); #endif -- cgit From e0c230634af99967da79a6ed1faecc720fb623ca Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 20 Feb 2017 08:56:10 -0800 Subject: x86/kvm/vmx: Don't fetch the TSS base from the GDT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current CPU's TSS base is a foregone conclusion, so there's no need to parse it out of the segment tables. This should save a couple cycles (as STR is surely microcoded and poorly optimized) but, more importantly, it's a cleanup and it means that segment_base() will never be called on 64-bit kernels. 
Cc: Thomas Garnier Cc: Jim Mattson Cc: Radim Krčmář Cc: Paolo Bonzini Signed-off-by: Andy Lutomirski Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d13073c841ff..3dbbf4ec471f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2086,13 +2086,6 @@ static unsigned long segment_base(u16 selector) return v; } -static inline unsigned long kvm_read_tr_base(void) -{ - u16 tr; - asm("str %0" : "=g"(tr)); - return segment_base(tr); -} - static void vmx_save_host_state(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -2292,10 +2285,11 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) /* * Linux uses per-cpu TSS and GDT, so set these when switching - * processors. + * processors. See 22.2.4. */ - vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */ - vmcs_writel(HOST_GDTR_BASE, gdt->address); /* 22.2.4 */ + vmcs_writel(HOST_TR_BASE, + (unsigned long)this_cpu_ptr(&cpu_tss)); + vmcs_writel(HOST_GDTR_BASE, gdt->address); rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ -- cgit From e28baeadcf0d657c6b6e849ae1b4faccb4faf326 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 20 Feb 2017 08:56:11 -0800 Subject: x86/kvm/vmx: Get rid of segment_base() on 64-bit kernels MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It was a bit buggy (it didn't list all segment types that needed 64-bit fixups), but the bug was irrelevant because it wasn't called in any interesting context on 64-bit kernels and was only used for data segents on 32-bit kernels. To avoid confusion, make it explicitly 32-bit only. Cc: Thomas Garnier Cc: Jim Mattson Cc: Radim Krčmář Cc: Paolo Bonzini Signed-off-by: Andy Lutomirski Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 3dbbf4ec471f..3ddd72303fe4 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2057,6 +2057,12 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) } } +#ifdef CONFIG_X86_32 +/* + * On 32-bit kernels, VM exits still load the FS and GS bases from the + * VMCS rather than the segment table. KVM uses this helper to figure + * out the current bases to poke them into the VMCS before entry. + */ static unsigned long segment_base(u16 selector) { struct desc_ptr *gdt = this_cpu_ptr(&host_gdt); @@ -2079,12 +2085,9 @@ static unsigned long segment_base(u16 selector) } d = (struct desc_struct *)(table_base + (selector & ~7)); v = get_desc_base(d); -#ifdef CONFIG_X86_64 - if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11)) - v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32; -#endif return v; } +#endif static void vmx_save_host_state(struct kvm_vcpu *vcpu) { -- cgit From 8c2e41f7ae1234c192ef497472ad306227c77c03 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 20 Feb 2017 08:56:12 -0800 Subject: x86/kvm/vmx: Simplify segment_base() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use actual pointer types for pointers (instead of unsigned long) and replace hardcoded constants with the appropriate self-documenting macros. The function is still a bit messy, but this seems a lot better than before to me. 
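As a reminder of why those macros and the `selector >> 3` indexing are the natural self-documenting spelling: a 16-bit selector packs three fields. A small standalone sketch of the decoding (the struct and function names here are invented for illustration):

	#include <stdint.h>
	#include <stdio.h>

	/* x86 segment selector layout: | index (13 bits) | TI (1 bit) | RPL (2 bits) | */
	struct selector {
		unsigned rpl   : 2;	/* requested privilege level              */
		unsigned ti    : 1;	/* table indicator: 0 = GDT, 1 = LDT      */
		unsigned index : 13;	/* descriptor slot within that table      */
	};

	static struct selector decode(uint16_t sel)
	{
		struct selector s = {
			.rpl   = sel & 0x3,		/* SEGMENT_RPL_MASK        */
			.ti    = (sel >> 2) & 1,	/* SEGMENT_TI_MASK         */
			.index = sel >> 3,		/* table[selector >> 3]    */
		};
		return s;
	}

	int main(void)
	{
		struct selector s = decode(0x2b);	/* index=5, TI=0 (GDT), RPL=3 */
		printf("index=%u ti=%u rpl=%u\n", s.index, s.ti, s.rpl);
		return 0;
	}
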
This is mostly borrowed from a patch by Thomas Garnier. Cc: Thomas Garnier Cc: Jim Mattson Cc: Radim Krčmář Cc: Paolo Bonzini Signed-off-by: Andy Lutomirski Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 3ddd72303fe4..2dd94cf597cc 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2067,24 +2067,23 @@ static unsigned long segment_base(u16 selector) { struct desc_ptr *gdt = this_cpu_ptr(&host_gdt); struct desc_struct *d; - unsigned long table_base; + struct desc_struct *table; unsigned long v; - if (!(selector & ~3)) + if (!(selector & ~SEGMENT_RPL_MASK)) return 0; - table_base = gdt->address; + table = (struct desc_struct *)gdt->address; - if (selector & 4) { /* from ldt */ + if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) { u16 ldt_selector = kvm_read_ldt(); - if (!(ldt_selector & ~3)) + if (!(ldt_selector & ~SEGMENT_RPL_MASK)) return 0; - table_base = segment_base(ldt_selector); + table = (struct desc_struct *)segment_base(ldt_selector); } - d = (struct desc_struct *)(table_base + (selector & ~7)); - v = get_desc_base(d); + v = get_desc_base(&table[selector >> 3]); return v; } #endif -- cgit From d3273deac9c0cdae32eb46f928487433eaa37f87 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 20 Feb 2017 08:56:13 -0800 Subject: x86/asm/64: Drop __cacheline_aligned from struct x86_hw_tss Historically, the entire TSS + io bitmap structure was cacheline aligned, but commit ca241c75037b ("x86: unify tss_struct") changed it (presumably inadvertently) so that the fixed-layout hardware part is cacheline-aligned and the io bitmap is after the padding. This wastes 24 bytes (the hardware part should be 104 bytes, but this pads it to 128 bytes) and, serves no purpose, and causes sizeof(struct x86_hw_tss) to have a confusing value. Drop the pointless alignment. Signed-off-by: Andy Lutomirski Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/processor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index f8f1b7537abe..1879cdf2b6ae 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -303,7 +303,7 @@ struct x86_hw_tss { u16 reserved5; u16 io_bitmap_base; -} __attribute__((packed)) ____cacheline_aligned; +} __attribute__((packed)); #endif /* -- cgit From b7ffc44d5b2ea163899d09289ca7743d5c32e926 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 20 Feb 2017 08:56:14 -0800 Subject: x86/kvm/vmx: Defer TR reload after VM exit Intel's VMX is daft and resets the hidden TSS limit register to 0x67 on VMX reload, and the 0x67 is not configurable. KVM currently reloads TR using the LTR instruction on every exit, but this is quite slow because LTR is serializing. The 0x67 limit is entirely harmless unless ioperm() is in use, so defer the reload until a task using ioperm() is actually running. Here's some poorly done benchmarking using kvm-unit-tests: Before: cpuid 1313 vmcall 1195 mov_from_cr8 11 mov_to_cr8 17 inl_from_pmtimer 6770 inl_from_qemu 6856 inl_from_kernel 2435 outl_to_kernel 1402 After: cpuid 1291 vmcall 1181 mov_from_cr8 11 mov_to_cr8 16 inl_from_pmtimer 6457 inl_from_qemu 6209 inl_from_kernel 2339 outl_to_kernel 1391 Signed-off-by: Andy Lutomirski [Force-reload TR in invalidate_tss_limit. 
- Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/desc.h | 48 +++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/ioport.c | 5 +++++ arch/x86/kernel/process.c | 10 ++++++++++ arch/x86/kvm/vmx.c | 23 +++++++++------------- 4 files changed, 72 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 2e781bcc5e12..cb8f9149f6c8 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -205,6 +205,54 @@ static inline void native_load_tr_desc(void) asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8)); } +static inline void force_reload_TR(void) +{ + struct desc_struct *d = get_cpu_gdt_table(smp_processor_id()); + tss_desc tss; + + memcpy(&tss, &d[GDT_ENTRY_TSS], sizeof(tss_desc)); + + /* + * LTR requires an available TSS, and the TSS is currently + * busy. Make it be available so that LTR will work. + */ + tss.type = DESC_TSS; + write_gdt_entry(d, GDT_ENTRY_TSS, &tss, DESC_TSS); + + load_TR_desc(); +} + +DECLARE_PER_CPU(bool, need_tr_refresh); + +static inline void refresh_TR(void) +{ + DEBUG_LOCKS_WARN_ON(preemptible()); + + if (unlikely(this_cpu_read(need_tr_refresh))) { + force_reload_TR(); + this_cpu_write(need_tr_refresh, false); + } +} + +/* + * If you do something evil that corrupts the cached TSS limit (I'm looking + * at you, VMX exits), call this function. + * + * The optimization here is that the TSS limit only matters for Linux if the + * IO bitmap is in use. If the TSS limit gets forced to its minimum value, + * everything works except that IO bitmap will be ignored and all CPL 3 IO + * instructions will #GP, which is exactly what we want for normal tasks. + */ +static inline void invalidate_tss_limit(void) +{ + DEBUG_LOCKS_WARN_ON(preemptible()); + + if (unlikely(test_thread_flag(TIF_IO_BITMAP))) + force_reload_TR(); + else + this_cpu_write(need_tr_refresh, true); +} + static inline void native_load_gdt(const struct desc_ptr *dtr) { asm volatile("lgdt %0"::"m" (*dtr)); diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 589b3193f102..b01bc8517450 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -16,6 +16,7 @@ #include #include #include +#include /* * this changes the io permissions bitmap in the current task. @@ -45,6 +46,10 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) memset(bitmap, 0xff, IO_BITMAP_BYTES); t->io_bitmap_ptr = bitmap; set_thread_flag(TIF_IO_BITMAP); + + preempt_disable(); + refresh_TR(); + preempt_enable(); } /* diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index b615a1113f58..7780efa635b9 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -32,6 +32,7 @@ #include #include #include +#include /* * per-CPU TSS segments. Threads are completely 'soft' on Linux, @@ -64,6 +65,9 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { }; EXPORT_PER_CPU_SYMBOL(cpu_tss); +DEFINE_PER_CPU(bool, need_tr_refresh); +EXPORT_PER_CPU_SYMBOL_GPL(need_tr_refresh); + /* * this gets called so that we can store lazy state into memory and copy the * current task into the new thread. @@ -209,6 +213,12 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, */ memcpy(tss->io_bitmap, next->io_bitmap_ptr, max(prev->io_bitmap_max, next->io_bitmap_max)); + + /* + * Make sure that the TSS limit is correct for the CPU + * to notice the IO bitmap. 
+ */ + refresh_TR(); } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) { /* * Clear any possible leftover bits: diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2dd94cf597cc..acf6013a0caf 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1990,19 +1990,6 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, m->host[i].value = host_val; } -static void reload_tss(void) -{ - /* - * VT restores TR but not its size. Useless. - */ - struct desc_ptr *gdt = this_cpu_ptr(&host_gdt); - struct desc_struct *descs; - - descs = (void *)gdt->address; - descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ - load_TR_desc(); -} - static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) { u64 guest_efer = vmx->vcpu.arch.efer; @@ -2172,7 +2159,7 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) loadsegment(es, vmx->host_state.es_sel); } #endif - reload_tss(); + invalidate_tss_limit(); #ifdef CONFIG_X86_64 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); #endif @@ -2293,6 +2280,14 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) (unsigned long)this_cpu_ptr(&cpu_tss)); vmcs_writel(HOST_GDTR_BASE, gdt->address); + /* + * VM exits change the host TR limit to 0x67 after a VM + * exit. This is okay, since 0x67 covers everything except + * the IO bitmap and have have code to handle the IO bitmap + * being lost after a VM exit. + */ + BUILD_BUG_ON(IO_BITMAP_OFFSET - 1 != 0x67); + rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ -- cgit From 96794e4ed4d758272c486e1529e431efb7045265 Mon Sep 17 00:00:00 2001 From: Chao Peng Date: Tue, 21 Feb 2017 03:50:01 -0500 Subject: KVM: VMX: use correct vmcs_read/write for guest segment selector/base Guest segment selector is 16 bit field and guest segment base is natural width field. Fix two incorrect invocations accordingly. Without this patch, build fails when aggressive inlining is used with ICC. Cc: stable@vger.kernel.org Signed-off-by: Chao Peng Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index acf6013a0caf..ef4ba71dbb66 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3905,7 +3905,7 @@ static void fix_rmode_seg(int seg, struct kvm_segment *save) } vmcs_write16(sf->selector, var.selector); - vmcs_write32(sf->base, var.base); + vmcs_writel(sf->base, var.base); vmcs_write32(sf->limit, var.limit); vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var)); } @@ -8270,7 +8270,7 @@ static void kvm_flush_pml_buffers(struct kvm *kvm) static void vmx_dump_sel(char *name, uint32_t sel) { pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n", - name, vmcs_read32(sel), + name, vmcs_read16(sel), vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR), vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR), vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR)); -- cgit From 6c62985d576c8a816f528c39204207b9f449d923 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 20 Feb 2017 13:36:03 -0500 Subject: x86/paravirt: Change vcp_is_preempted() arg type to long The cpu argument in the function prototype of vcpu_is_preempted() is changed from int to long. That makes it easier to provide a better optimized assembly version of that function. For Xen, vcpu_is_preempted(long) calls xen_vcpu_stolen(int), the downcast from long to int is not a problem as vCPU number won't exceed 32 bits. 
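Context for why this calling convention is worth micro-optimizing: vcpu_is_preempted() sits on the lock-spinning fast paths, as the osq_lock/rwsem profiles in the next commit show. A hedged sketch of the typical caller shape (owner is assumed to be a struct task_struct pointer; this is not a literal copy of the kernel's spin loops):

	/* Keep spinning on the lock owner only while its vCPU is actually running. */
	while (owner && !need_resched()) {
		if (vcpu_is_preempted(task_cpu(owner)))
			break;		/* host preempted the owner's vCPU: spinning is wasted work */
		cpu_relax();
	}
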
Signed-off-by: Waiman Long Acked-by: Peter Zijlstra (Intel) Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/paravirt.h | 2 +- arch/x86/include/asm/qspinlock.h | 2 +- arch/x86/kernel/kvm.c | 2 +- arch/x86/kernel/paravirt-spinlocks.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 1eea6ca40694..f75fbfe550f2 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -673,7 +673,7 @@ static __always_inline void pv_kick(int cpu) PVOP_VCALL1(pv_lock_ops.kick, cpu); } -static __always_inline bool pv_vcpu_is_preempted(int cpu) +static __always_inline bool pv_vcpu_is_preempted(long cpu) { return PVOP_CALLEE1(bool, pv_lock_ops.vcpu_is_preempted, cpu); } diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h index c343ab52579f..48a706f641f2 100644 --- a/arch/x86/include/asm/qspinlock.h +++ b/arch/x86/include/asm/qspinlock.h @@ -34,7 +34,7 @@ static inline void queued_spin_unlock(struct qspinlock *lock) } #define vcpu_is_preempted vcpu_is_preempted -static inline bool vcpu_is_preempted(int cpu) +static inline bool vcpu_is_preempted(long cpu) { return pv_vcpu_is_preempted(cpu); } diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 36bc66416021..334173d2665a 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -589,7 +589,7 @@ out: local_irq_restore(flags); } -__visible bool __kvm_vcpu_is_preempted(int cpu) +__visible bool __kvm_vcpu_is_preempted(long cpu) { struct kvm_steal_time *src = &per_cpu(steal_time, cpu); diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c index 6d4bf812af45..8caa8a18472b 100644 --- a/arch/x86/kernel/paravirt-spinlocks.c +++ b/arch/x86/kernel/paravirt-spinlocks.c @@ -20,7 +20,7 @@ bool pv_is_native_spin_unlock(void) __raw_callee_save___native_queued_spin_unlock; } -__visible bool __native_vcpu_is_preempted(int cpu) +__visible bool __native_vcpu_is_preempted(long cpu) { return false; } -- cgit From dd0fd8bca1850ddadf5d33a9ed28f3707cd98ac7 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 20 Feb 2017 13:36:04 -0500 Subject: x86/kvm: Provide optimized version of vcpu_is_preempted() for x86-64 It was found when running fio sequential write test with a XFS ramdisk on a KVM guest running on a 2-socket x86-64 system, the %CPU times as reported by perf were as follows: 69.75% 0.59% fio [k] down_write 69.15% 0.01% fio [k] call_rwsem_down_write_failed 67.12% 1.12% fio [k] rwsem_down_write_failed 63.48% 52.77% fio [k] osq_lock 9.46% 7.88% fio [k] __raw_callee_save___kvm_vcpu_is_preempt 3.93% 3.93% fio [k] __kvm_vcpu_is_preempted Making vcpu_is_preempted() a callee-save function has a relatively high cost on x86-64 primarily due to at least one more cacheline of data access from the saving and restoring of registers (8 of them) to and from stack as well as one more level of function call. To reduce this performance overhead, an optimized assembly version of the the __raw_callee_save___kvm_vcpu_is_preempt() function is provided for x86-64. 
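Before the numbers, it may help to see what the hand-written thunk computes: it is just the C body that remains in use for 32-bit, with each instruction annotated below (a sketch of the correspondence, not code taken from the patch):

	__visible bool __kvm_vcpu_is_preempted(long cpu)
	{
		struct kvm_steal_time *src = &per_cpu(steal_time, cpu);
				/* movq __per_cpu_offset(,%rdi,8), %rax                   */
		return !!src->preempted;
				/* cmpb $0, KVM_STEAL_TIME_preempted+steal_time(%rax);
				 * setne %al; ret                                         */
	}

The win is purely that the open-coded thunk clobbers nothing it would otherwise have to save, so the eight register spills implied by the generic callee-save wrapper disappear.
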
With this patch applied on a KVM guest on a 2-socket 16-core 32-thread system with 16 parallel jobs (8 on each socket), the aggregrate bandwidth of the fio test on an XFS ramdisk were as follows: I/O Type w/o patch with patch -------- --------- ---------- random read 8141.2 MB/s 8497.1 MB/s seq read 8229.4 MB/s 8304.2 MB/s random write 1675.5 MB/s 1701.5 MB/s seq write 1681.3 MB/s 1699.9 MB/s There are some increases in the aggregated bandwidth because of the patch. The perf data now became: 70.78% 0.58% fio [k] down_write 70.20% 0.01% fio [k] call_rwsem_down_write_failed 69.70% 1.17% fio [k] rwsem_down_write_failed 59.91% 55.42% fio [k] osq_lock 10.14% 10.14% fio [k] __kvm_vcpu_is_preempted The assembly code was verified by using a test kernel module to compare the output of C __kvm_vcpu_is_preempted() and that of assembly __raw_callee_save___kvm_vcpu_is_preempt() to verify that they matched. Suggested-by: Peter Zijlstra Signed-off-by: Waiman Long Acked-by: Peter Zijlstra (Intel) Signed-off-by: Paolo Bonzini --- arch/x86/kernel/asm-offsets_64.c | 9 +++++++++ arch/x86/kernel/kvm.c | 24 ++++++++++++++++++++++++ 2 files changed, 33 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 210927ee2e74..99332f550c48 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -13,6 +13,10 @@ static char syscalls_ia32[] = { #include }; +#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_PARAVIRT_SPINLOCKS) +#include +#endif + int main(void) { #ifdef CONFIG_PARAVIRT @@ -22,6 +26,11 @@ int main(void) BLANK(); #endif +#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_PARAVIRT_SPINLOCKS) + OFFSET(KVM_STEAL_TIME_preempted, kvm_steal_time, preempted); + BLANK(); +#endif + #define ENTRY(entry) OFFSET(pt_regs_ ## entry, pt_regs, entry) ENTRY(bx); ENTRY(cx); diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 334173d2665a..d05797be2f64 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -589,6 +589,7 @@ out: local_irq_restore(flags); } +#ifdef CONFIG_X86_32 __visible bool __kvm_vcpu_is_preempted(long cpu) { struct kvm_steal_time *src = &per_cpu(steal_time, cpu); @@ -597,6 +598,29 @@ __visible bool __kvm_vcpu_is_preempted(long cpu) } PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted); +#else + +#include + +extern bool __raw_callee_save___kvm_vcpu_is_preempted(long); + +/* + * Hand-optimize version for x86-64 to avoid 8 64-bit register saving and + * restoring to/from the stack. + */ +asm( +".pushsection .text;" +".global __raw_callee_save___kvm_vcpu_is_preempted;" +".type __raw_callee_save___kvm_vcpu_is_preempted, @function;" +"__raw_callee_save___kvm_vcpu_is_preempted:" +"movq __per_cpu_offset(,%rdi,8), %rax;" +"cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);" +"setne %al;" +"ret;" +".popsection"); + +#endif + /* * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present. */ -- cgit From d2852a2240509e512712e25de2d0796cda435ecb Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 21 Feb 2017 16:09:33 +0100 Subject: arch: add ARCH_HAS_SET_MEMORY config Currently, there's no good way to test for the presence of set_memory_ro/rw/x/nx() helpers implemented by archs such as x86, arm, arm64 and s390. There's DEBUG_SET_MODULE_RONX and DEBUG_RODATA, however both don't really reflect that: set_memory_*() are also available even when DEBUG_SET_MODULE_RONX is turned off, and DEBUG_RODATA is set by parisc, but doesn't implement above functions. 
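The shape of the test generic code wants to write is roughly the following (sketch only; `hdr` and its `pages` count stand in for whichever memory region a caller wants to protect):

	#ifdef CONFIG_ARCH_HAS_SET_MEMORY
		set_memory_ro((unsigned long)hdr, hdr->pages);	/* arch really implements this */
	#else
		/* no set_memory_*() on this architecture: leave the mapping as-is */
	#endif
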
Thus, add ARCH_HAS_SET_MEMORY that is selected by mentioned archs, where generic code can test against this. This also allows later on to move DEBUG_SET_MODULE_RONX out of the arch specific Kconfig to define it only once depending on ARCH_HAS_SET_MEMORY. Suggested-by: Laura Abbott Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e487493bbd47..434dd2a1c5f2 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -53,6 +53,7 @@ config X86 select ARCH_HAS_KCOV if X86_64 select ARCH_HAS_MMIO_FLUSH select ARCH_HAS_PMEM_API if X86_64 + select ARCH_HAS_SET_MEMORY select ARCH_HAS_SG_CHAIN select ARCH_HAS_UBSAN_SANITIZE_ALL select ARCH_HAVE_NMI_SAFE_CMPXCHG -- cgit From 9d876e79df6a2f364b9f2737eacd72ceb27da53a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 21 Feb 2017 16:09:34 +0100 Subject: bpf: fix unlocking of jited image when module ronx not set Eric and Willem reported that they recently saw random crashes when JIT was in use and bisected this to 74451e66d516 ("bpf: make jited programs visible in traces"). Issue was that the consolidation part added bpf_jit_binary_unlock_ro() that would unlock previously made read-only memory back to read-write. However, DEBUG_SET_MODULE_RONX cannot be used for this to test for presence of set_memory_*() functions. We need to use ARCH_HAS_SET_MEMORY instead to fix this; also add the corresponding bpf_jit_binary_lock_ro() to filter.h. Fixes: 74451e66d516 ("bpf: make jited programs visible in traces") Reported-by: Eric Dumazet Reported-by: Willem de Bruijn Bisected-by: Eric Dumazet Signed-off-by: Daniel Borkmann Tested-by: Willem de Bruijn Signed-off-by: David S. Miller --- arch/x86/net/bpf_jit_comp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 18a62e208826..32322ce9b405 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1165,7 +1165,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) if (image) { bpf_flush_icache(header, image + proglen); - set_memory_ro((unsigned long)header, header->pages); + bpf_jit_binary_lock_ro(header); prog->bpf_func = (void *)image; prog->jited = 1; } else { -- cgit From ddffe98d166f4a93d996d5aa628fd745311fc1e7 Mon Sep 17 00:00:00 2001 From: Yasuaki Ishimatsu Date: Wed, 22 Feb 2017 15:45:13 -0800 Subject: mm/memory_hotplug: set magic number to page->freelist instead of page->lru.next To identify that pages of page table are allocated from bootmem allocator, magic number sets to page->lru.next. But page->lru list is initialized in reserve_bootmem_region(). So when calling free_pagetable(), the function cannot find the magic number of pages. And free_pagetable() frees the pages by free_reserved_page() not put_page_bootmem(). But if the pages are allocated from bootmem allocator and used as page table, the pages have private flag. So before freeing the pages, we should clear the private flag by put_page_bootmem(). Before applying the commit 7bfec6f47bb0 ("mm, page_alloc: check multiple page fields with a single branch"), we could find the following visible issue: BUG: Bad page state in process kworker/u1024:1 page:ffffea103cfd8040 count:0 mapcount:0 mappi flags: 0x6fffff80000800(private) page dumped because: PAGE_FLAGS_CHECK_AT_FREE flag(s) set bad because of flags: 0x800(private) Call Trace: [...] dump_stack+0x63/0x87 [...] 
bad_page+0x114/0x130 [...] free_pages_prepare+0x299/0x2d0 [...] free_hot_cold_page+0x31/0x150 [...] __free_pages+0x25/0x30 [...] free_pagetable+0x6f/0xb4 [...] remove_pagetable+0x379/0x7ff [...] vmemmap_free+0x10/0x20 [...] sparse_remove_one_section+0x149/0x180 [...] __remove_pages+0x2e9/0x4f0 [...] arch_remove_memory+0x63/0xc0 [...] remove_memory+0x8c/0xc0 [...] acpi_memory_device_remove+0x79/0xa5 [...] acpi_bus_trim+0x5a/0x8d [...] acpi_bus_trim+0x38/0x8d [...] acpi_device_hotplug+0x1b7/0x418 [...] acpi_hotplug_work_fn+0x1e/0x29 [...] process_one_work+0x152/0x400 [...] worker_thread+0x125/0x4b0 [...] kthread+0xd8/0xf0 [...] ret_from_fork+0x22/0x40 And the issue still silently occurs. Until freeing the pages of page table allocated from bootmem allocator, the page->freelist is never used. So the patch sets magic number to page->freelist instead of page->lru.next. [isimatu.yasuaki@jp.fujitsu.com: fix merge issue] Link: http://lkml.kernel.org/r/722b1cc4-93ac-dd8b-2be2-7a7e313b3b0b@gmail.com Link: http://lkml.kernel.org/r/2c29bd9f-5b67-02d0-18a3-8828e78bbb6f@gmail.com Signed-off-by: Yasuaki Ishimatsu Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. Peter Anvin Cc: Dave Hansen Cc: Vlastimil Babka Cc: Mel Gorman Cc: Xishi Qiu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/init_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index af85b686a7b0..97346f987ef2 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -679,7 +679,7 @@ static void __meminit free_pagetable(struct page *page, int order) if (PageReserved(page)) { __ClearPageReserved(page); - magic = (unsigned long)page->lru.next; + magic = (unsigned long)page->freelist; if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) { while (nr_pages--) put_page_bootmem(page++); -- cgit From ecf1385d72f0491400a8ceca7001196ca369aa8c Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 22 Feb 2017 15:46:37 -0800 Subject: mm: drop unused argument of zap_page_range() There's no users of zap_page_range() who wants non-NULL 'details'. Let's drop it. Link: http://lkml.kernel.org/r/20170118122429.43661-3-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Acked-by: Michal Hocko Cc: Tetsuo Handa Cc: Peter Zijlstra Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/mpx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index af59f808742f..aad4ac386f98 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -796,7 +796,7 @@ static noinline int zap_bt_entries_mapping(struct mm_struct *mm, return -EINVAL; len = min(vma->vm_end, end) - addr; - zap_page_range(vma, addr, len, NULL); + zap_page_range(vma, addr, len); trace_mpx_unmap_zap(addr, addr+len); vma = vma->vm_next; -- cgit From d1091c7fa3d52ebce4dd3f15d04155b3469b2f90 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 21 Feb 2017 15:35:32 -0600 Subject: objtool: Improve detection of BUG() and other dead ends The BUG() macro's use of __builtin_unreachable() via the unreachable() macro tells gcc that the instruction is a dead end, and that it's safe to assume the current code path will not execute past the previous instruction. On x86, the BUG() macro is implemented with the 'ud2' instruction. When objtool's branch analysis sees that instruction, it knows the current code path has come to a dead end. 
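To make the assumption concrete, the construct being relied on looks roughly like this (a simplified sketch, not the kernel's actual BUG() definition, which also records file and line information):

	/* Trap, then tell the compiler nothing runs past this point. */
	#define MY_BUG()				\
		do {					\
			asm volatile("ud2");		\
			__builtin_unreachable();	\
		} while (0)

	int divide(int a, int b)
	{
		if (b == 0)
			MY_BUG();	/* no return needed below: gcc treats this path as ended */
		return a / b;
	}
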
Peter Zijlstra has been working on a patch to change the WARN macros to use 'ud2'. That patch will break objtool's assumption that 'ud2' is always a dead end. Generally it's best for objtool to avoid making those kinds of assumptions anyway. The more ignorant it is of kernel code internals, the better. So create a more generic way for objtool to detect dead ends by adding an annotation to the unreachable() macro. The annotation stores a pointer to the end of the unreachable code path in an '__unreachable' section. Objtool can read that section to find the dead ends. Tested-by: Peter Zijlstra (Intel) Signed-off-by: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/41a6d33971462ebd944a1c60ad4bf5be86c17b77.1487712920.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index e79f15f108a8..ad0118fbce90 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -346,6 +346,7 @@ SECTIONS /DISCARD/ : { *(.eh_frame) *(__func_stack_frame_non_standard) + *(__unreachable) } } -- cgit From a00cc7d9dd93d66a3fb83fc52aa57a4bec51c517 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 24 Feb 2017 14:57:02 -0800 Subject: mm, x86: add support for PUD-sized transparent hugepages The current transparent hugepage code only supports PMDs. This patch adds support for transparent use of PUDs with DAX. It does not include support for anonymous pages. x86 support code also added. Most of this patch simply parallels the work that was done for huge PMDs. The only major difference is how the new ->pud_entry method in mm_walk works. The ->pmd_entry method replaces the ->pte_entry method, whereas the ->pud_entry method works along with either ->pmd_entry or ->pte_entry. The pagewalk code takes care of locking the PUD before calling ->pud_walk, so handlers do not need to worry whether the PUD is stable. [dave.jiang@intel.com: fix SMP x86 32bit build for native_pud_clear()] Link: http://lkml.kernel.org/r/148719066814.31111.3239231168815337012.stgit@djiang5-desk3.ch.intel.com [dave.jiang@intel.com: native_pud_clear missing on i386 build] Link: http://lkml.kernel.org/r/148640375195.69754.3315433724330910314.stgit@djiang5-desk3.ch.intel.com Link: http://lkml.kernel.org/r/148545059381.17912.8602162635537598445.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Matthew Wilcox Signed-off-by: Dave Jiang Tested-by: Alexander Kapshuk Cc: Dave Hansen Cc: Vlastimil Babka Cc: Jan Kara Cc: Dan Williams Cc: Ross Zwisler Cc: Kirill A. Shutemov Cc: Nilesh Choudhury Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 1 + arch/x86/include/asm/paravirt.h | 11 +++ arch/x86/include/asm/paravirt_types.h | 2 + arch/x86/include/asm/pgtable-2level.h | 17 +++++ arch/x86/include/asm/pgtable-3level.h | 30 ++++++++ arch/x86/include/asm/pgtable.h | 140 ++++++++++++++++++++++++++++++++++ arch/x86/include/asm/pgtable_64.h | 15 ++++ arch/x86/kernel/paravirt.c | 1 + arch/x86/mm/pgtable.c | 31 ++++++++ 9 files changed, 248 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 874c1238dffd..33007aa74111 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -109,6 +109,7 @@ config X86 select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE + select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD if X86_64 select HAVE_ARCH_VMAP_STACK if X86_64 select HAVE_ARCH_WITHIN_STACK_FRAMES select HAVE_CC_STACKPROTECTOR diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index f75fbfe550f2..0489884fdc44 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -475,6 +475,17 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, native_pmd_val(pmd)); } +static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, pud_t pud) +{ + if (sizeof(pudval_t) > sizeof(long)) + /* 5 arg words */ + pv_mmu_ops.set_pud_at(mm, addr, pudp, pud); + else + PVOP_VCALL4(pv_mmu_ops.set_pud_at, mm, addr, pudp, + native_pud_val(pud)); +} + static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) { pmdval_t val = native_pmd_val(pmd); diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index bb2de45a60f2..b060f962d581 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -249,6 +249,8 @@ struct pv_mmu_ops { void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval); void (*set_pmd_at)(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmdval); + void (*set_pud_at)(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, pud_t pudval); void (*pte_update)(struct mm_struct *mm, unsigned long addr, pte_t *ptep); diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h index fd74a11959de..a8b96e708c2b 100644 --- a/arch/x86/include/asm/pgtable-2level.h +++ b/arch/x86/include/asm/pgtable-2level.h @@ -21,6 +21,10 @@ static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) *pmdp = pmd; } +static inline void native_set_pud(pud_t *pudp, pud_t pud) +{ +} + static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) { native_set_pte(ptep, pte); @@ -31,6 +35,10 @@ static inline void native_pmd_clear(pmd_t *pmdp) native_set_pmd(pmdp, __pmd(0)); } +static inline void native_pud_clear(pud_t *pudp) +{ +} + static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *xp) { @@ -55,6 +63,15 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp) #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) #endif +#ifdef CONFIG_SMP +static inline pud_t native_pudp_get_and_clear(pud_t *xp) +{ + return __pud(xchg((pudval_t *)xp, 0)); +} +#else +#define native_pudp_get_and_clear(xp) native_local_pudp_get_and_clear(xp) +#endif + /* Bit manipulation helper on pte/pgoff entry */ static inline unsigned long pte_bitop(unsigned long value, unsigned int rightshift, unsigned long mask, unsigned int leftshift) diff --git a/arch/x86/include/asm/pgtable-3level.h 
b/arch/x86/include/asm/pgtable-3level.h index cdaa58c9b39e..8f50fb3f04e1 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -121,6 +121,12 @@ static inline void native_pmd_clear(pmd_t *pmd) *(tmp + 1) = 0; } +#ifndef CONFIG_SMP +static inline void native_pud_clear(pud_t *pudp) +{ +} +#endif + static inline void pud_clear(pud_t *pudp) { set_pud(pudp, __pud(0)); @@ -176,6 +182,30 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp) #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) #endif +#ifdef CONFIG_SMP +union split_pud { + struct { + u32 pud_low; + u32 pud_high; + }; + pud_t pud; +}; + +static inline pud_t native_pudp_get_and_clear(pud_t *pudp) +{ + union split_pud res, *orig = (union split_pud *)pudp; + + /* xchg acts as a barrier before setting of the high bits */ + res.pud_low = xchg(&orig->pud_low, 0); + res.pud_high = orig->pud_high; + orig->pud_high = 0; + + return res.pud; +} +#else +#define native_pudp_get_and_clear(xp) native_local_pudp_get_and_clear(xp) +#endif + /* Encode and de-code a swap entry */ #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5) #define __swp_type(x) (((x).val) & 0x1f) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 437feb436efa..1cfb36b8c024 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -46,6 +46,7 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page); #define set_pte(ptep, pte) native_set_pte(ptep, pte) #define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte) #define set_pmd_at(mm, addr, pmdp, pmd) native_set_pmd_at(mm, addr, pmdp, pmd) +#define set_pud_at(mm, addr, pudp, pud) native_set_pud_at(mm, addr, pudp, pud) #define set_pte_atomic(ptep, pte) \ native_set_pte_atomic(ptep, pte) @@ -128,6 +129,16 @@ static inline int pmd_young(pmd_t pmd) return pmd_flags(pmd) & _PAGE_ACCESSED; } +static inline int pud_dirty(pud_t pud) +{ + return pud_flags(pud) & _PAGE_DIRTY; +} + +static inline int pud_young(pud_t pud) +{ + return pud_flags(pud) & _PAGE_ACCESSED; +} + static inline int pte_write(pte_t pte) { return pte_flags(pte) & _PAGE_RW; @@ -181,6 +192,13 @@ static inline int pmd_trans_huge(pmd_t pmd) return (pmd_val(pmd) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE; } +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +static inline int pud_trans_huge(pud_t pud) +{ + return (pud_val(pud) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE; +} +#endif + #define has_transparent_hugepage has_transparent_hugepage static inline int has_transparent_hugepage(void) { @@ -192,6 +210,18 @@ static inline int pmd_devmap(pmd_t pmd) { return !!(pmd_val(pmd) & _PAGE_DEVMAP); } + +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +static inline int pud_devmap(pud_t pud) +{ + return !!(pud_val(pud) & _PAGE_DEVMAP); +} +#else +static inline int pud_devmap(pud_t pud) +{ + return 0; +} +#endif #endif #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -333,6 +363,65 @@ static inline pmd_t pmd_mknotpresent(pmd_t pmd) return pmd_clear_flags(pmd, _PAGE_PRESENT | _PAGE_PROTNONE); } +static inline pud_t pud_set_flags(pud_t pud, pudval_t set) +{ + pudval_t v = native_pud_val(pud); + + return __pud(v | set); +} + +static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear) +{ + pudval_t v = native_pud_val(pud); + + return __pud(v & ~clear); +} + +static inline pud_t pud_mkold(pud_t pud) +{ + return pud_clear_flags(pud, _PAGE_ACCESSED); +} + +static inline pud_t pud_mkclean(pud_t pud) +{ + return 
pud_clear_flags(pud, _PAGE_DIRTY); +} + +static inline pud_t pud_wrprotect(pud_t pud) +{ + return pud_clear_flags(pud, _PAGE_RW); +} + +static inline pud_t pud_mkdirty(pud_t pud) +{ + return pud_set_flags(pud, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); +} + +static inline pud_t pud_mkdevmap(pud_t pud) +{ + return pud_set_flags(pud, _PAGE_DEVMAP); +} + +static inline pud_t pud_mkhuge(pud_t pud) +{ + return pud_set_flags(pud, _PAGE_PSE); +} + +static inline pud_t pud_mkyoung(pud_t pud) +{ + return pud_set_flags(pud, _PAGE_ACCESSED); +} + +static inline pud_t pud_mkwrite(pud_t pud) +{ + return pud_set_flags(pud, _PAGE_RW); +} + +static inline pud_t pud_mknotpresent(pud_t pud) +{ + return pud_clear_flags(pud, _PAGE_PRESENT | _PAGE_PROTNONE); +} + #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY static inline int pte_soft_dirty(pte_t pte) { @@ -344,6 +433,11 @@ static inline int pmd_soft_dirty(pmd_t pmd) return pmd_flags(pmd) & _PAGE_SOFT_DIRTY; } +static inline int pud_soft_dirty(pud_t pud) +{ + return pud_flags(pud) & _PAGE_SOFT_DIRTY; +} + static inline pte_t pte_mksoft_dirty(pte_t pte) { return pte_set_flags(pte, _PAGE_SOFT_DIRTY); @@ -354,6 +448,11 @@ static inline pmd_t pmd_mksoft_dirty(pmd_t pmd) return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY); } +static inline pud_t pud_mksoft_dirty(pud_t pud) +{ + return pud_set_flags(pud, _PAGE_SOFT_DIRTY); +} + static inline pte_t pte_clear_soft_dirty(pte_t pte) { return pte_clear_flags(pte, _PAGE_SOFT_DIRTY); @@ -364,6 +463,11 @@ static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd) return pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY); } +static inline pud_t pud_clear_soft_dirty(pud_t pud) +{ + return pud_clear_flags(pud, _PAGE_SOFT_DIRTY); +} + #endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ /* @@ -392,6 +496,12 @@ static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) massage_pgprot(pgprot)); } +static inline pud_t pfn_pud(unsigned long page_nr, pgprot_t pgprot) +{ + return __pud(((phys_addr_t)page_nr << PAGE_SHIFT) | + massage_pgprot(pgprot)); +} + static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { pteval_t val = pte_val(pte); @@ -771,6 +881,14 @@ static inline pmd_t native_local_pmdp_get_and_clear(pmd_t *pmdp) return res; } +static inline pud_t native_local_pudp_get_and_clear(pud_t *pudp) +{ + pud_t res = *pudp; + + native_pud_clear(pudp); + return res; +} + static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep , pte_t pte) { @@ -783,6 +901,12 @@ static inline void native_set_pmd_at(struct mm_struct *mm, unsigned long addr, native_set_pmd(pmdp, pmd); } +static inline void native_set_pud_at(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, pud_t pud) +{ + native_set_pud(pudp, pud); +} + #ifndef CONFIG_PARAVIRT /* * Rules for using pte_update - it must be called after any PTE update which @@ -861,10 +985,15 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, extern int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty); +extern int pudp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pud_t *pudp, + pud_t entry, int dirty); #define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG extern int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp); +extern int pudp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pud_t *pudp); #define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH extern int pmdp_clear_flush_young(struct vm_area_struct *vma, @@ -884,6 +1013,13 @@ static inline pmd_t 
pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long return native_pmdp_get_and_clear(pmdp); } +#define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR +static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, + unsigned long addr, pud_t *pudp) +{ + return native_pudp_get_and_clear(pudp); +} + #define __HAVE_ARCH_PMDP_SET_WRPROTECT static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) @@ -932,6 +1068,10 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd) { } +static inline void update_mmu_cache_pud(struct vm_area_struct *vma, + unsigned long addr, pud_t *pud) +{ +} #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY static inline pte_t pte_swp_mksoft_dirty(pte_t pte) diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 62b775926045..73c7ccc38912 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -106,6 +106,21 @@ static inline void native_pud_clear(pud_t *pud) native_set_pud(pud, native_make_pud(0)); } +static inline pud_t native_pudp_get_and_clear(pud_t *xp) +{ +#ifdef CONFIG_SMP + return native_make_pud(xchg(&xp->pud, 0)); +#else + /* native_local_pudp_get_and_clear, + * but duplicated because of cyclic dependency + */ + pud_t ret = *xp; + + native_pud_clear(xp); + return ret; +#endif +} + static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) { *pgdp = pgd; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index a1bfba0f7234..4797e87b0fb6 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -425,6 +425,7 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = { .pmd_clear = native_pmd_clear, #endif .set_pud = native_set_pud, + .set_pud_at = native_set_pud_at, .pmd_val = PTE_IDENT, .make_pmd = PTE_IDENT, diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 3feec5af4e67..6cbdff26bb96 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -445,6 +445,26 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, return changed; } + +int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, + pud_t *pudp, pud_t entry, int dirty) +{ + int changed = !pud_same(*pudp, entry); + + VM_BUG_ON(address & ~HPAGE_PUD_MASK); + + if (changed && dirty) { + *pudp = entry; + /* + * We had a write-protection fault here and changed the pud + * to to more permissive. No need to flush the TLB for that, + * #PF is architecturally guaranteed to do that and in the + * worst-case we'll generate a spurious fault. + */ + } + + return changed; +} #endif int ptep_test_and_clear_young(struct vm_area_struct *vma, @@ -474,6 +494,17 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma, return ret; } +int pudp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pud_t *pudp) +{ + int ret = 0; + + if (pud_young(*pudp)) + ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, + (unsigned long *)pudp); + + return ret; +} #endif int ptep_clear_flush_young(struct vm_area_struct *vma, -- cgit From 220ced1676c490c3192dd9bc1a06be86dee88a56 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 24 Feb 2017 14:57:12 -0800 Subject: mm: fix get_user_pages() vs device-dax pud mappings A new unit test for the device-dax 1GB enabling currently fails with this warning before hanging the test thread: WARNING: CPU: 0 PID: 21 at lib/percpu-refcount.c:155 percpu_ref_switch_to_atomic_rcu+0x1e3/0x1f0 percpu ref (dax_pmem_percpu_release [dax_pmem]) <= 0 (0) after switching to atomic [..] 
CPU: 0 PID: 21 Comm: rcuos/1 Tainted: G O 4.10.0-rc7-next-20170207+ #944 [..] Call Trace: dump_stack+0x86/0xc3 __warn+0xcb/0xf0 warn_slowpath_fmt+0x5f/0x80 ? rcu_nocb_kthread+0x27a/0x510 ? dax_pmem_percpu_exit+0x50/0x50 [dax_pmem] percpu_ref_switch_to_atomic_rcu+0x1e3/0x1f0 ? percpu_ref_exit+0x60/0x60 rcu_nocb_kthread+0x339/0x510 ? rcu_nocb_kthread+0x27a/0x510 kthread+0x101/0x140 The get_user_pages() path needs to arrange for references to be taken against the dev_pagemap instance backing the pud mapping. Refactor the existing __gup_device_huge_pmd() to also account for the pud case. Link: http://lkml.kernel.org/r/148653181153.38226.9605457830505509385.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Cc: Dave Jiang Cc: Matthew Wilcox Cc: Ross Zwisler Cc: Kirill A. Shutemov Cc: Nilesh Choudhury Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/gup.c | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 0d4fb3ebbbac..99c7805a9693 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -154,14 +154,12 @@ static inline void get_head_page_multiple(struct page *page, int nr) SetPageReferenced(page); } -static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr, +static int __gup_device_huge(unsigned long pfn, unsigned long addr, unsigned long end, struct page **pages, int *nr) { int nr_start = *nr; - unsigned long pfn = pmd_pfn(pmd); struct dev_pagemap *pgmap = NULL; - pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT; do { struct page *page = pfn_to_page(pfn); @@ -180,6 +178,24 @@ static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr, return 1; } +static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr, + unsigned long end, struct page **pages, int *nr) +{ + unsigned long fault_pfn; + + fault_pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); + return __gup_device_huge(fault_pfn, addr, end, pages, nr); +} + +static int __gup_device_huge_pud(pud_t pud, unsigned long addr, + unsigned long end, struct page **pages, int *nr) +{ + unsigned long fault_pfn; + + fault_pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); + return __gup_device_huge(fault_pfn, addr, end, pages, nr); +} + static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { @@ -251,9 +267,13 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr, if (!pte_allows_gup(pud_val(pud), write)) return 0; + + VM_BUG_ON(!pfn_valid(pud_pfn(pud))); + if (pud_devmap(pud)) + return __gup_device_huge_pud(pud, addr, end, pages, nr); + /* hugepages are never "special" */ VM_BUG_ON(pud_flags(pud) & _PAGE_SPECIAL); - VM_BUG_ON(!pfn_valid(pud_pfn(pud))); refs = 0; head = pud_page(pud); -- cgit From 897ab3e0c49e24b62e2d54d165c7afec6bbca65b Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Fri, 24 Feb 2017 14:58:22 -0800 Subject: userfaultfd: non-cooperative: add event for memory unmaps When a non-cooperative userfaultfd monitor copies pages in the background, it may encounter regions that were already unmapped. Addition of UFFD_EVENT_UNMAP allows the uffd monitor to track precisely changes in the virtual memory layout. Since there might be different uffd contexts for the affected VMAs, we first should create a temporary representation for the unmap event for each uffd context and then notify them one by one to the appropriate userfault file descriptors. 
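For illustration only (not part of this patch): a minimal monitor-side sketch of consuming the new event. It assumes the feature is negotiated with a UFFD_FEATURE_EVENT_UNMAP bit at UFFDIO_API time and that the affected range is reported through msg.arg.remove, as for REMOVE events.

#include <linux/userfaultfd.h>
#include <stdio.h>
#include <unistd.h>

/* hypothetical monitor loop; setup and error handling trimmed */
static void drain_uffd_events(int uffd)
{
	struct uffd_msg msg;

	while (read(uffd, &msg, sizeof(msg)) == (ssize_t)sizeof(msg)) {
		if (msg.event == UFFD_EVENT_UNMAP) {
			/* stop background copying into this range */
			printf("unmapped %#llx-%#llx\n",
			       (unsigned long long)msg.arg.remove.start,
			       (unsigned long long)msg.arg.remove.end);
		}
		/* pagefault, fork, remap and remove events elided */
	}
}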
The event notification occurs after the mmap_sem has been released. [arnd@arndb.de: fix nommu build] Link: http://lkml.kernel.org/r/20170203165141.3665284-1-arnd@arndb.de [mhocko@suse.com: fix nommu build] Link: http://lkml.kernel.org/r/20170202091503.GA22823@dhcp22.suse.cz Link: http://lkml.kernel.org/r/1485542673-24387-3-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Signed-off-by: Michal Hocko Signed-off-by: Arnd Bergmann Acked-by: Hillf Danton Cc: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Mike Kravetz Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/entry/vdso/vma.c | 2 +- arch/x86/mm/mpx.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 10820f6cefbf..572cee3fccff 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -186,7 +186,7 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr) if (IS_ERR(vma)) { ret = PTR_ERR(vma); - do_munmap(mm, text_start, image->size); + do_munmap(mm, text_start, image->size, NULL); } else { current->mm->context.vdso = (void __user *)text_start; current->mm->context.vdso_image = image; diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index aad4ac386f98..c98079684bdb 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -51,7 +51,7 @@ static unsigned long mpx_mmap(unsigned long len) down_write(&mm->mmap_sem); addr = do_mmap(NULL, 0, len, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, VM_MPX, 0, &populate); + MAP_ANONYMOUS | MAP_PRIVATE, VM_MPX, 0, &populate, NULL); up_write(&mm->mmap_sem); if (populate) mm_populate(addr, populate); @@ -893,7 +893,7 @@ static int unmap_entire_bt(struct mm_struct *mm, * avoid recursion, do_munmap() will check whether it comes * from one bounds table through VM_MPX flag. */ - return do_munmap(mm, bt_addr, mpx_bt_size_bytes(mm)); + return do_munmap(mm, bt_addr, mpx_bt_size_bytes(mm), NULL); } static int try_unmap_single_bt(struct mm_struct *mm, -- cgit From 712c604dcdf8186295e2af694adf52c6842ad100 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Fri, 24 Feb 2017 14:58:44 -0800 Subject: mm: wire up GFP flag passing in dma_alloc_from_contiguous The callers of the DMA alloc functions already provide the proper context GFP flags. Make sure to pass them through to the CMA allocator, to make the CMA compaction context aware. 
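For illustration only (not part of this patch): with the extra gfp argument, callers simply forward their own allocation flags, so atomic callers skip CMA and fall back to the normal page allocator. The helper below is hypothetical and mirrors the arch/x86/kernel/pci-dma.c hunk that follows.

#include <linux/dma-contiguous.h>
#include <linux/gfp.h>
#include <linux/mm.h>

static struct page *alloc_dma_page(struct device *dev, size_t size, gfp_t flag)
{
	unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT;
	struct page *page = NULL;

	/* CMA may sleep, so try it only when the gfp mask allows blocking */
	if (gfpflags_allow_blocking(flag))
		page = dma_alloc_from_contiguous(dev, count, get_order(size),
						 flag);
	if (!page)
		page = alloc_pages(flag, get_order(size));

	return page;
}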
Link: http://lkml.kernel.org/r/20170127172328.18574-3-l.stach@pengutronix.de Signed-off-by: Lucas Stach Acked-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Radim Krcmar Cc: Catalin Marinas Cc: Will Deacon Cc: Chris Zankel Cc: Ralf Baechle Cc: Paolo Bonzini Cc: Alexander Graf Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/pci-dma.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index d30c37750765..d5c223c9cf11 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -91,7 +91,8 @@ again: page = NULL; /* CMA can be used only in the context which permits sleeping */ if (gfpflags_allow_blocking(flag)) { - page = dma_alloc_from_contiguous(dev, count, get_order(size)); + page = dma_alloc_from_contiguous(dev, count, get_order(size), + flag); if (page && page_to_phys(page) + size > dma_mask) { dma_release_from_contiguous(dev, page, count); page = NULL; -- cgit From 76062b4ae2ea54fcfb8fce6940921a90f33f38da Mon Sep 17 00:00:00 2001 From: Shanth Murthy Date: Mon, 13 Feb 2017 04:02:52 -0800 Subject: platform/x86: intel_pmc_ipc: read s0ix residency API This patch adds a new API to indicate S0ix residency in usec. It utilizes the PMC Global Control Registers (GCR) to read deep and shallow S0ix residency. PMC MMIO resources: o Lower 4kB: IPC1 (PMC inter-processor communication) interface o Upper 4kB: GCR (Global Control Registers) This enables the power management framework to take corrective actions when the platform fails to enter S0ix after kernel freeze as part of the suspend to idle flow. (echo freeze > /sys/power/state). This is expected to be used with an S0ix failsafe framework such as: [rajneesh: folded in "fix division in 32-bit case" from Andy Shevchenko] Signed-off-by: Rajneesh Bhardwaj Signed-off-by: Shanth Murthy [andy: fixed kbuild error, removed "total" from variables, fixed macro] Signed-off-by: Andy Shevchenko --- arch/x86/include/asm/intel_pmc_ipc.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/intel_pmc_ipc.h b/arch/x86/include/asm/intel_pmc_ipc.h index cd0310e186f4..4291b6a5ddf7 100644 --- a/arch/x86/include/asm/intel_pmc_ipc.h +++ b/arch/x86/include/asm/intel_pmc_ipc.h @@ -30,6 +30,7 @@ int intel_pmc_ipc_raw_cmd(u32 cmd, u32 sub, u8 *in, u32 inlen, u32 *out, u32 outlen, u32 dptr, u32 sptr); int intel_pmc_ipc_command(u32 cmd, u32 sub, u8 *in, u32 inlen, u32 *out, u32 outlen); +int intel_pmc_s0ix_counter_read(u64 *data); #else @@ -50,6 +51,11 @@ static inline int intel_pmc_ipc_command(u32 cmd, u32 sub, u8 *in, u32 inlen, return -EINVAL; } +static inline int intel_pmc_s0ix_counter_read(u64 *data) +{ + return -EINVAL; +} + #endif /*CONFIG_INTEL_PMC_IPC*/ #endif -- cgit From 7d134b2ce639448199052fd573a324f7e7cd5ed8 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Mon, 27 Feb 2017 14:26:56 -0800 Subject: kprobes: move kprobe declarations to asm-generic/kprobes.h Often all that is needed is these small helpers, instead of compiler.h or a full kprobes.h. This is important for asm helpers; in fact, even some asm/kprobes.h files make use of these helpers. Instead, just keep a generic asm file with helpers useful for asm code, with as little clutter as possible. Likewise, we now need to address what to do about this file both when architectures have CONFIG_HAVE_KPROBES, and when they do not.
The same goes for architectures that have CONFIG_HAVE_KPROBES but have disabled CONFIG_KPROBES. Right now most asm/kprobes.h do not have guards against CONFIG_KPROBES, which means most architecture code cannot include asm/kprobes.h safely. Correct this and add guards for architectures missing them. Additionally, provide architectures that do not have kprobes support with the default asm-generic solution. This lets us force asm/kprobes.h on the header include/linux/kprobes.h always, but most importantly we can now safely include just asm/kprobes.h in architecture code without bringing in the full kitchen sink of header files. Two architectures already provided a guard against CONFIG_KPROBES in their kprobes.h: sh, arch. The rest of the architectures needed guards added. We avoid including any not-needed headers in asm/kprobes.h unless kprobes have been enabled. In a subsequent atomic change we can then try to remove compiler.h from include/linux/kprobes.h. During this sweep I've also identified a few architectures defining a common macro needed for both kprobes and ftrace: the definition of the breakpoint instruction. Some refer to this as BREAKPOINT_INSTRUCTION. This must be kept outside of the #ifdef CONFIG_KPROBES guard. [mcgrof@kernel.org: fix arm64 build] Link: http://lkml.kernel.org/r/CAB=NE6X1WMByuARS4mZ1g9+W=LuVBnMDnh_5zyN0CLADaVh=Jw@mail.gmail.com [sfr@canb.auug.org.au: fixup for kprobes declarations moving] Link: http://lkml.kernel.org/r/20170214165933.13ebd4f4@canb.auug.org.au Link: http://lkml.kernel.org/r/20170203233139.32682-1-mcgrof@kernel.org Signed-off-by: Luis R. Rodriguez Signed-off-by: Stephen Rothwell Acked-by: Masami Hiramatsu Cc: Arnd Bergmann Cc: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: David S. Miller Cc: Ingo Molnar Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Andy Lutomirski Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/kprobes.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index d1d1e5094c28..200581691c6e 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -21,6 +21,12 @@ * * See arch/x86/kernel/kprobes.c for x86 kprobes history. */ + +#include + +#define BREAKPOINT_INSTRUCTION 0xcc + +#ifdef CONFIG_KPROBES #include #include #include @@ -32,7 +38,6 @@ struct pt_regs; struct kprobe; typedef u8 kprobe_opcode_t; -#define BREAKPOINT_INSTRUCTION 0xcc #define RELATIVEJUMP_OPCODE 0xe9 #define RELATIVEJUMP_SIZE 5 #define RELATIVECALL_OPCODE 0xe8 @@ -116,4 +121,6 @@ extern int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *data); extern int kprobe_int3_handler(struct pt_regs *regs); extern int kprobe_debug_handler(struct pt_regs *regs); + +#endif /* CONFIG_KPROBES */ #endif /* _ASM_X86_KPROBES_H */ -- cgit From 9332ef9dbd172d4ab0a0141df7cb21c696a5ce96 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 27 Feb 2017 14:28:47 -0800 Subject: scripts/spelling.txt: add "an user" pattern and fix typo instances Fix typos and add the following to the scripts/spelling.txt: an user||a user an userspace||a userspace I also added "userspace" to the list since it is a common word in Linux. I found some instances of "an userfaultfd", but I did not add it to the list. I felt it is endless to find words that start with "user" such as "userland" etc., so I must draw a line somewhere.
Link: http://lkml.kernel.org/r/1481573103-11329-4-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 2fd7586aad4d..1cda35277278 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4102,7 +4102,7 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, * as a SMAP violation if all of the following * conditions are ture: * - X86_CR4_SMAP is set in CR4 - * - An user page is accessed + * - A user page is accessed * - Page fault in kernel mode * - if CPL = 3 or X86_EFLAGS_AC is clear * -- cgit From 03440c4e5e2f167764997a7e0f2dbb279d8078e6 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 27 Feb 2017 14:28:49 -0800 Subject: scripts/spelling.txt: add "an union" pattern and fix typo instances Fix typos and add the following to the scripts/spelling.txt: an union||a union Link: http://lkml.kernel.org/r/1481573103-11329-5-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/desc_defs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h index eb5deb42484d..49265345d4d2 100644 --- a/arch/x86/include/asm/desc_defs.h +++ b/arch/x86/include/asm/desc_defs.h @@ -15,7 +15,7 @@ * FIXME: Accessing the desc_struct through its fields is more elegant, * and should be the one valid thing to do. However, a lot of open code * still touches the a and b accessors, and doing this allow us to do it - * incrementally. We keep the signature as a struct, rather than an union, + * incrementally. We keep the signature as a struct, rather than a union, * so we can get rid of it transparently in the future -- glommer */ /* 8 byte segment descriptor */ -- cgit From f1f1007644ffc8051a4c11427d58b1967ae7b75a Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Mon, 27 Feb 2017 14:30:07 -0800 Subject: mm: add new mmgrab() helper Apart from adding the helper function itself, the rest of the kernel is converted mechanically using: git grep -l 'atomic_inc.*mm_count' | xargs sed -i 's/atomic_inc(&\(.*\)->mm_count);/mmgrab\(\1\);/' git grep -l 'atomic_inc.*mm_count' | xargs sed -i 's/atomic_inc(&\(.*\)\.mm_count);/mmgrab\(\&\1\);/' This is needed for a later patch that hooks into the helper, but might be a worthwhile cleanup on its own. (Michal Hocko provided most of the kerneldoc comment.) 
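For reference (the helper itself is added outside arch/x86 by this series, so it is not in the hunk below): mmgrab() is essentially a typed wrapper around the existing counter, roughly the sketch here, which pins the mm_struct itself rather than its address space.

/* sketch of the new helper; the real patch also carries a kerneldoc comment */
static inline void mmgrab(struct mm_struct *mm)
{
	atomic_inc(&mm->mm_count);
}

Call sites such as cpu_init() in the hunk below then read mmgrab(&init_mm) instead of open-coding atomic_inc(&init_mm.mm_count).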
Link: http://lkml.kernel.org/r/20161218123229.22952-1-vegard.nossum@oracle.com Signed-off-by: Vegard Nossum Acked-by: Michal Hocko Acked-by: Peter Zijlstra (Intel) Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f07005e6f461..c64ca5929cb5 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1510,7 +1510,7 @@ void cpu_init(void) for (i = 0; i <= IO_BITMAP_LONGS; i++) t->io_bitmap[i] = ~0UL; - atomic_inc(&init_mm.mm_count); + mmgrab(&init_mm); me->active_mm = &init_mm; BUG_ON(me->mm); enter_lazy_tlb(&init_mm, me); @@ -1561,7 +1561,7 @@ void cpu_init(void) /* * Set up and load the per-CPU TSS and LDT */ - atomic_inc(&init_mm.mm_count); + mmgrab(&init_mm); curr->active_mm = &init_mm; BUG_ON(curr->mm); enter_lazy_tlb(&init_mm, curr); -- cgit From 2959a5f726f6510d6dd7c958f8877e08d0cf589c Mon Sep 17 00:00:00 2001 From: Jinbum Park Date: Mon, 27 Feb 2017 14:30:22 -0800 Subject: mm: add arch-independent testcases for RODATA This patch adds arch-independent testcases for RODATA. Both x86 and x86_64 already have testcases for RODATA, but they are arch-specific because they use inline assembly directly. Also, cacheflush.h is not a suitable location for rodata-test-related things. Since the testcases lived in cacheflush.h, if someone changed the state of CONFIG_DEBUG_RODATA_TEST, it caused overhead in the kernel build. To solve the above issues, write arch-independent testcases and move them to a shared location. [jinb.park7@gmail.com: fix config dependency] Link: http://lkml.kernel.org/r/20170209131625.GA16954@pjb1027-Latitude-E5410 Link: http://lkml.kernel.org/r/20170129105436.GA9303@pjb1027-Latitude-E5410 Signed-off-by: Jinbum Park Acked-by: Kees Cook Cc: Ingo Molnar Cc: H. Peter Anvin Cc: Arjan van de Ven Cc: Laura Abbott Cc: Russell King Cc: Valentin Rothberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig.debug | 8 ----- arch/x86/include/asm/cacheflush.h | 10 ------ arch/x86/kernel/Makefile | 1 - arch/x86/kernel/test_rodata.c | 75 --------------------------------------- arch/x86/mm/init_32.c | 4 --- arch/x86/mm/init_64.c | 5 --- 6 files changed, 103 deletions(-) delete mode 100644 arch/x86/kernel/test_rodata.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index c4cba00dbdee..63c1d13aaf9f 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -74,14 +74,6 @@ config EFI_PGT_DUMP issues with the mapping of the EFI runtime regions into that table. -config DEBUG_RODATA_TEST - bool "Testcase for the marking rodata read-only" - default y - ---help--- - This option enables a testcase for the setting rodata read-only - as well as for the change_page_attr() infrastructure.
- If in doubt, say "N" - config DEBUG_WX bool "Warn on W+X mappings at boot" select X86_PTDUMP_CORE diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index 872877d930de..e7e1942edff7 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -90,18 +90,8 @@ void clflush_cache_range(void *addr, unsigned int size); #define mmio_flush_range(addr, size) clflush_cache_range(addr, size) -extern const int rodata_test_data; extern int kernel_set_to_readonly; void set_kernel_text_rw(void); void set_kernel_text_ro(void); -#ifdef CONFIG_DEBUG_RODATA_TEST -int rodata_test(void); -#else -static inline int rodata_test(void) -{ - return 0; -} -#endif - #endif /* _ASM_X86_CACHEFLUSH_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index bdcdb3b3a219..84c00592d359 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -100,7 +100,6 @@ obj-$(CONFIG_HPET_TIMER) += hpet.o obj-$(CONFIG_APB_TIMER) += apb_timer.o obj-$(CONFIG_AMD_NB) += amd_nb.o -obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c deleted file mode 100644 index 222e84e2432e..000000000000 --- a/arch/x86/kernel/test_rodata.c +++ /dev/null @@ -1,75 +0,0 @@ -/* - * test_rodata.c: functional test for mark_rodata_ro function - * - * (C) Copyright 2008 Intel Corporation - * Author: Arjan van de Ven - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. - */ -#include -#include -#include - -int rodata_test(void) -{ - unsigned long result; - unsigned long start, end; - - /* test 1: read the value */ - /* If this test fails, some previous testrun has clobbered the state */ - if (!rodata_test_data) { - printk(KERN_ERR "rodata_test: test 1 fails (start data)\n"); - return -ENODEV; - } - - /* test 2: write to the variable; this should fault */ - /* - * If this test fails, we managed to overwrite the data - * - * This is written in assembly to be able to catch the - * exception that is supposed to happen in the correct - * case - */ - - result = 1; - asm volatile( - "0: mov %[zero],(%[rodata_test])\n" - " mov %[zero], %[rslt]\n" - "1:\n" - ".section .fixup,\"ax\"\n" - "2: jmp 1b\n" - ".previous\n" - _ASM_EXTABLE(0b,2b) - : [rslt] "=r" (result) - : [rodata_test] "r" (&rodata_test_data), [zero] "r" (0UL) - ); - - - if (!result) { - printk(KERN_ERR "rodata_test: test data was not read only\n"); - return -ENODEV; - } - - /* test 3: check the value hasn't changed */ - /* If this test fails, we managed to overwrite the data */ - if (!rodata_test_data) { - printk(KERN_ERR "rodata_test: Test 3 fails (end data)\n"); - return -ENODEV; - } - /* test 4: check if the rodata section is 4Kb aligned */ - start = (unsigned long)__start_rodata; - end = (unsigned long)__end_rodata; - if (start & (PAGE_SIZE - 1)) { - printk(KERN_ERR "rodata_test: .rodata is not 4k aligned\n"); - return -ENODEV; - } - if (end & (PAGE_SIZE - 1)) { - printk(KERN_ERR "rodata_test: .rodata end is not 4k aligned\n"); - return -ENODEV; - } - - return 0; -} diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 928d657de829..2b4b53e6793f 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -864,9 +864,6 @@ static noinline int do_test_wp_bit(void) return flag; } 
-const int rodata_test_data = 0xC3; -EXPORT_SYMBOL_GPL(rodata_test_data); - int kernel_set_to_readonly __read_mostly; void set_kernel_text_rw(void) @@ -939,7 +936,6 @@ void mark_rodata_ro(void) set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", size >> 10); - rodata_test(); #ifdef CONFIG_CPA_DEBUG printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, start + size); diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 97346f987ef2..15173d37f399 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1000,9 +1000,6 @@ void __init mem_init(void) mem_init_print_info(NULL); } -const int rodata_test_data = 0xC3; -EXPORT_SYMBOL_GPL(rodata_test_data); - int kernel_set_to_readonly; void set_kernel_text_rw(void) @@ -1071,8 +1068,6 @@ void mark_rodata_ro(void) all_end = roundup((unsigned long)_brk_end, PMD_SIZE); set_memory_nx(text_end, (all_end - text_end) >> PAGE_SHIFT); - rodata_test(); - #ifdef CONFIG_CPA_DEBUG printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end); set_memory_rw(start, (end-start) >> PAGE_SHIFT); -- cgit From 3e761a42e19c63b624ebac94d918d8a15e07e2a7 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Mon, 27 Feb 2017 14:30:25 -0800 Subject: mm, x86: fix HIGHMEM64 && PARAVIRT build config for native_pud_clear() Looks like I also missed the build config that includes CONFIG_HIGHMEM64G && CONFIG_PARAVIRT to export the native_pud_clear() dummy function. Fixes: a00cc7d9dd93d ("mm, x86: add support for PUD-sized transparent hugepages") Link: http://lkml.kernel.org/r/148823188084.56076.17451228917824355200.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Dave Jiang Reported-by: Laura Abbott Reported-by: Boris Ostrovsky Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable-3level.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index 8f50fb3f04e1..72277b1028a5 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -121,7 +121,8 @@ static inline void native_pmd_clear(pmd_t *pmd) *(tmp + 1) = 0; } -#ifndef CONFIG_SMP +#if !defined(CONFIG_SMP) || (defined(CONFIG_HIGHMEM64G) && \ + defined(CONFIG_PARAVIRT)) static inline void native_pud_clear(pud_t *pudp) { } -- cgit
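For illustration only: the 32-bit PAE helpers above (the split_pud union and native_pudp_get_and_clear()) clear the low 32 bits with an atomic exchange before touching the high half, because the low word carries the present bit. Below is a stand-alone user-space sketch of the same split get-and-clear pattern, using compiler builtins rather than the kernel's xchg(); real page-table updates additionally depend on TLB flushing rules not modeled here.

#include <stdint.h>
#include <stdio.h>

union split64 {
	struct {
		uint32_t lo;	/* would hold the present bit in a real entry */
		uint32_t hi;
	};
	uint64_t val;
};

static uint64_t get_and_clear(union split64 *p)
{
	union split64 res;

	/* the atomic exchange acts as a barrier before the high half is touched */
	res.lo = __atomic_exchange_n(&p->lo, 0, __ATOMIC_SEQ_CST);
	res.hi = p->hi;
	p->hi = 0;
	return res.val;
}

int main(void)
{
	union split64 entry = { .val = 0x0000000180000067ULL };
	uint64_t old = get_and_clear(&entry);

	printf("old=%#llx now=%#llx\n",
	       (unsigned long long)old, (unsigned long long)entry.val);
	return 0;
}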