From 3d94ae0c70a71a9824479366775e2c7679a57d94 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Wed, 28 Sep 2011 11:49:28 -0700
Subject: x86/cmpxchg: add a locked add() helper

Mostly to remove some conditional code in spinlock.h.

Signed-off-by: Jeremy Fitzhardinge <jeremy@goop.org>
---
 arch/x86/include/asm/cmpxchg.h  | 42 +++++++++++++++++++++++++++++++++++++++++
 arch/x86/include/asm/spinlock.h | 15 +--------------
 2 files changed, 43 insertions(+), 14 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h
index 5d3acdf5a7a6..49eade13161c 100644
--- a/arch/x86/include/asm/cmpxchg.h
+++ b/arch/x86/include/asm/cmpxchg.h
@@ -14,6 +14,8 @@ extern void __cmpxchg_wrong_size(void)
 	__compiletime_error("Bad argument size for cmpxchg");
 extern void __xadd_wrong_size(void)
 	__compiletime_error("Bad argument size for xadd");
+extern void __add_wrong_size(void)
+	__compiletime_error("Bad argument size for add");
 
 /*
  * Constants for operation sizes. On 32-bit, the 64-bit size it set to
@@ -207,4 +209,44 @@ extern void __xadd_wrong_size(void)
 #define xadd_sync(ptr, inc)	__xadd((ptr), (inc), "lock; ")
 #define xadd_local(ptr, inc)	__xadd((ptr), (inc), "")
 
+#define __add(ptr, inc, lock)						\
+	({								\
+	        __typeof__ (*(ptr)) __ret = (inc);			\
+		switch (sizeof(*(ptr))) {				\
+		case __X86_CASE_B:					\
+			asm volatile (lock "addb %b1, %0\n"		\
+				      : "+m" (*(ptr)) : "ri" (inc)	\
+				      : "memory", "cc");		\
+			break;						\
+		case __X86_CASE_W:					\
+			asm volatile (lock "addw %w1, %0\n"		\
+				      : "+m" (*(ptr)) : "ri" (inc)	\
+				      : "memory", "cc");		\
+			break;						\
+		case __X86_CASE_L:					\
+			asm volatile (lock "addl %1, %0\n"		\
+				      : "+m" (*(ptr)) : "ri" (inc)	\
+				      : "memory", "cc");		\
+			break;						\
+		case __X86_CASE_Q:					\
+			asm volatile (lock "addq %1, %0\n"		\
+				      : "+m" (*(ptr)) : "ri" (inc)	\
+				      : "memory", "cc");		\
+			break;						\
+		default:						\
+			__add_wrong_size();				\
+		}							\
+		__ret;							\
+	})
+
+/*
+ * add_*() adds "inc" to "*ptr"
+ *
+ * __add() takes a lock prefix
+ * add_smp() is locked when multiple CPUs are online
+ * add_sync() is always locked
+ */
+#define add_smp(ptr, inc)	__add((ptr), (inc), LOCK_PREFIX)
+#define add_sync(ptr, inc)	__add((ptr), (inc), "lock; ")
+
 #endif	/* ASM_X86_CMPXCHG_H */
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 972c260919a3..a82c2bf504b6 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -79,23 +79,10 @@ static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
 	return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail;
 }
 
-#if (NR_CPUS < 256)
 static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
 {
-	asm volatile(UNLOCK_LOCK_PREFIX "incb %0"
-		     : "+m" (lock->head_tail)
-		     :
-		     : "memory", "cc");
+	__add(&lock->tickets.head, 1, UNLOCK_LOCK_PREFIX);
 }
-#else
-static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
-{
-	asm volatile(UNLOCK_LOCK_PREFIX "incw %0"
-		     : "+m" (lock->head_tail)
-		     :
-		     : "memory", "cc");
-}
-#endif
 
 static inline int __ticket_spin_is_locked(arch_spinlock_t *lock)
 {
-- 
cgit 


From 31a8394e069e47dc47f4c29e4213aa943342f19f Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Fri, 30 Sep 2011 12:14:10 -0700
Subject: x86: consolidate xchg and xadd macros

They both have a basic "put new value in location, return old value"
pattern, so they can use the same macro easily.

Signed-off-by: Jeremy Fitzhardinge <jeremy@goop.org>
---
 arch/x86/include/asm/cmpxchg.h | 114 +++++++++++++----------------------------
 1 file changed, 36 insertions(+), 78 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h
index 49eade13161c..5488e10b9dba 100644
--- a/arch/x86/include/asm/cmpxchg.h
+++ b/arch/x86/include/asm/cmpxchg.h
@@ -33,60 +33,47 @@ extern void __add_wrong_size(void)
 #define	__X86_CASE_Q	-1		/* sizeof will never return -1 */
 #endif
 
+/* 
+ * An exchange-type operation, which takes a value and a pointer, and
+ * returns a the old value.
+ */
+#define __xchg_op(ptr, arg, op, lock)					\
+	({								\
+	        __typeof__ (*(ptr)) __ret = (arg);			\
+		switch (sizeof(*(ptr))) {				\
+		case __X86_CASE_B:					\
+			asm volatile (lock #op "b %b0, %1\n"		\
+				      : "+r" (__ret), "+m" (*(ptr))	\
+				      : : "memory", "cc");		\
+			break;						\
+		case __X86_CASE_W:					\
+			asm volatile (lock #op "w %w0, %1\n"		\
+				      : "+r" (__ret), "+m" (*(ptr))	\
+				      : : "memory", "cc");		\
+			break;						\
+		case __X86_CASE_L:					\
+			asm volatile (lock #op "l %0, %1\n"		\
+				      : "+r" (__ret), "+m" (*(ptr))	\
+				      : : "memory", "cc");		\
+			break;						\
+		case __X86_CASE_Q:					\
+			asm volatile (lock #op "q %q0, %1\n"		\
+				      : "+r" (__ret), "+m" (*(ptr))	\
+				      : : "memory", "cc");		\
+			break;						\
+		default:						\
+			__ ## op ## _wrong_size();			\
+		}							\
+		__ret;							\
+	})
+
 /*
  * Note: no "lock" prefix even on SMP: xchg always implies lock anyway.
  * Since this is generally used to protect other memory information, we
  * use "asm volatile" and "memory" clobbers to prevent gcc from moving
  * information around.
  */
-#define __xchg(x, ptr, size)						\
-({									\
-	__typeof(*(ptr)) __x = (x);					\
-	switch (size) {							\
-	case __X86_CASE_B:						\
-	{								\
-		volatile u8 *__ptr = (volatile u8 *)(ptr);		\
-		asm volatile("xchgb %0,%1"				\
-			     : "=q" (__x), "+m" (*__ptr)		\
-			     : "0" (__x)				\
-			     : "memory");				\
-		break;							\
-	}								\
-	case __X86_CASE_W:						\
-	{								\
-		volatile u16 *__ptr = (volatile u16 *)(ptr);		\
-		asm volatile("xchgw %0,%1"				\
-			     : "=r" (__x), "+m" (*__ptr)		\
-			     : "0" (__x)				\
-			     : "memory");				\
-		break;							\
-	}								\
-	case __X86_CASE_L:						\
-	{								\
-		volatile u32 *__ptr = (volatile u32 *)(ptr);		\
-		asm volatile("xchgl %0,%1"				\
-			     : "=r" (__x), "+m" (*__ptr)		\
-			     : "0" (__x)				\
-			     : "memory");				\
-		break;							\
-	}								\
-	case __X86_CASE_Q:						\
-	{								\
-		volatile u64 *__ptr = (volatile u64 *)(ptr);		\
-		asm volatile("xchgq %0,%1"				\
-			     : "=r" (__x), "+m" (*__ptr)		\
-			     : "0" (__x)				\
-			     : "memory");				\
-		break;							\
-	}								\
-	default:							\
-		__xchg_wrong_size();					\
-	}								\
-	__x;								\
-})
-
-#define xchg(ptr, v)							\
-	__xchg((v), (ptr), sizeof(*ptr))
+#define xchg(ptr, v)	__xchg_op((ptr), (v), xchg, "")
 
 /*
  * Atomic compare and exchange.  Compare OLD with MEM, if identical,
@@ -167,36 +154,6 @@ extern void __add_wrong_size(void)
 	__cmpxchg_local((ptr), (old), (new), sizeof(*ptr))
 #endif
 
-#define __xadd(ptr, inc, lock)						\
-	({								\
-	        __typeof__ (*(ptr)) __ret = (inc);			\
-		switch (sizeof(*(ptr))) {				\
-		case __X86_CASE_B:					\
-			asm volatile (lock "xaddb %b0, %1\n"		\
-				      : "+r" (__ret), "+m" (*(ptr))	\
-				      : : "memory", "cc");		\
-			break;						\
-		case __X86_CASE_W:					\
-			asm volatile (lock "xaddw %w0, %1\n"		\
-				      : "+r" (__ret), "+m" (*(ptr))	\
-				      : : "memory", "cc");		\
-			break;						\
-		case __X86_CASE_L:					\
-			asm volatile (lock "xaddl %0, %1\n"		\
-				      : "+r" (__ret), "+m" (*(ptr))	\
-				      : : "memory", "cc");		\
-			break;						\
-		case __X86_CASE_Q:					\
-			asm volatile (lock "xaddq %q0, %1\n"		\
-				      : "+r" (__ret), "+m" (*(ptr))	\
-				      : : "memory", "cc");		\
-			break;						\
-		default:						\
-			__xadd_wrong_size();				\
-		}							\
-		__ret;							\
-	})
-
 /*
  * xadd() adds "inc" to "*ptr" and atomically returns the previous
  * value of "*ptr".
@@ -205,6 +162,7 @@ extern void __add_wrong_size(void)
  * xadd_sync() is always locked
  * xadd_local() is never locked
  */
+#define __xadd(ptr, inc, lock)	__xchg_op((ptr), (inc), xadd, lock)
 #define xadd(ptr, inc)		__xadd((ptr), (inc), LOCK_PREFIX)
 #define xadd_sync(ptr, inc)	__xadd((ptr), (inc), "lock; ")
 #define xadd_local(ptr, inc)	__xadd((ptr), (inc), "")
-- 
cgit 


From 4fc3490114bb159bd4fff1b3c96f4320fe6fb08f Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@amacapital.net>
Date: Mon, 7 Nov 2011 16:33:40 -0800
Subject: x86-64: Set siginfo and context on vsyscall emulation faults

To make this work, we teach the page fault handler how to send
signals on failed uaccess.  This only works for user addresses
(kernel addresses will never hit the page fault handler in the
first place), so we need to generate signals for those
separately.

This gets the tricky case right: if the user buffer spans
multiple pages and only the second page is invalid, we set
cr2 and si_addr correctly.  UML relies on this behavior to
"fault in" pages as needed.

We steal a bit from thread_info.uaccess_err to enable this.
Before this change, uaccess_err was a 32-bit boolean value.

This fixes issues with UML when vsyscall=emulate.

Reported-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Cc: richard -rw- weinberger <richard.weinberger@gmail.com>
Cc: H. Peter Anvin <hpa@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/4c8f91de7ec5cd2ef0f59521a04e1015f11e42b4.1320712291.git.luto@amacapital.net
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/thread_info.h |  3 +-
 arch/x86/include/asm/uaccess.h     |  2 +-
 arch/x86/kernel/vsyscall_64.c      | 75 ++++++++++++++++++++++++++++++++++----
 arch/x86/mm/extable.c              |  2 +-
 arch/x86/mm/fault.c                | 22 ++++++++---
 5 files changed, 87 insertions(+), 17 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index a1fe5c127b52..25ebd792725b 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -40,7 +40,8 @@ struct thread_info {
 						*/
 	__u8			supervisor_stack[0];
 #endif
-	int			uaccess_err;
+	int			sig_on_uaccess_error:1;
+	int			uaccess_err:1;	/* uaccess failed */
 };
 
 #define INIT_THREAD_INFO(tsk)			\
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 36361bf6fdd1..8be5f54d9360 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -462,7 +462,7 @@ struct __large_struct { unsigned long buf[100]; };
 	barrier();
 
 #define uaccess_catch(err)						\
-	(err) |= current_thread_info()->uaccess_err;			\
+	(err) |= (current_thread_info()->uaccess_err ? -EFAULT : 0);	\
 	current_thread_info()->uaccess_err = prev_err;			\
 } while (0)
 
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index e4d4a22e8b94..8084beccd64e 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -140,11 +140,40 @@ static int addr_to_vsyscall_nr(unsigned long addr)
 	return nr;
 }
 
+static bool write_ok_or_segv(unsigned long ptr, size_t size)
+{
+	/*
+	 * XXX: if access_ok, get_user, and put_user handled
+	 * sig_on_uaccess_error, this could go away.
+	 */
+
+	if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) {
+		siginfo_t info;
+		struct thread_struct *thread = &current->thread;
+
+		thread->error_code	= 6;  /* user fault, no page, write */
+		thread->cr2		= ptr;
+		thread->trap_no		= 14;
+
+		memset(&info, 0, sizeof(info));
+		info.si_signo		= SIGSEGV;
+		info.si_errno		= 0;
+		info.si_code		= SEGV_MAPERR;
+		info.si_addr		= (void __user *)ptr;
+
+		force_sig_info(SIGSEGV, &info, current);
+		return false;
+	} else {
+		return true;
+	}
+}
+
 bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 {
 	struct task_struct *tsk;
 	unsigned long caller;
 	int vsyscall_nr;
+	int prev_sig_on_uaccess_error;
 	long ret;
 
 	/*
@@ -180,35 +209,65 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 	if (seccomp_mode(&tsk->seccomp))
 		do_exit(SIGKILL);
 
+	/*
+	 * With a real vsyscall, page faults cause SIGSEGV.  We want to
+	 * preserve that behavior to make writing exploits harder.
+	 */
+	prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error;
+	current_thread_info()->sig_on_uaccess_error = 1;
+
+	/*
+	 * 0 is a valid user pointer (in the access_ok sense) on 32-bit and
+	 * 64-bit, so we don't need to special-case it here.  For all the
+	 * vsyscalls, 0 means "don't write anything" not "write it at
+	 * address 0".
+	 */
+	ret = -EFAULT;
 	switch (vsyscall_nr) {
 	case 0:
+		if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) ||
+		    !write_ok_or_segv(regs->si, sizeof(struct timezone)))
+			break;
+
 		ret = sys_gettimeofday(
 			(struct timeval __user *)regs->di,
 			(struct timezone __user *)regs->si);
 		break;
 
 	case 1:
+		if (!write_ok_or_segv(regs->di, sizeof(time_t)))
+			break;
+
 		ret = sys_time((time_t __user *)regs->di);
 		break;
 
 	case 2:
+		if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
+		    !write_ok_or_segv(regs->si, sizeof(unsigned)))
+			break;
+
 		ret = sys_getcpu((unsigned __user *)regs->di,
 				 (unsigned __user *)regs->si,
 				 0);
 		break;
 	}
 
+	current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error;
+
 	if (ret == -EFAULT) {
-		/*
-		 * Bad news -- userspace fed a bad pointer to a vsyscall.
-		 *
-		 * With a real vsyscall, that would have caused SIGSEGV.
-		 * To make writing reliable exploits using the emulated
-		 * vsyscalls harder, generate SIGSEGV here as well.
-		 */
+		/* Bad news -- userspace fed a bad pointer to a vsyscall. */
 		warn_bad_vsyscall(KERN_INFO, regs,
 				  "vsyscall fault (exploit attempt?)");
-		goto sigsegv;
+
+		/*
+		 * If we failed to generate a signal for any reason,
+		 * generate one here.  (This should be impossible.)
+		 */
+		if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) &&
+				 !sigismember(&tsk->pending.signal, SIGSEGV)))
+			goto sigsegv;
+
+		return true;  /* Don't emulate the ret. */
 	}
 
 	regs->ax = ret;
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index d0474ad2a6e5..1fb85dbe390a 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -25,7 +25,7 @@ int fixup_exception(struct pt_regs *regs)
 	if (fixup) {
 		/* If fixup is less than 16, it means uaccess error */
 		if (fixup->fixup < 16) {
-			current_thread_info()->uaccess_err = -EFAULT;
+			current_thread_info()->uaccess_err = 1;
 			regs->ip += fixup->fixup;
 			return 1;
 		}
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 5db0490deb07..9d74824a708d 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -626,7 +626,7 @@ pgtable_bad(struct pt_regs *regs, unsigned long error_code,
 
 static noinline void
 no_context(struct pt_regs *regs, unsigned long error_code,
-	   unsigned long address)
+	   unsigned long address, int signal, int si_code)
 {
 	struct task_struct *tsk = current;
 	unsigned long *stackend;
@@ -634,8 +634,17 @@ no_context(struct pt_regs *regs, unsigned long error_code,
 	int sig;
 
 	/* Are we prepared to handle this kernel fault? */
-	if (fixup_exception(regs))
+	if (fixup_exception(regs)) {
+		if (current_thread_info()->sig_on_uaccess_error && signal) {
+			tsk->thread.trap_no = 14;
+			tsk->thread.error_code = error_code | PF_USER;
+			tsk->thread.cr2 = address;
+
+			/* XXX: hwpoison faults will set the wrong code. */
+			force_sig_info_fault(signal, si_code, address, tsk, 0);
+		}
 		return;
+	}
 
 	/*
 	 * 32-bit:
@@ -755,7 +764,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
 	if (is_f00f_bug(regs, address))
 		return;
 
-	no_context(regs, error_code, address);
+	no_context(regs, error_code, address, SIGSEGV, si_code);
 }
 
 static noinline void
@@ -819,7 +828,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
 
 	/* Kernel mode? Handle exceptions or die: */
 	if (!(error_code & PF_USER)) {
-		no_context(regs, error_code, address);
+		no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
 		return;
 	}
 
@@ -854,7 +863,7 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
 		if (!(fault & VM_FAULT_RETRY))
 			up_read(&current->mm->mmap_sem);
 		if (!(error_code & PF_USER))
-			no_context(regs, error_code, address);
+			no_context(regs, error_code, address, 0, 0);
 		return 1;
 	}
 	if (!(fault & VM_FAULT_ERROR))
@@ -864,7 +873,8 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
 		/* Kernel mode? Handle exceptions or die: */
 		if (!(error_code & PF_USER)) {
 			up_read(&current->mm->mmap_sem);
-			no_context(regs, error_code, address);
+			no_context(regs, error_code, address,
+				   SIGSEGV, SEGV_MAPERR);
 			return 1;
 		}
 
-- 
cgit 


From 2e57ae0515124af45dd889bfbd4840fd40fcc07d Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@amacapital.net>
Date: Mon, 7 Nov 2011 16:33:41 -0800
Subject: x86: Default to vsyscall=emulate

This essentially reverts:

  2b666859ec32: x86: Default to vsyscall=native for now

The ABI breakage should now be fixed by:

 commit 48c4206f5b02f28c4c78a1f5b491d3772fb64fb9
 Author: Andy Lutomirski <luto@mit.edu>
 Date:   Thu Oct 20 08:48:19 2011 -0700

    x86-64: Set siginfo and context on vsyscall emulation faults

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Cc: richard -rw- weinberger <richard.weinberger@gmail.com>
Cc: Adrian Bunk <bunk@stusta.de>
Cc: H. Peter Anvin <hpa@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/93154af3b2b6d208906ae02d80d92cf60c6fa94f.1320712291.git.luto@amacapital.net
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/vsyscall_64.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 8084beccd64e..b07ba9393564 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -57,7 +57,7 @@ DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) =
 	.lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock),
 };
 
-static enum { EMULATE, NATIVE, NONE } vsyscall_mode = NATIVE;
+static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE;
 
 static int __init vsyscall_setup(char *str)
 {
-- 
cgit 


From e9a9eca517d4cd94e816538efc400257e34bc63e Mon Sep 17 00:00:00 2001
From: Maurice Ma <maurice.ma@intel.com>
Date: Tue, 11 Oct 2011 11:52:13 +0100
Subject: x86, efi: Convert efi_phys_get_time() args to physical addresses

Because callers of efi_phys_get_time() pass virtual stack
addresses as arguments, we need to find their corresponding
physical addresses and when calling GetTime() in physical mode.

Without this patch the following line is printed on boot,

	"Oops: efitime: can't read time!"

Signed-off-by: Maurice Ma <maurice.ma@intel.com>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
Cc: Zhang Rui <rui.zhang@intel.com>
Cc: Matthew Garrett <mjg@redhat.com>
Link: http://lkml.kernel.org/r/1318330333-4617-1-git-send-email-matt@console-pimps.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/platform/efi/efi.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 37718f0f053d..d2376eb7231a 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -238,7 +238,8 @@ static efi_status_t __init phys_efi_get_time(efi_time_t *tm,
 
 	spin_lock_irqsave(&rtc_lock, flags);
 	efi_call_phys_prelog();
-	status = efi_call_phys2(efi_phys.get_time, tm, tc);
+	status = efi_call_phys2(efi_phys.get_time, virt_to_phys(tm),
+				virt_to_phys(tc));
 	efi_call_phys_epilog();
 	spin_unlock_irqrestore(&rtc_lock, flags);
 	return status;
-- 
cgit 


From 70ea6855d368588a7f1b0242ab83ca6fe2e2ff16 Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@suse.com>
Date: Tue, 29 Nov 2011 10:54:22 +0000
Subject: x86-64: Slightly shorten int_ret_from_sys_call

Testing for a return to ring 0 was necessary here solely because
of the branch out of ret_from_fork. That branch, however, can be
directed to retint_restore_args, and thus the test-and-branch
can be eliminated here.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andi Kleen <ak@linux.intel.com>
Link: http://lkml.kernel.org/r/4ED4C7EE0200007800064028@nat28.tlf.novell.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/entry_64.S | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index faf8d5e74b0b..ab4b7ffd526d 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -411,7 +411,7 @@ ENTRY(ret_from_fork)
 	RESTORE_REST
 
 	testl $3, CS-ARGOFFSET(%rsp)		# from kernel_thread?
-	je   int_ret_from_sys_call
+	jz   retint_restore_args
 
 	testl $_TIF_IA32, TI_flags(%rcx)	# 32-bit compat task needs IRET
 	jnz  int_ret_from_sys_call
@@ -612,8 +612,6 @@ tracesys:
 GLOBAL(int_ret_from_sys_call)
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
-	testl $3,CS-ARGOFFSET(%rsp)
-	je retint_restore_args
 	movl $_TIF_ALLWORK_MASK,%edi
 	/* edi:	mask to check */
 GLOBAL(int_with_check)
-- 
cgit 


From 39e9543344fa3179e346d2b381c6e0cd17b516de Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@suse.com>
Date: Tue, 29 Nov 2011 11:03:46 +0000
Subject: x86-64: Reduce amount of redundant code generated for
 invalidate_interruptNN

Previously these up to 32 entry points, consisting of all the
same code except for their very first instruction, consumed 0x70
bytes per instance. Just like for device interrupt entry points,
fold them together so that they all use a single instance of the
code after having pushed their vector indicator (resulting in
0x10 bytes per instance, to retain 16-byte alignment of the
individual entry points).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andi Kleen <ak@linux.intel.com>
Link: http://lkml.kernel.org/r/4ED4CA230200007800064065@nat28.tlf.novell.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/entry_64.S | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index ab4b7ffd526d..1581f1990187 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -951,6 +951,7 @@ END(common_interrupt)
 ENTRY(\sym)
 	INTR_FRAME
 	pushq_cfi $~(\num)
+.Lcommon_\sym:
 	interrupt \do_sym
 	jmp ret_from_intr
 	CFI_ENDPROC
@@ -974,13 +975,21 @@ apicinterrupt X86_PLATFORM_IPI_VECTOR \
 	x86_platform_ipi smp_x86_platform_ipi
 
 #ifdef CONFIG_SMP
-.irp idx,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
+	ALIGN
+	INTR_FRAME
+.irp idx,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
 	16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
 .if NUM_INVALIDATE_TLB_VECTORS > \idx
-apicinterrupt (INVALIDATE_TLB_VECTOR_START)+\idx \
-	invalidate_interrupt\idx smp_invalidate_interrupt
+ENTRY(invalidate_interrupt\idx)
+	pushq_cfi $~(INVALIDATE_TLB_VECTOR_START+\idx)
+	jmp .Lcommon_invalidate_interrupt0
+	CFI_ADJUST_CFA_OFFSET -8
+END(invalidate_interrupt\idx)
 .endif
 .endr
+	CFI_ENDPROC
+apicinterrupt INVALIDATE_TLB_VECTOR_START, \
+	invalidate_interrupt0, smp_invalidate_interrupt
 #endif
 
 apicinterrupt THRESHOLD_APIC_VECTOR \
-- 
cgit 


From 46db09d3fd847f185a7d23a96bc8fe7a4be0cd05 Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@suse.com>
Date: Tue, 29 Nov 2011 11:17:45 +0000
Subject: x86-64: Slightly shorten line system call entry and exit paths

GET_THREAD_INFO() involves a memory read immediately followed by
an "sub" on the value read, in turn (in several cases)
immediately followed by a use of the calculated value as the
base address of a memory access. This combination of
instructions has a non-negligible potential for stalls.

In the system call entry point code, however, the (fixed) offset
of the stack pointer from the end of the stack is generally
known, and hence we can instead avoid the memory load and
subtract, and instead do the memory reference using %rsp as the
base register. To do so in a legible fashion, introduce a
THREAD_INFO() macro which, provided a register (generally %rsp)
and the known offset from the end of the stack, produces a
suitable memory access operand.

The patch attempts to only touch the fast paths (no auditing and
alike), but manages to do so only in the 64-bit entry point
case; the compatibility mode entry points have so many
interdependencies between their various branch targets that it
was necessary to also adjust the slow paths to eliminate the
risk of having missed some register dependency during code
analysis.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andi Kleen <ak@linux.intel.com>
Link: http://lkml.kernel.org/r/4ED4CD690200007800064075@nat28.tlf.novell.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/ia32/ia32entry.S          | 36 +++++++++++++++---------------------
 arch/x86/include/asm/thread_info.h |  6 ++++++
 arch/x86/kernel/entry_64.S         |  8 +++-----
 3 files changed, 24 insertions(+), 26 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index a6253ec1b284..0d5c279f3732 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -134,7 +134,7 @@ ENTRY(ia32_sysenter_target)
 	CFI_REL_OFFSET rsp,0
 	pushfq_cfi
 	/*CFI_REL_OFFSET rflags,0*/
-	movl	8*3-THREAD_SIZE+TI_sysenter_return(%rsp), %r10d
+	movl	TI_sysenter_return+THREAD_INFO(%rsp,3*8-KERNEL_STACK_OFFSET),%r10d
 	CFI_REGISTER rip,r10
 	pushq_cfi $__USER32_CS
 	/*CFI_REL_OFFSET cs,0*/
@@ -150,9 +150,8 @@ ENTRY(ia32_sysenter_target)
  	.section __ex_table,"a"
  	.quad 1b,ia32_badarg
  	.previous	
-	GET_THREAD_INFO(%r10)
-	orl    $TS_COMPAT,TI_status(%r10)
-	testl  $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
+	orl     $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	testl   $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
 	CFI_REMEMBER_STATE
 	jnz  sysenter_tracesys
 	cmpq	$(IA32_NR_syscalls-1),%rax
@@ -162,13 +161,12 @@ sysenter_do_call:
 sysenter_dispatch:
 	call	*ia32_sys_call_table(,%rax,8)
 	movq	%rax,RAX-ARGOFFSET(%rsp)
-	GET_THREAD_INFO(%r10)
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
-	testl	$_TIF_ALLWORK_MASK,TI_flags(%r10)
+	testl	$_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
 	jnz	sysexit_audit
 sysexit_from_sys_call:
-	andl    $~TS_COMPAT,TI_status(%r10)
+	andl    $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
 	/* clear IF, that popfq doesn't enable interrupts early */
 	andl  $~0x200,EFLAGS-R11(%rsp) 
 	movl	RIP-R11(%rsp),%edx		/* User %eip */
@@ -205,7 +203,7 @@ sysexit_from_sys_call:
 	.endm
 
 	.macro auditsys_exit exit
-	testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
+	testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
 	jnz ia32_ret_from_sys_call
 	TRACE_IRQS_ON
 	sti
@@ -215,12 +213,11 @@ sysexit_from_sys_call:
 	movzbl %al,%edi		/* zero-extend that into %edi */
 	inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
 	call audit_syscall_exit
-	GET_THREAD_INFO(%r10)
 	movl RAX-ARGOFFSET(%rsp),%eax	/* reload syscall return value */
 	movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
 	cli
 	TRACE_IRQS_OFF
-	testl %edi,TI_flags(%r10)
+	testl %edi,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
 	jz \exit
 	CLEAR_RREGS -ARGOFFSET
 	jmp int_with_check
@@ -238,7 +235,7 @@ sysexit_audit:
 
 sysenter_tracesys:
 #ifdef CONFIG_AUDITSYSCALL
-	testl	$(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
+	testl	$(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
 	jz	sysenter_auditsys
 #endif
 	SAVE_REST
@@ -309,9 +306,8 @@ ENTRY(ia32_cstar_target)
 	.section __ex_table,"a"
 	.quad 1b,ia32_badarg
 	.previous	
-	GET_THREAD_INFO(%r10)
-	orl   $TS_COMPAT,TI_status(%r10)
-	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
+	orl     $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	testl   $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
 	CFI_REMEMBER_STATE
 	jnz   cstar_tracesys
 	cmpq $IA32_NR_syscalls-1,%rax
@@ -321,13 +317,12 @@ cstar_do_call:
 cstar_dispatch:
 	call *ia32_sys_call_table(,%rax,8)
 	movq %rax,RAX-ARGOFFSET(%rsp)
-	GET_THREAD_INFO(%r10)
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
-	testl $_TIF_ALLWORK_MASK,TI_flags(%r10)
+	testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
 	jnz sysretl_audit
 sysretl_from_sys_call:
-	andl $~TS_COMPAT,TI_status(%r10)
+	andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
 	RESTORE_ARGS 0,-ARG_SKIP,0,0,0
 	movl RIP-ARGOFFSET(%rsp),%ecx
 	CFI_REGISTER rip,rcx
@@ -355,7 +350,7 @@ sysretl_audit:
 
 cstar_tracesys:
 #ifdef CONFIG_AUDITSYSCALL
-	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
+	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
 	jz cstar_auditsys
 #endif
 	xchgl %r9d,%ebp
@@ -420,9 +415,8 @@ ENTRY(ia32_syscall)
 	/* note the registers are not zero extended to the sf.
 	   this could be a problem. */
 	SAVE_ARGS 0,1,0
-	GET_THREAD_INFO(%r10)
-	orl   $TS_COMPAT,TI_status(%r10)
-	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
+	orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
 	jnz ia32_tracesys
 	cmpq $(IA32_NR_syscalls-1),%rax
 	ja ia32_badsys
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 25ebd792725b..185b719ec61a 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -232,6 +232,12 @@ static inline struct thread_info *current_thread_info(void)
 	movq PER_CPU_VAR(kernel_stack),reg ; \
 	subq $(THREAD_SIZE-KERNEL_STACK_OFFSET),reg
 
+/*
+ * Same if PER_CPU_VAR(kernel_stack) is, perhaps with some offset, already in
+ * a certain register (to be used in assembler memory operands).
+ */
+#define THREAD_INFO(reg, off) KERNEL_STACK_OFFSET+(off)-THREAD_SIZE(reg)
+
 #endif
 
 #endif /* !X86_32 */
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 1581f1990187..75f72a50cf26 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -478,8 +478,7 @@ ENTRY(system_call_after_swapgs)
 	movq  %rax,ORIG_RAX-ARGOFFSET(%rsp)
 	movq  %rcx,RIP-ARGOFFSET(%rsp)
 	CFI_REL_OFFSET rip,RIP-ARGOFFSET
-	GET_THREAD_INFO(%rcx)
-	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
+	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
 	jnz tracesys
 system_call_fastpath:
 	cmpq $__NR_syscall_max,%rax
@@ -496,10 +495,9 @@ ret_from_sys_call:
 	/* edi:	flagmask */
 sysret_check:
 	LOCKDEP_SYS_EXIT
-	GET_THREAD_INFO(%rcx)
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
-	movl TI_flags(%rcx),%edx
+	movl TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET),%edx
 	andl %edi,%edx
 	jnz  sysret_careful
 	CFI_REMEMBER_STATE
@@ -583,7 +581,7 @@ sysret_audit:
 	/* Do syscall tracing */
 tracesys:
 #ifdef CONFIG_AUDITSYSCALL
-	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
+	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
 	jz auditsys
 #endif
 	SAVE_REST
-- 
cgit 


From f6b2bc847641ea38e2655c8424fef5d2d19f35f9 Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@suse.com>
Date: Tue, 29 Nov 2011 11:24:10 +0000
Subject: x86-64: Cleanup some assembly entry points

system_call_after_swapgs doesn't really benefit from forcing
alignment from it - quite the opposite, native code needlessly
so far got a big NOP instruction inserted in front of it. Xen
being the only user of the separate entry point can well live
with the branch going to three bytes into a cache line.

The compatibility mode ptregs entry points for one can make use
of the GLOBAL() macro, and should be suitably aligned. Their
shared continuation point (ia32_ptregs_common) otoh doesn't need
to be global at all, but should continue to be properly aligned.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andi Kleen <ak@linux.intel.com>
Link: http://lkml.kernel.org/r/4ED4CEEA020000780006407D@nat28.tlf.novell.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/ia32/ia32entry.S  | 7 ++++---
 arch/x86/kernel/entry_64.S | 2 +-
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 0d5c279f3732..3e274564f6bf 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -453,8 +453,8 @@ quiet_ni_syscall:
 	CFI_ENDPROC
 	
 	.macro PTREGSCALL label, func, arg
-	.globl \label
-\label:
+	ALIGN
+GLOBAL(\label)
 	leaq \func(%rip),%rax
 	leaq -ARGOFFSET+8(%rsp),\arg	/* 8 for return address */
 	jmp  ia32_ptregs_common	
@@ -471,7 +471,8 @@ quiet_ni_syscall:
 	PTREGSCALL stub32_vfork, sys_vfork, %rdi
 	PTREGSCALL stub32_iopl, sys_iopl, %rsi
 
-ENTRY(ia32_ptregs_common)
+	ALIGN
+ia32_ptregs_common:
 	popq %r11
 	CFI_ENDPROC
 	CFI_STARTPROC32	simple
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 75f72a50cf26..cfad7fce6163 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -465,7 +465,7 @@ ENTRY(system_call)
 	 * after the swapgs, so that it can do the swapgs
 	 * for the guest and jump here on syscall.
 	 */
-ENTRY(system_call_after_swapgs)
+GLOBAL(system_call_after_swapgs)
 
 	movq	%rsp,PER_CPU_VAR(old_rsp)
 	movq	PER_CPU_VAR(kernel_stack),%rsp
-- 
cgit 


From 668b44846606185a9bed5a5357bc2cb132e00dd7 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 30 Nov 2011 11:43:34 +0100
Subject: x86/div64: Add a micro-optimization shortcut if base is power of two

In the target code I have a do_div(x, PAGE_SIZE). The x86-64
version of it was doing a shift and a mask which is clever. The
32bit version of it had a div operation in it which made me
think. After digging I noticed that x86 has an optimized version
of it. This patch adds this shift and mask optimization if base
is constant so we don't have any runtime "checking" overhead
since most users use a power of ten.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/1322649814-544-1-git-send-email-bigeasy@linutronix.de
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/div64.h | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h
index 9a2d644c08ef..ced283ac79df 100644
--- a/arch/x86/include/asm/div64.h
+++ b/arch/x86/include/asm/div64.h
@@ -4,6 +4,7 @@
 #ifdef CONFIG_X86_32
 
 #include <linux/types.h>
+#include <linux/log2.h>
 
 /*
  * do_div() is NOT a C function. It wants to return
@@ -21,15 +22,20 @@
 ({								\
 	unsigned long __upper, __low, __high, __mod, __base;	\
 	__base = (base);					\
-	asm("":"=a" (__low), "=d" (__high) : "A" (n));		\
-	__upper = __high;					\
-	if (__high) {						\
-		__upper = __high % (__base);			\
-		__high = __high / (__base);			\
+	if (__builtin_constant_p(__base) && is_power_of_2(__base)) { \
+		__mod = n & (__base - 1);			\
+		n >>= ilog2(__base);				\
+	} else {						\
+		asm("" : "=a" (__low), "=d" (__high) : "A" (n));\
+		__upper = __high;				\
+		if (__high) {					\
+			__upper = __high % (__base);		\
+			__high = __high / (__base);		\
+		}						\
+		asm("divl %2" : "=a" (__low), "=d" (__mod)	\
+			: "rm" (__base), "0" (__low), "1" (__upper));	\
+		asm("" : "=A" (n) : "a" (__low), "d" (__high));	\
 	}							\
-	asm("divl %2":"=a" (__low), "=d" (__mod)		\
-	    : "rm" (__base), "0" (__low), "1" (__upper));	\
-	asm("":"=A" (n) : "a" (__low), "d" (__high));		\
 	__mod;							\
 })
 
-- 
cgit 


From 3596ff4e6b2aff8a28c69af389d5046090a53330 Mon Sep 17 00:00:00 2001
From: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Date: Tue, 25 Oct 2011 19:48:12 +0530
Subject: x86: Call do_notify_resume() with interrupts enabled

do_notify_resume() gets called with interrupts disabled on x86_32. This
is different from the x86_64 behavior, where interrupts are enabled at
the time.

Queries on lkml on this issue hasn't yielded any clear answer. Lets make
x86_32 behave the same as x86_64, unless there is a real reason to
maintain status quo.

Please refer https://lkml.org/lkml/2011/9/27/130 for more
details.

A similar change was suggested in ARM:

	https://lkml.org/lkml/2011/8/25/231

My 32-bit machine works fine (tm) with this patch.

Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Acked-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/20111025141812.GA21225@linux.vnet.ibm.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/entry_32.S | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index f3f6f5344001..22d0e21b4dd7 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -625,6 +625,8 @@ work_notifysig:				# deal with pending signals and
 	movl %esp, %eax
 	jne work_notifysig_v86		# returning to kernel-space or
 					# vm86-space
+	TRACE_IRQS_ON
+	ENABLE_INTERRUPTS(CLBR_NONE)
 	xorl %edx, %edx
 	call do_notify_resume
 	jmp resume_userspace_sig
@@ -638,6 +640,8 @@ work_notifysig_v86:
 #else
 	movl %esp, %eax
 #endif
+	TRACE_IRQS_ON
+	ENABLE_INTERRUPTS(CLBR_NONE)
 	xorl %edx, %edx
 	call do_notify_resume
 	jmp resume_userspace_sig
-- 
cgit 


From cc3a1bf52a9d2808c7cd6e8f413b02b650b6b84b Mon Sep 17 00:00:00 2001
From: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Date: Tue, 25 Oct 2011 19:51:59 +0530
Subject: x86: Clean up and extend do_int3()

Since there is a possibility of !KPROBES int3 listeners
(such as kgdb) and since DIE_TRAP is currently not being
used by anybody, notify all listeners with DIE_INT3.

Signed-off-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/20111025142159.GB21225@linux.vnet.ibm.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/traps.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a8e3eb83466c..fa1191fb679d 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -306,15 +306,10 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
 			== NOTIFY_STOP)
 		return;
 #endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */
-#ifdef CONFIG_KPROBES
+
 	if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
 			== NOTIFY_STOP)
 		return;
-#else
-	if (notify_die(DIE_TRAP, "int3", regs, error_code, 3, SIGTRAP)
-			== NOTIFY_STOP)
-		return;
-#endif
 
 	preempt_conditional_sti(regs);
 	do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
-- 
cgit 


From 1cf8343f55525c09c88da0a494a96e1b034f84e2 Mon Sep 17 00:00:00 2001
From: Seiichi Ikarashi <s.ikarashi@jp.fujitsu.com>
Date: Tue, 6 Dec 2011 17:58:14 +0900
Subject: x86: Fix rflags in FAKE_STACK_FRAME

The x86_64 kernel pushes the fake kernel stack in
arch/x86/kernel/entry_64.S:FAKE_STACK_FRAME, and
rflags register in it does not conform to the specification.

Although Intel's manual[1] says bit 1 of it shall be set to 1,
this bit is cleared to 0 on pushing the fake stack.

[1] Intel(R) 64 and IA-32 Architectures Software Developer's Manual
    Vol.1 3-21 Figure 3-8. EFLAGS Register

If it is not on purpose, it is better to be fixed, because
it can lead some tools misunderstanding the stack frame. For example,
"crash" utility[2] actually detects it and warns you like
below:

       RIP: ffffffff8005dfa2  RSP: ffff8104ce0c7f58  RFLAGS: 00000200
       [...]

       bt: WARNING: possibly bogus exception frame

Signed-off-by: Seiichi Ikarashi <s.ikarashi@jp.fujitsu.com>
Tested-by: Masayoshi MIZUMA <m.mizuma@jp.fujitsu.com>
Cc: Jan Beulich <JBeulich@suse.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/processor-flags.h | 1 +
 arch/x86/kernel/entry_64.S             | 2 +-
 arch/x86/kernel/process.c              | 2 +-
 3 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index 2dddb317bb39..f8ab3eaad128 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -6,6 +6,7 @@
  * EFLAGS bits
  */
 #define X86_EFLAGS_CF	0x00000001 /* Carry Flag */
+#define X86_EFLAGS_BIT1	0x00000002 /* Bit 1 - always on */
 #define X86_EFLAGS_PF	0x00000004 /* Parity Flag */
 #define X86_EFLAGS_AF	0x00000010 /* Auxiliary carry Flag */
 #define X86_EFLAGS_ZF	0x00000040 /* Zero Flag */
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index cfad7fce6163..a20e1cb9dc87 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -221,7 +221,7 @@ ENDPROC(native_usergs_sysret64)
 	/*CFI_REL_OFFSET	ss,0*/
 	pushq_cfi %rax /* rsp */
 	CFI_REL_OFFSET	rsp,0
-	pushq_cfi $X86_EFLAGS_IF /* eflags - interrupts on */
+	pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_BIT1) /* eflags - interrupts on */
 	/*CFI_REL_OFFSET	rflags,0*/
 	pushq_cfi $__KERNEL_CS /* cs */
 	/*CFI_REL_OFFSET	cs,0*/
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index ee5d4fbd53b4..15763af7bfe3 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -293,7 +293,7 @@ int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
 	regs.orig_ax = -1;
 	regs.ip = (unsigned long) kernel_thread_helper;
 	regs.cs = __KERNEL_CS | get_kernel_rpl();
-	regs.flags = X86_EFLAGS_IF | 0x2;
+	regs.flags = X86_EFLAGS_IF | X86_EFLAGS_BIT1;
 
 	/* Ok, create the new process.. */
 	return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
-- 
cgit 


From 79f1ddd06471b094ae30eb17b33beb9f1234ca93 Mon Sep 17 00:00:00 2001
From: H Hartley Sweeten <hartleys@visionengravers.com>
Date: Tue, 6 Dec 2011 12:22:03 -0800
Subject: x86: Use the same node_distance for 32 and 64-bit

The node_distance function is not x86 64-bit specific.  Having
the #ifdef around the extern function declaration and the
 #define causes the default node_distance macro to be used in
asm-generic/topology.h. This also causes a sparse warning in
arch/x86/mm/numa.c when CONFIG_X86_64 is not set:

warning: symbol '__node_distance' was not declared. Should it be
static?

Remove the #ifdef to fix both issues.

Signed-off-by: H Hartley Sweeten <hsweeten@visionengravers.com>
Signed-off-by: David Rientjes <rientjes@google.com>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Link: http://lkml.kernel.org/r/alpine.DEB.2.00.1112061220310.28251@chino.kir.corp.google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/topology.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index c00692476e9f..800f77c60051 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -130,10 +130,8 @@ extern void setup_node_to_cpumask_map(void);
 	.balance_interval	= 1,					\
 }
 
-#ifdef CONFIG_X86_64
 extern int __node_distance(int, int);
 #define node_distance(a, b) __node_distance(a, b)
-#endif
 
 #else /* !CONFIG_NUMA */
 
-- 
cgit 


From 890890cb8e415e1e7a61bfe3c8e246f710196824 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sun, 11 Dec 2011 21:13:19 +0300
Subject: x86/i386: Use less assembly in strlen(), speed things up a bit

Current i386 strlen() hardcodes NOT/DEC sequence. DEC is
mentioned to be suboptimal on Core2. So, put only REPNE SCASB
sequence in assembly, compiler can do the rest.

The difference in generated code is like below (MCORE2=y):

	<strlen>:
		push   %edi
		mov    $0xffffffff,%ecx
		mov    %eax,%edi
		xor    %eax,%eax
		repnz scas %es:(%edi),%al
		not    %ecx

	-	dec    %ecx
	-	mov    %ecx,%eax
	+	lea    -0x1(%ecx),%eax

		pop    %edi
		ret

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Jan Beulich <JBeulich@suse.com>
Link: http://lkml.kernel.org/r/20111211181319.GA17097@p183.telecom.by
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/lib/string_32.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/lib/string_32.c b/arch/x86/lib/string_32.c
index 82004d2bf05e..bd59090825db 100644
--- a/arch/x86/lib/string_32.c
+++ b/arch/x86/lib/string_32.c
@@ -164,15 +164,13 @@ EXPORT_SYMBOL(strchr);
 size_t strlen(const char *s)
 {
 	int d0;
-	int res;
+	size_t res;
 	asm volatile("repne\n\t"
-		"scasb\n\t"
-		"notl %0\n\t"
-		"decl %0"
+		"scasb"
 		: "=c" (res), "=&D" (d0)
 		: "1" (s), "a" (0), "0" (0xffffffffu)
 		: "memory");
-	return res;
+	return ~res - 1;
 }
 EXPORT_SYMBOL(strlen);
 #endif
-- 
cgit 


From 969df4b82904a30fef19a67398a0c854d223ea67 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 14 Dec 2011 16:12:54 +0100
Subject: x86: Report cpb and eff_freq_ro flags correctly

Add the flags to get rid of the [9] and [10] feature names
in cpuinfo's 'power management' fields and replace them with
meaningful names.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Link: http://lkml.kernel.org/r/1323875574-17881-1-git-send-email-joerg.roedel@amd.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/powerflags.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/powerflags.c b/arch/x86/kernel/cpu/powerflags.c
index 5abbea297e0c..7b3fe56b1c21 100644
--- a/arch/x86/kernel/cpu/powerflags.c
+++ b/arch/x86/kernel/cpu/powerflags.c
@@ -16,5 +16,6 @@ const char *const x86_power_flags[32] = {
 	"100mhzsteps",
 	"hwpstate",
 	"",	/* tsc invariant mapped to constant_tsc */
-		/* nothing */
+	"cpb",  /* core performance boost */
+	"eff_freq_ro", /* Readonly aperf/mperf */
 };
-- 
cgit 


From cebef5beed3de3037de85a521495897256b2c5da Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@suse.com>
Date: Wed, 14 Dec 2011 08:33:25 +0000
Subject: x86: Fix and improve percpu_cmpxchg{8,16}b_double()

They had several problems/shortcomings:

Only the first memory operand was mentioned in the 2x32bit asm()
operands, and 2x64-bit version had a memory clobber. The first
allowed the compiler to not recognize the need to re-load the
data in case it had it cached in some register, and the second
was overly destructive.

The memory operand in the 2x32-bit asm() was declared to only be
an output.

The types of the local copies of the old and new values were
incorrect (as in other per-CPU ops, the types of the per-CPU
variables accessed should be used here, to make sure the
respective types are compatible).

The __dummy variable was pointless (and needlessly initialized
in the 2x32-bit case), given that local copies of the inputs
already exist.

The 2x64-bit variant forced the address of the first object into
%rsi, even though this is needed only for the call to the
emulation function. The real cmpxchg16b can operate on an
memory.

At once also change the return value type to what it really is -
'bool'.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/4EE86D6502000078000679FE@nat28.tlf.novell.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/percpu.h | 53 +++++++++++++++++--------------------------
 1 file changed, 21 insertions(+), 32 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 3470c9d0ebba..529bf07e8067 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -451,23 +451,20 @@ do {									\
 #endif /* !CONFIG_M386 */
 
 #ifdef CONFIG_X86_CMPXCHG64
-#define percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2)			\
+#define percpu_cmpxchg8b_double(pcp1, pcp2, o1, o2, n1, n2)		\
 ({									\
-	char __ret;							\
-	typeof(o1) __o1 = o1;						\
-	typeof(o1) __n1 = n1;						\
-	typeof(o2) __o2 = o2;						\
-	typeof(o2) __n2 = n2;						\
-	typeof(o2) __dummy = n2;					\
+	bool __ret;							\
+	typeof(pcp1) __o1 = (o1), __n1 = (n1);				\
+	typeof(pcp2) __o2 = (o2), __n2 = (n2);				\
 	asm volatile("cmpxchg8b "__percpu_arg(1)"\n\tsetz %0\n\t"	\
-		    : "=a"(__ret), "=m" (pcp1), "=d"(__dummy)		\
-		    :  "b"(__n1), "c"(__n2), "a"(__o1), "d"(__o2));	\
+		    : "=a" (__ret), "+m" (pcp1), "+m" (pcp2), "+d" (__o2) \
+		    :  "b" (__n1), "c" (__n2), "a" (__o1));		\
 	__ret;								\
 })
 
-#define __this_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2)		percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2)
-#define this_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2)		percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2)
-#define irqsafe_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2)	percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2)
+#define __this_cpu_cmpxchg_double_4	percpu_cmpxchg8b_double
+#define this_cpu_cmpxchg_double_4	percpu_cmpxchg8b_double
+#define irqsafe_cpu_cmpxchg_double_4	percpu_cmpxchg8b_double
 #endif /* CONFIG_X86_CMPXCHG64 */
 
 /*
@@ -508,31 +505,23 @@ do {									\
  * it in software.  The address used in the cmpxchg16 instruction must be
  * aligned to a 16 byte boundary.
  */
-#ifdef CONFIG_SMP
-#define CMPXCHG16B_EMU_CALL "call this_cpu_cmpxchg16b_emu\n\t" ASM_NOP3
-#else
-#define CMPXCHG16B_EMU_CALL "call this_cpu_cmpxchg16b_emu\n\t" ASM_NOP2
-#endif
-#define percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2)			\
+#define percpu_cmpxchg16b_double(pcp1, pcp2, o1, o2, n1, n2)		\
 ({									\
-	char __ret;							\
-	typeof(o1) __o1 = o1;						\
-	typeof(o1) __n1 = n1;						\
-	typeof(o2) __o2 = o2;						\
-	typeof(o2) __n2 = n2;						\
-	typeof(o2) __dummy;						\
-	alternative_io(CMPXCHG16B_EMU_CALL,				\
-		       "cmpxchg16b " __percpu_prefix "(%%rsi)\n\tsetz %0\n\t",	\
+	bool __ret;							\
+	typeof(pcp1) __o1 = (o1), __n1 = (n1);				\
+	typeof(pcp2) __o2 = (o2), __n2 = (n2);				\
+	alternative_io("leaq %P1,%%rsi\n\tcall this_cpu_cmpxchg16b_emu\n\t", \
+		       "cmpxchg16b " __percpu_arg(1) "\n\tsetz %0\n\t",	\
 		       X86_FEATURE_CX16,				\
-		       ASM_OUTPUT2("=a"(__ret), "=d"(__dummy)),		\
-		       "S" (&pcp1), "b"(__n1), "c"(__n2),		\
-		       "a"(__o1), "d"(__o2) : "memory");		\
+		       ASM_OUTPUT2("=a" (__ret), "+m" (pcp1),		\
+				   "+m" (pcp2), "+d" (__o2)),		\
+		       "b" (__n1), "c" (__n2), "a" (__o1) : "rsi");	\
 	__ret;								\
 })
 
-#define __this_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2)		percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2)
-#define this_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2)		percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2)
-#define irqsafe_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2)	percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2)
+#define __this_cpu_cmpxchg_double_8	percpu_cmpxchg16b_double
+#define this_cpu_cmpxchg_double_8	percpu_cmpxchg16b_double
+#define irqsafe_cpu_cmpxchg_double_8	percpu_cmpxchg16b_double
 
 #endif
 
-- 
cgit 


From 83d99df7c4bf37176d8c7b199e3b129a51fa04c8 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Thu, 15 Dec 2011 14:55:53 -0800
Subject: x86, bitops: Move fls64.h inside __KERNEL__

We would include <asm-generic/bitops/fls64.h> even without __KERNEL__,
but that doesn't make sense, as:

1. That file provides fls64(), but the corresponding function fls() is
   not exported to user space.
2. The implementation of fls64.h uses kernel-only symbols.
3. fls64.h is not exported to user space.

This appears to have been a bug introduced in checkin:

d57594c203b1 bitops: use __fls for fls64 on 64-bit archs

Cc: Stephen Hemminger <shemminger@vyatta.com>
Cc: Alexander van Heukelum <heukelum@mailshack.com>
Cc: David Howells <dhowells@redhat.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Link: http://lkml.kernel.org/r/4EEA77E1.6050009@zytor.com
---
 arch/x86/include/asm/bitops.h | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 1775d6e5920e..4a6235b053cb 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -380,6 +380,8 @@ static inline unsigned long __fls(unsigned long word)
 	return word;
 }
 
+#undef ADDR
+
 #ifdef __KERNEL__
 /**
  * ffs - find first set bit in word
@@ -434,11 +436,6 @@ static inline int fls(int x)
 #endif
 	return r + 1;
 }
-#endif /* __KERNEL__ */
-
-#undef ADDR
-
-#ifdef __KERNEL__
 
 #include <asm-generic/bitops/find.h>
 
@@ -450,12 +447,8 @@ static inline int fls(int x)
 
 #include <asm-generic/bitops/const_hweight.h>
 
-#endif /* __KERNEL__ */
-
 #include <asm-generic/bitops/fls64.h>
 
-#ifdef __KERNEL__
-
 #include <asm-generic/bitops/le.h>
 
 #include <asm-generic/bitops/ext2-atomic-setbit.h>
-- 
cgit 


From ca3d30cc02f780f68771087040ce935add6ba2b7 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 13 Dec 2011 14:56:54 +0000
Subject: x86_64, asm: Optimise fls(), ffs() and fls64()

fls(N), ffs(N) and fls64(N) can be optimised on x86_64.  Currently they use a
CMOV instruction after the BSR/BSF to set the destination register to -1 if the
value to be scanned was 0 (in which case BSR/BSF set the Z flag).

Instead, according to the AMD64 specification, we can make use of the fact that
BSR/BSF doesn't modify its output register if its input is 0.  By preloading
the output with -1 and incrementing the result, we achieve the desired result
without the need for a conditional check.

The Intel x86_64 specification, however, says that the result of BSR/BSF in
such a case is undefined.  That said, when queried, one of the Intel CPU
architects said that the behaviour on all Intel CPUs is that:

 (1) with BSRQ/BSFQ, the 64-bit destination register is written with its
     original value if the source is 0, thus, in essence, giving the effect we
     want.  And,

 (2) with BSRL/BSFL, the lower half of the 64-bit destination register is
     written with its original value if the source is 0, and the upper half is
     cleared, thus giving us the effect we want (we return a 4-byte int).

Further, it was indicated that they (Intel) are unlikely to get away with
changing the behaviour.

It might be possible to optimise the 32-bit versions of these functions, but
there's a lot more variation, and so the effective non-destructive property of
BSRL/BSRF cannot be relied on.

[ hpa: specifically, some 486 chips are known to NOT have this property. ]

I have benchmarked these functions on my Core2 Duo test machine using the
following program:

	#include <stdlib.h>
	#include <stdio.h>

	#ifndef __x86_64__
	#error
	#endif

	#define PAGE_SHIFT 12

	typedef unsigned long long __u64, u64;
	typedef unsigned int __u32, u32;
	#define noinline	__attribute__((noinline))

	static __always_inline int fls64(__u64 x)
	{
		long bitpos = -1;

		asm("bsrq %1,%0"
		    : "+r" (bitpos)
		    : "rm" (x));
		return bitpos + 1;
	}

	static inline unsigned long __fls(unsigned long word)
	{
		asm("bsr %1,%0"
		    : "=r" (word)
		    : "rm" (word));
		return word;
	}
	static __always_inline int old_fls64(__u64 x)
	{
		if (x == 0)
			return 0;
		return __fls(x) + 1;
	}

	static noinline // __attribute__((const))
	int old_get_order(unsigned long size)
	{
		int order;

		size = (size - 1) >> (PAGE_SHIFT - 1);
		order = -1;
		do {
			size >>= 1;
			order++;
		} while (size);
		return order;
	}

	static inline __attribute__((const))
	int get_order_old_fls64(unsigned long size)
	{
		int order;
		size--;
		size >>= PAGE_SHIFT;
		order = old_fls64(size);
		return order;
	}

	static inline __attribute__((const))
	int get_order(unsigned long size)
	{
		int order;
		size--;
		size >>= PAGE_SHIFT;
		order = fls64(size);
		return order;
	}

	unsigned long prevent_optimise_out;

	static noinline unsigned long test_old_get_order(void)
	{
		unsigned long n, total = 0;
		long rep, loop;

		for (rep = 1000000; rep > 0; rep--) {
			for (loop = 0; loop <= 16384; loop += 4) {
				n = 1UL << loop;
				total += old_get_order(n);
			}
		}
		return total;
	}

	static noinline unsigned long test_get_order_old_fls64(void)
	{
		unsigned long n, total = 0;
		long rep, loop;

		for (rep = 1000000; rep > 0; rep--) {
			for (loop = 0; loop <= 16384; loop += 4) {
				n = 1UL << loop;
				total += get_order_old_fls64(n);
			}
		}
		return total;
	}

	static noinline unsigned long test_get_order(void)
	{
		unsigned long n, total = 0;
		long rep, loop;

		for (rep = 1000000; rep > 0; rep--) {
			for (loop = 0; loop <= 16384; loop += 4) {
				n = 1UL << loop;
				total += get_order(n);
			}
		}
		return total;
	}

	int main(int argc, char **argv)
	{
		unsigned long total;

		switch (argc) {
		case 1:  total = test_old_get_order();		break;
		case 2:  total = test_get_order_old_fls64();	break;
		default: total = test_get_order();		break;
		}
		prevent_optimise_out = total;
		return 0;
	}

This allows me to test the use of the old fls64() implementation and the new
fls64() implementation and also to contrast these to the out-of-line loop-based
implementation of get_order().  The results were:

	warthog>time ./get_order
	real    1m37.191s
	user    1m36.313s
	sys     0m0.861s
	warthog>time ./get_order x
	real    0m16.892s
	user    0m16.586s
	sys     0m0.287s
	warthog>time ./get_order x x
	real    0m7.731s
	user    0m7.727s
	sys     0m0.002s

Using the current upstream fls64() as a basis for an inlined get_order() [the
second result above] is much faster than using the current out-of-line
loop-based get_order() [the first result above].

Using my optimised inline fls64()-based get_order() [the third result above]
is even faster still.

[ hpa: changed the selection of 32 vs 64 bits to use CONFIG_X86_64
  instead of comparing BITS_PER_LONG, updated comments, rebased manually
  on top of 83d99df7c4bf x86, bitops: Move fls64.h inside __KERNEL__ ]

Signed-off-by: David Howells <dhowells@redhat.com>
Link: http://lkml.kernel.org/r/20111213145654.14362.39868.stgit@warthog.procyon.org.uk
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/bitops.h | 67 +++++++++++++++++++++++++++++++++++++++----
 1 file changed, 62 insertions(+), 5 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 4a6235b053cb..b97596e2b68c 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -397,10 +397,25 @@ static inline unsigned long __fls(unsigned long word)
 static inline int ffs(int x)
 {
 	int r;
-#ifdef CONFIG_X86_CMOV
+
+#ifdef CONFIG_X86_64
+	/*
+	 * AMD64 says BSFL won't clobber the dest reg if x==0; Intel64 says the
+	 * dest reg is undefined if x==0, but their CPU architect says its
+	 * value is written to set it to the same as before, except that the
+	 * top 32 bits will be cleared.
+	 *
+	 * We cannot do this on 32 bits because at the very least some
+	 * 486 CPUs did not behave this way.
+	 */
+	long tmp = -1;
+	asm("bsfl %1,%0"
+	    : "=r" (r)
+	    : "rm" (x), "0" (tmp));
+#elif defined(CONFIG_X86_CMOV)
 	asm("bsfl %1,%0\n\t"
 	    "cmovzl %2,%0"
-	    : "=r" (r) : "rm" (x), "r" (-1));
+	    : "=&r" (r) : "rm" (x), "r" (-1));
 #else
 	asm("bsfl %1,%0\n\t"
 	    "jnz 1f\n\t"
@@ -424,7 +439,22 @@ static inline int ffs(int x)
 static inline int fls(int x)
 {
 	int r;
-#ifdef CONFIG_X86_CMOV
+
+#ifdef CONFIG_X86_64
+	/*
+	 * AMD64 says BSRL won't clobber the dest reg if x==0; Intel64 says the
+	 * dest reg is undefined if x==0, but their CPU architect says its
+	 * value is written to set it to the same as before, except that the
+	 * top 32 bits will be cleared.
+	 *
+	 * We cannot do this on 32 bits because at the very least some
+	 * 486 CPUs did not behave this way.
+	 */
+	long tmp = -1;
+	asm("bsrl %1,%0"
+	    : "=r" (r)
+	    : "rm" (x), "0" (tmp));
+#elif defined(CONFIG_X86_CMOV)
 	asm("bsrl %1,%0\n\t"
 	    "cmovzl %2,%0"
 	    : "=&r" (r) : "rm" (x), "rm" (-1));
@@ -437,6 +467,35 @@ static inline int fls(int x)
 	return r + 1;
 }
 
+/**
+ * fls64 - find last set bit in a 64-bit word
+ * @x: the word to search
+ *
+ * This is defined in a similar way as the libc and compiler builtin
+ * ffsll, but returns the position of the most significant set bit.
+ *
+ * fls64(value) returns 0 if value is 0 or the position of the last
+ * set bit if value is nonzero. The last (most significant) bit is
+ * at position 64.
+ */
+#ifdef CONFIG_X86_64
+static __always_inline int fls64(__u64 x)
+{
+	long bitpos = -1;
+	/*
+	 * AMD64 says BSRQ won't clobber the dest reg if x==0; Intel64 says the
+	 * dest reg is undefined if x==0, but their CPU architect says its
+	 * value is written to set it to the same as before.
+	 */
+	asm("bsrq %1,%0"
+	    : "+r" (bitpos)
+	    : "rm" (x));
+	return bitpos + 1;
+}
+#else
+#include <asm-generic/bitops/fls64.h>
+#endif
+
 #include <asm-generic/bitops/find.h>
 
 #include <asm-generic/bitops/sched.h>
@@ -447,8 +506,6 @@ static inline int fls(int x)
 
 #include <asm-generic/bitops/const_hweight.h>
 
-#include <asm-generic/bitops/fls64.h>
-
 #include <asm-generic/bitops/le.h>
 
 #include <asm-generic/bitops/ext2-atomic-setbit.h>
-- 
cgit 


From cdcd629869fabcd38ebd24a03b0a05ec1cbcafb0 Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@suse.com>
Date: Mon, 2 Jan 2012 17:02:18 +0000
Subject: x86: Fix and improve cmpxchg_double{,_local}()

Just like the per-CPU ones they had several
problems/shortcomings:

Only the first memory operand was mentioned in the asm()
operands, and the 2x64-bit version didn't have a memory clobber
while the 2x32-bit one did. The former allowed the compiler to
not recognize the need to re-load the data in case it had it
cached in some register, while the latter was overly
destructive.

The types of the local copies of the old and new values were
incorrect (the types of the pointed-to variables should be used
here, to make sure the respective old/new variable types are
compatible).

The __dummy/__junk variables were pointless, given that local
copies of the inputs already existed (and can hence be used for
discarded outputs).

The 32-bit variant of cmpxchg_double_local() referenced
cmpxchg16b_local().

At once also:

 - change the return value type to what it really is: 'bool'
 - unify 32- and 64-bit variants
 - abstract out the common part of the 'normal' and 'local' variants

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/r/4F01F12A020000780006A19B@nat28.tlf.novell.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/cmpxchg.h    | 23 ++++++++++++++++++++
 arch/x86/include/asm/cmpxchg_32.h | 46 ---------------------------------------
 arch/x86/include/asm/cmpxchg_64.h | 43 ------------------------------------
 3 files changed, 23 insertions(+), 89 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h
index 5488e10b9dba..0c9fa2745f13 100644
--- a/arch/x86/include/asm/cmpxchg.h
+++ b/arch/x86/include/asm/cmpxchg.h
@@ -207,4 +207,27 @@ extern void __add_wrong_size(void)
 #define add_smp(ptr, inc)	__add((ptr), (inc), LOCK_PREFIX)
 #define add_sync(ptr, inc)	__add((ptr), (inc), "lock; ")
 
+#define __cmpxchg_double(pfx, p1, p2, o1, o2, n1, n2)			\
+({									\
+	bool __ret;							\
+	__typeof__(*(p1)) __old1 = (o1), __new1 = (n1);			\
+	__typeof__(*(p2)) __old2 = (o2), __new2 = (n2);			\
+	BUILD_BUG_ON(sizeof(*(p1)) != sizeof(long));			\
+	BUILD_BUG_ON(sizeof(*(p2)) != sizeof(long));			\
+	VM_BUG_ON((unsigned long)(p1) % (2 * sizeof(long)));		\
+	VM_BUG_ON((unsigned long)((p1) + 1) != (unsigned long)(p2));	\
+	asm volatile(pfx "cmpxchg%c4b %2; sete %0"			\
+		     : "=a" (__ret), "+d" (__old2),			\
+		       "+m" (*(p1)), "+m" (*(p2))			\
+		     : "i" (2 * sizeof(long)), "a" (__old1),		\
+		       "b" (__new1), "c" (__new2));			\
+	__ret;								\
+})
+
+#define cmpxchg_double(p1, p2, o1, o2, n1, n2) \
+	__cmpxchg_double(LOCK_PREFIX, p1, p2, o1, o2, n1, n2)
+
+#define cmpxchg_double_local(p1, p2, o1, o2, n1, n2) \
+	__cmpxchg_double(, p1, p2, o1, o2, n1, n2)
+
 #endif	/* ASM_X86_CMPXCHG_H */
diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
index fbebb07dd80b..53f4b219336b 100644
--- a/arch/x86/include/asm/cmpxchg_32.h
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -166,52 +166,6 @@ static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old,
 
 #endif
 
-#define cmpxchg8b(ptr, o1, o2, n1, n2)				\
-({								\
-	char __ret;						\
-	__typeof__(o2) __dummy;					\
-	__typeof__(*(ptr)) __old1 = (o1);			\
-	__typeof__(o2) __old2 = (o2);				\
-	__typeof__(*(ptr)) __new1 = (n1);			\
-	__typeof__(o2) __new2 = (n2);				\
-	asm volatile(LOCK_PREFIX "cmpxchg8b %2; setz %1"	\
-		       : "=d"(__dummy), "=a" (__ret), "+m" (*ptr)\
-		       : "a" (__old1), "d"(__old2),		\
-		         "b" (__new1), "c" (__new2)		\
-		       : "memory");				\
-	__ret; })
-
-
-#define cmpxchg8b_local(ptr, o1, o2, n1, n2)			\
-({								\
-	char __ret;						\
-	__typeof__(o2) __dummy;					\
-	__typeof__(*(ptr)) __old1 = (o1);			\
-	__typeof__(o2) __old2 = (o2);				\
-	__typeof__(*(ptr)) __new1 = (n1);			\
-	__typeof__(o2) __new2 = (n2);				\
-	asm volatile("cmpxchg8b %2; setz %1"			\
-		       : "=d"(__dummy), "=a"(__ret), "+m" (*ptr)\
-		       : "a" (__old), "d"(__old2),		\
-		         "b" (__new1), "c" (__new2),		\
-		       : "memory");				\
-	__ret; })
-
-
-#define cmpxchg_double(ptr, o1, o2, n1, n2)				\
-({									\
-	BUILD_BUG_ON(sizeof(*(ptr)) != 4);				\
-	VM_BUG_ON((unsigned long)(ptr) % 8);				\
-	cmpxchg8b((ptr), (o1), (o2), (n1), (n2));			\
-})
-
-#define cmpxchg_double_local(ptr, o1, o2, n1, n2)			\
-({									\
-       BUILD_BUG_ON(sizeof(*(ptr)) != 4);				\
-       VM_BUG_ON((unsigned long)(ptr) % 8);				\
-       cmpxchg16b_local((ptr), (o1), (o2), (n1), (n2));			\
-})
-
 #define system_has_cmpxchg_double() cpu_has_cx8
 
 #endif /* _ASM_X86_CMPXCHG_32_H */
diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h
index 285da02c38fa..614be87f1a9b 100644
--- a/arch/x86/include/asm/cmpxchg_64.h
+++ b/arch/x86/include/asm/cmpxchg_64.h
@@ -20,49 +20,6 @@ static inline void set_64bit(volatile u64 *ptr, u64 val)
 	cmpxchg_local((ptr), (o), (n));					\
 })
 
-#define cmpxchg16b(ptr, o1, o2, n1, n2)				\
-({								\
-	char __ret;						\
-	__typeof__(o2) __junk;					\
-	__typeof__(*(ptr)) __old1 = (o1);			\
-	__typeof__(o2) __old2 = (o2);				\
-	__typeof__(*(ptr)) __new1 = (n1);			\
-	__typeof__(o2) __new2 = (n2);				\
-	asm volatile(LOCK_PREFIX "cmpxchg16b %2;setz %1"	\
-		       : "=d"(__junk), "=a"(__ret), "+m" (*ptr)	\
-		       : "b"(__new1), "c"(__new2),		\
-		         "a"(__old1), "d"(__old2));		\
-	__ret; })
-
-
-#define cmpxchg16b_local(ptr, o1, o2, n1, n2)			\
-({								\
-	char __ret;						\
-	__typeof__(o2) __junk;					\
-	__typeof__(*(ptr)) __old1 = (o1);			\
-	__typeof__(o2) __old2 = (o2);				\
-	__typeof__(*(ptr)) __new1 = (n1);			\
-	__typeof__(o2) __new2 = (n2);				\
-	asm volatile("cmpxchg16b %2;setz %1"			\
-		       : "=d"(__junk), "=a"(__ret), "+m" (*ptr)	\
-		       : "b"(__new1), "c"(__new2),		\
-		         "a"(__old1), "d"(__old2));		\
-	__ret; })
-
-#define cmpxchg_double(ptr, o1, o2, n1, n2)				\
-({									\
-	BUILD_BUG_ON(sizeof(*(ptr)) != 8);				\
-	VM_BUG_ON((unsigned long)(ptr) % 16);				\
-	cmpxchg16b((ptr), (o1), (o2), (n1), (n2));			\
-})
-
-#define cmpxchg_double_local(ptr, o1, o2, n1, n2)			\
-({									\
-	BUILD_BUG_ON(sizeof(*(ptr)) != 8);				\
-	VM_BUG_ON((unsigned long)(ptr) % 16);				\
-	cmpxchg16b_local((ptr), (o1), (o2), (n1), (n2));		\
-})
-
 #define system_has_cmpxchg_double() cpu_has_cx16
 
 #endif /* _ASM_X86_CMPXCHG_64_H */
-- 
cgit 


From ceb7b40b65539a771d1bfaf47660ac0ee57e0c4f Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 3 Jan 2012 17:35:40 +0100
Subject: x86: Fix atomic64_xxx_cx8() functions

It appears about all functions in arch/x86/lib/atomic64_cx8_32.S
are wrong in case cmpxchg8b must be restarted, because
LOCK_PREFIX macro defines a label "1" clashing with other local
labels :

1:
	some_instructions
	LOCK_PREFIX
	cmpxchg8b (%ebp)
	jne 1b  / jumps to beginning of LOCK_PREFIX !

A possible fix is to use a magic label "672" in LOCK_PREFIX asm
definition, similar to the "671" one we defined in
LOCK_PREFIX_HERE.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Acked-by: Jan Beulich <JBeulich@suse.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/r/1325608540.2320.103.camel@edumazet-HP-Compaq-6005-Pro-SFF-PC
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/alternative-asm.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index 091508b533b4..952bd0100c5c 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -4,10 +4,10 @@
 
 #ifdef CONFIG_SMP
 	.macro LOCK_PREFIX
-1:	lock
+672:	lock
 	.section .smp_locks,"a"
 	.balign 4
-	.long 1b - .
+	.long 672b - .
 	.previous
 	.endm
 #else
-- 
cgit