From 5d61a2172a0142c635ab6d7c3b1589af85a3603e Mon Sep 17 00:00:00 2001
From: Scott Wood <scottwood@freescale.com>
Date: Fri, 8 Aug 2014 18:44:01 -0500
Subject: powerpc/nohash: Split __early_init_mmu() into boot and secondary

__early_init_mmu() does some things that are really only needed by the
boot cpu.  On FSL booke, This includes calling
memblock_enforce_memory_limit(), which is labelled __init.  Secondary
cpu init code can't be __init as that would break CPU hotplug.

While it's probably a bug that memblock_enforce_memory_limit() isn't
__init_memblock instead, there's no reason why we should be doing this
stuff for secondary cpus in the first place.

Signed-off-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/mm/tlb_nohash.c | 111 +++++++++++++++++++++++++------------------
 1 file changed, 66 insertions(+), 45 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c
index 92cb18d52ea8..f38ea4df6a85 100644
--- a/arch/powerpc/mm/tlb_nohash.c
+++ b/arch/powerpc/mm/tlb_nohash.c
@@ -581,42 +581,10 @@ static void setup_mmu_htw(void)
 /*
  * Early initialization of the MMU TLB code
  */
-static void __early_init_mmu(int boot_cpu)
+static void early_init_this_mmu(void)
 {
 	unsigned int mas4;
 
-	/* XXX This will have to be decided at runtime, but right
-	 * now our boot and TLB miss code hard wires it. Ideally
-	 * we should find out a suitable page size and patch the
-	 * TLB miss code (either that or use the PACA to store
-	 * the value we want)
-	 */
-	mmu_linear_psize = MMU_PAGE_1G;
-
-	/* XXX This should be decided at runtime based on supported
-	 * page sizes in the TLB, but for now let's assume 16M is
-	 * always there and a good fit (which it probably is)
-	 *
-	 * Freescale booke only supports 4K pages in TLB0, so use that.
-	 */
-	if (mmu_has_feature(MMU_FTR_TYPE_FSL_E))
-		mmu_vmemmap_psize = MMU_PAGE_4K;
-	else
-		mmu_vmemmap_psize = MMU_PAGE_16M;
-
-	/* XXX This code only checks for TLB 0 capabilities and doesn't
-	 *     check what page size combos are supported by the HW. It
-	 *     also doesn't handle the case where a separate array holds
-	 *     the IND entries from the array loaded by the PT.
-	 */
-	if (boot_cpu) {
-		/* Look for supported page sizes */
-		setup_page_sizes();
-
-		/* Look for HW tablewalk support */
-		setup_mmu_htw();
-	}
-
 	/* Set MAS4 based on page table setting */
 
 	mas4 = 0x4 << MAS4_WIMGED_SHIFT;
@@ -650,11 +618,6 @@ static void __early_init_mmu(int boot_cpu)
 	}
 	mtspr(SPRN_MAS4, mas4);
 
-	/* Set the global containing the top of the linear mapping
-	 * for use by the TLB miss code
-	 */
-	linear_map_top = memblock_end_of_DRAM();
-
 #ifdef CONFIG_PPC_FSL_BOOK3E
 	if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) {
 		unsigned int num_cams;
@@ -662,10 +625,49 @@ static void __early_init_mmu(int boot_cpu)
 		/* use a quarter of the TLBCAM for bolted linear map */
 		num_cams = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) / 4;
 		linear_map_top = map_mem_in_cams(linear_map_top, num_cams);
+	}
+#endif
 
-		/* limit memory so we dont have linear faults */
-		memblock_enforce_memory_limit(linear_map_top);
+	/* A sync won't hurt us after mucking around with
+	 * the MMU configuration
+	 */
+	mb();
+}
 
+static void __init early_init_mmu_global(void)
+{
+	/* XXX This will have to be decided at runtime, but right
+	 * now our boot and TLB miss code hard wires it. Ideally
+	 * we should find out a suitable page size and patch the
+	 * TLB miss code (either that or use the PACA to store
+	 * the value we want)
+	 */
+	mmu_linear_psize = MMU_PAGE_1G;
+
+	/* XXX This should be decided at runtime based on supported
+	 * page sizes in the TLB, but for now let's assume 16M is
+	 * always there and a good fit (which it probably is)
+	 *
+	 * Freescale booke only supports 4K pages in TLB0, so use that.
+	 */
+	if (mmu_has_feature(MMU_FTR_TYPE_FSL_E))
+		mmu_vmemmap_psize = MMU_PAGE_4K;
+	else
+		mmu_vmemmap_psize = MMU_PAGE_16M;
+
+	/* XXX This code only checks for TLB 0 capabilities and doesn't
+	 *     check what page size combos are supported by the HW. It
+	 *     also doesn't handle the case where a separate array holds
+	 *     the IND entries from the array loaded by the PT.
+	 */
+	/* Look for supported page sizes */
+	setup_page_sizes();
+
+	/* Look for HW tablewalk support */
+	setup_mmu_htw();
+
+#ifdef CONFIG_PPC_FSL_BOOK3E
+	if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) {
 		if (book3e_htw_mode == PPC_HTW_NONE) {
 			extlb_level_exc = EX_TLB_SIZE;
 			patch_exception(0x1c0, exc_data_tlb_miss_bolted_book3e);
@@ -675,22 +677,41 @@ static void __early_init_mmu(int boot_cpu)
 	}
 #endif
 
-	/* A sync won't hurt us after mucking around with
-	 * the MMU configuration
+	/* Set the global containing the top of the linear mapping
+	 * for use by the TLB miss code
 	 */
-	mb();
+	linear_map_top = memblock_end_of_DRAM();
+}
+
+static void __init early_mmu_set_memory_limit(void)
+{
+#ifdef CONFIG_PPC_FSL_BOOK3E
+	if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) {
+		/*
+		 * Limit memory so we dont have linear faults.
+		 * Unlike memblock_set_current_limit, which limits
+		 * memory available during early boot, this permanently
+		 * reduces the memory available to Linux.  We need to
+		 * do this because highmem is not supported on 64-bit.
+		 */
+		memblock_enforce_memory_limit(linear_map_top);
+	}
+#endif
 
 	memblock_set_current_limit(linear_map_top);
 }
 
+/* boot cpu only */
 void __init early_init_mmu(void)
 {
-	__early_init_mmu(1);
+	early_init_mmu_global();
+	early_init_this_mmu();
+	early_mmu_set_memory_limit();
 }
 
 void early_init_mmu_secondary(void)
 {
-	__early_init_mmu(0);
+	early_init_this_mmu();
 }
 
 void setup_initial_memory_limit(phys_addr_t first_memblock_base,
-- 
cgit 


From 11d549048e5bea3055053d8dcf45089b1fd711f1 Mon Sep 17 00:00:00 2001
From: Guenter Roeck <linux@roeck-us.net>
Date: Fri, 8 Aug 2014 22:22:12 -0700
Subject: powerpc: Fix "attempt to move .org backwards" error

Once again, we see

arch/powerpc/kernel/exceptions-64s.S: Assembler messages:
arch/powerpc/kernel/exceptions-64s.S:865: Error: attempt to move .org backwards
arch/powerpc/kernel/exceptions-64s.S:866: Error: attempt to move .org backwards
arch/powerpc/kernel/exceptions-64s.S:890: Error: attempt to move .org backwards

when compiling ppc:allmodconfig.

This time the problem has been caused by to commit 0869b6fd209bda
("powerpc/book3s: Add basic infrastructure to handle HMI in Linux"),
which adds functions hmi_exception_early and hmi_exception_after_realmode
into a critical (size-limited) code area, even though that does not appear
to be necessary.

Move those functions to a non-critical area of the file.

Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/kernel/exceptions-64s.S | 110 +++++++++++++++++------------------
 1 file changed, 55 insertions(+), 55 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 6144d5a6bfe7..050f79a4a168 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -592,61 +592,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
 	MASKABLE_EXCEPTION_HV_OOL(0xe62, hmi_exception)
 	KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe62)
 
-	.globl hmi_exception_early
-hmi_exception_early:
-	EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, 0xe60)
-	mr	r10,r1			/* Save r1			*/
-	ld	r1,PACAEMERGSP(r13)	/* Use emergency stack		*/
-	subi	r1,r1,INT_FRAME_SIZE	/* alloc stack frame		*/
-	std	r9,_CCR(r1)		/* save CR in stackframe	*/
-	mfspr	r11,SPRN_HSRR0		/* Save HSRR0 */
-	std	r11,_NIP(r1)		/* save HSRR0 in stackframe	*/
-	mfspr	r12,SPRN_HSRR1		/* Save SRR1 */
-	std	r12,_MSR(r1)		/* save SRR1 in stackframe	*/
-	std	r10,0(r1)		/* make stack chain pointer	*/
-	std	r0,GPR0(r1)		/* save r0 in stackframe	*/
-	std	r10,GPR1(r1)		/* save r1 in stackframe	*/
-	EXCEPTION_PROLOG_COMMON_2(PACA_EXGEN)
-	EXCEPTION_PROLOG_COMMON_3(0xe60)
-	addi	r3,r1,STACK_FRAME_OVERHEAD
-	bl	hmi_exception_realmode
-	/* Windup the stack. */
-	/* Clear MSR_RI before setting SRR0 and SRR1. */
-	li	r0,MSR_RI
-	mfmsr	r9			/* get MSR value */
-	andc	r9,r9,r0
-	mtmsrd	r9,1			/* Clear MSR_RI */
-	/* Move original HSRR0 and HSRR1 into the respective regs */
-	ld	r9,_MSR(r1)
-	mtspr	SPRN_HSRR1,r9
-	ld	r3,_NIP(r1)
-	mtspr	SPRN_HSRR0,r3
-	ld	r9,_CTR(r1)
-	mtctr	r9
-	ld	r9,_XER(r1)
-	mtxer	r9
-	ld	r9,_LINK(r1)
-	mtlr	r9
-	REST_GPR(0, r1)
-	REST_8GPRS(2, r1)
-	REST_GPR(10, r1)
-	ld	r11,_CCR(r1)
-	mtcr	r11
-	REST_GPR(11, r1)
-	REST_2GPRS(12, r1)
-	/* restore original r1. */
-	ld	r1,GPR1(r1)
-
-	/*
-	 * Go to virtual mode and pull the HMI event information from
-	 * firmware.
-	 */
-	.globl hmi_exception_after_realmode
-hmi_exception_after_realmode:
-	SET_SCRATCH0(r13)
-	EXCEPTION_PROLOG_0(PACA_EXGEN)
-	b	hmi_exception_hv
-
 	MASKABLE_EXCEPTION_HV_OOL(0xe82, h_doorbell)
 	KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe82)
 
@@ -1306,6 +1251,61 @@ fwnmi_data_area:
 	. = 0x8000
 #endif /* defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) */
 
+	.globl hmi_exception_early
+hmi_exception_early:
+	EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, 0xe60)
+	mr	r10,r1			/* Save r1			*/
+	ld	r1,PACAEMERGSP(r13)	/* Use emergency stack		*/
+	subi	r1,r1,INT_FRAME_SIZE	/* alloc stack frame		*/
+	std	r9,_CCR(r1)		/* save CR in stackframe	*/
+	mfspr	r11,SPRN_HSRR0		/* Save HSRR0 */
+	std	r11,_NIP(r1)		/* save HSRR0 in stackframe	*/
+	mfspr	r12,SPRN_HSRR1		/* Save SRR1 */
+	std	r12,_MSR(r1)		/* save SRR1 in stackframe	*/
+	std	r10,0(r1)		/* make stack chain pointer	*/
+	std	r0,GPR0(r1)		/* save r0 in stackframe	*/
+	std	r10,GPR1(r1)		/* save r1 in stackframe	*/
+	EXCEPTION_PROLOG_COMMON_2(PACA_EXGEN)
+	EXCEPTION_PROLOG_COMMON_3(0xe60)
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	hmi_exception_realmode
+	/* Windup the stack. */
+	/* Clear MSR_RI before setting SRR0 and SRR1. */
+	li	r0,MSR_RI
+	mfmsr	r9			/* get MSR value */
+	andc	r9,r9,r0
+	mtmsrd	r9,1			/* Clear MSR_RI */
+	/* Move original HSRR0 and HSRR1 into the respective regs */
+	ld	r9,_MSR(r1)
+	mtspr	SPRN_HSRR1,r9
+	ld	r3,_NIP(r1)
+	mtspr	SPRN_HSRR0,r3
+	ld	r9,_CTR(r1)
+	mtctr	r9
+	ld	r9,_XER(r1)
+	mtxer	r9
+	ld	r9,_LINK(r1)
+	mtlr	r9
+	REST_GPR(0, r1)
+	REST_8GPRS(2, r1)
+	REST_GPR(10, r1)
+	ld	r11,_CCR(r1)
+	mtcr	r11
+	REST_GPR(11, r1)
+	REST_2GPRS(12, r1)
+	/* restore original r1. */
+	ld	r1,GPR1(r1)
+
+	/*
+	 * Go to virtual mode and pull the HMI event information from
+	 * firmware.
+	 */
+	.globl hmi_exception_after_realmode
+hmi_exception_after_realmode:
+	SET_SCRATCH0(r13)
+	EXCEPTION_PROLOG_0(PACA_EXGEN)
+	b	hmi_exception_hv
+
 #ifdef CONFIG_PPC_POWERNV
 _GLOBAL(opal_mc_secondary_handler)
 	HMT_MEDIUM_PPR_DISCARD
-- 
cgit 


From 51d7d5205d3389a32859f9939f1093f267409929 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Thu, 7 Aug 2014 15:36:17 +1000
Subject: powerpc: Add smp_mb() to arch_spin_is_locked()

The kernel defines the function spin_is_locked(), which can be used to
check if a spinlock is currently locked.

Using spin_is_locked() on a lock you don't hold is obviously racy. That
is, even though you may observe that the lock is unlocked, it may become
locked at any time.

There is (at least) one exception to that, which is if two locks are
used as a pair, and the holder of each checks the status of the other
before doing any update.

Assuming *A and *B are two locks, and *COUNTER is a shared non-atomic
value:

The first CPU does:

	spin_lock(*A)

	if spin_is_locked(*B)
		# nothing
	else
		smp_mb()
		LOAD r = *COUNTER
		r++
		STORE *COUNTER = r

	spin_unlock(*A)

And the second CPU does:

	spin_lock(*B)

	if spin_is_locked(*A)
		# nothing
	else
		smp_mb()
		LOAD r = *COUNTER
		r++
		STORE *COUNTER = r

	spin_unlock(*B)

Although this is a strange locking construct, it should work.

It seems to be understood, but not documented, that spin_is_locked() is
not a memory barrier, so in the examples above and below the caller
inserts its own memory barrier before acting on the result of
spin_is_locked().

For now we assume spin_is_locked() is implemented as below, and we break
it out in our examples:

	bool spin_is_locked(*LOCK) {
		LOAD l = *LOCK
		return l.locked
	}

Our intuition is that there should be no problem even if the two code
sequences run simultaneously such as:

	CPU 0			CPU 1
	==================================================
	spin_lock(*A)		spin_lock(*B)
	LOAD b = *B		LOAD a = *A
	if b.locked # true	if a.locked # true
	# nothing		# nothing
	spin_unlock(*A)		spin_unlock(*B)

If one CPU gets the lock before the other then it will do the update and
the other CPU will back off:

	CPU 0			CPU 1
	==================================================
	spin_lock(*A)
	LOAD b = *B
				spin_lock(*B)
	if b.locked # false	LOAD a = *A
	else			if a.locked # true
	smp_mb()		# nothing
	LOAD r1 = *COUNTER	spin_unlock(*B)
	r1++
	STORE *COUNTER = r1
	spin_unlock(*A)

However in reality spin_lock() itself is not indivisible. On powerpc we
implement it as a load-and-reserve and store-conditional.

Ignoring the retry logic for the lost reservation case, it boils down to:
	spin_lock(*LOCK) {
		LOAD l = *LOCK
		l.locked = true
		STORE *LOCK = l
		ACQUIRE_BARRIER
	}

The ACQUIRE_BARRIER is required to give spin_lock() ACQUIRE semantics as
defined in memory-barriers.txt:

     This acts as a one-way permeable barrier.  It guarantees that all
     memory operations after the ACQUIRE operation will appear to happen
     after the ACQUIRE operation with respect to the other components of
     the system.

On modern powerpc systems we use lwsync for ACQUIRE_BARRIER. lwsync is
also know as "lightweight sync", or "sync 1".

As described in Power ISA v2.07 section B.2.1.1, in this scenario the
lwsync is not the barrier itself. It instead causes the LOAD of *LOCK to
act as the barrier, preventing any loads or stores in the locked region
from occurring prior to the load of *LOCK.

Whether this behaviour is in accordance with the definition of ACQUIRE
semantics in memory-barriers.txt is open to discussion, we may switch to
a different barrier in future.

What this means in practice is that the following can occur:

	CPU 0			CPU 1
	==================================================
	LOAD a = *A 		LOAD b = *B
	a.locked = true		b.locked = true
	LOAD b = *B		LOAD a = *A
	STORE *A = a		STORE *B = b
	if b.locked # false	if a.locked # false
	else			else
	smp_mb()		smp_mb()
	LOAD r1 = *COUNTER	LOAD r2 = *COUNTER
	r1++			r2++
	STORE *COUNTER = r1
				STORE *COUNTER = r2	# Lost update
	spin_unlock(*A)		spin_unlock(*B)

That is, the load of *B can occur prior to the store that makes *A
visibly locked. And similarly for CPU 1. The result is both CPUs hold
their lock and believe the other lock is unlocked.

The easiest fix for this is to add a full memory barrier to the start of
spin_is_locked(), so adding to our previous definition would give us:

	bool spin_is_locked(*LOCK) {
		smp_mb()
		LOAD l = *LOCK
		return l.locked
	}

The new barrier orders the store to the lock we are locking vs the load
of the other lock:

	CPU 0			CPU 1
	==================================================
	LOAD a = *A 		LOAD b = *B
	a.locked = true		b.locked = true
	STORE *A = a		STORE *B = b
	smp_mb()		smp_mb()
	LOAD b = *B		LOAD a = *A
	if b.locked # true	if a.locked # true
	# nothing		# nothing
	spin_unlock(*A)		spin_unlock(*B)

Although the above example is theoretical, there is code similar to this
example in sem_lock() in ipc/sem.c. This commit in addition to the next
commit appears to be a fix for crashes we are seeing in that code where
we believe this race happens in practice.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/spinlock.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
index 35aa339410bd..4dbe072eecbe 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -61,6 +61,7 @@ static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
 
 static inline int arch_spin_is_locked(arch_spinlock_t *lock)
 {
+	smp_mb();
 	return !arch_spin_value_unlocked(*lock);
 }
 
-- 
cgit 


From 78e05b1421fa41ae8457701140933baa5e7d9479 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Thu, 7 Aug 2014 15:36:18 +1000
Subject: powerpc: Add smp_mb()s to arch_spin_unlock_wait()

Similar to the previous commit which described why we need to add a
barrier to arch_spin_is_locked(), we have a similar problem with
spin_unlock_wait().

We need a barrier on entry to ensure any spinlock we have previously
taken is visibly locked prior to the load of lock->slock.

It's also not clear if spin_unlock_wait() is intended to have ACQUIRE
semantics. For now be conservative and add a barrier on exit to give it
ACQUIRE semantics.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/lib/locks.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch')

diff --git a/arch/powerpc/lib/locks.c b/arch/powerpc/lib/locks.c
index 0c9c8d7d0734..170a0346f756 100644
--- a/arch/powerpc/lib/locks.c
+++ b/arch/powerpc/lib/locks.c
@@ -70,12 +70,16 @@ void __rw_yield(arch_rwlock_t *rw)
 
 void arch_spin_unlock_wait(arch_spinlock_t *lock)
 {
+	smp_mb();
+
 	while (lock->slock) {
 		HMT_low();
 		if (SHARED_PROCESSOR)
 			__spin_yield(lock);
 	}
 	HMT_medium();
+
+	smp_mb();
 }
 
 EXPORT_SYMBOL(arch_spin_unlock_wait);
-- 
cgit 


From 763fe0addb8fe15ccea67c0aebddc06f4bb25439 Mon Sep 17 00:00:00 2001
From: Gavin Shan <gwshan@linux.vnet.ibm.com>
Date: Wed, 6 Aug 2014 17:10:16 +1000
Subject: powerpc/powernv: Fix IOMMU group lost

When we take full hotplug to recover from EEH errors, PCI buses
could be involved. For the case, the child devices of involved
PCI buses can't be attached to IOMMU group properly, which is
caused by commit 3f28c5a ("powerpc/powernv: Reduce multi-hit of
iommu_add_device()").

When adding the PCI devices of the newly created PCI buses to
the system, the IOMMU group is expected to be added in (C).
(A) fails to bind the IOMMU group because bus->is_added is
false. (B) fails because the device doesn't have binding IOMMU
table yet. bus->is_added is set to true at end of (C) and
pdev->is_added is set to true at (D).

   pcibios_add_pci_devices()
      pci_scan_bridge()
         pci_scan_child_bus()
            pci_scan_slot()
               pci_scan_single_device()
                  pci_scan_device()
                  pci_device_add()
                     pcibios_add_device()           A: Ignore
                     device_add()                   B: Ignore
                  pcibios_fixup_bus()
                     pcibios_setup_bus_devices()
                        pcibios_setup_device()      C: Hit
      pcibios_finish_adding_to_bus()
         pci_bus_add_devices()
            pci_bus_add_device()                    D: Add device

If the parent PCI bus isn't involved in hotplug, the IOMMU
group is expected to be bound in (B). (A) should fail as the
sysfs entries aren't populated.

The patch fixes the issue by reverting commit 3f28c5a and remove
WARN_ON() in iommu_add_device() to allow calling the function
even the specified device already has associated IOMMU group.

Cc: <stable@vger.kernel.org>  # 3.16+
Reported-by: Thadeu Lima de Souza Cascardo <cascardo@linux.vnet.ibm.com>
Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Acked-by: Wei Yang <weiyang@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/kernel/iommu.c               | 38 +++++++++++++++++--------------
 arch/powerpc/platforms/powernv/pci-ioda.c |  2 +-
 2 files changed, 22 insertions(+), 18 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index f84f799babb1..a10642a0d861 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -1120,37 +1120,41 @@ EXPORT_SYMBOL_GPL(iommu_release_ownership);
 int iommu_add_device(struct device *dev)
 {
 	struct iommu_table *tbl;
-	int ret = 0;
 
-	if (WARN_ON(dev->iommu_group)) {
-		pr_warn("iommu_tce: device %s is already in iommu group %d, skipping\n",
-				dev_name(dev),
-				iommu_group_id(dev->iommu_group));
+	/*
+	 * The sysfs entries should be populated before
+	 * binding IOMMU group. If sysfs entries isn't
+	 * ready, we simply bail.
+	 */
+	if (!device_is_registered(dev))
+		return -ENOENT;
+
+	if (dev->iommu_group) {
+		pr_debug("%s: Skipping device %s with iommu group %d\n",
+			 __func__, dev_name(dev),
+			 iommu_group_id(dev->iommu_group));
 		return -EBUSY;
 	}
 
 	tbl = get_iommu_table_base(dev);
 	if (!tbl || !tbl->it_group) {
-		pr_debug("iommu_tce: skipping device %s with no tbl\n",
-				dev_name(dev));
+		pr_debug("%s: Skipping device %s with no tbl\n",
+			 __func__, dev_name(dev));
 		return 0;
 	}
 
-	pr_debug("iommu_tce: adding %s to iommu group %d\n",
-			dev_name(dev), iommu_group_id(tbl->it_group));
+	pr_debug("%s: Adding %s to iommu group %d\n",
+		 __func__, dev_name(dev),
+		 iommu_group_id(tbl->it_group));
 
 	if (PAGE_SIZE < IOMMU_PAGE_SIZE(tbl)) {
-		pr_err("iommu_tce: unsupported iommu page size.");
-		pr_err("%s has not been added\n", dev_name(dev));
+		pr_err("%s: Invalid IOMMU page size %lx (%lx) on %s\n",
+		       __func__, IOMMU_PAGE_SIZE(tbl),
+		       PAGE_SIZE, dev_name(dev));
 		return -EINVAL;
 	}
 
-	ret = iommu_group_add_device(tbl->it_group, dev);
-	if (ret < 0)
-		pr_err("iommu_tce: %s has not been added, ret=%d\n",
-				dev_name(dev), ret);
-
-	return ret;
+	return iommu_group_add_device(tbl->it_group, dev);
 }
 EXPORT_SYMBOL_GPL(iommu_add_device);
 
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index b136108ddc99..df241b11d4f7 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -857,7 +857,7 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev
 
 	pe = &phb->ioda.pe_array[pdn->pe_number];
 	WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops);
-	set_iommu_table_base(&pdev->dev, &pe->tce32_table);
+	set_iommu_table_base_and_group(&pdev->dev, &pe->tce32_table);
 }
 
 static int pnv_pci_ioda_dma_set_mask(struct pnv_phb *phb,
-- 
cgit 


From 97b3be1e9404f4c1645f3286428abd4d2052041e Mon Sep 17 00:00:00 2001
From: Alistair Popple <alistair@popple.id.au>
Date: Wed, 6 Aug 2014 17:03:09 +1000
Subject: powerpc/ppc476: Disable BTAC

This patch disables the branch target address CAM which under specific
circumstances may cause the processor to skip execution of 1-4
instructions. This fixes IBM Erratum #47.

Signed-off-by: Alistair Popple <alistair@popple.id.au>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/kernel/head_44x.S | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/head_44x.S b/arch/powerpc/kernel/head_44x.S
index c334f53453f7..b5061abbd2e0 100644
--- a/arch/powerpc/kernel/head_44x.S
+++ b/arch/powerpc/kernel/head_44x.S
@@ -1210,10 +1210,12 @@ clear_utlb_entry:
 
 	/* We configure icbi to invalidate 128 bytes at a time since the
 	 * current 32-bit kernel code isn't too happy with icache != dcache
-	 * block size
+	 * block size. We also disable the BTAC as this can cause errors
+	 * in some circumstances (see IBM Erratum 47).
 	 */
 	mfspr	r3,SPRN_CCR0
 	oris	r3,r3,0x0020
+	ori	r3,r3,0x0040
 	mtspr	SPRN_CCR0,r3
 	isync
 
-- 
cgit 


From 3609e09fd824c37df6f2bf13caa88f6f54a11922 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Wed, 6 Aug 2014 15:42:17 +1000
Subject: powerpc: Add POWER8 features to CPU_FTRS_POSSIBLE/ALWAYS

We have been a bit slack about updating the CPU_FTRS_POSSIBLE and
CPU_FTRS_ALWAYS masks. When we added POWER8, and also POWER8E we forgot
to update the ALWAYS mask. And when we added POWER8_DD1 we forgot to
update both the POSSIBLE and ALWAYS masks.

Luckily this hasn't caused any actual bugs AFAICS. Failing to update the
ALWAYS mask just forgoes a potential optimisation opportunity. Failing
to update the POSSIBLE mask for POWER8_DD1 is also OK because it only
removes a bit rather than adding any.

Regardless they should all be in both masks so as to avoid any future
bugs when the set of ALWAYS/POSSIBLE bits changes, or the masks
themselves change.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Acked-by: Michael Neuling <mikey@neuling.org>
Acked-by: Joel Stanley <joel@jms.id.au>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/cputable.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h
index 642e436d4595..daa5af91163c 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -459,7 +459,8 @@ extern const char *powerpc_base_platform;
 #define CPU_FTRS_POSSIBLE	\
 	    (CPU_FTRS_POWER4 | CPU_FTRS_PPC970 | CPU_FTRS_POWER5 | \
 	     CPU_FTRS_POWER6 | CPU_FTRS_POWER7 | CPU_FTRS_POWER8E | \
-	     CPU_FTRS_POWER8 | CPU_FTRS_CELL | CPU_FTRS_PA6T | CPU_FTR_VSX)
+	     CPU_FTRS_POWER8 | CPU_FTRS_POWER8_DD1 | CPU_FTRS_CELL | \
+	     CPU_FTRS_PA6T | CPU_FTR_VSX)
 #endif
 #else
 enum {
@@ -509,7 +510,8 @@ enum {
 #define CPU_FTRS_ALWAYS		\
 	    (CPU_FTRS_POWER4 & CPU_FTRS_PPC970 & CPU_FTRS_POWER5 & \
 	     CPU_FTRS_POWER6 & CPU_FTRS_POWER7 & CPU_FTRS_CELL & \
-	     CPU_FTRS_PA6T & CPU_FTRS_POSSIBLE)
+	     CPU_FTRS_PA6T & CPU_FTRS_POWER8 & CPU_FTRS_POWER8E & \
+	     CPU_FTRS_POWER8_DD1 & CPU_FTRS_POSSIBLE)
 #endif
 #else
 enum {
-- 
cgit 


From b09c2ec4082c63584491f35df2cb530ee8ca312d Mon Sep 17 00:00:00 2001
From: Vasant Hegde <hegdevasant@linux.vnet.ibm.com>
Date: Sat, 9 Aug 2014 11:15:45 +0530
Subject: powerpc/powernv: Interface to register/unregister opal dump region

PowerNV platform is capable of capturing host memory region when system
crashes (because of host/firmware). We have new OPAL API to register/
unregister memory region to be captured when system crashes.

This patch adds support for new API. Also during boot time we register
kernel log buffer and unregister before doing kexec.

Signed-off-by: Vasant Hegde <hegdevasant@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/opal.h                | 11 +++++++++++
 arch/powerpc/platforms/powernv/opal-wrappers.S |  2 ++
 arch/powerpc/platforms/powernv/opal.c          | 23 +++++++++++++++++++++++
 3 files changed, 36 insertions(+)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index b2f8ce1fd0d7..86055e598269 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -149,6 +149,8 @@ struct opal_sg_list {
 #define OPAL_DUMP_INFO2				94
 #define OPAL_PCI_EEH_FREEZE_SET			97
 #define OPAL_HANDLE_HMI				98
+#define OPAL_REGISTER_DUMP_REGION		101
+#define OPAL_UNREGISTER_DUMP_REGION		102
 
 #ifndef __ASSEMBLY__
 
@@ -920,6 +922,8 @@ int64_t opal_set_param(uint64_t token, uint32_t param_id, uint64_t buffer,
 		uint64_t length);
 int64_t opal_sensor_read(uint32_t sensor_hndl, int token, __be32 *sensor_data);
 int64_t opal_handle_hmi(void);
+int64_t opal_register_dump_region(uint32_t id, uint64_t start, uint64_t end);
+int64_t opal_unregister_dump_region(uint32_t id);
 
 /* Internal functions */
 extern int early_init_dt_scan_opal(unsigned long node, const char *uname,
@@ -974,6 +978,13 @@ struct opal_sg_list *opal_vmalloc_to_sg_list(void *vmalloc_addr,
 					     unsigned long vmalloc_size);
 void opal_free_sg_list(struct opal_sg_list *sg);
 
+/*
+ * Dump region ID range usable by the OS
+ */
+#define OPAL_DUMP_REGION_HOST_START		0x80
+#define OPAL_DUMP_REGION_LOG_BUF		0x80
+#define OPAL_DUMP_REGION_HOST_END		0xFF
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* __OPAL_H */
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
index a328be44880f..2e6ce1b8dc8f 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -245,3 +245,5 @@ OPAL_CALL(opal_sensor_read,			OPAL_SENSOR_READ);
 OPAL_CALL(opal_get_param,			OPAL_GET_PARAM);
 OPAL_CALL(opal_set_param,			OPAL_SET_PARAM);
 OPAL_CALL(opal_handle_hmi,			OPAL_HANDLE_HMI);
+OPAL_CALL(opal_register_dump_region,		OPAL_REGISTER_DUMP_REGION);
+OPAL_CALL(opal_unregister_dump_region,		OPAL_UNREGISTER_DUMP_REGION);
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index f0a01a46a57d..b44eec3e8dbd 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -605,6 +605,24 @@ static int opal_sysfs_init(void)
 	return 0;
 }
 
+static void __init opal_dump_region_init(void)
+{
+	void *addr;
+	uint64_t size;
+	int rc;
+
+	/* Register kernel log buffer */
+	addr = log_buf_addr_get();
+	size = log_buf_len_get();
+	rc = opal_register_dump_region(OPAL_DUMP_REGION_LOG_BUF,
+				       __pa(addr), size);
+	/* Don't warn if this is just an older OPAL that doesn't
+	 * know about that call
+	 */
+	if (rc && rc != OPAL_UNSUPPORTED)
+		pr_warn("DUMP: Failed to register kernel log buffer. "
+			"rc = %d\n", rc);
+}
 static int __init opal_init(void)
 {
 	struct device_node *np, *consoles;
@@ -654,6 +672,8 @@ static int __init opal_init(void)
 	/* Create "opal" kobject under /sys/firmware */
 	rc = opal_sysfs_init();
 	if (rc == 0) {
+		/* Setup dump region interface */
+		opal_dump_region_init();
 		/* Setup error log interface */
 		rc = opal_elog_init();
 		/* Setup code update interface */
@@ -694,6 +714,9 @@ void opal_shutdown(void)
 		else
 			mdelay(10);
 	}
+
+	/* Unregister memory dump region */
+	opal_unregister_dump_region(OPAL_DUMP_REGION_LOG_BUF);
 }
 
 /* Export this so that test modules can use it */
-- 
cgit 


From ce8f150a174c105abddbd3cc193cbba57f9db255 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Mon, 11 Aug 2014 14:37:22 +1000
Subject: powerpc/boot: Use correct zlib types for comparison

Avoids this warning:

arch/powerpc/boot/gunzip_util.c:118:9: warning: comparison of distinct pointer types lacks a cast

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/boot/gunzip_util.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/boot/gunzip_util.c b/arch/powerpc/boot/gunzip_util.c
index ef2aed0f63ca..9dc52501de83 100644
--- a/arch/powerpc/boot/gunzip_util.c
+++ b/arch/powerpc/boot/gunzip_util.c
@@ -112,10 +112,10 @@ int gunzip_partial(struct gunzip_state *state, void *dst, int dstlen)
 		r = zlib_inflate(&state->s, Z_FULL_FLUSH);
 		if (r != Z_OK && r != Z_STREAM_END)
 			fatal("inflate returned %d msg: %s\n\r", r, state->s.msg);
-		len = state->s.next_out - (unsigned char *)dst;
+		len = state->s.next_out - (Byte *)dst;
 	} else {
 		/* uncompressed image */
-		len = min(state->s.avail_in, (unsigned)dstlen);
+		len = min(state->s.avail_in, (uLong)dstlen);
 		memcpy(dst, state->s.next_in, len);
 		state->s.next_in += len;
 		state->s.avail_in -= len;
-- 
cgit 


From f1b3929c232784580e5d8ee324b6bc634e709575 Mon Sep 17 00:00:00 2001
From: Gavin Shan <gwshan@linux.vnet.ibm.com>
Date: Mon, 11 Aug 2014 19:16:19 +1000
Subject: powerpc/pseries: Failure on removing device node

While running command "drmgr -c phb -r -s 'PHB 528'", following
backtrace jumped out because the target device node isn't marked
with OF_DETACHED by of_detach_node(), which caused by error
returned from memory hotplug related reconfig notifier when
disabling CONFIG_MEMORY_HOTREMOVE. The patch fixes it.

ERROR: Bad of_node_put() on /pci@800000020000210/ethernet@0
CPU: 14 PID: 2252 Comm: drmgr Tainted: G        W     3.16.0+ #427
Call Trace:
[c000000012a776a0] [c000000000013d9c] .show_stack+0x88/0x148 (unreliable)
[c000000012a77750] [c00000000083cd34] .dump_stack+0x7c/0x9c
[c000000012a777d0] [c0000000006807c4] .of_node_release+0x58/0xe0
[c000000012a77860] [c00000000038a7d0] .kobject_release+0x174/0x1b8
[c000000012a77900] [c00000000038a884] .kobject_put+0x70/0x78
[c000000012a77980] [c000000000681680] .of_node_put+0x28/0x34
[c000000012a77a00] [c000000000681ea8] .__of_get_next_child+0x64/0x70
[c000000012a77a90] [c000000000682138] .of_find_node_by_path+0x1b8/0x20c
[c000000012a77b40] [c000000000051840] .ofdt_write+0x308/0x688
[c000000012a77c20] [c000000000238430] .proc_reg_write+0xb8/0xd4
[c000000012a77cd0] [c0000000001cbeac] .vfs_write+0xec/0x1f8
[c000000012a77d70] [c0000000001cc3b0] .SyS_write+0x58/0xa0
[c000000012a77e30] [c00000000000a064] syscall_exit+0x0/0x98

Cc: stable@vger.kernel.org
Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/platforms/pseries/hotplug-memory.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 7995135170a3..24abc5c223c7 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -146,7 +146,7 @@ static inline int pseries_remove_memblock(unsigned long base,
 }
 static inline int pseries_remove_mem_node(struct device_node *np)
 {
-	return -EOPNOTSUPP;
+	return 0;
 }
 #endif /* CONFIG_MEMORY_HOTREMOVE */
 
-- 
cgit 


From 5efbabe09d986f25c02d19954660238fcd7f008a Mon Sep 17 00:00:00 2001
From: Gavin Shan <gwshan@linux.vnet.ibm.com>
Date: Mon, 11 Aug 2014 19:16:20 +1000
Subject: powerpc/pseries: Avoid deadlock on removing ddw

Function remove_ddw() could be called in of_reconfig_notifier and
we potentially remove the dynamic DMA window property, which invokes
of_reconfig_notifier again. Eventually, it leads to the deadlock as
following backtrace shows.

The patch fixes the above issue by deferring releasing the dynamic
DMA window property while releasing the device node.

=============================================
[ INFO: possible recursive locking detected ]
3.16.0+ #428 Tainted: G        W
---------------------------------------------
drmgr/2273 is trying to acquire lock:
 ((of_reconfig_chain).rwsem){.+.+..}, at: [<c000000000091890>] \
 .__blocking_notifier_call_chain+0x40/0x78

but task is already holding lock:
 ((of_reconfig_chain).rwsem){.+.+..}, at: [<c000000000091890>] \
 .__blocking_notifier_call_chain+0x40/0x78

other info that might help us debug this:
 Possible unsafe locking scenario:

       CPU0
       ----
  lock((of_reconfig_chain).rwsem);
  lock((of_reconfig_chain).rwsem);
 *** DEADLOCK ***

 May be due to missing lock nesting notation

2 locks held by drmgr/2273:
 #0:  (sb_writers#4){.+.+.+}, at: [<c0000000001cbe70>] \
      .vfs_write+0xb0/0x1f8
 #1:  ((of_reconfig_chain).rwsem){.+.+..}, at: [<c000000000091890>] \
      .__blocking_notifier_call_chain+0x40/0x78

stack backtrace:
CPU: 17 PID: 2273 Comm: drmgr Tainted: G        W     3.16.0+ #428
Call Trace:
[c0000000137e7000] [c000000000013d9c] .show_stack+0x88/0x148 (unreliable)
[c0000000137e70b0] [c00000000083cd34] .dump_stack+0x7c/0x9c
[c0000000137e7130] [c0000000000b8afc] .__lock_acquire+0x128c/0x1c68
[c0000000137e7280] [c0000000000b9a4c] .lock_acquire+0xe8/0x104
[c0000000137e7350] [c00000000083588c] .down_read+0x4c/0x90
[c0000000137e73e0] [c000000000091890] .__blocking_notifier_call_chain+0x40/0x78
[c0000000137e7490] [c000000000091900] .blocking_notifier_call_chain+0x38/0x48
[c0000000137e7520] [c000000000682a28] .of_reconfig_notify+0x34/0x5c
[c0000000137e75b0] [c000000000682a9c] .of_property_notify+0x4c/0x54
[c0000000137e7650] [c000000000682bf0] .of_remove_property+0x30/0xd4
[c0000000137e76f0] [c000000000052a44] .remove_ddw+0x144/0x168
[c0000000137e7790] [c000000000053204] .iommu_reconfig_notifier+0x30/0xe0
[c0000000137e7820] [c00000000009137c] .notifier_call_chain+0x6c/0xb4
[c0000000137e78c0] [c0000000000918ac] .__blocking_notifier_call_chain+0x5c/0x78
[c0000000137e7970] [c000000000091900] .blocking_notifier_call_chain+0x38/0x48
[c0000000137e7a00] [c000000000682a28] .of_reconfig_notify+0x34/0x5c
[c0000000137e7a90] [c000000000682e14] .of_detach_node+0x44/0x1fc
[c0000000137e7b40] [c0000000000518e4] .ofdt_write+0x3ac/0x688
[c0000000137e7c20] [c000000000238430] .proc_reg_write+0xb8/0xd4
[c0000000137e7cd0] [c0000000001cbeac] .vfs_write+0xec/0x1f8
[c0000000137e7d70] [c0000000001cc3b0] .SyS_write+0x58/0xa0
[c0000000137e7e30] [c00000000000a064] syscall_exit+0x0/0x98

Cc: stable@vger.kernel.org
Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/platforms/pseries/iommu.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 33b552ffbe57..4642d6a4d356 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -721,13 +721,13 @@ static int __init disable_ddw_setup(char *str)
 
 early_param("disable_ddw", disable_ddw_setup);
 
-static void remove_ddw(struct device_node *np)
+static void remove_ddw(struct device_node *np, bool remove_prop)
 {
 	struct dynamic_dma_window_prop *dwp;
 	struct property *win64;
 	const u32 *ddw_avail;
 	u64 liobn;
-	int len, ret;
+	int len, ret = 0;
 
 	ddw_avail = of_get_property(np, "ibm,ddw-applicable", &len);
 	win64 = of_find_property(np, DIRECT64_PROPNAME, NULL);
@@ -761,7 +761,8 @@ static void remove_ddw(struct device_node *np)
 			np->full_name, ret, ddw_avail[2], liobn);
 
 delprop:
-	ret = of_remove_property(np, win64);
+	if (remove_prop)
+		ret = of_remove_property(np, win64);
 	if (ret)
 		pr_warning("%s: failed to remove direct window property: %d\n",
 			np->full_name, ret);
@@ -805,7 +806,7 @@ static int find_existing_ddw_windows(void)
 		window = kzalloc(sizeof(*window), GFP_KERNEL);
 		if (!window || len < sizeof(struct dynamic_dma_window_prop)) {
 			kfree(window);
-			remove_ddw(pdn);
+			remove_ddw(pdn, true);
 			continue;
 		}
 
@@ -1045,7 +1046,7 @@ out_free_window:
 	kfree(window);
 
 out_clear_window:
-	remove_ddw(pdn);
+	remove_ddw(pdn, true);
 
 out_free_prop:
 	kfree(win64->name);
@@ -1255,7 +1256,14 @@ static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long acti
 
 	switch (action) {
 	case OF_RECONFIG_DETACH_NODE:
-		remove_ddw(np);
+		/*
+		 * Removing the property will invoke the reconfig
+		 * notifier again, which causes dead-lock on the
+		 * read-write semaphore of the notifier chain. So
+		 * we have to remove the property when releasing
+		 * the device node.
+		 */
+		remove_ddw(np, false);
 		if (pci && pci->iommu_table)
 			iommu_free_table(pci->iommu_table, np->full_name);
 
-- 
cgit 


From 56758e3c3ca7e3d8960e0e8f881e80adf119f4fe Mon Sep 17 00:00:00 2001
From: Nishanth Aravamudan <nacc@linux.vnet.ibm.com>
Date: Mon, 11 Aug 2014 16:43:18 -0700
Subject: powerpc: remove duplicate definition of TEXASR_FS

It appears that commits 7f06f21d40a6 ("powerpc/tm: Add checking to
treclaim/trechkpt") and e4e38121507a ("KVM: PPC: Book3S HV: Add
transactional memory support") both added definitions of TEXASR_FS.
Remove one of them. At the same time, fix the alignment of the remaining
definition (should be tab-separated like the rest of the #defines).

Signed-off-by: Nishanth Aravamudan <nacc@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/reg.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 1c987bf794ef..0c0505956a29 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -213,9 +213,8 @@
 #define SPRN_ACOP	0x1F	/* Available Coprocessor Register */
 #define SPRN_TFIAR	0x81	/* Transaction Failure Inst Addr   */
 #define SPRN_TEXASR	0x82	/* Transaction EXception & Summary */
-#define   TEXASR_FS	__MASK(63-36)	/* Transaction Failure Summary */
 #define SPRN_TEXASRU	0x83	/* ''	   ''	   ''	 Upper 32  */
-#define   TEXASR_FS     __MASK(63-36) /* TEXASR Failure Summary */
+#define   TEXASR_FS	__MASK(63-36) /* TEXASR Failure Summary */
 #define SPRN_TFHAR	0x80	/* Transaction Failure Handler Addr */
 #define SPRN_CTRLF	0x088
 #define SPRN_CTRLT	0x098
-- 
cgit 


From a71d64b4dc4067808549935583d984c7fc9ea647 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Tue, 5 Aug 2014 14:55:00 +1000
Subject: powerpc: Hard disable interrupts in xmon

xmon only soft disables interrupts. This seems like a bad idea - we
certainly don't want decrementer and PMU exceptions going off when
we are debugging something inside xmon.

This issue was uncovered when the hard lockup detector went off
inside xmon. To ensure we wont get a spurious hard lockup warning,
I also call touch_nmi_watchdog() when exiting xmon.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/xmon/xmon.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch')

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 8d198b5e9e0a..b988b5addf86 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -24,6 +24,7 @@
 #include <linux/interrupt.h>
 #include <linux/irq.h>
 #include <linux/bug.h>
+#include <linux/nmi.h>
 
 #include <asm/ptrace.h>
 #include <asm/string.h>
@@ -374,6 +375,7 @@ static int xmon_core(struct pt_regs *regs, int fromipi)
 #endif
 
 	local_irq_save(flags);
+	hard_irq_disable();
 
 	bp = in_breakpoint_table(regs->nip, &offset);
 	if (bp != NULL) {
@@ -558,6 +560,7 @@ static int xmon_core(struct pt_regs *regs, int fromipi)
 #endif
 	insert_cpu_bpts();
 
+	touch_nmi_watchdog();
 	local_irq_restore(flags);
 
 	return cmd != 'X' && cmd != EOF;
-- 
cgit 


From 587870e8650a0571e895cc879cd895c78c6391bf Mon Sep 17 00:00:00 2001
From: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
Date: Tue, 5 Aug 2014 16:42:39 -0500
Subject: powerpc/pseries/hvcserver: Fix endian issue in hvcs_get_partner_info

A buffer returned by H_VTERM_PARTNER_INFO contains device information
in big endian format, causing problems for little endian architectures.
This patch ensures that they are in cpu endian.

Signed-off-by: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/platforms/pseries/hvcserver.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/platforms/pseries/hvcserver.c b/arch/powerpc/platforms/pseries/hvcserver.c
index 4557e91626c4..eedb64594dc5 100644
--- a/arch/powerpc/platforms/pseries/hvcserver.c
+++ b/arch/powerpc/platforms/pseries/hvcserver.c
@@ -163,8 +163,8 @@ int hvcs_get_partner_info(uint32_t unit_address, struct list_head *head,
 			return retval;
 		}
 
-		last_p_partition_ID = pi_buff[0];
-		last_p_unit_address = pi_buff[1];
+		last_p_partition_ID = be64_to_cpu(pi_buff[0]);
+		last_p_unit_address = be64_to_cpu(pi_buff[1]);
 
 		/* This indicates that there are no further partners */
 		if (last_p_partition_ID == ~0UL
-- 
cgit 


From d6589722846a57a4ddf7af595a7f854ff5180950 Mon Sep 17 00:00:00 2001
From: Himangi Saraogi <himangi774@gmail.com>
Date: Tue, 22 Jul 2014 23:40:19 +0530
Subject: powerpc/perf/hv-24x7: Use kmem_cache_free

Free memory allocated using kmem_cache_zalloc using kmem_cache_free
rather than kfree.

The Coccinelle semantic patch that makes this change is as follows:

// <smpl>
@@
expression x,E,c;
@@

 x = \(kmem_cache_alloc\|kmem_cache_zalloc\|kmem_cache_alloc_node\)(c,...)
 ... when != x = E
     when != &x
?-kfree(x)
+kmem_cache_free(c,x)
// </smpl>

Signed-off-by: Himangi Saraogi <himangi774@gmail.com>
Acked-by: Julia Lawall <julia.lawall@lip6.fr>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/perf/hv-24x7.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c
index 66d0f179650f..70d4f748b54b 100644
--- a/arch/powerpc/perf/hv-24x7.c
+++ b/arch/powerpc/perf/hv-24x7.c
@@ -223,7 +223,7 @@ e_free:
 		pr_err("h_get_24x7_catalog_page(ver=%lld, page=%lld) failed:"
 		       " rc=%ld\n",
 		       catalog_version_num, page_offset, hret);
-	kfree(page);
+	kmem_cache_free(hv_page_cache, page);
 
 	pr_devel("catalog_read: offset=%lld(%lld) count=%zu(%zu) catalog_len=%zu(%zu) => %zd\n",
 			offset, page_offset, count, page_count, catalog_len,
-- 
cgit 


From 2fabf084b6ad6337675d700b159a6091023544f2 Mon Sep 17 00:00:00 2001
From: Nishanth Aravamudan <nacc@linux.vnet.ibm.com>
Date: Thu, 17 Jul 2014 16:15:12 -0700
Subject: powerpc: reorder per-cpu NUMA information's initialization

There is an issue currently where NUMA information is used on powerpc
(and possibly ia64) before it has been read from the device-tree, which
leads to large slab consumption with CONFIG_SLUB and memoryless nodes.

NUMA powerpc non-boot CPU's cpu_to_node/cpu_to_mem is only accurate
after start_secondary(), similar to ia64, which is invoked via
smp_init().

Commit 6ee0578b4daae ("workqueue: mark init_workqueues() as
early_initcall()") made init_workqueues() be invoked via
do_pre_smp_initcalls(), which is obviously before the secondary
processors are online.

Additionally, the following commits changed init_workqueues() to use
cpu_to_node to determine the node to use for kthread_create_on_node:

bce903809ab3f ("workqueue: add wq_numa_tbl_len and
wq_numa_possible_cpumask[]")
f3f90ad469342 ("workqueue: determine NUMA node of workers accourding to
the allowed cpumask")

Therefore, when init_workqueues() runs, it sees all CPUs as being on
Node 0. On LPARs or KVM guests where Node 0 is memoryless, this leads to
a high number of slab deactivations
(http://www.spinics.net/lists/linux-mm/msg67489.html).

Fix this by initializing the powerpc-specific CPU<->node/local memory
node mapping as early as possible, which on powerpc is
do_init_bootmem(). Currently that function initializes the mapping for
the boot CPU, but we extend it to setup the mapping for all possible
CPUs. Then, in smp_prepare_cpus(), we can correspondingly set the
per-cpu values for all possible CPUs. That ensures that before the
early_initcalls run (and really as early as possible), the per-cpu NUMA
mapping is accurate.

While testing memoryless nodes on PowerKVM guests with a fix to the
workqueue logic to use cpu_to_mem() instead of cpu_to_node(), with a
guest topology of:

available: 2 nodes (0-1)
node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49
node 0 size: 0 MB
node 0 free: 0 MB
node 1 cpus: 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99
node 1 size: 16336 MB
node 1 free: 15329 MB
node distances:
node   0   1
  0:  10  40
  1:  40  10

the slab consumption decreases from

Slab:             932416 kB
SUnreclaim:       902336 kB

to

Slab:             395264 kB
SUnreclaim:       359424 kB

And we a corresponding increase in the slab efficiency from

slab                                   mem     objs    slabs
                                      used   active   active
------------------------------------------------------------
kmalloc-16384                       337 MB   11.28%  100.00%
task_struct                         288 MB    9.93%  100.00%

to

slab                                   mem     objs    slabs
                                      used   active   active
------------------------------------------------------------
kmalloc-16384                        37 MB  100.00%  100.00%
task_struct                          31 MB  100.00%  100.00%

Powerpc didn't support memoryless nodes until recently (64bb80d87f01
"powerpc/numa: Enable CONFIG_HAVE_MEMORYLESS_NODES" and 8c272261194d
"powerpc/numa: Enable USE_PERCPU_NUMA_NODE_ID"). Those commits also
helped improve memory consumption with these kind of environments.

Signed-off-by: Nishanth Aravamudan <nacc@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/kernel/smp.c | 11 +++++------
 arch/powerpc/mm/numa.c    | 13 ++++++++++---
 2 files changed, 15 insertions(+), 9 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 1007fb802e6b..a0738af4aba6 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -376,6 +376,11 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 					GFP_KERNEL, cpu_to_node(cpu));
 		zalloc_cpumask_var_node(&per_cpu(cpu_core_map, cpu),
 					GFP_KERNEL, cpu_to_node(cpu));
+		/*
+		 * numa_node_id() works after this.
+		 */
+		set_cpu_numa_node(cpu, numa_cpu_lookup_table[cpu]);
+		set_cpu_numa_mem(cpu, local_memory_node(numa_cpu_lookup_table[cpu]));
 	}
 
 	cpumask_set_cpu(boot_cpuid, cpu_sibling_mask(boot_cpuid));
@@ -723,12 +728,6 @@ void start_secondary(void *unused)
 	}
 	traverse_core_siblings(cpu, true);
 
-	/*
-	 * numa_node_id() works after this.
-	 */
-	set_numa_node(numa_cpu_lookup_table[cpu]);
-	set_numa_mem(local_memory_node(numa_cpu_lookup_table[cpu]));
-
 	smp_wmb();
 	notify_cpu_starting(cpu);
 	set_cpu_online(cpu, true);
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index d3e9a78eaed3..d7737a542fd7 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1049,7 +1049,7 @@ static void __init mark_reserved_regions_for_nid(int nid)
 
 void __init do_init_bootmem(void)
 {
-	int nid;
+	int nid, cpu;
 
 	min_low_pfn = 0;
 	max_low_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT;
@@ -1122,8 +1122,15 @@ void __init do_init_bootmem(void)
 
 	reset_numa_cpu_lookup_table();
 	register_cpu_notifier(&ppc64_numa_nb);
-	cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE,
-			  (void *)(unsigned long)boot_cpuid);
+	/*
+	 * We need the numa_cpu_lookup_table to be accurate for all CPUs,
+	 * even before we online them, so that we can use cpu_to_{node,mem}
+	 * early in boot, cf. smp_prepare_cpus().
+	 */
+	for_each_possible_cpu(cpu) {
+		cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE,
+				  (void *)(unsigned long)cpu);
+	}
 }
 
 void __init paging_init(void)
-- 
cgit 


From b0aa44a3dfae3d8f45bd1264349aa87f87b7774f Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 13 Aug 2014 12:31:57 +0530
Subject: powerpc/thp: Add write barrier after updating the valid bit

With hugepages, we store the hpte valid information in the pte page
whose address is stored in the second half of the PMD. Use a
write barrier to make sure clearing pmd busy bit and updating
hpte valid info are ordered properly.

CC: <stable@vger.kernel.org>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/mm/hugepage-hash64.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c
index 826893fcb3a7..11f9a37ca2c6 100644
--- a/arch/powerpc/mm/hugepage-hash64.c
+++ b/arch/powerpc/mm/hugepage-hash64.c
@@ -172,8 +172,11 @@ repeat:
 		mark_hpte_slot_valid(hpte_slot_array, index, slot);
 	}
 	/*
-	 * No need to use ldarx/stdcx here
+	 * The hpte valid is stored in the pgtable whose address is in the
+	 * second half of the PMD. Order this against clearing of the busy bit in
+	 * huge pmd.
 	 */
+	smp_wmb();
 	*pmdp = __pmd(new_pmd & ~_PAGE_BUSY);
 	return 0;
 }
-- 
cgit 


From fa1f8ae80f8bb996594167ff4750a0b0a5a5bb5d Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 13 Aug 2014 12:31:58 +0530
Subject: powerpc/thp: Don't recompute vsid and ssize in loop on invalidate

The segment identifier and segment size will remain the same in
the loop, So we can compute it outside. We also change the
hugepage_invalidate interface so that we can use it the later patch

CC: <stable@vger.kernel.org>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/machdep.h    |  6 +++---
 arch/powerpc/mm/hash_native_64.c      | 19 +++++--------------
 arch/powerpc/mm/pgtable_64.c          | 24 ++++++++++++------------
 arch/powerpc/platforms/pseries/lpar.c | 20 ++++++--------------
 4 files changed, 26 insertions(+), 43 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h
index 44e90516519b..b125ceab149c 100644
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -57,10 +57,10 @@ struct machdep_calls {
 	void            (*hpte_removebolted)(unsigned long ea,
 					     int psize, int ssize);
 	void		(*flush_hash_range)(unsigned long number, int local);
-	void		(*hugepage_invalidate)(struct mm_struct *mm,
+	void		(*hugepage_invalidate)(unsigned long vsid,
+					       unsigned long addr,
 					       unsigned char *hpte_slot_array,
-					       unsigned long addr, int psize);
-
+					       int psize, int ssize);
 	/* special for kexec, to be called in real mode, linear mapping is
 	 * destroyed as well */
 	void		(*hpte_clear_all)(void);
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index cf1d325eae8b..fb89d7695a9a 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -412,18 +412,18 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
 	local_irq_restore(flags);
 }
 
-static void native_hugepage_invalidate(struct mm_struct *mm,
+static void native_hugepage_invalidate(unsigned long vsid,
+				       unsigned long addr,
 				       unsigned char *hpte_slot_array,
-				       unsigned long addr, int psize)
+				       int psize, int ssize)
 {
-	int ssize = 0, i;
-	int lock_tlbie;
+	int i, lock_tlbie;
 	struct hash_pte *hptep;
 	int actual_psize = MMU_PAGE_16M;
 	unsigned int max_hpte_count, valid;
 	unsigned long flags, s_addr = addr;
 	unsigned long hpte_v, want_v, shift;
-	unsigned long hidx, vpn = 0, vsid, hash, slot;
+	unsigned long hidx, vpn = 0, hash, slot;
 
 	shift = mmu_psize_defs[psize].shift;
 	max_hpte_count = 1U << (PMD_SHIFT - shift);
@@ -437,15 +437,6 @@ static void native_hugepage_invalidate(struct mm_struct *mm,
 
 		/* get the vpn */
 		addr = s_addr + (i * (1ul << shift));
-		if (!is_kernel_addr(addr)) {
-			ssize = user_segment_size(addr);
-			vsid = get_vsid(mm->context.id, addr, ssize);
-			WARN_ON(vsid == 0);
-		} else {
-			vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
-			ssize = mmu_kernel_ssize;
-		}
-
 		vpn = hpt_vpn(addr, vsid, ssize);
 		hash = hpt_hash(vpn, shift, ssize);
 		if (hidx & _PTEIDX_SECONDARY)
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 3b3c4d34c7a0..5039f3b04d6e 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -745,12 +745,21 @@ void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
 	if (!hpte_slot_array)
 		return;
 
-	/* get the base page size */
+	/* get the base page size,vsid and segment size */
 	psize = get_slice_psize(mm, s_addr);
+	if (!is_kernel_addr(s_addr)) {
+		ssize = user_segment_size(s_addr);
+		vsid = get_vsid(mm->context.id, s_addr, ssize);
+		WARN_ON(vsid == 0);
+	} else {
+		vsid = get_kernel_vsid(s_addr, mmu_kernel_ssize);
+		ssize = mmu_kernel_ssize;
+	}
 
 	if (ppc_md.hugepage_invalidate)
-		return ppc_md.hugepage_invalidate(mm, hpte_slot_array,
-						  s_addr, psize);
+		return ppc_md.hugepage_invalidate(vsid, s_addr,
+						  hpte_slot_array,
+						  psize, ssize);
 	/*
 	 * No bluk hpte removal support, invalidate each entry
 	 */
@@ -768,15 +777,6 @@ void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
 
 		/* get the vpn */
 		addr = s_addr + (i * (1ul << shift));
-		if (!is_kernel_addr(addr)) {
-			ssize = user_segment_size(addr);
-			vsid = get_vsid(mm->context.id, addr, ssize);
-			WARN_ON(vsid == 0);
-		} else {
-			vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
-			ssize = mmu_kernel_ssize;
-		}
-
 		vpn = hpt_vpn(addr, vsid, ssize);
 		hash = hpt_hash(vpn, shift, ssize);
 		if (hidx & _PTEIDX_SECONDARY)
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index fbfcef514aa7..34e64237fff9 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -431,16 +431,17 @@ static void __pSeries_lpar_hugepage_invalidate(unsigned long *slot,
 		spin_unlock_irqrestore(&pSeries_lpar_tlbie_lock, flags);
 }
 
-static void pSeries_lpar_hugepage_invalidate(struct mm_struct *mm,
-				       unsigned char *hpte_slot_array,
-				       unsigned long addr, int psize)
+static void pSeries_lpar_hugepage_invalidate(unsigned long vsid,
+					     unsigned long addr,
+					     unsigned char *hpte_slot_array,
+					     int psize, int ssize)
 {
-	int ssize = 0, i, index = 0;
+	int i, index = 0;
 	unsigned long s_addr = addr;
 	unsigned int max_hpte_count, valid;
 	unsigned long vpn_array[PPC64_HUGE_HPTE_BATCH];
 	unsigned long slot_array[PPC64_HUGE_HPTE_BATCH];
-	unsigned long shift, hidx, vpn = 0, vsid, hash, slot;
+	unsigned long shift, hidx, vpn = 0, hash, slot;
 
 	shift = mmu_psize_defs[psize].shift;
 	max_hpte_count = 1U << (PMD_SHIFT - shift);
@@ -453,15 +454,6 @@ static void pSeries_lpar_hugepage_invalidate(struct mm_struct *mm,
 
 		/* get the vpn */
 		addr = s_addr + (i * (1ul << shift));
-		if (!is_kernel_addr(addr)) {
-			ssize = user_segment_size(addr);
-			vsid = get_vsid(mm->context.id, addr, ssize);
-			WARN_ON(vsid == 0);
-		} else {
-			vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
-			ssize = mmu_kernel_ssize;
-		}
-
 		vpn = hpt_vpn(addr, vsid, ssize);
 		hash = hpt_hash(vpn, shift, ssize);
 		if (hidx & _PTEIDX_SECONDARY)
-- 
cgit 


From 629149fae478f0ac6bf705a535708b192e9c6b59 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 13 Aug 2014 12:31:59 +0530
Subject: powerpc/thp: Invalidate old 64K based hash page mapping before insert
 of 4k pte

If we changed base page size of the segment, either via sub_page_protect
or via remap_4k_pfn, we do a demote_segment which doesn't flush the hash
table entries. We do a lazy hash page table flush for all mapped pages
in the demoted segment. This happens when we handle hash page fault
for these pages.

We use _PAGE_COMBO bit along with _PAGE_HASHPTE to indicate whether a
pte is backed by 4K hash pte. If we find _PAGE_COMBO not set on the pte,
that implies that we could possibly have older 64K hash pte entries in
the hash page table and we need to invalidate those entries.

Handle this correctly for 16M pages

CC: <stable@vger.kernel.org>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/mm/hugepage-hash64.c | 79 ++++++++++++++++++++++++++++++++++-----
 1 file changed, 70 insertions(+), 9 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c
index 11f9a37ca2c6..1fb609dcc49b 100644
--- a/arch/powerpc/mm/hugepage-hash64.c
+++ b/arch/powerpc/mm/hugepage-hash64.c
@@ -18,6 +18,57 @@
 #include <linux/mm.h>
 #include <asm/machdep.h>
 
+static void invalidate_old_hpte(unsigned long vsid, unsigned long addr,
+				pmd_t *pmdp, unsigned int psize, int ssize)
+{
+	int i, max_hpte_count, valid;
+	unsigned long s_addr;
+	unsigned char *hpte_slot_array;
+	unsigned long hidx, shift, vpn, hash, slot;
+
+	s_addr = addr & HPAGE_PMD_MASK;
+	hpte_slot_array = get_hpte_slot_array(pmdp);
+	/*
+	 * IF we try to do a HUGE PTE update after a withdraw is done.
+	 * we will find the below NULL. This happens when we do
+	 * split_huge_page_pmd
+	 */
+	if (!hpte_slot_array)
+		return;
+
+	if (ppc_md.hugepage_invalidate)
+		return ppc_md.hugepage_invalidate(vsid, s_addr, hpte_slot_array,
+						  psize, ssize);
+	/*
+	 * No bluk hpte removal support, invalidate each entry
+	 */
+	shift = mmu_psize_defs[psize].shift;
+	max_hpte_count = HPAGE_PMD_SIZE >> shift;
+	for (i = 0; i < max_hpte_count; i++) {
+		/*
+		 * 8 bits per each hpte entries
+		 * 000| [ secondary group (one bit) | hidx (3 bits) | valid bit]
+		 */
+		valid = hpte_valid(hpte_slot_array, i);
+		if (!valid)
+			continue;
+		hidx =  hpte_hash_index(hpte_slot_array, i);
+
+		/* get the vpn */
+		addr = s_addr + (i * (1ul << shift));
+		vpn = hpt_vpn(addr, vsid, ssize);
+		hash = hpt_hash(vpn, shift, ssize);
+		if (hidx & _PTEIDX_SECONDARY)
+			hash = ~hash;
+
+		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+		slot += hidx & _PTEIDX_GROUP_IX;
+		ppc_md.hpte_invalidate(slot, vpn, psize,
+				       MMU_PAGE_16M, ssize, 0);
+	}
+}
+
+
 int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
 		    pmd_t *pmdp, unsigned long trap, int local, int ssize,
 		    unsigned int psize)
@@ -85,6 +136,15 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
 	vpn = hpt_vpn(ea, vsid, ssize);
 	hash = hpt_hash(vpn, shift, ssize);
 	hpte_slot_array = get_hpte_slot_array(pmdp);
+	if (psize == MMU_PAGE_4K) {
+		/*
+		 * invalidate the old hpte entry if we have that mapped via 64K
+		 * base page size. This is because demote_segment won't flush
+		 * hash page table entries.
+		 */
+		if ((old_pmd & _PAGE_HASHPTE) && !(old_pmd & _PAGE_COMBO))
+			invalidate_old_hpte(vsid, ea, pmdp, MMU_PAGE_64K, ssize);
+	}
 
 	valid = hpte_valid(hpte_slot_array, index);
 	if (valid) {
@@ -107,11 +167,8 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
 			 * safely update this here.
 			 */
 			valid = 0;
-			new_pmd &= ~_PAGE_HPTEFLAGS;
 			hpte_slot_array[index] = 0;
-		} else
-			/* clear the busy bits and set the hash pte bits */
-			new_pmd = (new_pmd & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
+		}
 	}
 
 	if (!valid) {
@@ -119,11 +176,7 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
 
 		/* insert new entry */
 		pa = pmd_pfn(__pmd(old_pmd)) << PAGE_SHIFT;
-repeat:
-		hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
-
-		/* clear the busy bits and set the hash pte bits */
-		new_pmd = (new_pmd & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
+		new_pmd |= _PAGE_HASHPTE;
 
 		/* Add in WIMG bits */
 		rflags |= (new_pmd & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
@@ -132,6 +185,8 @@ repeat:
 		 * enable the memory coherence always
 		 */
 		rflags |= HPTE_R_M;
+repeat:
+		hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
 
 		/* Insert into the hash table, primary slot */
 		slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, 0,
@@ -171,6 +226,12 @@ repeat:
 		 */
 		mark_hpte_slot_valid(hpte_slot_array, index, slot);
 	}
+	/*
+	 * Mark the pte with _PAGE_COMBO, if we are trying to hash it with
+	 * base page size 4k.
+	 */
+	if (psize == MMU_PAGE_4K)
+		new_pmd |= _PAGE_COMBO;
 	/*
 	 * The hpte valid is stored in the pgtable whose address is in the
 	 * second half of the PMD. Order this against clearing of the busy bit in
-- 
cgit 


From fc0479557572375100ef16c71170b29a98e0d69a Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 13 Aug 2014 12:32:00 +0530
Subject: powerpc/thp: Handle combo pages in invalidate

If we changed base page size of the segment, either via sub_page_protect
or via remap_4k_pfn, we do a demote_segment which doesn't flush the hash
table entries. We do a lazy hash page table flush for all mapped pages
in the demoted segment. This happens when we handle hash page fault for
these pages.

We use _PAGE_COMBO bit along with _PAGE_HASHPTE to indicate whether a
pte is backed by 4K hash pte. If we find _PAGE_COMBO not set on the pte,
that implies that we could possibly have older 64K hash pte entries in
the hash page table and we need to invalidate those entries.

Use _PAGE_COMBO to determine the page size with which we should
invalidate the hash table entries on unmap.

CC: <stable@vger.kernel.org>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/pgtable-ppc64.h |  2 +-
 arch/powerpc/mm/pgtable_64.c             | 14 +++++++++++---
 arch/powerpc/mm/tlb_hash64.c             |  2 +-
 3 files changed, 13 insertions(+), 5 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h
index eb9261024f51..7b3d54fae46f 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -413,7 +413,7 @@ static inline char *get_hpte_slot_array(pmd_t *pmdp)
 }
 
 extern void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
-				   pmd_t *pmdp);
+				   pmd_t *pmdp, unsigned long old_pmd);
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 extern pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot);
 extern pmd_t mk_pmd(struct page *page, pgprot_t pgprot);
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 5039f3b04d6e..948a81e02ddb 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -538,7 +538,7 @@ unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
 	*pmdp = __pmd((old & ~clr) | set);
 #endif
 	if (old & _PAGE_HASHPTE)
-		hpte_do_hugepage_flush(mm, addr, pmdp);
+		hpte_do_hugepage_flush(mm, addr, pmdp, old);
 	return old;
 }
 
@@ -645,7 +645,7 @@ void pmdp_splitting_flush(struct vm_area_struct *vma,
 	if (!(old & _PAGE_SPLITTING)) {
 		/* We need to flush the hpte */
 		if (old & _PAGE_HASHPTE)
-			hpte_do_hugepage_flush(vma->vm_mm, address, pmdp);
+			hpte_do_hugepage_flush(vma->vm_mm, address, pmdp, old);
 	}
 	/*
 	 * This ensures that generic code that rely on IRQ disabling
@@ -723,7 +723,7 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
  * neesd to be flushed.
  */
 void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
-			    pmd_t *pmdp)
+			    pmd_t *pmdp, unsigned long old_pmd)
 {
 	int ssize, i;
 	unsigned long s_addr;
@@ -746,7 +746,15 @@ void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
 		return;
 
 	/* get the base page size,vsid and segment size */
+#ifdef CONFIG_DEBUG_VM
 	psize = get_slice_psize(mm, s_addr);
+	BUG_ON(psize == MMU_PAGE_16M);
+#endif
+	if (old_pmd & _PAGE_COMBO)
+		psize = MMU_PAGE_4K;
+	else
+		psize = MMU_PAGE_64K;
+
 	if (!is_kernel_addr(s_addr)) {
 		ssize = user_segment_size(s_addr);
 		vsid = get_vsid(mm->context.id, s_addr, ssize);
diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c
index c99f6510a0b2..9adda5790463 100644
--- a/arch/powerpc/mm/tlb_hash64.c
+++ b/arch/powerpc/mm/tlb_hash64.c
@@ -216,7 +216,7 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
 		if (!(pte & _PAGE_HASHPTE))
 			continue;
 		if (unlikely(hugepage_shift && pmd_trans_huge(*(pmd_t *)pte)))
-			hpte_do_hugepage_flush(mm, start, (pmd_t *)pte);
+			hpte_do_hugepage_flush(mm, start, (pmd_t *)ptep, pte);
 		else
 			hpte_need_flush(mm, start, ptep, pte, 0);
 	}
-- 
cgit 


From 969b7b208f7408712a3526856e4ae60ad13f6928 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 13 Aug 2014 12:32:01 +0530
Subject: powerpc/thp: Invalidate with vpn in loop

As per ISA, for 4k base page size we compare 14..65 bits of VA specified
with the entry_VA in tlb. That implies we need to make sure we do a
tlbie with all the possible 4k va we used to access the 16MB hugepage.
With 64k base page size we compare 14..57 bits of VA. Hence we cannot
ignore the lower 24 bits of va while tlbie .We also cannot tlb
invalidate a 16MB entry with just one tlbie instruction because
we don't track which va was used to instantiate the tlb entry.

CC: <stable@vger.kernel.org>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/mm/hash_native_64.c | 23 +++++++----------------
 1 file changed, 7 insertions(+), 16 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index fb89d7695a9a..afc0a8295f84 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -417,7 +417,7 @@ static void native_hugepage_invalidate(unsigned long vsid,
 				       unsigned char *hpte_slot_array,
 				       int psize, int ssize)
 {
-	int i, lock_tlbie;
+	int i;
 	struct hash_pte *hptep;
 	int actual_psize = MMU_PAGE_16M;
 	unsigned int max_hpte_count, valid;
@@ -456,22 +456,13 @@ static void native_hugepage_invalidate(unsigned long vsid,
 		else
 			/* Invalidate the hpte. NOTE: this also unlocks it */
 			hptep->v = 0;
+		/*
+		 * We need to do tlb invalidate for all the address, tlbie
+		 * instruction compares entry_VA in tlb with the VA specified
+		 * here
+		 */
+		tlbie(vpn, psize, actual_psize, ssize, 0);
 	}
-	/*
-	 * Since this is a hugepage, we just need a single tlbie.
-	 * use the last vpn.
-	 */
-	lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
-	if (lock_tlbie)
-		raw_spin_lock(&native_tlbie_lock);
-
-	asm volatile("ptesync":::"memory");
-	__tlbie(vpn, psize, actual_psize, ssize);
-	asm volatile("eieio; tlbsync; ptesync":::"memory");
-
-	if (lock_tlbie)
-		raw_spin_unlock(&native_tlbie_lock);
-
 	local_irq_restore(flags);
 }
 
-- 
cgit 


From 7e467245bf5226db34c4b12d3cbacfa2f7a15a8b Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 13 Aug 2014 12:32:02 +0530
Subject: powerpc/thp: Use ACCESS_ONCE when loading pmdp

We would get wrong results in compiler recomputed old_pmd. Avoid
that by using ACCESS_ONCE

CC: <stable@vger.kernel.org>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/mm/hugepage-hash64.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c
index 1fb609dcc49b..5f5e6328c21c 100644
--- a/arch/powerpc/mm/hugepage-hash64.c
+++ b/arch/powerpc/mm/hugepage-hash64.c
@@ -84,7 +84,9 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
 	 * atomically mark the linux large page PMD busy and dirty
 	 */
 	do {
-		old_pmd = pmd_val(*pmdp);
+		pmd_t pmd = ACCESS_ONCE(*pmdp);
+
+		old_pmd = pmd_val(pmd);
 		/* If PMD busy, retry the access */
 		if (unlikely(old_pmd & _PAGE_BUSY))
 			return 0;
-- 
cgit 


From 85c1fafd7262e68ad821ee1808686b1392b1167d Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 13 Aug 2014 12:32:03 +0530
Subject: powerpc/mm: Use read barrier when creating real_pte

On ppc64 we support 4K hash pte with 64K page size. That requires
us to track the hash pte slot information on a per 4k basis. We do that
by storing the slot details in the second half of pte page. The pte bit
_PAGE_COMBO is used to indicate whether the second half need to be
looked while building real_pte. We need to use read memory barrier while
doing that so that load of hidx is not reordered w.r.t _PAGE_COMBO
check. On the store side we already do a lwsync in __hash_page_4K

CC: <stable@vger.kernel.org>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/pte-hash64-64k.h | 30 +++++++++++++++++++++++++-----
 1 file changed, 25 insertions(+), 5 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/pte-hash64-64k.h b/arch/powerpc/include/asm/pte-hash64-64k.h
index b6d2d42f84b5..4f4ec2ab45c9 100644
--- a/arch/powerpc/include/asm/pte-hash64-64k.h
+++ b/arch/powerpc/include/asm/pte-hash64-64k.h
@@ -46,11 +46,31 @@
  * in order to deal with 64K made of 4K HW pages. Thus we override the
  * generic accessors and iterators here
  */
-#define __real_pte(e,p) 	((real_pte_t) { \
-			(e), (pte_val(e) & _PAGE_COMBO) ? \
-				(pte_val(*((p) + PTRS_PER_PTE))) : 0 })
-#define __rpte_to_hidx(r,index)	((pte_val((r).pte) & _PAGE_COMBO) ? \
-        (((r).hidx >> ((index)<<2)) & 0xf) : ((pte_val((r).pte) >> 12) & 0xf))
+#define __real_pte __real_pte
+static inline real_pte_t __real_pte(pte_t pte, pte_t *ptep)
+{
+	real_pte_t rpte;
+
+	rpte.pte = pte;
+	rpte.hidx = 0;
+	if (pte_val(pte) & _PAGE_COMBO) {
+		/*
+		 * Make sure we order the hidx load against the _PAGE_COMBO
+		 * check. The store side ordering is done in __hash_page_4K
+		 */
+		smp_rmb();
+		rpte.hidx = pte_val(*((ptep) + PTRS_PER_PTE));
+	}
+	return rpte;
+}
+
+static inline unsigned long __rpte_to_hidx(real_pte_t rpte, unsigned long index)
+{
+	if ((pte_val(rpte.pte) & _PAGE_COMBO))
+		return (rpte.hidx >> (index<<2)) & 0xf;
+	return (pte_val(rpte.pte) >> 12) & 0xf;
+}
+
 #define __rpte_to_pte(r)	((r).pte)
 #define __rpte_sub_valid(rpte, index) \
 	(pte_val(rpte.pte) & (_PAGE_HPTE_SUB0 >> (index)))
-- 
cgit 


From 9e813308a5c18c58f9ccae1ec72ed4e14eaf9025 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 13 Aug 2014 12:32:04 +0530
Subject: powerpc/thp: Add tracepoints to track hugepage invalidate

Add tracepoint to track hugepage invalidate. This help us
in debugging difficult to track bugs.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/mm/pgtable_64.c | 6 ++++++
 arch/powerpc/mm/tlb_hash64.c | 4 ++++
 2 files changed, 10 insertions(+)

(limited to 'arch')

diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 948a81e02ddb..c8d709ab489d 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -54,6 +54,9 @@
 
 #include "mmu_decl.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/thp.h>
+
 /* Some sanity checking */
 #if TASK_SIZE_USER64 > PGTABLE_RANGE
 #error TASK_SIZE_USER64 exceeds pagetable range
@@ -537,6 +540,7 @@ unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
 	old = pmd_val(*pmdp);
 	*pmdp = __pmd((old & ~clr) | set);
 #endif
+	trace_hugepage_update(addr, old, clr, set);
 	if (old & _PAGE_HASHPTE)
 		hpte_do_hugepage_flush(mm, addr, pmdp, old);
 	return old;
@@ -642,6 +646,7 @@ void pmdp_splitting_flush(struct vm_area_struct *vma,
 	 * If we didn't had the splitting flag set, go and flush the
 	 * HPTE entries.
 	 */
+	trace_hugepage_splitting(address, old);
 	if (!(old & _PAGE_SPLITTING)) {
 		/* We need to flush the hpte */
 		if (old & _PAGE_HASHPTE)
@@ -709,6 +714,7 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr,
 	assert_spin_locked(&mm->page_table_lock);
 	WARN_ON(!pmd_trans_huge(pmd));
 #endif
+	trace_hugepage_set_pmd(addr, pmd);
 	return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
 }
 
diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c
index 9adda5790463..d2a94b85dbc2 100644
--- a/arch/powerpc/mm/tlb_hash64.c
+++ b/arch/powerpc/mm/tlb_hash64.c
@@ -30,6 +30,8 @@
 #include <asm/tlb.h>
 #include <asm/bug.h>
 
+#include <trace/events/thp.h>
+
 DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch);
 
 /*
@@ -213,6 +215,8 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
 		if (ptep == NULL)
 			continue;
 		pte = pte_val(*ptep);
+		if (hugepage_shift)
+			trace_hugepage_invalidate(start, pte_val(pte));
 		if (!(pte & _PAGE_HASHPTE))
 			continue;
 		if (unlikely(hugepage_shift && pmd_trans_huge(*(pmd_t *)pte)))
-- 
cgit