From 4c3cb6e9a9d94d1553807854a565cd27ff4c22aa Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sat, 3 Sep 2016 10:36:00 -0700
Subject: dax: fix mapping size check

pgoff_to_phys() validates that both the starting address and the length
of the mapping against the resource list.  We need to check for a
mapping size of PMD_SIZE not PAGE_SIZE in the pmd fault path.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/dax/dax.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c
index 803f3953b341..29f600f2c447 100644
--- a/drivers/dax/dax.c
+++ b/drivers/dax/dax.c
@@ -459,7 +459,7 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev,
 	}
 
 	pgoff = linear_page_index(vma, pmd_addr);
-	phys = pgoff_to_phys(dax_dev, pgoff, PAGE_SIZE);
+	phys = pgoff_to_phys(dax_dev, pgoff, PMD_SIZE);
 	if (phys == -1) {
 		dev_dbg(dev, "%s: phys_to_pgoff(%#lx) failed\n", __func__,
 				pgoff);
-- 
cgit 


From ca120cf688874f4423e579e7cc5ddf7244aeca45 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sat, 3 Sep 2016 10:38:03 -0700
Subject: mm: fix show_smap() for zone_device-pmd ranges

Attempting to dump /proc/<pid>/smaps for a process with pmd dax mappings
currently results in the following VM_BUG_ONs:

 kernel BUG at mm/huge_memory.c:1105!
 task: ffff88045f16b140 task.stack: ffff88045be14000
 RIP: 0010:[<ffffffff81268f9b>]  [<ffffffff81268f9b>] follow_trans_huge_pmd+0x2cb/0x340
 [..]
 Call Trace:
  [<ffffffff81306030>] smaps_pte_range+0xa0/0x4b0
  [<ffffffff814c2755>] ? vsnprintf+0x255/0x4c0
  [<ffffffff8123c46e>] __walk_page_range+0x1fe/0x4d0
  [<ffffffff8123c8a2>] walk_page_vma+0x62/0x80
  [<ffffffff81307656>] show_smap+0xa6/0x2b0

 kernel BUG at fs/proc/task_mmu.c:585!
 RIP: 0010:[<ffffffff81306469>]  [<ffffffff81306469>] smaps_pte_range+0x499/0x4b0
 Call Trace:
  [<ffffffff814c2795>] ? vsnprintf+0x255/0x4c0
  [<ffffffff8123c46e>] __walk_page_range+0x1fe/0x4d0
  [<ffffffff8123c8a2>] walk_page_vma+0x62/0x80
  [<ffffffff81307696>] show_smap+0xa6/0x2b0

These locations are sanity checking page flags that must be set for an
anonymous transparent huge page, but are not set for the zone_device
pages associated with dax mappings.

Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/proc/task_mmu.c | 2 ++
 mm/huge_memory.c   | 4 ++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 187d84ef9de9..f6fa99eca515 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -581,6 +581,8 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
 		mss->anonymous_thp += HPAGE_PMD_SIZE;
 	else if (PageSwapBacked(page))
 		mss->shmem_thp += HPAGE_PMD_SIZE;
+	else if (is_zone_device_page(page))
+		/* pass */;
 	else
 		VM_BUG_ON_PAGE(1, page);
 	smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd));
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 2db2112aa31e..a6abd76baa72 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1078,7 +1078,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 		goto out;
 
 	page = pmd_page(*pmd);
-	VM_BUG_ON_PAGE(!PageHead(page), page);
+	VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
 	if (flags & FOLL_TOUCH)
 		touch_pmd(vma, addr, pmd);
 	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
@@ -1116,7 +1116,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 	}
 skip_mlock:
 	page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
-	VM_BUG_ON_PAGE(!PageCompound(page), page);
+	VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);
 	if (flags & FOLL_GET)
 		get_page(page);
 
-- 
cgit 


From 9049771f7d5490a302589976984810064c83ab40 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 7 Sep 2016 08:51:21 -0700
Subject: mm: fix cache mode of dax pmd mappings

track_pfn_insert() in vmf_insert_pfn_pmd() is marking dax mappings as
uncacheable rendering them impractical for application usage.  DAX-pte
mappings are cached and the goal of establishing DAX-pmd mappings is to
attain more performance, not dramatically less (3 orders of magnitude).

track_pfn_insert() relies on a previous call to reserve_memtype() to
establish the expected page_cache_mode for the range.  While memremap()
arranges for reserve_memtype() to be called, devm_memremap_pages() does
not.  So, teach track_pfn_insert() and untrack_pfn() how to handle
tracking without a vma, and arrange for devm_memremap_pages() to
establish the write-back-cache reservation in the memtype tree.

Cc: <stable@vger.kernel.org>
Cc: Matthew Wilcox <mawilcox@microsoft.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Nilesh Choudhury <nilesh.choudhury@oracle.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reported-by: Toshi Kani <toshi.kani@hpe.com>
Reported-by: Kai Zhang <kai.ka.zhang@oracle.com>
Acked-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 arch/x86/mm/pat.c | 17 ++++++++++-------
 kernel/memremap.c |  9 +++++++++
 2 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index ecb1b69c1651..170cc4ff057b 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -927,9 +927,10 @@ int track_pfn_copy(struct vm_area_struct *vma)
 }
 
 /*
- * prot is passed in as a parameter for the new mapping. If the vma has a
- * linear pfn mapping for the entire range reserve the entire vma range with
- * single reserve_pfn_range call.
+ * prot is passed in as a parameter for the new mapping. If the vma has
+ * a linear pfn mapping for the entire range, or no vma is provided,
+ * reserve the entire pfn + size range with single reserve_pfn_range
+ * call.
  */
 int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
 		    unsigned long pfn, unsigned long addr, unsigned long size)
@@ -938,11 +939,12 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
 	enum page_cache_mode pcm;
 
 	/* reserve the whole chunk starting from paddr */
-	if (addr == vma->vm_start && size == (vma->vm_end - vma->vm_start)) {
+	if (!vma || (addr == vma->vm_start
+				&& size == (vma->vm_end - vma->vm_start))) {
 		int ret;
 
 		ret = reserve_pfn_range(paddr, size, prot, 0);
-		if (!ret)
+		if (ret == 0 && vma)
 			vma->vm_flags |= VM_PAT;
 		return ret;
 	}
@@ -997,7 +999,7 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
 	resource_size_t paddr;
 	unsigned long prot;
 
-	if (!(vma->vm_flags & VM_PAT))
+	if (vma && !(vma->vm_flags & VM_PAT))
 		return;
 
 	/* free the chunk starting from pfn or the whole chunk */
@@ -1011,7 +1013,8 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
 		size = vma->vm_end - vma->vm_start;
 	}
 	free_pfn_range(paddr, size);
-	vma->vm_flags &= ~VM_PAT;
+	if (vma)
+		vma->vm_flags &= ~VM_PAT;
 }
 
 /*
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 251d16b4cb41..b501e390bb34 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -247,6 +247,7 @@ static void devm_memremap_pages_release(struct device *dev, void *data)
 	align_start = res->start & ~(SECTION_SIZE - 1);
 	align_size = ALIGN(resource_size(res), SECTION_SIZE);
 	arch_remove_memory(align_start, align_size);
+	untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
 	pgmap_radix_release(res);
 	dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc,
 			"%s: failed to free all reserved pages\n", __func__);
@@ -282,6 +283,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
 		struct percpu_ref *ref, struct vmem_altmap *altmap)
 {
 	resource_size_t key, align_start, align_size, align_end;
+	pgprot_t pgprot = PAGE_KERNEL;
 	struct dev_pagemap *pgmap;
 	struct page_map *page_map;
 	int error, nid, is_ram;
@@ -351,6 +353,11 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
 	if (nid < 0)
 		nid = numa_mem_id();
 
+	error = track_pfn_remap(NULL, &pgprot, PHYS_PFN(align_start), 0,
+			align_size);
+	if (error)
+		goto err_pfn_remap;
+
 	error = arch_add_memory(nid, align_start, align_size, true);
 	if (error)
 		goto err_add_memory;
@@ -371,6 +378,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
 	return __va(res->start);
 
  err_add_memory:
+	untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
+ err_pfn_remap:
  err_radix:
 	pgmap_radix_release(res);
 	devres_free(page_map);
-- 
cgit 


From 2e21807d4b131dfd4a8e5c82116a85b62f28aeec Mon Sep 17 00:00:00 2001
From: Vishal Verma <vishal.l.verma@intel.com>
Date: Fri, 2 Sep 2016 17:27:30 -0600
Subject: nfit, mce: Fix SPA matching logic in MCE handler

The check for a 'pmem' type SPA in the MCE handler was inverted due to a
merge/rebase error.

Fixes: 6839a6d nfit: do an ARS scrub on hitting a latent media error
Cc: linux-acpi@vger.kernel.org
Cc: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/nfit/mce.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/acpi/nfit/mce.c b/drivers/acpi/nfit/mce.c
index 4c745bf389fe..161f91539ae6 100644
--- a/drivers/acpi/nfit/mce.c
+++ b/drivers/acpi/nfit/mce.c
@@ -42,7 +42,7 @@ static int nfit_handle_mce(struct notifier_block *nb, unsigned long val,
 		list_for_each_entry(nfit_spa, &acpi_desc->spas, list) {
 			struct acpi_nfit_system_address *spa = nfit_spa->spa;
 
-			if (nfit_spa_type(spa) == NFIT_SPA_PM)
+			if (nfit_spa_type(spa) != NFIT_SPA_PM)
 				continue;
 			/* find the spa that covers the mce addr */
 			if (spa->address > mce->addr)
-- 
cgit 


From 1e8b8d9619f9476e94f32eb20cab000d50d236aa Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Fri, 9 Sep 2016 09:10:08 -0700
Subject: libnvdimm: allow legacy (e820) pmem region to clear bad blocks

Bad blocks can be injected via /sys/block/pmemN/badblocks. In a situation
where legacy pmem is being used or a pmem region created by using memmap
kernel parameter, the injected bad blocks are not cleared due to
nvdimm_clear_poison() failing from lack of ndctl function pointer. In
this case we need to just return as handled and allow the bad blocks to
be cleared rather than fail.

Reviewed-by: Vishal Verma <vishal.l.verma@intel.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/bus.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 458daf927336..935866fe5ec2 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -185,8 +185,12 @@ long nvdimm_clear_poison(struct device *dev, phys_addr_t phys,
 		return -ENXIO;
 
 	nd_desc = nvdimm_bus->nd_desc;
+	/*
+	 * if ndctl does not exist, it's PMEM_LEGACY and
+	 * we want to just pretend everything is handled.
+	 */
 	if (!nd_desc->ndctl)
-		return -ENXIO;
+		return len;
 
 	memset(&ars_cap, 0, sizeof(ars_cap));
 	ars_cap.address = phys;
-- 
cgit