From 4c3cb6e9a9d94d1553807854a565cd27ff4c22aa Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 3 Sep 2016 10:36:00 -0700 Subject: dax: fix mapping size check pgoff_to_phys() validates that both the starting address and the length of the mapping against the resource list. We need to check for a mapping size of PMD_SIZE not PAGE_SIZE in the pmd fault path. Signed-off-by: Dan Williams --- drivers/dax/dax.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c index 803f3953b341..29f600f2c447 100644 --- a/drivers/dax/dax.c +++ b/drivers/dax/dax.c @@ -459,7 +459,7 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, } pgoff = linear_page_index(vma, pmd_addr); - phys = pgoff_to_phys(dax_dev, pgoff, PAGE_SIZE); + phys = pgoff_to_phys(dax_dev, pgoff, PMD_SIZE); if (phys == -1) { dev_dbg(dev, "%s: phys_to_pgoff(%#lx) failed\n", __func__, pgoff); -- cgit From ca120cf688874f4423e579e7cc5ddf7244aeca45 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 3 Sep 2016 10:38:03 -0700 Subject: mm: fix show_smap() for zone_device-pmd ranges Attempting to dump /proc//smaps for a process with pmd dax mappings currently results in the following VM_BUG_ONs: kernel BUG at mm/huge_memory.c:1105! task: ffff88045f16b140 task.stack: ffff88045be14000 RIP: 0010:[] [] follow_trans_huge_pmd+0x2cb/0x340 [..] Call Trace: [] smaps_pte_range+0xa0/0x4b0 [] ? vsnprintf+0x255/0x4c0 [] __walk_page_range+0x1fe/0x4d0 [] walk_page_vma+0x62/0x80 [] show_smap+0xa6/0x2b0 kernel BUG at fs/proc/task_mmu.c:585! RIP: 0010:[] [] smaps_pte_range+0x499/0x4b0 Call Trace: [] ? vsnprintf+0x255/0x4c0 [] __walk_page_range+0x1fe/0x4d0 [] walk_page_vma+0x62/0x80 [] show_smap+0xa6/0x2b0 These locations are sanity checking page flags that must be set for an anonymous transparent huge page, but are not set for the zone_device pages associated with dax mappings. Cc: Ross Zwisler Cc: Kirill A. Shutemov Acked-by: Andrew Morton Signed-off-by: Dan Williams --- fs/proc/task_mmu.c | 2 ++ mm/huge_memory.c | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 187d84ef9de9..f6fa99eca515 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -581,6 +581,8 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, mss->anonymous_thp += HPAGE_PMD_SIZE; else if (PageSwapBacked(page)) mss->shmem_thp += HPAGE_PMD_SIZE; + else if (is_zone_device_page(page)) + /* pass */; else VM_BUG_ON_PAGE(1, page); smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd)); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2db2112aa31e..a6abd76baa72 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1078,7 +1078,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, goto out; page = pmd_page(*pmd); - VM_BUG_ON_PAGE(!PageHead(page), page); + VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page); if (flags & FOLL_TOUCH) touch_pmd(vma, addr, pmd); if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { @@ -1116,7 +1116,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, } skip_mlock: page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; - VM_BUG_ON_PAGE(!PageCompound(page), page); + VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page); if (flags & FOLL_GET) get_page(page); -- cgit From 9049771f7d5490a302589976984810064c83ab40 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 7 Sep 2016 08:51:21 -0700 Subject: mm: fix cache mode of dax pmd mappings track_pfn_insert() in vmf_insert_pfn_pmd() is marking dax mappings as uncacheable rendering them impractical for application usage. DAX-pte mappings are cached and the goal of establishing DAX-pmd mappings is to attain more performance, not dramatically less (3 orders of magnitude). track_pfn_insert() relies on a previous call to reserve_memtype() to establish the expected page_cache_mode for the range. While memremap() arranges for reserve_memtype() to be called, devm_memremap_pages() does not. So, teach track_pfn_insert() and untrack_pfn() how to handle tracking without a vma, and arrange for devm_memremap_pages() to establish the write-back-cache reservation in the memtype tree. Cc: Cc: Matthew Wilcox Cc: Ross Zwisler Cc: Nilesh Choudhury Cc: Kirill A. Shutemov Reported-by: Toshi Kani Reported-by: Kai Zhang Acked-by: Andrew Morton Signed-off-by: Dan Williams --- arch/x86/mm/pat.c | 17 ++++++++++------- kernel/memremap.c | 9 +++++++++ 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index ecb1b69c1651..170cc4ff057b 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -927,9 +927,10 @@ int track_pfn_copy(struct vm_area_struct *vma) } /* - * prot is passed in as a parameter for the new mapping. If the vma has a - * linear pfn mapping for the entire range reserve the entire vma range with - * single reserve_pfn_range call. + * prot is passed in as a parameter for the new mapping. If the vma has + * a linear pfn mapping for the entire range, or no vma is provided, + * reserve the entire pfn + size range with single reserve_pfn_range + * call. */ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, unsigned long pfn, unsigned long addr, unsigned long size) @@ -938,11 +939,12 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, enum page_cache_mode pcm; /* reserve the whole chunk starting from paddr */ - if (addr == vma->vm_start && size == (vma->vm_end - vma->vm_start)) { + if (!vma || (addr == vma->vm_start + && size == (vma->vm_end - vma->vm_start))) { int ret; ret = reserve_pfn_range(paddr, size, prot, 0); - if (!ret) + if (ret == 0 && vma) vma->vm_flags |= VM_PAT; return ret; } @@ -997,7 +999,7 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, resource_size_t paddr; unsigned long prot; - if (!(vma->vm_flags & VM_PAT)) + if (vma && !(vma->vm_flags & VM_PAT)) return; /* free the chunk starting from pfn or the whole chunk */ @@ -1011,7 +1013,8 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, size = vma->vm_end - vma->vm_start; } free_pfn_range(paddr, size); - vma->vm_flags &= ~VM_PAT; + if (vma) + vma->vm_flags &= ~VM_PAT; } /* diff --git a/kernel/memremap.c b/kernel/memremap.c index 251d16b4cb41..b501e390bb34 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -247,6 +247,7 @@ static void devm_memremap_pages_release(struct device *dev, void *data) align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(resource_size(res), SECTION_SIZE); arch_remove_memory(align_start, align_size); + untrack_pfn(NULL, PHYS_PFN(align_start), align_size); pgmap_radix_release(res); dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc, "%s: failed to free all reserved pages\n", __func__); @@ -282,6 +283,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, struct percpu_ref *ref, struct vmem_altmap *altmap) { resource_size_t key, align_start, align_size, align_end; + pgprot_t pgprot = PAGE_KERNEL; struct dev_pagemap *pgmap; struct page_map *page_map; int error, nid, is_ram; @@ -351,6 +353,11 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, if (nid < 0) nid = numa_mem_id(); + error = track_pfn_remap(NULL, &pgprot, PHYS_PFN(align_start), 0, + align_size); + if (error) + goto err_pfn_remap; + error = arch_add_memory(nid, align_start, align_size, true); if (error) goto err_add_memory; @@ -371,6 +378,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, return __va(res->start); err_add_memory: + untrack_pfn(NULL, PHYS_PFN(align_start), align_size); + err_pfn_remap: err_radix: pgmap_radix_release(res); devres_free(page_map); -- cgit From 2e21807d4b131dfd4a8e5c82116a85b62f28aeec Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Fri, 2 Sep 2016 17:27:30 -0600 Subject: nfit, mce: Fix SPA matching logic in MCE handler The check for a 'pmem' type SPA in the MCE handler was inverted due to a merge/rebase error. Fixes: 6839a6d nfit: do an ARS scrub on hitting a latent media error Cc: linux-acpi@vger.kernel.org Cc: Dan Williams Signed-off-by: Vishal Verma Signed-off-by: Dan Williams --- drivers/acpi/nfit/mce.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/nfit/mce.c b/drivers/acpi/nfit/mce.c index 4c745bf389fe..161f91539ae6 100644 --- a/drivers/acpi/nfit/mce.c +++ b/drivers/acpi/nfit/mce.c @@ -42,7 +42,7 @@ static int nfit_handle_mce(struct notifier_block *nb, unsigned long val, list_for_each_entry(nfit_spa, &acpi_desc->spas, list) { struct acpi_nfit_system_address *spa = nfit_spa->spa; - if (nfit_spa_type(spa) == NFIT_SPA_PM) + if (nfit_spa_type(spa) != NFIT_SPA_PM) continue; /* find the spa that covers the mce addr */ if (spa->address > mce->addr) -- cgit From 1e8b8d9619f9476e94f32eb20cab000d50d236aa Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 9 Sep 2016 09:10:08 -0700 Subject: libnvdimm: allow legacy (e820) pmem region to clear bad blocks Bad blocks can be injected via /sys/block/pmemN/badblocks. In a situation where legacy pmem is being used or a pmem region created by using memmap kernel parameter, the injected bad blocks are not cleared due to nvdimm_clear_poison() failing from lack of ndctl function pointer. In this case we need to just return as handled and allow the bad blocks to be cleared rather than fail. Reviewed-by: Vishal Verma Signed-off-by: Dave Jiang Signed-off-by: Dan Williams --- drivers/nvdimm/bus.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c index 458daf927336..935866fe5ec2 100644 --- a/drivers/nvdimm/bus.c +++ b/drivers/nvdimm/bus.c @@ -185,8 +185,12 @@ long nvdimm_clear_poison(struct device *dev, phys_addr_t phys, return -ENXIO; nd_desc = nvdimm_bus->nd_desc; + /* + * if ndctl does not exist, it's PMEM_LEGACY and + * we want to just pretend everything is handled. + */ if (!nd_desc->ndctl) - return -ENXIO; + return len; memset(&ars_cap, 0, sizeof(ars_cap)); ars_cap.address = phys; -- cgit