From 095ab4b39f91b475b07c0028c7a6a25e3cea2a86 Mon Sep 17 00:00:00 2001
From: Linda Knippers <linda.knippers@hpe.com>
Date: Tue, 7 Mar 2017 16:35:12 -0500
Subject: acpi, nfit: allow override of built-in bitmasks for nvdimm DSMs

As it is today, we can't enable or test new NVDIMM management functions
provided by new firmware and tools without changing the kernel.  We also
can't prevent documented DSM functions from being called in the case of
buggy firmware.  This patch provides a module parameter that overrides
the DSM function mask that is built into the kernel.

If the "disable_vendor_specific" module parameter is also used we ignore
the new parameter.

Signed-off-by: Linda Knippers <linda.knippers@hpe.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/nfit/core.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index c8ea9d698cd0..6bc784bd0828 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -51,6 +51,10 @@ module_param(disable_vendor_specific, bool, S_IRUGO);
 MODULE_PARM_DESC(disable_vendor_specific,
 		"Limit commands to the publicly specified set\n");
 
+static unsigned long override_dsm_mask;
+module_param(override_dsm_mask, ulong, S_IRUGO);
+MODULE_PARM_DESC(override_dsm_mask, "Bitmask of allowed NVDIMM DSM functions");
+
 LIST_HEAD(acpi_descs);
 DEFINE_MUTEX(acpi_desc_lock);
 
@@ -1402,7 +1406,9 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
 
 	/* limit the supported commands to those that are publicly documented */
 	nfit_mem->family = i;
-	if (nfit_mem->family == NVDIMM_FAMILY_INTEL) {
+	if (override_dsm_mask && !disable_vendor_specific)
+		dsm_mask = override_dsm_mask;
+	else if (nfit_mem->family == NVDIMM_FAMILY_INTEL) {
 		dsm_mask = 0x3fe;
 		if (disable_vendor_specific)
 			dsm_mask &= ~(1 << ND_CMD_VENDOR);
-- 
cgit 


From ba650cfcf94090a528fb9d8be467d17308311121 Mon Sep 17 00:00:00 2001
From: Linda Knippers <linda.knippers@hpe.com>
Date: Tue, 7 Mar 2017 16:35:13 -0500
Subject: acpi, nfit: allow specifying a default DSM family

Provide the ability to request a default DSM family. If it is not
supported, then fall back to the normal discovery order.

This is helpful for testing platforms that support multiple DSM
families.  It will also allow administrators to request the DSM family
that their management tools support, which may not be the first one
found using the current discovery order.

Signed-off-by: Linda Knippers <linda.knippers@hpe.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/nfit/core.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 6bc784bd0828..98d3bc6b90f1 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -55,6 +55,11 @@ static unsigned long override_dsm_mask;
 module_param(override_dsm_mask, ulong, S_IRUGO);
 MODULE_PARM_DESC(override_dsm_mask, "Bitmask of allowed NVDIMM DSM functions");
 
+static int default_dsm_family = -1;
+module_param(default_dsm_family, int, S_IRUGO);
+MODULE_PARM_DESC(default_dsm_family,
+		"Try this DSM type first when identifying NVDIMM family");
+
 LIST_HEAD(acpi_descs);
 DEFINE_MUTEX(acpi_desc_lock);
 
@@ -1372,6 +1377,7 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
 	unsigned long dsm_mask;
 	const u8 *uuid;
 	int i;
+	int family = -1;
 
 	/* nfit test assumes 1:1 relationship between commands and dsms */
 	nfit_mem->dsm_mask = acpi_desc->dimm_cmd_force_en;
@@ -1402,10 +1408,11 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
 	 */
 	for (i = NVDIMM_FAMILY_INTEL; i <= NVDIMM_FAMILY_MSFT; i++)
 		if (acpi_check_dsm(adev_dimm->handle, to_nfit_uuid(i), 1, 1))
-			break;
+			if (family < 0 || i == default_dsm_family)
+				family = i;
 
 	/* limit the supported commands to those that are publicly documented */
-	nfit_mem->family = i;
+	nfit_mem->family = family;
 	if (override_dsm_mask && !disable_vendor_specific)
 		dsm_mask = override_dsm_mask;
 	else if (nfit_mem->family == NVDIMM_FAMILY_INTEL) {
-- 
cgit 


From f2668fa7f6fbb0dfa4eb2e571f8ec0c34bc66b6f Mon Sep 17 00:00:00 2001
From: Linda Knippers <linda.knippers@hpe.com>
Date: Tue, 7 Mar 2017 16:35:14 -0500
Subject: acpi, nfit: remove unnecessary newline

The newline in MODULE_PARM_DESC causes modinfo to print the parameter
data type on a separate line, which is different from all the other
module parameters and could potentially cause a problem for someone
parsing the output of modinfo.

Signed-off-by: Linda Knippers <linda.knippers@hpe.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/nfit/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 98d3bc6b90f1..c3decdca46f9 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -49,7 +49,7 @@ MODULE_PARM_DESC(scrub_overflow_abort,
 static bool disable_vendor_specific;
 module_param(disable_vendor_specific, bool, S_IRUGO);
 MODULE_PARM_DESC(disable_vendor_specific,
-		"Limit commands to the publicly specified set\n");
+		"Limit commands to the publicly specified set");
 
 static unsigned long override_dsm_mask;
 module_param(override_dsm_mask, ulong, S_IRUGO);
-- 
cgit 


From 3c87f372695592bf86e9dab7b1ec7d439fda577b Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Mon, 3 Apr 2017 13:52:14 -0700
Subject: acpi, nfit: fix acpi_get_table leak

Calls to acpi_get_table() must be paired with acpi_put_table() to undo
the mapping established by acpi_tb_acquire_table().

It turns out this has no effect in practice since the NFIT will already
be mapped to support the /sys/firmware/acpi/tables/NFIT attribute in
sysfs.

Fixes: 6b11d1d67713 ("ACPI / osl: Remove acpi_get_table_with_size()/early_acpi_os_unmap_memory() users")
Cc: Lv Zheng <lv.zheng@intel.com>
Reported-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/nfit/core.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index c3decdca46f9..53943d6f4214 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -2831,6 +2831,11 @@ void acpi_nfit_desc_init(struct acpi_nfit_desc *acpi_desc, struct device *dev)
 }
 EXPORT_SYMBOL_GPL(acpi_nfit_desc_init);
 
+static void acpi_nfit_put_table(void *table)
+{
+	acpi_put_table(table);
+}
+
 static int acpi_nfit_add(struct acpi_device *adev)
 {
 	struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL };
@@ -2847,6 +2852,10 @@ static int acpi_nfit_add(struct acpi_device *adev)
 		dev_dbg(dev, "failed to find NFIT at startup\n");
 		return 0;
 	}
+
+	rc = devm_add_action_or_reset(dev, acpi_nfit_put_table, tbl);
+	if (rc)
+		return rc;
 	sz = tbl->length;
 
 	acpi_desc = devm_kzalloc(dev, sizeof(*acpi_desc), GFP_KERNEL);
-- 
cgit 


From 6a6bef90425e8cba7c53919d923240559b7f247c Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Fri, 7 Apr 2017 15:33:20 -0700
Subject: libnvdimm: add mechanism to publish badblocks at the region level

badblocks sysfs file will be export at region level. When nvdimm event
notifier happens for NVDIMM_REVALIATE_POISON, the badblocks in the
region will be updated.

Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/nd.h          |  1 +
 drivers/nvdimm/region.c      | 24 ++++++++++++++++++++++++
 drivers/nvdimm/region_devs.c | 19 +++++++++++++++++++
 3 files changed, 44 insertions(+)

diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index 2a99c83aa19f..c3b33cf655fb 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -154,6 +154,7 @@ struct nd_region {
 	u64 ndr_start;
 	int id, num_lanes, ro, numa_node;
 	void *provider_data;
+	struct badblocks bb;
 	struct nd_interleave_set *nd_set;
 	struct nd_percpu_lane __percpu *lane;
 	struct nd_mapping mapping[0];
diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c
index 8f241772ec0b..869a886c292e 100644
--- a/drivers/nvdimm/region.c
+++ b/drivers/nvdimm/region.c
@@ -14,6 +14,7 @@
 #include <linux/module.h>
 #include <linux/device.h>
 #include <linux/nd.h>
+#include "nd-core.h"
 #include "nd.h"
 
 static int nd_region_probe(struct device *dev)
@@ -52,6 +53,17 @@ static int nd_region_probe(struct device *dev)
 	if (rc && err && rc == err)
 		return -ENODEV;
 
+	if (is_nd_pmem(&nd_region->dev)) {
+		struct resource ndr_res;
+
+		if (devm_init_badblocks(dev, &nd_region->bb))
+			return -ENODEV;
+		ndr_res.start = nd_region->ndr_start;
+		ndr_res.end = nd_region->ndr_start + nd_region->ndr_size - 1;
+		nvdimm_badblocks_populate(nd_region,
+				&nd_region->bb, &ndr_res);
+	}
+
 	nd_region->btt_seed = nd_btt_create(nd_region);
 	nd_region->pfn_seed = nd_pfn_create(nd_region);
 	nd_region->dax_seed = nd_dax_create(nd_region);
@@ -104,6 +116,18 @@ static int child_notify(struct device *dev, void *data)
 
 static void nd_region_notify(struct device *dev, enum nvdimm_event event)
 {
+	if (event == NVDIMM_REVALIDATE_POISON) {
+		struct nd_region *nd_region = to_nd_region(dev);
+		struct resource res;
+
+		if (is_nd_pmem(&nd_region->dev)) {
+			res.start = nd_region->ndr_start;
+			res.end = nd_region->ndr_start +
+				nd_region->ndr_size - 1;
+			nvdimm_badblocks_populate(nd_region,
+					&nd_region->bb, &res);
+		}
+	}
 	device_for_each_child(dev, &event, child_notify);
 }
 
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index b7cb5066d961..3500fc84d939 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -448,6 +448,21 @@ static ssize_t read_only_store(struct device *dev,
 }
 static DEVICE_ATTR_RW(read_only);
 
+static ssize_t nd_badblocks_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nd_region *nd_region = to_nd_region(dev);
+
+	return badblocks_show(&nd_region->bb, buf, 0);
+}
+static struct device_attribute dev_attr_nd_badblocks = {
+	.attr = {
+		.name = "badblocks",
+		.mode = S_IRUGO
+	},
+	.show = nd_badblocks_show,
+};
+
 static struct attribute *nd_region_attributes[] = {
 	&dev_attr_size.attr,
 	&dev_attr_nstype.attr,
@@ -460,6 +475,7 @@ static struct attribute *nd_region_attributes[] = {
 	&dev_attr_available_size.attr,
 	&dev_attr_namespace_seed.attr,
 	&dev_attr_init_namespaces.attr,
+	&dev_attr_nd_badblocks.attr,
 	NULL,
 };
 
@@ -476,6 +492,9 @@ static umode_t region_visible(struct kobject *kobj, struct attribute *a, int n)
 	if (!is_nd_pmem(dev) && a == &dev_attr_dax_seed.attr)
 		return 0;
 
+	if (!is_nd_pmem(dev) && a == &dev_attr_nd_badblocks.attr)
+		return 0;
+
 	if (a != &dev_attr_set_cookie.attr
 			&& a != &dev_attr_available_size.attr)
 		return a->mode;
-- 
cgit 


From 802f4be6feee3f0395c26ac9717da5b15f6b8fec Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Fri, 7 Apr 2017 15:33:25 -0700
Subject: libnvdimm: Add 'resource' sysfs attribute to regions

Adding sysfs attribute in order to export the physical address of the
region. This is for supporting of user app poison clear via
ND_IOCTL_CLEAR_ERROR.

Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/region_devs.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index 3500fc84d939..8de5a04644a1 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -463,6 +463,15 @@ static struct device_attribute dev_attr_nd_badblocks = {
 	.show = nd_badblocks_show,
 };
 
+static ssize_t resource_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nd_region *nd_region = to_nd_region(dev);
+
+	return sprintf(buf, "%#llx\n", nd_region->ndr_start);
+}
+static DEVICE_ATTR_RO(resource);
+
 static struct attribute *nd_region_attributes[] = {
 	&dev_attr_size.attr,
 	&dev_attr_nstype.attr,
@@ -476,6 +485,7 @@ static struct attribute *nd_region_attributes[] = {
 	&dev_attr_namespace_seed.attr,
 	&dev_attr_init_namespaces.attr,
 	&dev_attr_nd_badblocks.attr,
+	&dev_attr_resource.attr,
 	NULL,
 };
 
@@ -495,6 +505,9 @@ static umode_t region_visible(struct kobject *kobj, struct attribute *a, int n)
 	if (!is_nd_pmem(dev) && a == &dev_attr_nd_badblocks.attr)
 		return 0;
 
+	if (!is_nd_pmem(dev) && a == &dev_attr_resource.attr)
+		return 0;
+
 	if (a != &dev_attr_set_cookie.attr
 			&& a != &dev_attr_available_size.attr)
 		return a->mode;
-- 
cgit 


From 006358b35c73ab75544fb4509483a81ef1a9c0b2 Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Fri, 7 Apr 2017 15:33:31 -0700
Subject: libnvdimm: add support for clear poison list and badblocks for device
 dax

Providing mechanism to clear poison list via the ndctl ND_CMD_CLEAR_ERROR
call. We will update the poison list and also the badblocks at region level
if the region is in dax mode or in pmem mode and not active. In other
words we force badblocks to be cleared through write requests if the
address is currently accessed through a block device, otherwise it can
only be done via the ioctl+dsm path.

Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/bus.c      | 83 ++++++++++++++++++++++++++++++++++++++++++-----
 drivers/nvdimm/core.c     | 17 +++++++---
 drivers/nvdimm/region.c   | 25 ++++++++++++++
 include/linux/libnvdimm.h |  7 +++-
 4 files changed, 118 insertions(+), 14 deletions(-)

diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 351bac8f6503..5ad2e5909e1a 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -27,6 +27,7 @@
 #include <linux/nd.h>
 #include "nd-core.h"
 #include "nd.h"
+#include "pfn.h"
 
 int nvdimm_major;
 static int nvdimm_bus_major;
@@ -218,11 +219,20 @@ long nvdimm_clear_poison(struct device *dev, phys_addr_t phys,
 	if (cmd_rc < 0)
 		return cmd_rc;
 
-	nvdimm_clear_from_poison_list(nvdimm_bus, phys, len);
+	nvdimm_forget_poison(nvdimm_bus, phys, len);
 	return clear_err.cleared;
 }
 EXPORT_SYMBOL_GPL(nvdimm_clear_poison);
 
+void __nvdimm_bus_badblocks_clear(struct nvdimm_bus *nvdimm_bus,
+		struct resource *res)
+{
+	lockdep_assert_held(&nvdimm_bus->reconfig_mutex);
+	device_for_each_child(&nvdimm_bus->dev, (void *)res,
+			nvdimm_region_badblocks_clear);
+}
+EXPORT_SYMBOL_GPL(__nvdimm_bus_badblocks_clear);
+
 static int nvdimm_bus_match(struct device *dev, struct device_driver *drv);
 
 static struct bus_type nvdimm_bus_type = {
@@ -769,16 +779,55 @@ void wait_nvdimm_bus_probe_idle(struct device *dev)
 	} while (true);
 }
 
-static int pmem_active(struct device *dev, void *data)
+static int nd_pmem_forget_poison_check(struct device *dev, void *data)
 {
-	if (is_nd_pmem(dev) && dev->driver)
+	struct nd_cmd_clear_error *clear_err =
+		(struct nd_cmd_clear_error *)data;
+	struct nd_btt *nd_btt = is_nd_btt(dev) ? to_nd_btt(dev) : NULL;
+	struct nd_pfn *nd_pfn = is_nd_pfn(dev) ? to_nd_pfn(dev) : NULL;
+	struct nd_dax *nd_dax = is_nd_dax(dev) ? to_nd_dax(dev) : NULL;
+	struct nd_namespace_common *ndns = NULL;
+	struct nd_namespace_io *nsio;
+	resource_size_t offset = 0, end_trunc = 0, start, end, pstart, pend;
+
+	if (nd_dax || !dev->driver)
+		return 0;
+
+	start = clear_err->address;
+	end = clear_err->address + clear_err->cleared - 1;
+
+	if (nd_btt || nd_pfn || nd_dax) {
+		if (nd_btt)
+			ndns = nd_btt->ndns;
+		else if (nd_pfn)
+			ndns = nd_pfn->ndns;
+		else if (nd_dax)
+			ndns = nd_dax->nd_pfn.ndns;
+
+		if (!ndns)
+			return 0;
+	} else
+		ndns = to_ndns(dev);
+
+	nsio = to_nd_namespace_io(&ndns->dev);
+	pstart = nsio->res.start + offset;
+	pend = nsio->res.end - end_trunc;
+
+	if ((pstart >= start) && (pend <= end))
 		return -EBUSY;
+
 	return 0;
+
+}
+
+static int nd_ns_forget_poison_check(struct device *dev, void *data)
+{
+	return device_for_each_child(dev, data, nd_pmem_forget_poison_check);
 }
 
 /* set_config requires an idle interleave set */
 static int nd_cmd_clear_to_send(struct nvdimm_bus *nvdimm_bus,
-		struct nvdimm *nvdimm, unsigned int cmd)
+		struct nvdimm *nvdimm, unsigned int cmd, void *data)
 {
 	struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc;
 
@@ -792,8 +841,8 @@ static int nd_cmd_clear_to_send(struct nvdimm_bus *nvdimm_bus,
 
 	/* require clear error to go through the pmem driver */
 	if (!nvdimm && cmd == ND_CMD_CLEAR_ERROR)
-		return device_for_each_child(&nvdimm_bus->dev, NULL,
-				pmem_active);
+		return device_for_each_child(&nvdimm_bus->dev, data,
+				nd_ns_forget_poison_check);
 
 	if (!nvdimm || cmd != ND_CMD_SET_CONFIG_DATA)
 		return 0;
@@ -820,7 +869,7 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
 	const char *cmd_name, *dimm_name;
 	unsigned long cmd_mask;
 	void *buf;
-	int rc, i;
+	int rc, i, cmd_rc;
 
 	if (nvdimm) {
 		desc = nd_cmd_dimm_desc(cmd);
@@ -927,13 +976,29 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
 	}
 
 	nvdimm_bus_lock(&nvdimm_bus->dev);
-	rc = nd_cmd_clear_to_send(nvdimm_bus, nvdimm, cmd);
+	rc = nd_cmd_clear_to_send(nvdimm_bus, nvdimm, cmd, buf);
 	if (rc)
 		goto out_unlock;
 
-	rc = nd_desc->ndctl(nd_desc, nvdimm, cmd, buf, buf_len, NULL);
+	rc = nd_desc->ndctl(nd_desc, nvdimm, cmd, buf, buf_len, &cmd_rc);
 	if (rc < 0)
 		goto out_unlock;
+
+	if (!nvdimm && cmd == ND_CMD_CLEAR_ERROR && cmd_rc >= 0) {
+		struct nd_cmd_clear_error *clear_err = buf;
+		struct resource res;
+
+		if (clear_err->cleared) {
+			/* clearing the poison list we keep track of */
+			__nvdimm_forget_poison(nvdimm_bus, clear_err->address,
+					clear_err->cleared);
+
+			/* now sync the badblocks lists */
+			res.start = clear_err->address;
+			res.end = clear_err->address + clear_err->cleared - 1;
+			__nvdimm_bus_badblocks_clear(nvdimm_bus, &res);
+		}
+	}
 	nvdimm_bus_unlock(&nvdimm_bus->dev);
 
 	if (copy_to_user(p, buf, buf_len))
diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c
index 9303cfeb8bee..40a3da088fd2 100644
--- a/drivers/nvdimm/core.c
+++ b/drivers/nvdimm/core.c
@@ -574,14 +574,15 @@ int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length)
 }
 EXPORT_SYMBOL_GPL(nvdimm_bus_add_poison);
 
-void nvdimm_clear_from_poison_list(struct nvdimm_bus *nvdimm_bus,
-		phys_addr_t start, unsigned int len)
+void __nvdimm_forget_poison(struct nvdimm_bus *nvdimm_bus, phys_addr_t start,
+		unsigned int len)
 {
 	struct list_head *poison_list = &nvdimm_bus->poison_list;
 	u64 clr_end = start + len - 1;
 	struct nd_poison *pl, *next;
 
-	nvdimm_bus_lock(&nvdimm_bus->dev);
+	lockdep_assert_held(&nvdimm_bus->reconfig_mutex);
+
 	WARN_ON_ONCE(list_empty(poison_list));
 
 	/*
@@ -634,9 +635,17 @@ void nvdimm_clear_from_poison_list(struct nvdimm_bus *nvdimm_bus,
 			continue;
 		}
 	}
+}
+EXPORT_SYMBOL_GPL(__nvdimm_forget_poison);
+
+void nvdimm_forget_poison(struct nvdimm_bus *nvdimm_bus,
+		phys_addr_t start, unsigned int len)
+{
+	nvdimm_bus_lock(&nvdimm_bus->dev);
+	__nvdimm_forget_poison(nvdimm_bus, start, len);
 	nvdimm_bus_unlock(&nvdimm_bus->dev);
 }
-EXPORT_SYMBOL_GPL(nvdimm_clear_from_poison_list);
+EXPORT_SYMBOL_GPL(nvdimm_forget_poison);
 
 #ifdef CONFIG_BLK_DEV_INTEGRITY
 int nd_integrity_init(struct gendisk *disk, unsigned long meta_size)
diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c
index 869a886c292e..23c4307d254c 100644
--- a/drivers/nvdimm/region.c
+++ b/drivers/nvdimm/region.c
@@ -131,6 +131,31 @@ static void nd_region_notify(struct device *dev, enum nvdimm_event event)
 	device_for_each_child(dev, &event, child_notify);
 }
 
+int nvdimm_region_badblocks_clear(struct device *dev, void *data)
+{
+	struct resource *res = (struct resource *)data;
+	struct nd_region *nd_region;
+	resource_size_t ndr_end;
+	sector_t sector;
+
+	/* make sure device is a region */
+	if (!is_nd_pmem(dev))
+		return 0;
+
+	nd_region = to_nd_region(dev);
+	ndr_end = nd_region->ndr_start + nd_region->ndr_size - 1;
+
+	/* make sure we are in the region */
+	if (res->start < nd_region->ndr_start || res->end > ndr_end)
+		return 0;
+
+	sector = (res->start - nd_region->ndr_start) >> 9;
+	badblocks_clear(&nd_region->bb, sector, resource_size(res) >> 9);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nvdimm_region_badblocks_clear);
+
 static struct nd_device_driver nd_region_driver = {
 	.probe = nd_region_probe,
 	.remove = nd_region_remove,
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index 77e7af32543f..1c609e89048a 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -120,7 +120,9 @@ static inline struct nd_blk_region_desc *to_blk_region_desc(
 }
 
 int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length);
-void nvdimm_clear_from_poison_list(struct nvdimm_bus *nvdimm_bus,
+void nvdimm_forget_poison(struct nvdimm_bus *nvdimm_bus,
+		phys_addr_t start, unsigned int len);
+void __nvdimm_forget_poison(struct nvdimm_bus *nvdimm_bus,
 		phys_addr_t start, unsigned int len);
 struct nvdimm_bus *nvdimm_bus_register(struct device *parent,
 		struct nvdimm_bus_descriptor *nfit_desc);
@@ -162,4 +164,7 @@ void nd_region_release_lane(struct nd_region *nd_region, unsigned int lane);
 u64 nd_fletcher64(void *addr, size_t len, bool le);
 void nvdimm_flush(struct nd_region *nd_region);
 int nvdimm_has_flush(struct nd_region *nd_region);
+int nvdimm_region_badblocks_clear(struct device *dev, void *data);
+void __nvdimm_bus_badblocks_clear(struct nvdimm_bus *nvdimm_bus,
+		struct resource *res);
 #endif /* __LIBNVDIMM_H__ */
-- 
cgit 


From efebc711180f7fed701f6e23f23814fcfda7fbfc Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Fri, 7 Apr 2017 15:33:36 -0700
Subject: device-dax, tools/testing/nvdimm: enable device-dax with mock
 resources

Provide a replacement pgoff_to_phys() that translates an nfit_test
resource (allocated by vmalloc()) to a pfn.

Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/dax/dax-private.h      | 61 ++++++++++++++++++++++++++++++++++++++++++
 drivers/dax/dax.c              | 52 +++++------------------------------
 tools/testing/nvdimm/Kbuild    |  3 ++-
 tools/testing/nvdimm/dax-dev.c | 49 +++++++++++++++++++++++++++++++++
 4 files changed, 118 insertions(+), 47 deletions(-)
 create mode 100644 drivers/dax/dax-private.h
 create mode 100644 tools/testing/nvdimm/dax-dev.c

diff --git a/drivers/dax/dax-private.h b/drivers/dax/dax-private.h
new file mode 100644
index 000000000000..b1cd7a8e5ab9
--- /dev/null
+++ b/drivers/dax/dax-private.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright(c) 2016 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#ifndef __DAX_PRIVATE_H__
+#define __DAX_PRIVATE_H__
+
+#include <linux/device.h>
+#include <linux/cdev.h>
+
+/**
+ * struct dax_region - mapping infrastructure for dax devices
+ * @id: kernel-wide unique region for a memory range
+ * @base: linear address corresponding to @res
+ * @kref: to pin while other agents have a need to do lookups
+ * @dev: parent device backing this region
+ * @align: allocation and mapping alignment for child dax devices
+ * @res: physical address range of the region
+ * @pfn_flags: identify whether the pfns are paged back or not
+ */
+struct dax_region {
+	int id;
+	struct ida ida;
+	void *base;
+	struct kref kref;
+	struct device *dev;
+	unsigned int align;
+	struct resource res;
+	unsigned long pfn_flags;
+};
+
+/**
+ * struct dax_dev - subdivision of a dax region
+ * @region - parent region
+ * @inode - inode
+ * @dev - device backing the character device
+ * @cdev - core chardev data
+ * @alive - !alive + srcu grace period == no new mappings can be established
+ * @id - child id in the region
+ * @num_resources - number of physical address extents in this device
+ * @res - array of physical address ranges
+ */
+struct dax_dev {
+	struct dax_region *region;
+	struct inode *inode;
+	struct device dev;
+	struct cdev cdev;
+	bool alive;
+	int id;
+	int num_resources;
+	struct resource res[0];
+};
+#endif
diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c
index 806f180c80d8..ef93aa84622b 100644
--- a/drivers/dax/dax.c
+++ b/drivers/dax/dax.c
@@ -22,6 +22,7 @@
 #include <linux/dax.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
+#include "dax-private.h"
 #include "dax.h"
 
 static dev_t dax_devt;
@@ -35,48 +36,6 @@ static struct kmem_cache *dax_cache __read_mostly;
 static struct super_block *dax_superblock __read_mostly;
 MODULE_PARM_DESC(nr_dax, "max number of device-dax instances");
 
-/**
- * struct dax_region - mapping infrastructure for dax devices
- * @id: kernel-wide unique region for a memory range
- * @base: linear address corresponding to @res
- * @kref: to pin while other agents have a need to do lookups
- * @dev: parent device backing this region
- * @align: allocation and mapping alignment for child dax devices
- * @res: physical address range of the region
- * @pfn_flags: identify whether the pfns are paged back or not
- */
-struct dax_region {
-	int id;
-	struct ida ida;
-	void *base;
-	struct kref kref;
-	struct device *dev;
-	unsigned int align;
-	struct resource res;
-	unsigned long pfn_flags;
-};
-
-/**
- * struct dax_dev - subdivision of a dax region
- * @region - parent region
- * @dev - device backing the character device
- * @cdev - core chardev data
- * @alive - !alive + srcu grace period == no new mappings can be established
- * @id - child id in the region
- * @num_resources - number of physical address extents in this device
- * @res - array of physical address ranges
- */
-struct dax_dev {
-	struct dax_region *region;
-	struct inode *inode;
-	struct device dev;
-	struct cdev cdev;
-	bool alive;
-	int id;
-	int num_resources;
-	struct resource res[0];
-};
-
 static ssize_t id_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
@@ -397,7 +356,8 @@ static int check_vma(struct dax_dev *dax_dev, struct vm_area_struct *vma,
 	return 0;
 }
 
-static phys_addr_t pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff,
+/* see "strong" declaration in tools/testing/nvdimm/dax-dev.c */
+__weak phys_addr_t dax_pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff,
 		unsigned long size)
 {
 	struct resource *res;
@@ -442,7 +402,7 @@ static int __dax_dev_pte_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
 	if (fault_size != dax_region->align)
 		return VM_FAULT_SIGBUS;
 
-	phys = pgoff_to_phys(dax_dev, vmf->pgoff, PAGE_SIZE);
+	phys = dax_pgoff_to_phys(dax_dev, vmf->pgoff, PAGE_SIZE);
 	if (phys == -1) {
 		dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__,
 				vmf->pgoff);
@@ -497,7 +457,7 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
 		return VM_FAULT_SIGBUS;
 
 	pgoff = linear_page_index(vmf->vma, pmd_addr);
-	phys = pgoff_to_phys(dax_dev, pgoff, PMD_SIZE);
+	phys = dax_pgoff_to_phys(dax_dev, pgoff, PMD_SIZE);
 	if (phys == -1) {
 		dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__,
 				pgoff);
@@ -548,7 +508,7 @@ static int __dax_dev_pud_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
 		return VM_FAULT_SIGBUS;
 
 	pgoff = linear_page_index(vmf->vma, pud_addr);
-	phys = pgoff_to_phys(dax_dev, pgoff, PUD_SIZE);
+	phys = dax_pgoff_to_phys(dax_dev, pgoff, PUD_SIZE);
 	if (phys == -1) {
 		dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__,
 				pgoff);
diff --git a/tools/testing/nvdimm/Kbuild b/tools/testing/nvdimm/Kbuild
index 405212be044a..6dcb3c4d53be 100644
--- a/tools/testing/nvdimm/Kbuild
+++ b/tools/testing/nvdimm/Kbuild
@@ -28,7 +28,7 @@ obj-$(CONFIG_ND_BTT) += nd_btt.o
 obj-$(CONFIG_ND_BLK) += nd_blk.o
 obj-$(CONFIG_X86_PMEM_LEGACY) += nd_e820.o
 obj-$(CONFIG_ACPI_NFIT) += nfit.o
-obj-$(CONFIG_DEV_DAX) += dax.o
+obj-$(CONFIG_DEV_DAX) += dax.o dax-dev.o
 obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem.o
 
 nfit-y := $(ACPI_SRC)/core.o
@@ -49,6 +49,7 @@ nd_e820-y := $(NVDIMM_SRC)/e820.o
 nd_e820-y += config_check.o
 
 dax-y := $(DAX_SRC)/dax.o
+dax-y += dax-dev.o
 dax-y += config_check.o
 
 dax_pmem-y := $(DAX_SRC)/pmem.o
diff --git a/tools/testing/nvdimm/dax-dev.c b/tools/testing/nvdimm/dax-dev.c
new file mode 100644
index 000000000000..e89721d8924c
--- /dev/null
+++ b/tools/testing/nvdimm/dax-dev.c
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2016, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+#include "test/nfit_test.h"
+#include <linux/mm.h>
+#include "../../../drivers/dax/dax-private.h"
+
+phys_addr_t dax_pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff,
+		unsigned long size)
+{
+	struct resource *res;
+	phys_addr_t addr;
+	int i;
+
+	for (i = 0; i < dax_dev->num_resources; i++) {
+		res = &dax_dev->res[i];
+		addr = pgoff * PAGE_SIZE + res->start;
+		if (addr >= res->start && addr <= res->end)
+			break;
+		pgoff -= PHYS_PFN(resource_size(res));
+	}
+
+	if (i < dax_dev->num_resources) {
+		res = &dax_dev->res[i];
+		if (addr + size - 1 <= res->end) {
+			if (get_nfit_res(addr)) {
+				struct page *page;
+
+				if (dax_dev->region->align > PAGE_SIZE)
+					return -1;
+
+				page = vmalloc_to_page((void *)addr);
+				return PFN_PHYS(page_to_pfn(page));
+			} else
+				return addr;
+		}
+	}
+
+	return -1;
+}
-- 
cgit 


From 54eafcc9e339affb8982fd21e1fc4aa4a036655b Mon Sep 17 00:00:00 2001
From: Pushkar Jambhlekar <pushkar.iit@gmail.com>
Date: Tue, 11 Apr 2017 09:12:25 -0700
Subject: device-dax: fix dax_dev_huge_fault() unknown fault size handling

The default case for dax_dev_huge_fault() fault size handling mistakenly
returns when it should unlock. This is not a problem in practice since
the only three possible fault sizes are handled. Going forward, if the
core mm adds a new fault size beyond pte, pmd, or pud device-dax should
abort VM_FAULT_SIGBUS requests not VM_FAULT_FALLBACK since device-dax
guarantees a configured fault granularity for all faults.

Signed-off-by: Pushkar Jambhlekar <pushkar.iit@gmail.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/dax/dax.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c
index 19795eb35579..94036d92ed16 100644
--- a/drivers/dax/dax.c
+++ b/drivers/dax/dax.c
@@ -591,7 +591,7 @@ static int dax_dev_huge_fault(struct vm_fault *vmf,
 		rc = __dax_dev_pud_fault(dax_dev, vmf);
 		break;
 	default:
-		return VM_FAULT_FALLBACK;
+		rc = VM_FAULT_SIGBUS;
 	}
 	srcu_read_unlock(&dax_srcu, id);
 
-- 
cgit 


From 762026203c0b87b1088342b664e67ca7c45fb7c4 Mon Sep 17 00:00:00 2001
From: Oliver O'Halloran <oohall@gmail.com>
Date: Wed, 12 Apr 2017 01:59:36 +1000
Subject: device-dax: improve fault handler debug output

A couple of minor improvements to the debug output in the fault handlers:

a) Print the region alignment and fault size when we sent a SIGBUS
   because the region alignment is greater than the fault size.
b) Fix the message in the PFN_{DEV|MAP} check.
c) Additionally print the fault size enum value in the huge fault handler.

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/dax/dax.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c
index 94036d92ed16..352cc54056ce 100644
--- a/drivers/dax/dax.c
+++ b/drivers/dax/dax.c
@@ -435,7 +435,8 @@ static int __dax_dev_pte_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
 
 	dax_region = dax_dev->region;
 	if (dax_region->align > PAGE_SIZE) {
-		dev_dbg(dev, "%s: alignment > fault size\n", __func__);
+		dev_dbg(dev, "%s: alignment (%#x) > fault size (%#x)\n",
+			__func__, dax_region->align, fault_size);
 		return VM_FAULT_SIGBUS;
 	}
 
@@ -476,13 +477,14 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
 
 	dax_region = dax_dev->region;
 	if (dax_region->align > PMD_SIZE) {
-		dev_dbg(dev, "%s: alignment > fault size\n", __func__);
+		dev_dbg(dev, "%s: alignment (%#x) > fault size (%#x)\n",
+			__func__, dax_region->align, fault_size);
 		return VM_FAULT_SIGBUS;
 	}
 
 	/* dax pmd mappings require pfn_t_devmap() */
 	if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) {
-		dev_dbg(dev, "%s: alignment > fault size\n", __func__);
+		dev_dbg(dev, "%s: region lacks devmap flags\n", __func__);
 		return VM_FAULT_SIGBUS;
 	}
 
@@ -527,13 +529,14 @@ static int __dax_dev_pud_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
 
 	dax_region = dax_dev->region;
 	if (dax_region->align > PUD_SIZE) {
-		dev_dbg(dev, "%s: alignment > fault size\n", __func__);
+		dev_dbg(dev, "%s: alignment (%#x) > fault size (%#x)\n",
+			__func__, dax_region->align, fault_size);
 		return VM_FAULT_SIGBUS;
 	}
 
 	/* dax pud mappings require pfn_t_devmap() */
 	if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) {
-		dev_dbg(dev, "%s: alignment > fault size\n", __func__);
+		dev_dbg(dev, "%s: region lacks devmap flags\n", __func__);
 		return VM_FAULT_SIGBUS;
 	}
 
@@ -574,10 +577,10 @@ static int dax_dev_huge_fault(struct vm_fault *vmf,
 	struct file *filp = vmf->vma->vm_file;
 	struct dax_dev *dax_dev = filp->private_data;
 
-	dev_dbg(&dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__,
+	dev_dbg(&dax_dev->dev, "%s: %s: %s (%#lx - %#lx) size = %d\n", __func__,
 			current->comm, (vmf->flags & FAULT_FLAG_WRITE)
 			? "write" : "read",
-			vmf->vma->vm_start, vmf->vma->vm_end);
+			vmf->vma->vm_start, vmf->vma->vm_end, pe_size);
 
 	id = srcu_read_lock(&dax_srcu);
 	switch (pe_size) {
-- 
cgit 


From 5f0694b300b9fb8409272c550418c22e0e57314a Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Mon, 30 Jan 2017 21:43:10 -0800
Subject: device-dax: rename 'dax_dev' to 'dev_dax'

In preparation for introducing a struct dax_device type to the kernel
global type namespace, rename dax_dev to dev_dax. A 'dax_device'
instance will be a generic device-driver object for any provider of dax
functionality. A 'dev_dax' object is a device-dax-driver local /
internal instance.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/dax/dax.c  | 206 ++++++++++++++++++++++++++---------------------------
 drivers/dax/dax.h  |   4 +-
 drivers/dax/pmem.c |   8 +--
 3 files changed, 109 insertions(+), 109 deletions(-)

diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c
index 352cc54056ce..376fdd353aea 100644
--- a/drivers/dax/dax.c
+++ b/drivers/dax/dax.c
@@ -57,7 +57,7 @@ struct dax_region {
 };
 
 /**
- * struct dax_dev - subdivision of a dax region
+ * struct dev_dax - instance data for a subdivision of a dax region
  * @region - parent region
  * @dev - device backing the character device
  * @cdev - core chardev data
@@ -66,7 +66,7 @@ struct dax_region {
  * @num_resources - number of physical address extents in this device
  * @res - array of physical address ranges
  */
-struct dax_dev {
+struct dev_dax {
 	struct dax_region *region;
 	struct inode *inode;
 	struct device dev;
@@ -323,47 +323,47 @@ struct dax_region *alloc_dax_region(struct device *parent, int region_id,
 }
 EXPORT_SYMBOL_GPL(alloc_dax_region);
 
-static struct dax_dev *to_dax_dev(struct device *dev)
+static struct dev_dax *to_dev_dax(struct device *dev)
 {
-	return container_of(dev, struct dax_dev, dev);
+	return container_of(dev, struct dev_dax, dev);
 }
 
 static ssize_t size_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
-	struct dax_dev *dax_dev = to_dax_dev(dev);
+	struct dev_dax *dev_dax = to_dev_dax(dev);
 	unsigned long long size = 0;
 	int i;
 
-	for (i = 0; i < dax_dev->num_resources; i++)
-		size += resource_size(&dax_dev->res[i]);
+	for (i = 0; i < dev_dax->num_resources; i++)
+		size += resource_size(&dev_dax->res[i]);
 
 	return sprintf(buf, "%llu\n", size);
 }
 static DEVICE_ATTR_RO(size);
 
-static struct attribute *dax_device_attributes[] = {
+static struct attribute *dev_dax_attributes[] = {
 	&dev_attr_size.attr,
 	NULL,
 };
 
-static const struct attribute_group dax_device_attribute_group = {
-	.attrs = dax_device_attributes,
+static const struct attribute_group dev_dax_attribute_group = {
+	.attrs = dev_dax_attributes,
 };
 
 static const struct attribute_group *dax_attribute_groups[] = {
-	&dax_device_attribute_group,
+	&dev_dax_attribute_group,
 	NULL,
 };
 
-static int check_vma(struct dax_dev *dax_dev, struct vm_area_struct *vma,
+static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma,
 		const char *func)
 {
-	struct dax_region *dax_region = dax_dev->region;
-	struct device *dev = &dax_dev->dev;
+	struct dax_region *dax_region = dev_dax->region;
+	struct device *dev = &dev_dax->dev;
 	unsigned long mask;
 
-	if (!dax_dev->alive)
+	if (!dev_dax->alive)
 		return -ENXIO;
 
 	/* prevent private mappings from being established */
@@ -397,23 +397,23 @@ static int check_vma(struct dax_dev *dax_dev, struct vm_area_struct *vma,
 	return 0;
 }
 
-static phys_addr_t pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff,
+static phys_addr_t pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff,
 		unsigned long size)
 {
 	struct resource *res;
 	phys_addr_t phys;
 	int i;
 
-	for (i = 0; i < dax_dev->num_resources; i++) {
-		res = &dax_dev->res[i];
+	for (i = 0; i < dev_dax->num_resources; i++) {
+		res = &dev_dax->res[i];
 		phys = pgoff * PAGE_SIZE + res->start;
 		if (phys >= res->start && phys <= res->end)
 			break;
 		pgoff -= PHYS_PFN(resource_size(res));
 	}
 
-	if (i < dax_dev->num_resources) {
-		res = &dax_dev->res[i];
+	if (i < dev_dax->num_resources) {
+		res = &dev_dax->res[i];
 		if (phys + size - 1 <= res->end)
 			return phys;
 	}
@@ -421,19 +421,19 @@ static phys_addr_t pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff,
 	return -1;
 }
 
-static int __dax_dev_pte_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
+static int __dev_dax_pte_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
 {
-	struct device *dev = &dax_dev->dev;
+	struct device *dev = &dev_dax->dev;
 	struct dax_region *dax_region;
 	int rc = VM_FAULT_SIGBUS;
 	phys_addr_t phys;
 	pfn_t pfn;
 	unsigned int fault_size = PAGE_SIZE;
 
-	if (check_vma(dax_dev, vmf->vma, __func__))
+	if (check_vma(dev_dax, vmf->vma, __func__))
 		return VM_FAULT_SIGBUS;
 
-	dax_region = dax_dev->region;
+	dax_region = dev_dax->region;
 	if (dax_region->align > PAGE_SIZE) {
 		dev_dbg(dev, "%s: alignment (%#x) > fault size (%#x)\n",
 			__func__, dax_region->align, fault_size);
@@ -443,7 +443,7 @@ static int __dax_dev_pte_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
 	if (fault_size != dax_region->align)
 		return VM_FAULT_SIGBUS;
 
-	phys = pgoff_to_phys(dax_dev, vmf->pgoff, PAGE_SIZE);
+	phys = pgoff_to_phys(dev_dax, vmf->pgoff, PAGE_SIZE);
 	if (phys == -1) {
 		dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__,
 				vmf->pgoff);
@@ -462,20 +462,20 @@ static int __dax_dev_pte_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
 	return VM_FAULT_NOPAGE;
 }
 
-static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
+static int __dev_dax_pmd_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
 {
 	unsigned long pmd_addr = vmf->address & PMD_MASK;
-	struct device *dev = &dax_dev->dev;
+	struct device *dev = &dev_dax->dev;
 	struct dax_region *dax_region;
 	phys_addr_t phys;
 	pgoff_t pgoff;
 	pfn_t pfn;
 	unsigned int fault_size = PMD_SIZE;
 
-	if (check_vma(dax_dev, vmf->vma, __func__))
+	if (check_vma(dev_dax, vmf->vma, __func__))
 		return VM_FAULT_SIGBUS;
 
-	dax_region = dax_dev->region;
+	dax_region = dev_dax->region;
 	if (dax_region->align > PMD_SIZE) {
 		dev_dbg(dev, "%s: alignment (%#x) > fault size (%#x)\n",
 			__func__, dax_region->align, fault_size);
@@ -499,7 +499,7 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
 		return VM_FAULT_SIGBUS;
 
 	pgoff = linear_page_index(vmf->vma, pmd_addr);
-	phys = pgoff_to_phys(dax_dev, pgoff, PMD_SIZE);
+	phys = pgoff_to_phys(dev_dax, pgoff, PMD_SIZE);
 	if (phys == -1) {
 		dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__,
 				pgoff);
@@ -513,10 +513,10 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
 }
 
 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
-static int __dax_dev_pud_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
+static int __dev_dax_pud_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
 {
 	unsigned long pud_addr = vmf->address & PUD_MASK;
-	struct device *dev = &dax_dev->dev;
+	struct device *dev = &dev_dax->dev;
 	struct dax_region *dax_region;
 	phys_addr_t phys;
 	pgoff_t pgoff;
@@ -524,10 +524,10 @@ static int __dax_dev_pud_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
 	unsigned int fault_size = PUD_SIZE;
 
 
-	if (check_vma(dax_dev, vmf->vma, __func__))
+	if (check_vma(dev_dax, vmf->vma, __func__))
 		return VM_FAULT_SIGBUS;
 
-	dax_region = dax_dev->region;
+	dax_region = dev_dax->region;
 	if (dax_region->align > PUD_SIZE) {
 		dev_dbg(dev, "%s: alignment (%#x) > fault size (%#x)\n",
 			__func__, dax_region->align, fault_size);
@@ -551,7 +551,7 @@ static int __dax_dev_pud_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
 		return VM_FAULT_SIGBUS;
 
 	pgoff = linear_page_index(vmf->vma, pud_addr);
-	phys = pgoff_to_phys(dax_dev, pgoff, PUD_SIZE);
+	phys = pgoff_to_phys(dev_dax, pgoff, PUD_SIZE);
 	if (phys == -1) {
 		dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__,
 				pgoff);
@@ -564,20 +564,20 @@ static int __dax_dev_pud_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
 			vmf->flags & FAULT_FLAG_WRITE);
 }
 #else
-static int __dax_dev_pud_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
+static int __dev_dax_pud_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
 {
 	return VM_FAULT_FALLBACK;
 }
 #endif /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
 
-static int dax_dev_huge_fault(struct vm_fault *vmf,
+static int dev_dax_huge_fault(struct vm_fault *vmf,
 		enum page_entry_size pe_size)
 {
 	int rc, id;
 	struct file *filp = vmf->vma->vm_file;
-	struct dax_dev *dax_dev = filp->private_data;
+	struct dev_dax *dev_dax = filp->private_data;
 
-	dev_dbg(&dax_dev->dev, "%s: %s: %s (%#lx - %#lx) size = %d\n", __func__,
+	dev_dbg(&dev_dax->dev, "%s: %s: %s (%#lx - %#lx) size = %d\n", __func__,
 			current->comm, (vmf->flags & FAULT_FLAG_WRITE)
 			? "write" : "read",
 			vmf->vma->vm_start, vmf->vma->vm_end, pe_size);
@@ -585,13 +585,13 @@ static int dax_dev_huge_fault(struct vm_fault *vmf,
 	id = srcu_read_lock(&dax_srcu);
 	switch (pe_size) {
 	case PE_SIZE_PTE:
-		rc = __dax_dev_pte_fault(dax_dev, vmf);
+		rc = __dev_dax_pte_fault(dev_dax, vmf);
 		break;
 	case PE_SIZE_PMD:
-		rc = __dax_dev_pmd_fault(dax_dev, vmf);
+		rc = __dev_dax_pmd_fault(dev_dax, vmf);
 		break;
 	case PE_SIZE_PUD:
-		rc = __dax_dev_pud_fault(dax_dev, vmf);
+		rc = __dev_dax_pud_fault(dev_dax, vmf);
 		break;
 	default:
 		rc = VM_FAULT_SIGBUS;
@@ -601,28 +601,28 @@ static int dax_dev_huge_fault(struct vm_fault *vmf,
 	return rc;
 }
 
-static int dax_dev_fault(struct vm_fault *vmf)
+static int dev_dax_fault(struct vm_fault *vmf)
 {
-	return dax_dev_huge_fault(vmf, PE_SIZE_PTE);
+	return dev_dax_huge_fault(vmf, PE_SIZE_PTE);
 }
 
-static const struct vm_operations_struct dax_dev_vm_ops = {
-	.fault = dax_dev_fault,
-	.huge_fault = dax_dev_huge_fault,
+static const struct vm_operations_struct dax_vm_ops = {
+	.fault = dev_dax_fault,
+	.huge_fault = dev_dax_huge_fault,
 };
 
 static int dax_mmap(struct file *filp, struct vm_area_struct *vma)
 {
-	struct dax_dev *dax_dev = filp->private_data;
+	struct dev_dax *dev_dax = filp->private_data;
 	int rc;
 
-	dev_dbg(&dax_dev->dev, "%s\n", __func__);
+	dev_dbg(&dev_dax->dev, "%s\n", __func__);
 
-	rc = check_vma(dax_dev, vma, __func__);
+	rc = check_vma(dev_dax, vma, __func__);
 	if (rc)
 		return rc;
 
-	vma->vm_ops = &dax_dev_vm_ops;
+	vma->vm_ops = &dax_vm_ops;
 	vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
 	return 0;
 }
@@ -633,13 +633,13 @@ static unsigned long dax_get_unmapped_area(struct file *filp,
 		unsigned long flags)
 {
 	unsigned long off, off_end, off_align, len_align, addr_align, align;
-	struct dax_dev *dax_dev = filp ? filp->private_data : NULL;
+	struct dev_dax *dev_dax = filp ? filp->private_data : NULL;
 	struct dax_region *dax_region;
 
-	if (!dax_dev || addr)
+	if (!dev_dax || addr)
 		goto out;
 
-	dax_region = dax_dev->region;
+	dax_region = dev_dax->region;
 	align = dax_region->align;
 	off = pgoff << PAGE_SHIFT;
 	off_end = off + len;
@@ -664,14 +664,14 @@ static unsigned long dax_get_unmapped_area(struct file *filp,
 
 static int dax_open(struct inode *inode, struct file *filp)
 {
-	struct dax_dev *dax_dev;
+	struct dev_dax *dev_dax;
 
-	dax_dev = container_of(inode->i_cdev, struct dax_dev, cdev);
-	dev_dbg(&dax_dev->dev, "%s\n", __func__);
-	inode->i_mapping = dax_dev->inode->i_mapping;
-	inode->i_mapping->host = dax_dev->inode;
+	dev_dax = container_of(inode->i_cdev, struct dev_dax, cdev);
+	dev_dbg(&dev_dax->dev, "%s\n", __func__);
+	inode->i_mapping = dev_dax->inode->i_mapping;
+	inode->i_mapping->host = dev_dax->inode;
 	filp->f_mapping = inode->i_mapping;
-	filp->private_data = dax_dev;
+	filp->private_data = dev_dax;
 	inode->i_flags = S_DAX;
 
 	return 0;
@@ -679,9 +679,9 @@ static int dax_open(struct inode *inode, struct file *filp)
 
 static int dax_release(struct inode *inode, struct file *filp)
 {
-	struct dax_dev *dax_dev = filp->private_data;
+	struct dev_dax *dev_dax = filp->private_data;
 
-	dev_dbg(&dax_dev->dev, "%s\n", __func__);
+	dev_dbg(&dev_dax->dev, "%s\n", __func__);
 	return 0;
 }
 
@@ -694,55 +694,55 @@ static const struct file_operations dax_fops = {
 	.mmap = dax_mmap,
 };
 
-static void dax_dev_release(struct device *dev)
+static void dev_dax_release(struct device *dev)
 {
-	struct dax_dev *dax_dev = to_dax_dev(dev);
-	struct dax_region *dax_region = dax_dev->region;
+	struct dev_dax *dev_dax = to_dev_dax(dev);
+	struct dax_region *dax_region = dev_dax->region;
 
-	ida_simple_remove(&dax_region->ida, dax_dev->id);
+	ida_simple_remove(&dax_region->ida, dev_dax->id);
 	ida_simple_remove(&dax_minor_ida, MINOR(dev->devt));
 	dax_region_put(dax_region);
-	iput(dax_dev->inode);
-	kfree(dax_dev);
+	iput(dev_dax->inode);
+	kfree(dev_dax);
 }
 
-static void kill_dax_dev(struct dax_dev *dax_dev)
+static void kill_dev_dax(struct dev_dax *dev_dax)
 {
 	/*
-	 * Note, rcu is not protecting the liveness of dax_dev, rcu is
+	 * Note, rcu is not protecting the liveness of dev_dax, rcu is
 	 * ensuring that any fault handlers that might have seen
-	 * dax_dev->alive == true, have completed.  Any fault handlers
+	 * dev_dax->alive == true, have completed.  Any fault handlers
 	 * that start after synchronize_srcu() has started will abort
-	 * upon seeing dax_dev->alive == false.
+	 * upon seeing dev_dax->alive == false.
 	 */
-	dax_dev->alive = false;
+	dev_dax->alive = false;
 	synchronize_srcu(&dax_srcu);
-	unmap_mapping_range(dax_dev->inode->i_mapping, 0, 0, 1);
+	unmap_mapping_range(dev_dax->inode->i_mapping, 0, 0, 1);
 }
 
-static void unregister_dax_dev(void *dev)
+static void unregister_dev_dax(void *dev)
 {
-	struct dax_dev *dax_dev = to_dax_dev(dev);
+	struct dev_dax *dev_dax = to_dev_dax(dev);
 
 	dev_dbg(dev, "%s\n", __func__);
 
-	kill_dax_dev(dax_dev);
-	cdev_device_del(&dax_dev->cdev, dev);
+	kill_dev_dax(dev_dax);
+	cdev_device_del(&dev_dax->cdev, dev);
 	put_device(dev);
 }
 
-struct dax_dev *devm_create_dax_dev(struct dax_region *dax_region,
+struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region,
 		struct resource *res, int count)
 {
 	struct device *parent = dax_region->dev;
-	struct dax_dev *dax_dev;
+	struct dev_dax *dev_dax;
 	int rc = 0, minor, i;
 	struct device *dev;
 	struct cdev *cdev;
 	dev_t dev_t;
 
-	dax_dev = kzalloc(sizeof(*dax_dev) + sizeof(*res) * count, GFP_KERNEL);
-	if (!dax_dev)
+	dev_dax = kzalloc(sizeof(*dev_dax) + sizeof(*res) * count, GFP_KERNEL);
+	if (!dev_dax)
 		return ERR_PTR(-ENOMEM);
 
 	for (i = 0; i < count; i++) {
@@ -752,16 +752,16 @@ struct dax_dev *devm_create_dax_dev(struct dax_region *dax_region,
 			rc = -EINVAL;
 			break;
 		}
-		dax_dev->res[i].start = res[i].start;
-		dax_dev->res[i].end = res[i].end;
+		dev_dax->res[i].start = res[i].start;
+		dev_dax->res[i].end = res[i].end;
 	}
 
 	if (i < count)
 		goto err_id;
 
-	dax_dev->id = ida_simple_get(&dax_region->ida, 0, 0, GFP_KERNEL);
-	if (dax_dev->id < 0) {
-		rc = dax_dev->id;
+	dev_dax->id = ida_simple_get(&dax_region->ida, 0, 0, GFP_KERNEL);
+	if (dev_dax->id < 0) {
+		rc = dev_dax->id;
 		goto err_id;
 	}
 
@@ -772,55 +772,55 @@ struct dax_dev *devm_create_dax_dev(struct dax_region *dax_region,
 	}
 
 	dev_t = MKDEV(MAJOR(dax_devt), minor);
-	dev = &dax_dev->dev;
-	dax_dev->inode = dax_inode_get(&dax_dev->cdev, dev_t);
-	if (!dax_dev->inode) {
+	dev = &dev_dax->dev;
+	dev_dax->inode = dax_inode_get(&dev_dax->cdev, dev_t);
+	if (!dev_dax->inode) {
 		rc = -ENOMEM;
 		goto err_inode;
 	}
 
-	/* from here on we're committed to teardown via dax_dev_release() */
+	/* from here on we're committed to teardown via dev_dax_release() */
 	device_initialize(dev);
 
-	cdev = &dax_dev->cdev;
+	cdev = &dev_dax->cdev;
 	cdev_init(cdev, &dax_fops);
 	cdev->owner = parent->driver->owner;
 
-	dax_dev->num_resources = count;
-	dax_dev->alive = true;
-	dax_dev->region = dax_region;
+	dev_dax->num_resources = count;
+	dev_dax->alive = true;
+	dev_dax->region = dax_region;
 	kref_get(&dax_region->kref);
 
 	dev->devt = dev_t;
 	dev->class = dax_class;
 	dev->parent = parent;
 	dev->groups = dax_attribute_groups;
-	dev->release = dax_dev_release;
-	dev_set_name(dev, "dax%d.%d", dax_region->id, dax_dev->id);
+	dev->release = dev_dax_release;
+	dev_set_name(dev, "dax%d.%d", dax_region->id, dev_dax->id);
 
 	rc = cdev_device_add(cdev, dev);
 	if (rc) {
-		kill_dax_dev(dax_dev);
+		kill_dev_dax(dev_dax);
 		put_device(dev);
 		return ERR_PTR(rc);
 	}
 
-	rc = devm_add_action_or_reset(dax_region->dev, unregister_dax_dev, dev);
+	rc = devm_add_action_or_reset(dax_region->dev, unregister_dev_dax, dev);
 	if (rc)
 		return ERR_PTR(rc);
 
-	return dax_dev;
+	return dev_dax;
 
  err_inode:
 	ida_simple_remove(&dax_minor_ida, minor);
  err_minor:
-	ida_simple_remove(&dax_region->ida, dax_dev->id);
+	ida_simple_remove(&dax_region->ida, dev_dax->id);
  err_id:
-	kfree(dax_dev);
+	kfree(dev_dax);
 
 	return ERR_PTR(rc);
 }
-EXPORT_SYMBOL_GPL(devm_create_dax_dev);
+EXPORT_SYMBOL_GPL(devm_create_dev_dax);
 
 static int __init dax_init(void)
 {
diff --git a/drivers/dax/dax.h b/drivers/dax/dax.h
index ddd829ab58c0..ea176d875d60 100644
--- a/drivers/dax/dax.h
+++ b/drivers/dax/dax.h
@@ -13,13 +13,13 @@
 #ifndef __DAX_H__
 #define __DAX_H__
 struct device;
-struct dax_dev;
+struct dev_dax;
 struct resource;
 struct dax_region;
 void dax_region_put(struct dax_region *dax_region);
 struct dax_region *alloc_dax_region(struct device *parent,
 		int region_id, struct resource *res, unsigned int align,
 		void *addr, unsigned long flags);
-struct dax_dev *devm_create_dax_dev(struct dax_region *dax_region,
+struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region,
 		struct resource *res, int count);
 #endif /* __DAX_H__ */
diff --git a/drivers/dax/pmem.c b/drivers/dax/pmem.c
index 033f49b31fdc..2c736fc4508b 100644
--- a/drivers/dax/pmem.c
+++ b/drivers/dax/pmem.c
@@ -61,8 +61,8 @@ static int dax_pmem_probe(struct device *dev)
 	int rc;
 	void *addr;
 	struct resource res;
-	struct dax_dev *dax_dev;
 	struct nd_pfn_sb *pfn_sb;
+	struct dev_dax *dev_dax;
 	struct dax_pmem *dax_pmem;
 	struct nd_region *nd_region;
 	struct nd_namespace_io *nsio;
@@ -130,12 +130,12 @@ static int dax_pmem_probe(struct device *dev)
 		return -ENOMEM;
 
 	/* TODO: support for subdividing a dax region... */
-	dax_dev = devm_create_dax_dev(dax_region, &res, 1);
+	dev_dax = devm_create_dev_dax(dax_region, &res, 1);
 
-	/* child dax_dev instances now own the lifetime of the dax_region */
+	/* child dev_dax instances now own the lifetime of the dax_region */
 	dax_region_put(dax_region);
 
-	return PTR_ERR_OR_ZERO(dax_dev);
+	return PTR_ERR_OR_ZERO(dev_dax);
 }
 
 static struct nd_device_driver dax_pmem_driver = {
-- 
cgit 


From 7b6be8444e0f0dd675b54d059793423d3c9b4c03 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 11 Apr 2017 09:49:49 -0700
Subject: dax: refactor dax-fs into a generic provider of 'struct dax_device'
 instances

We want dax capable drivers to be able to publish a set of dax
operations [1]. However, we do not want to further abuse block_devices
to advertise these operations. Instead we will attach these operations
to a dax device and add a lookup mechanism to go from block device path
to a dax device. A dax capable driver like pmem or brd is responsible
for registering a dax device, alongside a block device, and then a dax
capable filesystem is responsible for retrieving the dax device by path
name if it wants to call dax_operations.

For now, we refactor the dax pseudo-fs to be a generic facility, rather
than an implementation detail, of the device-dax use case. Where a "dax
device" is just an inode + dax infrastructure, and "Device DAX" is a
mapping service layered on top of that base 'struct dax_device'.
"Filesystem DAX" is then a mapping service that layers a filesystem on
top of that same base device. Filesystem DAX is associated with a
block_device for now, but perhaps directly to a dax device in the
future, or for new pmem-only filesystems.

[1]: https://lkml.org/lkml/2017/1/19/880

Suggested-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/Makefile            |   2 +-
 drivers/dax/Kconfig         |  10 +-
 drivers/dax/Makefile        |   5 +-
 drivers/dax/dax.c           | 864 --------------------------------------------
 drivers/dax/dax.h           |  20 +-
 drivers/dax/device-dax.h    |  25 ++
 drivers/dax/device.c        | 709 ++++++++++++++++++++++++++++++++++++
 drivers/dax/pmem.c          |   2 +-
 drivers/dax/super.c         | 303 ++++++++++++++++
 include/linux/dax.h         |   3 +
 tools/testing/nvdimm/Kbuild |  10 +-
 11 files changed, 1070 insertions(+), 883 deletions(-)
 delete mode 100644 drivers/dax/dax.c
 create mode 100644 drivers/dax/device-dax.h
 create mode 100644 drivers/dax/device.c
 create mode 100644 drivers/dax/super.c

diff --git a/drivers/Makefile b/drivers/Makefile
index 2eced9afba53..0442e982cf35 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -71,7 +71,7 @@ obj-$(CONFIG_PARPORT)		+= parport/
 obj-$(CONFIG_NVM)		+= lightnvm/
 obj-y				+= base/ block/ misc/ mfd/ nfc/
 obj-$(CONFIG_LIBNVDIMM)		+= nvdimm/
-obj-$(CONFIG_DEV_DAX)		+= dax/
+obj-$(CONFIG_DAX)		+= dax/
 obj-$(CONFIG_DMA_SHARED_BUFFER) += dma-buf/
 obj-$(CONFIG_NUBUS)		+= nubus/
 obj-y				+= macintosh/
diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig
index 9e95bf94eb13..b7053eafd88e 100644
--- a/drivers/dax/Kconfig
+++ b/drivers/dax/Kconfig
@@ -1,8 +1,13 @@
-menuconfig DEV_DAX
+menuconfig DAX
 	tristate "DAX: direct access to differentiated memory"
+	select SRCU
 	default m if NVDIMM_DAX
+
+if DAX
+
+config DEV_DAX
+	tristate "Device DAX: direct access mapping device"
 	depends on TRANSPARENT_HUGEPAGE
-	select SRCU
 	help
 	  Support raw access to differentiated (persistence, bandwidth,
 	  latency...) memory via an mmap(2) capable character
@@ -11,7 +16,6 @@ menuconfig DEV_DAX
 	  baseline memory pool.  Mappings of a /dev/daxX.Y device impose
 	  restrictions that make the mapping behavior deterministic.
 
-if DEV_DAX
 
 config DEV_DAX_PMEM
 	tristate "PMEM DAX: direct access to persistent memory"
diff --git a/drivers/dax/Makefile b/drivers/dax/Makefile
index 27c54e38478a..dc7422530462 100644
--- a/drivers/dax/Makefile
+++ b/drivers/dax/Makefile
@@ -1,4 +1,7 @@
-obj-$(CONFIG_DEV_DAX) += dax.o
+obj-$(CONFIG_DAX) += dax.o
+obj-$(CONFIG_DEV_DAX) += device_dax.o
 obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem.o
 
+dax-y := super.o
 dax_pmem-y := pmem.o
+device_dax-y := device.o
diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c
deleted file mode 100644
index 376fdd353aea..000000000000
--- a/drivers/dax/dax.c
+++ /dev/null
@@ -1,864 +0,0 @@
-/*
- * Copyright(c) 2016 Intel Corporation. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- */
-#include <linux/pagemap.h>
-#include <linux/module.h>
-#include <linux/device.h>
-#include <linux/magic.h>
-#include <linux/mount.h>
-#include <linux/pfn_t.h>
-#include <linux/hash.h>
-#include <linux/cdev.h>
-#include <linux/slab.h>
-#include <linux/dax.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include "dax.h"
-
-static dev_t dax_devt;
-DEFINE_STATIC_SRCU(dax_srcu);
-static struct class *dax_class;
-static DEFINE_IDA(dax_minor_ida);
-static int nr_dax = CONFIG_NR_DEV_DAX;
-module_param(nr_dax, int, S_IRUGO);
-static struct vfsmount *dax_mnt;
-static struct kmem_cache *dax_cache __read_mostly;
-static struct super_block *dax_superblock __read_mostly;
-MODULE_PARM_DESC(nr_dax, "max number of device-dax instances");
-
-/**
- * struct dax_region - mapping infrastructure for dax devices
- * @id: kernel-wide unique region for a memory range
- * @base: linear address corresponding to @res
- * @kref: to pin while other agents have a need to do lookups
- * @dev: parent device backing this region
- * @align: allocation and mapping alignment for child dax devices
- * @res: physical address range of the region
- * @pfn_flags: identify whether the pfns are paged back or not
- */
-struct dax_region {
-	int id;
-	struct ida ida;
-	void *base;
-	struct kref kref;
-	struct device *dev;
-	unsigned int align;
-	struct resource res;
-	unsigned long pfn_flags;
-};
-
-/**
- * struct dev_dax - instance data for a subdivision of a dax region
- * @region - parent region
- * @dev - device backing the character device
- * @cdev - core chardev data
- * @alive - !alive + srcu grace period == no new mappings can be established
- * @id - child id in the region
- * @num_resources - number of physical address extents in this device
- * @res - array of physical address ranges
- */
-struct dev_dax {
-	struct dax_region *region;
-	struct inode *inode;
-	struct device dev;
-	struct cdev cdev;
-	bool alive;
-	int id;
-	int num_resources;
-	struct resource res[0];
-};
-
-static ssize_t id_show(struct device *dev,
-		struct device_attribute *attr, char *buf)
-{
-	struct dax_region *dax_region;
-	ssize_t rc = -ENXIO;
-
-	device_lock(dev);
-	dax_region = dev_get_drvdata(dev);
-	if (dax_region)
-		rc = sprintf(buf, "%d\n", dax_region->id);
-	device_unlock(dev);
-
-	return rc;
-}
-static DEVICE_ATTR_RO(id);
-
-static ssize_t region_size_show(struct device *dev,
-		struct device_attribute *attr, char *buf)
-{
-	struct dax_region *dax_region;
-	ssize_t rc = -ENXIO;
-
-	device_lock(dev);
-	dax_region = dev_get_drvdata(dev);
-	if (dax_region)
-		rc = sprintf(buf, "%llu\n", (unsigned long long)
-				resource_size(&dax_region->res));
-	device_unlock(dev);
-
-	return rc;
-}
-static struct device_attribute dev_attr_region_size = __ATTR(size, 0444,
-		region_size_show, NULL);
-
-static ssize_t align_show(struct device *dev,
-		struct device_attribute *attr, char *buf)
-{
-	struct dax_region *dax_region;
-	ssize_t rc = -ENXIO;
-
-	device_lock(dev);
-	dax_region = dev_get_drvdata(dev);
-	if (dax_region)
-		rc = sprintf(buf, "%u\n", dax_region->align);
-	device_unlock(dev);
-
-	return rc;
-}
-static DEVICE_ATTR_RO(align);
-
-static struct attribute *dax_region_attributes[] = {
-	&dev_attr_region_size.attr,
-	&dev_attr_align.attr,
-	&dev_attr_id.attr,
-	NULL,
-};
-
-static const struct attribute_group dax_region_attribute_group = {
-	.name = "dax_region",
-	.attrs = dax_region_attributes,
-};
-
-static const struct attribute_group *dax_region_attribute_groups[] = {
-	&dax_region_attribute_group,
-	NULL,
-};
-
-static struct inode *dax_alloc_inode(struct super_block *sb)
-{
-	return kmem_cache_alloc(dax_cache, GFP_KERNEL);
-}
-
-static void dax_i_callback(struct rcu_head *head)
-{
-	struct inode *inode = container_of(head, struct inode, i_rcu);
-
-	kmem_cache_free(dax_cache, inode);
-}
-
-static void dax_destroy_inode(struct inode *inode)
-{
-	call_rcu(&inode->i_rcu, dax_i_callback);
-}
-
-static const struct super_operations dax_sops = {
-	.statfs = simple_statfs,
-	.alloc_inode = dax_alloc_inode,
-	.destroy_inode = dax_destroy_inode,
-	.drop_inode = generic_delete_inode,
-};
-
-static struct dentry *dax_mount(struct file_system_type *fs_type,
-		int flags, const char *dev_name, void *data)
-{
-	return mount_pseudo(fs_type, "dax:", &dax_sops, NULL, DAXFS_MAGIC);
-}
-
-static struct file_system_type dax_type = {
-	.name = "dax",
-	.mount = dax_mount,
-	.kill_sb = kill_anon_super,
-};
-
-static int dax_test(struct inode *inode, void *data)
-{
-	return inode->i_cdev == data;
-}
-
-static int dax_set(struct inode *inode, void *data)
-{
-	inode->i_cdev = data;
-	return 0;
-}
-
-static struct inode *dax_inode_get(struct cdev *cdev, dev_t devt)
-{
-	struct inode *inode;
-
-	inode = iget5_locked(dax_superblock, hash_32(devt + DAXFS_MAGIC, 31),
-			dax_test, dax_set, cdev);
-
-	if (!inode)
-		return NULL;
-
-	if (inode->i_state & I_NEW) {
-		inode->i_mode = S_IFCHR;
-		inode->i_flags = S_DAX;
-		inode->i_rdev = devt;
-		mapping_set_gfp_mask(&inode->i_data, GFP_USER);
-		unlock_new_inode(inode);
-	}
-	return inode;
-}
-
-static void init_once(void *inode)
-{
-	inode_init_once(inode);
-}
-
-static int dax_inode_init(void)
-{
-	int rc;
-
-	dax_cache = kmem_cache_create("dax_cache", sizeof(struct inode), 0,
-			(SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
-			 SLAB_MEM_SPREAD|SLAB_ACCOUNT),
-			init_once);
-	if (!dax_cache)
-		return -ENOMEM;
-
-	rc = register_filesystem(&dax_type);
-	if (rc)
-		goto err_register_fs;
-
-	dax_mnt = kern_mount(&dax_type);
-	if (IS_ERR(dax_mnt)) {
-		rc = PTR_ERR(dax_mnt);
-		goto err_mount;
-	}
-	dax_superblock = dax_mnt->mnt_sb;
-
-	return 0;
-
- err_mount:
-	unregister_filesystem(&dax_type);
- err_register_fs:
-	kmem_cache_destroy(dax_cache);
-
-	return rc;
-}
-
-static void dax_inode_exit(void)
-{
-	kern_unmount(dax_mnt);
-	unregister_filesystem(&dax_type);
-	kmem_cache_destroy(dax_cache);
-}
-
-static void dax_region_free(struct kref *kref)
-{
-	struct dax_region *dax_region;
-
-	dax_region = container_of(kref, struct dax_region, kref);
-	kfree(dax_region);
-}
-
-void dax_region_put(struct dax_region *dax_region)
-{
-	kref_put(&dax_region->kref, dax_region_free);
-}
-EXPORT_SYMBOL_GPL(dax_region_put);
-
-static void dax_region_unregister(void *region)
-{
-	struct dax_region *dax_region = region;
-
-	sysfs_remove_groups(&dax_region->dev->kobj,
-			dax_region_attribute_groups);
-	dax_region_put(dax_region);
-}
-
-struct dax_region *alloc_dax_region(struct device *parent, int region_id,
-		struct resource *res, unsigned int align, void *addr,
-		unsigned long pfn_flags)
-{
-	struct dax_region *dax_region;
-
-	/*
-	 * The DAX core assumes that it can store its private data in
-	 * parent->driver_data. This WARN is a reminder / safeguard for
-	 * developers of device-dax drivers.
-	 */
-	if (dev_get_drvdata(parent)) {
-		dev_WARN(parent, "dax core failed to setup private data\n");
-		return NULL;
-	}
-
-	if (!IS_ALIGNED(res->start, align)
-			|| !IS_ALIGNED(resource_size(res), align))
-		return NULL;
-
-	dax_region = kzalloc(sizeof(*dax_region), GFP_KERNEL);
-	if (!dax_region)
-		return NULL;
-
-	dev_set_drvdata(parent, dax_region);
-	memcpy(&dax_region->res, res, sizeof(*res));
-	dax_region->pfn_flags = pfn_flags;
-	kref_init(&dax_region->kref);
-	dax_region->id = region_id;
-	ida_init(&dax_region->ida);
-	dax_region->align = align;
-	dax_region->dev = parent;
-	dax_region->base = addr;
-	if (sysfs_create_groups(&parent->kobj, dax_region_attribute_groups)) {
-		kfree(dax_region);
-		return NULL;;
-	}
-
-	kref_get(&dax_region->kref);
-	if (devm_add_action_or_reset(parent, dax_region_unregister, dax_region))
-		return NULL;
-	return dax_region;
-}
-EXPORT_SYMBOL_GPL(alloc_dax_region);
-
-static struct dev_dax *to_dev_dax(struct device *dev)
-{
-	return container_of(dev, struct dev_dax, dev);
-}
-
-static ssize_t size_show(struct device *dev,
-		struct device_attribute *attr, char *buf)
-{
-	struct dev_dax *dev_dax = to_dev_dax(dev);
-	unsigned long long size = 0;
-	int i;
-
-	for (i = 0; i < dev_dax->num_resources; i++)
-		size += resource_size(&dev_dax->res[i]);
-
-	return sprintf(buf, "%llu\n", size);
-}
-static DEVICE_ATTR_RO(size);
-
-static struct attribute *dev_dax_attributes[] = {
-	&dev_attr_size.attr,
-	NULL,
-};
-
-static const struct attribute_group dev_dax_attribute_group = {
-	.attrs = dev_dax_attributes,
-};
-
-static const struct attribute_group *dax_attribute_groups[] = {
-	&dev_dax_attribute_group,
-	NULL,
-};
-
-static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma,
-		const char *func)
-{
-	struct dax_region *dax_region = dev_dax->region;
-	struct device *dev = &dev_dax->dev;
-	unsigned long mask;
-
-	if (!dev_dax->alive)
-		return -ENXIO;
-
-	/* prevent private mappings from being established */
-	if ((vma->vm_flags & VM_MAYSHARE) != VM_MAYSHARE) {
-		dev_info(dev, "%s: %s: fail, attempted private mapping\n",
-				current->comm, func);
-		return -EINVAL;
-	}
-
-	mask = dax_region->align - 1;
-	if (vma->vm_start & mask || vma->vm_end & mask) {
-		dev_info(dev, "%s: %s: fail, unaligned vma (%#lx - %#lx, %#lx)\n",
-				current->comm, func, vma->vm_start, vma->vm_end,
-				mask);
-		return -EINVAL;
-	}
-
-	if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) == PFN_DEV
-			&& (vma->vm_flags & VM_DONTCOPY) == 0) {
-		dev_info(dev, "%s: %s: fail, dax range requires MADV_DONTFORK\n",
-				current->comm, func);
-		return -EINVAL;
-	}
-
-	if (!vma_is_dax(vma)) {
-		dev_info(dev, "%s: %s: fail, vma is not DAX capable\n",
-				current->comm, func);
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
-static phys_addr_t pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff,
-		unsigned long size)
-{
-	struct resource *res;
-	phys_addr_t phys;
-	int i;
-
-	for (i = 0; i < dev_dax->num_resources; i++) {
-		res = &dev_dax->res[i];
-		phys = pgoff * PAGE_SIZE + res->start;
-		if (phys >= res->start && phys <= res->end)
-			break;
-		pgoff -= PHYS_PFN(resource_size(res));
-	}
-
-	if (i < dev_dax->num_resources) {
-		res = &dev_dax->res[i];
-		if (phys + size - 1 <= res->end)
-			return phys;
-	}
-
-	return -1;
-}
-
-static int __dev_dax_pte_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
-{
-	struct device *dev = &dev_dax->dev;
-	struct dax_region *dax_region;
-	int rc = VM_FAULT_SIGBUS;
-	phys_addr_t phys;
-	pfn_t pfn;
-	unsigned int fault_size = PAGE_SIZE;
-
-	if (check_vma(dev_dax, vmf->vma, __func__))
-		return VM_FAULT_SIGBUS;
-
-	dax_region = dev_dax->region;
-	if (dax_region->align > PAGE_SIZE) {
-		dev_dbg(dev, "%s: alignment (%#x) > fault size (%#x)\n",
-			__func__, dax_region->align, fault_size);
-		return VM_FAULT_SIGBUS;
-	}
-
-	if (fault_size != dax_region->align)
-		return VM_FAULT_SIGBUS;
-
-	phys = pgoff_to_phys(dev_dax, vmf->pgoff, PAGE_SIZE);
-	if (phys == -1) {
-		dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__,
-				vmf->pgoff);
-		return VM_FAULT_SIGBUS;
-	}
-
-	pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
-
-	rc = vm_insert_mixed(vmf->vma, vmf->address, pfn);
-
-	if (rc == -ENOMEM)
-		return VM_FAULT_OOM;
-	if (rc < 0 && rc != -EBUSY)
-		return VM_FAULT_SIGBUS;
-
-	return VM_FAULT_NOPAGE;
-}
-
-static int __dev_dax_pmd_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
-{
-	unsigned long pmd_addr = vmf->address & PMD_MASK;
-	struct device *dev = &dev_dax->dev;
-	struct dax_region *dax_region;
-	phys_addr_t phys;
-	pgoff_t pgoff;
-	pfn_t pfn;
-	unsigned int fault_size = PMD_SIZE;
-
-	if (check_vma(dev_dax, vmf->vma, __func__))
-		return VM_FAULT_SIGBUS;
-
-	dax_region = dev_dax->region;
-	if (dax_region->align > PMD_SIZE) {
-		dev_dbg(dev, "%s: alignment (%#x) > fault size (%#x)\n",
-			__func__, dax_region->align, fault_size);
-		return VM_FAULT_SIGBUS;
-	}
-
-	/* dax pmd mappings require pfn_t_devmap() */
-	if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) {
-		dev_dbg(dev, "%s: region lacks devmap flags\n", __func__);
-		return VM_FAULT_SIGBUS;
-	}
-
-	if (fault_size < dax_region->align)
-		return VM_FAULT_SIGBUS;
-	else if (fault_size > dax_region->align)
-		return VM_FAULT_FALLBACK;
-
-	/* if we are outside of the VMA */
-	if (pmd_addr < vmf->vma->vm_start ||
-			(pmd_addr + PMD_SIZE) > vmf->vma->vm_end)
-		return VM_FAULT_SIGBUS;
-
-	pgoff = linear_page_index(vmf->vma, pmd_addr);
-	phys = pgoff_to_phys(dev_dax, pgoff, PMD_SIZE);
-	if (phys == -1) {
-		dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__,
-				pgoff);
-		return VM_FAULT_SIGBUS;
-	}
-
-	pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
-
-	return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, pfn,
-			vmf->flags & FAULT_FLAG_WRITE);
-}
-
-#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
-static int __dev_dax_pud_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
-{
-	unsigned long pud_addr = vmf->address & PUD_MASK;
-	struct device *dev = &dev_dax->dev;
-	struct dax_region *dax_region;
-	phys_addr_t phys;
-	pgoff_t pgoff;
-	pfn_t pfn;
-	unsigned int fault_size = PUD_SIZE;
-
-
-	if (check_vma(dev_dax, vmf->vma, __func__))
-		return VM_FAULT_SIGBUS;
-
-	dax_region = dev_dax->region;
-	if (dax_region->align > PUD_SIZE) {
-		dev_dbg(dev, "%s: alignment (%#x) > fault size (%#x)\n",
-			__func__, dax_region->align, fault_size);
-		return VM_FAULT_SIGBUS;
-	}
-
-	/* dax pud mappings require pfn_t_devmap() */
-	if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) {
-		dev_dbg(dev, "%s: region lacks devmap flags\n", __func__);
-		return VM_FAULT_SIGBUS;
-	}
-
-	if (fault_size < dax_region->align)
-		return VM_FAULT_SIGBUS;
-	else if (fault_size > dax_region->align)
-		return VM_FAULT_FALLBACK;
-
-	/* if we are outside of the VMA */
-	if (pud_addr < vmf->vma->vm_start ||
-			(pud_addr + PUD_SIZE) > vmf->vma->vm_end)
-		return VM_FAULT_SIGBUS;
-
-	pgoff = linear_page_index(vmf->vma, pud_addr);
-	phys = pgoff_to_phys(dev_dax, pgoff, PUD_SIZE);
-	if (phys == -1) {
-		dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__,
-				pgoff);
-		return VM_FAULT_SIGBUS;
-	}
-
-	pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
-
-	return vmf_insert_pfn_pud(vmf->vma, vmf->address, vmf->pud, pfn,
-			vmf->flags & FAULT_FLAG_WRITE);
-}
-#else
-static int __dev_dax_pud_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
-{
-	return VM_FAULT_FALLBACK;
-}
-#endif /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
-
-static int dev_dax_huge_fault(struct vm_fault *vmf,
-		enum page_entry_size pe_size)
-{
-	int rc, id;
-	struct file *filp = vmf->vma->vm_file;
-	struct dev_dax *dev_dax = filp->private_data;
-
-	dev_dbg(&dev_dax->dev, "%s: %s: %s (%#lx - %#lx) size = %d\n", __func__,
-			current->comm, (vmf->flags & FAULT_FLAG_WRITE)
-			? "write" : "read",
-			vmf->vma->vm_start, vmf->vma->vm_end, pe_size);
-
-	id = srcu_read_lock(&dax_srcu);
-	switch (pe_size) {
-	case PE_SIZE_PTE:
-		rc = __dev_dax_pte_fault(dev_dax, vmf);
-		break;
-	case PE_SIZE_PMD:
-		rc = __dev_dax_pmd_fault(dev_dax, vmf);
-		break;
-	case PE_SIZE_PUD:
-		rc = __dev_dax_pud_fault(dev_dax, vmf);
-		break;
-	default:
-		rc = VM_FAULT_SIGBUS;
-	}
-	srcu_read_unlock(&dax_srcu, id);
-
-	return rc;
-}
-
-static int dev_dax_fault(struct vm_fault *vmf)
-{
-	return dev_dax_huge_fault(vmf, PE_SIZE_PTE);
-}
-
-static const struct vm_operations_struct dax_vm_ops = {
-	.fault = dev_dax_fault,
-	.huge_fault = dev_dax_huge_fault,
-};
-
-static int dax_mmap(struct file *filp, struct vm_area_struct *vma)
-{
-	struct dev_dax *dev_dax = filp->private_data;
-	int rc;
-
-	dev_dbg(&dev_dax->dev, "%s\n", __func__);
-
-	rc = check_vma(dev_dax, vma, __func__);
-	if (rc)
-		return rc;
-
-	vma->vm_ops = &dax_vm_ops;
-	vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
-	return 0;
-}
-
-/* return an unmapped area aligned to the dax region specified alignment */
-static unsigned long dax_get_unmapped_area(struct file *filp,
-		unsigned long addr, unsigned long len, unsigned long pgoff,
-		unsigned long flags)
-{
-	unsigned long off, off_end, off_align, len_align, addr_align, align;
-	struct dev_dax *dev_dax = filp ? filp->private_data : NULL;
-	struct dax_region *dax_region;
-
-	if (!dev_dax || addr)
-		goto out;
-
-	dax_region = dev_dax->region;
-	align = dax_region->align;
-	off = pgoff << PAGE_SHIFT;
-	off_end = off + len;
-	off_align = round_up(off, align);
-
-	if ((off_end <= off_align) || ((off_end - off_align) < align))
-		goto out;
-
-	len_align = len + align;
-	if ((off + len_align) < off)
-		goto out;
-
-	addr_align = current->mm->get_unmapped_area(filp, addr, len_align,
-			pgoff, flags);
-	if (!IS_ERR_VALUE(addr_align)) {
-		addr_align += (off - addr_align) & (align - 1);
-		return addr_align;
-	}
- out:
-	return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
-}
-
-static int dax_open(struct inode *inode, struct file *filp)
-{
-	struct dev_dax *dev_dax;
-
-	dev_dax = container_of(inode->i_cdev, struct dev_dax, cdev);
-	dev_dbg(&dev_dax->dev, "%s\n", __func__);
-	inode->i_mapping = dev_dax->inode->i_mapping;
-	inode->i_mapping->host = dev_dax->inode;
-	filp->f_mapping = inode->i_mapping;
-	filp->private_data = dev_dax;
-	inode->i_flags = S_DAX;
-
-	return 0;
-}
-
-static int dax_release(struct inode *inode, struct file *filp)
-{
-	struct dev_dax *dev_dax = filp->private_data;
-
-	dev_dbg(&dev_dax->dev, "%s\n", __func__);
-	return 0;
-}
-
-static const struct file_operations dax_fops = {
-	.llseek = noop_llseek,
-	.owner = THIS_MODULE,
-	.open = dax_open,
-	.release = dax_release,
-	.get_unmapped_area = dax_get_unmapped_area,
-	.mmap = dax_mmap,
-};
-
-static void dev_dax_release(struct device *dev)
-{
-	struct dev_dax *dev_dax = to_dev_dax(dev);
-	struct dax_region *dax_region = dev_dax->region;
-
-	ida_simple_remove(&dax_region->ida, dev_dax->id);
-	ida_simple_remove(&dax_minor_ida, MINOR(dev->devt));
-	dax_region_put(dax_region);
-	iput(dev_dax->inode);
-	kfree(dev_dax);
-}
-
-static void kill_dev_dax(struct dev_dax *dev_dax)
-{
-	/*
-	 * Note, rcu is not protecting the liveness of dev_dax, rcu is
-	 * ensuring that any fault handlers that might have seen
-	 * dev_dax->alive == true, have completed.  Any fault handlers
-	 * that start after synchronize_srcu() has started will abort
-	 * upon seeing dev_dax->alive == false.
-	 */
-	dev_dax->alive = false;
-	synchronize_srcu(&dax_srcu);
-	unmap_mapping_range(dev_dax->inode->i_mapping, 0, 0, 1);
-}
-
-static void unregister_dev_dax(void *dev)
-{
-	struct dev_dax *dev_dax = to_dev_dax(dev);
-
-	dev_dbg(dev, "%s\n", __func__);
-
-	kill_dev_dax(dev_dax);
-	cdev_device_del(&dev_dax->cdev, dev);
-	put_device(dev);
-}
-
-struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region,
-		struct resource *res, int count)
-{
-	struct device *parent = dax_region->dev;
-	struct dev_dax *dev_dax;
-	int rc = 0, minor, i;
-	struct device *dev;
-	struct cdev *cdev;
-	dev_t dev_t;
-
-	dev_dax = kzalloc(sizeof(*dev_dax) + sizeof(*res) * count, GFP_KERNEL);
-	if (!dev_dax)
-		return ERR_PTR(-ENOMEM);
-
-	for (i = 0; i < count; i++) {
-		if (!IS_ALIGNED(res[i].start, dax_region->align)
-				|| !IS_ALIGNED(resource_size(&res[i]),
-					dax_region->align)) {
-			rc = -EINVAL;
-			break;
-		}
-		dev_dax->res[i].start = res[i].start;
-		dev_dax->res[i].end = res[i].end;
-	}
-
-	if (i < count)
-		goto err_id;
-
-	dev_dax->id = ida_simple_get(&dax_region->ida, 0, 0, GFP_KERNEL);
-	if (dev_dax->id < 0) {
-		rc = dev_dax->id;
-		goto err_id;
-	}
-
-	minor = ida_simple_get(&dax_minor_ida, 0, 0, GFP_KERNEL);
-	if (minor < 0) {
-		rc = minor;
-		goto err_minor;
-	}
-
-	dev_t = MKDEV(MAJOR(dax_devt), minor);
-	dev = &dev_dax->dev;
-	dev_dax->inode = dax_inode_get(&dev_dax->cdev, dev_t);
-	if (!dev_dax->inode) {
-		rc = -ENOMEM;
-		goto err_inode;
-	}
-
-	/* from here on we're committed to teardown via dev_dax_release() */
-	device_initialize(dev);
-
-	cdev = &dev_dax->cdev;
-	cdev_init(cdev, &dax_fops);
-	cdev->owner = parent->driver->owner;
-
-	dev_dax->num_resources = count;
-	dev_dax->alive = true;
-	dev_dax->region = dax_region;
-	kref_get(&dax_region->kref);
-
-	dev->devt = dev_t;
-	dev->class = dax_class;
-	dev->parent = parent;
-	dev->groups = dax_attribute_groups;
-	dev->release = dev_dax_release;
-	dev_set_name(dev, "dax%d.%d", dax_region->id, dev_dax->id);
-
-	rc = cdev_device_add(cdev, dev);
-	if (rc) {
-		kill_dev_dax(dev_dax);
-		put_device(dev);
-		return ERR_PTR(rc);
-	}
-
-	rc = devm_add_action_or_reset(dax_region->dev, unregister_dev_dax, dev);
-	if (rc)
-		return ERR_PTR(rc);
-
-	return dev_dax;
-
- err_inode:
-	ida_simple_remove(&dax_minor_ida, minor);
- err_minor:
-	ida_simple_remove(&dax_region->ida, dev_dax->id);
- err_id:
-	kfree(dev_dax);
-
-	return ERR_PTR(rc);
-}
-EXPORT_SYMBOL_GPL(devm_create_dev_dax);
-
-static int __init dax_init(void)
-{
-	int rc;
-
-	rc = dax_inode_init();
-	if (rc)
-		return rc;
-
-	nr_dax = max(nr_dax, 256);
-	rc = alloc_chrdev_region(&dax_devt, 0, nr_dax, "dax");
-	if (rc)
-		goto err_chrdev;
-
-	dax_class = class_create(THIS_MODULE, "dax");
-	if (IS_ERR(dax_class)) {
-		rc = PTR_ERR(dax_class);
-		goto err_class;
-	}
-
-	return 0;
-
- err_class:
-	unregister_chrdev_region(dax_devt, nr_dax);
- err_chrdev:
-	dax_inode_exit();
-	return rc;
-}
-
-static void __exit dax_exit(void)
-{
-	class_destroy(dax_class);
-	unregister_chrdev_region(dax_devt, nr_dax);
-	ida_destroy(&dax_minor_ida);
-	dax_inode_exit();
-}
-
-MODULE_AUTHOR("Intel Corporation");
-MODULE_LICENSE("GPL v2");
-subsys_initcall(dax_init);
-module_exit(dax_exit);
diff --git a/drivers/dax/dax.h b/drivers/dax/dax.h
index ea176d875d60..2472d9da96db 100644
--- a/drivers/dax/dax.h
+++ b/drivers/dax/dax.h
@@ -1,5 +1,5 @@
 /*
- * Copyright(c) 2016 Intel Corporation. All rights reserved.
+ * Copyright(c) 2016 - 2017 Intel Corporation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of version 2 of the GNU General Public License as
@@ -12,14 +12,12 @@
  */
 #ifndef __DAX_H__
 #define __DAX_H__
-struct device;
-struct dev_dax;
-struct resource;
-struct dax_region;
-void dax_region_put(struct dax_region *dax_region);
-struct dax_region *alloc_dax_region(struct device *parent,
-		int region_id, struct resource *res, unsigned int align,
-		void *addr, unsigned long flags);
-struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region,
-		struct resource *res, int count);
+struct dax_device;
+struct dax_device *alloc_dax(void *private);
+void put_dax(struct dax_device *dax_dev);
+bool dax_alive(struct dax_device *dax_dev);
+void kill_dax(struct dax_device *dax_dev);
+struct dax_device *inode_dax(struct inode *inode);
+struct inode *dax_inode(struct dax_device *dax_dev);
+void *dax_get_private(struct dax_device *dax_dev);
 #endif /* __DAX_H__ */
diff --git a/drivers/dax/device-dax.h b/drivers/dax/device-dax.h
new file mode 100644
index 000000000000..fdcd9769ffde
--- /dev/null
+++ b/drivers/dax/device-dax.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright(c) 2016 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#ifndef __DEVICE_DAX_H__
+#define __DEVICE_DAX_H__
+struct device;
+struct dev_dax;
+struct resource;
+struct dax_region;
+void dax_region_put(struct dax_region *dax_region);
+struct dax_region *alloc_dax_region(struct device *parent,
+		int region_id, struct resource *res, unsigned int align,
+		void *addr, unsigned long flags);
+struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region,
+		struct resource *res, int count);
+#endif /* __DEVICE_DAX_H__ */
diff --git a/drivers/dax/device.c b/drivers/dax/device.c
new file mode 100644
index 000000000000..19a42edbfa03
--- /dev/null
+++ b/drivers/dax/device.c
@@ -0,0 +1,709 @@
+/*
+ * Copyright(c) 2016 - 2017 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#include <linux/pagemap.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/pfn_t.h>
+#include <linux/cdev.h>
+#include <linux/slab.h>
+#include <linux/dax.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include "dax.h"
+
+static struct class *dax_class;
+
+/**
+ * struct dax_region - mapping infrastructure for dax devices
+ * @id: kernel-wide unique region for a memory range
+ * @base: linear address corresponding to @res
+ * @kref: to pin while other agents have a need to do lookups
+ * @dev: parent device backing this region
+ * @align: allocation and mapping alignment for child dax devices
+ * @res: physical address range of the region
+ * @pfn_flags: identify whether the pfns are paged back or not
+ */
+struct dax_region {
+	int id;
+	struct ida ida;
+	void *base;
+	struct kref kref;
+	struct device *dev;
+	unsigned int align;
+	struct resource res;
+	unsigned long pfn_flags;
+};
+
+/**
+ * struct dev_dax - instance data for a subdivision of a dax region
+ * @region - parent region
+ * @dax_dev - core dax functionality
+ * @dev - device core
+ * @id - child id in the region
+ * @num_resources - number of physical address extents in this device
+ * @res - array of physical address ranges
+ */
+struct dev_dax {
+	struct dax_region *region;
+	struct dax_device *dax_dev;
+	struct device dev;
+	int id;
+	int num_resources;
+	struct resource res[0];
+};
+
+static ssize_t id_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct dax_region *dax_region;
+	ssize_t rc = -ENXIO;
+
+	device_lock(dev);
+	dax_region = dev_get_drvdata(dev);
+	if (dax_region)
+		rc = sprintf(buf, "%d\n", dax_region->id);
+	device_unlock(dev);
+
+	return rc;
+}
+static DEVICE_ATTR_RO(id);
+
+static ssize_t region_size_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct dax_region *dax_region;
+	ssize_t rc = -ENXIO;
+
+	device_lock(dev);
+	dax_region = dev_get_drvdata(dev);
+	if (dax_region)
+		rc = sprintf(buf, "%llu\n", (unsigned long long)
+				resource_size(&dax_region->res));
+	device_unlock(dev);
+
+	return rc;
+}
+static struct device_attribute dev_attr_region_size = __ATTR(size, 0444,
+		region_size_show, NULL);
+
+static ssize_t align_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct dax_region *dax_region;
+	ssize_t rc = -ENXIO;
+
+	device_lock(dev);
+	dax_region = dev_get_drvdata(dev);
+	if (dax_region)
+		rc = sprintf(buf, "%u\n", dax_region->align);
+	device_unlock(dev);
+
+	return rc;
+}
+static DEVICE_ATTR_RO(align);
+
+static struct attribute *dax_region_attributes[] = {
+	&dev_attr_region_size.attr,
+	&dev_attr_align.attr,
+	&dev_attr_id.attr,
+	NULL,
+};
+
+static const struct attribute_group dax_region_attribute_group = {
+	.name = "dax_region",
+	.attrs = dax_region_attributes,
+};
+
+static const struct attribute_group *dax_region_attribute_groups[] = {
+	&dax_region_attribute_group,
+	NULL,
+};
+
+static void dax_region_free(struct kref *kref)
+{
+	struct dax_region *dax_region;
+
+	dax_region = container_of(kref, struct dax_region, kref);
+	kfree(dax_region);
+}
+
+void dax_region_put(struct dax_region *dax_region)
+{
+	kref_put(&dax_region->kref, dax_region_free);
+}
+EXPORT_SYMBOL_GPL(dax_region_put);
+
+static void dax_region_unregister(void *region)
+{
+	struct dax_region *dax_region = region;
+
+	sysfs_remove_groups(&dax_region->dev->kobj,
+			dax_region_attribute_groups);
+	dax_region_put(dax_region);
+}
+
+struct dax_region *alloc_dax_region(struct device *parent, int region_id,
+		struct resource *res, unsigned int align, void *addr,
+		unsigned long pfn_flags)
+{
+	struct dax_region *dax_region;
+
+	/*
+	 * The DAX core assumes that it can store its private data in
+	 * parent->driver_data. This WARN is a reminder / safeguard for
+	 * developers of device-dax drivers.
+	 */
+	if (dev_get_drvdata(parent)) {
+		dev_WARN(parent, "dax core failed to setup private data\n");
+		return NULL;
+	}
+
+	if (!IS_ALIGNED(res->start, align)
+			|| !IS_ALIGNED(resource_size(res), align))
+		return NULL;
+
+	dax_region = kzalloc(sizeof(*dax_region), GFP_KERNEL);
+	if (!dax_region)
+		return NULL;
+
+	dev_set_drvdata(parent, dax_region);
+	memcpy(&dax_region->res, res, sizeof(*res));
+	dax_region->pfn_flags = pfn_flags;
+	kref_init(&dax_region->kref);
+	dax_region->id = region_id;
+	ida_init(&dax_region->ida);
+	dax_region->align = align;
+	dax_region->dev = parent;
+	dax_region->base = addr;
+	if (sysfs_create_groups(&parent->kobj, dax_region_attribute_groups)) {
+		kfree(dax_region);
+		return NULL;;
+	}
+
+	kref_get(&dax_region->kref);
+	if (devm_add_action_or_reset(parent, dax_region_unregister, dax_region))
+		return NULL;
+	return dax_region;
+}
+EXPORT_SYMBOL_GPL(alloc_dax_region);
+
+static struct dev_dax *to_dev_dax(struct device *dev)
+{
+	return container_of(dev, struct dev_dax, dev);
+}
+
+static ssize_t size_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct dev_dax *dev_dax = to_dev_dax(dev);
+	unsigned long long size = 0;
+	int i;
+
+	for (i = 0; i < dev_dax->num_resources; i++)
+		size += resource_size(&dev_dax->res[i]);
+
+	return sprintf(buf, "%llu\n", size);
+}
+static DEVICE_ATTR_RO(size);
+
+static struct attribute *dev_dax_attributes[] = {
+	&dev_attr_size.attr,
+	NULL,
+};
+
+static const struct attribute_group dev_dax_attribute_group = {
+	.attrs = dev_dax_attributes,
+};
+
+static const struct attribute_group *dax_attribute_groups[] = {
+	&dev_dax_attribute_group,
+	NULL,
+};
+
+static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma,
+		const char *func)
+{
+	struct dax_region *dax_region = dev_dax->region;
+	struct device *dev = &dev_dax->dev;
+	unsigned long mask;
+
+	if (!dax_alive(dev_dax->dax_dev))
+		return -ENXIO;
+
+	/* prevent private mappings from being established */
+	if ((vma->vm_flags & VM_MAYSHARE) != VM_MAYSHARE) {
+		dev_info(dev, "%s: %s: fail, attempted private mapping\n",
+				current->comm, func);
+		return -EINVAL;
+	}
+
+	mask = dax_region->align - 1;
+	if (vma->vm_start & mask || vma->vm_end & mask) {
+		dev_info(dev, "%s: %s: fail, unaligned vma (%#lx - %#lx, %#lx)\n",
+				current->comm, func, vma->vm_start, vma->vm_end,
+				mask);
+		return -EINVAL;
+	}
+
+	if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) == PFN_DEV
+			&& (vma->vm_flags & VM_DONTCOPY) == 0) {
+		dev_info(dev, "%s: %s: fail, dax range requires MADV_DONTFORK\n",
+				current->comm, func);
+		return -EINVAL;
+	}
+
+	if (!vma_is_dax(vma)) {
+		dev_info(dev, "%s: %s: fail, vma is not DAX capable\n",
+				current->comm, func);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static phys_addr_t pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff,
+		unsigned long size)
+{
+	struct resource *res;
+	phys_addr_t phys;
+	int i;
+
+	for (i = 0; i < dev_dax->num_resources; i++) {
+		res = &dev_dax->res[i];
+		phys = pgoff * PAGE_SIZE + res->start;
+		if (phys >= res->start && phys <= res->end)
+			break;
+		pgoff -= PHYS_PFN(resource_size(res));
+	}
+
+	if (i < dev_dax->num_resources) {
+		res = &dev_dax->res[i];
+		if (phys + size - 1 <= res->end)
+			return phys;
+	}
+
+	return -1;
+}
+
+static int __dev_dax_pte_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
+{
+	struct device *dev = &dev_dax->dev;
+	struct dax_region *dax_region;
+	int rc = VM_FAULT_SIGBUS;
+	phys_addr_t phys;
+	pfn_t pfn;
+	unsigned int fault_size = PAGE_SIZE;
+
+	if (check_vma(dev_dax, vmf->vma, __func__))
+		return VM_FAULT_SIGBUS;
+
+	dax_region = dev_dax->region;
+	if (dax_region->align > PAGE_SIZE) {
+		dev_dbg(dev, "%s: alignment (%#x) > fault size (%#x)\n",
+			__func__, dax_region->align, fault_size);
+		return VM_FAULT_SIGBUS;
+	}
+
+	if (fault_size != dax_region->align)
+		return VM_FAULT_SIGBUS;
+
+	phys = pgoff_to_phys(dev_dax, vmf->pgoff, PAGE_SIZE);
+	if (phys == -1) {
+		dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__,
+				vmf->pgoff);
+		return VM_FAULT_SIGBUS;
+	}
+
+	pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
+
+	rc = vm_insert_mixed(vmf->vma, vmf->address, pfn);
+
+	if (rc == -ENOMEM)
+		return VM_FAULT_OOM;
+	if (rc < 0 && rc != -EBUSY)
+		return VM_FAULT_SIGBUS;
+
+	return VM_FAULT_NOPAGE;
+}
+
+static int __dev_dax_pmd_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
+{
+	unsigned long pmd_addr = vmf->address & PMD_MASK;
+	struct device *dev = &dev_dax->dev;
+	struct dax_region *dax_region;
+	phys_addr_t phys;
+	pgoff_t pgoff;
+	pfn_t pfn;
+	unsigned int fault_size = PMD_SIZE;
+
+	if (check_vma(dev_dax, vmf->vma, __func__))
+		return VM_FAULT_SIGBUS;
+
+	dax_region = dev_dax->region;
+	if (dax_region->align > PMD_SIZE) {
+		dev_dbg(dev, "%s: alignment (%#x) > fault size (%#x)\n",
+			__func__, dax_region->align, fault_size);
+		return VM_FAULT_SIGBUS;
+	}
+
+	/* dax pmd mappings require pfn_t_devmap() */
+	if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) {
+		dev_dbg(dev, "%s: region lacks devmap flags\n", __func__);
+		return VM_FAULT_SIGBUS;
+	}
+
+	if (fault_size < dax_region->align)
+		return VM_FAULT_SIGBUS;
+	else if (fault_size > dax_region->align)
+		return VM_FAULT_FALLBACK;
+
+	/* if we are outside of the VMA */
+	if (pmd_addr < vmf->vma->vm_start ||
+			(pmd_addr + PMD_SIZE) > vmf->vma->vm_end)
+		return VM_FAULT_SIGBUS;
+
+	pgoff = linear_page_index(vmf->vma, pmd_addr);
+	phys = pgoff_to_phys(dev_dax, pgoff, PMD_SIZE);
+	if (phys == -1) {
+		dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__,
+				pgoff);
+		return VM_FAULT_SIGBUS;
+	}
+
+	pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
+
+	return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, pfn,
+			vmf->flags & FAULT_FLAG_WRITE);
+}
+
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+static int __dev_dax_pud_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
+{
+	unsigned long pud_addr = vmf->address & PUD_MASK;
+	struct device *dev = &dev_dax->dev;
+	struct dax_region *dax_region;
+	phys_addr_t phys;
+	pgoff_t pgoff;
+	pfn_t pfn;
+	unsigned int fault_size = PUD_SIZE;
+
+
+	if (check_vma(dev_dax, vmf->vma, __func__))
+		return VM_FAULT_SIGBUS;
+
+	dax_region = dev_dax->region;
+	if (dax_region->align > PUD_SIZE) {
+		dev_dbg(dev, "%s: alignment (%#x) > fault size (%#x)\n",
+			__func__, dax_region->align, fault_size);
+		return VM_FAULT_SIGBUS;
+	}
+
+	/* dax pud mappings require pfn_t_devmap() */
+	if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) {
+		dev_dbg(dev, "%s: region lacks devmap flags\n", __func__);
+		return VM_FAULT_SIGBUS;
+	}
+
+	if (fault_size < dax_region->align)
+		return VM_FAULT_SIGBUS;
+	else if (fault_size > dax_region->align)
+		return VM_FAULT_FALLBACK;
+
+	/* if we are outside of the VMA */
+	if (pud_addr < vmf->vma->vm_start ||
+			(pud_addr + PUD_SIZE) > vmf->vma->vm_end)
+		return VM_FAULT_SIGBUS;
+
+	pgoff = linear_page_index(vmf->vma, pud_addr);
+	phys = pgoff_to_phys(dev_dax, pgoff, PUD_SIZE);
+	if (phys == -1) {
+		dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__,
+				pgoff);
+		return VM_FAULT_SIGBUS;
+	}
+
+	pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
+
+	return vmf_insert_pfn_pud(vmf->vma, vmf->address, vmf->pud, pfn,
+			vmf->flags & FAULT_FLAG_WRITE);
+}
+#else
+static int __dev_dax_pud_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
+{
+	return VM_FAULT_FALLBACK;
+}
+#endif /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+
+static int dev_dax_huge_fault(struct vm_fault *vmf,
+		enum page_entry_size pe_size)
+{
+	int rc, id;
+	struct file *filp = vmf->vma->vm_file;
+	struct dev_dax *dev_dax = filp->private_data;
+
+	dev_dbg(&dev_dax->dev, "%s: %s: %s (%#lx - %#lx) size = %d\n", __func__,
+			current->comm, (vmf->flags & FAULT_FLAG_WRITE)
+			? "write" : "read",
+			vmf->vma->vm_start, vmf->vma->vm_end, pe_size);
+
+	id = dax_read_lock();
+	switch (pe_size) {
+	case PE_SIZE_PTE:
+		rc = __dev_dax_pte_fault(dev_dax, vmf);
+		break;
+	case PE_SIZE_PMD:
+		rc = __dev_dax_pmd_fault(dev_dax, vmf);
+		break;
+	case PE_SIZE_PUD:
+		rc = __dev_dax_pud_fault(dev_dax, vmf);
+		break;
+	default:
+		rc = VM_FAULT_SIGBUS;
+	}
+	dax_read_unlock(id);
+
+	return rc;
+}
+
+static int dev_dax_fault(struct vm_fault *vmf)
+{
+	return dev_dax_huge_fault(vmf, PE_SIZE_PTE);
+}
+
+static const struct vm_operations_struct dax_vm_ops = {
+	.fault = dev_dax_fault,
+	.huge_fault = dev_dax_huge_fault,
+};
+
+static int dax_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	struct dev_dax *dev_dax = filp->private_data;
+	int rc, id;
+
+	dev_dbg(&dev_dax->dev, "%s\n", __func__);
+
+	/*
+	 * We lock to check dax_dev liveness and will re-check at
+	 * fault time.
+	 */
+	id = dax_read_lock();
+	rc = check_vma(dev_dax, vma, __func__);
+	dax_read_unlock(id);
+	if (rc)
+		return rc;
+
+	vma->vm_ops = &dax_vm_ops;
+	vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
+	return 0;
+}
+
+/* return an unmapped area aligned to the dax region specified alignment */
+static unsigned long dax_get_unmapped_area(struct file *filp,
+		unsigned long addr, unsigned long len, unsigned long pgoff,
+		unsigned long flags)
+{
+	unsigned long off, off_end, off_align, len_align, addr_align, align;
+	struct dev_dax *dev_dax = filp ? filp->private_data : NULL;
+	struct dax_region *dax_region;
+
+	if (!dev_dax || addr)
+		goto out;
+
+	dax_region = dev_dax->region;
+	align = dax_region->align;
+	off = pgoff << PAGE_SHIFT;
+	off_end = off + len;
+	off_align = round_up(off, align);
+
+	if ((off_end <= off_align) || ((off_end - off_align) < align))
+		goto out;
+
+	len_align = len + align;
+	if ((off + len_align) < off)
+		goto out;
+
+	addr_align = current->mm->get_unmapped_area(filp, addr, len_align,
+			pgoff, flags);
+	if (!IS_ERR_VALUE(addr_align)) {
+		addr_align += (off - addr_align) & (align - 1);
+		return addr_align;
+	}
+ out:
+	return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
+}
+
+static int dax_open(struct inode *inode, struct file *filp)
+{
+	struct dax_device *dax_dev = inode_dax(inode);
+	struct inode *__dax_inode = dax_inode(dax_dev);
+	struct dev_dax *dev_dax = dax_get_private(dax_dev);
+
+	dev_dbg(&dev_dax->dev, "%s\n", __func__);
+	inode->i_mapping = __dax_inode->i_mapping;
+	inode->i_mapping->host = __dax_inode;
+	filp->f_mapping = inode->i_mapping;
+	filp->private_data = dev_dax;
+	inode->i_flags = S_DAX;
+
+	return 0;
+}
+
+static int dax_release(struct inode *inode, struct file *filp)
+{
+	struct dev_dax *dev_dax = filp->private_data;
+
+	dev_dbg(&dev_dax->dev, "%s\n", __func__);
+	return 0;
+}
+
+static const struct file_operations dax_fops = {
+	.llseek = noop_llseek,
+	.owner = THIS_MODULE,
+	.open = dax_open,
+	.release = dax_release,
+	.get_unmapped_area = dax_get_unmapped_area,
+	.mmap = dax_mmap,
+};
+
+static void dev_dax_release(struct device *dev)
+{
+	struct dev_dax *dev_dax = to_dev_dax(dev);
+	struct dax_region *dax_region = dev_dax->region;
+	struct dax_device *dax_dev = dev_dax->dax_dev;
+
+	ida_simple_remove(&dax_region->ida, dev_dax->id);
+	dax_region_put(dax_region);
+	put_dax(dax_dev);
+	kfree(dev_dax);
+}
+
+static void kill_dev_dax(struct dev_dax *dev_dax)
+{
+	struct dax_device *dax_dev = dev_dax->dax_dev;
+	struct inode *inode = dax_inode(dax_dev);
+
+	kill_dax(dax_dev);
+	unmap_mapping_range(inode->i_mapping, 0, 0, 1);
+}
+
+static void unregister_dev_dax(void *dev)
+{
+	struct dev_dax *dev_dax = to_dev_dax(dev);
+	struct dax_device *dax_dev = dev_dax->dax_dev;
+	struct inode *inode = dax_inode(dax_dev);
+	struct cdev *cdev = inode->i_cdev;
+
+	dev_dbg(dev, "%s\n", __func__);
+
+	kill_dev_dax(dev_dax);
+	cdev_device_del(cdev, dev);
+	put_device(dev);
+}
+
+struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region,
+		struct resource *res, int count)
+{
+	struct device *parent = dax_region->dev;
+	struct dax_device *dax_dev;
+	struct dev_dax *dev_dax;
+	struct inode *inode;
+	struct device *dev;
+	struct cdev *cdev;
+	int rc = 0, i;
+
+	dev_dax = kzalloc(sizeof(*dev_dax) + sizeof(*res) * count, GFP_KERNEL);
+	if (!dev_dax)
+		return ERR_PTR(-ENOMEM);
+
+	for (i = 0; i < count; i++) {
+		if (!IS_ALIGNED(res[i].start, dax_region->align)
+				|| !IS_ALIGNED(resource_size(&res[i]),
+					dax_region->align)) {
+			rc = -EINVAL;
+			break;
+		}
+		dev_dax->res[i].start = res[i].start;
+		dev_dax->res[i].end = res[i].end;
+	}
+
+	if (i < count)
+		goto err_id;
+
+	dev_dax->id = ida_simple_get(&dax_region->ida, 0, 0, GFP_KERNEL);
+	if (dev_dax->id < 0) {
+		rc = dev_dax->id;
+		goto err_id;
+	}
+
+	dax_dev = alloc_dax(dev_dax);
+	if (!dax_dev)
+		goto err_dax;
+
+	/* from here on we're committed to teardown via dax_dev_release() */
+	dev = &dev_dax->dev;
+	device_initialize(dev);
+
+	inode = dax_inode(dax_dev);
+	cdev = inode->i_cdev;
+	cdev_init(cdev, &dax_fops);
+	cdev->owner = parent->driver->owner;
+
+	dev_dax->num_resources = count;
+	dev_dax->dax_dev = dax_dev;
+	dev_dax->region = dax_region;
+	kref_get(&dax_region->kref);
+
+	dev->devt = inode->i_rdev;
+	dev->class = dax_class;
+	dev->parent = parent;
+	dev->groups = dax_attribute_groups;
+	dev->release = dev_dax_release;
+	dev_set_name(dev, "dax%d.%d", dax_region->id, dev_dax->id);
+
+	rc = cdev_device_add(cdev, dev);
+	if (rc) {
+		kill_dev_dax(dev_dax);
+		put_device(dev);
+		return ERR_PTR(rc);
+	}
+
+	rc = devm_add_action_or_reset(dax_region->dev, unregister_dev_dax, dev);
+	if (rc)
+		return ERR_PTR(rc);
+
+	return dev_dax;
+
+ err_dax:
+	ida_simple_remove(&dax_region->ida, dev_dax->id);
+ err_id:
+	kfree(dev_dax);
+
+	return ERR_PTR(rc);
+}
+EXPORT_SYMBOL_GPL(devm_create_dev_dax);
+
+static int __init dax_init(void)
+{
+	dax_class = class_create(THIS_MODULE, "dax");
+	return PTR_ERR_OR_ZERO(dax_class);
+}
+
+static void __exit dax_exit(void)
+{
+	class_destroy(dax_class);
+}
+
+MODULE_AUTHOR("Intel Corporation");
+MODULE_LICENSE("GPL v2");
+subsys_initcall(dax_init);
+module_exit(dax_exit);
diff --git a/drivers/dax/pmem.c b/drivers/dax/pmem.c
index 2c736fc4508b..d4ca19bd74eb 100644
--- a/drivers/dax/pmem.c
+++ b/drivers/dax/pmem.c
@@ -16,7 +16,7 @@
 #include <linux/pfn_t.h>
 #include "../nvdimm/pfn.h"
 #include "../nvdimm/nd.h"
-#include "dax.h"
+#include "device-dax.h"
 
 struct dax_pmem {
 	struct device *dev;
diff --git a/drivers/dax/super.c b/drivers/dax/super.c
new file mode 100644
index 000000000000..c9f85f1c086e
--- /dev/null
+++ b/drivers/dax/super.c
@@ -0,0 +1,303 @@
+/*
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#include <linux/pagemap.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/magic.h>
+#include <linux/cdev.h>
+#include <linux/hash.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+
+static int nr_dax = CONFIG_NR_DEV_DAX;
+module_param(nr_dax, int, S_IRUGO);
+MODULE_PARM_DESC(nr_dax, "max number of dax device instances");
+
+static dev_t dax_devt;
+DEFINE_STATIC_SRCU(dax_srcu);
+static struct vfsmount *dax_mnt;
+static DEFINE_IDA(dax_minor_ida);
+static struct kmem_cache *dax_cache __read_mostly;
+static struct super_block *dax_superblock __read_mostly;
+
+int dax_read_lock(void)
+{
+	return srcu_read_lock(&dax_srcu);
+}
+EXPORT_SYMBOL_GPL(dax_read_lock);
+
+void dax_read_unlock(int id)
+{
+	srcu_read_unlock(&dax_srcu, id);
+}
+EXPORT_SYMBOL_GPL(dax_read_unlock);
+
+/**
+ * struct dax_device - anchor object for dax services
+ * @inode: core vfs
+ * @cdev: optional character interface for "device dax"
+ * @private: dax driver private data
+ * @alive: !alive + rcu grace period == no new operations / mappings
+ */
+struct dax_device {
+	struct inode inode;
+	struct cdev cdev;
+	void *private;
+	bool alive;
+};
+
+bool dax_alive(struct dax_device *dax_dev)
+{
+	lockdep_assert_held(&dax_srcu);
+	return dax_dev->alive;
+}
+EXPORT_SYMBOL_GPL(dax_alive);
+
+/*
+ * Note, rcu is not protecting the liveness of dax_dev, rcu is ensuring
+ * that any fault handlers or operations that might have seen
+ * dax_alive(), have completed.  Any operations that start after
+ * synchronize_srcu() has run will abort upon seeing !dax_alive().
+ */
+void kill_dax(struct dax_device *dax_dev)
+{
+	if (!dax_dev)
+		return;
+
+	dax_dev->alive = false;
+	synchronize_srcu(&dax_srcu);
+	dax_dev->private = NULL;
+}
+EXPORT_SYMBOL_GPL(kill_dax);
+
+static struct inode *dax_alloc_inode(struct super_block *sb)
+{
+	struct dax_device *dax_dev;
+
+	dax_dev = kmem_cache_alloc(dax_cache, GFP_KERNEL);
+	return &dax_dev->inode;
+}
+
+static struct dax_device *to_dax_dev(struct inode *inode)
+{
+	return container_of(inode, struct dax_device, inode);
+}
+
+static void dax_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	struct dax_device *dax_dev = to_dax_dev(inode);
+
+	ida_simple_remove(&dax_minor_ida, MINOR(inode->i_rdev));
+	kmem_cache_free(dax_cache, dax_dev);
+}
+
+static void dax_destroy_inode(struct inode *inode)
+{
+	struct dax_device *dax_dev = to_dax_dev(inode);
+
+	WARN_ONCE(dax_dev->alive,
+			"kill_dax() must be called before final iput()\n");
+	call_rcu(&inode->i_rcu, dax_i_callback);
+}
+
+static const struct super_operations dax_sops = {
+	.statfs = simple_statfs,
+	.alloc_inode = dax_alloc_inode,
+	.destroy_inode = dax_destroy_inode,
+	.drop_inode = generic_delete_inode,
+};
+
+static struct dentry *dax_mount(struct file_system_type *fs_type,
+		int flags, const char *dev_name, void *data)
+{
+	return mount_pseudo(fs_type, "dax:", &dax_sops, NULL, DAXFS_MAGIC);
+}
+
+static struct file_system_type dax_fs_type = {
+	.name = "dax",
+	.mount = dax_mount,
+	.kill_sb = kill_anon_super,
+};
+
+static int dax_test(struct inode *inode, void *data)
+{
+	dev_t devt = *(dev_t *) data;
+
+	return inode->i_rdev == devt;
+}
+
+static int dax_set(struct inode *inode, void *data)
+{
+	dev_t devt = *(dev_t *) data;
+
+	inode->i_rdev = devt;
+	return 0;
+}
+
+static struct dax_device *dax_dev_get(dev_t devt)
+{
+	struct dax_device *dax_dev;
+	struct inode *inode;
+
+	inode = iget5_locked(dax_superblock, hash_32(devt + DAXFS_MAGIC, 31),
+			dax_test, dax_set, &devt);
+
+	if (!inode)
+		return NULL;
+
+	dax_dev = to_dax_dev(inode);
+	if (inode->i_state & I_NEW) {
+		dax_dev->alive = true;
+		inode->i_cdev = &dax_dev->cdev;
+		inode->i_mode = S_IFCHR;
+		inode->i_flags = S_DAX;
+		mapping_set_gfp_mask(&inode->i_data, GFP_USER);
+		unlock_new_inode(inode);
+	}
+
+	return dax_dev;
+}
+
+struct dax_device *alloc_dax(void *private)
+{
+	struct dax_device *dax_dev;
+	dev_t devt;
+	int minor;
+
+	minor = ida_simple_get(&dax_minor_ida, 0, nr_dax, GFP_KERNEL);
+	if (minor < 0)
+		return NULL;
+
+	devt = MKDEV(MAJOR(dax_devt), minor);
+	dax_dev = dax_dev_get(devt);
+	if (!dax_dev)
+		goto err_inode;
+
+	dax_dev->private = private;
+	return dax_dev;
+
+ err_inode:
+	ida_simple_remove(&dax_minor_ida, minor);
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(alloc_dax);
+
+void put_dax(struct dax_device *dax_dev)
+{
+	if (!dax_dev)
+		return;
+	iput(&dax_dev->inode);
+}
+EXPORT_SYMBOL_GPL(put_dax);
+
+/**
+ * inode_dax: convert a public inode into its dax_dev
+ * @inode: An inode with i_cdev pointing to a dax_dev
+ *
+ * Note this is not equivalent to to_dax_dev() which is for private
+ * internal use where we know the inode filesystem type == dax_fs_type.
+ */
+struct dax_device *inode_dax(struct inode *inode)
+{
+	struct cdev *cdev = inode->i_cdev;
+
+	return container_of(cdev, struct dax_device, cdev);
+}
+EXPORT_SYMBOL_GPL(inode_dax);
+
+struct inode *dax_inode(struct dax_device *dax_dev)
+{
+	return &dax_dev->inode;
+}
+EXPORT_SYMBOL_GPL(dax_inode);
+
+void *dax_get_private(struct dax_device *dax_dev)
+{
+	return dax_dev->private;
+}
+EXPORT_SYMBOL_GPL(dax_get_private);
+
+static void init_once(void *_dax_dev)
+{
+	struct dax_device *dax_dev = _dax_dev;
+	struct inode *inode = &dax_dev->inode;
+
+	inode_init_once(inode);
+}
+
+static int __dax_fs_init(void)
+{
+	int rc;
+
+	dax_cache = kmem_cache_create("dax_cache", sizeof(struct dax_device), 0,
+			(SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
+			 SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+			init_once);
+	if (!dax_cache)
+		return -ENOMEM;
+
+	rc = register_filesystem(&dax_fs_type);
+	if (rc)
+		goto err_register_fs;
+
+	dax_mnt = kern_mount(&dax_fs_type);
+	if (IS_ERR(dax_mnt)) {
+		rc = PTR_ERR(dax_mnt);
+		goto err_mount;
+	}
+	dax_superblock = dax_mnt->mnt_sb;
+
+	return 0;
+
+ err_mount:
+	unregister_filesystem(&dax_fs_type);
+ err_register_fs:
+	kmem_cache_destroy(dax_cache);
+
+	return rc;
+}
+
+static void __dax_fs_exit(void)
+{
+	kern_unmount(dax_mnt);
+	unregister_filesystem(&dax_fs_type);
+	kmem_cache_destroy(dax_cache);
+}
+
+static int __init dax_fs_init(void)
+{
+	int rc;
+
+	rc = __dax_fs_init();
+	if (rc)
+		return rc;
+
+	nr_dax = max(nr_dax, 256);
+	rc = alloc_chrdev_region(&dax_devt, 0, nr_dax, "dax");
+	if (rc)
+		__dax_fs_exit();
+	return rc;
+}
+
+static void __exit dax_fs_exit(void)
+{
+	unregister_chrdev_region(dax_devt, nr_dax);
+	ida_destroy(&dax_minor_ida);
+	__dax_fs_exit();
+}
+
+MODULE_AUTHOR("Intel Corporation");
+MODULE_LICENSE("GPL v2");
+subsys_initcall(dax_fs_init);
+module_exit(dax_fs_exit);
diff --git a/include/linux/dax.h b/include/linux/dax.h
index d8a3dc042e1c..5b62f5d19aea 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -8,6 +8,9 @@
 
 struct iomap_ops;
 
+int dax_read_lock(void);
+void dax_read_unlock(int id);
+
 /*
  * We use lowest available bit in exceptional entry for locking, one bit for
  * the entry size (PMD) and two more to tell us if the entry is a huge zero
diff --git a/tools/testing/nvdimm/Kbuild b/tools/testing/nvdimm/Kbuild
index 405212be044a..2033ad03b8cd 100644
--- a/tools/testing/nvdimm/Kbuild
+++ b/tools/testing/nvdimm/Kbuild
@@ -28,7 +28,10 @@ obj-$(CONFIG_ND_BTT) += nd_btt.o
 obj-$(CONFIG_ND_BLK) += nd_blk.o
 obj-$(CONFIG_X86_PMEM_LEGACY) += nd_e820.o
 obj-$(CONFIG_ACPI_NFIT) += nfit.o
-obj-$(CONFIG_DEV_DAX) += dax.o
+ifeq ($(CONFIG_DAX),m)
+obj-$(CONFIG_DAX) += dax.o
+endif
+obj-$(CONFIG_DEV_DAX) += device_dax.o
 obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem.o
 
 nfit-y := $(ACPI_SRC)/core.o
@@ -48,9 +51,12 @@ nd_blk-y += config_check.o
 nd_e820-y := $(NVDIMM_SRC)/e820.o
 nd_e820-y += config_check.o
 
-dax-y := $(DAX_SRC)/dax.o
+dax-y := $(DAX_SRC)/super.o
 dax-y += config_check.o
 
+device_dax-y := $(DAX_SRC)/device.o
+device_dax-y += config_check.o
+
 dax_pmem-y := $(DAX_SRC)/pmem.o
 dax_pmem-y += config_check.o
 
-- 
cgit 


From b3b454f694db663773bc22002e10909afe9c1739 Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Thu, 13 Apr 2017 14:25:17 -0700
Subject: libnvdimm: fix clear poison locking with spinlock and GFP_NOWAIT
 allocation

The following warning results from holding a lane spinlock,
preempt_disable(), or the btt map spinlock and then trying to take the
reconfig_mutex to walk the poison list and potentially add new entries.

BUG: sleeping function called from invalid context at kernel/locking/mutex.
c:747
in_atomic(): 1, irqs_disabled(): 0, pid: 17159, name: dd
[..]
Call Trace:
dump_stack+0x85/0xc8
___might_sleep+0x184/0x250
__might_sleep+0x4a/0x90
__mutex_lock+0x58/0x9b0
? nvdimm_bus_lock+0x21/0x30 [libnvdimm]
? __nvdimm_bus_badblocks_clear+0x2f/0x60 [libnvdimm]
? acpi_nfit_forget_poison+0x79/0x80 [nfit]
? _raw_spin_unlock+0x27/0x40
mutex_lock_nested+0x1b/0x20
nvdimm_bus_lock+0x21/0x30 [libnvdimm]
nvdimm_forget_poison+0x25/0x50 [libnvdimm]
nvdimm_clear_poison+0x106/0x140 [libnvdimm]
nsio_rw_bytes+0x164/0x270 [libnvdimm]
btt_write_pg+0x1de/0x3e0 [nd_btt]
? blk_queue_enter+0x30/0x290
btt_make_request+0x11a/0x310 [nd_btt]
? blk_queue_enter+0xb7/0x290
? blk_queue_enter+0x30/0x290
generic_make_request+0x118/0x3b0

A spinlock is introduced to protect the poison list. This allows us to not
having to acquire the reconfig_mutex for touching the poison list. The
add_poison() function has been broken out into two helper functions. One to
allocate the poison entry and the other to apppend the entry. This allows us
to unlock the poison_lock in non-I/O path and continue to be able to allocate
the poison entry with GFP_KERNEL. We will use GFP_NOWAIT in the I/O path in
order to satisfy being in atomic context.

Reviewed-by: Vishal Verma <vishal.l.verma@intel.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/bus.c      |  7 +++---
 drivers/nvdimm/core.c     | 56 ++++++++++++++++++++++++++++-------------------
 drivers/nvdimm/nd-core.h  |  1 +
 include/linux/libnvdimm.h |  2 --
 4 files changed, 38 insertions(+), 28 deletions(-)

diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 5ad2e5909e1a..d214ac44d111 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -296,6 +296,7 @@ struct nvdimm_bus *nvdimm_bus_register(struct device *parent,
 	init_waitqueue_head(&nvdimm_bus->probe_wait);
 	nvdimm_bus->id = ida_simple_get(&nd_ida, 0, 0, GFP_KERNEL);
 	mutex_init(&nvdimm_bus->reconfig_mutex);
+	spin_lock_init(&nvdimm_bus->poison_lock);
 	if (nvdimm_bus->id < 0) {
 		kfree(nvdimm_bus);
 		return NULL;
@@ -364,9 +365,9 @@ static int nd_bus_remove(struct device *dev)
 	nd_synchronize();
 	device_for_each_child(&nvdimm_bus->dev, NULL, child_unregister);
 
-	nvdimm_bus_lock(&nvdimm_bus->dev);
+	spin_lock(&nvdimm_bus->poison_lock);
 	free_poison_list(&nvdimm_bus->poison_list);
-	nvdimm_bus_unlock(&nvdimm_bus->dev);
+	spin_unlock(&nvdimm_bus->poison_lock);
 
 	nvdimm_bus_destroy_ndctl(nvdimm_bus);
 
@@ -990,7 +991,7 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
 
 		if (clear_err->cleared) {
 			/* clearing the poison list we keep track of */
-			__nvdimm_forget_poison(nvdimm_bus, clear_err->address,
+			nvdimm_forget_poison(nvdimm_bus, clear_err->address,
 					clear_err->cleared);
 
 			/* now sync the badblocks lists */
diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c
index 40a3da088fd2..2dee908e4bae 100644
--- a/drivers/nvdimm/core.c
+++ b/drivers/nvdimm/core.c
@@ -518,6 +518,15 @@ void nvdimm_badblocks_populate(struct nd_region *nd_region,
 }
 EXPORT_SYMBOL_GPL(nvdimm_badblocks_populate);
 
+static void append_poison_entry(struct nvdimm_bus *nvdimm_bus,
+		struct nd_poison *pl, u64 addr, u64 length)
+{
+	lockdep_assert_held(&nvdimm_bus->poison_lock);
+	pl->start = addr;
+	pl->length = length;
+	list_add_tail(&pl->list, &nvdimm_bus->poison_list);
+}
+
 static int add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length,
 			gfp_t flags)
 {
@@ -527,19 +536,24 @@ static int add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length,
 	if (!pl)
 		return -ENOMEM;
 
-	pl->start = addr;
-	pl->length = length;
-	list_add_tail(&pl->list, &nvdimm_bus->poison_list);
-
+	append_poison_entry(nvdimm_bus, pl, addr, length);
 	return 0;
 }
 
 static int bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length)
 {
-	struct nd_poison *pl;
+	struct nd_poison *pl, *pl_new;
 
-	if (list_empty(&nvdimm_bus->poison_list))
-		return add_poison(nvdimm_bus, addr, length, GFP_KERNEL);
+	spin_unlock(&nvdimm_bus->poison_lock);
+	pl_new = kzalloc(sizeof(*pl_new), GFP_KERNEL);
+	spin_lock(&nvdimm_bus->poison_lock);
+
+	if (list_empty(&nvdimm_bus->poison_list)) {
+		if (!pl_new)
+			return -ENOMEM;
+		append_poison_entry(nvdimm_bus, pl_new, addr, length);
+		return 0;
+	}
 
 	/*
 	 * There is a chance this is a duplicate, check for those first.
@@ -551,6 +565,7 @@ static int bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length)
 			/* If length has changed, update this list entry */
 			if (pl->length != length)
 				pl->length = length;
+			kfree(pl_new);
 			return 0;
 		}
 
@@ -559,30 +574,33 @@ static int bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length)
 	 * as any overlapping ranges will get resolved when the list is consumed
 	 * and converted to badblocks
 	 */
-	return add_poison(nvdimm_bus, addr, length, GFP_KERNEL);
+	if (!pl_new)
+		return -ENOMEM;
+	append_poison_entry(nvdimm_bus, pl_new, addr, length);
+
+	return 0;
 }
 
 int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length)
 {
 	int rc;
 
-	nvdimm_bus_lock(&nvdimm_bus->dev);
+	spin_lock(&nvdimm_bus->poison_lock);
 	rc = bus_add_poison(nvdimm_bus, addr, length);
-	nvdimm_bus_unlock(&nvdimm_bus->dev);
+	spin_unlock(&nvdimm_bus->poison_lock);
 
 	return rc;
 }
 EXPORT_SYMBOL_GPL(nvdimm_bus_add_poison);
 
-void __nvdimm_forget_poison(struct nvdimm_bus *nvdimm_bus, phys_addr_t start,
+void nvdimm_forget_poison(struct nvdimm_bus *nvdimm_bus, phys_addr_t start,
 		unsigned int len)
 {
 	struct list_head *poison_list = &nvdimm_bus->poison_list;
 	u64 clr_end = start + len - 1;
 	struct nd_poison *pl, *next;
 
-	lockdep_assert_held(&nvdimm_bus->reconfig_mutex);
-
+	spin_lock(&nvdimm_bus->poison_lock);
 	WARN_ON_ONCE(list_empty(poison_list));
 
 	/*
@@ -629,21 +647,13 @@ void __nvdimm_forget_poison(struct nvdimm_bus *nvdimm_bus, phys_addr_t start,
 			u64 new_len = pl_end - new_start + 1;
 
 			/* Add new entry covering the right half */
-			add_poison(nvdimm_bus, new_start, new_len, GFP_NOIO);
+			add_poison(nvdimm_bus, new_start, new_len, GFP_NOWAIT);
 			/* Adjust this entry to cover the left half */
 			pl->length = start - pl->start;
 			continue;
 		}
 	}
-}
-EXPORT_SYMBOL_GPL(__nvdimm_forget_poison);
-
-void nvdimm_forget_poison(struct nvdimm_bus *nvdimm_bus,
-		phys_addr_t start, unsigned int len)
-{
-	nvdimm_bus_lock(&nvdimm_bus->dev);
-	__nvdimm_forget_poison(nvdimm_bus, start, len);
-	nvdimm_bus_unlock(&nvdimm_bus->dev);
+	spin_unlock(&nvdimm_bus->poison_lock);
 }
 EXPORT_SYMBOL_GPL(nvdimm_forget_poison);
 
diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h
index 8623e57c2ce3..4c4bd209e725 100644
--- a/drivers/nvdimm/nd-core.h
+++ b/drivers/nvdimm/nd-core.h
@@ -32,6 +32,7 @@ struct nvdimm_bus {
 	struct list_head poison_list;
 	struct list_head mapping_list;
 	struct mutex reconfig_mutex;
+	spinlock_t poison_lock;
 };
 
 struct nvdimm {
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index 1c609e89048a..98b207611b06 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -122,8 +122,6 @@ static inline struct nd_blk_region_desc *to_blk_region_desc(
 int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length);
 void nvdimm_forget_poison(struct nvdimm_bus *nvdimm_bus,
 		phys_addr_t start, unsigned int len);
-void __nvdimm_forget_poison(struct nvdimm_bus *nvdimm_bus,
-		phys_addr_t start, unsigned int len);
 struct nvdimm_bus *nvdimm_bus_register(struct device *parent,
 		struct nvdimm_bus_descriptor *nfit_desc);
 void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus);
-- 
cgit 


From e88da7998d7dc7b40dbe3ac85d9a5d6f91cc2d13 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 13 Apr 2017 14:37:39 -0700
Subject: Revert "libnvdimm: band aid btt vs clear poison locking"

This reverts commit 4aa5615e080a "libnvdimm: band aid btt vs clear
poison locking".

Now that poison list locking has been converted to a spinlock and poison
list entry allocation during i/o has been converted to GFP_NOWAIT,
revert the band-aid that disabled error clearing from btt i/o.

Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/claim.c | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c
index ca6d572c48fc..b3323c0697f6 100644
--- a/drivers/nvdimm/claim.c
+++ b/drivers/nvdimm/claim.c
@@ -243,15 +243,7 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns,
 	}
 
 	if (unlikely(is_bad_pmem(&nsio->bb, sector, sz_align))) {
-		/*
-		 * FIXME: nsio_rw_bytes() may be called from atomic
-		 * context in the btt case and nvdimm_clear_poison()
-		 * takes a sleeping lock. Until the locking can be
-		 * reworked this capability requires that the namespace
-		 * is not claimed by btt.
-		 */
-		if (IS_ALIGNED(offset, 512) && IS_ALIGNED(size, 512)
-				&& (!ndns->claim || !is_nd_btt(ndns->claim))) {
+		if (IS_ALIGNED(offset, 512) && IS_ALIGNED(size, 512)) {
 			long cleared;
 
 			cleared = nvdimm_clear_poison(&ndns->dev, offset, size);
-- 
cgit 


From ffab9385b314d62039a4a5df29703fc264e85e08 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 13 Apr 2017 15:05:30 -0700
Subject: acpi, nfit: add support for acpi 6.1 dimm state flags

Add support for the ACPI_NFIT_MEM_MAP_FAILED ("map_fail") and
ACPI_NFIT_MEM_HEALTH_ENABLED ("smart_notify") health state flags. The
"map_fail" flag identifies DIMMs that were not mapped into one or more
physical address ranges. The "health_notify" flag indicates whether
platform firmware will send notifications when there is new SMART health
data to consume.

Acked-by: Toshi Kani <toshi.kani@hpe.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/nfit/core.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 53943d6f4214..05829de43b1d 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -1246,12 +1246,14 @@ static ssize_t flags_show(struct device *dev,
 {
 	u16 flags = to_nfit_memdev(dev)->flags;
 
-	return sprintf(buf, "%s%s%s%s%s\n",
+	return sprintf(buf, "%s%s%s%s%s%s%s\n",
 		flags & ACPI_NFIT_MEM_SAVE_FAILED ? "save_fail " : "",
 		flags & ACPI_NFIT_MEM_RESTORE_FAILED ? "restore_fail " : "",
 		flags & ACPI_NFIT_MEM_FLUSH_FAILED ? "flush_fail " : "",
 		flags & ACPI_NFIT_MEM_NOT_ARMED ? "not_armed " : "",
-		flags & ACPI_NFIT_MEM_HEALTH_OBSERVED ? "smart_event " : "");
+		flags & ACPI_NFIT_MEM_HEALTH_OBSERVED ? "smart_event " : "",
+		flags & ACPI_NFIT_MEM_MAP_FAILED ? "map_fail " : "",
+		flags & ACPI_NFIT_MEM_HEALTH_ENABLED ? "smart_notify " : "");
 }
 static DEVICE_ATTR_RO(flags);
 
-- 
cgit 


From ac40b675c74b9243069aba3a5799481dbcb9d2ef Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 13 Apr 2017 15:17:38 -0700
Subject: tools/testing/nvdimm: test acpi 6.1 health state flags

Add a simulated dimm with an ACPI_NFIT_MEM_MAP_FAILED indication, and
set the ACPI_NFIT_MEM_HEALTH_ENABLED flag on all the dimms where
nfit_test simulates health events, but spread it out over several
redundant memdev entries to test that the nfit driver coalesces all the
flags.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 tools/testing/nvdimm/test/nfit.c | 40 +++++++++++++++++++++++++++++++++++++---
 1 file changed, 37 insertions(+), 3 deletions(-)

diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index 798f17655433..bc02f28ed8b8 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -132,6 +132,7 @@ static u32 handle[] = {
 	[3] = NFIT_DIMM_HANDLE(0, 0, 1, 0, 1),
 	[4] = NFIT_DIMM_HANDLE(0, 1, 0, 0, 0),
 	[5] = NFIT_DIMM_HANDLE(1, 0, 0, 0, 0),
+	[6] = NFIT_DIMM_HANDLE(1, 0, 0, 0, 1),
 };
 
 static unsigned long dimm_fail_cmd_flags[NUM_DCR];
@@ -728,8 +729,8 @@ static int nfit_test0_alloc(struct nfit_test *t)
 static int nfit_test1_alloc(struct nfit_test *t)
 {
 	size_t nfit_size = sizeof(struct acpi_nfit_system_address) * 2
-		+ sizeof(struct acpi_nfit_memory_map)
-		+ offsetof(struct acpi_nfit_control_region, window_size);
+		+ sizeof(struct acpi_nfit_memory_map) * 2
+		+ offsetof(struct acpi_nfit_control_region, window_size) * 2;
 	int i;
 
 	t->nfit_buf = test_alloc(t, nfit_size, &t->nfit_dma);
@@ -906,6 +907,7 @@ static void nfit_test0_setup(struct nfit_test *t)
 	memdev->address = 0;
 	memdev->interleave_index = 0;
 	memdev->interleave_ways = 2;
+	memdev->flags = ACPI_NFIT_MEM_HEALTH_ENABLED;
 
 	/* mem-region2 (spa1, dimm0) */
 	memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 2;
@@ -921,6 +923,7 @@ static void nfit_test0_setup(struct nfit_test *t)
 	memdev->address = SPA0_SIZE/2;
 	memdev->interleave_index = 0;
 	memdev->interleave_ways = 4;
+	memdev->flags = ACPI_NFIT_MEM_HEALTH_ENABLED;
 
 	/* mem-region3 (spa1, dimm1) */
 	memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 3;
@@ -951,6 +954,7 @@ static void nfit_test0_setup(struct nfit_test *t)
 	memdev->address = SPA0_SIZE/2;
 	memdev->interleave_index = 0;
 	memdev->interleave_ways = 4;
+	memdev->flags = ACPI_NFIT_MEM_HEALTH_ENABLED;
 
 	/* mem-region5 (spa1, dimm3) */
 	memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 5;
@@ -1086,6 +1090,7 @@ static void nfit_test0_setup(struct nfit_test *t)
 	memdev->address = 0;
 	memdev->interleave_index = 0;
 	memdev->interleave_ways = 1;
+	memdev->flags = ACPI_NFIT_MEM_HEALTH_ENABLED;
 
 	offset = offset + sizeof(struct acpi_nfit_memory_map) * 14;
 	/* dcr-descriptor0: blk */
@@ -1384,6 +1389,7 @@ static void nfit_test0_setup(struct nfit_test *t)
 		memdev->address = 0;
 		memdev->interleave_index = 0;
 		memdev->interleave_ways = 1;
+		memdev->flags = ACPI_NFIT_MEM_HEALTH_ENABLED;
 
 		/* mem-region16 (spa/bdw4, dimm4) */
 		memdev = nfit_buf + offset +
@@ -1486,6 +1492,34 @@ static void nfit_test1_setup(struct nfit_test *t)
 	dcr->code = NFIT_FIC_BYTE;
 	dcr->windows = 0;
 
+	offset += dcr->header.length;
+	memdev = nfit_buf + offset;
+	memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP;
+	memdev->header.length = sizeof(*memdev);
+	memdev->device_handle = handle[6];
+	memdev->physical_id = 0;
+	memdev->region_id = 0;
+	memdev->range_index = 0;
+	memdev->region_index = 0+2;
+	memdev->region_size = SPA2_SIZE;
+	memdev->region_offset = 0;
+	memdev->address = 0;
+	memdev->interleave_index = 0;
+	memdev->interleave_ways = 1;
+	memdev->flags = ACPI_NFIT_MEM_MAP_FAILED;
+
+	/* dcr-descriptor1 */
+	offset += sizeof(*memdev);
+	dcr = nfit_buf + offset;
+	dcr->header.type = ACPI_NFIT_TYPE_CONTROL_REGION;
+	dcr->header.length = offsetof(struct acpi_nfit_control_region,
+			window_size);
+	dcr->region_index = 0+2;
+	dcr_common_init(dcr);
+	dcr->serial_number = ~handle[6];
+	dcr->code = NFIT_FIC_BYTE;
+	dcr->windows = 0;
+
 	post_ars_status(&t->ars_state, t->spa_set_dma[0], SPA2_SIZE);
 
 	acpi_desc = &t->acpi_desc;
@@ -1907,7 +1941,7 @@ static __init int nfit_test_init(void)
 		case 1:
 			nfit_test->num_pm = 1;
 			nfit_test->dcr_idx = NUM_DCR;
-			nfit_test->num_dcr = 1;
+			nfit_test->num_dcr = 2;
 			nfit_test->alloc = nfit_test1_alloc;
 			nfit_test->setup = nfit_test1_setup;
 			break;
-- 
cgit 


From 1499934dcd15a7c9aca1467025b78f6675032849 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 13 Apr 2017 19:46:36 -0700
Subject: acpi, nfit: support "map failed" dimms

Stop requiring dimms be successfully mapped into a
system-physical-address range. For provisioning and hardware remediation
purposes the kernel should account for failed devices in sysfs. If
possible it should still allow management commands to be sent to the
device.

Reported-by: Toshi Kani <toshi.kani@hpe.com>
Tested-by: Toshi Kani <toshi.kani@hpe.com>
Reported-by: Linda Knippers <linda.knippers@hpe.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/nfit/core.c | 53 +++++++++++++++++++++++++++++++++++++-----------
 drivers/acpi/nfit/nfit.h |  2 +-
 2 files changed, 42 insertions(+), 13 deletions(-)

diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 05829de43b1d..2bd842c46fbb 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -738,28 +738,38 @@ static void nfit_mem_init_bdw(struct acpi_nfit_desc *acpi_desc,
 	}
 }
 
-static int nfit_mem_dcr_init(struct acpi_nfit_desc *acpi_desc,
+static int __nfit_mem_init(struct acpi_nfit_desc *acpi_desc,
 		struct acpi_nfit_system_address *spa)
 {
 	struct nfit_mem *nfit_mem, *found;
 	struct nfit_memdev *nfit_memdev;
-	int type = nfit_spa_type(spa);
+	int type = spa ? nfit_spa_type(spa) : 0;
 
 	switch (type) {
 	case NFIT_SPA_DCR:
 	case NFIT_SPA_PM:
 		break;
 	default:
-		return 0;
+		if (spa)
+			return 0;
 	}
 
+	/*
+	 * This loop runs in two modes, when a dimm is mapped the loop
+	 * adds memdev associations to an existing dimm, or creates a
+	 * dimm. In the unmapped dimm case this loop sweeps for memdev
+	 * instances with an invalid / zero range_index and adds those
+	 * dimms without spa associations.
+	 */
 	list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) {
 		struct nfit_flush *nfit_flush;
 		struct nfit_dcr *nfit_dcr;
 		u32 device_handle;
 		u16 dcr;
 
-		if (nfit_memdev->memdev->range_index != spa->range_index)
+		if (spa && nfit_memdev->memdev->range_index != spa->range_index)
+			continue;
+		if (!spa && nfit_memdev->memdev->range_index)
 			continue;
 		found = NULL;
 		dcr = nfit_memdev->memdev->region_index;
@@ -844,14 +854,15 @@ static int nfit_mem_dcr_init(struct acpi_nfit_desc *acpi_desc,
 				break;
 			}
 			nfit_mem_init_bdw(acpi_desc, nfit_mem, spa);
-		} else {
+		} else if (type == NFIT_SPA_PM) {
 			/*
 			 * A single dimm may belong to multiple SPA-PM
 			 * ranges, record at least one in addition to
 			 * any SPA-DCR range.
 			 */
 			nfit_mem->memdev_pmem = nfit_memdev->memdev;
-		}
+		} else
+			nfit_mem->memdev_dcr = nfit_memdev->memdev;
 	}
 
 	return 0;
@@ -875,6 +886,8 @@ static int nfit_mem_cmp(void *priv, struct list_head *_a, struct list_head *_b)
 static int nfit_mem_init(struct acpi_nfit_desc *acpi_desc)
 {
 	struct nfit_spa *nfit_spa;
+	int rc;
+
 
 	/*
 	 * For each SPA-DCR or SPA-PMEM address range find its
@@ -885,13 +898,20 @@ static int nfit_mem_init(struct acpi_nfit_desc *acpi_desc)
 	 * BDWs are optional.
 	 */
 	list_for_each_entry(nfit_spa, &acpi_desc->spas, list) {
-		int rc;
-
-		rc = nfit_mem_dcr_init(acpi_desc, nfit_spa->spa);
+		rc = __nfit_mem_init(acpi_desc, nfit_spa->spa);
 		if (rc)
 			return rc;
 	}
 
+	/*
+	 * If a DIMM has failed to be mapped into SPA there will be no
+	 * SPA entries above. Find and register all the unmapped DIMMs
+	 * for reporting and recovery purposes.
+	 */
+	rc = __nfit_mem_init(acpi_desc, NULL);
+	if (rc)
+		return rc;
+
 	list_sort(NULL, &acpi_desc->dimms, nfit_mem_cmp);
 
 	return 0;
@@ -1301,8 +1321,16 @@ static umode_t acpi_nfit_dimm_attr_visible(struct kobject *kobj,
 	struct device *dev = container_of(kobj, struct device, kobj);
 	struct nvdimm *nvdimm = to_nvdimm(dev);
 
-	if (!to_nfit_dcr(dev))
+	if (!to_nfit_dcr(dev)) {
+		/* Without a dcr only the memdev attributes can be surfaced */
+		if (a == &dev_attr_handle.attr || a == &dev_attr_phys_id.attr
+				|| a == &dev_attr_flags.attr
+				|| a == &dev_attr_family.attr
+				|| a == &dev_attr_dsm_mask.attr)
+			return a->mode;
 		return 0;
+	}
+
 	if (a == &dev_attr_format1.attr && num_nvdimm_formats(nvdimm) <= 1)
 		return 0;
 	return a->mode;
@@ -1522,12 +1550,13 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc)
 		if ((mem_flags & ACPI_NFIT_MEM_FAILED_MASK) == 0)
 			continue;
 
-		dev_info(acpi_desc->dev, "%s flags:%s%s%s%s\n",
+		dev_info(acpi_desc->dev, "%s flags:%s%s%s%s%s\n",
 				nvdimm_name(nvdimm),
 		  mem_flags & ACPI_NFIT_MEM_SAVE_FAILED ? " save_fail" : "",
 		  mem_flags & ACPI_NFIT_MEM_RESTORE_FAILED ? " restore_fail":"",
 		  mem_flags & ACPI_NFIT_MEM_FLUSH_FAILED ? " flush_fail" : "",
-		  mem_flags & ACPI_NFIT_MEM_NOT_ARMED ? " not_armed" : "");
+		  mem_flags & ACPI_NFIT_MEM_NOT_ARMED ? " not_armed" : "",
+		  mem_flags & ACPI_NFIT_MEM_MAP_FAILED ? " map_fail" : "");
 
 	}
 
diff --git a/drivers/acpi/nfit/nfit.h b/drivers/acpi/nfit/nfit.h
index fc29c2e9832e..aaabd2721f8e 100644
--- a/drivers/acpi/nfit/nfit.h
+++ b/drivers/acpi/nfit/nfit.h
@@ -37,7 +37,7 @@
 
 #define ACPI_NFIT_MEM_FAILED_MASK (ACPI_NFIT_MEM_SAVE_FAILED \
 		| ACPI_NFIT_MEM_RESTORE_FAILED | ACPI_NFIT_MEM_FLUSH_FAILED \
-		| ACPI_NFIT_MEM_NOT_ARMED)
+		| ACPI_NFIT_MEM_NOT_ARMED | ACPI_NFIT_MEM_MAP_FAILED)
 
 enum nfit_uuids {
 	/* for simplicity alias the uuid index with the family id */
-- 
cgit 


From caa603aae0cc0b6f25fa4673de067f625faaf47f Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 14 Apr 2017 10:27:11 -0700
Subject: acpi, nfit: collate health state flags

Be tolerant of cases where the BIOS provided NFIT does not consistently
set the flags in all NVDIMM Region Mapping structures associated with a
given dimm.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/nfit/core.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 2bd842c46fbb..1d54833ade3f 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -457,9 +457,9 @@ static bool add_memdev(struct acpi_nfit_desc *acpi_desc,
 	INIT_LIST_HEAD(&nfit_memdev->list);
 	memcpy(nfit_memdev->memdev, memdev, sizeof(*memdev));
 	list_add_tail(&nfit_memdev->list, &acpi_desc->memdevs);
-	dev_dbg(dev, "%s: memdev handle: %#x spa: %d dcr: %d\n",
+	dev_dbg(dev, "%s: memdev handle: %#x spa: %d dcr: %d flags: %#x\n",
 			__func__, memdev->device_handle, memdev->range_index,
-			memdev->region_index);
+			memdev->region_index, memdev->flags);
 	return true;
 }
 
@@ -1505,6 +1505,7 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc)
 	list_for_each_entry(nfit_mem, &acpi_desc->dimms, list) {
 		struct acpi_nfit_flush_address *flush;
 		unsigned long flags = 0, cmd_mask;
+		struct nfit_memdev *nfit_memdev;
 		u32 device_handle;
 		u16 mem_flags;
 
@@ -1518,6 +1519,17 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc)
 		if (nfit_mem->bdw && nfit_mem->memdev_pmem)
 			flags |= NDD_ALIASING;
 
+		/* collate flags across all memdevs for this dimm */
+		list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) {
+			struct acpi_nfit_memory_map *dimm_memdev;
+
+			dimm_memdev = __to_nfit_memdev(nfit_mem);
+			if (dimm_memdev->device_handle
+					!= nfit_memdev->memdev->device_handle)
+				continue;
+			dimm_memdev->flags |= nfit_memdev->memdev->flags;
+		}
+
 		mem_flags = __to_nfit_memdev(nfit_mem)->flags;
 		if (mem_flags & ACPI_NFIT_MEM_NOT_ARMED)
 			flags |= NDD_UNARMED;
-- 
cgit 


From 9ccaed4bfd4f186e8350ffc7d8f188f8d2991fd9 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 13 Apr 2017 22:48:46 -0700
Subject: acpi, nfit: limit ->flush_probe() to initialization work

The nvdimm probe flushing mechanism gives userspace a sync point where
it knows all asynchronous driver probe sequences have completed.
However, it need not wait for other asynchronous actions, like
on-demand address-range-scrub. Track the init work separately from other
work in the workqueue, and only flush the former.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/nfit/core.c | 7 +++++++
 drivers/acpi/nfit/nfit.h | 1 +
 2 files changed, 8 insertions(+)

diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 1d54833ade3f..69c6cc77130c 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -2581,6 +2581,7 @@ static void acpi_nfit_scrub(struct work_struct *work)
 			acpi_nfit_register_region(acpi_desc, nfit_spa);
 		}
 	}
+	acpi_desc->init_complete = 1;
 
 	list_for_each_entry(nfit_spa, &acpi_desc->spas, list)
 		acpi_nfit_async_scrub(acpi_desc, nfit_spa);
@@ -2784,6 +2785,12 @@ static int acpi_nfit_flush_probe(struct nvdimm_bus_descriptor *nd_desc)
 	device_lock(dev);
 	device_unlock(dev);
 
+	/* bounce the init_mutex to make init_complete valid */
+	mutex_lock(&acpi_desc->init_mutex);
+	mutex_unlock(&acpi_desc->init_mutex);
+	if (acpi_desc->init_complete)
+		return 0;
+
 	/*
 	 * Scrub work could take 10s of seconds, userspace may give up so we
 	 * need to be interruptible while waiting.
diff --git a/drivers/acpi/nfit/nfit.h b/drivers/acpi/nfit/nfit.h
index aaabd2721f8e..fac098bfa585 100644
--- a/drivers/acpi/nfit/nfit.h
+++ b/drivers/acpi/nfit/nfit.h
@@ -163,6 +163,7 @@ struct acpi_nfit_desc {
 	unsigned int scrub_count;
 	unsigned int scrub_mode;
 	unsigned int cancel:1;
+	unsigned int init_complete:1;
 	unsigned long dimm_cmd_force_en;
 	unsigned long bus_cmd_force_en;
 	int (*blk_do_io)(struct nd_blk_region *ndbr, resource_size_t dpa,
-- 
cgit 


From 8b06b884cd98f7ec8b5028680b99fabfb7b3e192 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 13 Apr 2017 23:14:34 -0700
Subject: tools/testing/nvdimm: fix nfit_test shutdown crash

Keep the nfit_test instances alive until after nfit_test_teardown(), as
we may be doing resource lookups until the final un-registrations have
completed. This fixes crashes of the form.

 BUG: unable to handle kernel NULL pointer dereference at 0000000000000038
 IP: __release_resource+0x12/0x90
 Call Trace:
  remove_resource+0x23/0x40
  __wrap_remove_resource+0x29/0x30 [nfit_test_iomap]
  acpi_nfit_remove_resource+0xe/0x10 [nfit]
  devm_action_release+0xf/0x20
  release_nodes+0x16d/0x2b0
  devres_release_all+0x3c/0x60
  device_release+0x21/0x90
  kobject_release+0x6a/0x170
  kobject_put+0x2f/0x60
  put_device+0x17/0x20
  platform_device_unregister+0x20/0x30
  nfit_test_exit+0x36/0x960 [nfit_test]

Reported-by: Linda Knippers <linda.knippers@hpe.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 tools/testing/nvdimm/test/nfit.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index bc02f28ed8b8..d7fb1b894128 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -1958,6 +1958,7 @@ static __init int nfit_test_init(void)
 			put_device(&pdev->dev);
 			goto err_register;
 		}
+		get_device(&pdev->dev);
 
 		rc = dma_coerce_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
 		if (rc)
@@ -1976,6 +1977,10 @@ static __init int nfit_test_init(void)
 		if (instances[i])
 			platform_device_unregister(&instances[i]->pdev);
 	nfit_test_teardown();
+	for (i = 0; i < NUM_NFITS; i++)
+		if (instances[i])
+			put_device(&instances[i]->pdev.dev);
+
 	return rc;
 }
 
@@ -1983,10 +1988,13 @@ static __exit void nfit_test_exit(void)
 {
 	int i;
 
-	platform_driver_unregister(&nfit_test_driver);
 	for (i = 0; i < NUM_NFITS; i++)
 		platform_device_unregister(&instances[i]->pdev);
+	platform_driver_unregister(&nfit_test_driver);
 	nfit_test_teardown();
+
+	for (i = 0; i < NUM_NFITS; i++)
+		put_device(&instances[i]->pdev.dev);
 	class_destroy(nfit_test_dimm);
 }
 
-- 
cgit 


From fbabd829fe76a72a6444f64361cf8b2a9848639d Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 18 Apr 2017 09:56:31 -0700
Subject: acpi, nfit: fix module unload vs workqueue shutdown race

The workqueue may still be running when the devres callbacks start
firing to deallocate an acpi_nfit_desc instance. Stop and flush the
workqueue before letting any other devres de-allocations proceed.

Reported-by: Linda Knippers <linda.knippers@hpe.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/nfit/core.c         | 76 ++++++++++++++++++++++++----------------
 drivers/acpi/nfit/nfit.h         |  1 +
 tools/testing/nvdimm/test/nfit.c |  4 +++
 3 files changed, 51 insertions(+), 30 deletions(-)

diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 69c6cc77130c..261eea1d2906 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -2604,7 +2604,8 @@ static int acpi_nfit_register_regions(struct acpi_nfit_desc *acpi_desc)
 				return rc;
 		}
 
-	queue_work(nfit_wq, &acpi_desc->work);
+	if (!acpi_desc->cancel)
+		queue_work(nfit_wq, &acpi_desc->work);
 	return 0;
 }
 
@@ -2650,32 +2651,11 @@ static int acpi_nfit_desc_init_scrub_attr(struct acpi_nfit_desc *acpi_desc)
 	return 0;
 }
 
-static void acpi_nfit_destruct(void *data)
+static void acpi_nfit_unregister(void *data)
 {
 	struct acpi_nfit_desc *acpi_desc = data;
-	struct device *bus_dev = to_nvdimm_bus_dev(acpi_desc->nvdimm_bus);
 
-	/*
-	 * Destruct under acpi_desc_lock so that nfit_handle_mce does not
-	 * race teardown
-	 */
-	mutex_lock(&acpi_desc_lock);
-	acpi_desc->cancel = 1;
-	/*
-	 * Bounce the nvdimm bus lock to make sure any in-flight
-	 * acpi_nfit_ars_rescan() submissions have had a chance to
-	 * either submit or see ->cancel set.
-	 */
-	device_lock(bus_dev);
-	device_unlock(bus_dev);
-
-	flush_workqueue(nfit_wq);
-	if (acpi_desc->scrub_count_state)
-		sysfs_put(acpi_desc->scrub_count_state);
 	nvdimm_bus_unregister(acpi_desc->nvdimm_bus);
-	acpi_desc->nvdimm_bus = NULL;
-	list_del(&acpi_desc->list);
-	mutex_unlock(&acpi_desc_lock);
 }
 
 int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, void *data, acpi_size sz)
@@ -2693,7 +2673,7 @@ int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, void *data, acpi_size sz)
 		if (!acpi_desc->nvdimm_bus)
 			return -ENOMEM;
 
-		rc = devm_add_action_or_reset(dev, acpi_nfit_destruct,
+		rc = devm_add_action_or_reset(dev, acpi_nfit_unregister,
 				acpi_desc);
 		if (rc)
 			return rc;
@@ -2787,9 +2767,10 @@ static int acpi_nfit_flush_probe(struct nvdimm_bus_descriptor *nd_desc)
 
 	/* bounce the init_mutex to make init_complete valid */
 	mutex_lock(&acpi_desc->init_mutex);
-	mutex_unlock(&acpi_desc->init_mutex);
-	if (acpi_desc->init_complete)
+	if (acpi_desc->cancel || acpi_desc->init_complete) {
+		mutex_unlock(&acpi_desc->init_mutex);
 		return 0;
+	}
 
 	/*
 	 * Scrub work could take 10s of seconds, userspace may give up so we
@@ -2798,6 +2779,7 @@ static int acpi_nfit_flush_probe(struct nvdimm_bus_descriptor *nd_desc)
 	INIT_WORK_ONSTACK(&flush.work, flush_probe);
 	COMPLETION_INITIALIZER_ONSTACK(flush.cmp);
 	queue_work(nfit_wq, &flush.work);
+	mutex_unlock(&acpi_desc->init_mutex);
 
 	rc = wait_for_completion_interruptible(&flush.cmp);
 	cancel_work_sync(&flush.work);
@@ -2834,10 +2816,12 @@ int acpi_nfit_ars_rescan(struct acpi_nfit_desc *acpi_desc)
 	if (work_busy(&acpi_desc->work))
 		return -EBUSY;
 
-	if (acpi_desc->cancel)
+	mutex_lock(&acpi_desc->init_mutex);
+	if (acpi_desc->cancel) {
+		mutex_unlock(&acpi_desc->init_mutex);
 		return 0;
+	}
 
-	mutex_lock(&acpi_desc->init_mutex);
 	list_for_each_entry(nfit_spa, &acpi_desc->spas, list) {
 		struct acpi_nfit_system_address *spa = nfit_spa->spa;
 
@@ -2886,6 +2870,35 @@ static void acpi_nfit_put_table(void *table)
 	acpi_put_table(table);
 }
 
+void acpi_nfit_shutdown(void *data)
+{
+	struct acpi_nfit_desc *acpi_desc = data;
+	struct device *bus_dev = to_nvdimm_bus_dev(acpi_desc->nvdimm_bus);
+
+	/*
+	 * Destruct under acpi_desc_lock so that nfit_handle_mce does not
+	 * race teardown
+	 */
+	mutex_lock(&acpi_desc_lock);
+	list_del(&acpi_desc->list);
+	mutex_unlock(&acpi_desc_lock);
+
+	mutex_lock(&acpi_desc->init_mutex);
+	acpi_desc->cancel = 1;
+	mutex_unlock(&acpi_desc->init_mutex);
+
+	/*
+	 * Bounce the nvdimm bus lock to make sure any in-flight
+	 * acpi_nfit_ars_rescan() submissions have had a chance to
+	 * either submit or see ->cancel set.
+	 */
+	device_lock(bus_dev);
+	device_unlock(bus_dev);
+
+	flush_workqueue(nfit_wq);
+}
+EXPORT_SYMBOL_GPL(acpi_nfit_shutdown);
+
 static int acpi_nfit_add(struct acpi_device *adev)
 {
 	struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL };
@@ -2933,12 +2946,15 @@ static int acpi_nfit_add(struct acpi_device *adev)
 		rc = acpi_nfit_init(acpi_desc, (void *) tbl
 				+ sizeof(struct acpi_table_nfit),
 				sz - sizeof(struct acpi_table_nfit));
-	return rc;
+
+	if (rc)
+		return rc;
+	return devm_add_action_or_reset(dev, acpi_nfit_shutdown, acpi_desc);
 }
 
 static int acpi_nfit_remove(struct acpi_device *adev)
 {
-	/* see acpi_nfit_destruct */
+	/* see acpi_nfit_unregister */
 	return 0;
 }
 
diff --git a/drivers/acpi/nfit/nfit.h b/drivers/acpi/nfit/nfit.h
index fac098bfa585..58fb7d68e04a 100644
--- a/drivers/acpi/nfit/nfit.h
+++ b/drivers/acpi/nfit/nfit.h
@@ -239,6 +239,7 @@ static inline struct acpi_nfit_desc *to_acpi_desc(
 
 const u8 *to_nfit_uuid(enum nfit_uuids id);
 int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, void *nfit, acpi_size sz);
+void acpi_nfit_shutdown(void *data);
 void __acpi_nfit_notify(struct device *dev, acpi_handle handle, u32 event);
 void __acpi_nvdimm_notify(struct device *dev, u32 event);
 int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm,
diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index d7fb1b894128..c2187178fb13 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -1851,6 +1851,10 @@ static int nfit_test_probe(struct platform_device *pdev)
 	if (rc)
 		return rc;
 
+	rc = devm_add_action_or_reset(&pdev->dev, acpi_nfit_shutdown, acpi_desc);
+	if (rc)
+		return rc;
+
 	if (nfit_test->setup != nfit_test0_setup)
 		return 0;
 
-- 
cgit 


From 72058005411ffddcae6c06f7b691d635489132af Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 19 Apr 2017 15:14:31 -0700
Subject: dax: add a facility to lookup a dax device by 'host' device name

For the current block_device based filesystem-dax path, we need a way
for it to lookup the dax_device associated with a block_device. Add a
'host' property of a dax_device that can be used for this purpose. It is
a free form string, but for a dax_device associated with a block device
it is the bdev name.

This is a stop-gap until filesystems are able to mount on a dax-inode
directly.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/dax/dax.h    |  2 +-
 drivers/dax/device.c |  2 +-
 drivers/dax/super.c  | 87 +++++++++++++++++++++++++++++++++++++++++++++++++---
 include/linux/dax.h  |  1 +
 4 files changed, 86 insertions(+), 6 deletions(-)

diff --git a/drivers/dax/dax.h b/drivers/dax/dax.h
index 2472d9da96db..246a24d68d4c 100644
--- a/drivers/dax/dax.h
+++ b/drivers/dax/dax.h
@@ -13,7 +13,7 @@
 #ifndef __DAX_H__
 #define __DAX_H__
 struct dax_device;
-struct dax_device *alloc_dax(void *private);
+struct dax_device *alloc_dax(void *private, const char *host);
 void put_dax(struct dax_device *dax_dev);
 bool dax_alive(struct dax_device *dax_dev);
 void kill_dax(struct dax_device *dax_dev);
diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index 19a42edbfa03..db68f4fa8ce0 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -645,7 +645,7 @@ struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region,
 		goto err_id;
 	}
 
-	dax_dev = alloc_dax(dev_dax);
+	dax_dev = alloc_dax(dev_dax, NULL);
 	if (!dax_dev)
 		goto err_dax;
 
diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index c9f85f1c086e..8d446674c1da 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -30,6 +30,10 @@ static DEFINE_IDA(dax_minor_ida);
 static struct kmem_cache *dax_cache __read_mostly;
 static struct super_block *dax_superblock __read_mostly;
 
+#define DAX_HASH_SIZE (PAGE_SIZE / sizeof(struct hlist_head))
+static struct hlist_head dax_host_list[DAX_HASH_SIZE];
+static DEFINE_SPINLOCK(dax_host_lock);
+
 int dax_read_lock(void)
 {
 	return srcu_read_lock(&dax_srcu);
@@ -46,12 +50,15 @@ EXPORT_SYMBOL_GPL(dax_read_unlock);
  * struct dax_device - anchor object for dax services
  * @inode: core vfs
  * @cdev: optional character interface for "device dax"
+ * @host: optional name for lookups where the device path is not available
  * @private: dax driver private data
  * @alive: !alive + rcu grace period == no new operations / mappings
  */
 struct dax_device {
+	struct hlist_node list;
 	struct inode inode;
 	struct cdev cdev;
+	const char *host;
 	void *private;
 	bool alive;
 };
@@ -63,6 +70,11 @@ bool dax_alive(struct dax_device *dax_dev)
 }
 EXPORT_SYMBOL_GPL(dax_alive);
 
+static int dax_host_hash(const char *host)
+{
+	return hashlen_hash(hashlen_string("DAX", host)) % DAX_HASH_SIZE;
+}
+
 /*
  * Note, rcu is not protecting the liveness of dax_dev, rcu is ensuring
  * that any fault handlers or operations that might have seen
@@ -75,7 +87,13 @@ void kill_dax(struct dax_device *dax_dev)
 		return;
 
 	dax_dev->alive = false;
+
 	synchronize_srcu(&dax_srcu);
+
+	spin_lock(&dax_host_lock);
+	hlist_del_init(&dax_dev->list);
+	spin_unlock(&dax_host_lock);
+
 	dax_dev->private = NULL;
 }
 EXPORT_SYMBOL_GPL(kill_dax);
@@ -98,6 +116,8 @@ static void dax_i_callback(struct rcu_head *head)
 	struct inode *inode = container_of(head, struct inode, i_rcu);
 	struct dax_device *dax_dev = to_dax_dev(inode);
 
+	kfree(dax_dev->host);
+	dax_dev->host = NULL;
 	ida_simple_remove(&dax_minor_ida, MINOR(inode->i_rdev));
 	kmem_cache_free(dax_cache, dax_dev);
 }
@@ -169,26 +189,53 @@ static struct dax_device *dax_dev_get(dev_t devt)
 	return dax_dev;
 }
 
-struct dax_device *alloc_dax(void *private)
+static void dax_add_host(struct dax_device *dax_dev, const char *host)
+{
+	int hash;
+
+	/*
+	 * Unconditionally init dax_dev since it's coming from a
+	 * non-zeroed slab cache
+	 */
+	INIT_HLIST_NODE(&dax_dev->list);
+	dax_dev->host = host;
+	if (!host)
+		return;
+
+	hash = dax_host_hash(host);
+	spin_lock(&dax_host_lock);
+	hlist_add_head(&dax_dev->list, &dax_host_list[hash]);
+	spin_unlock(&dax_host_lock);
+}
+
+struct dax_device *alloc_dax(void *private, const char *__host)
 {
 	struct dax_device *dax_dev;
+	const char *host;
 	dev_t devt;
 	int minor;
 
+	host = kstrdup(__host, GFP_KERNEL);
+	if (__host && !host)
+		return NULL;
+
 	minor = ida_simple_get(&dax_minor_ida, 0, nr_dax, GFP_KERNEL);
 	if (minor < 0)
-		return NULL;
+		goto err_minor;
 
 	devt = MKDEV(MAJOR(dax_devt), minor);
 	dax_dev = dax_dev_get(devt);
 	if (!dax_dev)
-		goto err_inode;
+		goto err_dev;
 
+	dax_add_host(dax_dev, host);
 	dax_dev->private = private;
 	return dax_dev;
 
- err_inode:
+ err_dev:
 	ida_simple_remove(&dax_minor_ida, minor);
+ err_minor:
+	kfree(host);
 	return NULL;
 }
 EXPORT_SYMBOL_GPL(alloc_dax);
@@ -201,6 +248,38 @@ void put_dax(struct dax_device *dax_dev)
 }
 EXPORT_SYMBOL_GPL(put_dax);
 
+/**
+ * dax_get_by_host() - temporary lookup mechanism for filesystem-dax
+ * @host: alternate name for the device registered by a dax driver
+ */
+struct dax_device *dax_get_by_host(const char *host)
+{
+	struct dax_device *dax_dev, *found = NULL;
+	int hash, id;
+
+	if (!host)
+		return NULL;
+
+	hash = dax_host_hash(host);
+
+	id = dax_read_lock();
+	spin_lock(&dax_host_lock);
+	hlist_for_each_entry(dax_dev, &dax_host_list[hash], list) {
+		if (!dax_alive(dax_dev)
+				|| strcmp(host, dax_dev->host) != 0)
+			continue;
+
+		if (igrab(&dax_dev->inode))
+			found = dax_dev;
+		break;
+	}
+	spin_unlock(&dax_host_lock);
+	dax_read_unlock(id);
+
+	return found;
+}
+EXPORT_SYMBOL_GPL(dax_get_by_host);
+
 /**
  * inode_dax: convert a public inode into its dax_dev
  * @inode: An inode with i_cdev pointing to a dax_dev
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 5b62f5d19aea..9b2d5ba10d7d 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -10,6 +10,7 @@ struct iomap_ops;
 
 int dax_read_lock(void);
 void dax_read_unlock(int id);
+struct dax_device *dax_get_by_host(const char *host);
 
 /*
  * We use lowest available bit in exceptional entry for locking, one bit for
-- 
cgit 


From 6568b08b77816cda2a95919c7494108d983d5941 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 24 Jan 2017 18:44:18 -0800
Subject: dax: introduce dax_operations

Track a set of dax_operations per dax_device that can be set at
alloc_dax() time. These operations will be used to stop the abuse of
block_device_operations for communicating dax capabilities to
filesystems. It will also be used to replace the "pmem api" and move
pmem-specific cache maintenance, and other dax-driver-specific
filesystem-dax operations, to dax device methods. In particular this
allows us to stop abusing __copy_user_nocache(), via memcpy_to_pmem(),
with a driver specific replacement.

This is a standalone introduction of the operations. Follow on patches
convert each dax-driver and teach fs/dax.c to use ->direct_access() from
dax_operations instead of block_device_operations.

Suggested-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/dax/dax.h    |  4 +++-
 drivers/dax/device.c |  6 +++++-
 drivers/dax/super.c  |  6 +++++-
 include/linux/dax.h  | 10 ++++++++++
 4 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/drivers/dax/dax.h b/drivers/dax/dax.h
index 246a24d68d4c..617bbc24be2b 100644
--- a/drivers/dax/dax.h
+++ b/drivers/dax/dax.h
@@ -13,7 +13,9 @@
 #ifndef __DAX_H__
 #define __DAX_H__
 struct dax_device;
-struct dax_device *alloc_dax(void *private, const char *host);
+struct dax_operations;
+struct dax_device *alloc_dax(void *private, const char *host,
+		const struct dax_operations *ops);
 void put_dax(struct dax_device *dax_dev);
 bool dax_alive(struct dax_device *dax_dev);
 void kill_dax(struct dax_device *dax_dev);
diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index db68f4fa8ce0..a0db055054a4 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -645,7 +645,11 @@ struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region,
 		goto err_id;
 	}
 
-	dax_dev = alloc_dax(dev_dax, NULL);
+	/*
+	 * No 'host' or dax_operations since there is no access to this
+	 * device outside of mmap of the resulting character device.
+	 */
+	dax_dev = alloc_dax(dev_dax, NULL, NULL);
 	if (!dax_dev)
 		goto err_dax;
 
diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index 8d446674c1da..1a58542ee8fd 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -17,6 +17,7 @@
 #include <linux/cdev.h>
 #include <linux/hash.h>
 #include <linux/slab.h>
+#include <linux/dax.h>
 #include <linux/fs.h>
 
 static int nr_dax = CONFIG_NR_DEV_DAX;
@@ -61,6 +62,7 @@ struct dax_device {
 	const char *host;
 	void *private;
 	bool alive;
+	const struct dax_operations *ops;
 };
 
 bool dax_alive(struct dax_device *dax_dev)
@@ -208,7 +210,8 @@ static void dax_add_host(struct dax_device *dax_dev, const char *host)
 	spin_unlock(&dax_host_lock);
 }
 
-struct dax_device *alloc_dax(void *private, const char *__host)
+struct dax_device *alloc_dax(void *private, const char *__host,
+		const struct dax_operations *ops)
 {
 	struct dax_device *dax_dev;
 	const char *host;
@@ -229,6 +232,7 @@ struct dax_device *alloc_dax(void *private, const char *__host)
 		goto err_dev;
 
 	dax_add_host(dax_dev, host);
+	dax_dev->ops = ops;
 	dax_dev->private = private;
 	return dax_dev;
 
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 9b2d5ba10d7d..74ebb92b625a 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -7,6 +7,16 @@
 #include <asm/pgtable.h>
 
 struct iomap_ops;
+struct dax_device;
+struct dax_operations {
+	/*
+	 * direct_access: translate a device-relative
+	 * logical-page-offset into an absolute physical pfn. Return the
+	 * number of pages available for DAX at that pfn.
+	 */
+	long (*direct_access)(struct dax_device *, pgoff_t, long,
+			void **, pfn_t *);
+};
 
 int dax_read_lock(void);
 void dax_read_unlock(int id);
-- 
cgit 


From c1d6e828a35df524df2af277eedd1471d05e4f4c Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 24 Jan 2017 23:02:09 -0800
Subject: pmem: add dax_operations support

Setup a dax_device to have the same lifetime as the pmem block device
and add a ->direct_access() method that is equivalent to
pmem_direct_access(). Once fs/dax.c has been converted to use
dax_operations the old pmem_direct_access() will be removed.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/dax/dax.h               |  7 -----
 drivers/nvdimm/Kconfig          |  1 +
 drivers/nvdimm/pmem.c           | 61 ++++++++++++++++++++++++++++++++---------
 drivers/nvdimm/pmem.h           |  7 +++--
 include/linux/dax.h             |  6 ++++
 tools/testing/nvdimm/pmem-dax.c | 21 +++++++-------
 6 files changed, 70 insertions(+), 33 deletions(-)

diff --git a/drivers/dax/dax.h b/drivers/dax/dax.h
index 617bbc24be2b..f9e5feea742c 100644
--- a/drivers/dax/dax.h
+++ b/drivers/dax/dax.h
@@ -13,13 +13,6 @@
 #ifndef __DAX_H__
 #define __DAX_H__
 struct dax_device;
-struct dax_operations;
-struct dax_device *alloc_dax(void *private, const char *host,
-		const struct dax_operations *ops);
-void put_dax(struct dax_device *dax_dev);
-bool dax_alive(struct dax_device *dax_dev);
-void kill_dax(struct dax_device *dax_dev);
 struct dax_device *inode_dax(struct inode *inode);
 struct inode *dax_inode(struct dax_device *dax_dev);
-void *dax_get_private(struct dax_device *dax_dev);
 #endif /* __DAX_H__ */
diff --git a/drivers/nvdimm/Kconfig b/drivers/nvdimm/Kconfig
index 59e750183b7f..5bdd499b5f4f 100644
--- a/drivers/nvdimm/Kconfig
+++ b/drivers/nvdimm/Kconfig
@@ -20,6 +20,7 @@ if LIBNVDIMM
 config BLK_DEV_PMEM
 	tristate "PMEM: Persistent memory block device support"
 	default LIBNVDIMM
+	select DAX
 	select ND_BTT if BTT
 	select ND_PFN if NVDIMM_PFN
 	help
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 5b536be5a12e..fbbcf8154eec 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -28,6 +28,7 @@
 #include <linux/pfn_t.h>
 #include <linux/slab.h>
 #include <linux/pmem.h>
+#include <linux/dax.h>
 #include <linux/nd.h>
 #include "pmem.h"
 #include "pfn.h"
@@ -199,13 +200,13 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,
 }
 
 /* see "strong" declaration in tools/testing/nvdimm/pmem-dax.c */
-__weak long pmem_direct_access(struct block_device *bdev, sector_t sector,
-		      void **kaddr, pfn_t *pfn, long size)
+__weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
+		long nr_pages, void **kaddr, pfn_t *pfn)
 {
-	struct pmem_device *pmem = bdev->bd_queue->queuedata;
-	resource_size_t offset = sector * 512 + pmem->data_offset;
+	resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset;
 
-	if (unlikely(is_bad_pmem(&pmem->bb, sector, size)))
+	if (unlikely(is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512,
+					PFN_PHYS(nr_pages))))
 		return -EIO;
 	*kaddr = pmem->virt_addr + offset;
 	*pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);
@@ -215,26 +216,51 @@ __weak long pmem_direct_access(struct block_device *bdev, sector_t sector,
 	 * requested range.
 	 */
 	if (unlikely(pmem->bb.count))
-		return size;
-	return pmem->size - pmem->pfn_pad - offset;
+		return nr_pages;
+	return PHYS_PFN(pmem->size - pmem->pfn_pad - offset);
+}
+
+static long pmem_blk_direct_access(struct block_device *bdev, sector_t sector,
+		void **kaddr, pfn_t *pfn, long size)
+{
+	struct pmem_device *pmem = bdev->bd_queue->queuedata;
+
+	return __pmem_direct_access(pmem, PHYS_PFN(sector * 512),
+			PHYS_PFN(size), kaddr, pfn);
 }
 
 static const struct block_device_operations pmem_fops = {
 	.owner =		THIS_MODULE,
 	.rw_page =		pmem_rw_page,
-	.direct_access =	pmem_direct_access,
+	.direct_access =	pmem_blk_direct_access,
 	.revalidate_disk =	nvdimm_revalidate_disk,
 };
 
+static long pmem_dax_direct_access(struct dax_device *dax_dev,
+		pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn)
+{
+	struct pmem_device *pmem = dax_get_private(dax_dev);
+
+	return __pmem_direct_access(pmem, pgoff, nr_pages, kaddr, pfn);
+}
+
+static const struct dax_operations pmem_dax_ops = {
+	.direct_access = pmem_dax_direct_access,
+};
+
 static void pmem_release_queue(void *q)
 {
 	blk_cleanup_queue(q);
 }
 
-static void pmem_release_disk(void *disk)
+static void pmem_release_disk(void *__pmem)
 {
-	del_gendisk(disk);
-	put_disk(disk);
+	struct pmem_device *pmem = __pmem;
+
+	kill_dax(pmem->dax_dev);
+	put_dax(pmem->dax_dev);
+	del_gendisk(pmem->disk);
+	put_disk(pmem->disk);
 }
 
 static int pmem_attach_disk(struct device *dev,
@@ -245,6 +271,7 @@ static int pmem_attach_disk(struct device *dev,
 	struct vmem_altmap __altmap, *altmap = NULL;
 	struct resource *res = &nsio->res;
 	struct nd_pfn *nd_pfn = NULL;
+	struct dax_device *dax_dev;
 	int nid = dev_to_node(dev);
 	struct nd_pfn_sb *pfn_sb;
 	struct pmem_device *pmem;
@@ -325,6 +352,7 @@ static int pmem_attach_disk(struct device *dev,
 	disk = alloc_disk_node(0, nid);
 	if (!disk)
 		return -ENOMEM;
+	pmem->disk = disk;
 
 	disk->fops		= &pmem_fops;
 	disk->queue		= q;
@@ -336,9 +364,16 @@ static int pmem_attach_disk(struct device *dev,
 		return -ENOMEM;
 	nvdimm_badblocks_populate(nd_region, &pmem->bb, res);
 	disk->bb = &pmem->bb;
-	device_add_disk(dev, disk);
 
-	if (devm_add_action_or_reset(dev, pmem_release_disk, disk))
+	dax_dev = alloc_dax(pmem, disk->disk_name, &pmem_dax_ops);
+	if (!dax_dev) {
+		put_disk(disk);
+		return -ENOMEM;
+	}
+	pmem->dax_dev = dax_dev;
+
+	device_add_disk(dev, disk);
+	if (devm_add_action_or_reset(dev, pmem_release_disk, pmem))
 		return -ENOMEM;
 
 	revalidate_disk(disk);
diff --git a/drivers/nvdimm/pmem.h b/drivers/nvdimm/pmem.h
index b4ee4f71b4a1..7f4dbd72a90a 100644
--- a/drivers/nvdimm/pmem.h
+++ b/drivers/nvdimm/pmem.h
@@ -5,8 +5,6 @@
 #include <linux/pfn_t.h>
 #include <linux/fs.h>
 
-long pmem_direct_access(struct block_device *bdev, sector_t sector,
-		      void **kaddr, pfn_t *pfn, long size);
 /* this definition is in it's own header for tools/testing/nvdimm to consume */
 struct pmem_device {
 	/* One contiguous memory region per device */
@@ -20,5 +18,10 @@ struct pmem_device {
 	/* trim size when namespace capacity has been section aligned */
 	u32			pfn_pad;
 	struct badblocks	bb;
+	struct dax_device	*dax_dev;
+	struct gendisk		*disk;
 };
+
+long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
+		long nr_pages, void **kaddr, pfn_t *pfn);
 #endif /* __NVDIMM_PMEM_H__ */
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 74ebb92b625a..39a0312c45c3 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -21,6 +21,12 @@ struct dax_operations {
 int dax_read_lock(void);
 void dax_read_unlock(int id);
 struct dax_device *dax_get_by_host(const char *host);
+struct dax_device *alloc_dax(void *private, const char *host,
+		const struct dax_operations *ops);
+void put_dax(struct dax_device *dax_dev);
+bool dax_alive(struct dax_device *dax_dev);
+void kill_dax(struct dax_device *dax_dev);
+void *dax_get_private(struct dax_device *dax_dev);
 
 /*
  * We use lowest available bit in exceptional entry for locking, one bit for
diff --git a/tools/testing/nvdimm/pmem-dax.c b/tools/testing/nvdimm/pmem-dax.c
index c9b8c48f85fc..b53596ad601b 100644
--- a/tools/testing/nvdimm/pmem-dax.c
+++ b/tools/testing/nvdimm/pmem-dax.c
@@ -15,13 +15,13 @@
 #include <pmem.h>
 #include <nd.h>
 
-long pmem_direct_access(struct block_device *bdev, sector_t sector,
-		void **kaddr, pfn_t *pfn, long size)
+long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
+		long nr_pages, void **kaddr, pfn_t *pfn)
 {
-	struct pmem_device *pmem = bdev->bd_queue->queuedata;
-	resource_size_t offset = sector * 512 + pmem->data_offset;
+	resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset;
 
-	if (unlikely(is_bad_pmem(&pmem->bb, sector, size)))
+	if (unlikely(is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512,
+					PFN_PHYS(nr_pages))))
 		return -EIO;
 
 	/*
@@ -34,11 +34,10 @@ long pmem_direct_access(struct block_device *bdev, sector_t sector,
 		*kaddr = pmem->virt_addr + offset;
 		page = vmalloc_to_page(pmem->virt_addr + offset);
 		*pfn = page_to_pfn_t(page);
-		dev_dbg_ratelimited(disk_to_dev(bdev->bd_disk)->parent,
-				"%s: sector: %#llx pfn: %#lx\n", __func__,
-				(unsigned long long) sector, page_to_pfn(page));
+		pr_debug_ratelimited("%s: pmem: %p pgoff: %#lx pfn: %#lx\n",
+				__func__, pmem, pgoff, page_to_pfn(page));
 
-		return PAGE_SIZE;
+		return 1;
 	}
 
 	*kaddr = pmem->virt_addr + offset;
@@ -49,6 +48,6 @@ long pmem_direct_access(struct block_device *bdev, sector_t sector,
 	 * requested range.
 	 */
 	if (unlikely(pmem->bb.count))
-		return size;
-	return pmem->size - pmem->pfn_pad - offset;
+		return nr_pages;
+	return PHYS_PFN(pmem->size - pmem->pfn_pad - offset);
 }
-- 
cgit 


From 60fcd55cc236dbb3d6587f48120f00f59cb08540 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 25 Jan 2017 15:25:11 -0800
Subject: axon_ram: add dax_operations support

Setup a dax_device to have the same lifetime as the axon_ram block
device and add a ->direct_access() method that is equivalent to
axon_ram_direct_access(). Once fs/dax.c has been converted to use
dax_operations the old axon_ram_direct_access() will be removed.

Reported-by: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 arch/powerpc/platforms/Kconfig |  1 +
 arch/powerpc/sysdev/axonram.c  | 48 ++++++++++++++++++++++++++++++++++++------
 2 files changed, 43 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig
index 7e3a2ebba29b..33244e3d9375 100644
--- a/arch/powerpc/platforms/Kconfig
+++ b/arch/powerpc/platforms/Kconfig
@@ -284,6 +284,7 @@ config CPM2
 config AXON_RAM
 	tristate "Axon DDR2 memory device driver"
 	depends on PPC_IBM_CELL_BLADE && BLOCK
+	select DAX
 	default m
 	help
 	  It registers one block device per Axon's DDR2 memory bank found
diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c
index f523ac883150..171ba86a3494 100644
--- a/arch/powerpc/sysdev/axonram.c
+++ b/arch/powerpc/sysdev/axonram.c
@@ -25,6 +25,7 @@
 
 #include <linux/bio.h>
 #include <linux/blkdev.h>
+#include <linux/dax.h>
 #include <linux/device.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
@@ -62,6 +63,7 @@ static int azfs_major, azfs_minor;
 struct axon_ram_bank {
 	struct platform_device	*device;
 	struct gendisk		*disk;
+	struct dax_device	*dax_dev;
 	unsigned int		irq_id;
 	unsigned long		ph_addr;
 	unsigned long		io_addr;
@@ -137,25 +139,47 @@ axon_ram_make_request(struct request_queue *queue, struct bio *bio)
 	return BLK_QC_T_NONE;
 }
 
+static long
+__axon_ram_direct_access(struct axon_ram_bank *bank, pgoff_t pgoff, long nr_pages,
+		       void **kaddr, pfn_t *pfn)
+{
+	resource_size_t offset = pgoff * PAGE_SIZE;
+
+	*kaddr = (void *) bank->io_addr + offset;
+	*pfn = phys_to_pfn_t(bank->ph_addr + offset, PFN_DEV);
+	return (bank->size - offset) / PAGE_SIZE;
+}
+
 /**
  * axon_ram_direct_access - direct_access() method for block device
  * @device, @sector, @data: see block_device_operations method
  */
 static long
-axon_ram_direct_access(struct block_device *device, sector_t sector,
+axon_ram_blk_direct_access(struct block_device *device, sector_t sector,
 		       void **kaddr, pfn_t *pfn, long size)
 {
 	struct axon_ram_bank *bank = device->bd_disk->private_data;
-	loff_t offset = (loff_t)sector << AXON_RAM_SECTOR_SHIFT;
 
-	*kaddr = (void *) bank->io_addr + offset;
-	*pfn = phys_to_pfn_t(bank->ph_addr + offset, PFN_DEV);
-	return bank->size - offset;
+	return __axon_ram_direct_access(bank, (sector * 512) / PAGE_SIZE,
+			size / PAGE_SIZE, kaddr, pfn) * PAGE_SIZE;
 }
 
 static const struct block_device_operations axon_ram_devops = {
 	.owner		= THIS_MODULE,
-	.direct_access	= axon_ram_direct_access
+	.direct_access	= axon_ram_blk_direct_access
+};
+
+static long
+axon_ram_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages,
+		       void **kaddr, pfn_t *pfn)
+{
+	struct axon_ram_bank *bank = dax_get_private(dax_dev);
+
+	return __axon_ram_direct_access(bank, pgoff, nr_pages, kaddr, pfn);
+}
+
+static const struct dax_operations axon_ram_dax_ops = {
+	.direct_access = axon_ram_dax_direct_access,
 };
 
 /**
@@ -219,6 +243,7 @@ static int axon_ram_probe(struct platform_device *device)
 		goto failed;
 	}
 
+
 	bank->disk->major = azfs_major;
 	bank->disk->first_minor = azfs_minor;
 	bank->disk->fops = &axon_ram_devops;
@@ -227,6 +252,13 @@ static int axon_ram_probe(struct platform_device *device)
 	sprintf(bank->disk->disk_name, "%s%d",
 			AXON_RAM_DEVICE_NAME, axon_ram_bank_id);
 
+	bank->dax_dev = alloc_dax(bank, bank->disk->disk_name,
+			&axon_ram_dax_ops);
+	if (!bank->dax_dev) {
+		rc = -ENOMEM;
+		goto failed;
+	}
+
 	bank->disk->queue = blk_alloc_queue(GFP_KERNEL);
 	if (bank->disk->queue == NULL) {
 		dev_err(&device->dev, "Cannot register disk queue\n");
@@ -278,6 +310,8 @@ failed:
 				del_gendisk(bank->disk);
 			put_disk(bank->disk);
 		}
+		kill_dax(bank->dax_dev);
+		put_dax(bank->dax_dev);
 		device->dev.platform_data = NULL;
 		if (bank->io_addr != 0)
 			iounmap((void __iomem *) bank->io_addr);
@@ -300,6 +334,8 @@ axon_ram_remove(struct platform_device *device)
 
 	device_remove_file(&device->dev, &dev_attr_ecc);
 	free_irq(bank->irq_id, device);
+	kill_dax(bank->dax_dev);
+	put_dax(bank->dax_dev);
 	del_gendisk(bank->disk);
 	put_disk(bank->disk);
 	iounmap((void __iomem *) bank->io_addr);
-- 
cgit 


From 1647b9b959c7b1b3c20e8efa2c40529e7dce756a Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 25 Jan 2017 16:54:45 -0800
Subject: brd: add dax_operations support

Setup a dax_inode to have the same lifetime as the brd block device and
add a ->direct_access() method that is equivalent to
brd_direct_access(). Once fs/dax.c has been converted to use
dax_operations the old brd_direct_access() will be removed.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/block/Kconfig |  1 +
 drivers/block/brd.c   | 65 ++++++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 55 insertions(+), 11 deletions(-)

diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index f744de7a0f9b..e66956fc2c88 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -339,6 +339,7 @@ config BLK_DEV_SX8
 
 config BLK_DEV_RAM
 	tristate "RAM block device support"
+	select DAX if BLK_DEV_RAM_DAX
 	---help---
 	  Saying Y here will allow you to use a portion of your RAM memory as
 	  a block device, so that you can make file systems on it, read and
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 3adc32a3153b..60f3193c9ce2 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -21,6 +21,7 @@
 #include <linux/slab.h>
 #ifdef CONFIG_BLK_DEV_RAM_DAX
 #include <linux/pfn_t.h>
+#include <linux/dax.h>
 #endif
 
 #include <linux/uaccess.h>
@@ -41,6 +42,9 @@ struct brd_device {
 
 	struct request_queue	*brd_queue;
 	struct gendisk		*brd_disk;
+#ifdef CONFIG_BLK_DEV_RAM_DAX
+	struct dax_device	*dax_dev;
+#endif
 	struct list_head	brd_list;
 
 	/*
@@ -375,30 +379,53 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector,
 }
 
 #ifdef CONFIG_BLK_DEV_RAM_DAX
-static long brd_direct_access(struct block_device *bdev, sector_t sector,
-			void **kaddr, pfn_t *pfn, long size)
+static long __brd_direct_access(struct brd_device *brd, pgoff_t pgoff,
+		long nr_pages, void **kaddr, pfn_t *pfn)
 {
-	struct brd_device *brd = bdev->bd_disk->private_data;
 	struct page *page;
 
 	if (!brd)
 		return -ENODEV;
-	page = brd_insert_page(brd, sector);
+	page = brd_insert_page(brd, PFN_PHYS(pgoff) / 512);
 	if (!page)
 		return -ENOSPC;
 	*kaddr = page_address(page);
 	*pfn = page_to_pfn_t(page);
 
-	return PAGE_SIZE;
+	return 1;
+}
+
+static long brd_blk_direct_access(struct block_device *bdev, sector_t sector,
+			void **kaddr, pfn_t *pfn, long size)
+{
+	struct brd_device *brd = bdev->bd_disk->private_data;
+	long nr_pages = __brd_direct_access(brd, PHYS_PFN(sector * 512),
+			PHYS_PFN(size), kaddr, pfn);
+
+	if (nr_pages < 0)
+		return nr_pages;
+	return nr_pages * PAGE_SIZE;
+}
+
+static long brd_dax_direct_access(struct dax_device *dax_dev,
+		pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn)
+{
+	struct brd_device *brd = dax_get_private(dax_dev);
+
+	return __brd_direct_access(brd, pgoff, nr_pages, kaddr, pfn);
 }
+
+static const struct dax_operations brd_dax_ops = {
+	.direct_access = brd_dax_direct_access,
+};
 #else
-#define brd_direct_access NULL
+#define brd_blk_direct_access NULL
 #endif
 
 static const struct block_device_operations brd_fops = {
 	.owner =		THIS_MODULE,
 	.rw_page =		brd_rw_page,
-	.direct_access =	brd_direct_access,
+	.direct_access =	brd_blk_direct_access,
 };
 
 /*
@@ -441,7 +468,9 @@ static struct brd_device *brd_alloc(int i)
 {
 	struct brd_device *brd;
 	struct gendisk *disk;
-
+#ifdef CONFIG_BLK_DEV_RAM_DAX
+	struct dax_device *dax_dev;
+#endif
 	brd = kzalloc(sizeof(*brd), GFP_KERNEL);
 	if (!brd)
 		goto out;
@@ -469,9 +498,6 @@ static struct brd_device *brd_alloc(int i)
 	blk_queue_max_discard_sectors(brd->brd_queue, UINT_MAX);
 	brd->brd_queue->limits.discard_zeroes_data = 1;
 	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, brd->brd_queue);
-#ifdef CONFIG_BLK_DEV_RAM_DAX
-	queue_flag_set_unlocked(QUEUE_FLAG_DAX, brd->brd_queue);
-#endif
 	disk = brd->brd_disk = alloc_disk(max_part);
 	if (!disk)
 		goto out_free_queue;
@@ -484,8 +510,21 @@ static struct brd_device *brd_alloc(int i)
 	sprintf(disk->disk_name, "ram%d", i);
 	set_capacity(disk, rd_size * 2);
 
+#ifdef CONFIG_BLK_DEV_RAM_DAX
+	queue_flag_set_unlocked(QUEUE_FLAG_DAX, brd->brd_queue);
+	dax_dev = alloc_dax(brd, disk->disk_name, &brd_dax_ops);
+	if (!dax_dev)
+		goto out_free_inode;
+#endif
+
+
 	return brd;
 
+#ifdef CONFIG_BLK_DEV_RAM_DAX
+out_free_inode:
+	kill_dax(dax_dev);
+	put_dax(dax_dev);
+#endif
 out_free_queue:
 	blk_cleanup_queue(brd->brd_queue);
 out_free_dev:
@@ -525,6 +564,10 @@ out:
 static void brd_del_one(struct brd_device *brd)
 {
 	list_del(&brd->brd_list);
+#ifdef CONFIG_BLK_DEV_RAM_DAX
+	kill_dax(brd->dax_dev);
+	put_dax(brd->dax_dev);
+#endif
 	del_gendisk(brd->brd_disk);
 	brd_free(brd);
 }
-- 
cgit 


From 7a2765f6e82063f348ebce78c28eceff741689d4 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 26 Jan 2017 13:54:35 -0800
Subject: dcssblk: add dax_operations support

Setup a dax_dev to have the same lifetime as the dcssblk block device
and add a ->direct_access() method that is equivalent to
dcssblk_direct_access(). Once fs/dax.c has been converted to use
dax_operations the old dcssblk_direct_access() will be removed.

Reported-by: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Acked-by: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/s390/block/Kconfig   |  1 +
 drivers/s390/block/dcssblk.c | 55 ++++++++++++++++++++++++++++++++++++--------
 2 files changed, 47 insertions(+), 9 deletions(-)

diff --git a/drivers/s390/block/Kconfig b/drivers/s390/block/Kconfig
index 4a3b62326183..0acb8c2f9475 100644
--- a/drivers/s390/block/Kconfig
+++ b/drivers/s390/block/Kconfig
@@ -14,6 +14,7 @@ config BLK_DEV_XPRAM
 
 config DCSSBLK
 	def_tristate m
+	select DAX
 	prompt "DCSSBLK support"
 	depends on S390 && BLOCK
 	help
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index 415d10a67b7a..dc84cfd4e438 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -18,6 +18,7 @@
 #include <linux/interrupt.h>
 #include <linux/platform_device.h>
 #include <linux/pfn_t.h>
+#include <linux/dax.h>
 #include <asm/extmem.h>
 #include <asm/io.h>
 
@@ -30,8 +31,10 @@ static int dcssblk_open(struct block_device *bdev, fmode_t mode);
 static void dcssblk_release(struct gendisk *disk, fmode_t mode);
 static blk_qc_t dcssblk_make_request(struct request_queue *q,
 						struct bio *bio);
-static long dcssblk_direct_access(struct block_device *bdev, sector_t secnum,
+static long dcssblk_blk_direct_access(struct block_device *bdev, sector_t secnum,
 			 void **kaddr, pfn_t *pfn, long size);
+static long dcssblk_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
+		long nr_pages, void **kaddr, pfn_t *pfn);
 
 static char dcssblk_segments[DCSSBLK_PARM_LEN] = "\0";
 
@@ -40,7 +43,11 @@ static const struct block_device_operations dcssblk_devops = {
 	.owner   	= THIS_MODULE,
 	.open    	= dcssblk_open,
 	.release 	= dcssblk_release,
-	.direct_access 	= dcssblk_direct_access,
+	.direct_access 	= dcssblk_blk_direct_access,
+};
+
+static const struct dax_operations dcssblk_dax_ops = {
+	.direct_access = dcssblk_dax_direct_access,
 };
 
 struct dcssblk_dev_info {
@@ -57,6 +64,7 @@ struct dcssblk_dev_info {
 	struct request_queue *dcssblk_queue;
 	int num_of_segments;
 	struct list_head seg_list;
+	struct dax_device *dax_dev;
 };
 
 struct segment_info {
@@ -389,6 +397,8 @@ removeseg:
 	}
 	list_del(&dev_info->lh);
 
+	kill_dax(dev_info->dax_dev);
+	put_dax(dev_info->dax_dev);
 	del_gendisk(dev_info->gd);
 	blk_cleanup_queue(dev_info->dcssblk_queue);
 	dev_info->gd->queue = NULL;
@@ -654,6 +664,13 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char
 	if (rc)
 		goto put_dev;
 
+	dev_info->dax_dev = alloc_dax(dev_info, dev_info->gd->disk_name,
+			&dcssblk_dax_ops);
+	if (!dev_info->dax_dev) {
+		rc = -ENOMEM;
+		goto put_dev;
+	}
+
 	get_device(&dev_info->dev);
 	device_add_disk(&dev_info->dev, dev_info->gd);
 
@@ -752,6 +769,8 @@ dcssblk_remove_store(struct device *dev, struct device_attribute *attr, const ch
 	}
 
 	list_del(&dev_info->lh);
+	kill_dax(dev_info->dax_dev);
+	put_dax(dev_info->dax_dev);
 	del_gendisk(dev_info->gd);
 	blk_cleanup_queue(dev_info->dcssblk_queue);
 	dev_info->gd->queue = NULL;
@@ -883,21 +902,39 @@ fail:
 }
 
 static long
-dcssblk_direct_access (struct block_device *bdev, sector_t secnum,
+__dcssblk_direct_access(struct dcssblk_dev_info *dev_info, pgoff_t pgoff,
+		long nr_pages, void **kaddr, pfn_t *pfn)
+{
+	resource_size_t offset = pgoff * PAGE_SIZE;
+	unsigned long dev_sz;
+
+	dev_sz = dev_info->end - dev_info->start + 1;
+	*kaddr = (void *) dev_info->start + offset;
+	*pfn = __pfn_to_pfn_t(PFN_DOWN(dev_info->start + offset), PFN_DEV);
+
+	return (dev_sz - offset) / PAGE_SIZE;
+}
+
+static long
+dcssblk_blk_direct_access(struct block_device *bdev, sector_t secnum,
 			void **kaddr, pfn_t *pfn, long size)
 {
 	struct dcssblk_dev_info *dev_info;
-	unsigned long offset, dev_sz;
 
 	dev_info = bdev->bd_disk->private_data;
 	if (!dev_info)
 		return -ENODEV;
-	dev_sz = dev_info->end - dev_info->start + 1;
-	offset = secnum * 512;
-	*kaddr = (void *) dev_info->start + offset;
-	*pfn = __pfn_to_pfn_t(PFN_DOWN(dev_info->start + offset), PFN_DEV);
+	return __dcssblk_direct_access(dev_info, PHYS_PFN(secnum * 512),
+			PHYS_PFN(size), kaddr, pfn) * PAGE_SIZE;
+}
+
+static long
+dcssblk_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
+		long nr_pages, void **kaddr, pfn_t *pfn)
+{
+	struct dcssblk_dev_info *dev_info = dax_get_private(dax_dev);
 
-	return dev_sz - offset;
+	return __dcssblk_direct_access(dev_info, pgoff, nr_pages, kaddr, pfn);
 }
 
 static void
-- 
cgit 


From d8f07aee3f2fd959878bf614d4e984900018eb9e Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 26 Jan 2017 23:30:05 -0800
Subject: block: kill bdev_dax_capable()

This is leftover dead code that has since been replaced by
bdev_dax_supported().

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/block_dev.c         | 24 ------------------------
 include/linux/blkdev.h |  1 -
 2 files changed, 25 deletions(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 2eca00ec4370..7f40ea2f0875 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -807,30 +807,6 @@ int bdev_dax_supported(struct super_block *sb, int blocksize)
 }
 EXPORT_SYMBOL_GPL(bdev_dax_supported);
 
-/**
- * bdev_dax_capable() - Return if the raw device is capable for dax
- * @bdev: The device for raw block device access
- */
-bool bdev_dax_capable(struct block_device *bdev)
-{
-	struct blk_dax_ctl dax = {
-		.size = PAGE_SIZE,
-	};
-
-	if (!IS_ENABLED(CONFIG_FS_DAX))
-		return false;
-
-	dax.sector = 0;
-	if (bdev_direct_access(bdev, &dax) < 0)
-		return false;
-
-	dax.sector = bdev->bd_part->nr_sects - (PAGE_SIZE / 512);
-	if (bdev_direct_access(bdev, &dax) < 0)
-		return false;
-
-	return true;
-}
-
 /*
  * pseudo-fs
  */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 5a7da607ca04..f72708399b83 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1958,7 +1958,6 @@ extern int bdev_write_page(struct block_device *, sector_t, struct page *,
 						struct writeback_control *);
 extern long bdev_direct_access(struct block_device *, struct blk_dax_ctl *);
 extern int bdev_dax_supported(struct super_block *, int);
-extern bool bdev_dax_capable(struct block_device *);
 #else /* CONFIG_BLOCK */
 
 struct block_device;
-- 
cgit 


From b0686260fecaa924d8eff2ace94bee70506bc308 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 26 Jan 2017 20:37:35 -0800
Subject: dax: introduce dax_direct_access()

Replace bdev_direct_access() with dax_direct_access() that uses
dax_device and dax_operations instead of a block_device and
block_device_operations for dax. Once all consumers of the old api have
been converted bdev_direct_access() will be deleted.

Given that block device partitioning decisions can cause dax page
alignment constraints to be violated this also introduces the
bdev_dax_pgoff() helper. It handles calculating a logical pgoff relative
to the dax_device and also checks for page alignment.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 block/Kconfig          |  1 +
 drivers/dax/super.c    | 39 +++++++++++++++++++++++++++++++++++++++
 fs/block_dev.c         | 14 ++++++++++++++
 include/linux/blkdev.h |  1 +
 include/linux/dax.h    |  2 ++
 5 files changed, 57 insertions(+)

diff --git a/block/Kconfig b/block/Kconfig
index e9f780f815f5..93da7fc3f254 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -6,6 +6,7 @@ menuconfig BLOCK
        default y
        select SBITMAP
        select SRCU
+       select DAX
        help
 	 Provide block layer support for the kernel.
 
diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index 1a58542ee8fd..465dcd7317d5 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -65,6 +65,45 @@ struct dax_device {
 	const struct dax_operations *ops;
 };
 
+/**
+ * dax_direct_access() - translate a device pgoff to an absolute pfn
+ * @dax_dev: a dax_device instance representing the logical memory range
+ * @pgoff: offset in pages from the start of the device to translate
+ * @nr_pages: number of consecutive pages caller can handle relative to @pfn
+ * @kaddr: output parameter that returns a virtual address mapping of pfn
+ * @pfn: output parameter that returns an absolute pfn translation of @pgoff
+ *
+ * Return: negative errno if an error occurs, otherwise the number of
+ * pages accessible at the device relative @pgoff.
+ */
+long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages,
+		void **kaddr, pfn_t *pfn)
+{
+	long avail;
+
+	/*
+	 * The device driver is allowed to sleep, in order to make the
+	 * memory directly accessible.
+	 */
+	might_sleep();
+
+	if (!dax_dev)
+		return -EOPNOTSUPP;
+
+	if (!dax_alive(dax_dev))
+		return -ENXIO;
+
+	if (nr_pages < 0)
+		return nr_pages;
+
+	avail = dax_dev->ops->direct_access(dax_dev, pgoff, nr_pages,
+			kaddr, pfn);
+	if (!avail)
+		return -ERANGE;
+	return min(avail, nr_pages);
+}
+EXPORT_SYMBOL_GPL(dax_direct_access);
+
 bool dax_alive(struct dax_device *dax_dev)
 {
 	lockdep_assert_held(&dax_srcu);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 7f40ea2f0875..2f7885712575 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -18,6 +18,7 @@
 #include <linux/module.h>
 #include <linux/blkpg.h>
 #include <linux/magic.h>
+#include <linux/dax.h>
 #include <linux/buffer_head.h>
 #include <linux/swap.h>
 #include <linux/pagevec.h>
@@ -762,6 +763,19 @@ long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax)
 }
 EXPORT_SYMBOL_GPL(bdev_direct_access);
 
+int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size,
+		pgoff_t *pgoff)
+{
+	phys_addr_t phys_off = (get_start_sect(bdev) + sector) * 512;
+
+	if (pgoff)
+		*pgoff = PHYS_PFN(phys_off);
+	if (phys_off % PAGE_SIZE || size % PAGE_SIZE)
+		return -EINVAL;
+	return 0;
+}
+EXPORT_SYMBOL(bdev_dax_pgoff);
+
 /**
  * bdev_dax_supported() - Check if the device supports dax for filesystem
  * @sb: The superblock of the device
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f72708399b83..612c497d1461 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1958,6 +1958,7 @@ extern int bdev_write_page(struct block_device *, sector_t, struct page *,
 						struct writeback_control *);
 extern long bdev_direct_access(struct block_device *, struct blk_dax_ctl *);
 extern int bdev_dax_supported(struct super_block *, int);
+int bdev_dax_pgoff(struct block_device *, sector_t, size_t, pgoff_t *pgoff);
 #else /* CONFIG_BLOCK */
 
 struct block_device;
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 39a0312c45c3..7e62e280c11f 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -27,6 +27,8 @@ void put_dax(struct dax_device *dax_dev);
 bool dax_alive(struct dax_device *dax_dev);
 void kill_dax(struct dax_device *dax_dev);
 void *dax_get_private(struct dax_device *dax_dev);
+long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages,
+		void **kaddr, pfn_t *pfn);
 
 /*
  * We use lowest available bit in exceptional entry for locking, one bit for
-- 
cgit 


From f26c5719b2d7b00de69eb83eb1c1c831759fdc9b Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 12 Apr 2017 12:35:44 -0700
Subject: dm: add dax_device and dax_operations support

Allocate a dax_device to represent the capacity of a device-mapper
instance. Provide a ->direct_access() method via the new dax_operations
indirection that mirrors the functionality of the current direct_access
support via block_device_operations.  Once fs/dax.c has been converted
to use dax_operations the old dm_blk_direct_access() will be removed.

A new helper dm_dax_get_live_target() is introduced to separate some of
the dm-specifics from the direct_access implementation.

This enabling is only for the top-level dm representation to upper
layers. Converting target direct_access implementations is deferred to a
separate patch.

Cc: Toshi Kani <toshi.kani@hpe.com>
Reviewed-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/md/Kconfig            |  1 +
 drivers/md/dm-core.h          |  1 +
 drivers/md/dm.c               | 84 +++++++++++++++++++++++++++++++++++--------
 include/linux/device-mapper.h |  1 +
 4 files changed, 73 insertions(+), 14 deletions(-)

diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index b7767da50c26..1de8372d9459 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -200,6 +200,7 @@ config BLK_DEV_DM_BUILTIN
 config BLK_DEV_DM
 	tristate "Device mapper support"
 	select BLK_DEV_DM_BUILTIN
+	select DAX
 	---help---
 	  Device-mapper is a low level volume manager.  It works by allowing
 	  people to specify mappings for ranges of logical sectors.  Various
diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index 136fda3ff9e5..538630190f66 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -58,6 +58,7 @@ struct mapped_device {
 	struct target_type *immutable_target_type;
 
 	struct gendisk *disk;
+	struct dax_device *dax_dev;
 	char name[16];
 
 	void *interface_ptr;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index dfb75979e455..bd56dfe43a99 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -16,6 +16,7 @@
 #include <linux/blkpg.h>
 #include <linux/bio.h>
 #include <linux/mempool.h>
+#include <linux/dax.h>
 #include <linux/slab.h>
 #include <linux/idr.h>
 #include <linux/hdreg.h>
@@ -908,31 +909,68 @@ int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
 }
 EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
 
-static long dm_blk_direct_access(struct block_device *bdev, sector_t sector,
-				 void **kaddr, pfn_t *pfn, long size)
+static struct dm_target *dm_dax_get_live_target(struct mapped_device *md,
+		sector_t sector, int *srcu_idx)
 {
-	struct mapped_device *md = bdev->bd_disk->private_data;
 	struct dm_table *map;
 	struct dm_target *ti;
-	int srcu_idx;
-	long len, ret = -EIO;
 
-	map = dm_get_live_table(md, &srcu_idx);
+	map = dm_get_live_table(md, srcu_idx);
 	if (!map)
-		goto out;
+		return NULL;
 
 	ti = dm_table_find_target(map, sector);
 	if (!dm_target_is_valid(ti))
-		goto out;
+		return NULL;
 
-	len = max_io_len(sector, ti) << SECTOR_SHIFT;
-	size = min(len, size);
+	return ti;
+}
 
-	if (ti->type->direct_access)
-		ret = ti->type->direct_access(ti, sector, kaddr, pfn, size);
-out:
+static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
+		long nr_pages, void **kaddr, pfn_t *pfn)
+{
+	struct mapped_device *md = dax_get_private(dax_dev);
+	sector_t sector = pgoff * PAGE_SECTORS;
+	struct dm_target *ti;
+	long len, ret = -EIO;
+	int srcu_idx;
+
+	ti = dm_dax_get_live_target(md, sector, &srcu_idx);
+
+	if (!ti)
+		goto out;
+	if (!ti->type->direct_access)
+		goto out;
+	len = max_io_len(sector, ti) / PAGE_SECTORS;
+	if (len < 1)
+		goto out;
+	nr_pages = min(len, nr_pages);
+	if (ti->type->direct_access) {
+		ret = ti->type->direct_access(ti, sector, kaddr, pfn,
+				nr_pages * PAGE_SIZE);
+		/*
+		 * FIXME: convert ti->type->direct_access to return
+		 * nr_pages directly.
+		 */
+		if (ret >= 0)
+			ret /= PAGE_SIZE;
+	}
+ out:
 	dm_put_live_table(md, srcu_idx);
-	return min(ret, size);
+
+	return ret;
+}
+
+static long dm_blk_direct_access(struct block_device *bdev, sector_t sector,
+		void **kaddr, pfn_t *pfn, long size)
+{
+	struct mapped_device *md = bdev->bd_disk->private_data;
+	struct dax_device *dax_dev = md->dax_dev;
+	long nr_pages = size / PAGE_SIZE;
+
+	nr_pages = dm_dax_direct_access(dax_dev, sector / PAGE_SECTORS,
+			nr_pages, kaddr, pfn);
+	return nr_pages < 0 ? nr_pages : nr_pages * PAGE_SIZE;
 }
 
 /*
@@ -1437,6 +1475,7 @@ static int next_free_minor(int *minor)
 }
 
 static const struct block_device_operations dm_blk_dops;
+static const struct dax_operations dm_dax_ops;
 
 static void dm_wq_work(struct work_struct *work);
 
@@ -1483,6 +1522,12 @@ static void cleanup_mapped_device(struct mapped_device *md)
 	if (md->bs)
 		bioset_free(md->bs);
 
+	if (md->dax_dev) {
+		kill_dax(md->dax_dev);
+		put_dax(md->dax_dev);
+		md->dax_dev = NULL;
+	}
+
 	if (md->disk) {
 		spin_lock(&_minor_lock);
 		md->disk->private_data = NULL;
@@ -1510,6 +1555,7 @@ static void cleanup_mapped_device(struct mapped_device *md)
 static struct mapped_device *alloc_dev(int minor)
 {
 	int r, numa_node_id = dm_get_numa_node();
+	struct dax_device *dax_dev;
 	struct mapped_device *md;
 	void *old_md;
 
@@ -1574,6 +1620,12 @@ static struct mapped_device *alloc_dev(int minor)
 	md->disk->queue = md->queue;
 	md->disk->private_data = md;
 	sprintf(md->disk->disk_name, "dm-%d", minor);
+
+	dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops);
+	if (!dax_dev)
+		goto bad;
+	md->dax_dev = dax_dev;
+
 	add_disk(md->disk);
 	format_dev_t(md->name, MKDEV(_major, minor));
 
@@ -2781,6 +2833,10 @@ static const struct block_device_operations dm_blk_dops = {
 	.owner = THIS_MODULE
 };
 
+static const struct dax_operations dm_dax_ops = {
+	.direct_access = dm_dax_direct_access,
+};
+
 /*
  * module hooks
  */
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index a7e6903866fd..bcba4d89089c 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -130,6 +130,7 @@ typedef int (*dm_busy_fn) (struct dm_target *ti);
  */
 typedef long (*dm_direct_access_fn) (struct dm_target *ti, sector_t sector,
 				     void **kaddr, pfn_t *pfn, long size);
+#define PAGE_SECTORS (PAGE_SIZE / 512)
 
 void dm_error(const char *message);
 
-- 
cgit 


From bc042fdfbb92b5b13421316b4548e2d6e98eed37 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Mon, 24 Apr 2017 15:43:05 -0700
Subject: libnvdimm, region: fix flush hint detection crash

In the case where a dimm does not have any associated flush hints the
ndrd->flush_wpq array may be uninitialized leading to crashes with the
following signature:

 BUG: unable to handle kernel NULL pointer dereference at 0000000000000010
 IP: region_visible+0x10f/0x160 [libnvdimm]

 Call Trace:
  internal_create_group+0xbe/0x2f0
  sysfs_create_groups+0x40/0x80
  device_add+0x2d8/0x650
  nd_async_device_register+0x12/0x40 [libnvdimm]
  async_run_entry_fn+0x39/0x170
  process_one_work+0x212/0x6c0
  ? process_one_work+0x197/0x6c0
  worker_thread+0x4e/0x4a0
  kthread+0x10c/0x140
  ? process_one_work+0x6c0/0x6c0
  ? kthread_create_on_node+0x60/0x60
  ret_from_fork+0x31/0x40

Cc: <stable@vger.kernel.org>
Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Fixes: f284a4f23752 ("libnvdimm: introduce nvdimm_flush() and nvdimm_has_flush()")
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/region_devs.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index 8de5a04644a1..24abceda986a 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -1000,17 +1000,20 @@ EXPORT_SYMBOL_GPL(nvdimm_flush);
  */
 int nvdimm_has_flush(struct nd_region *nd_region)
 {
-	struct nd_region_data *ndrd = dev_get_drvdata(&nd_region->dev);
 	int i;
 
 	/* no nvdimm == flushing capability unknown */
 	if (nd_region->ndr_mappings == 0)
 		return -ENXIO;
 
-	for (i = 0; i < nd_region->ndr_mappings; i++)
-		/* flush hints present, flushing required */
-		if (ndrd_get_flush_wpq(ndrd, i, 0))
+	for (i = 0; i < nd_region->ndr_mappings; i++) {
+		struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+		struct nvdimm *nvdimm = nd_mapping->nvdimm;
+
+		/* flush hints present / available */
+		if (nvdimm->num_flush)
 			return 1;
+	}
 
 	/*
 	 * The platform defines dimm devices without hints, assume
-- 
cgit 


From 817bf40265459578abc36c6bd53e27775b5c7ec4 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 12 Apr 2017 13:37:44 -0700
Subject: dm: teach dm-targets to use a dax_device + dax_operations

Arrange for dm to lookup the dax services available from member devices.
Update the dax-capable targets, linear and stripe, to route dax
operations to the underlying device. Changes the target-internal
->direct_access() method to more closely align with the dax_operations
->direct_access() calling convention.

Cc: Toshi Kani <toshi.kani@hpe.com>
Reviewed-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/md/dm-linear.c        | 27 +++++++++++++--------------
 drivers/md/dm-snap.c          |  6 +++---
 drivers/md/dm-stripe.c        | 29 ++++++++++++++---------------
 drivers/md/dm-target.c        |  6 +++---
 drivers/md/dm.c               | 16 ++++++----------
 include/linux/device-mapper.h |  7 ++++---
 6 files changed, 43 insertions(+), 48 deletions(-)

diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 4788b0b989a9..c5a52f4dae81 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -9,6 +9,7 @@
 #include <linux/init.h>
 #include <linux/blkdev.h>
 #include <linux/bio.h>
+#include <linux/dax.h>
 #include <linux/slab.h>
 #include <linux/device-mapper.h>
 
@@ -141,22 +142,20 @@ static int linear_iterate_devices(struct dm_target *ti,
 	return fn(ti, lc->dev, lc->start, ti->len, data);
 }
 
-static long linear_direct_access(struct dm_target *ti, sector_t sector,
-				 void **kaddr, pfn_t *pfn, long size)
+static long linear_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
+		long nr_pages, void **kaddr, pfn_t *pfn)
 {
+	long ret;
 	struct linear_c *lc = ti->private;
 	struct block_device *bdev = lc->dev->bdev;
-	struct blk_dax_ctl dax = {
-		.sector = linear_map_sector(ti, sector),
-		.size = size,
-	};
-	long ret;
-
-	ret = bdev_direct_access(bdev, &dax);
-	*kaddr = dax.addr;
-	*pfn = dax.pfn;
-
-	return ret;
+	struct dax_device *dax_dev = lc->dev->dax_dev;
+	sector_t dev_sector, sector = pgoff * PAGE_SECTORS;
+
+	dev_sector = linear_map_sector(ti, sector);
+	ret = bdev_dax_pgoff(bdev, dev_sector, nr_pages * PAGE_SIZE, &pgoff);
+	if (ret)
+		return ret;
+	return dax_direct_access(dax_dev, pgoff, nr_pages, kaddr, pfn);
 }
 
 static struct target_type linear_target = {
@@ -169,7 +168,7 @@ static struct target_type linear_target = {
 	.status = linear_status,
 	.prepare_ioctl = linear_prepare_ioctl,
 	.iterate_devices = linear_iterate_devices,
-	.direct_access = linear_direct_access,
+	.direct_access = linear_dax_direct_access,
 };
 
 int __init dm_linear_init(void)
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index c65feeada864..e152d9817c81 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -2302,8 +2302,8 @@ static int origin_map(struct dm_target *ti, struct bio *bio)
 	return do_origin(o->dev, bio);
 }
 
-static long origin_direct_access(struct dm_target *ti, sector_t sector,
-		void **kaddr, pfn_t *pfn, long size)
+static long origin_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
+		long nr_pages, void **kaddr, pfn_t *pfn)
 {
 	DMWARN("device does not support dax.");
 	return -EIO;
@@ -2368,7 +2368,7 @@ static struct target_type origin_target = {
 	.postsuspend = origin_postsuspend,
 	.status  = origin_status,
 	.iterate_devices = origin_iterate_devices,
-	.direct_access = origin_direct_access,
+	.direct_access = origin_dax_direct_access,
 };
 
 static struct target_type snapshot_target = {
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 28193a57bf47..cb4b1e9e16ab 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -11,6 +11,7 @@
 #include <linux/init.h>
 #include <linux/blkdev.h>
 #include <linux/bio.h>
+#include <linux/dax.h>
 #include <linux/slab.h>
 #include <linux/log2.h>
 
@@ -308,27 +309,25 @@ static int stripe_map(struct dm_target *ti, struct bio *bio)
 	return DM_MAPIO_REMAPPED;
 }
 
-static long stripe_direct_access(struct dm_target *ti, sector_t sector,
-				 void **kaddr, pfn_t *pfn, long size)
+static long stripe_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
+		long nr_pages, void **kaddr, pfn_t *pfn)
 {
+	sector_t dev_sector, sector = pgoff * PAGE_SECTORS;
 	struct stripe_c *sc = ti->private;
-	uint32_t stripe;
+	struct dax_device *dax_dev;
 	struct block_device *bdev;
-	struct blk_dax_ctl dax = {
-		.size = size,
-	};
+	uint32_t stripe;
 	long ret;
 
-	stripe_map_sector(sc, sector, &stripe, &dax.sector);
-
-	dax.sector += sc->stripe[stripe].physical_start;
+	stripe_map_sector(sc, sector, &stripe, &dev_sector);
+	dev_sector += sc->stripe[stripe].physical_start;
+	dax_dev = sc->stripe[stripe].dev->dax_dev;
 	bdev = sc->stripe[stripe].dev->bdev;
 
-	ret = bdev_direct_access(bdev, &dax);
-	*kaddr = dax.addr;
-	*pfn = dax.pfn;
-
-	return ret;
+	ret = bdev_dax_pgoff(bdev, dev_sector, nr_pages * PAGE_SIZE, &pgoff);
+	if (ret)
+		return ret;
+	return dax_direct_access(dax_dev, pgoff, nr_pages, kaddr, pfn);
 }
 
 /*
@@ -448,7 +447,7 @@ static struct target_type stripe_target = {
 	.status = stripe_status,
 	.iterate_devices = stripe_iterate_devices,
 	.io_hints = stripe_io_hints,
-	.direct_access = stripe_direct_access,
+	.direct_access = stripe_dax_direct_access,
 };
 
 int __init dm_stripe_init(void)
diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c
index 43d3445b121d..6a7968f93f3c 100644
--- a/drivers/md/dm-target.c
+++ b/drivers/md/dm-target.c
@@ -142,8 +142,8 @@ static void io_err_release_clone_rq(struct request *clone)
 {
 }
 
-static long io_err_direct_access(struct dm_target *ti, sector_t sector,
-				 void **kaddr, pfn_t *pfn, long size)
+static long io_err_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
+		long nr_pages, void **kaddr, pfn_t *pfn)
 {
 	return -EIO;
 }
@@ -157,7 +157,7 @@ static struct target_type error_target = {
 	.map  = io_err_map,
 	.clone_and_map_rq = io_err_clone_and_map_rq,
 	.release_clone_rq = io_err_release_clone_rq,
-	.direct_access = io_err_direct_access,
+	.direct_access = io_err_dax_direct_access,
 };
 
 int __init dm_target_init(void)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index bd56dfe43a99..ef4c6f8cad47 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -630,6 +630,7 @@ static int open_table_device(struct table_device *td, dev_t dev,
 	}
 
 	td->dm_dev.bdev = bdev;
+	td->dm_dev.dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
 	return 0;
 }
 
@@ -643,7 +644,9 @@ static void close_table_device(struct table_device *td, struct mapped_device *md
 
 	bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
 	blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
+	put_dax(td->dm_dev.dax_dev);
 	td->dm_dev.bdev = NULL;
+	td->dm_dev.dax_dev = NULL;
 }
 
 static struct table_device *find_table_device(struct list_head *l, dev_t dev,
@@ -945,16 +948,9 @@ static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
 	if (len < 1)
 		goto out;
 	nr_pages = min(len, nr_pages);
-	if (ti->type->direct_access) {
-		ret = ti->type->direct_access(ti, sector, kaddr, pfn,
-				nr_pages * PAGE_SIZE);
-		/*
-		 * FIXME: convert ti->type->direct_access to return
-		 * nr_pages directly.
-		 */
-		if (ret >= 0)
-			ret /= PAGE_SIZE;
-	}
+	if (ti->type->direct_access)
+		ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn);
+
  out:
 	dm_put_live_table(md, srcu_idx);
 
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index bcba4d89089c..df830d167892 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -128,14 +128,15 @@ typedef int (*dm_busy_fn) (struct dm_target *ti);
  *  < 0 : error
  * >= 0 : the number of bytes accessible at the address
  */
-typedef long (*dm_direct_access_fn) (struct dm_target *ti, sector_t sector,
-				     void **kaddr, pfn_t *pfn, long size);
+typedef long (*dm_dax_direct_access_fn) (struct dm_target *ti, pgoff_t pgoff,
+		long nr_pages, void **kaddr, pfn_t *pfn);
 #define PAGE_SECTORS (PAGE_SIZE / 512)
 
 void dm_error(const char *message);
 
 struct dm_dev {
 	struct block_device *bdev;
+	struct dax_device *dax_dev;
 	fmode_t mode;
 	char name[16];
 };
@@ -177,7 +178,7 @@ struct target_type {
 	dm_busy_fn busy;
 	dm_iterate_devices_fn iterate_devices;
 	dm_io_hints_fn io_hints;
-	dm_direct_access_fn direct_access;
+	dm_dax_direct_access_fn direct_access;
 
 	/* For internal device-mapper use. */
 	struct list_head list;
-- 
cgit 


From fa5d932c323e8e0d9b24b3517997d15b36d1607d Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 27 Jan 2017 12:04:59 -0800
Subject: ext2, ext4, xfs: retrieve dax_device for iomap operations

In preparation for converting fs/dax.c to use dax_direct_access()
instead of bdev_direct_access(), add the plumbing to retrieve the
dax_device associated with a given block_device.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/ext2/inode.c       |  9 ++++++++-
 fs/ext4/inode.c       |  9 ++++++++-
 fs/xfs/xfs_iomap.c    | 10 ++++++++++
 include/linux/iomap.h |  1 +
 4 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 128cce540645..4c9d2d44e879 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -799,6 +799,7 @@ int ext2_get_block(struct inode *inode, sector_t iblock,
 static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
 		unsigned flags, struct iomap *iomap)
 {
+	struct block_device *bdev;
 	unsigned int blkbits = inode->i_blkbits;
 	unsigned long first_block = offset >> blkbits;
 	unsigned long max_blocks = (length + (1 << blkbits) - 1) >> blkbits;
@@ -812,8 +813,13 @@ static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
 		return ret;
 
 	iomap->flags = 0;
-	iomap->bdev = inode->i_sb->s_bdev;
+	bdev = inode->i_sb->s_bdev;
+	iomap->bdev = bdev;
 	iomap->offset = (u64)first_block << blkbits;
+	if (blk_queue_dax(bdev->bd_queue))
+		iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
+	else
+		iomap->dax_dev = NULL;
 
 	if (ret == 0) {
 		iomap->type = IOMAP_HOLE;
@@ -835,6 +841,7 @@ static int
 ext2_iomap_end(struct inode *inode, loff_t offset, loff_t length,
 		ssize_t written, unsigned flags, struct iomap *iomap)
 {
+	put_dax(iomap->dax_dev);
 	if (iomap->type == IOMAP_MAPPED &&
 	    written < length &&
 	    (flags & IOMAP_WRITE))
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 4247d8d25687..2cb2634daa99 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3305,6 +3305,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
 static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
 			    unsigned flags, struct iomap *iomap)
 {
+	struct block_device *bdev;
 	unsigned int blkbits = inode->i_blkbits;
 	unsigned long first_block = offset >> blkbits;
 	unsigned long last_block = (offset + length - 1) >> blkbits;
@@ -3373,7 +3374,12 @@ retry:
 	}
 
 	iomap->flags = 0;
-	iomap->bdev = inode->i_sb->s_bdev;
+	bdev = inode->i_sb->s_bdev;
+	iomap->bdev = bdev;
+	if (blk_queue_dax(bdev->bd_queue))
+		iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
+	else
+		iomap->dax_dev = NULL;
 	iomap->offset = first_block << blkbits;
 
 	if (ret == 0) {
@@ -3406,6 +3412,7 @@ static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
 	int blkbits = inode->i_blkbits;
 	bool truncate = false;
 
+	put_dax(iomap->dax_dev);
 	if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT))
 		return 0;
 
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 288ee5b840d7..4b47403f8089 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -976,6 +976,7 @@ xfs_file_iomap_begin(
 	int			nimaps = 1, error = 0;
 	bool			shared = false, trimmed = false;
 	unsigned		lockmode;
+	struct block_device	*bdev;
 
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -EIO;
@@ -1063,6 +1064,14 @@ xfs_file_iomap_begin(
 	}
 
 	xfs_bmbt_to_iomap(ip, iomap, &imap);
+
+	/* optionally associate a dax device with the iomap bdev */
+	bdev = iomap->bdev;
+	if (blk_queue_dax(bdev->bd_queue))
+		iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
+	else
+		iomap->dax_dev = NULL;
+
 	if (shared)
 		iomap->flags |= IOMAP_F_SHARED;
 	return 0;
@@ -1140,6 +1149,7 @@ xfs_file_iomap_end(
 	unsigned		flags,
 	struct iomap		*iomap)
 {
+	put_dax(iomap->dax_dev);
 	if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC)
 		return xfs_file_iomap_end_delalloc(XFS_I(inode), offset,
 				length, written, iomap);
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 7291810067eb..f753e788da31 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -41,6 +41,7 @@ struct iomap {
 	u16			type;	/* type of mapping */
 	u16			flags;	/* flags for mapping */
 	struct block_device	*bdev;	/* block device for I/O */
+	struct dax_device	*dax_dev; /* dax_dev for dax operations */
 };
 
 /*
-- 
cgit 


From a41fe02b6bba853a29c864d00fd161bbe6cfc715 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 27 Jan 2017 14:13:15 -0800
Subject: Revert "block: use DAX for partition table reads"

commit d1a5f2b4d8a1 ("block: use DAX for partition table reads") was
part of a stalled effort to allow dax mappings of block devices. Since
then the device-dax mechanism has filled the role of dax-mapping static
device ranges.

Now that we are moving ->direct_access() from a block_device operation
to a dax_inode operation we would need block devices to map and carry
their own dax_inode reference.

Unless / until we decide to revive dax mapping of raw block devices
through the dax_inode scheme, there is no need to carry
read_dax_sector(). Its removal in turn allows for the removal of
bdev_direct_access() and should have been included in commit
223757016837 ("block_dev: remove DAX leftovers").

Cc: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 block/partition-generic.c | 17 ++---------------
 fs/dax.c                  | 20 --------------------
 include/linux/dax.h       |  6 ------
 3 files changed, 2 insertions(+), 41 deletions(-)

diff --git a/block/partition-generic.c b/block/partition-generic.c
index 7afb9907821f..5dfac337b0f2 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -16,7 +16,6 @@
 #include <linux/kmod.h>
 #include <linux/ctype.h>
 #include <linux/genhd.h>
-#include <linux/dax.h>
 #include <linux/blktrace_api.h>
 
 #include "partitions/check.h"
@@ -631,24 +630,12 @@ int invalidate_partitions(struct gendisk *disk, struct block_device *bdev)
 	return 0;
 }
 
-static struct page *read_pagecache_sector(struct block_device *bdev, sector_t n)
-{
-	struct address_space *mapping = bdev->bd_inode->i_mapping;
-
-	return read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_SHIFT-9)),
-				 NULL);
-}
-
 unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p)
 {
+	struct address_space *mapping = bdev->bd_inode->i_mapping;
 	struct page *page;
 
-	/* don't populate page cache for dax capable devices */
-	if (IS_DAX(bdev->bd_inode))
-		page = read_dax_sector(bdev, n);
-	else
-		page = read_pagecache_sector(bdev, n);
-
+	page = read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_SHIFT-9)), NULL);
 	if (!IS_ERR(page)) {
 		if (PageError(page))
 			goto fail;
diff --git a/fs/dax.c b/fs/dax.c
index de622d4282a6..b78a6947c4f5 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -101,26 +101,6 @@ static int dax_is_empty_entry(void *entry)
 	return (unsigned long)entry & RADIX_DAX_EMPTY;
 }
 
-struct page *read_dax_sector(struct block_device *bdev, sector_t n)
-{
-	struct page *page = alloc_pages(GFP_KERNEL, 0);
-	struct blk_dax_ctl dax = {
-		.size = PAGE_SIZE,
-		.sector = n & ~((((int) PAGE_SIZE) / 512) - 1),
-	};
-	long rc;
-
-	if (!page)
-		return ERR_PTR(-ENOMEM);
-
-	rc = dax_map_atomic(bdev, &dax);
-	if (rc < 0)
-		return ERR_PTR(rc);
-	memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE);
-	dax_unmap_atomic(bdev, &dax);
-	return page;
-}
-
 /*
  * DAX radix tree locking
  */
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 7e62e280c11f..0d0d890f9186 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -70,15 +70,9 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping,
 		pgoff_t index, void *entry, bool wake_all);
 
 #ifdef CONFIG_FS_DAX
-struct page *read_dax_sector(struct block_device *bdev, sector_t n);
 int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
 		unsigned int offset, unsigned int length);
 #else
-static inline struct page *read_dax_sector(struct block_device *bdev,
-		sector_t n)
-{
-	return ERR_PTR(-ENXIO);
-}
 static inline int __dax_zero_page_range(struct block_device *bdev,
 		sector_t sector, unsigned int offset, unsigned int length)
 {
-- 
cgit 


From cccbce67158290537cc671cbd4c1564876485a65 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 27 Jan 2017 13:31:42 -0800
Subject: filesystem-dax: convert to dax_direct_access()

Now that a dax_device is plumbed through all dax-capable drivers we can
switch from block_device_operations to dax_operations for invoking
->direct_access.

This also lets us kill off some usages of struct blk_dax_ctl on the way
to its eventual removal.

Suggested-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/dax.c            | 277 +++++++++++++++++++++++++++++-----------------------
 fs/iomap.c          |   3 +-
 include/linux/dax.h |   6 +-
 3 files changed, 162 insertions(+), 124 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index b78a6947c4f5..ce9dc9c3e829 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -55,32 +55,6 @@ static int __init init_dax_wait_table(void)
 }
 fs_initcall(init_dax_wait_table);
 
-static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
-{
-	struct request_queue *q = bdev->bd_queue;
-	long rc = -EIO;
-
-	dax->addr = ERR_PTR(-EIO);
-	if (blk_queue_enter(q, true) != 0)
-		return rc;
-
-	rc = bdev_direct_access(bdev, dax);
-	if (rc < 0) {
-		dax->addr = ERR_PTR(rc);
-		blk_queue_exit(q);
-		return rc;
-	}
-	return rc;
-}
-
-static void dax_unmap_atomic(struct block_device *bdev,
-		const struct blk_dax_ctl *dax)
-{
-	if (IS_ERR(dax->addr))
-		return;
-	blk_queue_exit(bdev->bd_queue);
-}
-
 static int dax_is_pmd_entry(void *entry)
 {
 	return (unsigned long)entry & RADIX_DAX_PMD;
@@ -553,21 +527,30 @@ static int dax_load_hole(struct address_space *mapping, void **entry,
 	return ret;
 }
 
-static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size,
-		struct page *to, unsigned long vaddr)
+static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
+		sector_t sector, size_t size, struct page *to,
+		unsigned long vaddr)
 {
-	struct blk_dax_ctl dax = {
-		.sector = sector,
-		.size = size,
-	};
-	void *vto;
-
-	if (dax_map_atomic(bdev, &dax) < 0)
-		return PTR_ERR(dax.addr);
+	void *vto, *kaddr;
+	pgoff_t pgoff;
+	pfn_t pfn;
+	long rc;
+	int id;
+
+	rc = bdev_dax_pgoff(bdev, sector, size, &pgoff);
+	if (rc)
+		return rc;
+
+	id = dax_read_lock();
+	rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
+	if (rc < 0) {
+		dax_read_unlock(id);
+		return rc;
+	}
 	vto = kmap_atomic(to);
-	copy_user_page(vto, (void __force *)dax.addr, vaddr, to);
+	copy_user_page(vto, (void __force *)kaddr, vaddr, to);
 	kunmap_atomic(vto);
-	dax_unmap_atomic(bdev, &dax);
+	dax_read_unlock(id);
 	return 0;
 }
 
@@ -735,12 +718,16 @@ unlock_pte:
 }
 
 static int dax_writeback_one(struct block_device *bdev,
-		struct address_space *mapping, pgoff_t index, void *entry)
+		struct dax_device *dax_dev, struct address_space *mapping,
+		pgoff_t index, void *entry)
 {
 	struct radix_tree_root *page_tree = &mapping->page_tree;
-	struct blk_dax_ctl dax;
-	void *entry2, **slot;
-	int ret = 0;
+	void *entry2, **slot, *kaddr;
+	long ret = 0, id;
+	sector_t sector;
+	pgoff_t pgoff;
+	size_t size;
+	pfn_t pfn;
 
 	/*
 	 * A page got tagged dirty in DAX mapping? Something is seriously
@@ -789,26 +776,29 @@ static int dax_writeback_one(struct block_device *bdev,
 	 * 'entry'.  This allows us to flush for PMD_SIZE and not have to
 	 * worry about partial PMD writebacks.
 	 */
-	dax.sector = dax_radix_sector(entry);
-	dax.size = PAGE_SIZE << dax_radix_order(entry);
+	sector = dax_radix_sector(entry);
+	size = PAGE_SIZE << dax_radix_order(entry);
+
+	id = dax_read_lock();
+	ret = bdev_dax_pgoff(bdev, sector, size, &pgoff);
+	if (ret)
+		goto dax_unlock;
 
 	/*
-	 * We cannot hold tree_lock while calling dax_map_atomic() because it
-	 * eventually calls cond_resched().
+	 * dax_direct_access() may sleep, so cannot hold tree_lock over
+	 * its invocation.
 	 */
-	ret = dax_map_atomic(bdev, &dax);
-	if (ret < 0) {
-		put_locked_mapping_entry(mapping, index, entry);
-		return ret;
-	}
+	ret = dax_direct_access(dax_dev, pgoff, size / PAGE_SIZE, &kaddr, &pfn);
+	if (ret < 0)
+		goto dax_unlock;
 
-	if (WARN_ON_ONCE(ret < dax.size)) {
+	if (WARN_ON_ONCE(ret < size / PAGE_SIZE)) {
 		ret = -EIO;
-		goto unmap;
+		goto dax_unlock;
 	}
 
-	dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(dax.pfn));
-	wb_cache_pmem(dax.addr, dax.size);
+	dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(pfn));
+	wb_cache_pmem(kaddr, size);
 	/*
 	 * After we have flushed the cache, we can clear the dirty tag. There
 	 * cannot be new dirty data in the pfn after the flush has completed as
@@ -818,8 +808,8 @@ static int dax_writeback_one(struct block_device *bdev,
 	spin_lock_irq(&mapping->tree_lock);
 	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY);
 	spin_unlock_irq(&mapping->tree_lock);
- unmap:
-	dax_unmap_atomic(bdev, &dax);
+ dax_unlock:
+	dax_read_unlock(id);
 	put_locked_mapping_entry(mapping, index, entry);
 	return ret;
 
@@ -840,6 +830,7 @@ int dax_writeback_mapping_range(struct address_space *mapping,
 	struct inode *inode = mapping->host;
 	pgoff_t start_index, end_index;
 	pgoff_t indices[PAGEVEC_SIZE];
+	struct dax_device *dax_dev;
 	struct pagevec pvec;
 	bool done = false;
 	int i, ret = 0;
@@ -850,6 +841,10 @@ int dax_writeback_mapping_range(struct address_space *mapping,
 	if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
 		return 0;
 
+	dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
+	if (!dax_dev)
+		return -EIO;
+
 	start_index = wbc->range_start >> PAGE_SHIFT;
 	end_index = wbc->range_end >> PAGE_SHIFT;
 
@@ -870,38 +865,49 @@ int dax_writeback_mapping_range(struct address_space *mapping,
 				break;
 			}
 
-			ret = dax_writeback_one(bdev, mapping, indices[i],
-					pvec.pages[i]);
-			if (ret < 0)
+			ret = dax_writeback_one(bdev, dax_dev, mapping,
+					indices[i], pvec.pages[i]);
+			if (ret < 0) {
+				put_dax(dax_dev);
 				return ret;
+			}
 		}
 	}
+	put_dax(dax_dev);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
 
 static int dax_insert_mapping(struct address_space *mapping,
-		struct block_device *bdev, sector_t sector, size_t size,
-		void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf)
+		struct block_device *bdev, struct dax_device *dax_dev,
+		sector_t sector, size_t size, void **entryp,
+		struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	unsigned long vaddr = vmf->address;
-	struct blk_dax_ctl dax = {
-		.sector = sector,
-		.size = size,
-	};
-	void *ret;
 	void *entry = *entryp;
+	void *ret, *kaddr;
+	pgoff_t pgoff;
+	int id, rc;
+	pfn_t pfn;
 
-	if (dax_map_atomic(bdev, &dax) < 0)
-		return PTR_ERR(dax.addr);
-	dax_unmap_atomic(bdev, &dax);
+	rc = bdev_dax_pgoff(bdev, sector, size, &pgoff);
+	if (rc)
+		return rc;
 
-	ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector, 0);
+	id = dax_read_lock();
+	rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
+	if (rc < 0) {
+		dax_read_unlock(id);
+		return rc;
+	}
+	dax_read_unlock(id);
+
+	ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0);
 	if (IS_ERR(ret))
 		return PTR_ERR(ret);
 	*entryp = ret;
 
-	return vm_insert_mixed(vma, vaddr, dax.pfn);
+	return vm_insert_mixed(vma, vaddr, pfn);
 }
 
 /**
@@ -950,24 +956,34 @@ static bool dax_range_is_aligned(struct block_device *bdev,
 	return true;
 }
 
-int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
-		unsigned int offset, unsigned int length)
+int __dax_zero_page_range(struct block_device *bdev,
+		struct dax_device *dax_dev, sector_t sector,
+		unsigned int offset, unsigned int size)
 {
-	struct blk_dax_ctl dax = {
-		.sector		= sector,
-		.size		= PAGE_SIZE,
-	};
-
-	if (dax_range_is_aligned(bdev, offset, length)) {
-		sector_t start_sector = dax.sector + (offset >> 9);
+	if (dax_range_is_aligned(bdev, offset, size)) {
+		sector_t start_sector = sector + (offset >> 9);
 
 		return blkdev_issue_zeroout(bdev, start_sector,
-				length >> 9, GFP_NOFS, true);
+				size >> 9, GFP_NOFS, true);
 	} else {
-		if (dax_map_atomic(bdev, &dax) < 0)
-			return PTR_ERR(dax.addr);
-		clear_pmem(dax.addr + offset, length);
-		dax_unmap_atomic(bdev, &dax);
+		pgoff_t pgoff;
+		long rc, id;
+		void *kaddr;
+		pfn_t pfn;
+
+		rc = bdev_dax_pgoff(bdev, sector, size, &pgoff);
+		if (rc)
+			return rc;
+
+		id = dax_read_lock();
+		rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr,
+				&pfn);
+		if (rc < 0) {
+			dax_read_unlock(id);
+			return rc;
+		}
+		clear_pmem(kaddr + offset, size);
+		dax_read_unlock(id);
 	}
 	return 0;
 }
@@ -982,9 +998,12 @@ static loff_t
 dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 		struct iomap *iomap)
 {
+	struct block_device *bdev = iomap->bdev;
+	struct dax_device *dax_dev = iomap->dax_dev;
 	struct iov_iter *iter = data;
 	loff_t end = pos + length, done = 0;
 	ssize_t ret = 0;
+	int id;
 
 	if (iov_iter_rw(iter) == READ) {
 		end = min(end, i_size_read(inode));
@@ -1009,34 +1028,42 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 					      (end - 1) >> PAGE_SHIFT);
 	}
 
+	id = dax_read_lock();
 	while (pos < end) {
 		unsigned offset = pos & (PAGE_SIZE - 1);
-		struct blk_dax_ctl dax = { 0 };
+		const size_t size = ALIGN(length + offset, PAGE_SIZE);
+		const sector_t sector = dax_iomap_sector(iomap, pos);
 		ssize_t map_len;
+		pgoff_t pgoff;
+		void *kaddr;
+		pfn_t pfn;
 
 		if (fatal_signal_pending(current)) {
 			ret = -EINTR;
 			break;
 		}
 
-		dax.sector = dax_iomap_sector(iomap, pos);
-		dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK;
-		map_len = dax_map_atomic(iomap->bdev, &dax);
+		ret = bdev_dax_pgoff(bdev, sector, size, &pgoff);
+		if (ret)
+			break;
+
+		map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
+				&kaddr, &pfn);
 		if (map_len < 0) {
 			ret = map_len;
 			break;
 		}
 
-		dax.addr += offset;
+		map_len = PFN_PHYS(map_len);
+		kaddr += offset;
 		map_len -= offset;
 		if (map_len > end - pos)
 			map_len = end - pos;
 
 		if (iov_iter_rw(iter) == WRITE)
-			map_len = copy_from_iter_pmem(dax.addr, map_len, iter);
+			map_len = copy_from_iter_pmem(kaddr, map_len, iter);
 		else
-			map_len = copy_to_iter(dax.addr, map_len, iter);
-		dax_unmap_atomic(iomap->bdev, &dax);
+			map_len = copy_to_iter(kaddr, map_len, iter);
 		if (map_len <= 0) {
 			ret = map_len ? map_len : -EFAULT;
 			break;
@@ -1046,6 +1073,7 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 		length -= map_len;
 		done += map_len;
 	}
+	dax_read_unlock(id);
 
 	return done ? done : ret;
 }
@@ -1152,8 +1180,8 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
 			clear_user_highpage(vmf->cow_page, vaddr);
 			break;
 		case IOMAP_MAPPED:
-			error = copy_user_dax(iomap.bdev, sector, PAGE_SIZE,
-					vmf->cow_page, vaddr);
+			error = copy_user_dax(iomap.bdev, iomap.dax_dev,
+					sector, PAGE_SIZE, vmf->cow_page, vaddr);
 			break;
 		default:
 			WARN_ON_ONCE(1);
@@ -1178,8 +1206,8 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
 			mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT);
 			major = VM_FAULT_MAJOR;
 		}
-		error = dax_insert_mapping(mapping, iomap.bdev, sector,
-				PAGE_SIZE, &entry, vmf->vma, vmf);
+		error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev,
+				sector, PAGE_SIZE, &entry, vmf->vma, vmf);
 		/* -EBUSY is fine, somebody else faulted on the same PTE */
 		if (error == -EBUSY)
 			error = 0;
@@ -1229,41 +1257,48 @@ static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
 		loff_t pos, void **entryp)
 {
 	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+	const sector_t sector = dax_iomap_sector(iomap, pos);
+	struct dax_device *dax_dev = iomap->dax_dev;
 	struct block_device *bdev = iomap->bdev;
 	struct inode *inode = mapping->host;
-	struct blk_dax_ctl dax = {
-		.sector = dax_iomap_sector(iomap, pos),
-		.size = PMD_SIZE,
-	};
-	long length = dax_map_atomic(bdev, &dax);
-	void *ret = NULL;
-
-	if (length < 0) /* dax_map_atomic() failed */
+	const size_t size = PMD_SIZE;
+	void *ret = NULL, *kaddr;
+	long length = 0;
+	pgoff_t pgoff;
+	pfn_t pfn;
+	int id;
+
+	if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0)
 		goto fallback;
-	if (length < PMD_SIZE)
-		goto unmap_fallback;
-	if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR)
-		goto unmap_fallback;
-	if (!pfn_t_devmap(dax.pfn))
-		goto unmap_fallback;
-
-	dax_unmap_atomic(bdev, &dax);
 
-	ret = dax_insert_mapping_entry(mapping, vmf, *entryp, dax.sector,
+	id = dax_read_lock();
+	length = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
+	if (length < 0)
+		goto unlock_fallback;
+	length = PFN_PHYS(length);
+
+	if (length < size)
+		goto unlock_fallback;
+	if (pfn_t_to_pfn(pfn) & PG_PMD_COLOUR)
+		goto unlock_fallback;
+	if (!pfn_t_devmap(pfn))
+		goto unlock_fallback;
+	dax_read_unlock(id);
+
+	ret = dax_insert_mapping_entry(mapping, vmf, *entryp, sector,
 			RADIX_DAX_PMD);
 	if (IS_ERR(ret))
 		goto fallback;
 	*entryp = ret;
 
-	trace_dax_pmd_insert_mapping(inode, vmf, length, dax.pfn, ret);
+	trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret);
 	return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
-			dax.pfn, vmf->flags & FAULT_FLAG_WRITE);
+			pfn, vmf->flags & FAULT_FLAG_WRITE);
 
- unmap_fallback:
-	dax_unmap_atomic(bdev, &dax);
+unlock_fallback:
+	dax_read_unlock(id);
 fallback:
-	trace_dax_pmd_insert_mapping_fallback(inode, vmf, length,
-			dax.pfn, ret);
+	trace_dax_pmd_insert_mapping_fallback(inode, vmf, length, pfn, ret);
 	return VM_FAULT_FALLBACK;
 }
 
diff --git a/fs/iomap.c b/fs/iomap.c
index 141c3cd55a8b..4ec0d6ac8bc1 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -360,7 +360,8 @@ static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
 	sector_t sector = iomap->blkno +
 		(((pos & ~(PAGE_SIZE - 1)) - iomap->offset) >> 9);
 
-	return __dax_zero_page_range(iomap->bdev, sector, offset, bytes);
+	return __dax_zero_page_range(iomap->bdev, iomap->dax_dev, sector,
+			offset, bytes);
 }
 
 static loff_t
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 0d0d890f9186..d3158e74a59e 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -70,11 +70,13 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping,
 		pgoff_t index, void *entry, bool wake_all);
 
 #ifdef CONFIG_FS_DAX
-int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
+int __dax_zero_page_range(struct block_device *bdev,
+		struct dax_device *dax_dev, sector_t sector,
 		unsigned int offset, unsigned int length);
 #else
 static inline int __dax_zero_page_range(struct block_device *bdev,
-		sector_t sector, unsigned int offset, unsigned int length)
+		struct dax_device *dax_dev, sector_t sector,
+		unsigned int offset, unsigned int length)
 {
 	return -ENXIO;
 }
-- 
cgit 


From 2093f2e9dfec98e561c83e807910267bcbd8bb7b Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sat, 1 Apr 2017 16:02:38 -0700
Subject: block, dax: convert bdev_dax_supported() to dax_direct_access()

Kill of the final user of bdev_direct_access() and struct blk_dax_ctl.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/block_dev.c | 48 ++++++++++++++++++++++++++++--------------------
 1 file changed, 28 insertions(+), 20 deletions(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 2f7885712575..ecbdc8f9f718 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -788,35 +788,43 @@ EXPORT_SYMBOL(bdev_dax_pgoff);
  */
 int bdev_dax_supported(struct super_block *sb, int blocksize)
 {
-	struct blk_dax_ctl dax = {
-		.sector = 0,
-		.size = PAGE_SIZE,
-	};
-	int err;
+	struct block_device *bdev = sb->s_bdev;
+	struct dax_device *dax_dev;
+	pgoff_t pgoff;
+	int err, id;
+	void *kaddr;
+	pfn_t pfn;
+	long len;
 
 	if (blocksize != PAGE_SIZE) {
 		vfs_msg(sb, KERN_ERR, "error: unsupported blocksize for dax");
 		return -EINVAL;
 	}
 
-	err = bdev_direct_access(sb->s_bdev, &dax);
-	if (err < 0) {
-		switch (err) {
-		case -EOPNOTSUPP:
-			vfs_msg(sb, KERN_ERR,
-				"error: device does not support dax");
-			break;
-		case -EINVAL:
-			vfs_msg(sb, KERN_ERR,
-				"error: unaligned partition for dax");
-			break;
-		default:
-			vfs_msg(sb, KERN_ERR,
-				"error: dax access failed (%d)", err);
-		}
+	err = bdev_dax_pgoff(bdev, 0, PAGE_SIZE, &pgoff);
+	if (err) {
+		vfs_msg(sb, KERN_ERR, "error: unaligned partition for dax");
 		return err;
 	}
 
+	dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
+	if (!dax_dev) {
+		vfs_msg(sb, KERN_ERR, "error: device does not support dax");
+		return -EOPNOTSUPP;
+	}
+
+	id = dax_read_lock();
+	len = dax_direct_access(dax_dev, pgoff, 1, &kaddr, &pfn);
+	dax_read_unlock(id);
+
+	put_dax(dax_dev);
+
+	if (len < 1) {
+		vfs_msg(sb, KERN_ERR,
+				"error: dax access failed (%d)", len);
+		return len < 0 ? len : -EIO;
+	}
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(bdev_dax_supported);
-- 
cgit 


From d4b29fd78ea6fc2be219be3af1a992149b4ff0f6 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 27 Jan 2017 17:22:03 -0800
Subject: block: remove block_device_operations ->direct_access()

Now that all the producers and consumers of dax interfaces have been
converted to using dax_operations on a dax_device, remove the block
device direct_access enabling.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 arch/powerpc/sysdev/axonram.c | 23 ++++------------------
 drivers/block/brd.c           | 15 ---------------
 drivers/md/dm.c               | 13 -------------
 drivers/nvdimm/pmem.c         | 10 ----------
 drivers/s390/block/dcssblk.c  | 16 ---------------
 fs/block_dev.c                | 45 -------------------------------------------
 include/linux/blkdev.h        | 17 ----------------
 7 files changed, 4 insertions(+), 135 deletions(-)

diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c
index 171ba86a3494..a7fe5fee744f 100644
--- a/arch/powerpc/sysdev/axonram.c
+++ b/arch/powerpc/sysdev/axonram.c
@@ -139,6 +139,10 @@ axon_ram_make_request(struct request_queue *queue, struct bio *bio)
 	return BLK_QC_T_NONE;
 }
 
+static const struct block_device_operations axon_ram_devops = {
+	.owner		= THIS_MODULE,
+};
+
 static long
 __axon_ram_direct_access(struct axon_ram_bank *bank, pgoff_t pgoff, long nr_pages,
 		       void **kaddr, pfn_t *pfn)
@@ -150,25 +154,6 @@ __axon_ram_direct_access(struct axon_ram_bank *bank, pgoff_t pgoff, long nr_page
 	return (bank->size - offset) / PAGE_SIZE;
 }
 
-/**
- * axon_ram_direct_access - direct_access() method for block device
- * @device, @sector, @data: see block_device_operations method
- */
-static long
-axon_ram_blk_direct_access(struct block_device *device, sector_t sector,
-		       void **kaddr, pfn_t *pfn, long size)
-{
-	struct axon_ram_bank *bank = device->bd_disk->private_data;
-
-	return __axon_ram_direct_access(bank, (sector * 512) / PAGE_SIZE,
-			size / PAGE_SIZE, kaddr, pfn) * PAGE_SIZE;
-}
-
-static const struct block_device_operations axon_ram_devops = {
-	.owner		= THIS_MODULE,
-	.direct_access	= axon_ram_blk_direct_access
-};
-
 static long
 axon_ram_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages,
 		       void **kaddr, pfn_t *pfn)
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 60f3193c9ce2..bfa4ed2c75ef 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -395,18 +395,6 @@ static long __brd_direct_access(struct brd_device *brd, pgoff_t pgoff,
 	return 1;
 }
 
-static long brd_blk_direct_access(struct block_device *bdev, sector_t sector,
-			void **kaddr, pfn_t *pfn, long size)
-{
-	struct brd_device *brd = bdev->bd_disk->private_data;
-	long nr_pages = __brd_direct_access(brd, PHYS_PFN(sector * 512),
-			PHYS_PFN(size), kaddr, pfn);
-
-	if (nr_pages < 0)
-		return nr_pages;
-	return nr_pages * PAGE_SIZE;
-}
-
 static long brd_dax_direct_access(struct dax_device *dax_dev,
 		pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn)
 {
@@ -418,14 +406,11 @@ static long brd_dax_direct_access(struct dax_device *dax_dev,
 static const struct dax_operations brd_dax_ops = {
 	.direct_access = brd_dax_direct_access,
 };
-#else
-#define brd_blk_direct_access NULL
 #endif
 
 static const struct block_device_operations brd_fops = {
 	.owner =		THIS_MODULE,
 	.rw_page =		brd_rw_page,
-	.direct_access =	brd_blk_direct_access,
 };
 
 /*
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index ef4c6f8cad47..79d5f5fd823e 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -957,18 +957,6 @@ static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
 	return ret;
 }
 
-static long dm_blk_direct_access(struct block_device *bdev, sector_t sector,
-		void **kaddr, pfn_t *pfn, long size)
-{
-	struct mapped_device *md = bdev->bd_disk->private_data;
-	struct dax_device *dax_dev = md->dax_dev;
-	long nr_pages = size / PAGE_SIZE;
-
-	nr_pages = dm_dax_direct_access(dax_dev, sector / PAGE_SECTORS,
-			nr_pages, kaddr, pfn);
-	return nr_pages < 0 ? nr_pages : nr_pages * PAGE_SIZE;
-}
-
 /*
  * A target may call dm_accept_partial_bio only from the map routine.  It is
  * allowed for all bio types except REQ_PREFLUSH.
@@ -2823,7 +2811,6 @@ static const struct block_device_operations dm_blk_dops = {
 	.open = dm_blk_open,
 	.release = dm_blk_close,
 	.ioctl = dm_blk_ioctl,
-	.direct_access = dm_blk_direct_access,
 	.getgeo = dm_blk_getgeo,
 	.pr_ops = &dm_pr_ops,
 	.owner = THIS_MODULE
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index fbbcf8154eec..85b85633d674 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -220,19 +220,9 @@ __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
 	return PHYS_PFN(pmem->size - pmem->pfn_pad - offset);
 }
 
-static long pmem_blk_direct_access(struct block_device *bdev, sector_t sector,
-		void **kaddr, pfn_t *pfn, long size)
-{
-	struct pmem_device *pmem = bdev->bd_queue->queuedata;
-
-	return __pmem_direct_access(pmem, PHYS_PFN(sector * 512),
-			PHYS_PFN(size), kaddr, pfn);
-}
-
 static const struct block_device_operations pmem_fops = {
 	.owner =		THIS_MODULE,
 	.rw_page =		pmem_rw_page,
-	.direct_access =	pmem_blk_direct_access,
 	.revalidate_disk =	nvdimm_revalidate_disk,
 };
 
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index dc84cfd4e438..36e5280af3e4 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -31,8 +31,6 @@ static int dcssblk_open(struct block_device *bdev, fmode_t mode);
 static void dcssblk_release(struct gendisk *disk, fmode_t mode);
 static blk_qc_t dcssblk_make_request(struct request_queue *q,
 						struct bio *bio);
-static long dcssblk_blk_direct_access(struct block_device *bdev, sector_t secnum,
-			 void **kaddr, pfn_t *pfn, long size);
 static long dcssblk_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
 		long nr_pages, void **kaddr, pfn_t *pfn);
 
@@ -43,7 +41,6 @@ static const struct block_device_operations dcssblk_devops = {
 	.owner   	= THIS_MODULE,
 	.open    	= dcssblk_open,
 	.release 	= dcssblk_release,
-	.direct_access 	= dcssblk_blk_direct_access,
 };
 
 static const struct dax_operations dcssblk_dax_ops = {
@@ -915,19 +912,6 @@ __dcssblk_direct_access(struct dcssblk_dev_info *dev_info, pgoff_t pgoff,
 	return (dev_sz - offset) / PAGE_SIZE;
 }
 
-static long
-dcssblk_blk_direct_access(struct block_device *bdev, sector_t secnum,
-			void **kaddr, pfn_t *pfn, long size)
-{
-	struct dcssblk_dev_info *dev_info;
-
-	dev_info = bdev->bd_disk->private_data;
-	if (!dev_info)
-		return -ENODEV;
-	return __dcssblk_direct_access(dev_info, PHYS_PFN(secnum * 512),
-			PHYS_PFN(size), kaddr, pfn) * PAGE_SIZE;
-}
-
 static long
 dcssblk_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
 		long nr_pages, void **kaddr, pfn_t *pfn)
diff --git a/fs/block_dev.c b/fs/block_dev.c
index ecbdc8f9f718..10e21465d5a9 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -718,51 +718,6 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
 }
 EXPORT_SYMBOL_GPL(bdev_write_page);
 
-/**
- * bdev_direct_access() - Get the address for directly-accessibly memory
- * @bdev: The device containing the memory
- * @dax: control and output parameters for ->direct_access
- *
- * If a block device is made up of directly addressable memory, this function
- * will tell the caller the PFN and the address of the memory.  The address
- * may be directly dereferenced within the kernel without the need to call
- * ioremap(), kmap() or similar.  The PFN is suitable for inserting into
- * page tables.
- *
- * Return: negative errno if an error occurs, otherwise the number of bytes
- * accessible at this address.
- */
-long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax)
-{
-	sector_t sector = dax->sector;
-	long avail, size = dax->size;
-	const struct block_device_operations *ops = bdev->bd_disk->fops;
-
-	/*
-	 * The device driver is allowed to sleep, in order to make the
-	 * memory directly accessible.
-	 */
-	might_sleep();
-
-	if (size < 0)
-		return size;
-	if (!blk_queue_dax(bdev_get_queue(bdev)) || !ops->direct_access)
-		return -EOPNOTSUPP;
-	if ((sector + DIV_ROUND_UP(size, 512)) >
-					part_nr_sects_read(bdev->bd_part))
-		return -ERANGE;
-	sector += get_start_sect(bdev);
-	if (sector % (PAGE_SIZE / 512))
-		return -EINVAL;
-	avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn, size);
-	if (!avail)
-		return -ERANGE;
-	if (avail > 0 && avail & ~PAGE_MASK)
-		return -ENXIO;
-	return min(avail, size);
-}
-EXPORT_SYMBOL_GPL(bdev_direct_access);
-
 int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size,
 		pgoff_t *pgoff)
 {
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 612c497d1461..848f87eb1905 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1916,28 +1916,12 @@ static inline bool integrity_req_gap_front_merge(struct request *req,
 
 #endif /* CONFIG_BLK_DEV_INTEGRITY */
 
-/**
- * struct blk_dax_ctl - control and output parameters for ->direct_access
- * @sector: (input) offset relative to a block_device
- * @addr: (output) kernel virtual address for @sector populated by driver
- * @pfn: (output) page frame number for @addr populated by driver
- * @size: (input) number of bytes requested
- */
-struct blk_dax_ctl {
-	sector_t sector;
-	void *addr;
-	long size;
-	pfn_t pfn;
-};
-
 struct block_device_operations {
 	int (*open) (struct block_device *, fmode_t);
 	void (*release) (struct gendisk *, fmode_t);
 	int (*rw_page)(struct block_device *, sector_t, struct page *, bool);
 	int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
 	int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
-	long (*direct_access)(struct block_device *, sector_t, void **, pfn_t *,
-			long);
 	unsigned int (*check_events) (struct gendisk *disk,
 				      unsigned int clearing);
 	/* ->media_changed() is DEPRECATED, use ->check_events() instead */
@@ -1956,7 +1940,6 @@ extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int,
 extern int bdev_read_page(struct block_device *, sector_t, struct page *);
 extern int bdev_write_page(struct block_device *, sector_t, struct page *,
 						struct writeback_control *);
-extern long bdev_direct_access(struct block_device *, struct blk_dax_ctl *);
 extern int bdev_dax_supported(struct super_block *, int);
 int bdev_dax_pgoff(struct block_device *, sector_t, size_t, pgoff_t *pgoff);
 #else /* CONFIG_BLOCK */
-- 
cgit 


From 6abccd1bfee49e491095772fd5aa9e96d915ae52 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 13 Jan 2017 14:14:23 -0800
Subject: x86, dax, pmem: remove indirection around memcpy_from_pmem()

memcpy_from_pmem() maps directly to memcpy_mcsafe(). The wrapper
serves no real benefit aside from affording a more generic function name
than the x86-specific 'mcsafe'. However this would not be the first time
that x86 terminology leaked into the global namespace. For lack of
better name, just use memcpy_mcsafe() directly.

This conversion also catches a place where we should have been using
plain memcpy, acpi_nfit_blk_single_io().

Cc: <x86@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Matthew Wilcox <mawilcox@microsoft.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Acked-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 arch/x86/include/asm/pmem.h      |  5 -----
 arch/x86/include/asm/string_64.h |  1 +
 drivers/acpi/nfit/core.c         |  3 +--
 drivers/nvdimm/claim.c           |  2 +-
 drivers/nvdimm/pmem.c            |  2 +-
 include/linux/pmem.h             | 23 -----------------------
 include/linux/string.h           |  8 ++++++++
 7 files changed, 12 insertions(+), 32 deletions(-)

diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h
index 529bb4a6487a..d5a22bac9988 100644
--- a/arch/x86/include/asm/pmem.h
+++ b/arch/x86/include/asm/pmem.h
@@ -44,11 +44,6 @@ static inline void arch_memcpy_to_pmem(void *dst, const void *src, size_t n)
 		BUG();
 }
 
-static inline int arch_memcpy_from_pmem(void *dst, const void *src, size_t n)
-{
-	return memcpy_mcsafe(dst, src, n);
-}
-
 /**
  * arch_wb_cache_pmem - write back a cache range with CLWB
  * @vaddr:	virtual start address
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index a164862d77e3..733bae07fb29 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -79,6 +79,7 @@ int strcmp(const char *cs, const char *ct);
 #define memset(s, c, n) __memset(s, c, n)
 #endif
 
+#define __HAVE_ARCH_MEMCPY_MCSAFE 1
 __must_check int memcpy_mcsafe_unrolled(void *dst, const void *src, size_t cnt);
 DECLARE_STATIC_KEY_FALSE(mcsafe_key);
 
diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index c8ea9d698cd0..d0c07b2344e4 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -1783,8 +1783,7 @@ static int acpi_nfit_blk_single_io(struct nfit_blk *nfit_blk,
 				mmio_flush_range((void __force *)
 					mmio->addr.aperture + offset, c);
 
-			memcpy_from_pmem(iobuf + copied,
-					mmio->addr.aperture + offset, c);
+			memcpy(iobuf + copied, mmio->addr.aperture + offset, c);
 		}
 
 		copied += c;
diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c
index ca6d572c48fc..3a35e8028b9c 100644
--- a/drivers/nvdimm/claim.c
+++ b/drivers/nvdimm/claim.c
@@ -239,7 +239,7 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns,
 	if (rw == READ) {
 		if (unlikely(is_bad_pmem(&nsio->bb, sector, sz_align)))
 			return -EIO;
-		return memcpy_from_pmem(buf, nsio->addr + offset, size);
+		return memcpy_mcsafe(buf, nsio->addr + offset, size);
 	}
 
 	if (unlikely(is_bad_pmem(&nsio->bb, sector, sz_align))) {
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 85b85633d674..3b3dab73d741 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -89,7 +89,7 @@ static int read_pmem(struct page *page, unsigned int off,
 	int rc;
 	void *mem = kmap_atomic(page);
 
-	rc = memcpy_from_pmem(mem + off, pmem_addr, len);
+	rc = memcpy_mcsafe(mem + off, pmem_addr, len);
 	kunmap_atomic(mem);
 	if (rc)
 		return -EIO;
diff --git a/include/linux/pmem.h b/include/linux/pmem.h
index e856c2cb0fe8..71ecf3d46aac 100644
--- a/include/linux/pmem.h
+++ b/include/linux/pmem.h
@@ -31,12 +31,6 @@ static inline void arch_memcpy_to_pmem(void *dst, const void *src, size_t n)
 	BUG();
 }
 
-static inline int arch_memcpy_from_pmem(void *dst, const void *src, size_t n)
-{
-	BUG();
-	return -EFAULT;
-}
-
 static inline size_t arch_copy_from_iter_pmem(void *addr, size_t bytes,
 		struct iov_iter *i)
 {
@@ -65,23 +59,6 @@ static inline bool arch_has_pmem_api(void)
 	return IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API);
 }
 
-/*
- * memcpy_from_pmem - read from persistent memory with error handling
- * @dst: destination buffer
- * @src: source buffer
- * @size: transfer length
- *
- * Returns 0 on success negative error code on failure.
- */
-static inline int memcpy_from_pmem(void *dst, void const *src, size_t size)
-{
-	if (arch_has_pmem_api())
-		return arch_memcpy_from_pmem(dst, src, size);
-	else
-		memcpy(dst, src, size);
-	return 0;
-}
-
 /**
  * memcpy_to_pmem - copy data to persistent memory
  * @dst: destination buffer for the copy
diff --git a/include/linux/string.h b/include/linux/string.h
index 26b6f6a66f83..9d6f189157e2 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -114,6 +114,14 @@ extern int memcmp(const void *,const void *,__kernel_size_t);
 #ifndef __HAVE_ARCH_MEMCHR
 extern void * memchr(const void *,int,__kernel_size_t);
 #endif
+#ifndef __HAVE_ARCH_MEMCPY_MCSAFE
+static inline __must_check int memcpy_mcsafe(void *dst, const void *src,
+		size_t cnt)
+{
+	memcpy(dst, src, cnt);
+	return 0;
+}
+#endif
 void *memchr_inv(const void *s, int c, size_t n);
 char *strreplace(char *s, char old, char new);
 
-- 
cgit 


From 97681f9b086ac4009ed914fdc56b0ebbfafe8b3d Mon Sep 17 00:00:00 2001
From: Toshi Kani <toshi.kani@hpe.com>
Date: Tue, 25 Apr 2017 15:16:51 -0600
Subject: libnvdimm: fix phys_addr for nvdimm_clear_poison

nvdimm_clear_poison() expects a physical address, not an offset.
Fix nsio_rw_bytes() to call nvdimm_clear_poison() with a physical
address.

Signed-off-by: Toshi Kani <toshi.kani@hpe.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Reviewed-by: Vishal Verma <vishal.l.verma@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/claim.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c
index b3323c0697f6..2c19bc7fc056 100644
--- a/drivers/nvdimm/claim.c
+++ b/drivers/nvdimm/claim.c
@@ -246,7 +246,8 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns,
 		if (IS_ALIGNED(offset, 512) && IS_ALIGNED(size, 512)) {
 			long cleared;
 
-			cleared = nvdimm_clear_poison(&ndns->dev, offset, size);
+			cleared = nvdimm_clear_poison(&ndns->dev,
+					nsio->res.start + offset, size);
 			if (cleared < size)
 				rc = -EIO;
 			if (cleared > 0 && cleared / 512) {
-- 
cgit 


From ab630891ce0eb83dc6c37971b35b228fc9e225ca Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 21 Apr 2017 13:28:12 -0700
Subject: libnvdimm, region: sysfs trigger for nvdimm_flush()

The nvdimm_flush() mechanism helps to reduce the impact of an ADR
(asynchronous-dimm-refresh) failure. The ADR mechanism handles flushing
platform WPQ (write-pending-queue) buffers when power is removed. The
nvdimm_flush() mechanism performs that same function on-demand.

When a pmem namespace is associated with a block device, an
nvdimm_flush() is triggered with every block-layer REQ_FUA, or REQ_FLUSH
request. These requests are typically associated with filesystem
metadata updates. However, when a namespace is in device-dax mode,
userspace (think database metadata) needs another path to perform the
same flushing. In other words this is not required to make data
persistent, but in the case of metadata it allows for a smaller failure
domain in the unlikely event of an ADR failure.

The new 'deep_flush' attribute is visible when the individual DIMMs
backing a given interleave-set are described by platform firmware. In
ACPI terms this is "NVDIMM Region Mapping Structures" and associated
"Flush Hint Address Structures". Reads return "1" if the region supports
triggering WPQ flushes on all DIMMs. Reads return "0" the flush
operation is a platform nop, and in that case the attribute is
read-only.

Why sysfs and not an ioctl? An ioctl requires establishing a new
ioctl function number space for device-dax. Given that this would be
called on a device-dax fd an application could be forgiven for
accidentally calling this on a filesystem-dax fd. Placing this interface
in libnvdimm sysfs removes that potential for collision with a
filesystem ioctl, and it keeps ioctls out of the generic device-dax
implementation.

Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Masayoshi Mizuma <m.mizuma@jp.fujitsu.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/region_devs.c | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index 24abceda986a..53d1ba4e6d99 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -255,6 +255,35 @@ static ssize_t size_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(size);
 
+static ssize_t deep_flush_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nd_region *nd_region = to_nd_region(dev);
+
+	/*
+	 * NOTE: in the nvdimm_has_flush() error case this attribute is
+	 * not visible.
+	 */
+	return sprintf(buf, "%d\n", nvdimm_has_flush(nd_region));
+}
+
+static ssize_t deep_flush_store(struct device *dev, struct device_attribute *attr,
+		const char *buf, size_t len)
+{
+	bool flush;
+	int rc = strtobool(buf, &flush);
+	struct nd_region *nd_region = to_nd_region(dev);
+
+	if (rc)
+		return rc;
+	if (!flush)
+		return -EINVAL;
+	nvdimm_flush(nd_region);
+
+	return len;
+}
+static DEVICE_ATTR_RW(deep_flush);
+
 static ssize_t mappings_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
@@ -479,6 +508,7 @@ static struct attribute *nd_region_attributes[] = {
 	&dev_attr_btt_seed.attr,
 	&dev_attr_pfn_seed.attr,
 	&dev_attr_dax_seed.attr,
+	&dev_attr_deep_flush.attr,
 	&dev_attr_read_only.attr,
 	&dev_attr_set_cookie.attr,
 	&dev_attr_available_size.attr,
@@ -508,6 +538,17 @@ static umode_t region_visible(struct kobject *kobj, struct attribute *a, int n)
 	if (!is_nd_pmem(dev) && a == &dev_attr_resource.attr)
 		return 0;
 
+	if (a == &dev_attr_deep_flush.attr) {
+		int has_flush = nvdimm_has_flush(nd_region);
+
+		if (has_flush == 1)
+			return a->mode;
+		else if (has_flush == 0)
+			return 0444;
+		else
+			return 0;
+	}
+
 	if (a != &dev_attr_set_cookie.attr
 			&& a != &dev_attr_available_size.attr)
 		return a->mode;
-- 
cgit 


From b2518c78ce76896f0f8f7940bf02104b227e1709 Mon Sep 17 00:00:00 2001
From: Toshi Kani <toshi.kani@hpe.com>
Date: Tue, 25 Apr 2017 17:04:13 -0600
Subject: libnvdimm, pmem: fix a NULL pointer BUG in nd_pmem_notify

The following BUG was observed when nd_pmem_notify() was called
for a BTT device.  The use of a pmem_device pointer is not valid
with BTT.

 BUG: unable to handle kernel NULL pointer dereference at 0000000000000030
 IP: nd_pmem_notify+0x30/0xf0 [nd_pmem]
 Call Trace:
  nd_device_notify+0x40/0x50
  child_notify+0x10/0x20
  device_for_each_child+0x50/0x90
  nd_region_notify+0x20/0x30
  nd_device_notify+0x40/0x50
  nvdimm_region_notify+0x27/0x30
  acpi_nfit_scrub+0x341/0x590 [nfit]
  process_one_work+0x197/0x450
  worker_thread+0x4e/0x4a0
  kthread+0x109/0x140

Fix nd_pmem_notify() by setting nd_region and badblocks pointers
properly for BTT.

Cc: <stable@vger.kernel.org>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Fixes: 719994660c24 ("libnvdimm: async notification support")
Signed-off-by: Toshi Kani <toshi.kani@hpe.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/pmem.c | 37 +++++++++++++++++++++++++------------
 1 file changed, 25 insertions(+), 12 deletions(-)

diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 5b536be5a12e..0fc18262a2bc 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -388,12 +388,12 @@ static void nd_pmem_shutdown(struct device *dev)
 
 static void nd_pmem_notify(struct device *dev, enum nvdimm_event event)
 {
-	struct pmem_device *pmem = dev_get_drvdata(dev);
-	struct nd_region *nd_region = to_region(pmem);
+	struct nd_region *nd_region;
 	resource_size_t offset = 0, end_trunc = 0;
 	struct nd_namespace_common *ndns;
 	struct nd_namespace_io *nsio;
 	struct resource res;
+	struct badblocks *bb;
 
 	if (event != NVDIMM_REVALIDATE_POISON)
 		return;
@@ -402,20 +402,33 @@ static void nd_pmem_notify(struct device *dev, enum nvdimm_event event)
 		struct nd_btt *nd_btt = to_nd_btt(dev);
 
 		ndns = nd_btt->ndns;
-	} else if (is_nd_pfn(dev)) {
-		struct nd_pfn *nd_pfn = to_nd_pfn(dev);
-		struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb;
+		nd_region = to_nd_region(ndns->dev.parent);
+		nsio = to_nd_namespace_io(&ndns->dev);
+		bb = &nsio->bb;
+	} else {
+		struct pmem_device *pmem = dev_get_drvdata(dev);
 
-		ndns = nd_pfn->ndns;
-		offset = pmem->data_offset + __le32_to_cpu(pfn_sb->start_pad);
-		end_trunc = __le32_to_cpu(pfn_sb->end_trunc);
-	} else
-		ndns = to_ndns(dev);
+		nd_region = to_region(pmem);
+		bb = &pmem->bb;
+
+		if (is_nd_pfn(dev)) {
+			struct nd_pfn *nd_pfn = to_nd_pfn(dev);
+			struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb;
+
+			ndns = nd_pfn->ndns;
+			offset = pmem->data_offset +
+					__le32_to_cpu(pfn_sb->start_pad);
+			end_trunc = __le32_to_cpu(pfn_sb->end_trunc);
+		} else {
+			ndns = to_ndns(dev);
+		}
+
+		nsio = to_nd_namespace_io(&ndns->dev);
+	}
 
-	nsio = to_nd_namespace_io(&ndns->dev);
 	res.start = nsio->res.start + offset;
 	res.end = nsio->res.end - end_trunc;
-	nvdimm_badblocks_populate(nd_region, &pmem->bb, &res);
+	nvdimm_badblocks_populate(nd_region, bb, &res);
 }
 
 MODULE_ALIAS("pmem");
-- 
cgit 


From 8d13c0290655b883df9083a2a0af0d782bc38aef Mon Sep 17 00:00:00 2001
From: Toshi Kani <toshi.kani@hpe.com>
Date: Thu, 27 Apr 2017 16:57:05 -0600
Subject: libnvdimm: fix clear length of nvdimm_forget_poison()

ND_CMD_CLEAR_ERROR command returns 'clear_err.cleared', the length
of error actually cleared, which may be smaller than its requested
'len'.

Change nvdimm_clear_poison() to call nvdimm_forget_poison() with
'clear_err.cleared' when this value is valid.

Cc: <stable@vger.kernel.org>
Fixes: e046114af5fc ("libnvdimm: clear the internal poison_list when clearing badblocks")
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Signed-off-by: Toshi Kani <toshi.kani@hpe.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/bus.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index d214ac44d111..43ddfd487c85 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -219,7 +219,9 @@ long nvdimm_clear_poison(struct device *dev, phys_addr_t phys,
 	if (cmd_rc < 0)
 		return cmd_rc;
 
-	nvdimm_forget_poison(nvdimm_bus, phys, len);
+	if (clear_err.cleared > 0)
+		nvdimm_forget_poison(nvdimm_bus, phys, clear_err.cleared);
+
 	return clear_err.cleared;
 }
 EXPORT_SYMBOL_GPL(nvdimm_clear_poison);
-- 
cgit 


From 7699a6a36b82f22cb0b355c6411531abedf12964 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 28 Apr 2017 13:54:30 -0700
Subject: acpi, nfit: kill ACPI_NFIT_DEBUG

Inevitably when one actually needs to debug a DSM issue it's on a
distribution kernel that has CONFIG_ACPI_NFIT_DEBUG=n. The config symbol
was only there to avoid the compile error due to the missing fallback for
print_hex_dump_debug in the CONFIG_DYNAMIC_DEBUG=n case. That was fixed
with commit cdf17449af1d "hexdump: do not print debug dumps for
!CONFIG_DEBUG", so the config symbol can just be dropped.

Cc: Joe Perches <joe@perches.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/nfit/Kconfig | 12 ------------
 drivers/acpi/nfit/core.c  | 21 ++++++++-------------
 2 files changed, 8 insertions(+), 25 deletions(-)

diff --git a/drivers/acpi/nfit/Kconfig b/drivers/acpi/nfit/Kconfig
index dd0d53c52552..6d3351452ea2 100644
--- a/drivers/acpi/nfit/Kconfig
+++ b/drivers/acpi/nfit/Kconfig
@@ -12,15 +12,3 @@ config ACPI_NFIT
 
 	  To compile this driver as a module, choose M here:
 	  the module will be called nfit.
-
-config ACPI_NFIT_DEBUG
-	bool "NFIT DSM debug"
-	depends on ACPI_NFIT
-	depends on DYNAMIC_DEBUG
-	default n
-	help
-	  Enabling this option causes the nfit driver to dump the
-	  input and output buffers of _DSM operations on the ACPI0012
-	  device and its children.  This can be very verbose, so leave
-	  it disabled unless you are debugging a hardware / firmware
-	  issue.
diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 261eea1d2906..ced95a79c1a1 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -268,14 +268,11 @@ int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm,
 		in_buf.buffer.length = call_pkg->nd_size_in;
 	}
 
-	if (IS_ENABLED(CONFIG_ACPI_NFIT_DEBUG)) {
-		dev_dbg(dev, "%s:%s cmd: %d: func: %d input length: %d\n",
-				__func__, dimm_name, cmd, func,
-				in_buf.buffer.length);
-		print_hex_dump_debug("nvdimm in  ", DUMP_PREFIX_OFFSET, 4, 4,
+	dev_dbg(dev, "%s:%s cmd: %d: func: %d input length: %d\n",
+			__func__, dimm_name, cmd, func, in_buf.buffer.length);
+	print_hex_dump_debug("nvdimm in  ", DUMP_PREFIX_OFFSET, 4, 4,
 			in_buf.buffer.pointer,
 			min_t(u32, 256, in_buf.buffer.length), true);
-	}
 
 	out_obj = acpi_evaluate_dsm(handle, uuid, 1, func, &in_obj);
 	if (!out_obj) {
@@ -307,13 +304,11 @@ int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm,
 		goto out;
 	}
 
-	if (IS_ENABLED(CONFIG_ACPI_NFIT_DEBUG)) {
-		dev_dbg(dev, "%s:%s cmd: %s output length: %d\n", __func__,
-				dimm_name, cmd_name, out_obj->buffer.length);
-		print_hex_dump_debug(cmd_name, DUMP_PREFIX_OFFSET, 4,
-				4, out_obj->buffer.pointer, min_t(u32, 128,
-					out_obj->buffer.length), true);
-	}
+	dev_dbg(dev, "%s:%s cmd: %s output length: %d\n", __func__, dimm_name,
+			cmd_name, out_obj->buffer.length);
+	print_hex_dump_debug(cmd_name, DUMP_PREFIX_OFFSET, 4, 4,
+			out_obj->buffer.pointer,
+			min_t(u32, 128, out_obj->buffer.length), true);
 
 	for (i = 0, offset = 0; i < desc->out_num; i++) {
 		u32 out_size = nd_cmd_out_size(nvdimm, cmd, desc, i, buf,
-- 
cgit 


From 23f4984483623cf8621246004228f08fcabf51e4 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sat, 29 Apr 2017 15:24:03 -0700
Subject: libnvdimm: rework region badblocks clearing

Toshi noticed that the new support for a region-level badblocks missed
the case where errors are cleared due to BTT I/O.

An initial attempt to fix this ran into a "sleeping while atomic"
warning due to taking the nvdimm_bus_lock() in the BTT I/O path to
satisfy the locking requirements of __nvdimm_bus_badblocks_clear().
However, that lock is not needed since we are not acting on any data that
is subject to change under that lock. The badblocks instance has its own
internal lock to handle mutations of the error list.

So, in order to make it clear that we are just acting on region devices,
rename __nvdimm_bus_badblocks_clear() to nvdimm_clear_badblocks_regions().
Eliminate the lock and consolidate all support routines for the new
nvdimm_account_cleared_poison() in drivers/nvdimm/bus.c. Finally, to the
opportunity to cleanup to some unnecessary casts, make the calling
convention of nvdimm_clear_badblocks_regions() clearer by replacing struct
resource with the minimal struct clear_badblocks_context, and use the
DEVICE_ATTR macro.

Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Reported-by: Toshi Kani <toshi.kani@hpe.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/bus.c         | 76 +++++++++++++++++++++++++++++++-------------
 drivers/nvdimm/region.c      | 25 ---------------
 drivers/nvdimm/region_devs.c | 15 +++------
 include/linux/libnvdimm.h    |  3 --
 4 files changed, 59 insertions(+), 60 deletions(-)

diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 43ddfd487c85..e9361bffe5ee 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -172,6 +172,57 @@ void nvdimm_region_notify(struct nd_region *nd_region, enum nvdimm_event event)
 }
 EXPORT_SYMBOL_GPL(nvdimm_region_notify);
 
+struct clear_badblocks_context {
+	resource_size_t phys, cleared;
+};
+
+static int nvdimm_clear_badblocks_region(struct device *dev, void *data)
+{
+	struct clear_badblocks_context *ctx = data;
+	struct nd_region *nd_region;
+	resource_size_t ndr_end;
+	sector_t sector;
+
+	/* make sure device is a region */
+	if (!is_nd_pmem(dev))
+		return 0;
+
+	nd_region = to_nd_region(dev);
+	ndr_end = nd_region->ndr_start + nd_region->ndr_size - 1;
+
+	/* make sure we are in the region */
+	if (ctx->phys < nd_region->ndr_start
+			|| (ctx->phys + ctx->cleared) > ndr_end)
+		return 0;
+
+	sector = (ctx->phys - nd_region->ndr_start) / 512;
+	badblocks_clear(&nd_region->bb, sector, ctx->cleared / 512);
+
+	return 0;
+}
+
+static void nvdimm_clear_badblocks_regions(struct nvdimm_bus *nvdimm_bus,
+		phys_addr_t phys, u64 cleared)
+{
+	struct clear_badblocks_context ctx = {
+		.phys = phys,
+		.cleared = cleared,
+	};
+
+	device_for_each_child(&nvdimm_bus->dev, &ctx,
+			nvdimm_clear_badblocks_region);
+}
+
+static void nvdimm_account_cleared_poison(struct nvdimm_bus *nvdimm_bus,
+		phys_addr_t phys, u64 cleared)
+{
+	if (cleared > 0)
+		nvdimm_forget_poison(nvdimm_bus, phys, cleared);
+
+	if (cleared > 0 && cleared / 512)
+		nvdimm_clear_badblocks_regions(nvdimm_bus, phys, cleared);
+}
+
 long nvdimm_clear_poison(struct device *dev, phys_addr_t phys,
 		unsigned int len)
 {
@@ -219,22 +270,12 @@ long nvdimm_clear_poison(struct device *dev, phys_addr_t phys,
 	if (cmd_rc < 0)
 		return cmd_rc;
 
-	if (clear_err.cleared > 0)
-		nvdimm_forget_poison(nvdimm_bus, phys, clear_err.cleared);
+	nvdimm_account_cleared_poison(nvdimm_bus, phys, clear_err.cleared);
 
 	return clear_err.cleared;
 }
 EXPORT_SYMBOL_GPL(nvdimm_clear_poison);
 
-void __nvdimm_bus_badblocks_clear(struct nvdimm_bus *nvdimm_bus,
-		struct resource *res)
-{
-	lockdep_assert_held(&nvdimm_bus->reconfig_mutex);
-	device_for_each_child(&nvdimm_bus->dev, (void *)res,
-			nvdimm_region_badblocks_clear);
-}
-EXPORT_SYMBOL_GPL(__nvdimm_bus_badblocks_clear);
-
 static int nvdimm_bus_match(struct device *dev, struct device_driver *drv);
 
 static struct bus_type nvdimm_bus_type = {
@@ -989,18 +1030,9 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
 
 	if (!nvdimm && cmd == ND_CMD_CLEAR_ERROR && cmd_rc >= 0) {
 		struct nd_cmd_clear_error *clear_err = buf;
-		struct resource res;
-
-		if (clear_err->cleared) {
-			/* clearing the poison list we keep track of */
-			nvdimm_forget_poison(nvdimm_bus, clear_err->address,
-					clear_err->cleared);
 
-			/* now sync the badblocks lists */
-			res.start = clear_err->address;
-			res.end = clear_err->address + clear_err->cleared - 1;
-			__nvdimm_bus_badblocks_clear(nvdimm_bus, &res);
-		}
+		nvdimm_account_cleared_poison(nvdimm_bus, clear_err->address,
+				clear_err->cleared);
 	}
 	nvdimm_bus_unlock(&nvdimm_bus->dev);
 
diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c
index 23c4307d254c..869a886c292e 100644
--- a/drivers/nvdimm/region.c
+++ b/drivers/nvdimm/region.c
@@ -131,31 +131,6 @@ static void nd_region_notify(struct device *dev, enum nvdimm_event event)
 	device_for_each_child(dev, &event, child_notify);
 }
 
-int nvdimm_region_badblocks_clear(struct device *dev, void *data)
-{
-	struct resource *res = (struct resource *)data;
-	struct nd_region *nd_region;
-	resource_size_t ndr_end;
-	sector_t sector;
-
-	/* make sure device is a region */
-	if (!is_nd_pmem(dev))
-		return 0;
-
-	nd_region = to_nd_region(dev);
-	ndr_end = nd_region->ndr_start + nd_region->ndr_size - 1;
-
-	/* make sure we are in the region */
-	if (res->start < nd_region->ndr_start || res->end > ndr_end)
-		return 0;
-
-	sector = (res->start - nd_region->ndr_start) >> 9;
-	badblocks_clear(&nd_region->bb, sector, resource_size(res) >> 9);
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(nvdimm_region_badblocks_clear);
-
 static struct nd_device_driver nd_region_driver = {
 	.probe = nd_region_probe,
 	.remove = nd_region_remove,
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index 53d1ba4e6d99..07756b2e1cd5 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -477,20 +477,15 @@ static ssize_t read_only_store(struct device *dev,
 }
 static DEVICE_ATTR_RW(read_only);
 
-static ssize_t nd_badblocks_show(struct device *dev,
+static ssize_t region_badblocks_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
 	struct nd_region *nd_region = to_nd_region(dev);
 
 	return badblocks_show(&nd_region->bb, buf, 0);
 }
-static struct device_attribute dev_attr_nd_badblocks = {
-	.attr = {
-		.name = "badblocks",
-		.mode = S_IRUGO
-	},
-	.show = nd_badblocks_show,
-};
+
+static DEVICE_ATTR(badblocks, 0444, region_badblocks_show, NULL);
 
 static ssize_t resource_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
@@ -514,7 +509,7 @@ static struct attribute *nd_region_attributes[] = {
 	&dev_attr_available_size.attr,
 	&dev_attr_namespace_seed.attr,
 	&dev_attr_init_namespaces.attr,
-	&dev_attr_nd_badblocks.attr,
+	&dev_attr_badblocks.attr,
 	&dev_attr_resource.attr,
 	NULL,
 };
@@ -532,7 +527,7 @@ static umode_t region_visible(struct kobject *kobj, struct attribute *a, int n)
 	if (!is_nd_pmem(dev) && a == &dev_attr_dax_seed.attr)
 		return 0;
 
-	if (!is_nd_pmem(dev) && a == &dev_attr_nd_badblocks.attr)
+	if (!is_nd_pmem(dev) && a == &dev_attr_badblocks.attr)
 		return 0;
 
 	if (!is_nd_pmem(dev) && a == &dev_attr_resource.attr)
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index 98b207611b06..f07b1b14159a 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -162,7 +162,4 @@ void nd_region_release_lane(struct nd_region *nd_region, unsigned int lane);
 u64 nd_fletcher64(void *addr, size_t len, bool le);
 void nvdimm_flush(struct nd_region *nd_region);
 int nvdimm_has_flush(struct nd_region *nd_region);
-int nvdimm_region_badblocks_clear(struct device *dev, void *data);
-void __nvdimm_bus_badblocks_clear(struct nvdimm_bus *nvdimm_bus,
-		struct resource *res);
 #endif /* __LIBNVDIMM_H__ */
-- 
cgit 


From 452bae0aede774f87bf56c28b6dd50b72c78986c Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 28 Apr 2017 22:05:14 -0700
Subject: libnvdimm: fix nvdimm_bus_lock() vs device_lock() ordering

A debug patch to turn the standard device_lock() into something that
lockdep can analyze yielded the following:

 ======================================================
 [ INFO: possible circular locking dependency detected ]
 4.11.0-rc4+ #106 Tainted: G           O
 -------------------------------------------------------
 lt-libndctl/1898 is trying to acquire lock:
  (&dev->nvdimm_mutex/3){+.+.+.}, at: [<ffffffffc023c948>] nd_attach_ndns+0x178/0x1b0 [libnvdimm]

 but task is already holding lock:
  (&nvdimm_bus->reconfig_mutex){+.+.+.}, at: [<ffffffffc022e0b1>] nvdimm_bus_lock+0x21/0x30 [libnvdimm]

 which lock already depends on the new lock.

 the existing dependency chain (in reverse order) is:

 -> #1 (&nvdimm_bus->reconfig_mutex){+.+.+.}:
        lock_acquire+0xf6/0x1f0
        __mutex_lock+0x88/0x980
        mutex_lock_nested+0x1b/0x20
        nvdimm_bus_lock+0x21/0x30 [libnvdimm]
        nvdimm_namespace_capacity+0x1b/0x40 [libnvdimm]
        nvdimm_namespace_common_probe+0x230/0x510 [libnvdimm]
        nd_pmem_probe+0x14/0x180 [nd_pmem]
        nvdimm_bus_probe+0xa9/0x260 [libnvdimm]

 -> #0 (&dev->nvdimm_mutex/3){+.+.+.}:
        __lock_acquire+0x1107/0x1280
        lock_acquire+0xf6/0x1f0
        __mutex_lock+0x88/0x980
        mutex_lock_nested+0x1b/0x20
        nd_attach_ndns+0x178/0x1b0 [libnvdimm]
        nd_namespace_store+0x308/0x3c0 [libnvdimm]
        namespace_store+0x87/0x220 [libnvdimm]

In this case '&dev->nvdimm_mutex/3' mirrors '&dev->mutex'.

Fix this by replacing the use of device_lock() with nvdimm_bus_lock() to protect
nd_{attach,detach}_ndns() operations.

Cc: <stable@vger.kernel.org>
Fixes: 8c2f7e8658df ("libnvdimm: infrastructure for btt devices")
Reported-by: Yi Zhang <yizhan@redhat.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/btt_devs.c |  2 +-
 drivers/nvdimm/claim.c    | 23 +++++++++++++++--------
 drivers/nvdimm/dax_devs.c |  2 +-
 drivers/nvdimm/pfn_devs.c |  2 +-
 4 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/drivers/nvdimm/btt_devs.c b/drivers/nvdimm/btt_devs.c
index 97dd2925ed6e..4b76af2b8715 100644
--- a/drivers/nvdimm/btt_devs.c
+++ b/drivers/nvdimm/btt_devs.c
@@ -314,7 +314,7 @@ int nd_btt_probe(struct device *dev, struct nd_namespace_common *ndns)
 	if (rc < 0) {
 		struct nd_btt *nd_btt = to_nd_btt(btt_dev);
 
-		__nd_detach_ndns(btt_dev, &nd_btt->ndns);
+		nd_detach_ndns(btt_dev, &nd_btt->ndns);
 		put_device(btt_dev);
 	}
 
diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c
index 2c19bc7fc056..35b210dc1e56 100644
--- a/drivers/nvdimm/claim.c
+++ b/drivers/nvdimm/claim.c
@@ -21,8 +21,13 @@
 void __nd_detach_ndns(struct device *dev, struct nd_namespace_common **_ndns)
 {
 	struct nd_namespace_common *ndns = *_ndns;
+	struct nvdimm_bus *nvdimm_bus;
 
-	lockdep_assert_held(&ndns->dev.mutex);
+	if (!ndns)
+		return;
+
+	nvdimm_bus = walk_to_nvdimm_bus(&ndns->dev);
+	lockdep_assert_held(&nvdimm_bus->reconfig_mutex);
 	dev_WARN_ONCE(dev, ndns->claim != dev, "%s: invalid claim\n", __func__);
 	ndns->claim = NULL;
 	*_ndns = NULL;
@@ -37,18 +42,20 @@ void nd_detach_ndns(struct device *dev,
 	if (!ndns)
 		return;
 	get_device(&ndns->dev);
-	device_lock(&ndns->dev);
+	nvdimm_bus_lock(&ndns->dev);
 	__nd_detach_ndns(dev, _ndns);
-	device_unlock(&ndns->dev);
+	nvdimm_bus_unlock(&ndns->dev);
 	put_device(&ndns->dev);
 }
 
 bool __nd_attach_ndns(struct device *dev, struct nd_namespace_common *attach,
 		struct nd_namespace_common **_ndns)
 {
+	struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(&attach->dev);
+
 	if (attach->claim)
 		return false;
-	lockdep_assert_held(&attach->dev.mutex);
+	lockdep_assert_held(&nvdimm_bus->reconfig_mutex);
 	dev_WARN_ONCE(dev, *_ndns, "%s: invalid claim\n", __func__);
 	attach->claim = dev;
 	*_ndns = attach;
@@ -61,9 +68,9 @@ bool nd_attach_ndns(struct device *dev, struct nd_namespace_common *attach,
 {
 	bool claimed;
 
-	device_lock(&attach->dev);
+	nvdimm_bus_lock(&attach->dev);
 	claimed = __nd_attach_ndns(dev, attach, _ndns);
-	device_unlock(&attach->dev);
+	nvdimm_bus_unlock(&attach->dev);
 	return claimed;
 }
 
@@ -114,7 +121,7 @@ static void nd_detach_and_reset(struct device *dev,
 		struct nd_namespace_common **_ndns)
 {
 	/* detach the namespace and destroy / reset the device */
-	nd_detach_ndns(dev, _ndns);
+	__nd_detach_ndns(dev, _ndns);
 	if (is_idle(dev, *_ndns)) {
 		nd_device_unregister(dev, ND_ASYNC);
 	} else if (is_nd_btt(dev)) {
@@ -184,7 +191,7 @@ ssize_t nd_namespace_store(struct device *dev,
 	}
 
 	WARN_ON_ONCE(!is_nvdimm_bus_locked(dev));
-	if (!nd_attach_ndns(dev, ndns, _ndns)) {
+	if (!__nd_attach_ndns(dev, ndns, _ndns)) {
 		dev_dbg(dev, "%s already claimed\n",
 				dev_name(&ndns->dev));
 		len = -EBUSY;
diff --git a/drivers/nvdimm/dax_devs.c b/drivers/nvdimm/dax_devs.c
index 45fa82cae87c..c1b6556aea6e 100644
--- a/drivers/nvdimm/dax_devs.c
+++ b/drivers/nvdimm/dax_devs.c
@@ -124,7 +124,7 @@ int nd_dax_probe(struct device *dev, struct nd_namespace_common *ndns)
 	dev_dbg(dev, "%s: dax: %s\n", __func__,
 			rc == 0 ? dev_name(dax_dev) : "<none>");
 	if (rc < 0) {
-		__nd_detach_ndns(dax_dev, &nd_pfn->ndns);
+		nd_detach_ndns(dax_dev, &nd_pfn->ndns);
 		put_device(dax_dev);
 	} else
 		__nd_device_register(dax_dev);
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index 6c033c9a2f06..c38566f4da7d 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -484,7 +484,7 @@ int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns)
 	dev_dbg(dev, "%s: pfn: %s\n", __func__,
 			rc == 0 ? dev_name(pfn_dev) : "<none>");
 	if (rc < 0) {
-		__nd_detach_ndns(pfn_dev, &nd_pfn->ndns);
+		nd_detach_ndns(pfn_dev, &nd_pfn->ndns);
 		put_device(pfn_dev);
 	} else
 		__nd_device_register(pfn_dev);
-- 
cgit 


From a3e9af95f794d000debc2a5ba3186ba85a6e115f Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Mon, 1 May 2017 10:00:02 -0700
Subject: libnvdimm: restore "libnvdimm: band aid btt vs clear poison locking"

This continues the 4.11 status quo of disabling of error clearing from
the BTT I/O path. Toshi found that even though we have eliminated all
the libnvdimm sources of sleeping-while-atomic triggers, we still have
sleeping operations that will occur in the path to send the ACPI DSM to
the DIMM to clear the error:

 BUG: sleeping function called from invalid context at mm/slab.h:432
 in_atomic(): 1, irqs_disabled(): 0, pid: 13353, name: dd
 Call Trace:
  dump_stack+0x86/0xc3
  ___might_sleep+0x17d/0x250
  __might_sleep+0x4a/0x80
  __kmalloc+0x1c0/0x2e0
  acpi_os_allocate_zeroed+0x2d/0x2f
  acpi_evaluate_object+0x59/0x3b1
  acpi_evaluate_dsm+0xbd/0x10c
  acpi_nfit_ctl+0x1ef/0x7c0 [nfit]
  ? nsio_rw_bytes+0x152/0x280
  nvdimm_clear_poison+0x77/0x140
  nsio_rw_bytes+0x18f/0x280
  btt_write_pg+0x1d4/0x3d0 [nd_btt]
  btt_make_request+0x119/0x2d0 [nd_btt]

A solution for tracking and handling media errors natively in the BTT is
needed.

Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Reported-by: Toshi Kani <toshi.kani@hpe.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/claim.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c
index 35b210dc1e56..6945e35058bf 100644
--- a/drivers/nvdimm/claim.c
+++ b/drivers/nvdimm/claim.c
@@ -250,7 +250,16 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns,
 	}
 
 	if (unlikely(is_bad_pmem(&nsio->bb, sector, sz_align))) {
-		if (IS_ALIGNED(offset, 512) && IS_ALIGNED(size, 512)) {
+		/*
+		 * FIXME: nsio_rw_bytes() may be called from atomic
+		 * context in the btt case and the ACPI DSM path for
+		 * clearing the error takes sleeping locks and allocates
+		 * memory. An explicit error clearing path, and support
+		 * for tracking badblocks in BTT metadata is needed to
+		 * work around this collision.
+		 */
+		if (IS_ALIGNED(offset, 512) && IS_ALIGNED(size, 512)
+				&& (!ndns->claim || !is_nd_btt(ndns->claim))) {
 			long cleared;
 
 			cleared = nvdimm_clear_poison(&ndns->dev,
-- 
cgit 


From 565851c972b50612f3a4542e26879ffb3e906fc2 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sun, 30 Apr 2017 06:57:01 -0700
Subject: device-dax: fix sysfs attribute deadlock

Usage of device_lock() for dax_region attributes is unnecessary and
deadlock prone. It's unnecessary because the order of registration /
un-registration guarantees that drvdata is always valid. It's deadlock
prone because it sets up this situation:

 ndctl           D    0  2170   2082 0x00000000
 Call Trace:
  __schedule+0x31f/0x980
  schedule+0x3d/0x90
  schedule_preempt_disabled+0x15/0x20
  __mutex_lock+0x402/0x980
  ? __mutex_lock+0x158/0x980
  ? align_show+0x2b/0x80 [dax]
  ? kernfs_seq_start+0x2f/0x90
  mutex_lock_nested+0x1b/0x20
  align_show+0x2b/0x80 [dax]
  dev_attr_show+0x20/0x50

 ndctl           D    0  2186   2079 0x00000000
 Call Trace:
  __schedule+0x31f/0x980
  schedule+0x3d/0x90
  __kernfs_remove+0x1f6/0x340
  ? kernfs_remove_by_name_ns+0x45/0xa0
  ? remove_wait_queue+0x70/0x70
  kernfs_remove_by_name_ns+0x45/0xa0
  remove_files.isra.1+0x35/0x70
  sysfs_remove_group+0x44/0x90
  sysfs_remove_groups+0x2e/0x50
  dax_region_unregister+0x25/0x40 [dax]
  devm_action_release+0xf/0x20
  release_nodes+0x16d/0x2b0
  devres_release_all+0x3c/0x60
  device_release_driver_internal+0x17d/0x220
  device_release_driver+0x12/0x20
  unbind_store+0x112/0x160

ndctl/2170 is trying to acquire the device_lock() to read an attribute,
and ndctl/2186 is holding the device_lock() while trying to drain all
active attribute readers.

Thanks to Yi Zhang for the reproduction script.

Fixes: d7fe1a67f658 ("dax: add region 'id', 'size', and 'align' attributes")
Cc: <stable@vger.kernel.org>
Reported-by: Yi Zhang <yizhan@redhat.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/dax/dax.c | 40 ++++++++++++----------------------------
 1 file changed, 12 insertions(+), 28 deletions(-)

diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c
index ef93aa84622b..5e8302d3a89c 100644
--- a/drivers/dax/dax.c
+++ b/drivers/dax/dax.c
@@ -36,36 +36,27 @@ static struct kmem_cache *dax_cache __read_mostly;
 static struct super_block *dax_superblock __read_mostly;
 MODULE_PARM_DESC(nr_dax, "max number of device-dax instances");
 
+/*
+ * Rely on the fact that drvdata is set before the attributes are
+ * registered, and that the attributes are unregistered before drvdata
+ * is cleared to assume that drvdata is always valid.
+ */
 static ssize_t id_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
-	struct dax_region *dax_region;
-	ssize_t rc = -ENXIO;
+	struct dax_region *dax_region = dev_get_drvdata(dev);
 
-	device_lock(dev);
-	dax_region = dev_get_drvdata(dev);
-	if (dax_region)
-		rc = sprintf(buf, "%d\n", dax_region->id);
-	device_unlock(dev);
-
-	return rc;
+	return sprintf(buf, "%d\n", dax_region->id);
 }
 static DEVICE_ATTR_RO(id);
 
 static ssize_t region_size_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
-	struct dax_region *dax_region;
-	ssize_t rc = -ENXIO;
+	struct dax_region *dax_region = dev_get_drvdata(dev);
 
-	device_lock(dev);
-	dax_region = dev_get_drvdata(dev);
-	if (dax_region)
-		rc = sprintf(buf, "%llu\n", (unsigned long long)
-				resource_size(&dax_region->res));
-	device_unlock(dev);
-
-	return rc;
+	return sprintf(buf, "%llu\n", (unsigned long long)
+			resource_size(&dax_region->res));
 }
 static struct device_attribute dev_attr_region_size = __ATTR(size, 0444,
 		region_size_show, NULL);
@@ -73,16 +64,9 @@ static struct device_attribute dev_attr_region_size = __ATTR(size, 0444,
 static ssize_t align_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
-	struct dax_region *dax_region;
-	ssize_t rc = -ENXIO;
+	struct dax_region *dax_region = dev_get_drvdata(dev);
 
-	device_lock(dev);
-	dax_region = dev_get_drvdata(dev);
-	if (dax_region)
-		rc = sprintf(buf, "%u\n", dax_region->align);
-	device_unlock(dev);
-
-	return rc;
+	return sprintf(buf, "%u\n", dax_region->align);
 }
 static DEVICE_ATTR_RO(align);
 
-- 
cgit 


From 67fd38973513c341c84654ae2f819089b840a39b Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 27 Apr 2017 16:30:41 +0200
Subject: block, dax: use correct format string in bdev_dax_supported

The new message has an incorrect format string, causing a warning in some
configurations:

fs/block_dev.c: In function 'bdev_dax_supported':
fs/block_dev.c:779:5: error: format '%d' expects argument of type 'int', but argument 2 has type 'long int' [-Werror=format=]
     "error: dax access failed (%d)", len);

This changes it to use the correct %ld instead of %d.

Fixes: 2093f2e9dfec ("block, dax: convert bdev_dax_supported() to dax_direct_access()")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/block_dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 10e21465d5a9..666367e13711 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -776,7 +776,7 @@ int bdev_dax_supported(struct super_block *sb, int blocksize)
 
 	if (len < 1) {
 		vfs_msg(sb, KERN_ERR,
-				"error: dax access failed (%d)", len);
+				"error: dax access failed (%ld)", len);
 		return len < 0 ? len : -EIO;
 	}
 
-- 
cgit 


From 1ef97fe4f8abd3317d5c3c860f990e02c2633959 Mon Sep 17 00:00:00 2001
From: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Date: Wed, 3 May 2017 14:56:02 +0200
Subject: brd: fix uninitialized use of brd->dax_dev

commit 1647b9b9 "brd: add dax_operations support" introduced the allocation
and freeing of a dax_device, but the allocated dax_device is not stored
into the brd_device, so brd_del_one() will eventually operate on an
uninitialized brd->dax_dev.

Fix this by storing the allocated dax_device to brd->dax_dev.

Signed-off-by: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/block/brd.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index bfa4ed2c75ef..ec00c01b8dc3 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -453,9 +453,7 @@ static struct brd_device *brd_alloc(int i)
 {
 	struct brd_device *brd;
 	struct gendisk *disk;
-#ifdef CONFIG_BLK_DEV_RAM_DAX
-	struct dax_device *dax_dev;
-#endif
+
 	brd = kzalloc(sizeof(*brd), GFP_KERNEL);
 	if (!brd)
 		goto out;
@@ -497,8 +495,8 @@ static struct brd_device *brd_alloc(int i)
 
 #ifdef CONFIG_BLK_DEV_RAM_DAX
 	queue_flag_set_unlocked(QUEUE_FLAG_DAX, brd->brd_queue);
-	dax_dev = alloc_dax(brd, disk->disk_name, &brd_dax_ops);
-	if (!dax_dev)
+	brd->dax_dev = alloc_dax(brd, disk->disk_name, &brd_dax_ops);
+	if (!brd->dax_dev)
 		goto out_free_inode;
 #endif
 
@@ -507,8 +505,8 @@ static struct brd_device *brd_alloc(int i)
 
 #ifdef CONFIG_BLK_DEV_RAM_DAX
 out_free_inode:
-	kill_dax(dax_dev);
-	put_dax(dax_dev);
+	kill_dax(brd->dax_dev);
+	put_dax(brd->dax_dev);
 #endif
 out_free_queue:
 	blk_cleanup_queue(brd->brd_queue);
-- 
cgit 


From 8f078b38dd382710884ce7abd31a1935c440e6f8 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 4 May 2017 14:01:24 -0700
Subject: libnvdimm: convert NDD_ flags to use bitops, introduce NDD_LOCKED

This is a preparation patch for handling locked nvdimm label regions, a
new concept as introduced by the latest DSM document on pmem.io [1]. A
future patch will leverage nvdimm_set_locked() at DIMM probe time to
flag regions that can not be enabled. There should be no functional
difference resulting from this change.

[1]: http://pmem.io/documents/NVDIMM_DSM_Interface_Example-V1.3.pdf

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/nfit/core.c        |  4 ++--
 drivers/nvdimm/dimm_devs.c      | 11 +++++++++--
 drivers/nvdimm/namespace_devs.c |  2 +-
 drivers/nvdimm/nd.h             |  1 +
 drivers/nvdimm/region_devs.c    |  4 ++--
 include/linux/libnvdimm.h       |  6 ++++--
 6 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index ced95a79c1a1..a26c2297c1ed 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -1512,7 +1512,7 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc)
 		}
 
 		if (nfit_mem->bdw && nfit_mem->memdev_pmem)
-			flags |= NDD_ALIASING;
+			set_bit(NDD_ALIASING, &flags);
 
 		/* collate flags across all memdevs for this dimm */
 		list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) {
@@ -1527,7 +1527,7 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc)
 
 		mem_flags = __to_nfit_memdev(nfit_mem)->flags;
 		if (mem_flags & ACPI_NFIT_MEM_NOT_ARMED)
-			flags |= NDD_UNARMED;
+			set_bit(NDD_UNARMED, &flags);
 
 		rc = acpi_nfit_add_dimm(acpi_desc, nfit_mem, device_handle);
 		if (rc)
diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c
index 8b721321be5b..7d1a3dbc7d5d 100644
--- a/drivers/nvdimm/dimm_devs.c
+++ b/drivers/nvdimm/dimm_devs.c
@@ -34,7 +34,7 @@ int nvdimm_check_config_data(struct device *dev)
 
 	if (!nvdimm->cmd_mask ||
 	    !test_bit(ND_CMD_GET_CONFIG_DATA, &nvdimm->cmd_mask)) {
-		if (nvdimm->flags & NDD_ALIASING)
+		if (test_bit(NDD_ALIASING, &nvdimm->flags))
 			return -ENXIO;
 		else
 			return -ENOTTY;
@@ -188,7 +188,14 @@ void nvdimm_set_aliasing(struct device *dev)
 {
 	struct nvdimm *nvdimm = to_nvdimm(dev);
 
-	nvdimm->flags |= NDD_ALIASING;
+	set_bit(NDD_ALIASING, &nvdimm->flags);
+}
+
+void nvdimm_set_locked(struct device *dev)
+{
+	struct nvdimm *nvdimm = to_nvdimm(dev);
+
+	set_bit(NDD_LOCKED, &nvdimm->flags);
 }
 
 static void nvdimm_release(struct device *dev)
diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
index 1b481a5fb966..2f2d8afc684a 100644
--- a/drivers/nvdimm/namespace_devs.c
+++ b/drivers/nvdimm/namespace_devs.c
@@ -2240,7 +2240,7 @@ static int init_active_labels(struct nd_region *nd_region)
 		 * being activated if it aliases DPA.
 		 */
 		if (!ndd) {
-			if ((nvdimm->flags & NDD_ALIASING) == 0)
+			if (!test_bit(NDD_ALIASING, &nvdimm->flags))
 				return 0;
 			dev_dbg(&nd_region->dev, "%s: is disabled, failing probe\n",
 					dev_name(&nd_mapping->nvdimm->dev));
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index c3b33cf655fb..77d032192bf7 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -240,6 +240,7 @@ int nvdimm_set_config_data(struct nvdimm_drvdata *ndd, size_t offset,
 long nvdimm_clear_poison(struct device *dev, phys_addr_t phys,
 		unsigned int len);
 void nvdimm_set_aliasing(struct device *dev);
+void nvdimm_set_locked(struct device *dev);
 struct nd_btt *to_nd_btt(struct device *dev);
 
 struct nd_gen_sb {
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index 07756b2e1cd5..b550edf2571f 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -222,7 +222,7 @@ int nd_region_to_nstype(struct nd_region *nd_region)
 			struct nd_mapping *nd_mapping = &nd_region->mapping[i];
 			struct nvdimm *nvdimm = nd_mapping->nvdimm;
 
-			if (nvdimm->flags & NDD_ALIASING)
+			if (test_bit(NDD_ALIASING, &nvdimm->flags))
 				alias++;
 		}
 		if (alias)
@@ -881,7 +881,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
 			return NULL;
 		}
 
-		if (nvdimm->flags & NDD_UNARMED)
+		if (test_bit(NDD_UNARMED, &nvdimm->flags))
 			ro = 1;
 	}
 
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index f07b1b14159a..6c807017128d 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -20,9 +20,11 @@
 
 enum {
 	/* when a dimm supports both PMEM and BLK access a label is required */
-	NDD_ALIASING = 1 << 0,
+	NDD_ALIASING = 0,
 	/* unarmed memory devices may not persist writes */
-	NDD_UNARMED = 1 << 1,
+	NDD_UNARMED = 1,
+	/* locked memory devices should not be accessed */
+	NDD_LOCKED = 2,
 
 	/* need to set a limit somewhere, but yes, this is likely overkill */
 	ND_IOCTL_MAX_BUFLEN = SZ_4M,
-- 
cgit 


From 9d62ed96511823fa9c2ac7a819d7b4be22a9a6de Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 4 May 2017 11:47:22 -0700
Subject: libnvdimm: handle locked label storage areas

Per the latest version of the "NVDIMM DSM Interface Example" [1], the
label data retrieval routine can report a "locked" status. In this case
all regions associated with that DIMM are disabled until the label area
is unlocked. Provide generic libnvdimm enabling for NVDIMMs with label
data area locking capabilities.

[1]: http://pmem.io/documents/

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/nfit/core.c        | 21 ++++++++++++++++++---
 drivers/nvdimm/dimm.c           |  2 ++
 drivers/nvdimm/dimm_devs.c      |  8 ++++++--
 drivers/nvdimm/namespace_devs.c | 17 ++++++++++++-----
 include/uapi/linux/ndctl.h      |  1 +
 5 files changed, 39 insertions(+), 10 deletions(-)

diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index a26c2297c1ed..891bd49ada0b 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -184,14 +184,29 @@ static int xlat_bus_status(void *buf, unsigned int cmd, u32 status)
 	return 0;
 }
 
+static int xlat_nvdimm_status(void *buf, unsigned int cmd, u32 status)
+{
+	switch (cmd) {
+	case ND_CMD_GET_CONFIG_SIZE:
+		if (status >> 16 & ND_CONFIG_LOCKED)
+			return -EACCES;
+		break;
+	default:
+		break;
+	}
+
+	/* all other non-zero status results in an error */
+	if (status)
+		return -EIO;
+	return 0;
+}
+
 static int xlat_status(struct nvdimm *nvdimm, void *buf, unsigned int cmd,
 		u32 status)
 {
 	if (!nvdimm)
 		return xlat_bus_status(buf, cmd, status);
-	if (status)
-		return -EIO;
-	return 0;
+	return xlat_nvdimm_status(buf, cmd, status);
 }
 
 int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm,
diff --git a/drivers/nvdimm/dimm.c b/drivers/nvdimm/dimm.c
index ee0b412827bf..e0f0e3ce1a32 100644
--- a/drivers/nvdimm/dimm.c
+++ b/drivers/nvdimm/dimm.c
@@ -49,6 +49,8 @@ static int nvdimm_probe(struct device *dev)
 	kref_init(&ndd->kref);
 
 	rc = nvdimm_init_nsarea(ndd);
+	if (rc == -EACCES)
+		nvdimm_set_locked(dev);
 	if (rc)
 		goto err;
 
diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c
index 7d1a3dbc7d5d..fac1e9fbd11d 100644
--- a/drivers/nvdimm/dimm_devs.c
+++ b/drivers/nvdimm/dimm_devs.c
@@ -67,6 +67,7 @@ int nvdimm_init_nsarea(struct nvdimm_drvdata *ndd)
 	struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(ndd->dev);
 	struct nvdimm_bus_descriptor *nd_desc;
 	int rc = validate_dimm(ndd);
+	int cmd_rc = 0;
 
 	if (rc)
 		return rc;
@@ -76,8 +77,11 @@ int nvdimm_init_nsarea(struct nvdimm_drvdata *ndd)
 
 	memset(cmd, 0, sizeof(*cmd));
 	nd_desc = nvdimm_bus->nd_desc;
-	return nd_desc->ndctl(nd_desc, to_nvdimm(ndd->dev),
-			ND_CMD_GET_CONFIG_SIZE, cmd, sizeof(*cmd), NULL);
+	rc = nd_desc->ndctl(nd_desc, to_nvdimm(ndd->dev),
+			ND_CMD_GET_CONFIG_SIZE, cmd, sizeof(*cmd), &cmd_rc);
+	if (rc < 0)
+		return rc;
+	return cmd_rc;
 }
 
 int nvdimm_init_config_data(struct nvdimm_drvdata *ndd)
diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
index 2f2d8afc684a..2f9dfbd2dbec 100644
--- a/drivers/nvdimm/namespace_devs.c
+++ b/drivers/nvdimm/namespace_devs.c
@@ -2236,14 +2236,21 @@ static int init_active_labels(struct nd_region *nd_region)
 		int count, j;
 
 		/*
-		 * If the dimm is disabled then prevent the region from
-		 * being activated if it aliases DPA.
+		 * If the dimm is disabled then we may need to prevent
+		 * the region from being activated.
 		 */
 		if (!ndd) {
-			if (!test_bit(NDD_ALIASING, &nvdimm->flags))
+			if (test_bit(NDD_LOCKED, &nvdimm->flags))
+				/* fail, label data may be unreadable */;
+			else if (test_bit(NDD_ALIASING, &nvdimm->flags))
+				/* fail, labels needed to disambiguate dpa */;
+			else
 				return 0;
-			dev_dbg(&nd_region->dev, "%s: is disabled, failing probe\n",
-					dev_name(&nd_mapping->nvdimm->dev));
+
+			dev_err(&nd_region->dev, "%s: is %s, failing probe\n",
+					dev_name(&nd_mapping->nvdimm->dev),
+					test_bit(NDD_LOCKED, &nvdimm->flags)
+					? "locked" : "disabled");
 			return -ENXIO;
 		}
 		nd_mapping->ndd = ndd;
diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h
index ede5c6a62164..7ad3863cb88b 100644
--- a/include/uapi/linux/ndctl.h
+++ b/include/uapi/linux/ndctl.h
@@ -169,6 +169,7 @@ enum {
 enum {
 	ND_ARS_VOLATILE = 1,
 	ND_ARS_PERSISTENT = 2,
+	ND_CONFIG_LOCKED = 1,
 };
 
 static inline const char *nvdimm_bus_cmd_name(unsigned cmd)
-- 
cgit 


From d5483feda85a8f39ee2e940e279547c686aac30c Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 4 May 2017 19:54:42 -0700
Subject: libnvdimm, pfn: fix 'npfns' vs section alignment

Fix failures to create namespaces due to the vmem_altmap not advertising
enough free space to store the memmap.

 WARNING: CPU: 15 PID: 8022 at arch/x86/mm/init_64.c:656 arch_add_memory+0xde/0xf0
 [..]
 Call Trace:
  dump_stack+0x63/0x83
  __warn+0xcb/0xf0
  warn_slowpath_null+0x1d/0x20
  arch_add_memory+0xde/0xf0
  devm_memremap_pages+0x244/0x440
  pmem_attach_disk+0x37e/0x490 [nd_pmem]
  nd_pmem_probe+0x7e/0xa0 [nd_pmem]
  nvdimm_bus_probe+0x71/0x120 [libnvdimm]
  driver_probe_device+0x2bb/0x460
  bind_store+0x114/0x160
  drv_attr_store+0x25/0x30

In commit 658922e57b84 "libnvdimm, pfn: fix memmap reservation sizing"
we arranged for the capacity to be allocated, but failed to also update
the 'npfns' parameter. This leads to cases where there is enough
capacity reserved to hold all the allocated sections, but
vmemmap_populate_hugepages() still encounters -ENOMEM from
altmap_alloc_block_buf().

This fix is a stop-gap until we can teach the core memory hotplug
implementation to permit sub-section hotplug.

Cc: <stable@vger.kernel.org>
Fixes: 658922e57b84 ("libnvdimm, pfn: fix memmap reservation sizing")
Reported-by: Anisha Allada <anisha.allada@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/pfn_devs.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index c38566f4da7d..335c8175410b 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -538,7 +538,8 @@ static struct vmem_altmap *__nvdimm_setup_pfn(struct nd_pfn *nd_pfn,
 		nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns);
 		altmap = NULL;
 	} else if (nd_pfn->mode == PFN_MODE_PMEM) {
-		nd_pfn->npfns = (resource_size(res) - offset) / PAGE_SIZE;
+		nd_pfn->npfns = PFN_SECTION_ALIGN_UP((resource_size(res)
+					- offset) / PAGE_SIZE);
 		if (le64_to_cpu(nd_pfn->pfn_sb->npfns) > nd_pfn->npfns)
 			dev_info(&nd_pfn->dev,
 					"number of pfns truncated from %lld to %ld\n",
@@ -625,7 +626,8 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
 	 */
 	start += start_pad;
 	size = resource_size(&nsio->res);
-	npfns = (size - start_pad - end_trunc - SZ_8K) / SZ_4K;
+	npfns = PFN_SECTION_ALIGN_UP((size - start_pad - end_trunc - SZ_8K)
+			/ PAGE_SIZE);
 	if (nd_pfn->mode == PFN_MODE_PMEM) {
 		/*
 		 * vmemmap_populate_hugepages() allocates the memmap array in
-- 
cgit