From ad5fb870c486d932a1749d7853dd70f436a7e03f Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 3 Apr 2015 12:05:28 -0400 Subject: e820, efi: add ACPI 6.0 persistent memory types ACPI 6.0 formalizes e820-type-7 and efi-type-14 as persistent memory. Mark it "reserved" and allow it to be claimed by a persistent memory device driver. This definition is in addition to the Linux kernel's existing type-12 definition that was recently added in support of shipping platforms with NVDIMM support that predate ACPI 6.0 (which now classifies type-12 as OEM reserved). Note, /proc/iomem can be consulted for differentiating legacy "Persistent Memory (legacy)" E820_PRAM vs standard "Persistent Memory" E820_PMEM. Cc: Boaz Harrosh Cc: Ingo Molnar Cc: Christoph Hellwig Cc: Andrew Morton Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Jens Axboe Cc: Linus Torvalds Cc: Matthew Wilcox Cc: Thomas Gleixner Acked-by: Jeff Moyer Acked-by: Andy Lutomirski Reviewed-by: Ross Zwisler Acked-by: Christoph Hellwig Tested-by: Toshi Kani Signed-off-by: Dan Williams --- include/linux/efi.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/efi.h b/include/linux/efi.h index af5be0368dec..825b6e3d69cb 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -85,7 +85,8 @@ typedef struct { #define EFI_MEMORY_MAPPED_IO 11 #define EFI_MEMORY_MAPPED_IO_PORT_SPACE 12 #define EFI_PAL_CODE 13 -#define EFI_MAX_MEMORY_TYPE 14 +#define EFI_PERSISTENT_MEMORY 14 +#define EFI_MAX_MEMORY_TYPE 15 /* Attribute values: */ #define EFI_MEMORY_UC ((u64)0x0000000000000001ULL) /* uncached */ -- cgit From b94d5230d06eb930be82e67fb1a9a58271e78297 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 19 May 2015 22:54:31 -0400 Subject: libnvdimm, nfit: initial libnvdimm infrastructure and NFIT support A struct nvdimm_bus is the anchor device for registering nvdimm resources and interfaces, for example, a character control device, nvdimm devices, and I/O region devices. The ACPI NFIT (NVDIMM Firmware Interface Table) is one possible platform description for such non-volatile memory resources in a system. The nfit.ko driver attaches to the "ACPI0012" device that indicates the presence of the NFIT and parses the table to register a struct nvdimm_bus instance. Cc: Cc: Lv Zheng Cc: Robert Moore Cc: Rafael J. Wysocki Acked-by: Jeff Moyer Acked-by: Christoph Hellwig Acked-by: Rafael J. Wysocki Tested-by: Toshi Kani Signed-off-by: Dan Williams --- include/linux/libnvdimm.h | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 include/linux/libnvdimm.h (limited to 'include') diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h new file mode 100644 index 000000000000..2b3c63950c91 --- /dev/null +++ b/include/linux/libnvdimm.h @@ -0,0 +1,34 @@ +/* + * libnvdimm - Non-volatile-memory Devices Subsystem + * + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#ifndef __LIBNVDIMM_H__ +#define __LIBNVDIMM_H__ +struct nvdimm; +struct nvdimm_bus_descriptor; +typedef int (*ndctl_fn)(struct nvdimm_bus_descriptor *nd_desc, + struct nvdimm *nvdimm, unsigned int cmd, void *buf, + unsigned int buf_len); + +struct nvdimm_bus_descriptor { + unsigned long dsm_mask; + char *provider_name; + ndctl_fn ndctl; +}; + +struct device; +struct nvdimm_bus; +struct nvdimm_bus *nvdimm_bus_register(struct device *parent, + struct nvdimm_bus_descriptor *nfit_desc); +void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus); +#endif /* __LIBNVDIMM_H__ */ -- cgit From 45def22c1fab85764646746ce38d45b2f3281fa5 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 26 Apr 2015 19:26:48 -0400 Subject: libnvdimm: control character device and nvdimm_bus sysfs attributes The control device for a nvdimm_bus is registered as an "nd" class device. The expectation is that there will usually only be one "nd" bus registered under /sys/class/nd. However, we allow for the possibility of multiple buses and they will listed in discovery order as ndctl0...ndctlN. This character device hosts the ioctl for passing control messages. The initial command set has a 1:1 correlation with the commands listed in the by the "NFIT DSM Example" document [1], but this scheme is extensible to future command sets. Note, nd_ioctl() and the backing ->ndctl() implementation are defined in a subsequent patch. This is simply the initial registrations and sysfs attributes. [1]: http://pmem.io/documents/NVDIMM_DSM_Interface_Example.pdf Cc: Neil Brown Cc: Greg KH Cc: Cc: Robert Moore Cc: Rafael J. Wysocki Acked-by: Christoph Hellwig Acked-by: Rafael J. Wysocki Tested-by: Toshi Kani Signed-off-by: Dan Williams --- include/linux/libnvdimm.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 2b3c63950c91..d375cdc4abd5 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -14,6 +14,8 @@ */ #ifndef __LIBNVDIMM_H__ #define __LIBNVDIMM_H__ +extern struct attribute_group nvdimm_bus_attribute_group; + struct nvdimm; struct nvdimm_bus_descriptor; typedef int (*ndctl_fn)(struct nvdimm_bus_descriptor *nd_desc, @@ -21,14 +23,16 @@ typedef int (*ndctl_fn)(struct nvdimm_bus_descriptor *nd_desc, unsigned int buf_len); struct nvdimm_bus_descriptor { + const struct attribute_group **attr_groups; unsigned long dsm_mask; char *provider_name; ndctl_fn ndctl; }; struct device; -struct nvdimm_bus; struct nvdimm_bus *nvdimm_bus_register(struct device *parent, struct nvdimm_bus_descriptor *nfit_desc); void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus); +struct nvdimm_bus *to_nvdimm_bus(struct device *dev); +struct nvdimm_bus_descriptor *to_nd_desc(struct nvdimm_bus *nvdimm_bus); #endif /* __LIBNVDIMM_H__ */ -- cgit From e6dfb2de47768efe8cc37c9a1863d2aff81440fb Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 25 Apr 2015 03:56:17 -0400 Subject: libnvdimm, nfit: dimm/memory-devices Enable nvdimm devices to be registered on a nvdimm_bus. The kernel assigned device id for nvdimm devicesis dynamic. If userspace needs a more static identifier it should consult a provider-specific attribute. In the case where NFIT is the provider, the 'nmemX/nfit/handle' or 'nmemX/nfit/serial' attributes may be used for this purpose. Cc: Neil Brown Cc: Cc: Greg KH Cc: Robert Moore Cc: Rafael J. Wysocki Acked-by: Christoph Hellwig Acked-by: Rafael J. Wysocki Tested-by: Toshi Kani Signed-off-by: Dan Williams --- include/linux/libnvdimm.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index d375cdc4abd5..07787f0dd7de 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -14,6 +14,12 @@ */ #ifndef __LIBNVDIMM_H__ #define __LIBNVDIMM_H__ + +enum { + /* when a dimm supports both PMEM and BLK access a label is required */ + NDD_ALIASING = 1 << 0, +}; + extern struct attribute_group nvdimm_bus_attribute_group; struct nvdimm; @@ -34,5 +40,10 @@ struct nvdimm_bus *nvdimm_bus_register(struct device *parent, struct nvdimm_bus_descriptor *nfit_desc); void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus); struct nvdimm_bus *to_nvdimm_bus(struct device *dev); +struct nvdimm *to_nvdimm(struct device *dev); struct nvdimm_bus_descriptor *to_nd_desc(struct nvdimm_bus *nvdimm_bus); +const char *nvdimm_name(struct nvdimm *nvdimm); +void *nvdimm_provider_data(struct nvdimm *nvdimm); +struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data, + const struct attribute_group **groups, unsigned long flags); #endif /* __LIBNVDIMM_H__ */ -- cgit From 62232e45f4a265abb43f0acf16e58f5d0b6e1ec9 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 8 Jun 2015 14:27:06 -0400 Subject: libnvdimm: control (ioctl) messages for nvdimm_bus and nvdimm devices Most discovery/configuration of the nvdimm-subsystem is done via sysfs attributes. However, some nvdimm_bus instances, particularly the ACPI.NFIT bus, define a small set of messages that can be passed to the platform. For convenience we derive the initial libnvdimm-ioctl command formats directly from the NFIT DSM Interface Example formats. ND_CMD_SMART: media health and diagnostics ND_CMD_GET_CONFIG_SIZE: size of the label space ND_CMD_GET_CONFIG_DATA: read label space ND_CMD_SET_CONFIG_DATA: write label space ND_CMD_VENDOR: vendor-specific command passthrough ND_CMD_ARS_CAP: report address-range-scrubbing capabilities ND_CMD_ARS_START: initiate scrubbing ND_CMD_ARS_STATUS: report on scrubbing state ND_CMD_SMART_THRESHOLD: configure alarm thresholds for smart events If a platform later defines different commands than this set it is straightforward to extend support to those formats. Most of the commands target a specific dimm. However, the address-range-scrubbing commands target the bus. The 'commands' attribute in sysfs of an nvdimm_bus, or nvdimm, enumerate the supported commands for that object. Cc: Cc: Robert Moore Cc: Rafael J. Wysocki Reported-by: Nicholas Moulin Acked-by: Christoph Hellwig Signed-off-by: Dan Williams --- include/linux/libnvdimm.h | 27 ++++++- include/uapi/linux/Kbuild | 1 + include/uapi/linux/ndctl.h | 178 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 205 insertions(+), 1 deletion(-) create mode 100644 include/uapi/linux/ndctl.h (limited to 'include') diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 07787f0dd7de..a39235819af3 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -14,13 +14,22 @@ */ #ifndef __LIBNVDIMM_H__ #define __LIBNVDIMM_H__ +#include +#include enum { /* when a dimm supports both PMEM and BLK access a label is required */ NDD_ALIASING = 1 << 0, + + /* need to set a limit somewhere, but yes, this is likely overkill */ + ND_IOCTL_MAX_BUFLEN = SZ_4M, + ND_CMD_MAX_ELEM = 4, + ND_CMD_MAX_ENVELOPE = 16, + ND_CMD_ARS_STATUS_MAX = SZ_4K, }; extern struct attribute_group nvdimm_bus_attribute_group; +extern struct attribute_group nvdimm_attribute_group; struct nvdimm; struct nvdimm_bus_descriptor; @@ -35,6 +44,14 @@ struct nvdimm_bus_descriptor { ndctl_fn ndctl; }; +struct nd_cmd_desc { + int in_num; + int out_num; + u32 in_sizes[ND_CMD_MAX_ELEM]; + int out_sizes[ND_CMD_MAX_ELEM]; +}; + +struct nvdimm_bus; struct device; struct nvdimm_bus *nvdimm_bus_register(struct device *parent, struct nvdimm_bus_descriptor *nfit_desc); @@ -45,5 +62,13 @@ struct nvdimm_bus_descriptor *to_nd_desc(struct nvdimm_bus *nvdimm_bus); const char *nvdimm_name(struct nvdimm *nvdimm); void *nvdimm_provider_data(struct nvdimm *nvdimm); struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data, - const struct attribute_group **groups, unsigned long flags); + const struct attribute_group **groups, unsigned long flags, + unsigned long *dsm_mask); +const struct nd_cmd_desc *nd_cmd_dimm_desc(int cmd); +const struct nd_cmd_desc *nd_cmd_bus_desc(int cmd); +u32 nd_cmd_in_size(struct nvdimm *nvdimm, int cmd, + const struct nd_cmd_desc *desc, int idx, void *buf); +u32 nd_cmd_out_size(struct nvdimm *nvdimm, int cmd, + const struct nd_cmd_desc *desc, int idx, const u32 *in_field, + const u32 *out_field); #endif /* __LIBNVDIMM_H__ */ diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index 1a0006a76b00..200cc5ea2998 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -271,6 +271,7 @@ header-y += ncp_fs.h header-y += ncp.h header-y += ncp_mount.h header-y += ncp_no.h +header-y += ndctl.h header-y += neighbour.h header-y += netconf.h header-y += netdevice.h diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h new file mode 100644 index 000000000000..ff13c23b26df --- /dev/null +++ b/include/uapi/linux/ndctl.h @@ -0,0 +1,178 @@ +/* + * Copyright (c) 2014-2015, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU Lesser General Public License, + * version 2.1, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + */ +#ifndef __NDCTL_H__ +#define __NDCTL_H__ + +#include + +struct nd_cmd_smart { + __u32 status; + __u8 data[128]; +} __packed; + +struct nd_cmd_smart_threshold { + __u32 status; + __u8 data[8]; +} __packed; + +struct nd_cmd_dimm_flags { + __u32 status; + __u32 flags; +} __packed; + +struct nd_cmd_get_config_size { + __u32 status; + __u32 config_size; + __u32 max_xfer; +} __packed; + +struct nd_cmd_get_config_data_hdr { + __u32 in_offset; + __u32 in_length; + __u32 status; + __u8 out_buf[0]; +} __packed; + +struct nd_cmd_set_config_hdr { + __u32 in_offset; + __u32 in_length; + __u8 in_buf[0]; +} __packed; + +struct nd_cmd_vendor_hdr { + __u32 opcode; + __u32 in_length; + __u8 in_buf[0]; +} __packed; + +struct nd_cmd_vendor_tail { + __u32 status; + __u32 out_length; + __u8 out_buf[0]; +} __packed; + +struct nd_cmd_ars_cap { + __u64 address; + __u64 length; + __u32 status; + __u32 max_ars_out; +} __packed; + +struct nd_cmd_ars_start { + __u64 address; + __u64 length; + __u16 type; + __u8 reserved[6]; + __u32 status; +} __packed; + +struct nd_cmd_ars_status { + __u32 status; + __u32 out_length; + __u64 address; + __u64 length; + __u16 type; + __u32 num_records; + struct nd_ars_record { + __u32 handle; + __u32 flags; + __u64 err_address; + __u64 mask; + } __packed records[0]; +} __packed; + +enum { + ND_CMD_IMPLEMENTED = 0, + + /* bus commands */ + ND_CMD_ARS_CAP = 1, + ND_CMD_ARS_START = 2, + ND_CMD_ARS_STATUS = 3, + + /* per-dimm commands */ + ND_CMD_SMART = 1, + ND_CMD_SMART_THRESHOLD = 2, + ND_CMD_DIMM_FLAGS = 3, + ND_CMD_GET_CONFIG_SIZE = 4, + ND_CMD_GET_CONFIG_DATA = 5, + ND_CMD_SET_CONFIG_DATA = 6, + ND_CMD_VENDOR_EFFECT_LOG_SIZE = 7, + ND_CMD_VENDOR_EFFECT_LOG = 8, + ND_CMD_VENDOR = 9, +}; + +static inline const char *nvdimm_bus_cmd_name(unsigned cmd) +{ + static const char * const names[] = { + [ND_CMD_ARS_CAP] = "ars_cap", + [ND_CMD_ARS_START] = "ars_start", + [ND_CMD_ARS_STATUS] = "ars_status", + }; + + if (cmd < ARRAY_SIZE(names) && names[cmd]) + return names[cmd]; + return "unknown"; +} + +static inline const char *nvdimm_cmd_name(unsigned cmd) +{ + static const char * const names[] = { + [ND_CMD_SMART] = "smart", + [ND_CMD_SMART_THRESHOLD] = "smart_thresh", + [ND_CMD_DIMM_FLAGS] = "flags", + [ND_CMD_GET_CONFIG_SIZE] = "get_size", + [ND_CMD_GET_CONFIG_DATA] = "get_data", + [ND_CMD_SET_CONFIG_DATA] = "set_data", + [ND_CMD_VENDOR_EFFECT_LOG_SIZE] = "effect_size", + [ND_CMD_VENDOR_EFFECT_LOG] = "effect_log", + [ND_CMD_VENDOR] = "vendor", + }; + + if (cmd < ARRAY_SIZE(names) && names[cmd]) + return names[cmd]; + return "unknown"; +} + +#define ND_IOCTL 'N' + +#define ND_IOCTL_SMART _IOWR(ND_IOCTL, ND_CMD_SMART,\ + struct nd_cmd_smart) + +#define ND_IOCTL_SMART_THRESHOLD _IOWR(ND_IOCTL, ND_CMD_SMART_THRESHOLD,\ + struct nd_cmd_smart_threshold) + +#define ND_IOCTL_DIMM_FLAGS _IOWR(ND_IOCTL, ND_CMD_DIMM_FLAGS,\ + struct nd_cmd_dimm_flags) + +#define ND_IOCTL_GET_CONFIG_SIZE _IOWR(ND_IOCTL, ND_CMD_GET_CONFIG_SIZE,\ + struct nd_cmd_get_config_size) + +#define ND_IOCTL_GET_CONFIG_DATA _IOWR(ND_IOCTL, ND_CMD_GET_CONFIG_DATA,\ + struct nd_cmd_get_config_data_hdr) + +#define ND_IOCTL_SET_CONFIG_DATA _IOWR(ND_IOCTL, ND_CMD_SET_CONFIG_DATA,\ + struct nd_cmd_set_config_hdr) + +#define ND_IOCTL_VENDOR _IOWR(ND_IOCTL, ND_CMD_VENDOR,\ + struct nd_cmd_vendor_hdr) + +#define ND_IOCTL_ARS_CAP _IOWR(ND_IOCTL, ND_CMD_ARS_CAP,\ + struct nd_cmd_ars_cap) + +#define ND_IOCTL_ARS_START _IOWR(ND_IOCTL, ND_CMD_ARS_START,\ + struct nd_cmd_ars_start) + +#define ND_IOCTL_ARS_STATUS _IOWR(ND_IOCTL, ND_CMD_ARS_STATUS,\ + struct nd_cmd_ars_status) + +#endif /* __NDCTL_H__ */ -- cgit From 4d88a97aa9e8cfa6460aab119c5da60ad2267423 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 31 May 2015 14:41:48 -0400 Subject: libnvdimm, nvdimm: dimm driver and base libnvdimm device-driver infrastructure * Implement the device-model infrastructure for loading modules and attaching drivers to nvdimm devices. This is a simple association of a nd-device-type number with a driver that has a bitmask of supported device types. To facilitate userspace bind/unbind operations 'modalias' and 'devtype', that also appear in the uevent, are added as generic sysfs attributes for all nvdimm devices. The reason for the device-type number is to support sub-types within a given parent devtype, be it a vendor-specific sub-type or otherwise. * The first consumer of this infrastructure is the driver for dimm devices. It simply uses control messages to retrieve and store the configuration-data image (label set) from each dimm. Note: nd_device_register() arranges for asynchronous registration of nvdimm bus devices by default. Cc: Greg KH Cc: Neil Brown Acked-by: Christoph Hellwig Tested-by: Toshi Kani Signed-off-by: Dan Williams --- include/linux/libnvdimm.h | 2 ++ include/linux/nd.h | 39 +++++++++++++++++++++++++++++++++++++++ include/uapi/linux/ndctl.h | 6 ++++++ 3 files changed, 47 insertions(+) create mode 100644 include/linux/nd.h (limited to 'include') diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index a39235819af3..d3ebccf4ea8b 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -30,6 +30,7 @@ enum { extern struct attribute_group nvdimm_bus_attribute_group; extern struct attribute_group nvdimm_attribute_group; +extern struct attribute_group nd_device_attribute_group; struct nvdimm; struct nvdimm_bus_descriptor; @@ -71,4 +72,5 @@ u32 nd_cmd_in_size(struct nvdimm *nvdimm, int cmd, u32 nd_cmd_out_size(struct nvdimm *nvdimm, int cmd, const struct nd_cmd_desc *desc, int idx, const u32 *in_field, const u32 *out_field); +int nvdimm_bus_check_dimm_count(struct nvdimm_bus *nvdimm_bus, int dimm_count); #endif /* __LIBNVDIMM_H__ */ diff --git a/include/linux/nd.h b/include/linux/nd.h new file mode 100644 index 000000000000..e074f67e53a3 --- /dev/null +++ b/include/linux/nd.h @@ -0,0 +1,39 @@ +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#ifndef __LINUX_ND_H__ +#define __LINUX_ND_H__ +#include +#include + +struct nd_device_driver { + struct device_driver drv; + unsigned long type; + int (*probe)(struct device *dev); + int (*remove)(struct device *dev); +}; + +static inline struct nd_device_driver *to_nd_device_driver( + struct device_driver *drv) +{ + return container_of(drv, struct nd_device_driver, drv); +} + +#define MODULE_ALIAS_ND_DEVICE(type) \ + MODULE_ALIAS("nd:t" __stringify(type) "*") +#define ND_DEVICE_MODALIAS_FMT "nd:t%d" + +int __must_check __nd_driver_register(struct nd_device_driver *nd_drv, + struct module *module, const char *mod_name); +#define nd_driver_register(driver) \ + __nd_driver_register(driver, THIS_MODULE, KBUILD_MODNAME) +#endif /* __LINUX_ND_H__ */ diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h index ff13c23b26df..37640916d146 100644 --- a/include/uapi/linux/ndctl.h +++ b/include/uapi/linux/ndctl.h @@ -175,4 +175,10 @@ static inline const char *nvdimm_cmd_name(unsigned cmd) #define ND_IOCTL_ARS_STATUS _IOWR(ND_IOCTL, ND_CMD_ARS_STATUS,\ struct nd_cmd_ars_status) + +#define ND_DEVICE_DIMM 1 /* nd_dimm: container for "config data" */ + +enum nd_driver_flags { + ND_DRIVER_DIMM = 1 << ND_DEVICE_DIMM, +}; #endif /* __NDCTL_H__ */ -- cgit From 1f7df6f88b9245a7f2d0f8ecbc97dc88c8d0d8e1 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 9 Jun 2015 20:13:14 -0400 Subject: libnvdimm, nfit: regions (block-data-window, persistent memory, volatile memory) A "region" device represents the maximum capacity of a BLK range (mmio block-data-window(s)), or a PMEM range (DAX-capable persistent memory or volatile memory), without regard for aliasing. Aliasing, in the dimm-local address space (DPA), is resolved by metadata on a dimm to designate which exclusive interface will access the aliased DPA ranges. Support for the per-dimm metadata/label arrvies is in a subsequent patch. The name format of "region" devices is "regionN" where, like dimms, N is a global ida index assigned at discovery time. This id is not reliable across reboots nor in the presence of hotplug. Look to attributes of the region or static id-data of the sub-namespace to generate a persistent name. However, if the platform configuration does not change it is reasonable to expect the same region id to be assigned at the next boot. "region"s have 2 generic attributes "size", and "mapping"s where: - size: the BLK accessible capacity or the span of the system physical address range in the case of PMEM. - mappingN: a tuple describing a dimm's contribution to the region's capacity in the format (,,). For a PMEM-region there will be at least one mapping per dimm in the interleave set. For a BLK-region there is only "mapping0" listing the starting DPA of the BLK-region and the available DPA capacity of that space (matches "size" above). The max number of mappings per "region" is hard coded per the constraints of sysfs attribute groups. That said the number of mappings per region should never exceed the maximum number of possible dimms in the system. If the current number turns out to not be enough then the "mappings" attribute clarifies how many there are supposed to be. "32 should be enough for anybody...". Cc: Neil Brown Cc: Cc: Greg KH Cc: Robert Moore Cc: Rafael J. Wysocki Acked-by: Christoph Hellwig Acked-by: Rafael J. Wysocki Tested-by: Toshi Kani Signed-off-by: Dan Williams --- include/linux/libnvdimm.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include') diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index d3ebccf4ea8b..39e7e606092a 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -26,11 +26,14 @@ enum { ND_CMD_MAX_ELEM = 4, ND_CMD_MAX_ENVELOPE = 16, ND_CMD_ARS_STATUS_MAX = SZ_4K, + ND_MAX_MAPPINGS = 32, }; extern struct attribute_group nvdimm_bus_attribute_group; extern struct attribute_group nvdimm_attribute_group; extern struct attribute_group nd_device_attribute_group; +extern struct attribute_group nd_region_attribute_group; +extern struct attribute_group nd_mapping_attribute_group; struct nvdimm; struct nvdimm_bus_descriptor; @@ -38,6 +41,12 @@ typedef int (*ndctl_fn)(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm, unsigned int cmd, void *buf, unsigned int buf_len); +struct nd_mapping { + struct nvdimm *nvdimm; + u64 start; + u64 size; +}; + struct nvdimm_bus_descriptor { const struct attribute_group **attr_groups; unsigned long dsm_mask; @@ -52,6 +61,14 @@ struct nd_cmd_desc { int out_sizes[ND_CMD_MAX_ELEM]; }; +struct nd_region_desc { + struct resource *res; + struct nd_mapping *nd_mapping; + u16 num_mappings; + const struct attribute_group **attr_groups; + void *provider_data; +}; + struct nvdimm_bus; struct device; struct nvdimm_bus *nvdimm_bus_register(struct device *parent, @@ -59,9 +76,11 @@ struct nvdimm_bus *nvdimm_bus_register(struct device *parent, void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus); struct nvdimm_bus *to_nvdimm_bus(struct device *dev); struct nvdimm *to_nvdimm(struct device *dev); +struct nd_region *to_nd_region(struct device *dev); struct nvdimm_bus_descriptor *to_nd_desc(struct nvdimm_bus *nvdimm_bus); const char *nvdimm_name(struct nvdimm *nvdimm); void *nvdimm_provider_data(struct nvdimm *nvdimm); +void *nd_region_provider_data(struct nd_region *nd_region); struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data, const struct attribute_group **groups, unsigned long flags, unsigned long *dsm_mask); @@ -73,4 +92,10 @@ u32 nd_cmd_out_size(struct nvdimm *nvdimm, int cmd, const struct nd_cmd_desc *desc, int idx, const u32 *in_field, const u32 *out_field); int nvdimm_bus_check_dimm_count(struct nvdimm_bus *nvdimm_bus, int dimm_count); +struct nd_region *nvdimm_pmem_region_create(struct nvdimm_bus *nvdimm_bus, + struct nd_region_desc *ndr_desc); +struct nd_region *nvdimm_blk_region_create(struct nvdimm_bus *nvdimm_bus, + struct nd_region_desc *ndr_desc); +struct nd_region *nvdimm_volatile_region_create(struct nvdimm_bus *nvdimm_bus, + struct nd_region_desc *ndr_desc); #endif /* __LIBNVDIMM_H__ */ -- cgit From 3d88002e4a7bd40f355550284c6cd140e6fe29dc Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 31 May 2015 15:02:11 -0400 Subject: libnvdimm: support for legacy (non-aliasing) nvdimms The libnvdimm region driver is an intermediary driver that translates non-volatile "region"s into "namespace" sub-devices that are surfaced by persistent memory block-device drivers (PMEM and BLK). ACPI 6 introduces the concept that a given nvdimm may simultaneously offer multiple access modes to its media through direct PMEM load/store access, or windowed BLK mode. Existing nvdimms mostly implement a PMEM interface, some offer a BLK-like mode, but never both as ACPI 6 defines. If an nvdimm is single interfaced, then there is no need for dimm metadata labels. For these devices we can take the region boundaries directly to create a child namespace device (nd_namespace_io). Acked-by: Christoph Hellwig Tested-by: Toshi Kani Signed-off-by: Dan Williams --- include/linux/libnvdimm.h | 7 +++++-- include/linux/nd.h | 10 ++++++++++ include/uapi/linux/ndctl.h | 10 ++++++++++ 3 files changed, 25 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 39e7e606092a..37f966aff386 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -71,8 +71,11 @@ struct nd_region_desc { struct nvdimm_bus; struct device; -struct nvdimm_bus *nvdimm_bus_register(struct device *parent, - struct nvdimm_bus_descriptor *nfit_desc); +struct module; +struct nvdimm_bus *__nvdimm_bus_register(struct device *parent, + struct nvdimm_bus_descriptor *nfit_desc, struct module *module); +#define nvdimm_bus_register(parent, desc) \ + __nvdimm_bus_register(parent, desc, THIS_MODULE) void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus); struct nvdimm_bus *to_nvdimm_bus(struct device *dev); struct nvdimm *to_nvdimm(struct device *dev); diff --git a/include/linux/nd.h b/include/linux/nd.h index e074f67e53a3..da70e9962197 100644 --- a/include/linux/nd.h +++ b/include/linux/nd.h @@ -26,6 +26,16 @@ static inline struct nd_device_driver *to_nd_device_driver( struct device_driver *drv) { return container_of(drv, struct nd_device_driver, drv); +}; + +struct nd_namespace_io { + struct device dev; + struct resource res; +}; + +static inline struct nd_namespace_io *to_nd_namespace_io(struct device *dev) +{ + return container_of(dev, struct nd_namespace_io, dev); } #define MODULE_ALIAS_ND_DEVICE(type) \ diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h index 37640916d146..174b6371dcc1 100644 --- a/include/uapi/linux/ndctl.h +++ b/include/uapi/linux/ndctl.h @@ -177,8 +177,18 @@ static inline const char *nvdimm_cmd_name(unsigned cmd) #define ND_DEVICE_DIMM 1 /* nd_dimm: container for "config data" */ +#define ND_DEVICE_REGION_PMEM 2 /* nd_region: (parent of PMEM namespaces) */ +#define ND_DEVICE_REGION_BLK 3 /* nd_region: (parent of BLK namespaces) */ +#define ND_DEVICE_NAMESPACE_IO 4 /* legacy persistent memory */ +#define ND_DEVICE_NAMESPACE_PMEM 5 /* PMEM namespace (may alias with BLK) */ +#define ND_DEVICE_NAMESPACE_BLK 6 /* BLK namespace (may alias with PMEM) */ enum nd_driver_flags { ND_DRIVER_DIMM = 1 << ND_DEVICE_DIMM, + ND_DRIVER_REGION_PMEM = 1 << ND_DEVICE_REGION_PMEM, + ND_DRIVER_REGION_BLK = 1 << ND_DEVICE_REGION_BLK, + ND_DRIVER_NAMESPACE_IO = 1 << ND_DEVICE_NAMESPACE_IO, + ND_DRIVER_NAMESPACE_PMEM = 1 << ND_DEVICE_NAMESPACE_PMEM, + ND_DRIVER_NAMESPACE_BLK = 1 << ND_DEVICE_NAMESPACE_BLK, }; #endif /* __NDCTL_H__ */ -- cgit From eaf961536e1622ad21247ac8d44acd48ba65566e Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 1 May 2015 13:11:27 -0400 Subject: libnvdimm, nfit: add interleave-set state-tracking infrastructure On platforms that have firmware support for reading/writing per-dimm label space, a portion of the dimm may be accessible via an interleave set PMEM mapping in addition to the dimm's BLK (block-data-window aperture(s)) interface. A label, stored in a "configuration data region" on the dimm, disambiguates which dimm addresses are accessed through which exclusive interface. Add infrastructure that allows the kernel to block modifications to a label in the set while any member dimm is active. Note that this is meant only for enforcing "no modifications of active labels" via the coarse ioctl command. Adding/deleting namespaces from an active interleave set is always possible via sysfs. Another aspect of tracking interleave sets is tracking their integrity when DIMMs in a set are physically re-ordered. For this purpose we generate an "interleave-set cookie" that can be recorded in a label and validated against the current configuration. It is the bus provider implementation's responsibility to calculate the interleave set cookie and attach it to a given region. Cc: Neil Brown Cc: Cc: Greg KH Cc: Robert Moore Cc: Rafael J. Wysocki Acked-by: Christoph Hellwig Acked-by: Rafael J. Wysocki Signed-off-by: Dan Williams --- include/linux/libnvdimm.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 37f966aff386..1b627b109360 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -61,11 +61,16 @@ struct nd_cmd_desc { int out_sizes[ND_CMD_MAX_ELEM]; }; +struct nd_interleave_set { + u64 cookie; +}; + struct nd_region_desc { struct resource *res; struct nd_mapping *nd_mapping; u16 num_mappings; const struct attribute_group **attr_groups; + struct nd_interleave_set *nd_set; void *provider_data; }; @@ -101,4 +106,5 @@ struct nd_region *nvdimm_blk_region_create(struct nvdimm_bus *nvdimm_bus, struct nd_region_desc *ndr_desc); struct nd_region *nvdimm_volatile_region_create(struct nvdimm_bus *nvdimm_bus, struct nd_region_desc *ndr_desc); +u64 nd_fletcher64(void *addr, size_t len, bool le); #endif /* __LIBNVDIMM_H__ */ -- cgit From 4a826c83db4edc040da3a66dbefd53f0cfcf457d Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 9 Jun 2015 16:09:36 -0400 Subject: libnvdimm: namespace indices: read and validate This on media label format [1] consists of two index blocks followed by an array of labels. None of these structures are ever updated in place. A sequence number tracks the current active index and the next one to write, while labels are written to free slots. +------------+ | | | nsindex0 | | | +------------+ | | | nsindex1 | | | +------------+ | label0 | +------------+ | label1 | +------------+ | | ....nslot... | | +------------+ | labelN | +------------+ After reading valid labels, store the dpa ranges they claim into per-dimm resource trees. [1]: http://pmem.io/documents/NVDIMM_Namespace_Spec.pdf Cc: Neil Brown Acked-by: Christoph Hellwig Signed-off-by: Dan Williams --- include/uapi/linux/ndctl.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h index 174b6371dcc1..1357a87b8714 100644 --- a/include/uapi/linux/ndctl.h +++ b/include/uapi/linux/ndctl.h @@ -175,7 +175,6 @@ static inline const char *nvdimm_cmd_name(unsigned cmd) #define ND_IOCTL_ARS_STATUS _IOWR(ND_IOCTL, ND_CMD_ARS_STATUS,\ struct nd_cmd_ars_status) - #define ND_DEVICE_DIMM 1 /* nd_dimm: container for "config data" */ #define ND_DEVICE_REGION_PMEM 2 /* nd_region: (parent of PMEM namespaces) */ #define ND_DEVICE_REGION_BLK 3 /* nd_region: (parent of BLK namespaces) */ -- cgit From bf9bccc14c05dae8caba29df6187c731710f5380 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 17 Jun 2015 17:14:46 -0400 Subject: libnvdimm: pmem label sets and namespace instantiation. A complete label set is a PMEM-label per-dimm per-interleave-set where all the UUIDs match and the interleave set cookie matches the hosting interleave set. Present sysfs attributes for manipulation of a PMEM-namespace's 'alt_name', 'uuid', and 'size' attributes. A later patch will make these settings persistent by writing back the label. Note that PMEM allocations grow forwards from the start of an interleave set (lowest dimm-physical-address (DPA)). BLK-namespaces that alias with a PMEM interleave set will grow allocations backward from the highest DPA. Cc: Greg KH Cc: Neil Brown Acked-by: Christoph Hellwig Signed-off-by: Dan Williams --- include/linux/libnvdimm.h | 10 ++++++++++ include/linux/nd.h | 24 ++++++++++++++++++++++++ include/uapi/linux/ndctl.h | 4 ++++ 3 files changed, 38 insertions(+) (limited to 'include') diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 1b627b109360..c130972e08c4 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -41,10 +41,20 @@ typedef int (*ndctl_fn)(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm, unsigned int cmd, void *buf, unsigned int buf_len); +struct nd_namespace_label; +struct nvdimm_drvdata; struct nd_mapping { struct nvdimm *nvdimm; + struct nd_namespace_label **labels; u64 start; u64 size; + /* + * @ndd is for private use at region enable / disable time for + * get_ndd() + put_ndd(), all other nd_mapping to ndd + * conversions use to_ndd() which respects enabled state of the + * nvdimm. + */ + struct nvdimm_drvdata *ndd; }; struct nvdimm_bus_descriptor { diff --git a/include/linux/nd.h b/include/linux/nd.h index da70e9962197..255c38a83083 100644 --- a/include/linux/nd.h +++ b/include/linux/nd.h @@ -28,16 +28,40 @@ static inline struct nd_device_driver *to_nd_device_driver( return container_of(drv, struct nd_device_driver, drv); }; +/** + * struct nd_namespace_io - infrastructure for loading an nd_pmem instance + * @dev: namespace device created by the nd region driver + * @res: struct resource conversion of a NFIT SPA table + */ struct nd_namespace_io { struct device dev; struct resource res; }; +/** + * struct nd_namespace_pmem - namespace device for dimm-backed interleaved memory + * @nsio: device and system physical address range to drive + * @alt_name: namespace name supplied in the dimm label + * @uuid: namespace name supplied in the dimm label + */ +struct nd_namespace_pmem { + struct nd_namespace_io nsio; + char *alt_name; + u8 *uuid; +}; + static inline struct nd_namespace_io *to_nd_namespace_io(struct device *dev) { return container_of(dev, struct nd_namespace_io, dev); } +static inline struct nd_namespace_pmem *to_nd_namespace_pmem(struct device *dev) +{ + struct nd_namespace_io *nsio = to_nd_namespace_io(dev); + + return container_of(nsio, struct nd_namespace_pmem, nsio); +} + #define MODULE_ALIAS_ND_DEVICE(type) \ MODULE_ALIAS("nd:t" __stringify(type) "*") #define ND_DEVICE_MODALIAS_FMT "nd:t%d" diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h index 1357a87b8714..2b94ea2287bb 100644 --- a/include/uapi/linux/ndctl.h +++ b/include/uapi/linux/ndctl.h @@ -190,4 +190,8 @@ enum nd_driver_flags { ND_DRIVER_NAMESPACE_PMEM = 1 << ND_DEVICE_NAMESPACE_PMEM, ND_DRIVER_NAMESPACE_BLK = 1 << ND_DEVICE_NAMESPACE_BLK, }; + +enum { + ND_MIN_NAMESPACE_SIZE = 0x00400000, +}; #endif /* __NDCTL_H__ */ -- cgit From 1b40e09a1232de537b193fa1b6b3ef16d3a1e397 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 1 May 2015 13:34:01 -0400 Subject: libnvdimm: blk labels and namespace instantiation A blk label set describes a namespace comprised of one or more discontiguous dpa ranges on a single dimm. They may alias with one or more pmem interleave sets that include the given dimm. This is the runtime/volatile configuration infrastructure for sysfs manipulation of 'alt_name', 'uuid', 'size', and 'sector_size'. A later patch will make these settings persistent by writing back the label(s). Unlike pmem namespaces, multiple blk namespaces can be created per region. Once a blk namespace has been created a new seed device (unconfigured child of a parent blk region) is instantiated. As long as a region has 'available_size' != 0 new child namespaces may be created. Cc: Greg KH Cc: Neil Brown Acked-by: Christoph Hellwig Signed-off-by: Dan Williams --- include/linux/libnvdimm.h | 3 +++ include/linux/nd.h | 25 +++++++++++++++++++++++++ 2 files changed, 28 insertions(+) (limited to 'include') diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index c130972e08c4..a59dca17b3aa 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -27,6 +27,9 @@ enum { ND_CMD_MAX_ENVELOPE = 16, ND_CMD_ARS_STATUS_MAX = SZ_4K, ND_MAX_MAPPINGS = 32, + + /* mark newly adjusted resources as requiring a label update */ + DPA_RESOURCE_ADJUSTED = 1 << 0, }; extern struct attribute_group nvdimm_bus_attribute_group; diff --git a/include/linux/nd.h b/include/linux/nd.h index 255c38a83083..23276ea91690 100644 --- a/include/linux/nd.h +++ b/include/linux/nd.h @@ -50,6 +50,26 @@ struct nd_namespace_pmem { u8 *uuid; }; +/** + * struct nd_namespace_blk - namespace for dimm-bounded persistent memory + * @dev: namespace device creation by the nd region driver + * @alt_name: namespace name supplied in the dimm label + * @uuid: namespace name supplied in the dimm label + * @id: ida allocated id + * @lbasize: blk namespaces have a native sector size when btt not present + * @num_resources: number of dpa extents to claim + * @res: discontiguous dpa extents for given dimm + */ +struct nd_namespace_blk { + struct device dev; + char *alt_name; + u8 *uuid; + int id; + unsigned long lbasize; + int num_resources; + struct resource **res; +}; + static inline struct nd_namespace_io *to_nd_namespace_io(struct device *dev) { return container_of(dev, struct nd_namespace_io, dev); @@ -62,6 +82,11 @@ static inline struct nd_namespace_pmem *to_nd_namespace_pmem(struct device *dev) return container_of(nsio, struct nd_namespace_pmem, nsio); } +static inline struct nd_namespace_blk *to_nd_namespace_blk(struct device *dev) +{ + return container_of(dev, struct nd_namespace_blk, dev); +} + #define MODULE_ALIAS_ND_DEVICE(type) \ MODULE_ALIAS("nd:t" __stringify(type) "*") #define ND_DEVICE_MODALIAS_FMT "nd:t%d" -- cgit From 8c2f7e8658df1d3b7cbfa62706941d14c715823a Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 25 Jun 2015 04:20:04 -0400 Subject: libnvdimm: infrastructure for btt devices NVDIMM namespaces, in addition to accepting "struct bio" based requests, also have the capability to perform byte-aligned accesses. By default only the bio/block interface is used. However, if another driver can make effective use of the byte-aligned capability it can claim namespace interface and use the byte-aligned ->rw_bytes() interface. The BTT driver is the initial first consumer of this mechanism to allow adding atomic sector update semantics to a pmem or blk namespace. This patch is the sysfs infrastructure to allow configuring a BTT instance for a namespace. Enabling that BTT and performing i/o is in a subsequent patch. Cc: Greg KH Cc: Neil Brown Signed-off-by: Dan Williams --- include/linux/nd.h | 63 +++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 58 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/nd.h b/include/linux/nd.h index 23276ea91690..507e47c86737 100644 --- a/include/linux/nd.h +++ b/include/linux/nd.h @@ -12,6 +12,7 @@ */ #ifndef __LINUX_ND_H__ #define __LINUX_ND_H__ +#include #include #include @@ -28,13 +29,33 @@ static inline struct nd_device_driver *to_nd_device_driver( return container_of(drv, struct nd_device_driver, drv); }; +/** + * struct nd_namespace_common - core infrastructure of a namespace + * @force_raw: ignore other personalities for the namespace (e.g. btt) + * @dev: device model node + * @claim: when set a another personality has taken ownership of the namespace + * @rw_bytes: access the raw namespace capacity with byte-aligned transfers + */ +struct nd_namespace_common { + int force_raw; + struct device dev; + struct device *claim; + int (*rw_bytes)(struct nd_namespace_common *, resource_size_t offset, + void *buf, size_t size, int rw); +}; + +static inline struct nd_namespace_common *to_ndns(struct device *dev) +{ + return container_of(dev, struct nd_namespace_common, dev); +} + /** * struct nd_namespace_io - infrastructure for loading an nd_pmem instance * @dev: namespace device created by the nd region driver * @res: struct resource conversion of a NFIT SPA table */ struct nd_namespace_io { - struct device dev; + struct nd_namespace_common common; struct resource res; }; @@ -52,7 +73,6 @@ struct nd_namespace_pmem { /** * struct nd_namespace_blk - namespace for dimm-bounded persistent memory - * @dev: namespace device creation by the nd region driver * @alt_name: namespace name supplied in the dimm label * @uuid: namespace name supplied in the dimm label * @id: ida allocated id @@ -61,7 +81,7 @@ struct nd_namespace_pmem { * @res: discontiguous dpa extents for given dimm */ struct nd_namespace_blk { - struct device dev; + struct nd_namespace_common common; char *alt_name; u8 *uuid; int id; @@ -72,7 +92,7 @@ struct nd_namespace_blk { static inline struct nd_namespace_io *to_nd_namespace_io(struct device *dev) { - return container_of(dev, struct nd_namespace_io, dev); + return container_of(dev, struct nd_namespace_io, common.dev); } static inline struct nd_namespace_pmem *to_nd_namespace_pmem(struct device *dev) @@ -84,7 +104,40 @@ static inline struct nd_namespace_pmem *to_nd_namespace_pmem(struct device *dev) static inline struct nd_namespace_blk *to_nd_namespace_blk(struct device *dev) { - return container_of(dev, struct nd_namespace_blk, dev); + return container_of(dev, struct nd_namespace_blk, common.dev); +} + +/** + * nvdimm_read_bytes() - synchronously read bytes from an nvdimm namespace + * @ndns: device to read + * @offset: namespace-relative starting offset + * @buf: buffer to fill + * @size: transfer length + * + * @buf is up-to-date upon return from this routine. + */ +static inline int nvdimm_read_bytes(struct nd_namespace_common *ndns, + resource_size_t offset, void *buf, size_t size) +{ + return ndns->rw_bytes(ndns, offset, buf, size, READ); +} + +/** + * nvdimm_write_bytes() - synchronously write bytes to an nvdimm namespace + * @ndns: device to read + * @offset: namespace-relative starting offset + * @buf: buffer to drain + * @size: transfer length + * + * NVDIMM Namepaces disks do not implement sectors internally. Depending on + * the @ndns, the contents of @buf may be in cpu cache, platform buffers, + * or on backing memory media upon return from this routine. Flushing + * to media is handled internal to the @ndns driver, if at all. + */ +static inline int nvdimm_write_bytes(struct nd_namespace_common *ndns, + resource_size_t offset, void *buf, size_t size) +{ + return ndns->rw_bytes(ndns, offset, buf, size, WRITE); } #define MODULE_ALIAS_ND_DEVICE(type) \ -- cgit From 5212e11fde4d40fa627668b4f2222d20db488f71 Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Thu, 25 Jun 2015 04:20:32 -0400 Subject: nd_btt: atomic sector updates BTT stands for Block Translation Table, and is a way to provide power fail sector atomicity semantics for block devices that have the ability to perform byte granularity IO. It relies on the capability of libnvdimm namespace devices to do byte aligned IO. The BTT works as a stacked blocked device, and reserves a chunk of space from the backing device for its accounting metadata. It is a bio-based driver because all IO is done synchronously, and there is no queuing or asynchronous completions at either the device or the driver level. The BTT uses 'lanes' to index into various 'on-disk' data structures, and lanes also act as a synchronization mechanism in case there are more CPUs than available lanes. We did a comparison between two lane lock strategies - first where we kept an atomic counter around that tracked which was the last lane that was used, and 'our' lane was determined by atomically incrementing that. That way, for the nr_cpus > nr_lanes case, theoretically, no CPU would be blocked waiting for a lane. The other strategy was to use the cpu number we're scheduled on to and hash it to a lane number. Theoretically, this could block an IO that could've otherwise run using a different, free lane. But some fio workloads showed that the direct cpu -> lane hash performed faster than tracking 'last lane' - my reasoning is the cache thrash caused by moving the atomic variable made that approach slower than simply waiting out the in-progress IO. This supports the conclusion that the driver can be a very simple bio-based one that does synchronous IOs instead of queuing. Cc: Andy Lutomirski Cc: Boaz Harrosh Cc: H. Peter Anvin Cc: Jens Axboe Cc: Ingo Molnar Cc: Christoph Hellwig Cc: Neil Brown Cc: Jeff Moyer Cc: Dave Chinner Cc: Greg KH [jmoyer: fix nmi watchdog timeout in btt_map_init] [jmoyer: move btt initialization to module load path] [jmoyer: fix memory leak in the btt initialization path] [jmoyer: Don't overwrite corrupted arenas] Signed-off-by: Vishal Verma Signed-off-by: Dan Williams --- include/linux/libnvdimm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index a59dca17b3aa..531d99dfac68 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -85,6 +85,7 @@ struct nd_region_desc { const struct attribute_group **attr_groups; struct nd_interleave_set *nd_set; void *provider_data; + int num_lanes; }; struct nvdimm_bus; -- cgit From 047fc8a1f9a6330eacc80374dff087e20dc2304b Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Thu, 25 Jun 2015 04:21:02 -0400 Subject: libnvdimm, nfit, nd_blk: driver for BLK-mode access persistent memory The libnvdimm implementation handles allocating dimm address space (DPA) between PMEM and BLK mode interfaces. After DPA has been allocated from a BLK-region to a BLK-namespace the nd_blk driver attaches to handle I/O as a struct bio based block device. Unlike PMEM, BLK is required to handle platform specific details like mmio register formats and memory controller interleave. For this reason the libnvdimm generic nd_blk driver calls back into the bus provider to carry out the I/O. This initial implementation handles the BLK interface defined by the ACPI 6 NFIT [1] and the NVDIMM DSM Interface Example [2] composed from DCR (dimm control region), BDW (block data window), IDT (interleave descriptor) NFIT structures and the hardware register format. [1]: http://www.uefi.org/sites/default/files/resources/ACPI_6.0.pdf [2]: http://pmem.io/documents/NVDIMM_DSM_Interface_Example.pdf Cc: Andy Lutomirski Cc: Boaz Harrosh Cc: H. Peter Anvin Cc: Jens Axboe Cc: Ingo Molnar Cc: Christoph Hellwig Signed-off-by: Ross Zwisler Acked-by: Rafael J. Wysocki Signed-off-by: Dan Williams --- include/linux/libnvdimm.h | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 531d99dfac68..7fc1b25bdb5d 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -14,6 +14,7 @@ */ #ifndef __LIBNVDIMM_H__ #define __LIBNVDIMM_H__ +#include #include #include @@ -89,8 +90,24 @@ struct nd_region_desc { }; struct nvdimm_bus; -struct device; struct module; +struct device; +struct nd_blk_region; +struct nd_blk_region_desc { + int (*enable)(struct nvdimm_bus *nvdimm_bus, struct device *dev); + void (*disable)(struct nvdimm_bus *nvdimm_bus, struct device *dev); + int (*do_io)(struct nd_blk_region *ndbr, resource_size_t dpa, + void *iobuf, u64 len, int rw); + struct nd_region_desc ndr_desc; +}; + +static inline struct nd_blk_region_desc *to_blk_region_desc( + struct nd_region_desc *ndr_desc) +{ + return container_of(ndr_desc, struct nd_blk_region_desc, ndr_desc); + +} + struct nvdimm_bus *__nvdimm_bus_register(struct device *parent, struct nvdimm_bus_descriptor *nfit_desc, struct module *module); #define nvdimm_bus_register(parent, desc) \ @@ -99,10 +116,10 @@ void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus); struct nvdimm_bus *to_nvdimm_bus(struct device *dev); struct nvdimm *to_nvdimm(struct device *dev); struct nd_region *to_nd_region(struct device *dev); +struct nd_blk_region *to_nd_blk_region(struct device *dev); struct nvdimm_bus_descriptor *to_nd_desc(struct nvdimm_bus *nvdimm_bus); const char *nvdimm_name(struct nvdimm *nvdimm); void *nvdimm_provider_data(struct nvdimm *nvdimm); -void *nd_region_provider_data(struct nd_region *nd_region); struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data, const struct attribute_group **groups, unsigned long flags, unsigned long *dsm_mask); @@ -120,5 +137,11 @@ struct nd_region *nvdimm_blk_region_create(struct nvdimm_bus *nvdimm_bus, struct nd_region_desc *ndr_desc); struct nd_region *nvdimm_volatile_region_create(struct nvdimm_bus *nvdimm_bus, struct nd_region_desc *ndr_desc); +void *nd_region_provider_data(struct nd_region *nd_region); +void *nd_blk_region_provider_data(struct nd_blk_region *ndbr); +void nd_blk_region_set_provider_data(struct nd_blk_region *ndbr, void *data); +struct nvdimm *nd_blk_region_to_dimm(struct nd_blk_region *ndbr); +unsigned int nd_region_acquire_lane(struct nd_region *nd_region); +void nd_region_release_lane(struct nd_region *nd_region, unsigned int lane); u64 nd_fletcher64(void *addr, size_t len, bool le); #endif /* __LIBNVDIMM_H__ */ -- cgit From 581388209405902b56d055f644b4dd124a206112 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 23 Jun 2015 20:08:34 -0400 Subject: libnvdimm, nfit: handle unarmed dimms, mark namespaces read-only Upon detection of an unarmed dimm in a region, arrange for descendant BTT, PMEM, or BLK instances to be read-only. A dimm is primarily marked "unarmed" via flags passed by platform firmware (NFIT). The flags in the NFIT memory device sub-structure indicate the state of the data on the nvdimm relative to its energy source or last "flush to persistence". For the most part there is nothing the driver can do but advertise the state of these flags in sysfs and emit a message if firmware indicates that the contents of the device may be corrupted. However, for the case of ACPI_NFIT_MEM_ARMED, the driver can arrange for the block devices incorporating that nvdimm to be marked read-only. This is a safe default as the data is still available and new writes are held off until the administrator either forces read-write mode, or the energy source becomes armed. A 'read_only' attribute is added to REGION devices to allow for overriding the default read-only policy of all descendant block devices. Signed-off-by: Dan Williams --- include/linux/libnvdimm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 7fc1b25bdb5d..dc799a29ed1a 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -21,6 +21,8 @@ enum { /* when a dimm supports both PMEM and BLK access a label is required */ NDD_ALIASING = 1 << 0, + /* unarmed memory devices may not persist writes */ + NDD_UNARMED = 1 << 1, /* need to set a limit somewhere, but yes, this is likely overkill */ ND_IOCTL_MAX_BUFLEN = SZ_4M, -- cgit From 99759869faf15471cfce251bc138848d8af7d162 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Fri, 19 Jun 2015 17:14:15 -0600 Subject: acpi: Add acpi_map_pxm_to_online_node() The kernel initializes CPU & memory's NUMA topology from ACPI SRAT table. Some other ACPI tables, such as NFIT and DMAR, also contain proximity IDs for their device's NUMA topology. This information can be used to improve performance of these devices. This patch introduces acpi_map_pxm_to_online_node(), which is similar to acpi_map_pxm_to_node(), but always returns an online node. When the mapped node from a given proximity ID is offline, it looks up the node distance table and returns the nearest online node. ACPI device drivers, which are called after the NUMA initialization has completed in the kernel, can call this interface to obtain their device NUMA topology from ACPI tables. Such drivers do not have to deal with offline nodes. A node may be offline when a device proximity ID is unique, SRAT memory entry does not exist, or NUMA is disabled, ex. "numa=off" on x86. This patch also moves the pxm range check from acpi_get_node() to acpi_map_pxm_to_node(). Signed-off-by: Toshi Kani Acked-by: Rafael J. Wysocki > Signed-off-by: Dan Williams --- include/linux/acpi.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index e4da5e35e29c..1b3bbb11d11c 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -289,8 +289,13 @@ extern void acpi_dmi_osi_linux(int enable, const struct dmi_system_id *d); extern void acpi_osi_setup(char *str); #ifdef CONFIG_ACPI_NUMA +int acpi_map_pxm_to_online_node(int pxm); int acpi_get_node(acpi_handle handle); #else +static inline int acpi_map_pxm_to_online_node(int pxm) +{ + return 0; +} static inline int acpi_get_node(acpi_handle handle) { return 0; -- cgit From 41d7a6d637e1440f5410cb43c25a3c41255540c5 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Fri, 19 Jun 2015 12:18:33 -0600 Subject: libnvdimm: Set numa_node to NVDIMM devices ACPI NFIT table has System Physical Address Range Structure entries that describe a proximity ID of each range when ACPI_NFIT_PROXIMITY_VALID is set in the flags. Change acpi_nfit_register_region() to map a proximity ID to its node ID, and set it to a new numa_node field of nd_region_desc, which is then conveyed to the nd_region device. The device core arranges for btt and namespace devices to inherit their node from their parent region. Signed-off-by: Toshi Kani [djbw: move set_dev_node() from region.c to bus.c] Signed-off-by: Dan Williams --- include/linux/libnvdimm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index dc799a29ed1a..30b3deaafd51 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -89,6 +89,7 @@ struct nd_region_desc { struct nd_interleave_set *nd_set; void *provider_data; int num_lanes; + int numa_node; }; struct nvdimm_bus; -- cgit From 74ae66c3b14ffa94c8d2dea201cdf8e6203d13d5 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Fri, 19 Jun 2015 12:18:34 -0600 Subject: libnvdimm: Add sysfs numa_node to NVDIMM devices Add support of sysfs 'numa_node' to I/O-related NVDIMM devices under /sys/bus/nd/devices, regionN, namespaceN.0, and bttN.x. An example of numa_node values on a 2-socket system with a single NVDIMM range on each socket is shown below. /sys/bus/nd/devices |-- btt0.0/numa_node:0 |-- btt1.0/numa_node:1 |-- btt1.1/numa_node:1 |-- namespace0.0/numa_node:0 |-- namespace1.0/numa_node:1 |-- region0/numa_node:0 |-- region1/numa_node:1 These numa_node files are then linked under the block class of their device names. /sys/class/block/pmem0/device/numa_node:0 /sys/class/block/pmem1s/device/numa_node:1 This enables numactl(8) to accept 'block:' and 'file:' paths of pmem and btt devices as shown in the examples below. numactl --preferred block:pmem0 --show numactl --preferred file:/dev/pmem1s --show Signed-off-by: Toshi Kani Signed-off-by: Dan Williams --- include/linux/libnvdimm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 30b3deaafd51..75e3af01ee32 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -38,6 +38,7 @@ enum { extern struct attribute_group nvdimm_bus_attribute_group; extern struct attribute_group nvdimm_attribute_group; extern struct attribute_group nd_device_attribute_group; +extern struct attribute_group nd_numa_attribute_group; extern struct attribute_group nd_region_attribute_group; extern struct attribute_group nd_mapping_attribute_group; -- cgit From 61031952f4c89dba1065f7a5b9419badb112554c Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Thu, 25 Jun 2015 03:08:39 -0400 Subject: arch, x86: pmem api for ensuring durability of persistent memory updates Based on an original patch by Ross Zwisler [1]. Writes to persistent memory have the potential to be posted to cpu cache, cpu write buffers, and platform write buffers (memory controller) before being committed to persistent media. Provide apis, memcpy_to_pmem(), wmb_pmem(), and memremap_pmem(), to write data to pmem and assert that it is durable in PMEM (a persistent linear address range). A '__pmem' attribute is added so sparse can track proper usage of pointers to pmem. This continues the status quo of pmem being x86 only for 4.2, but reworks to ioremap, and wider implementation of memremap() will enable other archs in 4.3. [1]: https://lists.01.org/pipermail/linux-nvdimm/2015-May/000932.html Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Ross Zwisler [djbw: various reworks] Signed-off-by: Dan Williams --- include/linux/compiler.h | 2 + include/linux/pmem.h | 153 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 155 insertions(+) create mode 100644 include/linux/pmem.h (limited to 'include') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 867722591be2..9a528d945498 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -21,6 +21,7 @@ # define __rcu __attribute__((noderef, address_space(4))) #else # define __rcu +# define __pmem __attribute__((noderef, address_space(5))) #endif extern void __chk_user_ptr(const volatile void __user *); extern void __chk_io_ptr(const volatile void __iomem *); @@ -42,6 +43,7 @@ extern void __chk_io_ptr(const volatile void __iomem *); # define __cond_lock(x,c) (c) # define __percpu # define __rcu +# define __pmem #endif /* Indirect macros required for expanded argument pasting, eg. __LINE__. */ diff --git a/include/linux/pmem.h b/include/linux/pmem.h new file mode 100644 index 000000000000..f6481a0b1d4f --- /dev/null +++ b/include/linux/pmem.h @@ -0,0 +1,153 @@ +/* + * Copyright(c) 2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#ifndef __PMEM_H__ +#define __PMEM_H__ + +#include + +#ifdef CONFIG_ARCH_HAS_PMEM_API +#include +#else +static inline void arch_wmb_pmem(void) +{ + BUG(); +} + +static inline bool __arch_has_wmb_pmem(void) +{ + return false; +} + +static inline void __pmem *arch_memremap_pmem(resource_size_t offset, + unsigned long size) +{ + return NULL; +} + +static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src, + size_t n) +{ + BUG(); +} +#endif + +/* + * Architectures that define ARCH_HAS_PMEM_API must provide + * implementations for arch_memremap_pmem(), arch_memcpy_to_pmem(), + * arch_wmb_pmem(), and __arch_has_wmb_pmem(). + */ + +static inline void memcpy_from_pmem(void *dst, void __pmem const *src, size_t size) +{ + memcpy(dst, (void __force const *) src, size); +} + +static inline void memunmap_pmem(void __pmem *addr) +{ + iounmap((void __force __iomem *) addr); +} + +/** + * arch_has_wmb_pmem - true if wmb_pmem() ensures durability + * + * For a given cpu implementation within an architecture it is possible + * that wmb_pmem() resolves to a nop. In the case this returns + * false, pmem api users are unable to ensure durability and may want to + * fall back to a different data consistency model, or otherwise notify + * the user. + */ +static inline bool arch_has_wmb_pmem(void) +{ + if (IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API)) + return __arch_has_wmb_pmem(); + return false; +} + +static inline bool arch_has_pmem_api(void) +{ + return IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) && arch_has_wmb_pmem(); +} + +/* + * These defaults seek to offer decent performance and minimize the + * window between i/o completion and writes being durable on media. + * However, it is undefined / architecture specific whether + * default_memremap_pmem + default_memcpy_to_pmem is sufficient for + * making data durable relative to i/o completion. + */ +static void default_memcpy_to_pmem(void __pmem *dst, const void *src, + size_t size) +{ + memcpy((void __force *) dst, src, size); +} + +static void __pmem *default_memremap_pmem(resource_size_t offset, + unsigned long size) +{ + /* TODO: convert to ioremap_wt() */ + return (void __pmem __force *)ioremap_nocache(offset, size); +} + +/** + * memremap_pmem - map physical persistent memory for pmem api + * @offset: physical address of persistent memory + * @size: size of the mapping + * + * Establish a mapping of the architecture specific memory type expected + * by memcpy_to_pmem() and wmb_pmem(). For example, it may be + * the case that an uncacheable or writethrough mapping is sufficient, + * or a writeback mapping provided memcpy_to_pmem() and + * wmb_pmem() arrange for the data to be written through the + * cache to persistent media. + */ +static inline void __pmem *memremap_pmem(resource_size_t offset, + unsigned long size) +{ + if (arch_has_pmem_api()) + return arch_memremap_pmem(offset, size); + return default_memremap_pmem(offset, size); +} + +/** + * memcpy_to_pmem - copy data to persistent memory + * @dst: destination buffer for the copy + * @src: source buffer for the copy + * @n: length of the copy in bytes + * + * Perform a memory copy that results in the destination of the copy + * being effectively evicted from, or never written to, the processor + * cache hierarchy after the copy completes. After memcpy_to_pmem() + * data may still reside in cpu or platform buffers, so this operation + * must be followed by a wmb_pmem(). + */ +static inline void memcpy_to_pmem(void __pmem *dst, const void *src, size_t n) +{ + if (arch_has_pmem_api()) + arch_memcpy_to_pmem(dst, src, n); + else + default_memcpy_to_pmem(dst, src, n); +} + +/** + * wmb_pmem - synchronize writes to persistent memory + * + * After a series of memcpy_to_pmem() operations this drains data from + * cpu write buffers and any platform (memory controller) buffers to + * ensure that written data is durable on persistent memory media. + */ +static inline void wmb_pmem(void) +{ + if (arch_has_pmem_api()) + arch_wmb_pmem(); +} +#endif /* __PMEM_H__ */ -- cgit