From 6cf9c5babd980ec1959e0dd45e3036474c6a294f Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 18 May 2016 09:13:13 -0700 Subject: libnvdimm: stop requiring a driver ->remove() method The dax_pmem driver was implementing an empty ->remove() method to satisfy the nvdimm bus driver that unconditionally calls ->remove(). Teach the core bus driver to check if ->remove() is NULL to remove that requirement. Reported-by: Johannes Thumshirn Reviewed-by: Johannes Thumshirn Signed-off-by: Dan Williams --- drivers/nvdimm/bus.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c index 97589e3cb852..7cbc3d58d176 100644 --- a/drivers/nvdimm/bus.c +++ b/drivers/nvdimm/bus.c @@ -124,9 +124,10 @@ static int nvdimm_bus_remove(struct device *dev) struct nd_device_driver *nd_drv = to_nd_device_driver(dev->driver); struct module *provider = to_bus_provider(dev); struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); - int rc; + int rc = 0; - rc = nd_drv->remove(dev); + if (nd_drv->remove) + rc = nd_drv->remove(dev); nd_region_disable(nvdimm_bus, dev); dev_dbg(&nvdimm_bus->dev, "%s.remove(%s) = %d\n", dev->driver->name, @@ -296,8 +297,8 @@ int __nd_driver_register(struct nd_device_driver *nd_drv, struct module *owner, return -EINVAL; } - if (!nd_drv->probe || !nd_drv->remove) { - pr_debug("->probe() and ->remove() must be specified\n"); + if (!nd_drv->probe) { + pr_debug("%s ->probe() must be specified\n", mod_name); return -EINVAL; } -- cgit v1.2.3 From ab68f26221366f92611650e8470e6a926801c7d4 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 18 May 2016 09:15:08 -0700 Subject: /dev/dax, pmem: direct access to persistent memory Device DAX is the device-centric analogue of Filesystem DAX (CONFIG_FS_DAX). It allows memory ranges to be allocated and mapped without need of an intervening file system. Device DAX is strict, precise and predictable. Specifically this interface: 1/ Guarantees fault granularity with respect to a given page size (pte, pmd, or pud) set at configuration time. 2/ Enforces deterministic behavior by being strict about what fault scenarios are supported. For example, by forcing MADV_DONTFORK semantics and omitting MAP_PRIVATE support device-dax guarantees that a mapping always behaves/performs the same once established. It is the "what you see is what you get" access mechanism to differentiated memory vs filesystem DAX which has filesystem specific implementation semantics. Persistent memory is the first target, but the mechanism is also targeted for exclusive allocations of performance differentiated memory ranges. This commit is limited to the base device driver infrastructure to associate a dax device with pmem range. 
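For orientation, here is a minimal sketch of how such a device node is eventually consumed from userspace once the character-device mmap support added later in this series is in place. The /dev/dax0.0 path, the 2MB region alignment, and the exact error handling are assumptions for the illustration, not something this base-infrastructure patch provides by itself; the MAP_SHARED and MADV_DONTFORK requirements come from the fault-handling rules described in the follow-on dax core patch.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        size_t len = 2UL << 20;                 /* assumed 2MB dax_region alignment */
        int fd = open("/dev/dax0.0", O_RDWR);   /* hypothetical device node */
        char *addr;

        if (fd < 0) {
                perror("open");
                return 1;
        }

        /* device-dax only supports shared mappings; MAP_PRIVATE is rejected */
        addr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (addr == MAP_FAILED) {
                perror("mmap");
                return 1;
        }

        /* page-less (PFN_DEV-only) configurations additionally require this */
        if (madvise(addr, len, MADV_DONTFORK))
                perror("madvise");

        addr[0] = 1;    /* loads/stores go straight to the mapped memory range */

        munmap(addr, len);
        close(fd);
        return 0;
}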
Cc: Jeff Moyer Cc: Christoph Hellwig Cc: Andrew Morton Cc: Dave Hansen Cc: Ross Zwisler Reviewed-by: Johannes Thumshirn Signed-off-by: Dan Williams --- drivers/Kconfig | 2 + drivers/Makefile | 1 + drivers/dax/Kconfig | 25 ++++ drivers/dax/Makefile | 4 + drivers/dax/dax.c | 253 ++++++++++++++++++++++++++++++++++++ drivers/dax/dax.h | 24 ++++ drivers/dax/pmem.c | 158 ++++++++++++++++++++++ tools/testing/nvdimm/Kbuild | 9 ++ tools/testing/nvdimm/config_check.c | 2 + 9 files changed, 478 insertions(+) create mode 100644 drivers/dax/Kconfig create mode 100644 drivers/dax/Makefile create mode 100644 drivers/dax/dax.c create mode 100644 drivers/dax/dax.h create mode 100644 drivers/dax/pmem.c diff --git a/drivers/Kconfig b/drivers/Kconfig index d2ac339de85f..8298eab84a6f 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -190,6 +190,8 @@ source "drivers/android/Kconfig" source "drivers/nvdimm/Kconfig" +source "drivers/dax/Kconfig" + source "drivers/nvmem/Kconfig" source "drivers/hwtracing/stm/Kconfig" diff --git a/drivers/Makefile b/drivers/Makefile index 8f5d076baeb0..0b6f3d60193d 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -66,6 +66,7 @@ obj-$(CONFIG_PARPORT) += parport/ obj-$(CONFIG_NVM) += lightnvm/ obj-y += base/ block/ misc/ mfd/ nfc/ obj-$(CONFIG_LIBNVDIMM) += nvdimm/ +obj-$(CONFIG_DEV_DAX) += dax/ obj-$(CONFIG_DMA_SHARED_BUFFER) += dma-buf/ obj-$(CONFIG_NUBUS) += nubus/ obj-y += macintosh/ diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig new file mode 100644 index 000000000000..86ffbaa891ad --- /dev/null +++ b/drivers/dax/Kconfig @@ -0,0 +1,25 @@ +menuconfig DEV_DAX + tristate "DAX: direct access to differentiated memory" + default m if NVDIMM_DAX + help + Support raw access to differentiated (persistence, bandwidth, + latency...) memory via an mmap(2) capable character + device. Platform firmware or a device driver may identify a + platform memory resource that is differentiated from the + baseline memory pool. Mappings of a /dev/daxX.Y device impose + restrictions that make the mapping behavior deterministic. + +if DEV_DAX + +config DEV_DAX_PMEM + tristate "PMEM DAX: direct access to persistent memory" + depends on NVDIMM_DAX + default DEV_DAX + help + Support raw access to persistent memory. Note that this + driver consumes memory ranges allocated and exported by the + libnvdimm sub-system. + + Say Y if unsure + +endif diff --git a/drivers/dax/Makefile b/drivers/dax/Makefile new file mode 100644 index 000000000000..27c54e38478a --- /dev/null +++ b/drivers/dax/Makefile @@ -0,0 +1,4 @@ +obj-$(CONFIG_DEV_DAX) += dax.o +obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem.o + +dax_pmem-y := pmem.o diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c new file mode 100644 index 000000000000..4c22a40f2335 --- /dev/null +++ b/drivers/dax/dax.c @@ -0,0 +1,253 @@ +/* + * Copyright(c) 2016 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include + +static int dax_major; +static struct class *dax_class; +static DEFINE_IDA(dax_minor_ida); + +/** + * struct dax_region - mapping infrastructure for dax devices + * @id: kernel-wide unique region for a memory range + * @base: linear address corresponding to @res + * @kref: to pin while other agents have a need to do lookups + * @dev: parent device backing this region + * @align: allocation and mapping alignment for child dax devices + * @res: physical address range of the region + * @pfn_flags: identify whether the pfns are paged back or not + */ +struct dax_region { + int id; + struct ida ida; + void *base; + struct kref kref; + struct device *dev; + unsigned int align; + struct resource res; + unsigned long pfn_flags; +}; + +/** + * struct dax_dev - subdivision of a dax region + * @region - parent region + * @dev - device backing the character device + * @kref - enable this data to be tracked in filp->private_data + * @id - child id in the region + * @num_resources - number of physical address extents in this device + * @res - array of physical address ranges + */ +struct dax_dev { + struct dax_region *region; + struct device *dev; + struct kref kref; + int id; + int num_resources; + struct resource res[0]; +}; + +static void dax_region_free(struct kref *kref) +{ + struct dax_region *dax_region; + + dax_region = container_of(kref, struct dax_region, kref); + kfree(dax_region); +} + +void dax_region_put(struct dax_region *dax_region) +{ + kref_put(&dax_region->kref, dax_region_free); +} +EXPORT_SYMBOL_GPL(dax_region_put); + +static void dax_dev_free(struct kref *kref) +{ + struct dax_dev *dax_dev; + + dax_dev = container_of(kref, struct dax_dev, kref); + dax_region_put(dax_dev->region); + kfree(dax_dev); +} + +static void dax_dev_put(struct dax_dev *dax_dev) +{ + kref_put(&dax_dev->kref, dax_dev_free); +} + +struct dax_region *alloc_dax_region(struct device *parent, int region_id, + struct resource *res, unsigned int align, void *addr, + unsigned long pfn_flags) +{ + struct dax_region *dax_region; + + dax_region = kzalloc(sizeof(*dax_region), GFP_KERNEL); + + if (!dax_region) + return NULL; + + memcpy(&dax_region->res, res, sizeof(*res)); + dax_region->pfn_flags = pfn_flags; + kref_init(&dax_region->kref); + dax_region->id = region_id; + ida_init(&dax_region->ida); + dax_region->align = align; + dax_region->dev = parent; + dax_region->base = addr; + + return dax_region; +} +EXPORT_SYMBOL_GPL(alloc_dax_region); + +static ssize_t size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dax_dev *dax_dev = dev_get_drvdata(dev); + unsigned long long size = 0; + int i; + + for (i = 0; i < dax_dev->num_resources; i++) + size += resource_size(&dax_dev->res[i]); + + return sprintf(buf, "%llu\n", size); +} +static DEVICE_ATTR_RO(size); + +static struct attribute *dax_device_attributes[] = { + &dev_attr_size.attr, + NULL, +}; + +static const struct attribute_group dax_device_attribute_group = { + .attrs = dax_device_attributes, +}; + +static const struct attribute_group *dax_attribute_groups[] = { + &dax_device_attribute_group, + NULL, +}; + +static void unregister_dax_dev(void *_dev) +{ + struct device *dev = _dev; + struct dax_dev *dax_dev = dev_get_drvdata(dev); + struct dax_region *dax_region = dax_dev->region; + + dev_dbg(dev, "%s\n", __func__); + + get_device(dev); + device_unregister(dev); + ida_simple_remove(&dax_region->ida, dax_dev->id); + ida_simple_remove(&dax_minor_ida, 
MINOR(dev->devt)); + put_device(dev); + dax_dev_put(dax_dev); +} + +int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res, + int count) +{ + struct device *parent = dax_region->dev; + struct dax_dev *dax_dev; + struct device *dev; + int rc, minor; + dev_t dev_t; + + dax_dev = kzalloc(sizeof(*dax_dev) + sizeof(*res) * count, GFP_KERNEL); + if (!dax_dev) + return -ENOMEM; + memcpy(dax_dev->res, res, sizeof(*res) * count); + dax_dev->num_resources = count; + kref_init(&dax_dev->kref); + dax_dev->region = dax_region; + kref_get(&dax_region->kref); + + dax_dev->id = ida_simple_get(&dax_region->ida, 0, 0, GFP_KERNEL); + if (dax_dev->id < 0) { + rc = dax_dev->id; + goto err_id; + } + + minor = ida_simple_get(&dax_minor_ida, 0, 0, GFP_KERNEL); + if (minor < 0) { + rc = minor; + goto err_minor; + } + + dev_t = MKDEV(dax_major, minor); + dev = device_create_with_groups(dax_class, parent, dev_t, dax_dev, + dax_attribute_groups, "dax%d.%d", dax_region->id, + dax_dev->id); + if (IS_ERR(dev)) { + rc = PTR_ERR(dev); + goto err_create; + } + dax_dev->dev = dev; + + rc = devm_add_action(dax_region->dev, unregister_dax_dev, dev); + if (rc) { + unregister_dax_dev(dev); + return rc; + } + + return 0; + + err_create: + ida_simple_remove(&dax_minor_ida, minor); + err_minor: + ida_simple_remove(&dax_region->ida, dax_dev->id); + err_id: + dax_dev_put(dax_dev); + + return rc; +} +EXPORT_SYMBOL_GPL(devm_create_dax_dev); + +static const struct file_operations dax_fops = { + .llseek = noop_llseek, + .owner = THIS_MODULE, +}; + +static int __init dax_init(void) +{ + int rc; + + rc = register_chrdev(0, "dax", &dax_fops); + if (rc < 0) + return rc; + dax_major = rc; + + dax_class = class_create(THIS_MODULE, "dax"); + if (IS_ERR(dax_class)) { + unregister_chrdev(dax_major, "dax"); + return PTR_ERR(dax_class); + } + + return 0; +} + +static void __exit dax_exit(void) +{ + class_destroy(dax_class); + unregister_chrdev(dax_major, "dax"); + ida_destroy(&dax_minor_ida); +} + +MODULE_AUTHOR("Intel Corporation"); +MODULE_LICENSE("GPL v2"); +subsys_initcall(dax_init); +module_exit(dax_exit); diff --git a/drivers/dax/dax.h b/drivers/dax/dax.h new file mode 100644 index 000000000000..d8b8f1f25054 --- /dev/null +++ b/drivers/dax/dax.h @@ -0,0 +1,24 @@ +/* + * Copyright(c) 2016 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#ifndef __DAX_H__ +#define __DAX_H__ +struct device; +struct resource; +struct dax_region; +void dax_region_put(struct dax_region *dax_region); +struct dax_region *alloc_dax_region(struct device *parent, + int region_id, struct resource *res, unsigned int align, + void *addr, unsigned long flags); +int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res, + int count); +#endif /* __DAX_H__ */ diff --git a/drivers/dax/pmem.c b/drivers/dax/pmem.c new file mode 100644 index 000000000000..55d510e36cd1 --- /dev/null +++ b/drivers/dax/pmem.c @@ -0,0 +1,158 @@ +/* + * Copyright(c) 2016 Intel Corporation. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include +#include +#include +#include +#include "../nvdimm/pfn.h" +#include "../nvdimm/nd.h" +#include "dax.h" + +struct dax_pmem { + struct device *dev; + struct percpu_ref ref; + struct completion cmp; +}; + +struct dax_pmem *to_dax_pmem(struct percpu_ref *ref) +{ + return container_of(ref, struct dax_pmem, ref); +} + +static void dax_pmem_percpu_release(struct percpu_ref *ref) +{ + struct dax_pmem *dax_pmem = to_dax_pmem(ref); + + dev_dbg(dax_pmem->dev, "%s\n", __func__); + complete(&dax_pmem->cmp); +} + +static void dax_pmem_percpu_exit(void *data) +{ + struct percpu_ref *ref = data; + struct dax_pmem *dax_pmem = to_dax_pmem(ref); + + dev_dbg(dax_pmem->dev, "%s\n", __func__); + percpu_ref_exit(ref); + wait_for_completion(&dax_pmem->cmp); +} + +static void dax_pmem_percpu_kill(void *data) +{ + struct percpu_ref *ref = data; + struct dax_pmem *dax_pmem = to_dax_pmem(ref); + + dev_dbg(dax_pmem->dev, "%s\n", __func__); + percpu_ref_kill(ref); +} + +static int dax_pmem_probe(struct device *dev) +{ + int rc; + void *addr; + struct resource res; + struct nd_pfn_sb *pfn_sb; + struct dax_pmem *dax_pmem; + struct nd_region *nd_region; + struct nd_namespace_io *nsio; + struct dax_region *dax_region; + struct nd_namespace_common *ndns; + struct nd_dax *nd_dax = to_nd_dax(dev); + struct nd_pfn *nd_pfn = &nd_dax->nd_pfn; + struct vmem_altmap __altmap, *altmap = NULL; + + ndns = nvdimm_namespace_common_probe(dev); + if (IS_ERR(ndns)) + return PTR_ERR(ndns); + nsio = to_nd_namespace_io(&ndns->dev); + + /* parse the 'pfn' info block via ->rw_bytes */ + devm_nsio_enable(dev, nsio); + altmap = nvdimm_setup_pfn(nd_pfn, &res, &__altmap); + if (IS_ERR(altmap)) + return PTR_ERR(altmap); + devm_nsio_disable(dev, nsio); + + pfn_sb = nd_pfn->pfn_sb; + + if (!devm_request_mem_region(dev, nsio->res.start, + resource_size(&nsio->res), dev_name(dev))) { + dev_warn(dev, "could not reserve region %pR\n", &nsio->res); + return -EBUSY; + } + + dax_pmem = devm_kzalloc(dev, sizeof(*dax_pmem), GFP_KERNEL); + if (!dax_pmem) + return -ENOMEM; + + dax_pmem->dev = dev; + init_completion(&dax_pmem->cmp); + rc = percpu_ref_init(&dax_pmem->ref, dax_pmem_percpu_release, 0, + GFP_KERNEL); + if (rc) + return rc; + + rc = devm_add_action(dev, dax_pmem_percpu_exit, &dax_pmem->ref); + if (rc) { + dax_pmem_percpu_exit(&dax_pmem->ref); + return rc; + } + + addr = devm_memremap_pages(dev, &res, &dax_pmem->ref, altmap); + if (IS_ERR(addr)) + return PTR_ERR(addr); + + rc = devm_add_action(dev, dax_pmem_percpu_kill, &dax_pmem->ref); + if (rc) { + dax_pmem_percpu_kill(&dax_pmem->ref); + return rc; + } + + nd_region = to_nd_region(dev->parent); + dax_region = alloc_dax_region(dev, nd_region->id, &res, + le32_to_cpu(pfn_sb->align), addr, PFN_DEV|PFN_MAP); + if (!dax_region) + return -ENOMEM; + + /* TODO: support for subdividing a dax region... 
*/ + rc = devm_create_dax_dev(dax_region, &res, 1); + + /* child dax_dev instances now own the lifetime of the dax_region */ + dax_region_put(dax_region); + + return rc; +} + +static struct nd_device_driver dax_pmem_driver = { + .probe = dax_pmem_probe, + .drv = { + .name = "dax_pmem", + }, + .type = ND_DRIVER_DAX_PMEM, +}; + +static int __init dax_pmem_init(void) +{ + return nd_driver_register(&dax_pmem_driver); +} +module_init(dax_pmem_init); + +static void __exit dax_pmem_exit(void) +{ + driver_unregister(&dax_pmem_driver.drv); +} +module_exit(dax_pmem_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Intel Corporation"); +MODULE_ALIAS_ND_DEVICE(ND_DEVICE_DAX_PMEM); diff --git a/tools/testing/nvdimm/Kbuild b/tools/testing/nvdimm/Kbuild index 5ff6d3c126a9..785985677159 100644 --- a/tools/testing/nvdimm/Kbuild +++ b/tools/testing/nvdimm/Kbuild @@ -16,6 +16,7 @@ ldflags-y += --wrap=phys_to_pfn_t DRIVERS := ../../../drivers NVDIMM_SRC := $(DRIVERS)/nvdimm ACPI_SRC := $(DRIVERS)/acpi +DAX_SRC := $(DRIVERS)/dax obj-$(CONFIG_LIBNVDIMM) += libnvdimm.o obj-$(CONFIG_BLK_DEV_PMEM) += nd_pmem.o @@ -23,6 +24,8 @@ obj-$(CONFIG_ND_BTT) += nd_btt.o obj-$(CONFIG_ND_BLK) += nd_blk.o obj-$(CONFIG_X86_PMEM_LEGACY) += nd_e820.o obj-$(CONFIG_ACPI_NFIT) += nfit.o +obj-$(CONFIG_DEV_DAX) += dax.o +obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem.o nfit-y := $(ACPI_SRC)/nfit.o nfit-y += config_check.o @@ -39,6 +42,12 @@ nd_blk-y += config_check.o nd_e820-y := $(NVDIMM_SRC)/e820.o nd_e820-y += config_check.o +dax-y := $(DAX_SRC)/dax.o +dax-y += config_check.o + +dax_pmem-y := $(DAX_SRC)/pmem.o +dax_pmem-y += config_check.o + libnvdimm-y := $(NVDIMM_SRC)/core.o libnvdimm-y += $(NVDIMM_SRC)/bus.o libnvdimm-y += $(NVDIMM_SRC)/dimm_devs.o diff --git a/tools/testing/nvdimm/config_check.c b/tools/testing/nvdimm/config_check.c index f2c7615554eb..adf18bfeca00 100644 --- a/tools/testing/nvdimm/config_check.c +++ b/tools/testing/nvdimm/config_check.c @@ -12,4 +12,6 @@ void check(void) BUILD_BUG_ON(!IS_MODULE(CONFIG_ND_BTT)); BUILD_BUG_ON(!IS_MODULE(CONFIG_ND_BLK)); BUILD_BUG_ON(!IS_MODULE(CONFIG_ACPI_NFIT)); + BUILD_BUG_ON(!IS_MODULE(CONFIG_DEV_DAX)); + BUILD_BUG_ON(!IS_MODULE(CONFIG_DEV_DAX_PMEM)); } -- cgit v1.2.3 From dee410792419aaa8bc3e3b35d2ccb6515835916d Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 14 May 2016 12:20:44 -0700 Subject: /dev/dax, core: file operations and dax-mmap The "Device DAX" core enables dax mappings of performance / feature differentiated memory. An open mapping or file handle keeps the backing struct device live, but new mappings are only possible while the device is enabled. Faults are handled under rcu_read_lock to synchronize with the enabled state of the device. Similar to the filesystem-dax case the backing memory may optionally have struct page entries. However, unlike fs-dax there is no support for private mappings, or mappings that are not backed by media (see use of zero-page in fs-dax). Mappings are always guaranteed to match the alignment of the dax_region. If the dax_region is configured to have a 2MB alignment, all mappings are guaranteed to be backed by a pmd entry. Contrast this determinism with the fs-dax case where pmd mappings are opportunistic. If userspace attempts to force a misaligned mapping, the driver will fail the mmap attempt. See dax_dev_check_vma() for other scenarios that are rejected, like MAP_PRIVATE mappings. Cc: Hannes Reinecke Cc: Jeff Moyer Cc: Christoph Hellwig Cc: Andrew Morton Cc: Dave Hansen Cc: Ross Zwisler Acked-by: "Paul E. 
McKenney" Reviewed-by: Johannes Thumshirn Signed-off-by: Dan Williams --- drivers/dax/Kconfig | 1 + drivers/dax/dax.c | 322 ++++++++++++++++++++++++++++++++++++++++++++++++++++ mm/huge_memory.c | 1 + mm/hugetlb.c | 1 + 4 files changed, 325 insertions(+) diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig index 86ffbaa891ad..cedab7572de3 100644 --- a/drivers/dax/Kconfig +++ b/drivers/dax/Kconfig @@ -1,6 +1,7 @@ menuconfig DEV_DAX tristate "DAX: direct access to differentiated memory" default m if NVDIMM_DAX + depends on TRANSPARENT_HUGEPAGE help Support raw access to differentiated (persistence, bandwidth, latency...) memory via an mmap(2) capable character diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c index 4c22a40f2335..b891a129b275 100644 --- a/drivers/dax/dax.c +++ b/drivers/dax/dax.c @@ -49,6 +49,7 @@ struct dax_region { * @region - parent region * @dev - device backing the character device * @kref - enable this data to be tracked in filp->private_data + * @alive - !alive + rcu grace period == no new mappings can be established * @id - child id in the region * @num_resources - number of physical address extents in this device * @res - array of physical address ranges @@ -57,6 +58,7 @@ struct dax_dev { struct dax_region *region; struct device *dev; struct kref kref; + bool alive; int id; int num_resources; struct resource res[0]; @@ -150,6 +152,16 @@ static void unregister_dax_dev(void *_dev) dev_dbg(dev, "%s\n", __func__); + /* + * Note, rcu is not protecting the liveness of dax_dev, rcu is + * ensuring that any fault handlers that might have seen + * dax_dev->alive == true, have completed. Any fault handlers + * that start after synchronize_rcu() has started will abort + * upon seeing dax_dev->alive == false. + */ + dax_dev->alive = false; + synchronize_rcu(); + get_device(dev); device_unregister(dev); ida_simple_remove(&dax_region->ida, dax_dev->id); @@ -173,6 +185,7 @@ int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res, memcpy(dax_dev->res, res, sizeof(*res) * count); dax_dev->num_resources = count; kref_init(&dax_dev->kref); + dax_dev->alive = true; dax_dev->region = dax_region; kref_get(&dax_region->kref); @@ -217,9 +230,318 @@ int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res, } EXPORT_SYMBOL_GPL(devm_create_dax_dev); +/* return an unmapped area aligned to the dax region specified alignment */ +static unsigned long dax_dev_get_unmapped_area(struct file *filp, + unsigned long addr, unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + unsigned long off, off_end, off_align, len_align, addr_align, align; + struct dax_dev *dax_dev = filp ? 
filp->private_data : NULL; + struct dax_region *dax_region; + + if (!dax_dev || addr) + goto out; + + dax_region = dax_dev->region; + align = dax_region->align; + off = pgoff << PAGE_SHIFT; + off_end = off + len; + off_align = round_up(off, align); + + if ((off_end <= off_align) || ((off_end - off_align) < align)) + goto out; + + len_align = len + align; + if ((off + len_align) < off) + goto out; + + addr_align = current->mm->get_unmapped_area(filp, addr, len_align, + pgoff, flags); + if (!IS_ERR_VALUE(addr_align)) { + addr_align += (off - addr_align) & (align - 1); + return addr_align; + } + out: + return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); +} + +static int __match_devt(struct device *dev, const void *data) +{ + const dev_t *devt = data; + + return dev->devt == *devt; +} + +static struct device *dax_dev_find(dev_t dev_t) +{ + return class_find_device(dax_class, NULL, &dev_t, __match_devt); +} + +static int dax_dev_open(struct inode *inode, struct file *filp) +{ + struct dax_dev *dax_dev = NULL; + struct device *dev; + + dev = dax_dev_find(inode->i_rdev); + if (!dev) + return -ENXIO; + + device_lock(dev); + dax_dev = dev_get_drvdata(dev); + if (dax_dev) { + dev_dbg(dev, "%s\n", __func__); + filp->private_data = dax_dev; + kref_get(&dax_dev->kref); + inode->i_flags = S_DAX; + } + device_unlock(dev); + + if (!dax_dev) { + put_device(dev); + return -ENXIO; + } + return 0; +} + +static int dax_dev_release(struct inode *inode, struct file *filp) +{ + struct dax_dev *dax_dev = filp->private_data; + struct device *dev = dax_dev->dev; + + dev_dbg(dax_dev->dev, "%s\n", __func__); + dax_dev_put(dax_dev); + put_device(dev); + + return 0; +} + +static int check_vma(struct dax_dev *dax_dev, struct vm_area_struct *vma, + const char *func) +{ + struct dax_region *dax_region = dax_dev->region; + struct device *dev = dax_dev->dev; + unsigned long mask; + + if (!dax_dev->alive) + return -ENXIO; + + /* prevent private / writable mappings from being established */ + if ((vma->vm_flags & (VM_NORESERVE|VM_SHARED|VM_WRITE)) == VM_WRITE) { + dev_info(dev, "%s: %s: fail, attempted private mapping\n", + current->comm, func); + return -EINVAL; + } + + mask = dax_region->align - 1; + if (vma->vm_start & mask || vma->vm_end & mask) { + dev_info(dev, "%s: %s: fail, unaligned vma (%#lx - %#lx, %#lx)\n", + current->comm, func, vma->vm_start, vma->vm_end, + mask); + return -EINVAL; + } + + if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) == PFN_DEV + && (vma->vm_flags & VM_DONTCOPY) == 0) { + dev_info(dev, "%s: %s: fail, dax range requires MADV_DONTFORK\n", + current->comm, func); + return -EINVAL; + } + + if (!vma_is_dax(vma)) { + dev_info(dev, "%s: %s: fail, vma is not DAX capable\n", + current->comm, func); + return -EINVAL; + } + + return 0; +} + +static phys_addr_t pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff, + unsigned long size) +{ + struct resource *res; + phys_addr_t phys; + int i; + + for (i = 0; i < dax_dev->num_resources; i++) { + res = &dax_dev->res[i]; + phys = pgoff * PAGE_SIZE + res->start; + if (phys >= res->start && phys <= res->end) + break; + pgoff -= PHYS_PFN(resource_size(res)); + } + + if (i < dax_dev->num_resources) { + res = &dax_dev->res[i]; + if (phys + size - 1 <= res->end) + return phys; + } + + return -1; +} + +static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + unsigned long vaddr = (unsigned long) vmf->virtual_address; + struct device *dev = dax_dev->dev; + struct dax_region *dax_region; + int rc = 
VM_FAULT_SIGBUS; + phys_addr_t phys; + pfn_t pfn; + + if (check_vma(dax_dev, vma, __func__)) + return VM_FAULT_SIGBUS; + + dax_region = dax_dev->region; + if (dax_region->align > PAGE_SIZE) { + dev_dbg(dev, "%s: alignment > fault size\n", __func__); + return VM_FAULT_SIGBUS; + } + + phys = pgoff_to_phys(dax_dev, vmf->pgoff, PAGE_SIZE); + if (phys == -1) { + dev_dbg(dev, "%s: phys_to_pgoff(%#lx) failed\n", __func__, + vmf->pgoff); + return VM_FAULT_SIGBUS; + } + + pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); + + rc = vm_insert_mixed(vma, vaddr, pfn); + + if (rc == -ENOMEM) + return VM_FAULT_OOM; + if (rc < 0 && rc != -EBUSY) + return VM_FAULT_SIGBUS; + + return VM_FAULT_NOPAGE; +} + +static int dax_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + int rc; + struct file *filp = vma->vm_file; + struct dax_dev *dax_dev = filp->private_data; + + dev_dbg(dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__, + current->comm, (vmf->flags & FAULT_FLAG_WRITE) + ? "write" : "read", vma->vm_start, vma->vm_end); + rcu_read_lock(); + rc = __dax_dev_fault(dax_dev, vma, vmf); + rcu_read_unlock(); + + return rc; +} + +static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, + struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, + unsigned int flags) +{ + unsigned long pmd_addr = addr & PMD_MASK; + struct device *dev = dax_dev->dev; + struct dax_region *dax_region; + phys_addr_t phys; + pgoff_t pgoff; + pfn_t pfn; + + if (check_vma(dax_dev, vma, __func__)) + return VM_FAULT_SIGBUS; + + dax_region = dax_dev->region; + if (dax_region->align > PMD_SIZE) { + dev_dbg(dev, "%s: alignment > fault size\n", __func__); + return VM_FAULT_SIGBUS; + } + + /* dax pmd mappings require pfn_t_devmap() */ + if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) { + dev_dbg(dev, "%s: alignment > fault size\n", __func__); + return VM_FAULT_SIGBUS; + } + + pgoff = linear_page_index(vma, pmd_addr); + phys = pgoff_to_phys(dax_dev, pgoff, PAGE_SIZE); + if (phys == -1) { + dev_dbg(dev, "%s: phys_to_pgoff(%#lx) failed\n", __func__, + pgoff); + return VM_FAULT_SIGBUS; + } + + pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); + + return vmf_insert_pfn_pmd(vma, addr, pmd, pfn, + flags & FAULT_FLAG_WRITE); +} + +static int dax_dev_pmd_fault(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd, unsigned int flags) +{ + int rc; + struct file *filp = vma->vm_file; + struct dax_dev *dax_dev = filp->private_data; + + dev_dbg(dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__, + current->comm, (flags & FAULT_FLAG_WRITE) + ? 
"write" : "read", vma->vm_start, vma->vm_end); + + rcu_read_lock(); + rc = __dax_dev_pmd_fault(dax_dev, vma, addr, pmd, flags); + rcu_read_unlock(); + + return rc; +} + +static void dax_dev_vm_open(struct vm_area_struct *vma) +{ + struct file *filp = vma->vm_file; + struct dax_dev *dax_dev = filp->private_data; + + dev_dbg(dax_dev->dev, "%s\n", __func__); + kref_get(&dax_dev->kref); +} + +static void dax_dev_vm_close(struct vm_area_struct *vma) +{ + struct file *filp = vma->vm_file; + struct dax_dev *dax_dev = filp->private_data; + + dev_dbg(dax_dev->dev, "%s\n", __func__); + dax_dev_put(dax_dev); +} + +static const struct vm_operations_struct dax_dev_vm_ops = { + .fault = dax_dev_fault, + .pmd_fault = dax_dev_pmd_fault, + .open = dax_dev_vm_open, + .close = dax_dev_vm_close, +}; + +static int dax_dev_mmap(struct file *filp, struct vm_area_struct *vma) +{ + struct dax_dev *dax_dev = filp->private_data; + int rc; + + dev_dbg(dax_dev->dev, "%s\n", __func__); + + rc = check_vma(dax_dev, vma, __func__); + if (rc) + return rc; + + kref_get(&dax_dev->kref); + vma->vm_ops = &dax_dev_vm_ops; + vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; + return 0; + +} + static const struct file_operations dax_fops = { .llseek = noop_llseek, .owner = THIS_MODULE, + .open = dax_dev_open, + .release = dax_dev_release, + .get_unmapped_area = dax_dev_get_unmapped_area, + .mmap = dax_dev_mmap, }; static int __init dax_init(void) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 86f9f8b82f8e..52ea012d8a80 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1013,6 +1013,7 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write); return VM_FAULT_NOPAGE; } +EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd); static void touch_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 19d0d08b396f..b14e98129b07 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -624,6 +624,7 @@ pgoff_t linear_hugepage_index(struct vm_area_struct *vma, { return vma_hugecache_offset(hstate_vma(vma), vma, address); } +EXPORT_SYMBOL_GPL(linear_hugepage_index); /* * Return the size of the pages allocated when backing a VMA. In the majority -- cgit v1.2.3 From acc93d30d7d43f428272c20a047389c4cbca82ba Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 7 May 2016 11:40:28 -0700 Subject: Revert "block: enable dax for raw block devices" This reverts commit 5a023cdba50c5f5f2bc351783b3131699deb3937. The functionality is superseded by the new "Device DAX" facility. Cc: Jeff Moyer Cc: Christoph Hellwig Cc: Dave Chinner Cc: Andrew Morton Cc: Ross Zwisler Cc: Jan Kara Signed-off-by: Dan Williams --- block/ioctl.c | 32 ----------------- fs/block_dev.c | 96 +++++++++++++++---------------------------------- include/linux/fs.h | 8 ----- include/uapi/linux/fs.h | 1 - 4 files changed, 29 insertions(+), 108 deletions(-) diff --git a/block/ioctl.c b/block/ioctl.c index 4ff1f92f89ca..698c7933d582 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -407,35 +407,6 @@ static inline int is_unrecognized_ioctl(int ret) ret == -ENOIOCTLCMD; } -#ifdef CONFIG_FS_DAX -bool blkdev_dax_capable(struct block_device *bdev) -{ - struct gendisk *disk = bdev->bd_disk; - - if (!disk->fops->direct_access) - return false; - - /* - * If the partition is not aligned on a page boundary, we can't - * do dax I/O to it. 
- */ - if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512)) - || (bdev->bd_part->nr_sects % (PAGE_SIZE / 512))) - return false; - - /* - * If the device has known bad blocks, force all I/O through the - * driver / page cache. - * - * TODO: support finer grained dax error handling - */ - if (disk->bb && disk->bb->count) - return false; - - return true; -} -#endif - static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode, unsigned cmd, unsigned long arg) { @@ -598,9 +569,6 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, case BLKTRACESETUP: case BLKTRACETEARDOWN: return blk_trace_ioctl(bdev, cmd, argp); - case BLKDAXGET: - return put_int(arg, !!(bdev->bd_inode->i_flags & S_DAX)); - break; case IOC_PR_REGISTER: return blkdev_pr_register(bdev, argp); case IOC_PR_RESERVE: diff --git a/fs/block_dev.c b/fs/block_dev.c index 20a2c02b77c4..36ee10ca503e 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include "internal.h" @@ -1159,6 +1160,33 @@ void bd_set_size(struct block_device *bdev, loff_t size) } EXPORT_SYMBOL(bd_set_size); +static bool blkdev_dax_capable(struct block_device *bdev) +{ + struct gendisk *disk = bdev->bd_disk; + + if (!disk->fops->direct_access || !IS_ENABLED(CONFIG_FS_DAX)) + return false; + + /* + * If the partition is not aligned on a page boundary, we can't + * do dax I/O to it. + */ + if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512)) + || (bdev->bd_part->nr_sects % (PAGE_SIZE / 512))) + return false; + + /* + * If the device has known bad blocks, force all I/O through the + * driver / page cache. + * + * TODO: support finer grained dax error handling + */ + if (disk->bb && disk->bb->count) + return false; + + return true; +} + static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); /* @@ -1724,79 +1752,13 @@ static const struct address_space_operations def_blk_aops = { .is_dirty_writeback = buffer_check_dirty_writeback, }; -#ifdef CONFIG_FS_DAX -/* - * In the raw block case we do not need to contend with truncation nor - * unwritten file extents. Without those concerns there is no need for - * additional locking beyond the mmap_sem context that these routines - * are already executing under. - * - * Note, there is no protection if the block device is dynamically - * resized (partition grow/shrink) during a fault. A stable block device - * size is already not enforced in the blkdev_direct_IO path. - * - * For DAX, it is the responsibility of the block device driver to - * ensure the whole-disk device size is stable while requests are in - * flight. - * - * Finally, unlike the filemap_page_mkwrite() case there is no - * filesystem superblock to sync against freezing. We still include a - * pfn_mkwrite callback for dax drivers to receive write fault - * notifications. 
- */ -static int blkdev_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) -{ - return __dax_fault(vma, vmf, blkdev_get_block, NULL); -} - -static int blkdev_dax_pfn_mkwrite(struct vm_area_struct *vma, - struct vm_fault *vmf) -{ - return dax_pfn_mkwrite(vma, vmf); -} - -static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd, unsigned int flags) -{ - return __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block, NULL); -} - -static const struct vm_operations_struct blkdev_dax_vm_ops = { - .fault = blkdev_dax_fault, - .pmd_fault = blkdev_dax_pmd_fault, - .pfn_mkwrite = blkdev_dax_pfn_mkwrite, -}; - -static const struct vm_operations_struct blkdev_default_vm_ops = { - .fault = filemap_fault, - .map_pages = filemap_map_pages, -}; - -static int blkdev_mmap(struct file *file, struct vm_area_struct *vma) -{ - struct inode *bd_inode = bdev_file_inode(file); - - file_accessed(file); - if (IS_DAX(bd_inode)) { - vma->vm_ops = &blkdev_dax_vm_ops; - vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; - } else { - vma->vm_ops = &blkdev_default_vm_ops; - } - - return 0; -} -#else -#define blkdev_mmap generic_file_mmap -#endif - const struct file_operations def_blk_fops = { .open = blkdev_open, .release = blkdev_close, .llseek = block_llseek, .read_iter = blkdev_read_iter, .write_iter = blkdev_write_iter, - .mmap = blkdev_mmap, + .mmap = generic_file_mmap, .fsync = blkdev_fsync, .unlocked_ioctl = block_ioctl, #ifdef CONFIG_COMPAT diff --git a/include/linux/fs.h b/include/linux/fs.h index 70e61b58baaf..8363a10660f6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2320,14 +2320,6 @@ extern struct super_block *freeze_bdev(struct block_device *); extern void emergency_thaw_all(void); extern int thaw_bdev(struct block_device *bdev, struct super_block *sb); extern int fsync_bdev(struct block_device *); -#ifdef CONFIG_FS_DAX -extern bool blkdev_dax_capable(struct block_device *bdev); -#else -static inline bool blkdev_dax_capable(struct block_device *bdev) -{ - return false; -} -#endif extern struct super_block *blockdev_superblock; diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index a079d50376e1..fbff8b28aa35 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -222,7 +222,6 @@ struct fsxattr { #define BLKSECDISCARD _IO(0x12,125) #define BLKROTATIONAL _IO(0x12,126) #define BLKZEROOUT _IO(0x12,127) -#define BLKDAXGET _IO(0x12,129) #define BMAP_IOCTL 1 /* obsolete - kept for compatibility */ #define FIBMAP _IO(0x00,1) /* bmap access */ -- cgit v1.2.3 From b354aba0165519a74f540f2ba89d7ec78efca21d Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 17 May 2016 20:24:16 -0700 Subject: libnvdimm: release ida resources ida instances allocate some internal memory for ->free_bitmap in addition to the base 'struct ida'. Use ida_destroy() to release that memory at module_exit(). 
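As a quick illustration of the leak being fixed (the names below are made up for the example, not the actual nvdimm symbols): a statically defined ida can still own a lazily allocated ->free_bitmap once ids have been handed out, so a module that uses one should pair its allocations with ida_destroy() on unload, roughly:

#include <linux/idr.h>
#include <linux/module.h>
#include <linux/slab.h>

static DEFINE_IDA(example_ida);
static int example_id = -1;

static int __init example_init(void)
{
        /* ida_simple_get() may allocate the ida's internal ->free_bitmap */
        example_id = ida_simple_get(&example_ida, 0, 0, GFP_KERNEL);
        return example_id < 0 ? example_id : 0;
}
module_init(example_init);

static void __exit example_exit(void)
{
        if (example_id >= 0)
                ida_simple_remove(&example_ida, example_id);
        ida_destroy(&example_ida);      /* releases the cached bitmap memory */
}
module_exit(example_exit);

MODULE_LICENSE("GPL v2");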
Reported-by: Johannes Thumshirn Reviewed-by: Johannes Thumshirn Signed-off-by: Dan Williams --- drivers/nvdimm/core.c | 3 +++ drivers/nvdimm/dimm_devs.c | 5 +++++ drivers/nvdimm/nd-core.h | 2 ++ drivers/nvdimm/region_devs.c | 5 +++++ 4 files changed, 15 insertions(+) diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c index 182a93fe3712..847532de087a 100644 --- a/drivers/nvdimm/core.c +++ b/drivers/nvdimm/core.c @@ -648,6 +648,9 @@ static __exit void libnvdimm_exit(void) nd_region_exit(); nvdimm_exit(); nvdimm_bus_exit(); + nd_region_devs_exit(); + nvdimm_devs_exit(); + ida_destroy(&nd_ida); } MODULE_LICENSE("GPL v2"); diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c index c56f88217924..6cca03eaa345 100644 --- a/drivers/nvdimm/dimm_devs.c +++ b/drivers/nvdimm/dimm_devs.c @@ -546,3 +546,8 @@ int nvdimm_bus_check_dimm_count(struct nvdimm_bus *nvdimm_bus, int dimm_count) return 0; } EXPORT_SYMBOL_GPL(nvdimm_bus_check_dimm_count); + +void __exit nvdimm_devs_exit(void) +{ + ida_destroy(&dimm_ida); +} diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h index cb65308c0329..4136c1a82539 100644 --- a/drivers/nvdimm/nd-core.h +++ b/drivers/nvdimm/nd-core.h @@ -49,6 +49,8 @@ bool is_nd_blk(struct device *dev); struct nvdimm_bus *walk_to_nvdimm_bus(struct device *nd_dev); int __init nvdimm_bus_init(void); void nvdimm_bus_exit(void); +void nvdimm_devs_exit(void); +void nd_region_devs_exit(void); void nd_region_probe_success(struct nvdimm_bus *nvdimm_bus, struct device *dev); struct nd_region; void nd_region_create_blk_seed(struct nd_region *nd_region); diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index 9e1b054e0e61..40fcfea26fbb 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -793,3 +793,8 @@ struct nd_region *nvdimm_volatile_region_create(struct nvdimm_bus *nvdimm_bus, __func__); } EXPORT_SYMBOL_GPL(nvdimm_volatile_region_create); + +void __exit nd_region_devs_exit(void) +{ + ida_destroy(®ion_ida); +} -- cgit v1.2.3 From c5ed9268643c7c4c9f2aaa0fd4c936095e6480ef Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 18 May 2016 14:50:12 -0700 Subject: libnvdimm, dax: autodetect support For autodetecting a previously established dax configuration we need the info block to indicate block-device vs device-dax mode, and we need to have the default namespace probe hand-off the configuration to the dax_pmem driver. 
Signed-off-by: Dan Williams --- drivers/nvdimm/dax_devs.c | 35 +++++++++++++++++++++++++++++++++++ drivers/nvdimm/nd.h | 11 +++++++++-- drivers/nvdimm/pfn.h | 1 + drivers/nvdimm/pfn_devs.c | 15 ++++++++++----- drivers/nvdimm/pmem.c | 3 ++- 5 files changed, 57 insertions(+), 8 deletions(-) diff --git a/drivers/nvdimm/dax_devs.c b/drivers/nvdimm/dax_devs.c index f90f7549e7f4..45fa82cae87c 100644 --- a/drivers/nvdimm/dax_devs.c +++ b/drivers/nvdimm/dax_devs.c @@ -15,6 +15,7 @@ #include #include #include "nd-core.h" +#include "pfn.h" #include "nd.h" static void nd_dax_release(struct device *dev) @@ -97,3 +98,37 @@ struct device *nd_dax_create(struct nd_region *nd_region) __nd_device_register(dev); return dev; } + +int nd_dax_probe(struct device *dev, struct nd_namespace_common *ndns) +{ + int rc; + struct nd_dax *nd_dax; + struct device *dax_dev; + struct nd_pfn *nd_pfn; + struct nd_pfn_sb *pfn_sb; + struct nd_region *nd_region = to_nd_region(ndns->dev.parent); + + if (ndns->force_raw) + return -ENODEV; + + nvdimm_bus_lock(&ndns->dev); + nd_dax = nd_dax_alloc(nd_region); + nd_pfn = &nd_dax->nd_pfn; + dax_dev = nd_pfn_devinit(nd_pfn, ndns); + nvdimm_bus_unlock(&ndns->dev); + if (!dax_dev) + return -ENOMEM; + pfn_sb = devm_kzalloc(dev, sizeof(*pfn_sb), GFP_KERNEL); + nd_pfn->pfn_sb = pfn_sb; + rc = nd_pfn_validate(nd_pfn, DAX_SIG); + dev_dbg(dev, "%s: dax: %s\n", __func__, + rc == 0 ? dev_name(dax_dev) : ""); + if (rc < 0) { + __nd_detach_ndns(dax_dev, &nd_pfn->ndns); + put_device(dax_dev); + } else + __nd_device_register(dax_dev); + + return rc; +} +EXPORT_SYMBOL(nd_dax_probe); diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index 46910b8f32b1..d0ac93c31dda 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -232,7 +232,7 @@ bool is_nd_pfn(struct device *dev); struct device *nd_pfn_create(struct nd_region *nd_region); struct device *nd_pfn_devinit(struct nd_pfn *nd_pfn, struct nd_namespace_common *ndns); -int nd_pfn_validate(struct nd_pfn *nd_pfn); +int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig); extern struct attribute_group nd_pfn_attribute_group; #else static inline int nd_pfn_probe(struct device *dev, @@ -251,7 +251,7 @@ static inline struct device *nd_pfn_create(struct nd_region *nd_region) return NULL; } -static inline int nd_pfn_validate(struct nd_pfn *nd_pfn) +static inline int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig) { return -ENODEV; } @@ -259,9 +259,16 @@ static inline int nd_pfn_validate(struct nd_pfn *nd_pfn) struct nd_dax *to_nd_dax(struct device *dev); #if IS_ENABLED(CONFIG_NVDIMM_DAX) +int nd_dax_probe(struct device *dev, struct nd_namespace_common *ndns); bool is_nd_dax(struct device *dev); struct device *nd_dax_create(struct nd_region *nd_region); #else +static inline int nd_dax_probe(struct device *dev, + struct nd_namespace_common *ndns) +{ + return -ENODEV; +} + static inline bool is_nd_dax(struct device *dev) { return false; diff --git a/drivers/nvdimm/pfn.h b/drivers/nvdimm/pfn.h index 9d2704c83fa7..dde9853453d3 100644 --- a/drivers/nvdimm/pfn.h +++ b/drivers/nvdimm/pfn.h @@ -19,6 +19,7 @@ #define PFN_SIG_LEN 16 #define PFN_SIG "NVDIMM_PFN_INFO\0" +#define DAX_SIG "NVDIMM_DAX_INFO\0" struct nd_pfn_sb { u8 signature[PFN_SIG_LEN]; diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index 58740d7ce81b..816cd9828ca5 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -360,7 +360,7 @@ struct device *nd_pfn_create(struct nd_region *nd_region) return dev; } -int nd_pfn_validate(struct nd_pfn 
*nd_pfn) +int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig) { u64 checksum, offset; struct nd_namespace_io *nsio; @@ -377,7 +377,7 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn) if (nvdimm_read_bytes(ndns, SZ_4K, pfn_sb, sizeof(*pfn_sb))) return -ENXIO; - if (memcmp(pfn_sb->signature, PFN_SIG, PFN_SIG_LEN) != 0) + if (memcmp(pfn_sb->signature, sig, PFN_SIG_LEN) != 0) return -ENODEV; checksum = le64_to_cpu(pfn_sb->checksum); @@ -467,7 +467,7 @@ int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns) pfn_sb = devm_kzalloc(dev, sizeof(*pfn_sb), GFP_KERNEL); nd_pfn = to_nd_pfn(pfn_dev); nd_pfn->pfn_sb = pfn_sb; - rc = nd_pfn_validate(nd_pfn); + rc = nd_pfn_validate(nd_pfn, PFN_SIG); dev_dbg(dev, "%s: pfn: %s\n", __func__, rc == 0 ? dev_name(pfn_dev) : ""); if (rc < 0) { @@ -552,6 +552,7 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn) struct nd_pfn_sb *pfn_sb; unsigned long npfns; phys_addr_t offset; + const char *sig; u64 checksum; int rc; @@ -560,7 +561,11 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn) return -ENOMEM; nd_pfn->pfn_sb = pfn_sb; - rc = nd_pfn_validate(nd_pfn); + if (is_nd_dax(&nd_pfn->dev)) + sig = DAX_SIG; + else + sig = PFN_SIG; + rc = nd_pfn_validate(nd_pfn, sig); if (rc != -ENODEV) return rc; @@ -628,7 +633,7 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn) pfn_sb->mode = cpu_to_le32(nd_pfn->mode); pfn_sb->dataoff = cpu_to_le64(offset); pfn_sb->npfns = cpu_to_le64(npfns); - memcpy(pfn_sb->signature, PFN_SIG, PFN_SIG_LEN); + memcpy(pfn_sb->signature, sig, PFN_SIG_LEN); memcpy(pfn_sb->uuid, nd_pfn->uuid, 16); memcpy(pfn_sb->parent_uuid, nd_dev_to_uuid(&ndns->dev), 16); pfn_sb->version_major = cpu_to_le16(1); diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index d9a0dbc2d023..042baec56931 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -320,7 +320,8 @@ static int nd_pmem_probe(struct device *dev) return pmem_attach_disk(dev, ndns); /* if we find a valid info-block we'll come back as that personality */ - if (nd_btt_probe(dev, ndns) == 0 || nd_pfn_probe(dev, ndns) == 0) + if (nd_btt_probe(dev, ndns) == 0 || nd_pfn_probe(dev, ndns) == 0 + || nd_dax_probe(dev, ndns) == 0) return -ENXIO; /* ...otherwise we're just a raw pmem device */ -- cgit v1.2.3 From 5e24c9fd36285535c704e84748d6c890be870fb6 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 21 May 2016 11:01:41 -0700 Subject: libnvdimm, dax: fix alignment validation Testing the dax-device autodetect support revealed a probe failure with the following result: dax0.1: bad offset: 0x8200000 dax disabled The original pfn-device implementation inferred the alignment from ilog2(offset), now that the alignment is explicit the is_power_of_2() needs replacing with a real sanity check against the recorded alignment. Otherwise the alignment check is useless in the implicit case and only the minimum size of the offset matters. This self-consistency check is further validated by the probe path that will re-check that the offset is large enough to contain all the metadata required to enable the device. 
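To make the failure concrete: the offset in the error message above, 0x8200000, is 130 MiB, i.e. 65 * 2 MiB. Assuming a 2 MiB pfn_sb->align (as in the reported configuration), that layout is correctly aligned yet not a power of two, so the old is_power_of_2(offset) test rejects it while the replacement IS_ALIGNED() test accepts it. A standalone sketch of the two checks, in userspace C for illustration only:

#include <stdbool.h>
#include <stdio.h>

int main(void)
{
        unsigned long offset = 0x8200000UL;     /* 130 MiB, from the log above */
        unsigned long align  = 0x200000UL;      /* assumed 2 MiB info-block alignment */

        bool old_check = offset && !(offset & (offset - 1));    /* is_power_of_2(offset) */
        bool new_check = !(offset & (align - 1));               /* IS_ALIGNED(offset, align) */

        printf("power-of-2: %d, 2MiB-aligned: %d\n", old_check, new_check);
        return 0;       /* prints "power-of-2: 0, 2MiB-aligned: 1" */
}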
Cc: Signed-off-by: Dan Williams --- drivers/nvdimm/pfn_devs.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index 816cd9828ca5..04f71d6d304d 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -416,6 +416,8 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig) return -ENODEV; } + if (nd_pfn->align == 0) + nd_pfn->align = le32_to_cpu(pfn_sb->align); if (nd_pfn->align > nvdimm_namespace_capacity(ndns)) { dev_err(&nd_pfn->dev, "alignment: %lx exceeds capacity %llx\n", nd_pfn->align, nvdimm_namespace_capacity(ndns)); @@ -436,8 +438,8 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig) return -EBUSY; } - nd_pfn->align = le32_to_cpu(pfn_sb->align); - if (!is_power_of_2(offset) || offset < PAGE_SIZE) { + if ((nd_pfn->align && !IS_ALIGNED(offset, nd_pfn->align)) + || !IS_ALIGNED(offset, PAGE_SIZE)) { dev_err(&nd_pfn->dev, "bad offset: %#llx dax disabled\n", offset); return -ENXIO; -- cgit v1.2.3
From 03dca343afe080968d90c4d9196404b5bbbc8461 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 21 May 2016 12:22:41 -0700 Subject: libnvdimm, dax: fix deletion The ndctl unit tests discovered that the dax enabling omitted updates to nd_detach_and_reset(). This routine clears the device configuration when the namespace is detached. Without this clearing, userspace may assume that the device is in the process of being configured by another agent in the system. Signed-off-by: Dan Williams --- drivers/nvdimm/claim.c | 23 +++++++++++++++++++++-- drivers/nvdimm/nd-core.h | 1 + drivers/nvdimm/pfn_devs.c | 19 ------------------- 3 files changed, 22 insertions(+), 21 deletions(-) diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c index 5f53db59a058..8b2e3c4fb0ad 100644 --- a/drivers/nvdimm/claim.c +++ b/drivers/nvdimm/claim.c @@ -93,6 +93,25 @@ static bool is_idle(struct device *dev, struct nd_namespace_common *ndns) return true; } +struct nd_pfn *to_nd_pfn_safe(struct device *dev) +{ + /* + * pfn device attributes are re-used by dax device instances, so we + * need to be careful to correct device-to-nd_pfn conversion.
+ */ + if (is_nd_pfn(dev)) + return to_nd_pfn(dev); + + if (is_nd_dax(dev)) { + struct nd_dax *nd_dax = to_nd_dax(dev); + + return &nd_dax->nd_pfn; + } + + WARN_ON(1); + return NULL; +} + static void nd_detach_and_reset(struct device *dev, struct nd_namespace_common **_ndns) { @@ -106,8 +125,8 @@ static void nd_detach_and_reset(struct device *dev, nd_btt->lbasize = 0; kfree(nd_btt->uuid); nd_btt->uuid = NULL; - } else if (is_nd_pfn(dev)) { - struct nd_pfn *nd_pfn = to_nd_pfn(dev); + } else if (is_nd_pfn(dev) || is_nd_dax(dev)) { + struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev); kfree(nd_pfn->uuid); nd_pfn->uuid = NULL; diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h index 4136c1a82539..6c42eda025f9 100644 --- a/drivers/nvdimm/nd-core.h +++ b/drivers/nvdimm/nd-core.h @@ -94,4 +94,5 @@ bool __nd_attach_ndns(struct device *dev, struct nd_namespace_common *attach, ssize_t nd_namespace_store(struct device *dev, struct nd_namespace_common **_ndns, const char *buf, size_t len); +struct nd_pfn *to_nd_pfn_safe(struct device *dev); #endif /* __ND_CORE_H__ */ diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index 04f71d6d304d..436191c47077 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -54,25 +54,6 @@ struct nd_pfn *to_nd_pfn(struct device *dev) } EXPORT_SYMBOL(to_nd_pfn); -static struct nd_pfn *to_nd_pfn_safe(struct device *dev) -{ - /* - * pfn device attributes are re-used by dax device instances, so we - * need to be careful to correct device-to-nd_pfn conversion. - */ - if (is_nd_pfn(dev)) - return to_nd_pfn(dev); - - if (is_nd_dax(dev)) { - struct nd_dax *nd_dax = to_nd_dax(dev); - - return &nd_dax->nd_pfn; - } - - WARN_ON(1); - return NULL; -} - static ssize_t mode_show(struct device *dev, struct device_attribute *attr, char *buf) { -- cgit v1.2.3