summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Stephen Rothwell <sfr@canb.auug.org.au>, 2015-02-03 17:44:37 +1100
committer: Stephen Rothwell <sfr@canb.auug.org.au>, 2015-02-03 17:44:37 +1100
commit: 0519178514c69bd15a14b8a81002b16db5f71dad (patch)
tree: c516fd3148e9aa323ffbd5a7545b550e28a45320
parent: 68a9462de925abe75a9e347d5af9d12a60a314a1 (diff)
parent: 97054281cad8234429c88eb82ecc22eaaf6d0ef5 (diff)
Merge branch 'akpm/master'
-rw-r--r-- Documentation/ABI/testing/sysfs-class-bdi | 8
-rw-r--r-- Documentation/filesystems/00-INDEX | 5
-rw-r--r-- Documentation/filesystems/Locking | 3
-rw-r--r-- Documentation/filesystems/dax.txt | 91
-rw-r--r-- Documentation/filesystems/ext2.txt | 5
-rw-r--r-- Documentation/filesystems/ext4.txt | 4
-rw-r--r-- Documentation/filesystems/vfs.txt | 7
-rw-r--r-- Documentation/filesystems/xip.txt | 71
-rw-r--r-- MAINTAINERS | 8
-rw-r--r-- arch/arm/boot/dts/zynq-parallella.dts | 2
-rw-r--r-- drivers/block/Kconfig | 13
-rw-r--r-- drivers/block/brd.c | 14
-rw-r--r-- drivers/gpio/gpio-zevio.c | 4
-rw-r--r-- drivers/rtc/rtc-isl12022.c | 3
-rw-r--r-- drivers/rtc/rtc-isl12057.c | 3
-rw-r--r-- drivers/staging/iio/light/isl29028.c | 4
-rw-r--r-- drivers/w1/w1_int.c | 3
-rw-r--r-- fs/Kconfig | 21
-rw-r--r-- fs/Makefile | 1
-rw-r--r-- fs/dax.c | 534
-rw-r--r-- fs/exofs/inode.c | 1
-rw-r--r-- fs/ext2/Kconfig | 11
-rw-r--r-- fs/ext2/Makefile | 1
-rw-r--r-- fs/ext2/ext2.h | 10
-rw-r--r-- fs/ext2/file.c | 44
-rw-r--r-- fs/ext2/inode.c | 38
-rw-r--r-- fs/ext2/namei.c | 13
-rw-r--r-- fs/ext2/super.c | 53
-rw-r--r-- fs/ext2/xip.c | 86
-rw-r--r-- fs/ext2/xip.h | 26
-rw-r--r-- fs/ext4/ext4.h | 6
-rw-r--r-- fs/ext4/file.c | 49
-rw-r--r-- fs/ext4/indirect.c | 18
-rw-r--r-- fs/ext4/inode.c | 89
-rw-r--r-- fs/ext4/namei.c | 10
-rw-r--r-- fs/ext4/super.c | 39
-rw-r--r-- fs/ocfs2/aops.c | 242
-rw-r--r-- fs/ocfs2/file.c | 76
-rw-r--r-- fs/ocfs2/file.h | 9
-rw-r--r-- fs/ocfs2/inode.c | 2
-rw-r--r-- fs/ocfs2/inode.h | 2
-rw-r--r-- fs/ocfs2/journal.c | 110
-rw-r--r-- fs/ocfs2/journal.h | 5
-rw-r--r-- fs/ocfs2/namei.c | 284
-rw-r--r-- fs/ocfs2/namei.h | 8
-rw-r--r-- fs/ocfs2/ocfs2.h | 23
-rw-r--r-- fs/ocfs2/ocfs2_fs.h | 14
-rw-r--r-- fs/ocfs2/super.c | 2
-rw-r--r-- fs/open.c | 5
-rw-r--r-- include/linux/fs.h | 34
-rw-r--r-- include/linux/mm.h | 1
-rw-r--r-- include/linux/rmap.h | 2
-rw-r--r-- mm/Makefile | 1
-rw-r--r-- mm/backing-dev.c | 35
-rw-r--r-- mm/fadvise.c | 6
-rw-r--r-- mm/filemap.c | 25
-rw-r--r-- mm/filemap_xip.c | 478
-rw-r--r-- mm/madvise.c | 2
-rw-r--r-- mm/memory.c | 42
-rwxr-xr-x scripts/diffconfig | 1
60 files changed, 1786 insertions, 921 deletions
diff --git a/Documentation/ABI/testing/sysfs-class-bdi b/Documentation/ABI/testing/sysfs-class-bdi
index d773d5697cf5..3187a18af6da 100644
--- a/Documentation/ABI/testing/sysfs-class-bdi
+++ b/Documentation/ABI/testing/sysfs-class-bdi
@@ -53,3 +53,11 @@ stable_pages_required (read-only)
If set, the backing device requires that all pages comprising a write
request must not be changed until writeout is complete.
+
+strictlimit (read-write)
+
+ Forces per-BDI checks for the share of a given device in the write-back
+ cache even before the global background dirty limit is reached. This
+ is useful in situations where the global limit is much higher than is
+ affordable for a given relatively slow (or untrusted) device. Turning
+ strictlimit on has no visible effect if max_ratio is equal to 100%.
diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX
index ac28149aede4..9922939e7d99 100644
--- a/Documentation/filesystems/00-INDEX
+++ b/Documentation/filesystems/00-INDEX
@@ -34,6 +34,9 @@ configfs/
- directory containing configfs documentation and example code.
cramfs.txt
- info on the cram filesystem for small storage (ROMs etc).
+dax.txt
+ - info on avoiding the page cache for files stored on CPU-addressable
+ storage devices.
debugfs.txt
- info on the debugfs filesystem.
devpts.txt
@@ -154,5 +157,3 @@ xfs-self-describing-metadata.txt
- info on XFS Self Describing Metadata.
xfs.txt
- info and mount options for the XFS filesystem.
-xip.txt
- - info on execute-in-place for file mappings.
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index b30753cbf431..2ca3d17eee56 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -199,8 +199,6 @@ prototypes:
int (*releasepage) (struct page *, int);
void (*freepage)(struct page *);
int (*direct_IO)(int, struct kiocb *, struct iov_iter *iter, loff_t offset);
- int (*get_xip_mem)(struct address_space *, pgoff_t, int, void **,
- unsigned long *);
int (*migratepage)(struct address_space *, struct page *, struct page *);
int (*launder_page)(struct page *);
int (*is_partially_uptodate)(struct page *, unsigned long, unsigned long);
@@ -225,7 +223,6 @@ invalidatepage: yes
releasepage: yes
freepage: yes
direct_IO:
-get_xip_mem: maybe
migratepage: yes (both)
launder_page: yes
is_partially_uptodate: yes
diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt
new file mode 100644
index 000000000000..be376d91d058
--- /dev/null
+++ b/Documentation/filesystems/dax.txt
@@ -0,0 +1,91 @@
+Direct Access for files
+-----------------------
+
+Motivation
+----------
+
+The page cache is usually used to buffer reads and writes to files.
+It is also used to provide the pages which are mapped into userspace
+by a call to mmap.
+
+For block devices that are memory-like, the page cache pages would be
+unnecessary copies of the original storage. The DAX code removes the
+extra copy by performing reads and writes directly to the storage device.
+For file mappings, the storage device is mapped directly into userspace.
+
+
+Usage
+-----
+
+If you have a block device which supports DAX, you can make a filesystem
+on it as usual. When mounting it by hand, use the '-o dax' option,
+or add 'dax' to the options in /etc/fstab.
+
+
+Implementation Tips for Block Driver Writers
+--------------------------------------------
+
+To support DAX in your block driver, implement the 'direct_access'
+block device operation. It is used to translate the sector number
+(expressed in units of 512-byte sectors) to a page frame number (pfn)
+that identifies the physical page for the memory. It also returns a
+kernel virtual address that can be used to access the memory.
+
+The direct_access method takes a 'size' parameter that indicates the
+number of bytes being requested. The function should return the number
+of bytes that can be contiguously accessed at that offset. It may also
+return a negative errno if an error occurs.
+
+In order to support this method, the storage must be byte-accessible by
+the CPU at all times. If your device uses paging techniques to expose
+a large amount of memory through a smaller window, then you cannot
+implement direct_access. Equally, if your device can occasionally
+stall the CPU for an extended period, you should also not attempt to
+implement direct_access.
+
+These block devices may be used for inspiration:
+- axonram: Axon DDR2 device driver
+- brd: RAM backed block device driver
+- dcssblk: s390 dcss block device driver
+
+
+Implementation Tips for Filesystem Writers
+------------------------------------------
+
+Filesystem support consists of
+- adding support to mark inodes as being DAX by setting the S_DAX flag in
+ i_flags
+- implementing the direct_IO address space operation, and calling
+ dax_do_io() instead of blockdev_direct_IO() if S_DAX is set
+- implementing an mmap file operation for DAX files which sets the
+ VM_MIXEDMAP flag on the VMA, and setting the vm_ops to include handlers
+ for fault and page_mkwrite (which should probably call dax_fault() and
+ dax_mkwrite(), passing the appropriate get_block() callback)
+- calling dax_truncate_page() instead of block_truncate_page() for DAX files
+- calling dax_zero_page_range() instead of zero_user() for DAX files
+- ensuring that there is sufficient locking between reads, writes,
+ truncates and page faults
+
+The get_block() callback passed to the DAX functions may return
+uninitialised extents. If it does, it must ensure that simultaneous
+calls to get_block() (for example by a page-fault racing with a read()
+or a write()) work correctly.
+
+These filesystems may be used for inspiration:
+- ext2: the second extended filesystem, see Documentation/filesystems/ext2.txt
+- ext4: the fourth extended filesystem, see Documentation/filesystems/ext4.txt
+
+
+Shortcomings
+------------
+
+Even if the kernel or its modules are stored on a filesystem that supports
+DAX on a block device that supports DAX, they will still be copied into RAM.
+
+Calling get_user_pages() on a range of user memory that has been mmapped
+from a DAX file will fail as there is no 'struct page' to describe
+those pages. This problem is being worked on. That means that O_DIRECT
+reads/writes to those memory ranges from a non-DAX file will fail (note
+that O_DIRECT reads/writes _of a DAX file_ do work; it is the memory
+that is being accessed that is key here). Other things that will not
+work include RDMA, sendfile() and splice().
diff --git a/Documentation/filesystems/ext2.txt b/Documentation/filesystems/ext2.txt
index 67639f905f10..b9714569e472 100644
--- a/Documentation/filesystems/ext2.txt
+++ b/Documentation/filesystems/ext2.txt
@@ -20,6 +20,9 @@ minixdf Makes `df' act like Minix.
check=none, nocheck (*) Don't do extra checking of bitmaps on mount
(check=normal and check=strict options removed)
+dax Use direct access (no page cache). See
+ Documentation/filesystems/dax.txt.
+
debug Extra debugging information is sent to the
kernel syslog. Useful for developers.
@@ -56,8 +59,6 @@ noacl Don't support POSIX ACLs.
nobh Do not attach buffer_heads to file pagecache.
-xip Use execute in place (no caching) if possible
-
grpquota,noquota,quota,usrquota Quota options are silently ignored by ext2.
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index 919a3293aaa4..6c0108eb0137 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -386,6 +386,10 @@ max_dir_size_kb=n This limits the size of directories so that any
i_version Enable 64-bit inode version support. This option is
off by default.
+dax Use direct access (no page cache). See
+ Documentation/filesystems/dax.txt. Note that
+ this option is incompatible with data=journal.
+
Data Mode
=========
There are 3 different data modes:
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 43ce0507ee25..966b22829f3b 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -591,8 +591,6 @@ struct address_space_operations {
int (*releasepage) (struct page *, int);
void (*freepage)(struct page *);
ssize_t (*direct_IO)(int, struct kiocb *, struct iov_iter *iter, loff_t offset);
- struct page* (*get_xip_page)(struct address_space *, sector_t,
- int);
/* migrate the contents of a page to the specified target */
int (*migratepage) (struct page *, struct page *);
int (*launder_page) (struct page *);
@@ -748,11 +746,6 @@ struct address_space_operations {
and transfer data directly between the storage and the
application's address space.
- get_xip_page: called by the VM to translate a block number to a page.
- The page is valid until the corresponding filesystem is unmounted.
- Filesystems that want to use execute-in-place (XIP) need to implement
- it. An example implementation can be found in fs/ext2/xip.c.
-
migrate_page: This is used to compact the physical memory usage.
If the VM wants to relocate a page (maybe off a memory card
that is signalling imminent failure) it will pass a new page
diff --git a/Documentation/filesystems/xip.txt b/Documentation/filesystems/xip.txt
deleted file mode 100644
index b77472949ede..000000000000
--- a/Documentation/filesystems/xip.txt
+++ /dev/null
@@ -1,71 +0,0 @@
-Execute-in-place for file mappings
-----------------------------------
-
-Motivation
-----------
-File mappings are performed by mapping page cache pages to userspace. In
-addition, read&write type file operations also transfer data from/to the page
-cache.
-
-For memory backed storage devices that use the block device interface, the page
-cache pages are in fact copies of the original storage. Various approaches
-exist to work around the need for an extra copy. The ramdisk driver for example
-does read the data into the page cache, keeps a reference, and discards the
-original data behind later on.
-
-Execute-in-place solves this issue the other way around: instead of keeping
-data in the page cache, the need to have a page cache copy is eliminated
-completely. With execute-in-place, read&write type operations are performed
-directly from/to the memory backed storage device. For file mappings, the
-storage device itself is mapped directly into userspace.
-
-This implementation was initially written for shared memory segments between
-different virtual machines on s390 hardware to allow multiple machines to
-share the same binaries and libraries.
-
-Implementation
---------------
-Execute-in-place is implemented in three steps: block device operation,
-address space operation, and file operations.
-
-A block device operation named direct_access is used to translate the
-block device sector number to a page frame number (pfn) that identifies
-the physical page for the memory. It also returns a kernel virtual
-address that can be used to access the memory.
-
-The direct_access method takes a 'size' parameter that indicates the
-number of bytes being requested. The function should return the number
-of bytes that can be contiguously accessed at that offset. It may also
-return a negative errno if an error occurs.
-
-The block device operation is optional, these block devices support it as of
-today:
-- dcssblk: s390 dcss block device driver
-
-An address space operation named get_xip_mem is used to retrieve references
-to a page frame number and a kernel address. To obtain these values a reference
-to an address_space is provided. This function assigns values to the kmem and
-pfn parameters. The third argument indicates whether the function should allocate
-blocks if needed.
-
-This address space operation is mutually exclusive with readpage&writepage that
-do page cache read/write operations.
-The following filesystems support it as of today:
-- ext2: the second extended filesystem, see Documentation/filesystems/ext2.txt
-
-A set of file operations that do utilize get_xip_page can be found in
-mm/filemap_xip.c . The following file operation implementations are provided:
-- aio_read/aio_write
-- readv/writev
-- sendfile
-
-The generic file operations do_sync_read/do_sync_write can be used to implement
-classic synchronous IO calls.
-
-Shortcomings
-------------
-This implementation is limited to storage devices that are cpu addressable at
-all times (no highmem or such). It works well on rom/ram, but enhancements are
-needed to make it work with flash in read+write mode.
-Putting the Linux kernel and/or its modules on a xip filesystem does not mean
-they are not copied.
diff --git a/MAINTAINERS b/MAINTAINERS
index daa05d3ad25b..089039bd4956 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -34,7 +34,7 @@ trivial patch so apply some common sense.
generalized kernel feature ready for next time.
PLEASE check your patch with the automated style checker
- (scripts/checkpatch.pl) to catch trival style violations.
+ (scripts/checkpatch.pl) to catch trivial style violations.
See Documentation/CodingStyle for guidance here.
PLEASE CC: the maintainers and mailing lists that are generated
@@ -3151,6 +3151,12 @@ L: linux-i2c@vger.kernel.org
S: Maintained
F: drivers/i2c/busses/i2c-diolan-u2c.c
+DIRECT ACCESS (DAX)
+M: Matthew Wilcox <willy@linux.intel.com>
+L: linux-fsdevel@vger.kernel.org
+S: Supported
+F: fs/dax.c
+
DIRECTORY NOTIFICATION (DNOTIFY)
M: Eric Paris <eparis@parisplace.org>
S: Maintained
diff --git a/arch/arm/boot/dts/zynq-parallella.dts b/arch/arm/boot/dts/zynq-parallella.dts
index ab1dc0a56cdd..174571232ea5 100644
--- a/arch/arm/boot/dts/zynq-parallella.dts
+++ b/arch/arm/boot/dts/zynq-parallella.dts
@@ -58,7 +58,7 @@
status = "okay";
isl9305: isl9305@68 {
- compatible = "isl,isl9305";
+ compatible = "isil,isl9305";
reg = <0x68>;
regulators {
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 014a1cfc41c5..1b8094d4d7af 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -393,14 +393,15 @@ config BLK_DEV_RAM_SIZE
The default value is 4096 kilobytes. Only change this if you know
what you are doing.
-config BLK_DEV_XIP
- bool "Support XIP filesystems on RAM block device"
- depends on BLK_DEV_RAM
+config BLK_DEV_RAM_DAX
+ bool "Support Direct Access (DAX) to RAM block devices"
+ depends on BLK_DEV_RAM && FS_DAX
default n
help
- Support XIP filesystems (such as ext2 with XIP support on) on
- top of block ram device. This will slightly enlarge the kernel, and
- will prevent RAM block device backing store memory from being
+ Support filesystems using DAX to access RAM block devices. This
+ avoids double-buffering data in the page cache before copying it
+ to the block device. Answering Y will slightly enlarge the kernel,
+ and will prevent RAM block device backing store memory from being
allocated from highmem (only a problem for highmem systems).
config CDROM_PKTCDVD
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index c01b921b1b4a..64ab4951e9d6 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -97,13 +97,13 @@ static struct page *brd_insert_page(struct brd_device *brd, sector_t sector)
* Must use NOIO because we don't want to recurse back into the
* block or filesystem layers from page reclaim.
*
- * Cannot support XIP and highmem, because our ->direct_access
- * routine for XIP must return memory that is always addressable.
- * If XIP was reworked to use pfns and kmap throughout, this
+ * Cannot support DAX and highmem, because our ->direct_access
+ * routine for DAX must return memory that is always addressable.
+ * If DAX was reworked to use pfns and kmap throughout, this
* restriction might be able to be lifted.
*/
gfp_flags = GFP_NOIO | __GFP_ZERO;
-#ifndef CONFIG_BLK_DEV_XIP
+#ifndef CONFIG_BLK_DEV_RAM_DAX
gfp_flags |= __GFP_HIGHMEM;
#endif
page = alloc_page(gfp_flags);
@@ -369,7 +369,7 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector,
return err;
}
-#ifdef CONFIG_BLK_DEV_XIP
+#ifdef CONFIG_BLK_DEV_RAM_DAX
static long brd_direct_access(struct block_device *bdev, sector_t sector,
void **kaddr, unsigned long *pfn, long size)
{
@@ -390,6 +390,8 @@ static long brd_direct_access(struct block_device *bdev, sector_t sector,
*/
return PAGE_SIZE;
}
+#else
+#define brd_direct_access NULL
#endif
static int brd_ioctl(struct block_device *bdev, fmode_t mode,
@@ -430,9 +432,7 @@ static const struct block_device_operations brd_fops = {
.owner = THIS_MODULE,
.rw_page = brd_rw_page,
.ioctl = brd_ioctl,
-#ifdef CONFIG_BLK_DEV_XIP
.direct_access = brd_direct_access,
-#endif
};
/*
diff --git a/drivers/gpio/gpio-zevio.c b/drivers/gpio/gpio-zevio.c
index 6f02d7c4cc57..6e4fb2a470f8 100644
--- a/drivers/gpio/gpio-zevio.c
+++ b/drivers/gpio/gpio-zevio.c
@@ -18,6 +18,10 @@
#include <linux/slab.h>
#include <linux/gpio.h>
+#ifndef IOMEM
+#define IOMEM(x) ((void __force __iomem *)(x))
+#endif
+
/*
* Memory layout:
* This chip has four gpio sections, each controls 8 GPIOs.
diff --git a/drivers/rtc/rtc-isl12022.c b/drivers/rtc/rtc-isl12022.c
index ee3ba7e6b45e..f9b082784b90 100644
--- a/drivers/rtc/rtc-isl12022.c
+++ b/drivers/rtc/rtc-isl12022.c
@@ -275,7 +275,8 @@ static int isl12022_probe(struct i2c_client *client,
#ifdef CONFIG_OF
static const struct of_device_id isl12022_dt_match[] = {
- { .compatible = "isl,isl12022" },
+ { .compatible = "isl,isl12022" }, /* for backward compat., don't use */
+ { .compatible = "isil,isl12022" },
{ },
};
#endif
diff --git a/drivers/rtc/rtc-isl12057.c b/drivers/rtc/rtc-isl12057.c
index b8f862953f7f..da818d3337ce 100644
--- a/drivers/rtc/rtc-isl12057.c
+++ b/drivers/rtc/rtc-isl12057.c
@@ -644,7 +644,8 @@ static SIMPLE_DEV_PM_OPS(isl12057_rtc_pm_ops, isl12057_rtc_suspend,
#ifdef CONFIG_OF
static const struct of_device_id isl12057_dt_match[] = {
- { .compatible = "isl,isl12057" },
+ { .compatible = "isl,isl12057" }, /* for backward compat., don't use */
+ { .compatible = "isil,isl12057" },
{ },
};
#endif
diff --git a/drivers/staging/iio/light/isl29028.c b/drivers/staging/iio/light/isl29028.c
index e969107ddb47..6440e3b293ca 100644
--- a/drivers/staging/iio/light/isl29028.c
+++ b/drivers/staging/iio/light/isl29028.c
@@ -537,8 +537,8 @@ static const struct i2c_device_id isl29028_id[] = {
MODULE_DEVICE_TABLE(i2c, isl29028_id);
static const struct of_device_id isl29028_of_match[] = {
- { .compatible = "isl,isl29028", },
- { .compatible = "isil,isl29028", },/* deprecated, don't use */
+ { .compatible = "isl,isl29028", }, /* for backward compat., don't use */
+ { .compatible = "isil,isl29028", },
{ },
};
MODULE_DEVICE_TABLE(of, isl29028_of_match);
diff --git a/drivers/w1/w1_int.c b/drivers/w1/w1_int.c
index 47249a30eae3..20f766afa4c7 100644
--- a/drivers/w1/w1_int.c
+++ b/drivers/w1/w1_int.c
@@ -91,8 +91,7 @@ static struct w1_master *w1_alloc_dev(u32 id, int slave_count, int slave_ttl,
err = device_register(&dev->dev);
if (err) {
pr_err("Failed to register master device. err=%d\n", err);
- memset(dev, 0, sizeof(struct w1_master));
- kfree(dev);
+ put_device(&dev->dev);
dev = NULL;
}
diff --git a/fs/Kconfig b/fs/Kconfig
index 664991afe0c0..560971c20f12 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -13,13 +13,6 @@ if BLOCK
source "fs/ext2/Kconfig"
source "fs/ext3/Kconfig"
source "fs/ext4/Kconfig"
-
-config FS_XIP
-# execute in place
- bool
- depends on EXT2_FS_XIP
- default y
-
source "fs/jbd/Kconfig"
source "fs/jbd2/Kconfig"
@@ -40,6 +33,20 @@ source "fs/ocfs2/Kconfig"
source "fs/btrfs/Kconfig"
source "fs/nilfs2/Kconfig"
+config FS_DAX
+ bool "Direct Access (DAX) support"
+ depends on MMU
+ help
+ Direct Access (DAX) can be used on memory-backed block devices.
+ If the block device supports DAX and the filesystem supports DAX,
+ then you can avoid using the pagecache to buffer I/Os. Turning
+ on this option will compile in support for DAX; you will need to
+ mount the filesystem using the -o dax option.
+
+ If you do not have a block device that is capable of using this,
+ or if unsure, say N. Saying Y will increase the size of the kernel
+ by about 5kB.
+
endif # BLOCK
# Posix ACL utility routines
diff --git a/fs/Makefile b/fs/Makefile
index bedff48e8fdc..0f4635f7c49c 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -28,6 +28,7 @@ obj-$(CONFIG_SIGNALFD) += signalfd.o
obj-$(CONFIG_TIMERFD) += timerfd.o
obj-$(CONFIG_EVENTFD) += eventfd.o
obj-$(CONFIG_AIO) += aio.o
+obj-$(CONFIG_FS_DAX) += dax.o
obj-$(CONFIG_FILE_LOCKING) += locks.o
obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o
obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o
diff --git a/fs/dax.c b/fs/dax.c
new file mode 100644
index 000000000000..ed1619ec6537
--- /dev/null
+++ b/fs/dax.c
@@ -0,0 +1,534 @@
+/*
+ * fs/dax.c - Direct Access filesystem code
+ * Copyright (c) 2013-2014 Intel Corporation
+ * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
+ * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/atomic.h>
+#include <linux/blkdev.h>
+#include <linux/buffer_head.h>
+#include <linux/fs.h>
+#include <linux/genhd.h>
+#include <linux/highmem.h>
+#include <linux/memcontrol.h>
+#include <linux/mm.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/uio.h>
+#include <linux/vmstat.h>
+
+int dax_clear_blocks(struct inode *inode, sector_t block, long size)
+{
+ struct block_device *bdev = inode->i_sb->s_bdev;
+ sector_t sector = block << (inode->i_blkbits - 9);
+
+ might_sleep();
+ do {
+ void *addr;
+ unsigned long pfn;
+ long count;
+
+ count = bdev_direct_access(bdev, sector, &addr, &pfn, size);
+ if (count < 0)
+ return count;
+ BUG_ON(size < count);
+ while (count > 0) {
+ unsigned pgsz = PAGE_SIZE - offset_in_page(addr);
+ if (pgsz > count)
+ pgsz = count;
+ if (pgsz < PAGE_SIZE)
+ memset(addr, 0, pgsz);
+ else
+ clear_page(addr);
+ addr += pgsz;
+ size -= pgsz;
+ count -= pgsz;
+ BUG_ON(pgsz & 511);
+ sector += pgsz / 512;
+ cond_resched();
+ }
+ } while (size);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dax_clear_blocks);
+
+static long dax_get_addr(struct buffer_head *bh, void **addr, unsigned blkbits)
+{
+ unsigned long pfn;
+ sector_t sector = bh->b_blocknr << (blkbits - 9);
+ return bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size);
+}
+
+static void dax_new_buf(void *addr, unsigned size, unsigned first, loff_t pos,
+ loff_t end)
+{
+ loff_t final = end - pos + first; /* The final byte of the buffer */
+
+ if (first > 0)
+ memset(addr, 0, first);
+ if (final < size)
+ memset(addr + final, 0, size - final);
+}
+
+static bool buffer_written(struct buffer_head *bh)
+{
+ return buffer_mapped(bh) && !buffer_unwritten(bh);
+}
+
+/*
+ * When ext4 encounters a hole, it returns without modifying the buffer_head
+ * which means that we can't trust b_size. To cope with this, we set b_state
+ * to 0 before calling get_block and, if any bit is set, we know we can trust
+ * b_size. Unfortunate, really, since ext4 knows precisely how long a hole is
+ * and reporting that would save us from calling get_block repeatedly.
+ */
+static bool buffer_size_valid(struct buffer_head *bh)
+{
+ return bh->b_state != 0;
+}
+
+static ssize_t dax_io(int rw, struct inode *inode, struct iov_iter *iter,
+ loff_t start, loff_t end, get_block_t get_block,
+ struct buffer_head *bh)
+{
+ ssize_t retval = 0;
+ loff_t pos = start;
+ loff_t max = start;
+ loff_t bh_max = start;
+ void *addr;
+ bool hole = false;
+
+ if (rw != WRITE)
+ end = min(end, i_size_read(inode));
+
+ while (pos < end) {
+ unsigned len;
+ if (pos == max) {
+ unsigned blkbits = inode->i_blkbits;
+ sector_t block = pos >> blkbits;
+ unsigned first = pos - (block << blkbits);
+ long size;
+
+ if (pos == bh_max) {
+ bh->b_size = PAGE_ALIGN(end - pos);
+ bh->b_state = 0;
+ retval = get_block(inode, block, bh,
+ rw == WRITE);
+ if (retval)
+ break;
+ if (!buffer_size_valid(bh))
+ bh->b_size = 1 << blkbits;
+ bh_max = pos - first + bh->b_size;
+ } else {
+ unsigned done = bh->b_size -
+ (bh_max - (pos - first));
+ bh->b_blocknr += done >> blkbits;
+ bh->b_size -= done;
+ }
+
+ hole = (rw != WRITE) && !buffer_written(bh);
+ if (hole) {
+ addr = NULL;
+ size = bh->b_size - first;
+ } else {
+ retval = dax_get_addr(bh, &addr, blkbits);
+ if (retval < 0)
+ break;
+ if (buffer_unwritten(bh) || buffer_new(bh))
+ dax_new_buf(addr, retval, first, pos,
+ end);
+ addr += first;
+ size = retval - first;
+ }
+ max = min(pos + size, end);
+ }
+
+ if (rw == WRITE)
+ len = copy_from_iter(addr, max - pos, iter);
+ else if (!hole)
+ len = copy_to_iter(addr, max - pos, iter);
+ else
+ len = iov_iter_zero(max - pos, iter);
+
+ if (!len)
+ break;
+
+ pos += len;
+ addr += len;
+ }
+
+ return (pos == start) ? retval : pos - start;
+}
+
+/**
+ * dax_do_io - Perform I/O to a DAX file
+ * @rw: READ to read or WRITE to write
+ * @iocb: The control block for this I/O
+ * @inode: The file which the I/O is directed at
+ * @iter: The addresses to do I/O from or to
+ * @pos: The file offset where the I/O starts
+ * @get_block: The filesystem method used to translate file offsets to blocks
+ * @end_io: A filesystem callback for I/O completion
+ * @flags: See below
+ *
+ * This function uses the same locking scheme as do_blockdev_direct_IO:
+ * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
+ * caller for writes. For reads, we take and release the i_mutex ourselves.
+ * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
+ * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
+ * is in progress.
+ */
+ssize_t dax_do_io(int rw, struct kiocb *iocb, struct inode *inode,
+ struct iov_iter *iter, loff_t pos,
+ get_block_t get_block, dio_iodone_t end_io, int flags)
+{
+ struct buffer_head bh;
+ ssize_t retval = -EINVAL;
+ loff_t end = pos + iov_iter_count(iter);
+
+ memset(&bh, 0, sizeof(bh));
+
+ if ((flags & DIO_LOCKING) && (rw == READ)) {
+ struct address_space *mapping = inode->i_mapping;
+ mutex_lock(&inode->i_mutex);
+ retval = filemap_write_and_wait_range(mapping, pos, end - 1);
+ if (retval) {
+ mutex_unlock(&inode->i_mutex);
+ goto out;
+ }
+ }
+
+ /* Protects against truncate */
+ atomic_inc(&inode->i_dio_count);
+
+ retval = dax_io(rw, inode, iter, pos, end, get_block, &bh);
+
+ if ((flags & DIO_LOCKING) && (rw == READ))
+ mutex_unlock(&inode->i_mutex);
+
+ if ((retval > 0) && end_io)
+ end_io(iocb, pos, retval, bh.b_private);
+
+ inode_dio_done(inode);
+ out:
+ return retval;
+}
+EXPORT_SYMBOL_GPL(dax_do_io);
+
+/*
+ * The user has performed a load from a hole in the file. Allocating
+ * a new page in the file would cause excessive storage usage for
+ * workloads with sparse files. We allocate a page cache page instead.
+ * We'll kick it out of the page cache if it's ever written to,
+ * otherwise it will simply fall out of the page cache under memory
+ * pressure without ever having been dirtied.
+ */
+static int dax_load_hole(struct address_space *mapping, struct page *page,
+ struct vm_fault *vmf)
+{
+ unsigned long size;
+ struct inode *inode = mapping->host;
+ if (!page)
+ page = find_or_create_page(mapping, vmf->pgoff,
+ GFP_KERNEL | __GFP_ZERO);
+ if (!page)
+ return VM_FAULT_OOM;
+ /* Recheck i_size under page lock to avoid truncate race */
+ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ if (vmf->pgoff >= size) {
+ unlock_page(page);
+ page_cache_release(page);
+ return VM_FAULT_SIGBUS;
+ }
+
+ vmf->page = page;
+ return VM_FAULT_LOCKED;
+}
+
+static int copy_user_bh(struct page *to, struct buffer_head *bh,
+ unsigned blkbits, unsigned long vaddr)
+{
+ void *vfrom, *vto;
+ if (dax_get_addr(bh, &vfrom, blkbits) < 0)
+ return -EIO;
+ vto = kmap_atomic(to);
+ copy_user_page(vto, vfrom, vaddr, to);
+ kunmap_atomic(vto);
+ return 0;
+}
+
+static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
+ struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct address_space *mapping = inode->i_mapping;
+ sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
+ unsigned long vaddr = (unsigned long)vmf->virtual_address;
+ void *addr;
+ unsigned long pfn;
+ pgoff_t size;
+ int error;
+
+ i_mmap_lock_read(mapping);
+
+ /*
+ * Check truncate didn't happen while we were allocating a block.
+ * If it did, this block may or may not be still allocated to the
+ * file. We can't tell the filesystem to free it because we can't
+ * take i_mutex here. In the worst case, the file still has blocks
+ * allocated past the end of the file.
+ */
+ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ if (unlikely(vmf->pgoff >= size)) {
+ error = -EIO;
+ goto out;
+ }
+
+ error = bdev_direct_access(bh->b_bdev, sector, &addr, &pfn, bh->b_size);
+ if (error < 0)
+ goto out;
+ if (error < PAGE_SIZE) {
+ error = -EIO;
+ goto out;
+ }
+
+ if (buffer_unwritten(bh) || buffer_new(bh))
+ clear_page(addr);
+
+ error = vm_insert_mixed(vma, vaddr, pfn);
+
+ out:
+ i_mmap_unlock_read(mapping);
+
+ if (bh->b_end_io)
+ bh->b_end_io(bh, 1);
+
+ return error;
+}
+/*
+ * do_dax_fault - core of the DAX page fault handler
+ * @vma: the VMA the fault occurred in
+ * @vmf: the fault description
+ * @get_block: filesystem callback that maps a file block into a buffer_head
+ *
+ * Outline, as implemented below:
+ * 1. A fault at or beyond i_size gets VM_FAULT_SIGBUS.
+ * 2. Any page already in the page cache at this offset is locked and
+ * i_size is rechecked under the page lock.
+ * 3. get_block is called without create. For a block that is neither
+ * mapped nor unwritten (and with no cow_page), a write fault calls
+ * get_block again with create set and is accounted as a major fault;
+ * a read fault is handed to dax_load_hole().
+ * 4. With vmf->cow_page set, the COW page is filled from the block or
+ * zeroed and VM_FAULT_LOCKED is returned.
+ * 5. Otherwise any page-cache page at this offset is unmapped and removed
+ * from the page cache, and dax_insert_mapping() installs the mapping.
+ *
+ * Returns a VM_FAULT_* code. On the common exit path -ENOMEM becomes
+ * VM_FAULT_OOM, any other negative error except -EBUSY becomes
+ * VM_FAULT_SIGBUS, and everything else VM_FAULT_NOPAGE; each is ORed with
+ * VM_FAULT_MAJOR once the allocating get_block call has been made.
+ */
+static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+ get_block_t get_block)
+{
+ struct file *file = vma->vm_file;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ struct page *page;
+ struct buffer_head bh;
+ unsigned long vaddr = (unsigned long)vmf->virtual_address;
+ unsigned blkbits = inode->i_blkbits;
+ sector_t block;
+ pgoff_t size;
+ int error;
+ int major = 0;
+
+ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; /* file size in pages, rounded up */
+ if (vmf->pgoff >= size)
+ return VM_FAULT_SIGBUS;
+
+ memset(&bh, 0, sizeof(bh));
+ block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits); /* page offset -> first fs block of that page */
+ bh.b_size = PAGE_SIZE; /* request a mapping covering one whole page */
+
+ repeat:
+ page = find_get_page(mapping, vmf->pgoff);
+ if (page) {
+ if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
+ page_cache_release(page);
+ return VM_FAULT_RETRY;
+ }
+ if (unlikely(page->mapping != mapping)) { /* page no longer belongs to this mapping: look it up again */
+ unlock_page(page);
+ page_cache_release(page);
+ goto repeat;
+ }
+ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ if (unlikely(vmf->pgoff >= size)) {
+ /*
+ * We have a struct page covering a hole in the file
+ * from a read fault and we've raced with a truncate
+ */
+ error = -EIO;
+ goto unlock_page;
+ }
+ }
+
+ error = get_block(inode, block, &bh, 0); /* lookup only (create == 0) */
+ if (!error && (bh.b_size < PAGE_SIZE))
+ error = -EIO; /* fs corruption? */
+ if (error)
+ goto unlock_page;
+
+ if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
+ if (vmf->flags & FAULT_FLAG_WRITE) {
+ error = get_block(inode, block, &bh, 1); /* write fault on a hole: allocate (create == 1) */
+ count_vm_event(PGMAJFAULT);
+ mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+ major = VM_FAULT_MAJOR;
+ if (!error && (bh.b_size < PAGE_SIZE))
+ error = -EIO;
+ if (error)
+ goto unlock_page;
+ } else {
+ return dax_load_hole(mapping, page, vmf); /* read fault on a hole; page may be NULL here */
+ }
+ }
+
+ if (vmf->cow_page) {
+ struct page *new_page = vmf->cow_page;
+ if (buffer_written(&bh))
+ error = copy_user_bh(new_page, &bh, blkbits, vaddr);
+ else
+ clear_user_highpage(new_page, vaddr); /* error is still 0 from the successful get_block above */
+ if (error)
+ goto unlock_page;
+ vmf->page = page;
+ if (!page) {
+ i_mmap_lock_read(mapping);
+ /* Check we didn't race with truncate */
+ size = (i_size_read(inode) + PAGE_SIZE - 1) >>
+ PAGE_SHIFT;
+ if (vmf->pgoff >= size) {
+ i_mmap_unlock_read(mapping);
+ error = -EIO;
+ goto out;
+ }
+ }
+ return VM_FAULT_LOCKED; /* when !page, i_mmap_lock_read is still held; presumably dropped by the caller -- TODO confirm */
+ }
+
+ /* Check we didn't race with a read fault installing a new page */
+ if (!page && major)
+ page = find_lock_page(mapping, vmf->pgoff);
+
+ if (page) {
+ unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
+ PAGE_CACHE_SIZE, 0);
+ delete_from_page_cache(page);
+ unlock_page(page);
+ page_cache_release(page);
+ }
+
+ error = dax_insert_mapping(inode, &bh, vma, vmf);
+
+ out:
+ if (error == -ENOMEM)
+ return VM_FAULT_OOM | major;
+ /* -EBUSY is fine, somebody else faulted on the same PTE */
+ if ((error < 0) && (error != -EBUSY))
+ return VM_FAULT_SIGBUS | major;
+ return VM_FAULT_NOPAGE | major;
+
+ unlock_page:
+ if (page) { /* drop the page lock and the reference taken by find_get_page() */
+ unlock_page(page);
+ page_cache_release(page);
+ }
+ goto out;
+}
+
+/**
+ * dax_fault - handle a page fault on a DAX file
+ * @vma: The virtual memory area where the fault occurred
+ * @vmf: The description of the fault
+ * @get_block: The filesystem method used to translate file offsets to blocks
+ *
+ * When a page fault occurs, filesystems may call this helper in their
+ * fault handler for DAX files.
+ *
+ * For write faults (FAULT_FLAG_WRITE set in @vmf->flags) the work is
+ * bracketed by sb_start_pagefault()/sb_end_pagefault() on the file's
+ * superblock, and file_update_time() is called before the fault is handled.
+ * Read faults go straight to do_dax_fault().
+ *
+ * Return: the VM_FAULT_* code produced by do_dax_fault().
+ */
+int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+ get_block_t get_block)
+{
+ int result;
+ struct super_block *sb = file_inode(vma->vm_file)->i_sb;
+
+ if (vmf->flags & FAULT_FLAG_WRITE) {
+ sb_start_pagefault(sb);
+ file_update_time(vma->vm_file);
+ }
+ result = do_dax_fault(vma, vmf, get_block);
+ if (vmf->flags & FAULT_FLAG_WRITE) /* pairs with sb_start_pagefault() above */
+ sb_end_pagefault(sb);
+
+ return result;
+}
+EXPORT_SYMBOL_GPL(dax_fault);
+
+/**
+ * dax_zero_page_range - zero a range within a page of a DAX file
+ * @inode: The file being truncated
+ * @from: The file offset that is being truncated to
+ * @length: The number of bytes to zero
+ * @get_block: The filesystem method used to translate file offsets to blocks
+ *
+ * This function can be called by a filesystem when it is zeroing part of a
+ * page in a DAX file. This is intended for hole-punch operations. If
+ * you are truncating a file, the helper function dax_truncate_page() may be
+ * more convenient.
+ *
+ * We work in terms of PAGE_CACHE_SIZE here for commonality with
+ * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
+ * took care of disposing of the unnecessary blocks. Even if the filesystem
+ * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
+ * since the file might be mmapped.
+ *
+ * The range must not cross a page boundary: @from's offset within its page
+ * plus @length exceeding PAGE_CACHE_SIZE triggers BUG_ON(). Nothing is
+ * written unless buffer_written() holds for the mapping returned by
+ * @get_block (called with create == 0).
+ *
+ * NOTE(review): the page index is passed to @get_block as the block number,
+ * which matches only when the block size equals PAGE_CACHE_SIZE -- confirm
+ * that every caller guarantees this.
+ *
+ * Return: 0 on success (including when @length is 0 or nothing needed
+ * zeroing), or the negative error from @get_block or dax_get_addr().
+ */
+int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
+ get_block_t get_block)
+{
+ struct buffer_head bh;
+ pgoff_t index = from >> PAGE_CACHE_SHIFT; /* page containing @from */
+ unsigned offset = from & (PAGE_CACHE_SIZE-1); /* byte offset of @from within that page */
+ int err;
+
+ /* Block boundary? Nothing to do */
+ if (!length)
+ return 0;
+ BUG_ON((offset + length) > PAGE_CACHE_SIZE);
+
+ memset(&bh, 0, sizeof(bh));
+ bh.b_size = PAGE_CACHE_SIZE;
+ err = get_block(inode, index, &bh, 0); /* lookup only, never allocates */
+ if (err < 0)
+ return err;
+ if (buffer_written(&bh)) {
+ void *addr;
+ err = dax_get_addr(&bh, &addr, inode->i_blkbits);
+ if (err < 0)
+ return err;
+ memset(addr + offset, 0, length);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dax_zero_page_range);
+
+/**
+ * dax_truncate_page - handle a partial page being truncated in a DAX file
+ * @inode: The file being truncated
+ * @from: The file offset that is being truncated to
+ * @get_block: The filesystem method used to translate file offsets to blocks
+ *
+ * Similar to block_truncate_page(), this function can be called by a
+ * filesystem when it is truncating a DAX file to handle the partial page.
+ *
+ * We work in terms of PAGE_CACHE_SIZE here for commonality with
+ * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
+ * took care of disposing of the unnecessary blocks. Even if the filesystem
+ * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
+ * since the file might be mmapped.
+ *
+ * The zeroed range runs from @from to the end of the page containing it.
+ * When @from is already page aligned the computed length is 0 and
+ * dax_zero_page_range() returns without doing anything.
+ *
+ * Return: the result of dax_zero_page_range().
+ */
+int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
+{
+ unsigned length = PAGE_CACHE_ALIGN(from) - from; /* bytes left in the page after @from */
+ return dax_zero_page_range(inode, from, length, get_block);
+}
+EXPORT_SYMBOL_GPL(dax_truncate_page);
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 6fc91df99ff8..a198e94813fe 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -985,7 +985,6 @@ const struct address_space_operations exofs_aops = {
.direct_IO = exofs_direct_IO,
/* With these NULL has special meaning or default is not exported */
- .get_xip_mem = NULL,
.migratepage = NULL,
.launder_page = NULL,
.is_partially_uptodate = NULL,
diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig
index 14a6780fd034..c634874e12d9 100644
--- a/fs/ext2/Kconfig
+++ b/fs/ext2/Kconfig
@@ -42,14 +42,3 @@ config EXT2_FS_SECURITY
If you are not using a security module that requires using
extended attributes for file security labels, say N.
-
-config EXT2_FS_XIP
- bool "Ext2 execute in place support"
- depends on EXT2_FS && MMU
- help
- Execute in place can be used on memory-backed block devices. If you
- enable this option, you can select to mount block devices which are
- capable of this feature without using the page cache.
-
- If you do not use a block device that is capable of using this,
- or if unsure, say N.
diff --git a/fs/ext2/Makefile b/fs/ext2/Makefile
index f42af45cfd88..445b0e996a12 100644
--- a/fs/ext2/Makefile
+++ b/fs/ext2/Makefile
@@ -10,4 +10,3 @@ ext2-y := balloc.o dir.o file.o ialloc.o inode.o \
ext2-$(CONFIG_EXT2_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
ext2-$(CONFIG_EXT2_FS_POSIX_ACL) += acl.o
ext2-$(CONFIG_EXT2_FS_SECURITY) += xattr_security.o
-ext2-$(CONFIG_EXT2_FS_XIP) += xip.o
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index e4279ead4a05..678f9ab08c48 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -380,10 +380,15 @@ struct ext2_inode {
#define EXT2_MOUNT_NO_UID32 0x000200 /* Disable 32-bit UIDs */
#define EXT2_MOUNT_XATTR_USER 0x004000 /* Extended user attributes */
#define EXT2_MOUNT_POSIX_ACL 0x008000 /* POSIX Access Control Lists */
-#define EXT2_MOUNT_XIP 0x010000 /* Execute in place */
+#define EXT2_MOUNT_XIP 0x010000 /* Obsolete, use DAX */
#define EXT2_MOUNT_USRQUOTA 0x020000 /* user quota */
#define EXT2_MOUNT_GRPQUOTA 0x040000 /* group quota */
#define EXT2_MOUNT_RESERVATION 0x080000 /* Preallocation */
+#ifdef CONFIG_FS_DAX
+#define EXT2_MOUNT_DAX 0x100000 /* Direct Access */
+#else
+#define EXT2_MOUNT_DAX 0
+#endif
#define clear_opt(o, opt) o &= ~EXT2_MOUNT_##opt
@@ -788,11 +793,10 @@ extern int ext2_fsync(struct file *file, loff_t start, loff_t end,
int datasync);
extern const struct inode_operations ext2_file_inode_operations;
extern const struct file_operations ext2_file_operations;
-extern const struct file_operations ext2_xip_file_operations;
+extern const struct file_operations ext2_dax_file_operations;
/* inode.c */
extern const struct address_space_operations ext2_aops;
-extern const struct address_space_operations ext2_aops_xip;
extern const struct address_space_operations ext2_nobh_aops;
/* namei.c */
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 7c87b22a7228..e31701713516 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -25,6 +25,36 @@
#include "xattr.h"
#include "acl.h"
+#ifdef CONFIG_FS_DAX
+static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ return dax_fault(vma, vmf, ext2_get_block); /* ->fault for DAX mappings: generic helper with ext2's block mapper */
+}
+/*
+ * ->page_mkwrite handler for DAX mappings of ext2 files: forwards to
+ * dax_mkwrite() (defined outside this hunk) with ext2's block mapper.
+ */
+static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ return dax_mkwrite(vma, vmf, ext2_get_block);
+}
+/* VMA operations installed by ext2_file_mmap() on mappings of DAX inodes. */
+static const struct vm_operations_struct ext2_dax_vm_ops = {
+ .fault = ext2_dax_fault,
+ .page_mkwrite = ext2_dax_mkwrite,
+};
+/*
+ * ext2_file_mmap - ->mmap for ext2 regular files
+ *
+ * Inodes without the DAX flag are handed to generic_file_mmap(). For DAX
+ * inodes the access is recorded with file_accessed(), the DAX vm_ops are
+ * installed, VM_MIXEDMAP is set on the VMA, and 0 is returned.
+ * VM_MIXEDMAP is presumably needed because the DAX fault path inserts raw
+ * pfns with vm_insert_mixed() -- TODO confirm.
+ */
+static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ if (!IS_DAX(file_inode(file)))
+ return generic_file_mmap(file, vma);
+
+ file_accessed(file);
+ vma->vm_ops = &ext2_dax_vm_ops;
+ vma->vm_flags |= VM_MIXEDMAP;
+ return 0;
+}
+#else
+#define ext2_file_mmap generic_file_mmap
+#endif
+
/*
* Called when filp is released. This happens when all file descriptors
* for a single struct file are closed. Note that different open() calls
@@ -70,7 +100,7 @@ const struct file_operations ext2_file_operations = {
#ifdef CONFIG_COMPAT
.compat_ioctl = ext2_compat_ioctl,
#endif
- .mmap = generic_file_mmap,
+ .mmap = ext2_file_mmap,
.open = dquot_file_open,
.release = ext2_release_file,
.fsync = ext2_fsync,
@@ -78,16 +108,18 @@ const struct file_operations ext2_file_operations = {
.splice_write = iter_file_splice_write,
};
-#ifdef CONFIG_EXT2_FS_XIP
-const struct file_operations ext2_xip_file_operations = {
+#ifdef CONFIG_FS_DAX
+const struct file_operations ext2_dax_file_operations = {
.llseek = generic_file_llseek,
- .read = xip_file_read,
- .write = xip_file_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = generic_file_read_iter,
+ .write_iter = generic_file_write_iter,
.unlocked_ioctl = ext2_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ext2_compat_ioctl,
#endif
- .mmap = xip_file_mmap,
+ .mmap = ext2_file_mmap,
.open = dquot_file_open,
.release = ext2_release_file,
.fsync = ext2_fsync,
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 36d35c36311d..6434bc000125 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -34,7 +34,6 @@
#include <linux/aio.h>
#include "ext2.h"
#include "acl.h"
-#include "xip.h"
#include "xattr.h"
static int __ext2_write_inode(struct inode *inode, int do_sync);
@@ -731,12 +730,14 @@ static int ext2_get_blocks(struct inode *inode,
goto cleanup;
}
- if (ext2_use_xip(inode->i_sb)) {
+ if (IS_DAX(inode)) {
/*
- * we need to clear the block
+ * block must be initialised before we put it in the tree
+ * so that it's not found by another thread before it's
+ * initialised
*/
- err = ext2_clear_xip_target (inode,
- le32_to_cpu(chain[depth-1].key));
+ err = dax_clear_blocks(inode, le32_to_cpu(chain[depth-1].key),
+ 1 << inode->i_blkbits);
if (err) {
mutex_unlock(&ei->truncate_mutex);
goto cleanup;
@@ -859,7 +860,12 @@ ext2_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
size_t count = iov_iter_count(iter);
ssize_t ret;
- ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, ext2_get_block);
+ if (IS_DAX(inode))
+ ret = dax_do_io(rw, iocb, inode, iter, offset, ext2_get_block,
+ NULL, DIO_LOCKING);
+ else
+ ret = blockdev_direct_IO(rw, iocb, inode, iter, offset,
+ ext2_get_block);
if (ret < 0 && (rw & WRITE))
ext2_write_failed(mapping, offset + count);
return ret;
@@ -885,11 +891,6 @@ const struct address_space_operations ext2_aops = {
.error_remove_page = generic_error_remove_page,
};
-const struct address_space_operations ext2_aops_xip = {
- .bmap = ext2_bmap,
- .get_xip_mem = ext2_get_xip_mem,
-};
-
const struct address_space_operations ext2_nobh_aops = {
.readpage = ext2_readpage,
.readpages = ext2_readpages,
@@ -1201,8 +1202,8 @@ static int ext2_setsize(struct inode *inode, loff_t newsize)
inode_dio_wait(inode);
- if (mapping_is_xip(inode->i_mapping))
- error = xip_truncate_page(inode->i_mapping, newsize);
+ if (IS_DAX(inode))
+ error = dax_truncate_page(inode, newsize, ext2_get_block);
else if (test_opt(inode->i_sb, NOBH))
error = nobh_truncate_page(inode->i_mapping,
newsize, ext2_get_block);
@@ -1273,7 +1274,8 @@ void ext2_set_inode_flags(struct inode *inode)
{
unsigned int flags = EXT2_I(inode)->i_flags;
- inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
+ inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME |
+ S_DIRSYNC | S_DAX);
if (flags & EXT2_SYNC_FL)
inode->i_flags |= S_SYNC;
if (flags & EXT2_APPEND_FL)
@@ -1284,6 +1286,8 @@ void ext2_set_inode_flags(struct inode *inode)
inode->i_flags |= S_NOATIME;
if (flags & EXT2_DIRSYNC_FL)
inode->i_flags |= S_DIRSYNC;
+ if (test_opt(inode->i_sb, DAX))
+ inode->i_flags |= S_DAX;
}
/* Propagate flags from i_flags to EXT2_I(inode)->i_flags */
@@ -1384,9 +1388,9 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
if (S_ISREG(inode->i_mode)) {
inode->i_op = &ext2_file_inode_operations;
- if (ext2_use_xip(inode->i_sb)) {
- inode->i_mapping->a_ops = &ext2_aops_xip;
- inode->i_fop = &ext2_xip_file_operations;
+ if (test_opt(inode->i_sb, DAX)) {
+ inode->i_mapping->a_ops = &ext2_aops;
+ inode->i_fop = &ext2_dax_file_operations;
} else if (test_opt(inode->i_sb, NOBH)) {
inode->i_mapping->a_ops = &ext2_nobh_aops;
inode->i_fop = &ext2_file_operations;
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index c268d0af1db9..148f6e3789ea 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -35,7 +35,6 @@
#include "ext2.h"
#include "xattr.h"
#include "acl.h"
-#include "xip.h"
static inline int ext2_add_nondir(struct dentry *dentry, struct inode *inode)
{
@@ -105,9 +104,9 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode
return PTR_ERR(inode);
inode->i_op = &ext2_file_inode_operations;
- if (ext2_use_xip(inode->i_sb)) {
- inode->i_mapping->a_ops = &ext2_aops_xip;
- inode->i_fop = &ext2_xip_file_operations;
+ if (test_opt(inode->i_sb, DAX)) {
+ inode->i_mapping->a_ops = &ext2_aops;
+ inode->i_fop = &ext2_dax_file_operations;
} else if (test_opt(inode->i_sb, NOBH)) {
inode->i_mapping->a_ops = &ext2_nobh_aops;
inode->i_fop = &ext2_file_operations;
@@ -126,9 +125,9 @@ static int ext2_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
return PTR_ERR(inode);
inode->i_op = &ext2_file_inode_operations;
- if (ext2_use_xip(inode->i_sb)) {
- inode->i_mapping->a_ops = &ext2_aops_xip;
- inode->i_fop = &ext2_xip_file_operations;
+ if (test_opt(inode->i_sb, DAX)) {
+ inode->i_mapping->a_ops = &ext2_aops;
+ inode->i_fop = &ext2_dax_file_operations;
} else if (test_opt(inode->i_sb, NOBH)) {
inode->i_mapping->a_ops = &ext2_nobh_aops;
inode->i_fop = &ext2_file_operations;
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index ae55fddc26a9..d0e746e96511 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -35,7 +35,6 @@
#include "ext2.h"
#include "xattr.h"
#include "acl.h"
-#include "xip.h"
static void ext2_sync_super(struct super_block *sb,
struct ext2_super_block *es, int wait);
@@ -292,9 +291,11 @@ static int ext2_show_options(struct seq_file *seq, struct dentry *root)
seq_puts(seq, ",grpquota");
#endif
-#if defined(CONFIG_EXT2_FS_XIP)
+#ifdef CONFIG_FS_DAX
if (sbi->s_mount_opt & EXT2_MOUNT_XIP)
seq_puts(seq, ",xip");
+ if (sbi->s_mount_opt & EXT2_MOUNT_DAX)
+ seq_puts(seq, ",dax");
#endif
if (!test_opt(sb, RESERVATION))
@@ -403,7 +404,7 @@ enum {
Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic,
Opt_err_ro, Opt_nouid32, Opt_nocheck, Opt_debug,
Opt_oldalloc, Opt_orlov, Opt_nobh, Opt_user_xattr, Opt_nouser_xattr,
- Opt_acl, Opt_noacl, Opt_xip, Opt_ignore, Opt_err, Opt_quota,
+ Opt_acl, Opt_noacl, Opt_xip, Opt_dax, Opt_ignore, Opt_err, Opt_quota,
Opt_usrquota, Opt_grpquota, Opt_reservation, Opt_noreservation
};
@@ -432,6 +433,7 @@ static const match_table_t tokens = {
{Opt_acl, "acl"},
{Opt_noacl, "noacl"},
{Opt_xip, "xip"},
+ {Opt_dax, "dax"},
{Opt_grpquota, "grpquota"},
{Opt_ignore, "noquota"},
{Opt_quota, "quota"},
@@ -559,10 +561,14 @@ static int parse_options(char *options, struct super_block *sb)
break;
#endif
case Opt_xip:
-#ifdef CONFIG_EXT2_FS_XIP
- set_opt (sbi->s_mount_opt, XIP);
+ ext2_msg(sb, KERN_INFO, "use dax instead of xip");
+ set_opt(sbi->s_mount_opt, XIP);
+ /* Fall through */
+ case Opt_dax:
+#ifdef CONFIG_FS_DAX
+ set_opt(sbi->s_mount_opt, DAX);
#else
- ext2_msg(sb, KERN_INFO, "xip option not supported");
+ ext2_msg(sb, KERN_INFO, "dax option not supported");
#endif
break;
@@ -877,9 +883,6 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
((EXT2_SB(sb)->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ?
MS_POSIXACL : 0);
- ext2_xip_verify_sb(sb); /* see if bdev supports xip, unset
- EXT2_MOUNT_XIP if not */
-
if (le32_to_cpu(es->s_rev_level) == EXT2_GOOD_OLD_REV &&
(EXT2_HAS_COMPAT_FEATURE(sb, ~0U) ||
EXT2_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
@@ -909,11 +912,17 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
- if (ext2_use_xip(sb) && blocksize != PAGE_SIZE) {
- if (!silent)
+ if (sbi->s_mount_opt & EXT2_MOUNT_DAX) {
+ if (blocksize != PAGE_SIZE) {
ext2_msg(sb, KERN_ERR,
- "error: unsupported blocksize for xip");
- goto failed_mount;
+ "error: unsupported blocksize for dax");
+ goto failed_mount;
+ }
+ if (!sb->s_bdev->bd_disk->fops->direct_access) {
+ ext2_msg(sb, KERN_ERR,
+ "error: device does not support dax");
+ goto failed_mount;
+ }
}
/* If the blocksize doesn't match, re-read the thing.. */
@@ -1259,7 +1268,6 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
{
struct ext2_sb_info * sbi = EXT2_SB(sb);
struct ext2_super_block * es;
- unsigned long old_mount_opt = sbi->s_mount_opt;
struct ext2_mount_options old_opts;
unsigned long old_sb_flags;
int err;
@@ -1284,22 +1292,11 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
((sbi->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
- ext2_xip_verify_sb(sb); /* see if bdev supports xip, unset
- EXT2_MOUNT_XIP if not */
-
- if ((ext2_use_xip(sb)) && (sb->s_blocksize != PAGE_SIZE)) {
- ext2_msg(sb, KERN_WARNING,
- "warning: unsupported blocksize for xip");
- err = -EINVAL;
- goto restore_opts;
- }
-
es = sbi->s_es;
- if ((sbi->s_mount_opt ^ old_mount_opt) & EXT2_MOUNT_XIP) {
+ if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT2_MOUNT_DAX) {
ext2_msg(sb, KERN_WARNING, "warning: refusing change of "
- "xip flag with busy inodes while remounting");
- sbi->s_mount_opt &= ~EXT2_MOUNT_XIP;
- sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP;
+ "dax flag with busy inodes while remounting");
+ sbi->s_mount_opt ^= EXT2_MOUNT_DAX;
}
if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
spin_unlock(&sbi->s_lock);
diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c
deleted file mode 100644
index bbc5fec6ff7f..000000000000
--- a/fs/ext2/xip.c
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * linux/fs/ext2/xip.c
- *
- * Copyright (C) 2005 IBM Corporation
- * Author: Carsten Otte (cotte@de.ibm.com)
- */
-
-#include <linux/mm.h>
-#include <linux/fs.h>
-#include <linux/genhd.h>
-#include <linux/buffer_head.h>
-#include <linux/blkdev.h>
-#include "ext2.h"
-#include "xip.h"
-
-static inline long __inode_direct_access(struct inode *inode, sector_t block,
- void **kaddr, unsigned long *pfn, long size)
-{
- struct block_device *bdev = inode->i_sb->s_bdev;
- sector_t sector = block * (PAGE_SIZE / 512);
- return bdev_direct_access(bdev, sector, kaddr, pfn, size);
-}
-
-static inline int
-__ext2_get_block(struct inode *inode, pgoff_t pgoff, int create,
- sector_t *result)
-{
- struct buffer_head tmp;
- int rc;
-
- memset(&tmp, 0, sizeof(struct buffer_head));
- tmp.b_size = 1 << inode->i_blkbits;
- rc = ext2_get_block(inode, pgoff, &tmp, create);
- *result = tmp.b_blocknr;
-
- /* did we get a sparse block (hole in the file)? */
- if (!tmp.b_blocknr && !rc) {
- BUG_ON(create);
- rc = -ENODATA;
- }
-
- return rc;
-}
-
-int
-ext2_clear_xip_target(struct inode *inode, sector_t block)
-{
- void *kaddr;
- unsigned long pfn;
- long size;
-
- size = __inode_direct_access(inode, block, &kaddr, &pfn, PAGE_SIZE);
- if (size < 0)
- return size;
- clear_page(kaddr);
- return 0;
-}
-
-void ext2_xip_verify_sb(struct super_block *sb)
-{
- struct ext2_sb_info *sbi = EXT2_SB(sb);
-
- if ((sbi->s_mount_opt & EXT2_MOUNT_XIP) &&
- !sb->s_bdev->bd_disk->fops->direct_access) {
- sbi->s_mount_opt &= (~EXT2_MOUNT_XIP);
- ext2_msg(sb, KERN_WARNING,
- "warning: ignoring xip option - "
- "not supported by bdev");
- }
-}
-
-int ext2_get_xip_mem(struct address_space *mapping, pgoff_t pgoff, int create,
- void **kmem, unsigned long *pfn)
-{
- long rc;
- sector_t block;
-
- /* first, retrieve the sector number */
- rc = __ext2_get_block(mapping->host, pgoff, create, &block);
- if (rc)
- return rc;
-
- /* retrieve address of the target data */
- rc = __inode_direct_access(mapping->host, block, kmem, pfn, PAGE_SIZE);
- return (rc < 0) ? rc : 0;
-}
diff --git a/fs/ext2/xip.h b/fs/ext2/xip.h
deleted file mode 100644
index 18b34d2f31b3..000000000000
--- a/fs/ext2/xip.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * linux/fs/ext2/xip.h
- *
- * Copyright (C) 2005 IBM Corporation
- * Author: Carsten Otte (cotte@de.ibm.com)
- */
-
-#ifdef CONFIG_EXT2_FS_XIP
-extern void ext2_xip_verify_sb (struct super_block *);
-extern int ext2_clear_xip_target (struct inode *, sector_t);
-
-static inline int ext2_use_xip (struct super_block *sb)
-{
- struct ext2_sb_info *sbi = EXT2_SB(sb);
- return (sbi->s_mount_opt & EXT2_MOUNT_XIP);
-}
-int ext2_get_xip_mem(struct address_space *, pgoff_t, int,
- void **, unsigned long *);
-#define mapping_is_xip(map) unlikely(map->a_ops->get_xip_mem)
-#else
-#define mapping_is_xip(map) 0
-#define ext2_xip_verify_sb(sb) do { } while (0)
-#define ext2_use_xip(sb) 0
-#define ext2_clear_xip_target(inode, chain) 0
-#define ext2_get_xip_mem NULL
-#endif
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index b7f393df2e4c..98ee89cef0ad 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -966,6 +966,11 @@ struct ext4_inode_info {
#define EXT4_MOUNT_ERRORS_MASK 0x00070
#define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */
#define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/
+#ifdef CONFIG_FS_DAX
+#define EXT4_MOUNT_DAX 0x00200 /* Direct Access */
+#else
+#define EXT4_MOUNT_DAX 0
+#endif
#define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */
#define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */
#define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */
@@ -2587,6 +2592,7 @@ extern const struct file_operations ext4_dir_operations;
/* file.c */
extern const struct inode_operations ext4_file_inode_operations;
extern const struct file_operations ext4_file_operations;
+extern const struct file_operations ext4_dax_file_operations;
extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
/* inline.c */
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 7cb592386121..33a09da16c9c 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -95,7 +95,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct inode *inode = file_inode(iocb->ki_filp);
struct mutex *aio_mutex = NULL;
struct blk_plug plug;
- int o_direct = file->f_flags & O_DIRECT;
+ int o_direct = io_is_direct(file);
int overwrite = 0;
size_t length = iov_iter_count(from);
ssize_t ret;
@@ -191,6 +191,26 @@ errout:
return ret;
}
+#ifdef CONFIG_FS_DAX
+static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ return dax_fault(vma, vmf, ext4_get_block); /* ->fault for DAX mappings: generic helper with ext4_get_block */
+ /* XXX: Is this the right get_block? (open question left by the author) */
+}
+/*
+ * ->page_mkwrite handler for DAX mappings of ext4 files: forwards to
+ * dax_mkwrite() (defined outside this hunk) with ext4_get_block.
+ */
+static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ return dax_mkwrite(vma, vmf, ext4_get_block);
+}
+/*
+ * VMA operations installed by ext4_file_mmap() on mappings of DAX inodes.
+ * Without CONFIG_FS_DAX this name is instead a macro aliasing
+ * ext4_file_vm_ops.
+ */
+static const struct vm_operations_struct ext4_dax_vm_ops = {
+ .fault = ext4_dax_fault,
+ .page_mkwrite = ext4_dax_mkwrite,
+};
+#else
+#define ext4_dax_vm_ops ext4_file_vm_ops
+#endif
+
static const struct vm_operations_struct ext4_file_vm_ops = {
.fault = filemap_fault,
.map_pages = filemap_map_pages,
@@ -200,7 +220,12 @@ static const struct vm_operations_struct ext4_file_vm_ops = {
static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
{
file_accessed(file);
- vma->vm_ops = &ext4_file_vm_ops;
+ if (IS_DAX(file_inode(file))) {
+ vma->vm_ops = &ext4_dax_vm_ops;
+ vma->vm_flags |= VM_MIXEDMAP;
+ } else {
+ vma->vm_ops = &ext4_file_vm_ops;
+ }
return 0;
}
@@ -599,6 +624,26 @@ const struct file_operations ext4_file_operations = {
.fallocate = ext4_fallocate,
};
+#ifdef CONFIG_FS_DAX
+const struct file_operations ext4_dax_file_operations = { /* i_fop for regular files when the DAX mount option is set */
+ .llseek = ext4_llseek,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = generic_file_read_iter,
+ .write_iter = ext4_file_write_iter,
+ .unlocked_ioctl = ext4_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = ext4_compat_ioctl,
+#endif
+ .mmap = ext4_file_mmap,
+ .open = ext4_file_open,
+ .release = ext4_release_file,
+ .fsync = ext4_sync_file,
+ /* Splice not yet supported with DAX: no splice handlers are set in this table */
+ .fallocate = ext4_fallocate,
+};
+#endif
+
const struct inode_operations ext4_file_inode_operations = {
.setattr = ext4_setattr,
.getattr = ext4_getattr,
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 36b369697a13..6b9878a24182 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -689,14 +689,22 @@ retry:
inode_dio_done(inode);
goto locked;
}
- ret = __blockdev_direct_IO(rw, iocb, inode,
- inode->i_sb->s_bdev, iter, offset,
- ext4_get_block, NULL, NULL, 0);
+ if (IS_DAX(inode))
+ ret = dax_do_io(rw, iocb, inode, iter, offset,
+ ext4_get_block, NULL, 0);
+ else
+ ret = __blockdev_direct_IO(rw, iocb, inode,
+ inode->i_sb->s_bdev, iter, offset,
+ ext4_get_block, NULL, NULL, 0);
inode_dio_done(inode);
} else {
locked:
- ret = blockdev_direct_IO(rw, iocb, inode, iter,
- offset, ext4_get_block);
+ if (IS_DAX(inode))
+ ret = dax_do_io(rw, iocb, inode, iter, offset,
+ ext4_get_block, NULL, DIO_LOCKING);
+ else
+ ret = blockdev_direct_IO(rw, iocb, inode, iter,
+ offset, ext4_get_block);
if (unlikely((rw & WRITE) && ret < 0)) {
loff_t isize = i_size_read(inode);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 9193ea130dcb..85404f15e53a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -657,6 +657,18 @@ has_zeroout:
return retval;
}
+static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
+{
+ struct inode *inode = bh->b_assoc_map->host; /* b_assoc_map is set to inode->i_mapping by _ext4_get_block() */
+ /* XXX: breaks on 32-bit > 16GB. Is that even supported? */
+ loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits; /* b_private carries the logical block number; convert to a byte offset */
+ int err; /* NOTE(review): assigned below but never read, so a conversion failure goes unreported */
+ if (!uptodate)
+ return; /* nothing is converted when the caller reports failure */
+ WARN_ON(!buffer_unwritten(bh));
+ err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size); /* called with a NULL handle */
+}
+
/* Maximum number of blocks we map for direct IO at once. */
#define DIO_MAX_BLOCKS 4096
@@ -694,6 +706,11 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
map_bh(bh, inode->i_sb, map.m_pblk);
bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
+ if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) {
+ bh->b_assoc_map = inode->i_mapping;
+ bh->b_private = (void *)(unsigned long)iblock;
+ bh->b_end_io = ext4_end_io_unwritten;
+ }
if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
set_buffer_defer_completion(bh);
bh->b_size = inode->i_sb->s_blocksize * map.m_len;
@@ -3010,13 +3027,14 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
get_block_func = ext4_get_block_write;
dio_flags = DIO_LOCKING;
}
- ret = __blockdev_direct_IO(rw, iocb, inode,
- inode->i_sb->s_bdev, iter,
- offset,
- get_block_func,
- ext4_end_io_dio,
- NULL,
- dio_flags);
+ if (IS_DAX(inode))
+ ret = dax_do_io(rw, iocb, inode, iter, offset, get_block_func,
+ ext4_end_io_dio, dio_flags);
+ else
+ ret = __blockdev_direct_IO(rw, iocb, inode,
+ inode->i_sb->s_bdev, iter, offset,
+ get_block_func,
+ ext4_end_io_dio, NULL, dio_flags);
/*
* Put our reference to io_end. This can free the io_end structure e.g.
@@ -3180,19 +3198,12 @@ void ext4_set_aops(struct inode *inode)
inode->i_mapping->a_ops = &ext4_aops;
}
-/*
- * ext4_block_zero_page_range() zeros out a mapping of length 'length'
- * starting from file offset 'from'. The range to be zero'd must
- * be contained with in one block. If the specified range exceeds
- * the end of the block it will be shortened to end of the block
- * that cooresponds to 'from'
- */
-static int ext4_block_zero_page_range(handle_t *handle,
+static int __ext4_block_zero_page_range(handle_t *handle,
struct address_space *mapping, loff_t from, loff_t length)
{
ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
unsigned offset = from & (PAGE_CACHE_SIZE-1);
- unsigned blocksize, max, pos;
+ unsigned blocksize, pos;
ext4_lblk_t iblock;
struct inode *inode = mapping->host;
struct buffer_head *bh;
@@ -3205,14 +3216,6 @@ static int ext4_block_zero_page_range(handle_t *handle,
return -ENOMEM;
blocksize = inode->i_sb->s_blocksize;
- max = blocksize - (offset & (blocksize - 1));
-
- /*
- * correct length if it does not fall between
- * 'from' and the end of the block
- */
- if (length > max || length < 0)
- length = max;
iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
@@ -3278,6 +3281,33 @@ unlock:
}
/*
+ * ext4_block_zero_page_range() zeros out a mapping of length 'length'
+ * starting from file offset 'from'. The range to be zero'd must
+ * be contained within one block. If the specified range exceeds
+ * the end of the block it will be shortened to end of the block
+ * that corresponds to 'from'. A negative 'length' is likewise replaced
+ * by the distance from 'from' to the end of its block.
+ *
+ * After clamping, DAX inodes are handled by dax_zero_page_range() (the
+ * journal handle is not passed along on that path); all other inodes go
+ * through __ext4_block_zero_page_range().
+ */
+static int ext4_block_zero_page_range(handle_t *handle,
+ struct address_space *mapping, loff_t from, loff_t length)
+{
+ struct inode *inode = mapping->host;
+ unsigned offset = from & (PAGE_CACHE_SIZE-1);
+ unsigned blocksize = inode->i_sb->s_blocksize;
+ unsigned max = blocksize - (offset & (blocksize - 1)); /* bytes from 'from' to the end of its block */
+
+ /*
+ * correct length if it does not fall between
+ * 'from' and the end of the block
+ */
+ if (length > max || length < 0)
+ length = max;
+
+ if (IS_DAX(inode))
+ return dax_zero_page_range(inode, from, length, ext4_get_block);
+ return __ext4_block_zero_page_range(handle, mapping, from, length);
+}
+
+/*
* ext4_block_truncate_page() zeroes out a mapping from file offset `from'
* up to the end of the block which corresponds to `from'.
* This required during truncate. We need to physically zero the tail end
@@ -3798,8 +3828,10 @@ void ext4_set_inode_flags(struct inode *inode)
new_fl |= S_NOATIME;
if (flags & EXT4_DIRSYNC_FL)
new_fl |= S_DIRSYNC;
+ if (test_opt(inode->i_sb, DAX))
+ new_fl |= S_DAX;
inode_set_flags(inode, new_fl,
- S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
+ S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX);
}
/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
@@ -4052,7 +4084,10 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
if (S_ISREG(inode->i_mode)) {
inode->i_op = &ext4_file_inode_operations;
- inode->i_fop = &ext4_file_operations;
+ if (test_opt(inode->i_sb, DAX))
+ inode->i_fop = &ext4_dax_file_operations;
+ else
+ inode->i_fop = &ext4_file_operations;
ext4_set_aops(inode);
} else if (S_ISDIR(inode->i_mode)) {
inode->i_op = &ext4_dir_inode_operations;
@@ -4594,7 +4629,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
* Truncate pagecache after we've waited for commit
* in data=journal mode to make pages freeable.
*/
- truncate_pagecache(inode, inode->i_size);
+ truncate_pagecache(inode, inode->i_size);
}
/*
* We want to call ext4_truncate() even if attr->ia_size ==
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 2291923dae4e..28fe71a2904c 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2235,7 +2235,10 @@ retry:
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
inode->i_op = &ext4_file_inode_operations;
- inode->i_fop = &ext4_file_operations;
+ if (test_opt(inode->i_sb, DAX))
+ inode->i_fop = &ext4_dax_file_operations;
+ else
+ inode->i_fop = &ext4_file_operations;
ext4_set_aops(inode);
err = ext4_add_nondir(handle, dentry, inode);
if (!err && IS_DIRSYNC(dir))
@@ -2299,7 +2302,10 @@ retry:
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
inode->i_op = &ext4_file_inode_operations;
- inode->i_fop = &ext4_file_operations;
+ if (test_opt(inode->i_sb, DAX))
+ inode->i_fop = &ext4_dax_file_operations;
+ else
+ inode->i_fop = &ext4_file_operations;
ext4_set_aops(inode);
d_tmpfile(dentry, inode);
err = ext4_orphan_add(handle, inode);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 9fcd99748f18..3450ce4f3250 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1124,7 +1124,7 @@ enum {
Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
- Opt_usrquota, Opt_grpquota, Opt_i_version,
+ Opt_usrquota, Opt_grpquota, Opt_i_version, Opt_dax,
Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit,
Opt_lazytime, Opt_nolazytime,
Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
@@ -1188,6 +1188,7 @@ static const match_table_t tokens = {
{Opt_barrier, "barrier"},
{Opt_nobarrier, "nobarrier"},
{Opt_i_version, "i_version"},
+ {Opt_dax, "dax"},
{Opt_stripe, "stripe=%u"},
{Opt_delalloc, "delalloc"},
{Opt_lazytime, "lazytime"},
@@ -1374,6 +1375,7 @@ static const struct mount_opts {
{Opt_min_batch_time, 0, MOPT_GTE0},
{Opt_inode_readahead_blks, 0, MOPT_GTE0},
{Opt_init_itable, 0, MOPT_GTE0},
+ {Opt_dax, EXT4_MOUNT_DAX, MOPT_SET},
{Opt_stripe, 0, MOPT_GTE0},
{Opt_resuid, 0, MOPT_GTE0},
{Opt_resgid, 0, MOPT_GTE0},
@@ -1616,6 +1618,11 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
}
sbi->s_jquota_fmt = m->mount_opt;
#endif
+#ifndef CONFIG_FS_DAX
+ } else if (token == Opt_dax) {
+ ext4_msg(sb, KERN_INFO, "dax option not supported");
+ return -1;
+#endif
} else {
if (!args->from)
arg = 1;
@@ -3598,6 +3605,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
"both data=journal and dioread_nolock");
goto failed_mount;
}
+ if (test_opt(sb, DAX)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "both data=journal and dax");
+ goto failed_mount;
+ }
if (test_opt(sb, DELALLOC))
clear_opt(sb, DELALLOC);
}
@@ -3661,6 +3673,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount;
}
+ if (sbi->s_mount_opt & EXT4_MOUNT_DAX) {
+ if (blocksize != PAGE_SIZE) {
+ ext4_msg(sb, KERN_ERR,
+ "error: unsupported blocksize for dax");
+ goto failed_mount;
+ }
+ if (!sb->s_bdev->bd_disk->fops->direct_access) {
+ ext4_msg(sb, KERN_ERR,
+ "error: device does not support dax");
+ goto failed_mount;
+ }
+ }
+
if (sb->s_blocksize != blocksize) {
/* Validate the filesystem blocksize */
if (!sb_set_blocksize(sb, blocksize)) {
@@ -4877,6 +4902,18 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
err = -EINVAL;
goto restore_opts;
}
+ if (test_opt(sb, DAX)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "both data=journal and dax");
+ err = -EINVAL;
+ goto restore_opts;
+ }
+ }
+
+ if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX) {
+ ext4_msg(sb, KERN_WARNING, "warning: refusing change of "
+ "dax flag with busy inodes while remounting");
+ sbi->s_mount_opt ^= EXT4_MOUNT_DAX;
}
if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 125b749f5a18..174aa9e41bc3 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -28,6 +28,7 @@
#include <linux/pipe_fs_i.h>
#include <linux/mpage.h>
#include <linux/quotaops.h>
+#include <linux/blkdev.h>
#include <cluster/masklog.h>
@@ -47,6 +48,9 @@
#include "ocfs2_trace.h"
#include "buffer_head_io.h"
+#include "dir.h"
+#include "namei.h"
+#include "sysfile.h"
static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
@@ -506,18 +510,21 @@ bail:
*
* called like this: dio->get_blocks(dio->inode, fs_startblk,
* fs_count, map_bh, dio->rw == WRITE);
- *
- * Note that we never bother to allocate blocks here, and thus ignore the
- * create argument.
*/
static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
int ret;
+ u32 cpos = 0;
+ int alloc_locked = 0;
u64 p_blkno, inode_blocks, contig_blocks;
unsigned int ext_flags;
unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
+ unsigned long len = bh_result->b_size;
+ unsigned int clusters_to_alloc = 0;
+
+ cpos = ocfs2_blocks_to_clusters(inode->i_sb, iblock);
/* This function won't even be called if the request isn't all
* nicely aligned and of the right size, so there's no need
@@ -539,6 +546,40 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
/* We should already CoW the refcounted extent in case of create. */
BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED));
+ /* allocate blocks if no p_blkno is found, and create == 1 */
+ if (!p_blkno && create) {
+ ret = ocfs2_inode_lock(inode, NULL, 1);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto bail;
+ }
+
+ alloc_locked = 1;
+
+ /* fill hole, allocate blocks can't be larger than the size
+ * of the hole */
+ clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len);
+ if (clusters_to_alloc > contig_blocks)
+ clusters_to_alloc = contig_blocks;
+
+ /* allocate extent and insert them into the extent tree */
+ ret = ocfs2_extend_allocation(inode, cpos,
+ clusters_to_alloc, 0);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto bail;
+ }
+
+ ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
+ &contig_blocks, &ext_flags);
+ if (ret < 0) {
+ mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
+ (unsigned long long)iblock);
+ ret = -EIO;
+ goto bail;
+ }
+ }
+
/*
* get_more_blocks() expects us to describe a hole by clearing
* the mapped bit on bh_result().
@@ -556,6 +597,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
contig_blocks = max_blocks;
bh_result->b_size = contig_blocks << blocksize_bits;
bail:
+ if (alloc_locked)
+ ocfs2_inode_unlock(inode, 1);
return ret;
}
@@ -597,6 +640,184 @@ static int ocfs2_releasepage(struct page *page, gfp_t wait)
return try_to_free_buffers(page);
}
+/*
+ * Tell whether a write at @offset hits a cluster that is already
+ * allocated and written.
+ *
+ * Returns 1 when ocfs2_get_clusters() reports a non-zero physical cluster
+ * that is not flagged OCFS2_EXT_UNWRITTEN, 0 otherwise, or the negative
+ * error code returned by ocfs2_get_clusters().
+ */
+static int ocfs2_is_overwrite(struct ocfs2_super *osb,
+		struct inode *inode, loff_t offset)
+{
+	unsigned int flags = 0;
+	unsigned int nr_clusters = 0;
+	u32 phys_cpos = 0;
+	u32 virt_cpos = ocfs2_bytes_to_clusters(osb->sb, offset);
+	int status;
+
+	status = ocfs2_get_clusters(inode, virt_cpos, &phys_cpos,
+				    &nr_clusters, &flags);
+	if (status < 0) {
+		mlog_errno(status);
+		return status;
+	}
+
+	/* No physical cluster behind this offset. */
+	if (!phys_cpos)
+		return 0;
+
+	return (flags & OCFS2_EXT_UNWRITTEN) ? 0 : 1;
+}
+
+/*
+ * Direct-I/O write path.
+ *
+ * - If the write ends beyond i_size, the inode is added to the orphan dir
+ *   before the I/O and removed again afterwards; the removal also carries
+ *   the i_size update when bytes were written.
+ * - For a write starting at or beyond i_size, the file is first extended
+ *   up to @offset under the inode lock, and it is recorded whether the
+ *   target cluster was already allocated and written.
+ * - If the I/O fails and would have extended the file, the file is
+ *   truncated back to the i_size sampled after the failure.
+ * - If the I/O succeeds on an append into a cluster that was not an
+ *   overwrite and @offset is not cluster aligned, the part of that cluster
+ *   in front of @offset is zeroed on disk.
+ *
+ * Returns the value from __blockdev_direct_IO() unless one of the helper
+ * steps fails, in which case that negative error code is returned.
+ */
+static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
+		struct iov_iter *iter,
+		loff_t offset)
+{
+	ssize_t ret = 0;
+	ssize_t written = 0;
+	bool orphaned = false;
+	int is_overwrite = 0;
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file_inode(file)->i_mapping->host;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct buffer_head *di_bh = NULL;
+	size_t count = iter->count;
+	journal_t *journal = osb->journal->j_journal;
+	u32 zero_len;
+	int cluster_align;
+	loff_t final_size = offset + count;
+	int append_write = offset >= i_size_read(inode) ? 1 : 0;
+	unsigned int num_clusters = 0;
+	unsigned int ext_flags = 0;
+
+	{
+		u64 o = offset;
+
+		/* zero_len is the byte offset of @offset inside its cluster */
+		zero_len = do_div(o, 1 << osb->s_clustersize_bits);
+		cluster_align = !zero_len;
+	}
+
+	/*
+	 * when final_size > inode->i_size, inode->i_size will be
+	 * updated after direct write, so add the inode to orphan
+	 * dir first.
+	 */
+	if (final_size > i_size_read(inode)) {
+		ret = ocfs2_add_inode_to_orphan(osb, inode);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+		orphaned = true;
+	}
+
+	if (append_write) {
+		ret = ocfs2_inode_lock(inode, &di_bh, 1);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto clean_orphan;
+		}
+
+		if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+			ret = ocfs2_zero_extend(inode, di_bh, offset);
+		else
+			ret = ocfs2_extend_no_holes(inode, di_bh, offset,
+					offset);
+		if (ret < 0) {
+			mlog_errno(ret);
+			ocfs2_inode_unlock(inode, 1);
+			brelse(di_bh);
+			goto clean_orphan;
+		}
+
+		is_overwrite = ocfs2_is_overwrite(osb, inode, offset);
+		if (is_overwrite < 0) {
+			/*
+			 * Propagate the failure; otherwise ret stays >= 0
+			 * and the caller would see a successful 0-byte write.
+			 */
+			ret = is_overwrite;
+			mlog_errno(is_overwrite);
+			ocfs2_inode_unlock(inode, 1);
+			brelse(di_bh);
+			goto clean_orphan;
+		}
+
+		ocfs2_inode_unlock(inode, 1);
+		brelse(di_bh);
+		di_bh = NULL;
+	}
+
+	written = __blockdev_direct_IO(WRITE, iocb, inode, inode->i_sb->s_bdev,
+			iter, offset,
+			ocfs2_direct_IO_get_blocks,
+			ocfs2_dio_end_io, NULL, 0);
+	if (unlikely(written < 0)) {
+		loff_t i_size = i_size_read(inode);
+
+		if (offset + count > i_size) {
+			ret = ocfs2_inode_lock(inode, &di_bh, 1);
+			if (ret < 0) {
+				mlog_errno(ret);
+				goto clean_orphan;
+			}
+
+			/* only truncate if i_size did not change meanwhile */
+			if (i_size == i_size_read(inode)) {
+				ret = ocfs2_truncate_file(inode, di_bh,
+						i_size);
+				if (ret < 0) {
+					if (ret != -ENOSPC)
+						mlog_errno(ret);
+
+					ocfs2_inode_unlock(inode, 1);
+					brelse(di_bh);
+					goto clean_orphan;
+				}
+			}
+
+			ocfs2_inode_unlock(inode, 1);
+			brelse(di_bh);
+
+			ret = jbd2_journal_force_commit(journal);
+			if (ret < 0)
+				mlog_errno(ret);
+		}
+	} else if (written > 0 && append_write && !is_overwrite &&
+			!cluster_align) {
+		/*
+		 * Successful append into a cluster that was not an
+		 * overwrite: zero the head of that cluster in front of
+		 * @offset.  (This branch must test written > 0; a
+		 * "written < 0" test here can never be true.)
+		 */
+		u32 p_cpos = 0;
+		u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset);
+
+		/* look the extent up under the inode lock, as above */
+		ret = ocfs2_inode_lock(inode, NULL, 0);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto clean_orphan;
+		}
+
+		ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos,
+				&num_clusters, &ext_flags);
+		if (ret < 0) {
+			mlog_errno(ret);
+			ocfs2_inode_unlock(inode, 0);
+			goto clean_orphan;
+		}
+
+		BUG_ON(!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN));
+
+		/* widen before shifting so the sector number cannot wrap */
+		ret = blkdev_issue_zeroout(osb->sb->s_bdev,
+				(u64)p_cpos << (osb->s_clustersize_bits - 9),
+				zero_len >> 9, GFP_KERNEL, false);
+		if (ret < 0)
+			mlog_errno(ret);
+
+		ocfs2_inode_unlock(inode, 0);
+	}
+
+clean_orphan:
+	if (orphaned) {
+		int tmp_ret;
+		int update_isize = written > 0 ? 1 : 0;
+		loff_t end = update_isize ? offset + written : 0;
+
+		tmp_ret = ocfs2_del_inode_from_orphan(osb, inode,
+				update_isize, end);
+		if (tmp_ret < 0) {
+			ret = tmp_ret;
+			goto out;
+		}
+
+		tmp_ret = jbd2_journal_force_commit(journal);
+		if (tmp_ret < 0) {
+			ret = tmp_ret;
+			mlog_errno(tmp_ret);
+		}
+	}
+
+out:
+	/* no helper failed: report the direct-I/O result itself */
+	if (ret >= 0)
+		ret = written;
+	return ret;
+}
+
static ssize_t ocfs2_direct_IO(int rw,
struct kiocb *iocb,
struct iov_iter *iter,
@@ -604,6 +825,9 @@ static ssize_t ocfs2_direct_IO(int rw,
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file)->i_mapping->host;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ int full_coherency = !(osb->s_mount_opt &
+ OCFS2_MOUNT_COHERENCY_BUFFERED);
/*
* Fallback to buffered I/O if we see an inode without
@@ -612,14 +836,20 @@ static ssize_t ocfs2_direct_IO(int rw,
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
return 0;
- /* Fallback to buffered I/O if we are appending. */
- if (i_size_read(inode) <= offset)
+ /* Fallback to buffered I/O if we are appending and
+ * concurrent O_DIRECT writes are allowed.
+ */
+ if (i_size_read(inode) <= offset && !full_coherency)
return 0;
- return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
+ if (rw == READ)
+ return __blockdev_direct_IO(rw, iocb, inode,
+ inode->i_sb->s_bdev,
iter, offset,
ocfs2_direct_IO_get_blocks,
ocfs2_dio_end_io, NULL, 0);
+ else
+ return ocfs2_direct_IO_write(iocb, iter, offset);
}
static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index e0f04d55fd05..46e0d4e857c7 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -295,7 +295,7 @@ out:
return ret;
}
-static int ocfs2_set_inode_size(handle_t *handle,
+int ocfs2_set_inode_size(handle_t *handle,
struct inode *inode,
struct buffer_head *fe_bh,
u64 new_i_size)
@@ -441,7 +441,7 @@ out:
return status;
}
-static int ocfs2_truncate_file(struct inode *inode,
+int ocfs2_truncate_file(struct inode *inode,
struct buffer_head *di_bh,
u64 new_i_size)
{
@@ -709,6 +709,13 @@ leave:
return status;
}
+/*
+ * Non-static entry point for __ocfs2_extend_allocation(): forwards
+ * @inode, @logical_start, @clusters_to_add and @mark_unwritten unchanged
+ * and returns whatever __ocfs2_extend_allocation() returns.
+ */
+int ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
+ u32 clusters_to_add, int mark_unwritten)
+{
+ return __ocfs2_extend_allocation(inode, logical_start,
+ clusters_to_add, mark_unwritten);
+}
+
/*
* While a write will already be ordering the data, a truncate will not.
* Thus, we need to explicitly order the zeroed pages.
@@ -2109,6 +2116,9 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
struct dentry *dentry = file->f_path.dentry;
struct inode *inode = dentry->d_inode;
loff_t saved_pos = 0, end;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ int full_coherency = !(osb->s_mount_opt &
+ OCFS2_MOUNT_COHERENCY_BUFFERED);
/*
* We start with a read level meta lock and only jump to an ex
@@ -2197,7 +2207,16 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
* one node could wind up truncating another
* nodes writes.
*/
- if (end > i_size_read(inode)) {
+ if (end > i_size_read(inode) && !full_coherency) {
+ *direct_io = 0;
+ break;
+ }
+
+ /*
+ * Fallback to old way if the feature bit is not set.
+ */
+ if (end > i_size_read(inode) &&
+ !ocfs2_supports_append_dio(osb)) {
*direct_io = 0;
break;
}
@@ -2210,7 +2229,13 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
*/
ret = ocfs2_check_range_for_holes(inode, saved_pos, count);
if (ret == 1) {
- *direct_io = 0;
+ /*
+ * Fallback to old way if the feature bit is not set.
+ * Otherwise try dio first and then complete the rest
+ * request through buffer io.
+ */
+ if (!ocfs2_supports_append_dio(osb))
+ *direct_io = 0;
ret = 0;
} else if (ret < 0)
mlog_errno(ret);
@@ -2243,6 +2268,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
u32 old_clusters;
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
+ struct address_space *mapping = file->f_mapping;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
int full_coherency = !(osb->s_mount_opt &
OCFS2_MOUNT_COHERENCY_BUFFERED);
@@ -2357,11 +2383,51 @@ relock:
iov_iter_truncate(from, count);
if (direct_io) {
+ loff_t endbyte;
+ ssize_t written_buffered;
written = generic_file_direct_write(iocb, from, *ppos);
- if (written < 0) {
+ if (written < 0 || written == count) {
ret = written;
goto out_dio;
}
+
+ /*
+ * for completing the rest of the request.
+ */
+ *ppos += written;
+ count -= written;
+ written_buffered = generic_perform_write(file, from, *ppos);
+ /*
+ * If generic_file_buffered_write() returned a synchronous error
+ * then we want to return the number of bytes which were
+ * direct-written, or the error code if that was zero. Note
+ * that this differs from normal direct-io semantics, which
+ * will return -EFOO even if some bytes were written.
+ */
+ if (written_buffered < 0) {
+ ret = written_buffered;
+ goto out_dio;
+ }
+
+ iocb->ki_pos = *ppos + written_buffered;
+ /* We need to ensure that the page cache pages are written to
+ * disk and invalidated to preserve the expected O_DIRECT
+ * semantics.
+ */
+ endbyte = *ppos + written_buffered - 1;
+ ret = filemap_write_and_wait_range(file->f_mapping, *ppos,
+ endbyte);
+ if (ret == 0) {
+ written += written_buffered;
+ invalidate_mapping_pages(mapping,
+ *ppos >> PAGE_CACHE_SHIFT,
+ endbyte >> PAGE_CACHE_SHIFT);
+ } else {
+ /*
+ * We don't know how much we wrote, so just return
+ * the number of bytes which were direct-written
+ */
+ }
} else {
current->backing_dev_info = inode_to_bdi(inode);
written = generic_perform_write(file, from, *ppos);
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 97bf761c9e7c..e8c62f22215c 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -51,13 +51,22 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb,
struct ocfs2_alloc_context *data_ac,
struct ocfs2_alloc_context *meta_ac,
enum ocfs2_alloc_restarted *reason_ret);
+int ocfs2_set_inode_size(handle_t *handle,
+ struct inode *inode,
+ struct buffer_head *fe_bh,
+ u64 new_i_size);
int ocfs2_simple_size_update(struct inode *inode,
struct buffer_head *di_bh,
u64 new_i_size);
+int ocfs2_truncate_file(struct inode *inode,
+ struct buffer_head *di_bh,
+ u64 new_i_size);
int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
u64 new_i_size, u64 zero_to);
int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
loff_t zero_to);
+int ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
+ u32 clusters_to_add, int mark_unwritten);
int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index c8b25de9efbb..3025c0da6b8a 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -648,7 +648,7 @@ static int ocfs2_remove_inode(struct inode *inode,
if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) {
status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode,
- orphan_dir_bh);
+ orphan_dir_bh, false);
if (status < 0) {
mlog_errno(status);
goto bail_commit;
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index ca3431ee7f24..5e86b247c821 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -81,6 +81,8 @@ struct ocfs2_inode_info
tid_t i_sync_tid;
tid_t i_datasync_tid;
+ wait_queue_head_t append_dio_wq;
+
struct dquot *i_dquot[MAXQUOTAS];
};
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index d10860fde165..ff531928269e 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -50,6 +50,8 @@
#include "sysfile.h"
#include "uptodate.h"
#include "quota.h"
+#include "file.h"
+#include "namei.h"
#include "buffer_head_io.h"
#include "ocfs2_trace.h"
@@ -69,13 +71,15 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
static int ocfs2_trylock_journal(struct ocfs2_super *osb,
int slot_num);
static int ocfs2_recover_orphans(struct ocfs2_super *osb,
- int slot);
+ int slot,
+ enum ocfs2_orphan_reco_type orphan_reco_type);
static int ocfs2_commit_thread(void *arg);
static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
int slot_num,
struct ocfs2_dinode *la_dinode,
struct ocfs2_dinode *tl_dinode,
- struct ocfs2_quota_recovery *qrec);
+ struct ocfs2_quota_recovery *qrec,
+ enum ocfs2_orphan_reco_type orphan_reco_type);
static inline int ocfs2_wait_on_mount(struct ocfs2_super *osb)
{
@@ -149,7 +153,8 @@ int ocfs2_compute_replay_slots(struct ocfs2_super *osb)
return 0;
}
-void ocfs2_queue_replay_slots(struct ocfs2_super *osb)
+void ocfs2_queue_replay_slots(struct ocfs2_super *osb,
+ enum ocfs2_orphan_reco_type orphan_reco_type)
{
struct ocfs2_replay_map *replay_map = osb->replay_map;
int i;
@@ -163,7 +168,8 @@ void ocfs2_queue_replay_slots(struct ocfs2_super *osb)
for (i = 0; i < replay_map->rm_slots; i++)
if (replay_map->rm_replay_slots[i])
ocfs2_queue_recovery_completion(osb->journal, i, NULL,
- NULL, NULL);
+ NULL, NULL,
+ orphan_reco_type);
replay_map->rm_state = REPLAY_DONE;
}
@@ -1174,6 +1180,7 @@ struct ocfs2_la_recovery_item {
struct ocfs2_dinode *lri_la_dinode;
struct ocfs2_dinode *lri_tl_dinode;
struct ocfs2_quota_recovery *lri_qrec;
+ enum ocfs2_orphan_reco_type lri_orphan_reco_type;
};
/* Does the second half of the recovery process. By this point, the
@@ -1195,6 +1202,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
struct ocfs2_dinode *la_dinode, *tl_dinode;
struct ocfs2_la_recovery_item *item, *n;
struct ocfs2_quota_recovery *qrec;
+ enum ocfs2_orphan_reco_type orphan_reco_type;
LIST_HEAD(tmp_la_list);
trace_ocfs2_complete_recovery(
@@ -1212,6 +1220,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
la_dinode = item->lri_la_dinode;
tl_dinode = item->lri_tl_dinode;
qrec = item->lri_qrec;
+ orphan_reco_type = item->lri_orphan_reco_type;
trace_ocfs2_complete_recovery_slot(item->lri_slot,
la_dinode ? le64_to_cpu(la_dinode->i_blkno) : 0,
@@ -1236,7 +1245,8 @@ void ocfs2_complete_recovery(struct work_struct *work)
kfree(tl_dinode);
}
- ret = ocfs2_recover_orphans(osb, item->lri_slot);
+ ret = ocfs2_recover_orphans(osb, item->lri_slot,
+ orphan_reco_type);
if (ret < 0)
mlog_errno(ret);
@@ -1261,7 +1271,8 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
int slot_num,
struct ocfs2_dinode *la_dinode,
struct ocfs2_dinode *tl_dinode,
- struct ocfs2_quota_recovery *qrec)
+ struct ocfs2_quota_recovery *qrec,
+ enum ocfs2_orphan_reco_type orphan_reco_type)
{
struct ocfs2_la_recovery_item *item;
@@ -1285,6 +1296,7 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
item->lri_slot = slot_num;
item->lri_tl_dinode = tl_dinode;
item->lri_qrec = qrec;
+ item->lri_orphan_reco_type = orphan_reco_type;
spin_lock(&journal->j_lock);
list_add_tail(&item->lri_list, &journal->j_la_cleanups);
@@ -1304,7 +1316,8 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
/* No need to queue up our truncate_log as regular cleanup will catch
* that */
ocfs2_queue_recovery_completion(journal, osb->slot_num,
- osb->local_alloc_copy, NULL, NULL);
+ osb->local_alloc_copy, NULL, NULL,
+ ORPHAN_NEED_TRUNCATE);
ocfs2_schedule_truncate_log_flush(osb, 0);
osb->local_alloc_copy = NULL;
@@ -1312,7 +1325,7 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
/* queue to recover orphan slots for all offline slots */
ocfs2_replay_map_set_state(osb, REPLAY_NEEDED);
- ocfs2_queue_replay_slots(osb);
+ ocfs2_queue_replay_slots(osb, ORPHAN_NEED_TRUNCATE);
ocfs2_free_replay_slots(osb);
}
@@ -1323,7 +1336,8 @@ void ocfs2_complete_quota_recovery(struct ocfs2_super *osb)
osb->slot_num,
NULL,
NULL,
- osb->quota_rec);
+ osb->quota_rec,
+ ORPHAN_NEED_TRUNCATE);
osb->quota_rec = NULL;
}
}
@@ -1360,7 +1374,7 @@ restart:
/* queue recovery for our own slot */
ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
- NULL, NULL);
+ NULL, NULL, ORPHAN_NO_NEED_TRUNCATE);
spin_lock(&osb->osb_lock);
while (rm->rm_used) {
@@ -1419,13 +1433,14 @@ skip_recovery:
continue;
}
ocfs2_queue_recovery_completion(osb->journal, rm_quota[i],
- NULL, NULL, qrec);
+ NULL, NULL, qrec,
+ ORPHAN_NEED_TRUNCATE);
}
ocfs2_super_unlock(osb, 1);
/* queue recovery for offline slots */
- ocfs2_queue_replay_slots(osb);
+ ocfs2_queue_replay_slots(osb, ORPHAN_NEED_TRUNCATE);
bail:
mutex_lock(&osb->recovery_lock);
@@ -1711,7 +1726,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
/* This will kfree the memory pointed to by la_copy and tl_copy */
ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy,
- tl_copy, NULL);
+ tl_copy, NULL, ORPHAN_NEED_TRUNCATE);
status = 0;
done:
@@ -1901,7 +1916,7 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
for (i = 0; i < osb->max_slots; i++)
ocfs2_queue_recovery_completion(osb->journal, i, NULL, NULL,
- NULL);
+ NULL, ORPHAN_NO_NEED_TRUNCATE);
/*
* We queued a recovery on orphan slots, increment the sequence
* number and update LVB so other node will skip the scan for a while
@@ -2000,6 +2015,13 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name,
if (IS_ERR(iter))
return 0;
+ /* Skip inodes which are already added to recover list, since dio may
+ * happen concurrently with unlink/rename */
+ if (OCFS2_I(iter)->ip_next_orphan) {
+ iput(iter);
+ return 0;
+ }
+
trace_ocfs2_orphan_filldir((unsigned long long)OCFS2_I(iter)->ip_blkno);
/* No locking is required for the next_orphan queue as there
* is only ever a single process doing orphan recovery. */
@@ -2108,7 +2130,8 @@ static void ocfs2_clear_recovering_orphan_dir(struct ocfs2_super *osb,
* advertising our state to ocfs2_delete_inode().
*/
static int ocfs2_recover_orphans(struct ocfs2_super *osb,
- int slot)
+ int slot,
+ enum ocfs2_orphan_reco_type orphan_reco_type)
{
int ret = 0;
struct inode *inode = NULL;
@@ -2132,13 +2155,60 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
(unsigned long long)oi->ip_blkno);
iter = oi->ip_next_orphan;
+ oi->ip_next_orphan = NULL;
+
+ /*
+ * We need to take and drop the inode lock to
+ * force read inode from disk.
+ */
+ ret = ocfs2_inode_lock(inode, NULL, 0);
+ if (ret) {
+ mlog_errno(ret);
+ goto next;
+ }
+ ocfs2_inode_unlock(inode, 0);
+
+ if (inode->i_nlink == 0) {
+ spin_lock(&oi->ip_lock);
+ /* Set the proper information to get us going into
+ * ocfs2_delete_inode. */
+ oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
+ spin_unlock(&oi->ip_lock);
+ } else if (orphan_reco_type == ORPHAN_NEED_TRUNCATE) {
+ struct buffer_head *di_bh = NULL;
+
+ ret = ocfs2_rw_lock(inode, 1);
+ if (ret) {
+ mlog_errno(ret);
+ goto next;
+ }
+
+ ret = ocfs2_inode_lock(inode, &di_bh, 1);
+ if (ret < 0) {
+ ocfs2_rw_unlock(inode, 1);
+ mlog_errno(ret);
+ goto next;
+ }
+
+ ret = ocfs2_truncate_file(inode, di_bh,
+ i_size_read(inode));
+ ocfs2_inode_unlock(inode, 1);
+ ocfs2_rw_unlock(inode, 1);
+ brelse(di_bh);
+ if (ret < 0) {
+ if (ret != -ENOSPC)
+ mlog_errno(ret);
+ goto next;
+ }
+
+ ret = ocfs2_del_inode_from_orphan(osb, inode, 0, 0);
+ if (ret)
+ mlog_errno(ret);
- spin_lock(&oi->ip_lock);
- /* Set the proper information to get us going into
- * ocfs2_delete_inode. */
- oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
- spin_unlock(&oi->ip_lock);
+ wake_up(&OCFS2_I(inode)->append_dio_wq);
+ } /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */
+next:
iput(inode);
inode = iter;
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 7f8cde94abfe..f4cd3c3e9fb7 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -472,6 +472,11 @@ static inline int ocfs2_unlink_credits(struct super_block *sb)
* orphan dir index leaf */
#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 4)
+/* dinode + orphan dir dinode + extent tree leaf block + orphan dir entry +
+ * orphan dir index root + orphan dir index leaf */
+#define OCFS2_INODE_ADD_TO_ORPHAN_CREDITS (2 * OCFS2_INODE_UPDATE_CREDITS + 4)
+#define OCFS2_INODE_DEL_FROM_ORPHAN_CREDITS OCFS2_INODE_ADD_TO_ORPHAN_CREDITS
+
/* dinode update, old dir dinode update, new dir dinode update, old
* dir dir entry, new dir dir entry, dir entry update for renaming
* directory + target unlink + 3 x dir index leaves */
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 914c121ec890..b5c3a5ea3ee6 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -79,7 +79,8 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
struct inode **ret_orphan_dir,
u64 blkno,
char *name,
- struct ocfs2_dir_lookup_result *lookup);
+ struct ocfs2_dir_lookup_result *lookup,
+ bool dio);
static int ocfs2_orphan_add(struct ocfs2_super *osb,
handle_t *handle,
@@ -87,7 +88,8 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
struct buffer_head *fe_bh,
char *name,
struct ocfs2_dir_lookup_result *lookup,
- struct inode *orphan_dir_inode);
+ struct inode *orphan_dir_inode,
+ bool dio);
static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
handle_t *handle,
@@ -104,6 +106,8 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2);
/* An orphan dir name is an 8 byte value, printed as a hex string */
#define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64)))
+#define OCFS2_DIO_ORPHAN_PREFIX "dio-"
+#define OCFS2_DIO_ORPHAN_PREFIX_LEN 4
static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags)
@@ -952,7 +956,8 @@ static int ocfs2_unlink(struct inode *dir,
if (ocfs2_inode_is_unlinkable(inode)) {
status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
OCFS2_I(inode)->ip_blkno,
- orphan_name, &orphan_insert);
+ orphan_name, &orphan_insert,
+ false);
if (status < 0) {
mlog_errno(status);
goto leave;
@@ -1004,7 +1009,7 @@ static int ocfs2_unlink(struct inode *dir,
if (is_unlinkable) {
status = ocfs2_orphan_add(osb, handle, inode, fe_bh,
- orphan_name, &orphan_insert, orphan_dir);
+ orphan_name, &orphan_insert, orphan_dir, false);
if (status < 0)
mlog_errno(status);
}
@@ -1440,7 +1445,8 @@ static int ocfs2_rename(struct inode *old_dir,
if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) {
status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
OCFS2_I(new_inode)->ip_blkno,
- orphan_name, &orphan_insert);
+ orphan_name, &orphan_insert,
+ false);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -1507,7 +1513,7 @@ static int ocfs2_rename(struct inode *old_dir,
if (should_add_orphan) {
status = ocfs2_orphan_add(osb, handle, new_inode,
newfe_bh, orphan_name,
- &orphan_insert, orphan_dir);
+ &orphan_insert, orphan_dir, false);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -2088,12 +2094,28 @@ static int __ocfs2_prepare_orphan_dir(struct inode *orphan_dir_inode,
struct buffer_head *orphan_dir_bh,
u64 blkno,
char *name,
- struct ocfs2_dir_lookup_result *lookup)
+ struct ocfs2_dir_lookup_result *lookup,
+ bool dio)
{
int ret;
struct ocfs2_super *osb = OCFS2_SB(orphan_dir_inode->i_sb);
+ int namelen = dio ?
+ (OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN) :
+ OCFS2_ORPHAN_NAMELEN;
+
+ if (dio) {
+ ret = snprintf(name, OCFS2_DIO_ORPHAN_PREFIX_LEN + 1, "%s",
+ OCFS2_DIO_ORPHAN_PREFIX);
+ if (ret != OCFS2_DIO_ORPHAN_PREFIX_LEN) {
+ ret = -EINVAL;
+ mlog_errno(ret);
+ return ret;
+ }
- ret = ocfs2_blkno_stringify(blkno, name);
+ ret = ocfs2_blkno_stringify(blkno,
+ name + OCFS2_DIO_ORPHAN_PREFIX_LEN);
+ } else
+ ret = ocfs2_blkno_stringify(blkno, name);
if (ret < 0) {
mlog_errno(ret);
return ret;
@@ -2101,7 +2123,7 @@ static int __ocfs2_prepare_orphan_dir(struct inode *orphan_dir_inode,
ret = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode,
orphan_dir_bh, name,
- OCFS2_ORPHAN_NAMELEN, lookup);
+ namelen, lookup);
if (ret < 0) {
mlog_errno(ret);
return ret;
@@ -2128,7 +2150,8 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
struct inode **ret_orphan_dir,
u64 blkno,
char *name,
- struct ocfs2_dir_lookup_result *lookup)
+ struct ocfs2_dir_lookup_result *lookup,
+ bool dio)
{
struct inode *orphan_dir_inode = NULL;
struct buffer_head *orphan_dir_bh = NULL;
@@ -2142,7 +2165,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
}
ret = __ocfs2_prepare_orphan_dir(orphan_dir_inode, orphan_dir_bh,
- blkno, name, lookup);
+ blkno, name, lookup, dio);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -2170,12 +2193,16 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
struct buffer_head *fe_bh,
char *name,
struct ocfs2_dir_lookup_result *lookup,
- struct inode *orphan_dir_inode)
+ struct inode *orphan_dir_inode,
+ bool dio)
{
struct buffer_head *orphan_dir_bh = NULL;
int status = 0;
struct ocfs2_dinode *orphan_fe;
struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
+ int namelen = dio ?
+ (OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN) :
+ OCFS2_ORPHAN_NAMELEN;
trace_ocfs2_orphan_add_begin(
(unsigned long long)OCFS2_I(inode)->ip_blkno);
@@ -2219,7 +2246,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
ocfs2_journal_dirty(handle, orphan_dir_bh);
status = __ocfs2_add_entry(handle, orphan_dir_inode, name,
- OCFS2_ORPHAN_NAMELEN, inode,
+ namelen, inode,
OCFS2_I(inode)->ip_blkno,
orphan_dir_bh, lookup);
if (status < 0) {
@@ -2227,13 +2254,21 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
goto rollback;
}
- fe->i_flags |= cpu_to_le32(OCFS2_ORPHANED_FL);
- OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR;
+ if (dio) {
+ /* Update flag OCFS2_DIO_ORPHANED_FL and record the orphan
+ * slot.
+ */
+ fe->i_flags |= cpu_to_le32(OCFS2_DIO_ORPHANED_FL);
+ fe->i_dio_orphaned_slot = cpu_to_le16(osb->slot_num);
+ } else {
+ fe->i_flags |= cpu_to_le32(OCFS2_ORPHANED_FL);
+ OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR;
- /* Record which orphan dir our inode now resides
- * in. delete_inode will use this to determine which orphan
- * dir to lock. */
- fe->i_orphaned_slot = cpu_to_le16(osb->slot_num);
+ /* Record which orphan dir our inode now resides
+ * in. delete_inode will use this to determine which orphan
+ * dir to lock. */
+ fe->i_orphaned_slot = cpu_to_le16(osb->slot_num);
+ }
ocfs2_journal_dirty(handle, fe_bh);
@@ -2258,14 +2293,28 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
handle_t *handle,
struct inode *orphan_dir_inode,
struct inode *inode,
- struct buffer_head *orphan_dir_bh)
+ struct buffer_head *orphan_dir_bh,
+ bool dio)
{
- char name[OCFS2_ORPHAN_NAMELEN + 1];
+ const int namelen = OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN;
+ char name[namelen + 1];
struct ocfs2_dinode *orphan_fe;
int status = 0;
struct ocfs2_dir_lookup_result lookup = { NULL, };
- status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name);
+ if (dio) {
+ status = snprintf(name, OCFS2_DIO_ORPHAN_PREFIX_LEN + 1, "%s",
+ OCFS2_DIO_ORPHAN_PREFIX);
+ if (status != OCFS2_DIO_ORPHAN_PREFIX_LEN) {
+ status = -EINVAL;
+ mlog_errno(status);
+ return status;
+ }
+
+ status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno,
+ name + OCFS2_DIO_ORPHAN_PREFIX_LEN);
+ } else
+ status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name);
if (status < 0) {
mlog_errno(status);
goto leave;
@@ -2273,10 +2322,10 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
trace_ocfs2_orphan_del(
(unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno,
- name, OCFS2_ORPHAN_NAMELEN);
+ name, namelen);
/* find it's spot in the orphan directory */
- status = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN, orphan_dir_inode,
+ status = ocfs2_find_entry(name, namelen, orphan_dir_inode,
&lookup);
if (status) {
mlog_errno(status);
@@ -2376,7 +2425,8 @@ static int ocfs2_prep_new_orphaned_file(struct inode *dir,
}
ret = __ocfs2_prepare_orphan_dir(orphan_dir, orphan_dir_bh,
- di_blkno, orphan_name, orphan_insert);
+ di_blkno, orphan_name, orphan_insert,
+ false);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -2482,7 +2532,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
di = (struct ocfs2_dinode *)new_di_bh->b_data;
status = ocfs2_orphan_add(osb, handle, inode, new_di_bh, orphan_name,
- &orphan_insert, orphan_dir);
+ &orphan_insert, orphan_dir, false);
if (status < 0) {
mlog_errno(status);
goto leave;
@@ -2527,6 +2577,186 @@ leave:
return status;
}
+/*
+ * Condition helper used while waiting on append_dio_wq: reports whether
+ * the on-disk inode no longer has OCFS2_DIO_ORPHANED_FL set.
+ *
+ * The dinode is read under ocfs2_inode_lock(inode, ..., 1) and the lock
+ * and buffer are released again before returning.
+ *
+ * Returns non-zero when the flag is clear.  Returns 0 while the flag is
+ * still set, and also when the inode lock could not be taken (the error
+ * is logged, and the caller simply keeps waiting).
+ */
+static int ocfs2_dio_orphan_recovered(struct inode *inode)
+{
+	struct buffer_head *bh = NULL;
+	struct ocfs2_dinode *fe;
+	int recovered;
+	int err;
+
+	err = ocfs2_inode_lock(inode, &bh, 1);
+	if (err < 0) {
+		mlog_errno(err);
+		return 0;
+	}
+
+	fe = (struct ocfs2_dinode *)bh->b_data;
+	recovered = (fe->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL)) ? 0 : 1;
+
+	ocfs2_inode_unlock(inode, 1);
+	brelse(bh);
+
+	return recovered;
+}
+
+/* How long (in milliseconds) to sleep between re-checks of the dio orphan flag. */
+#define OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL 10000
+/*
+ * Put @inode into the orphan directory under the dio-prefixed name and
+ * mark its dinode with OCFS2_DIO_ORPHANED_FL.
+ *
+ * If the dinode already carries OCFS2_DIO_ORPHANED_FL, the inode lock is
+ * dropped and the caller sleeps on append_dio_wq until
+ * ocfs2_dio_orphan_recovered() reports the flag cleared, re-checking at
+ * least every OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL ms, then starts over.
+ * The sleep is interruptible: if a signal arrives the function returns
+ * the negative value from the wait instead of retrying, because retrying
+ * with a signal pending would make the wait return immediately and turn
+ * this into a busy loop.
+ *
+ * Returns 0 on success or a negative errno.  No locks are held on return.
+ */
+int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
+		struct inode *inode)
+{
+	char orphan_name[OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN + 1];
+	struct inode *orphan_dir_inode = NULL;
+	struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
+	struct buffer_head *di_bh = NULL;
+	int status = 0;
+	long wait_ret;
+	handle_t *handle = NULL;
+	struct ocfs2_dinode *di = NULL;
+
+restart:
+	status = ocfs2_inode_lock(inode, &di_bh, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	di = (struct ocfs2_dinode *) di_bh->b_data;
+	/*
+	 * Another append dio crashed?
+	 * If so, wait for recovery first.
+	 */
+	if (unlikely(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) {
+		ocfs2_inode_unlock(inode, 1);
+		brelse(di_bh);
+		/* Do not keep a pointer to the buffer we just released. */
+		di_bh = NULL;
+		di = NULL;
+
+		wait_ret = wait_event_interruptible_timeout(
+				OCFS2_I(inode)->append_dio_wq,
+				ocfs2_dio_orphan_recovered(inode),
+				msecs_to_jiffies(OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL));
+		if (wait_ret < 0) {
+			/*
+			 * Interrupted by a signal.  Nothing is locked at
+			 * this point, so just hand the error back.
+			 */
+			status = (int)wait_ret;
+			goto bail;
+		}
+		goto restart;
+	}
+
+	/*
+	 * On success the orphan dir comes back locked (i_mutex and inode
+	 * lock); both are released at bail_unlock_orphan.
+	 */
+	status = ocfs2_prepare_orphan_dir(osb, &orphan_dir_inode,
+			OCFS2_I(inode)->ip_blkno,
+			orphan_name,
+			&orphan_insert,
+			true);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail_unlock_inode;
+	}
+
+	handle = ocfs2_start_trans(osb,
+			OCFS2_INODE_ADD_TO_ORPHAN_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto bail_unlock_orphan;
+	}
+
+	status = ocfs2_orphan_add(osb, handle, inode, di_bh, orphan_name,
+			&orphan_insert, orphan_dir_inode, true);
+	if (status)
+		mlog_errno(status);
+
+	/* The transaction is committed whether or not the add succeeded. */
+	ocfs2_commit_trans(osb, handle);
+
+bail_unlock_orphan:
+	ocfs2_inode_unlock(orphan_dir_inode, 1);
+	mutex_unlock(&orphan_dir_inode->i_mutex);
+	iput(orphan_dir_inode);
+
+	ocfs2_free_dir_lookup_result(&orphan_insert);
+
+bail_unlock_inode:
+	ocfs2_inode_unlock(inode, 1);
+	brelse(di_bh);
+
+bail:
+	return status;
+}
+
+/*
+ * Remove @inode's dio entry from the orphan directory and clear
+ * OCFS2_DIO_ORPHANED_FL / i_dio_orphaned_slot in its dinode.
+ *
+ * The orphan directory is looked up from the slot recorded in the
+ * dinode (i_dio_orphaned_slot).  Lock order is: inode lock on @inode,
+ * then the orphan dir's i_mutex, then the orphan dir's inode lock; they
+ * are dropped in the reverse arrangement on the way out.
+ *
+ * @update_isize: when non-zero, ocfs2_set_inode_size() is called with
+ *                @end in the same transaction; otherwise the dinode
+ *                buffer is only marked dirty in the journal.
+ * @end:          size passed to ocfs2_set_inode_size().
+ *
+ * The dinode must have OCFS2_DIO_ORPHANED_FL set (BUG_ON otherwise).
+ * Returns 0 on success or a negative errno.
+ */
+int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
+		struct inode *inode, int update_isize,
+		loff_t end)
+{
+	struct ocfs2_dinode *fe = NULL;
+	struct buffer_head *fe_bh = NULL;
+	struct buffer_head *odir_bh = NULL;
+	struct inode *odir = NULL;
+	handle_t *trans = NULL;
+	int ret;
+
+	ret = ocfs2_inode_lock(inode, &fe_bh, 1);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+	fe = (struct ocfs2_dinode *)fe_bh->b_data;
+
+	odir = ocfs2_get_system_file_inode(osb, ORPHAN_DIR_SYSTEM_INODE,
+			le16_to_cpu(fe->i_dio_orphaned_slot));
+	if (!odir) {
+		ret = -ENOENT;
+		mlog_errno(ret);
+		goto out_unlock_inode;
+	}
+
+	mutex_lock(&odir->i_mutex);
+	ret = ocfs2_inode_lock(odir, &odir_bh, 1);
+	if (ret < 0) {
+		/* Orphan dir lock never taken: undo mutex + reference here. */
+		mutex_unlock(&odir->i_mutex);
+		iput(odir);
+		mlog_errno(ret);
+		goto out_unlock_inode;
+	}
+
+	trans = ocfs2_start_trans(osb, OCFS2_INODE_DEL_FROM_ORPHAN_CREDITS);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto out_unlock_orphan;
+	}
+
+	BUG_ON(!(fe->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL)));
+
+	ret = ocfs2_orphan_del(osb, trans, odir, inode, odir_bh, true);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = ocfs2_journal_access_di(trans, INODE_CACHE(inode), fe_bh,
+			OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	fe->i_flags &= ~cpu_to_le32(OCFS2_DIO_ORPHANED_FL);
+	fe->i_dio_orphaned_slot = 0;
+
+	if (!update_isize) {
+		ocfs2_journal_dirty(trans, fe_bh);
+	} else {
+		ret = ocfs2_set_inode_size(trans, inode, fe_bh, end);
+		if (ret)
+			mlog_errno(ret);
+	}
+
+out_commit:
+	ocfs2_commit_trans(osb, trans);
+
+out_unlock_orphan:
+	ocfs2_inode_unlock(odir, 1);
+	mutex_unlock(&odir->i_mutex);
+	brelse(odir_bh);
+	iput(odir);
+
+out_unlock_inode:
+	ocfs2_inode_unlock(inode, 1);
+	brelse(fe_bh);
+
+	return ret;
+}
+
int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
struct inode *inode,
struct dentry *dentry)
@@ -2615,7 +2845,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
}
status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode,
- orphan_dir_bh);
+ orphan_dir_bh, false);
if (status < 0) {
mlog_errno(status);
goto out_commit;
diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h
index e5d059d4f115..5ddecce172fa 100644
--- a/fs/ocfs2/namei.h
+++ b/fs/ocfs2/namei.h
@@ -34,10 +34,16 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
handle_t *handle,
struct inode *orphan_dir_inode,
struct inode *inode,
- struct buffer_head *orphan_dir_bh);
+ struct buffer_head *orphan_dir_bh,
+ bool dio);
int ocfs2_create_inode_in_orphan(struct inode *dir,
int mode,
struct inode **new_inode);
+int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
+ struct inode *inode);
+int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
+ struct inode *inode, int update_isize,
+ loff_t end);
int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
struct inode *new_inode,
struct dentry *new_dentry);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index fdbcbfed529e..8490c64d34fe 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -209,6 +209,11 @@ struct ocfs2_lock_res {
#endif
};
+/*
+ * NOTE(review): presumably tells orphan recovery whether the inode has
+ * to be truncated -- the users are not visible here, confirm against
+ * the callers.
+ */
+enum ocfs2_orphan_reco_type {
+	ORPHAN_NO_NEED_TRUNCATE	= 0,
+	ORPHAN_NEED_TRUNCATE	= 1,
+};
+
enum ocfs2_orphan_scan_state {
ORPHAN_SCAN_ACTIVE,
ORPHAN_SCAN_INACTIVE
@@ -495,6 +500,14 @@ static inline int ocfs2_writes_unwritten_extents(struct ocfs2_super *osb)
return 0;
}
+/*
+ * Returns 1 if OCFS2_FEATURE_RO_COMPAT_APPEND_DIO is set in
+ * osb->s_feature_ro_compat, 0 otherwise.
+ */
+static inline int ocfs2_supports_append_dio(struct ocfs2_super *osb)
+{
+	return (osb->s_feature_ro_compat &
+		OCFS2_FEATURE_RO_COMPAT_APPEND_DIO) ? 1 : 0;
+}
+
+
static inline int ocfs2_supports_inline_data(struct ocfs2_super *osb)
{
if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INLINE_DATA)
@@ -726,6 +739,16 @@ static inline unsigned int ocfs2_clusters_for_bytes(struct super_block *sb,
return clusters;
}
+/*
+ * Convert a byte count to a cluster count, rounding down: @bytes is
+ * shifted right by the superblock's s_clustersize_bits and the result
+ * is truncated to unsigned int.
+ */
+static inline unsigned int ocfs2_bytes_to_clusters(struct super_block *sb,
+		u64 bytes)
+{
+	int shift = OCFS2_SB(sb)->s_clustersize_bits;
+
+	return (unsigned int)(bytes >> shift);
+}
+
static inline u64 ocfs2_blocks_for_bytes(struct super_block *sb,
u64 bytes)
{
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 938387a10d5d..20e37a3ed26f 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -105,7 +105,8 @@
| OCFS2_FEATURE_INCOMPAT_CLUSTERINFO)
#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
| OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
- | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
+ | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA \
+ | OCFS2_FEATURE_RO_COMPAT_APPEND_DIO)
/*
* Heartbeat-only devices are missing journals and other files. The
@@ -199,6 +200,11 @@
#define OCFS2_FEATURE_RO_COMPAT_USRQUOTA 0x0002
#define OCFS2_FEATURE_RO_COMPAT_GRPQUOTA 0x0004
+/*
+ * Append Direct IO support
+ */
+#define OCFS2_FEATURE_RO_COMPAT_APPEND_DIO 0x0008
+
/* The byte offset of the first backup block will be 1G.
* The following will be 4G, 16G, 64G, 256G and 1T.
*/
@@ -229,6 +235,8 @@
#define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */
#define OCFS2_DEALLOC_FL (0x00000800) /* Truncate log */
#define OCFS2_QUOTA_FL (0x00001000) /* Quota file */
+#define OCFS2_DIO_ORPHANED_FL	(0x00002000)	/* On the orphan list especially
+						 * for dio */
/*
* Flags on ocfs2_dinode.i_dyn_features
@@ -729,7 +737,9 @@ struct ocfs2_dinode {
inode belongs to. Only valid
if allocated from a
discontiguous block group */
-/*A0*/ __le64 i_reserved2[3];
+/*A0*/ __le16 i_dio_orphaned_slot; /* only used for append dio write */
+ __le16 i_reserved1[3];
+ __le64 i_reserved2[2];
/*B8*/ union {
__le64 i_pad1; /* Generic way to refer to this
64bit union */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 87a1f7679d9b..26675185b886 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1746,6 +1746,8 @@ static void ocfs2_inode_init_once(void *data)
ocfs2_lock_res_init_once(&oi->ip_inode_lockres);
ocfs2_lock_res_init_once(&oi->ip_open_lockres);
+ init_waitqueue_head(&oi->append_dio_wq);
+
ocfs2_metadata_cache_init(INODE_CACHE(&oi->vfs_inode),
&ocfs2_inode_caching_ops);
diff --git a/fs/open.c b/fs/open.c
index d36c42ff019d..33f9cbf2610b 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -667,11 +667,8 @@ int open_check_o_direct(struct file *f)
{
/* NB: we're sure to have correct a_ops only after f_op->open */
if (f->f_flags & O_DIRECT) {
- if (!f->f_mapping->a_ops ||
- ((!f->f_mapping->a_ops->direct_IO) &&
- (!f->f_mapping->a_ops->get_xip_mem))) {
+ if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO)
return -EINVAL;
- }
}
return 0;
}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index aeb9dc3082d5..f5e171cc71c1 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -51,6 +51,7 @@ struct swap_info_struct;
struct seq_file;
struct workqueue_struct;
struct iov_iter;
+struct vm_fault;
extern void __init inode_init(void);
extern void __init inode_init_early(void);
@@ -361,8 +362,6 @@ struct address_space_operations {
int (*releasepage) (struct page *, gfp_t);
void (*freepage)(struct page *);
ssize_t (*direct_IO)(int, struct kiocb *, struct iov_iter *iter, loff_t offset);
- int (*get_xip_mem)(struct address_space *, pgoff_t, int,
- void **, unsigned long *);
/*
* migrate the contents of a page to the specified target. If
* migrate_mode is MIGRATE_ASYNC, it must not block.
@@ -1676,6 +1675,11 @@ struct super_operations {
#define S_IMA 1024 /* Inode has an associated IMA struct */
#define S_AUTOMOUNT 2048 /* Automount/referral quasi-directory */
#define S_NOSEC 4096 /* no suid or xattr security attributes */
+#ifdef CONFIG_FS_DAX
+#define S_DAX 8192 /* Direct Access, avoiding the page cache */
+#else
+#define S_DAX 0 /* Make all the DAX code disappear */
+#endif
/*
* Note that nosuid etc flags are inode-specific: setting some file-system
@@ -1713,6 +1717,7 @@ struct super_operations {
#define IS_IMA(inode) ((inode)->i_flags & S_IMA)
#define IS_AUTOMOUNT(inode) ((inode)->i_flags & S_AUTOMOUNT)
#define IS_NOSEC(inode) ((inode)->i_flags & S_NOSEC)
+#define IS_DAX(inode) ((inode)->i_flags & S_DAX)
#define IS_WHITEOUT(inode) (S_ISCHR(inode->i_mode) && \
(inode)->i_rdev == WHITEOUT_DEV)
@@ -2573,19 +2578,13 @@ extern loff_t fixed_size_llseek(struct file *file, loff_t offset,
extern int generic_file_open(struct inode * inode, struct file * filp);
extern int nonseekable_open(struct inode * inode, struct file * filp);
-#ifdef CONFIG_FS_XIP
-extern ssize_t xip_file_read(struct file *filp, char __user *buf, size_t len,
- loff_t *ppos);
-extern int xip_file_mmap(struct file * file, struct vm_area_struct * vma);
-extern ssize_t xip_file_write(struct file *filp, const char __user *buf,
- size_t len, loff_t *ppos);
-extern int xip_truncate_page(struct address_space *mapping, loff_t from);
-#else
-static inline int xip_truncate_page(struct address_space *mapping, loff_t from)
-{
- return 0;
-}
-#endif
+ssize_t dax_do_io(int rw, struct kiocb *, struct inode *, struct iov_iter *,
+ loff_t, get_block_t, dio_iodone_t, int flags);
+int dax_clear_blocks(struct inode *, sector_t block, long size);
+int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
+int dax_truncate_page(struct inode *, loff_t from, get_block_t);
+int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
+#define dax_mkwrite(vma, vmf, gb) dax_fault(vma, vmf, gb)
#ifdef CONFIG_BLOCK
typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode,
@@ -2742,6 +2741,11 @@ extern int generic_show_options(struct seq_file *m, struct dentry *root);
extern void save_mount_options(struct super_block *sb, char *options);
extern void replace_mount_options(struct super_block *sb, char *options);
+/*
+ * True if @filp was opened with O_DIRECT or if its inode has S_DAX set
+ * (IS_DAX).
+ */
+static inline bool io_is_direct(struct file *filp)
+{
+	if (filp->f_flags & O_DIRECT)
+		return true;
+
+	return IS_DAX(file_inode(filp));
+}
+
static inline ino_t parent_ino(struct dentry *dentry)
{
ino_t res;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a2a348866719..d782617c11de 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -224,6 +224,7 @@ struct vm_fault {
pgoff_t pgoff; /* Logical page offset based on vma */
void __user *virtual_address; /* Faulting virtual address */
+ struct page *cow_page; /* Handler may choose to COW */
struct page *page; /* ->fault handlers should return a
* page here, unless VM_FAULT_NOPAGE
* is set (which is also implied by
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index dbcd5ec3f291..9c5ff69fa0cd 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -200,7 +200,7 @@ int page_referenced(struct page *, int is_locked,
int try_to_unmap(struct page *, enum ttu_flags flags);
/*
- * Called from mm/filemap_xip.c to unmap empty zero page
+ * Used by uprobes to replace a userspace page safely
*/
pte_t *__page_check_address(struct page *, struct mm_struct *,
unsigned long, spinlock_t **, int);
diff --git a/mm/Makefile b/mm/Makefile
index 3548460ab7b6..ac7987744b47 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -51,7 +51,6 @@ obj-$(CONFIG_SLUB) += slub.o
obj-$(CONFIG_KMEMCHECK) += kmemcheck.o
obj-$(CONFIG_FAILSLAB) += failslab.o
obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
-obj-$(CONFIG_FS_XIP) += filemap_xip.o
obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_QUICKLIST) += quicklist.o
obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 6dc4580df2af..a00a2e819a78 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -219,11 +219,46 @@ static ssize_t stable_pages_required_show(struct device *dev,
}
static DEVICE_ATTR_RO(stable_pages_required);
+/*
+ * sysfs store handler for "strictlimit".
+ *
+ * Parses @buf as a base-10 unsigned int: 0 clears BDI_CAP_STRICTLIMIT in
+ * bdi->capabilities, 1 sets it.  Any other number yields -EINVAL, and a
+ * parse failure returns the kstrtouint() error.  Returns @count on
+ * success.
+ */
+static ssize_t strictlimit_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct backing_dev_info *bdi = dev_get_drvdata(dev);
+	unsigned int enable;
+	ssize_t err;
+
+	err = kstrtouint(buf, 10, &enable);
+	if (err < 0)
+		return err;
+
+	/* Only 0 and 1 are accepted. */
+	if (enable > 1)
+		return -EINVAL;
+
+	if (enable)
+		bdi->capabilities |= BDI_CAP_STRICTLIMIT;
+	else
+		bdi->capabilities &= ~BDI_CAP_STRICTLIMIT;
+
+	return count;
+}
+/*
+ * sysfs show handler for "strictlimit": writes 1 or 0, followed by a
+ * newline, depending on whether BDI_CAP_STRICTLIMIT is set in
+ * bdi->capabilities.  Returns the snprintf() result.
+ */
+static ssize_t strictlimit_show(struct device *dev,
+		struct device_attribute *attr, char *page)
+{
+	struct backing_dev_info *bdi = dev_get_drvdata(dev);
+	int strict = (bdi->capabilities & BDI_CAP_STRICTLIMIT) ? 1 : 0;
+
+	return snprintf(page, PAGE_SIZE-1, "%d\n", strict);
+}
+static DEVICE_ATTR_RW(strictlimit);
+
static struct attribute *bdi_dev_attrs[] = {
&dev_attr_read_ahead_kb.attr,
&dev_attr_min_ratio.attr,
&dev_attr_max_ratio.attr,
&dev_attr_stable_pages_required.attr,
+ &dev_attr_strictlimit.attr,
NULL,
};
ATTRIBUTE_GROUPS(bdi_dev);
diff --git a/mm/fadvise.c b/mm/fadvise.c
index fac23ecf8d72..4a3907cf79f8 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -28,6 +28,7 @@
SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
{
struct fd f = fdget(fd);
+ struct inode *inode;
struct address_space *mapping;
struct backing_dev_info *bdi;
loff_t endbyte; /* inclusive */
@@ -39,7 +40,8 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
if (!f.file)
return -EBADF;
- if (S_ISFIFO(file_inode(f.file)->i_mode)) {
+ inode = file_inode(f.file);
+ if (S_ISFIFO(inode->i_mode)) {
ret = -ESPIPE;
goto out;
}
@@ -50,7 +52,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
goto out;
}
- if (mapping->a_ops->get_xip_mem) {
+ if (IS_DAX(inode)) {
switch (advice) {
case POSIX_FADV_NORMAL:
case POSIX_FADV_RANDOM:
diff --git a/mm/filemap.c b/mm/filemap.c
index d9f5336552d7..ad7242043bdb 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1695,8 +1695,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
loff_t *ppos = &iocb->ki_pos;
loff_t pos = *ppos;
- /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
- if (file->f_flags & O_DIRECT) {
+ if (io_is_direct(file)) {
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
size_t count = iov_iter_count(iter);
@@ -1723,9 +1722,11 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
* we've already read everything we wanted to, or if
* there was a short read because we hit EOF, go ahead
* and return. Otherwise fallthrough to buffered io for
- * the rest of the read.
+ * the rest of the read. Buffered reads will not work for
+ * DAX files, so don't bother trying.
*/
- if (retval < 0 || !iov_iter_count(iter) || *ppos >= size) {
+ if (retval < 0 || !iov_iter_count(iter) || *ppos >= size ||
+ IS_DAX(inode)) {
file_accessed(file);
goto out;
}
@@ -2582,18 +2583,20 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (err)
goto out;
- /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
- if (unlikely(file->f_flags & O_DIRECT)) {
+ if (io_is_direct(file)) {
loff_t endbyte;
written = generic_file_direct_write(iocb, from, pos);
- if (written < 0 || written == count)
- goto out;
-
/*
- * direct-io write to a hole: fall through to buffered I/O
- * for completing the rest of the request.
+ * If the write stopped short of completing, fall back to
+ * buffered writes. Some filesystems do this for writes to
+ * holes, for example. For DAX files, a buffered write will
+ * not succeed (even if it did, DAX does not handle dirty
+ * page-cache pages correctly).
*/
+ if (written < 0 || written == count || IS_DAX(inode))
+ goto out;
+
pos += written;
count -= written;
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
deleted file mode 100644
index c175f9f25210..000000000000
--- a/mm/filemap_xip.c
+++ /dev/null
@@ -1,478 +0,0 @@
-/*
- * linux/mm/filemap_xip.c
- *
- * Copyright (C) 2005 IBM Corporation
- * Author: Carsten Otte <cotte@de.ibm.com>
- *
- * derived from linux/mm/filemap.c - Copyright (C) Linus Torvalds
- *
- */
-
-#include <linux/fs.h>
-#include <linux/backing-dev.h>
-#include <linux/pagemap.h>
-#include <linux/export.h>
-#include <linux/uio.h>
-#include <linux/rmap.h>
-#include <linux/mmu_notifier.h>
-#include <linux/sched.h>
-#include <linux/seqlock.h>
-#include <linux/mutex.h>
-#include <linux/gfp.h>
-#include <asm/tlbflush.h>
-#include <asm/io.h>
-
-/*
- * We do use our own empty page to avoid interference with other users
- * of ZERO_PAGE(), such as /dev/zero
- */
-static DEFINE_MUTEX(xip_sparse_mutex);
-static seqcount_t xip_sparse_seq = SEQCNT_ZERO(xip_sparse_seq);
-static struct page *__xip_sparse_page;
-
-/* called under xip_sparse_mutex */
-static struct page *xip_sparse_page(void)
-{
- if (!__xip_sparse_page) {
- struct page *page = alloc_page(GFP_HIGHUSER | __GFP_ZERO);
-
- if (page)
- __xip_sparse_page = page;
- }
- return __xip_sparse_page;
-}
-
-/*
- * This is a file read routine for execute in place files, and uses
- * the mapping->a_ops->get_xip_mem() function for the actual low-level
- * stuff.
- *
- * Note the struct file* is not used at all. It may be NULL.
- */
-static ssize_t
-do_xip_mapping_read(struct address_space *mapping,
- struct file_ra_state *_ra,
- struct file *filp,
- char __user *buf,
- size_t len,
- loff_t *ppos)
-{
- struct inode *inode = mapping->host;
- pgoff_t index, end_index;
- unsigned long offset;
- loff_t isize, pos;
- size_t copied = 0, error = 0;
-
- BUG_ON(!mapping->a_ops->get_xip_mem);
-
- pos = *ppos;
- index = pos >> PAGE_CACHE_SHIFT;
- offset = pos & ~PAGE_CACHE_MASK;
-
- isize = i_size_read(inode);
- if (!isize)
- goto out;
-
- end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
- do {
- unsigned long nr, left;
- void *xip_mem;
- unsigned long xip_pfn;
- int zero = 0;
-
- /* nr is the maximum number of bytes to copy from this page */
- nr = PAGE_CACHE_SIZE;
- if (index >= end_index) {
- if (index > end_index)
- goto out;
- nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
- if (nr <= offset) {
- goto out;
- }
- }
- nr = nr - offset;
- if (nr > len - copied)
- nr = len - copied;
-
- error = mapping->a_ops->get_xip_mem(mapping, index, 0,
- &xip_mem, &xip_pfn);
- if (unlikely(error)) {
- if (error == -ENODATA) {
- /* sparse */
- zero = 1;
- } else
- goto out;
- }
-
- /* If users can be writing to this page using arbitrary
- * virtual addresses, take care about potential aliasing
- * before reading the page on the kernel side.
- */
- if (mapping_writably_mapped(mapping))
- /* address based flush */ ;
-
- /*
- * Ok, we have the mem, so now we can copy it to user space...
- *
- * The actor routine returns how many bytes were actually used..
- * NOTE! This may not be the same as how much of a user buffer
- * we filled up (we may be padding etc), so we can only update
- * "pos" here (the actor routine has to update the user buffer
- * pointers and the remaining count).
- */
- if (!zero)
- left = __copy_to_user(buf+copied, xip_mem+offset, nr);
- else
- left = __clear_user(buf + copied, nr);
-
- if (left) {
- error = -EFAULT;
- goto out;
- }
-
- copied += (nr - left);
- offset += (nr - left);
- index += offset >> PAGE_CACHE_SHIFT;
- offset &= ~PAGE_CACHE_MASK;
- } while (copied < len);
-
-out:
- *ppos = pos + copied;
- if (filp)
- file_accessed(filp);
-
- return (copied ? copied : error);
-}
-
-ssize_t
-xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
-{
- if (!access_ok(VERIFY_WRITE, buf, len))
- return -EFAULT;
-
- return do_xip_mapping_read(filp->f_mapping, &filp->f_ra, filp,
- buf, len, ppos);
-}
-EXPORT_SYMBOL_GPL(xip_file_read);
-
-/*
- * __xip_unmap is invoked from xip_unmap and xip_write
- *
- * This function walks all vmas of the address_space and unmaps the
- * __xip_sparse_page when found at pgoff.
- */
-static void __xip_unmap(struct address_space * mapping, unsigned long pgoff)
-{
- struct vm_area_struct *vma;
- struct page *page;
- unsigned count;
- int locked = 0;
-
- count = read_seqcount_begin(&xip_sparse_seq);
-
- page = __xip_sparse_page;
- if (!page)
- return;
-
-retry:
- i_mmap_lock_read(mapping);
- vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
- pte_t *pte, pteval;
- spinlock_t *ptl;
- struct mm_struct *mm = vma->vm_mm;
- unsigned long address = vma->vm_start +
- ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
-
- BUG_ON(address < vma->vm_start || address >= vma->vm_end);
- pte = page_check_address(page, mm, address, &ptl, 1);
- if (pte) {
- /* Nuke the page table entry. */
- flush_cache_page(vma, address, pte_pfn(*pte));
- pteval = ptep_clear_flush(vma, address, pte);
- page_remove_rmap(page);
- dec_mm_counter(mm, MM_FILEPAGES);
- BUG_ON(pte_dirty(pteval));
- pte_unmap_unlock(pte, ptl);
- /* must invalidate_page _before_ freeing the page */
- mmu_notifier_invalidate_page(mm, address);
- page_cache_release(page);
- }
- }
- i_mmap_unlock_read(mapping);
-
- if (locked) {
- mutex_unlock(&xip_sparse_mutex);
- } else if (read_seqcount_retry(&xip_sparse_seq, count)) {
- mutex_lock(&xip_sparse_mutex);
- locked = 1;
- goto retry;
- }
-}
-
-/*
- * xip_fault() is invoked via the vma operations vector for a
- * mapped memory region to read in file data during a page fault.
- *
- * This function is derived from filemap_fault, but used for execute in place
- */
-static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
- struct file *file = vma->vm_file;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
- pgoff_t size;
- void *xip_mem;
- unsigned long xip_pfn;
- struct page *page;
- int error;
-
- /* XXX: are VM_FAULT_ codes OK? */
-again:
- size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- if (vmf->pgoff >= size)
- return VM_FAULT_SIGBUS;
-
- error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0,
- &xip_mem, &xip_pfn);
- if (likely(!error))
- goto found;
- if (error != -ENODATA)
- return VM_FAULT_OOM;
-
- /* sparse block */
- if ((vma->vm_flags & (VM_WRITE | VM_MAYWRITE)) &&
- (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) &&
- (!(mapping->host->i_sb->s_flags & MS_RDONLY))) {
- int err;
-
- /* maybe shared writable, allocate new block */
- mutex_lock(&xip_sparse_mutex);
- error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 1,
- &xip_mem, &xip_pfn);
- mutex_unlock(&xip_sparse_mutex);
- if (error)
- return VM_FAULT_SIGBUS;
- /* unmap sparse mappings at pgoff from all other vmas */
- __xip_unmap(mapping, vmf->pgoff);
-
-found:
- err = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address,
- xip_pfn);
- if (err == -ENOMEM)
- return VM_FAULT_OOM;
- /*
- * err == -EBUSY is fine, we've raced against another thread
- * that faulted-in the same page
- */
- if (err != -EBUSY)
- BUG_ON(err);
- return VM_FAULT_NOPAGE;
- } else {
- int err, ret = VM_FAULT_OOM;
-
- mutex_lock(&xip_sparse_mutex);
- write_seqcount_begin(&xip_sparse_seq);
- error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0,
- &xip_mem, &xip_pfn);
- if (unlikely(!error)) {
- write_seqcount_end(&xip_sparse_seq);
- mutex_unlock(&xip_sparse_mutex);
- goto again;
- }
- if (error != -ENODATA)
- goto out;
- /* not shared and writable, use xip_sparse_page() */
- page = xip_sparse_page();
- if (!page)
- goto out;
- err = vm_insert_page(vma, (unsigned long)vmf->virtual_address,
- page);
- if (err == -ENOMEM)
- goto out;
-
- ret = VM_FAULT_NOPAGE;
-out:
- write_seqcount_end(&xip_sparse_seq);
- mutex_unlock(&xip_sparse_mutex);
-
- return ret;
- }
-}
-
-static const struct vm_operations_struct xip_file_vm_ops = {
- .fault = xip_file_fault,
- .page_mkwrite = filemap_page_mkwrite,
-};
-
-int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
-{
- BUG_ON(!file->f_mapping->a_ops->get_xip_mem);
-
- file_accessed(file);
- vma->vm_ops = &xip_file_vm_ops;
- vma->vm_flags |= VM_MIXEDMAP;
- return 0;
-}
-EXPORT_SYMBOL_GPL(xip_file_mmap);
-
-static ssize_t
-__xip_file_write(struct file *filp, const char __user *buf,
- size_t count, loff_t pos, loff_t *ppos)
-{
- struct address_space * mapping = filp->f_mapping;
- const struct address_space_operations *a_ops = mapping->a_ops;
- struct inode *inode = mapping->host;
- long status = 0;
- size_t bytes;
- ssize_t written = 0;
-
- BUG_ON(!mapping->a_ops->get_xip_mem);
-
- do {
- unsigned long index;
- unsigned long offset;
- size_t copied;
- void *xip_mem;
- unsigned long xip_pfn;
-
- offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
- index = pos >> PAGE_CACHE_SHIFT;
- bytes = PAGE_CACHE_SIZE - offset;
- if (bytes > count)
- bytes = count;
-
- status = a_ops->get_xip_mem(mapping, index, 0,
- &xip_mem, &xip_pfn);
- if (status == -ENODATA) {
- /* we allocate a new page unmap it */
- mutex_lock(&xip_sparse_mutex);
- status = a_ops->get_xip_mem(mapping, index, 1,
- &xip_mem, &xip_pfn);
- mutex_unlock(&xip_sparse_mutex);
- if (!status)
- /* unmap page at pgoff from all other vmas */
- __xip_unmap(mapping, index);
- }
-
- if (status)
- break;
-
- copied = bytes -
- __copy_from_user_nocache(xip_mem + offset, buf, bytes);
-
- if (likely(copied > 0)) {
- status = copied;
-
- if (status >= 0) {
- written += status;
- count -= status;
- pos += status;
- buf += status;
- }
- }
- if (unlikely(copied != bytes))
- if (status >= 0)
- status = -EFAULT;
- if (status < 0)
- break;
- } while (count);
- *ppos = pos;
- /*
- * No need to use i_size_read() here, the i_size
- * cannot change under us because we hold i_mutex.
- */
- if (pos > inode->i_size) {
- i_size_write(inode, pos);
- mark_inode_dirty(inode);
- }
-
- return written ? written : status;
-}
-
-ssize_t
-xip_file_write(struct file *filp, const char __user *buf, size_t len,
- loff_t *ppos)
-{
- struct address_space *mapping = filp->f_mapping;
- struct inode *inode = mapping->host;
- size_t count;
- loff_t pos;
- ssize_t ret;
-
- mutex_lock(&inode->i_mutex);
-
- if (!access_ok(VERIFY_READ, buf, len)) {
- ret=-EFAULT;
- goto out_up;
- }
-
- pos = *ppos;
- count = len;
-
- /* We can write back this queue in page reclaim */
- current->backing_dev_info = inode_to_bdi(inode);
-
- ret = generic_write_checks(filp, &pos, &count, S_ISBLK(inode->i_mode));
- if (ret)
- goto out_backing;
- if (count == 0)
- goto out_backing;
-
- ret = file_remove_suid(filp);
- if (ret)
- goto out_backing;
-
- ret = file_update_time(filp);
- if (ret)
- goto out_backing;
-
- ret = __xip_file_write (filp, buf, count, pos, ppos);
-
- out_backing:
- current->backing_dev_info = NULL;
- out_up:
- mutex_unlock(&inode->i_mutex);
- return ret;
-}
-EXPORT_SYMBOL_GPL(xip_file_write);
-
-/*
- * truncate a page used for execute in place
- * functionality is analog to block_truncate_page but does use get_xip_mem
- * to get the page instead of page cache
- */
-int
-xip_truncate_page(struct address_space *mapping, loff_t from)
-{
- pgoff_t index = from >> PAGE_CACHE_SHIFT;
- unsigned offset = from & (PAGE_CACHE_SIZE-1);
- unsigned blocksize;
- unsigned length;
- void *xip_mem;
- unsigned long xip_pfn;
- int err;
-
- BUG_ON(!mapping->a_ops->get_xip_mem);
-
- blocksize = 1 << mapping->host->i_blkbits;
- length = offset & (blocksize - 1);
-
- /* Block boundary? Nothing to do */
- if (!length)
- return 0;
-
- length = blocksize - length;
-
- err = mapping->a_ops->get_xip_mem(mapping, index, 0,
- &xip_mem, &xip_pfn);
- if (unlikely(err)) {
- if (err == -ENODATA)
- /* Hole? No need to truncate */
- return 0;
- else
- return err;
- }
- memset(xip_mem + offset, 0, length);
- return 0;
-}
-EXPORT_SYMBOL_GPL(xip_truncate_page);
diff --git a/mm/madvise.c b/mm/madvise.c
index f599e6bb96b4..6d0fcb8921c2 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -248,7 +248,7 @@ static long madvise_willneed(struct vm_area_struct *vma,
return -EBADF;
#endif
- if (file->f_mapping->a_ops->get_xip_mem) {
+ if (IS_DAX(file_inode(file))) {
/* no bad return value, but ignore advice */
return 0;
}
diff --git a/mm/memory.c b/mm/memory.c
index bb66d55e4b09..8ae52c918415 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1965,6 +1965,7 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
vmf.pgoff = page->index;
vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
vmf.page = page;
+ vmf.cow_page = NULL;
ret = vma->vm_ops->page_mkwrite(vma, &vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
@@ -2329,6 +2330,7 @@ void unmap_mapping_range(struct address_space *mapping,
details.last_index = ULONG_MAX;
+ /* DAX uses i_mmap_lock to serialise file truncate vs page fault */
i_mmap_lock_write(mapping);
if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
unmap_mapping_range_tree(&mapping->i_mmap, &details);
@@ -2638,7 +2640,8 @@ oom:
* See filemap_fault() and __lock_page_retry().
*/
static int __do_fault(struct vm_area_struct *vma, unsigned long address,
- pgoff_t pgoff, unsigned int flags, struct page **page)
+ pgoff_t pgoff, unsigned int flags,
+ struct page *cow_page, struct page **page)
{
struct vm_fault vmf;
int ret;
@@ -2647,10 +2650,13 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
vmf.pgoff = pgoff;
vmf.flags = flags;
vmf.page = NULL;
+ vmf.cow_page = cow_page;
ret = vma->vm_ops->fault(vma, &vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
+ if (!vmf.page)
+ goto out;
if (unlikely(PageHWPoison(vmf.page))) {
if (ret & VM_FAULT_LOCKED)
@@ -2664,6 +2670,7 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
else
VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page);
+ out:
*page = vmf.page;
return ret;
}
@@ -2834,7 +2841,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
pte_unmap_unlock(pte, ptl);
}
- ret = __do_fault(vma, address, pgoff, flags, &fault_page);
+ ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
@@ -2874,26 +2881,43 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
return VM_FAULT_OOM;
}
- ret = __do_fault(vma, address, pgoff, flags, &fault_page);
+ ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
goto uncharge_out;
- copy_user_highpage(new_page, fault_page, address, vma);
+ if (fault_page)
+ copy_user_highpage(new_page, fault_page, address, vma);
__SetPageUptodate(new_page);
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
if (unlikely(!pte_same(*pte, orig_pte))) {
pte_unmap_unlock(pte, ptl);
- unlock_page(fault_page);
- page_cache_release(fault_page);
+ if (fault_page) {
+ unlock_page(fault_page);
+ page_cache_release(fault_page);
+ } else {
+ /*
+ * The fault handler has no page to lock, so it holds
+ * i_mmap_lock for read to protect against truncate.
+ */
+ i_mmap_unlock_read(vma->vm_file->f_mapping);
+ }
goto uncharge_out;
}
do_set_pte(vma, address, new_page, pte, true, true);
mem_cgroup_commit_charge(new_page, memcg, false);
lru_cache_add_active_or_unevictable(new_page, vma);
pte_unmap_unlock(pte, ptl);
- unlock_page(fault_page);
- page_cache_release(fault_page);
+ if (fault_page) {
+ unlock_page(fault_page);
+ page_cache_release(fault_page);
+ } else {
+ /*
+ * The fault handler has no page to lock, so it holds
+ * i_mmap_lock for read to protect against truncate.
+ */
+ i_mmap_unlock_read(vma->vm_file->f_mapping);
+ }
return ret;
uncharge_out:
mem_cgroup_cancel_charge(new_page, memcg);
@@ -2912,7 +2936,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
int dirtied = 0;
int ret, tmp;
- ret = __do_fault(vma, address, pgoff, flags, &fault_page);
+ ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
diff --git a/scripts/diffconfig b/scripts/diffconfig
index 6d672836e187..0db267d0adc9 100755
--- a/scripts/diffconfig
+++ b/scripts/diffconfig
@@ -28,7 +28,6 @@ If no config files are specified, .config and .config.old are used.
Example usage:
$ diffconfig .config config-with-some-changes
-EXT2_FS_XATTR n
--EXT2_FS_XIP n
CRAMFS n -> y
EXT2_FS y -> n
LOG_BUF_SHIFT 14 -> 16