-rw-r--r--Documentation/devicetree/bindings/dma/dma.txt28
-rw-r--r--Documentation/devicetree/bindings/dma/mv-xor.txt2
-rw-r--r--Documentation/devicetree/bindings/dma/sirfsoc-dma.txt3
-rw-r--r--Documentation/devicetree/bindings/dma/sun6i-dma.txt5
-rw-r--r--Documentation/devicetree/bindings/dma/ti-dma-crossbar.txt52
-rw-r--r--Documentation/dmaengine/provider.txt11
-rw-r--r--Documentation/dmaengine/pxa_dma.txt153
-rw-r--r--Documentation/filesystems/xfs.txt12
-rw-r--r--Documentation/nvdimm/btt.txt283
-rw-r--r--Documentation/nvdimm/nvdimm.txt808
-rw-r--r--MAINTAINERS53
-rw-r--r--arch/arm64/kernel/efi.c1
-rw-r--r--arch/ia64/kernel/efi.c4
-rw-r--r--arch/ia64/kernel/ia64_ksyms.c3
-rw-r--r--arch/ia64/kernel/mca.c6
-rw-r--r--arch/m68k/68000/m68EZ328.c3
-rw-r--r--arch/m68k/68000/m68VZ328.c3
-rw-r--r--arch/m68k/68360/config.c3
-rw-r--r--arch/um/Kconfig.um16
-rw-r--r--arch/um/Makefile7
-rw-r--r--arch/um/drivers/harddog_user.c18
-rw-r--r--arch/um/drivers/mconsole.h2
-rw-r--r--arch/um/drivers/net_user.c6
-rw-r--r--arch/um/drivers/slip_user.c14
-rw-r--r--arch/um/drivers/slirp_user.c16
-rw-r--r--arch/um/include/asm/Kbuild1
-rw-r--r--arch/um/include/asm/ptrace-generic.h3
-rw-r--r--arch/um/include/asm/sections.h9
-rw-r--r--arch/um/include/asm/thread_info.h2
-rw-r--r--arch/um/include/asm/uaccess.h176
-rw-r--r--arch/um/include/shared/init.h24
-rw-r--r--arch/um/include/shared/os.h2
-rw-r--r--arch/um/include/shared/user.h2
-rw-r--r--arch/um/kernel/ksyms.c2
-rw-r--r--arch/um/kernel/physmem.c7
-rw-r--r--arch/um/kernel/ptrace.c7
-rw-r--r--arch/um/kernel/skas/mmu.c7
-rw-r--r--arch/um/kernel/skas/syscall.c6
-rw-r--r--arch/um/kernel/skas/uaccess.c47
-rw-r--r--arch/um/kernel/trap.c5
-rw-r--r--arch/um/kernel/um_arch.c4
-rw-r--r--arch/um/os-Linux/drivers/tuntap_user.c6
-rw-r--r--arch/um/os-Linux/file.c1
-rw-r--r--arch/um/os-Linux/signal.c8
-rw-r--r--arch/um/os-Linux/skas/mem.c6
-rw-r--r--arch/um/os-Linux/skas/process.c8
-rw-r--r--arch/x86/Kconfig4
-rw-r--r--arch/x86/boot/compressed/eboot.c4
-rw-r--r--arch/x86/include/asm/cacheflush.h72
-rw-r--r--arch/x86/include/asm/io.h6
-rw-r--r--arch/x86/include/uapi/asm/e820.h1
-rw-r--r--arch/x86/kernel/e820.c28
-rw-r--r--arch/x86/kernel/pmem.c93
-rw-r--r--arch/x86/platform/efi/efi.c3
-rw-r--r--arch/x86/um/asm/checksum.h1
-rw-r--r--arch/x86/um/asm/elf.h2
-rw-r--r--arch/x86/um/asm/processor.h2
-rw-r--r--arch/x86/um/asm/segment.h8
-rw-r--r--arch/x86/um/ldt.c1
-rw-r--r--arch/x86/um/mem_32.c3
-rw-r--r--arch/x86/um/mem_64.c3
-rw-r--r--arch/x86/um/ptrace_32.c1
-rw-r--r--arch/x86/um/ptrace_64.c1
-rw-r--r--arch/x86/um/shared/sysdep/tls.h6
-rw-r--r--arch/x86/um/signal.c3
-rw-r--r--arch/x86/um/syscalls_64.c1
-rw-r--r--arch/x86/um/tls_32.c1
-rw-r--r--arch/x86/um/tls_64.c1
-rw-r--r--arch/x86/um/vdso/vma.c1
-rw-r--r--block/scsi_ioctl.c4
-rw-r--r--drivers/Kconfig2
-rw-r--r--drivers/Makefile1
-rw-r--r--drivers/acpi/Kconfig26
-rw-r--r--drivers/acpi/Makefile1
-rw-r--r--drivers/acpi/nfit.c1587
-rw-r--r--drivers/acpi/nfit.h158
-rw-r--r--drivers/acpi/numa.c50
-rw-r--r--drivers/block/Kconfig12
-rw-r--r--drivers/block/Makefile1
-rw-r--r--drivers/block/nvme-core.c138
-rw-r--r--drivers/block/xen-blkback/blkback.c13
-rw-r--r--drivers/block/xen-blkback/common.h4
-rw-r--r--drivers/block/xen-blkback/xenbus.c167
-rw-r--r--drivers/block/xen-blkfront.c151
-rw-r--r--drivers/dma/Kconfig15
-rw-r--r--drivers/dma/Makefile2
-rw-r--r--drivers/dma/amba-pl08x.c2
-rw-r--r--drivers/dma/at_hdmac.c106
-rw-r--r--drivers/dma/at_hdmac_regs.h5
-rw-r--r--drivers/dma/at_xdmac.c394
-rw-r--r--drivers/dma/dmaengine.c13
-rw-r--r--drivers/dma/ep93xx_dma.c2
-rw-r--r--drivers/dma/fsl-edma.c9
-rw-r--r--drivers/dma/imx-dma.c2
-rw-r--r--drivers/dma/imx-sdma.c2
-rw-r--r--drivers/dma/mv_xor.c352
-rw-r--r--drivers/dma/mv_xor.h27
-rw-r--r--drivers/dma/mxs-dma.c2
-rw-r--r--drivers/dma/nbpfaxi.c2
-rw-r--r--drivers/dma/of-dma.c89
-rw-r--r--drivers/dma/omap-dma.c80
-rw-r--r--drivers/dma/pl330.c8
-rw-r--r--drivers/dma/pxa_dma.c1467
-rw-r--r--drivers/dma/s3c24xx-dma.c2
-rw-r--r--drivers/dma/sh/rcar-dmac.c2
-rw-r--r--drivers/dma/sh/shdma-r8a73a4.c2
-rw-r--r--drivers/dma/sirf-dma.c423
-rw-r--r--drivers/dma/sun6i-dma.c12
-rw-r--r--drivers/dma/ti-dma-crossbar.c188
-rw-r--r--drivers/dma/virt-dma.c19
-rw-r--r--drivers/dma/virt-dma.h13
-rw-r--r--[-rwxr-xr-x]drivers/dma/xgene-dma.c173
-rw-r--r--drivers/md/md.c167
-rw-r--r--drivers/md/raid10.c18
-rw-r--r--drivers/md/raid5.c46
-rw-r--r--drivers/md/raid5.h3
-rw-r--r--drivers/nvdimm/Kconfig68
-rw-r--r--drivers/nvdimm/Makefile20
-rw-r--r--drivers/nvdimm/blk.c384
-rw-r--r--drivers/nvdimm/btt.c1479
-rw-r--r--drivers/nvdimm/btt.h185
-rw-r--r--drivers/nvdimm/btt_devs.c425
-rw-r--r--drivers/nvdimm/bus.c730
-rw-r--r--drivers/nvdimm/core.c465
-rw-r--r--drivers/nvdimm/dimm.c102
-rw-r--r--drivers/nvdimm/dimm_devs.c551
-rw-r--r--drivers/nvdimm/label.c927
-rw-r--r--drivers/nvdimm/label.h141
-rw-r--r--drivers/nvdimm/namespace_devs.c1870
-rw-r--r--drivers/nvdimm/nd-core.h83
-rw-r--r--drivers/nvdimm/nd.h220
-rw-r--r--drivers/nvdimm/pmem.c (renamed from drivers/block/pmem.c)227
-rw-r--r--drivers/nvdimm/region.c114
-rw-r--r--drivers/nvdimm/region_devs.c787
-rw-r--r--drivers/vfio/pci/vfio_pci.c16
-rw-r--r--drivers/vfio/platform/Kconfig4
-rw-r--r--drivers/vfio/platform/Makefile2
-rw-r--r--drivers/vfio/platform/reset/Kconfig7
-rw-r--r--drivers/vfio/platform/reset/Makefile5
-rw-r--r--drivers/vfio/platform/reset/vfio_platform_calxedaxgmac.c86
-rw-r--r--drivers/vfio/platform/vfio_platform_common.c60
-rw-r--r--drivers/vfio/platform/vfio_platform_private.h7
-rw-r--r--drivers/vfio/vfio.c27
-rw-r--r--fs/Makefile1
-rw-r--r--fs/block_dev.c6
-rw-r--r--fs/btrfs/async-thread.c1
-rw-r--r--fs/btrfs/async-thread.h2
-rw-r--r--fs/btrfs/backref.c59
-rw-r--r--fs/btrfs/ctree.c16
-rw-r--r--fs/btrfs/ctree.h28
-rw-r--r--fs/btrfs/delayed-ref.c372
-rw-r--r--fs/btrfs/delayed-ref.h29
-rw-r--r--fs/btrfs/dev-replace.c7
-rw-r--r--fs/btrfs/disk-io.c56
-rw-r--r--fs/btrfs/extent-tree.c308
-rw-r--r--fs/btrfs/extent-tree.h0
-rw-r--r--fs/btrfs/extent_io.c9
-rw-r--r--fs/btrfs/file.c9
-rw-r--r--fs/btrfs/free-space-cache.c14
-rw-r--r--fs/btrfs/inode.c26
-rw-r--r--fs/btrfs/ioctl.c50
-rw-r--r--fs/btrfs/ordered-data.c37
-rw-r--r--fs/btrfs/ordered-data.h6
-rw-r--r--fs/btrfs/qgroup.c1052
-rw-r--r--fs/btrfs/qgroup.h61
-rw-r--r--fs/btrfs/relocation.c19
-rw-r--r--fs/btrfs/scrub.c26
-rw-r--r--fs/btrfs/send.c147
-rw-r--r--fs/btrfs/super.c397
-rw-r--r--fs/btrfs/sysfs.c148
-rw-r--r--fs/btrfs/sysfs.h8
-rw-r--r--fs/btrfs/tests/qgroup-tests.c109
-rw-r--r--fs/btrfs/transaction.c79
-rw-r--r--fs/btrfs/transaction.h24
-rw-r--r--fs/btrfs/tree-defrag.c3
-rw-r--r--fs/btrfs/tree-log.c6
-rw-r--r--fs/btrfs/ulist.c47
-rw-r--r--fs/btrfs/ulist.h1
-rw-r--r--fs/btrfs/volumes.c186
-rw-r--r--fs/btrfs/volumes.h9
-rw-r--r--fs/dax.c34
-rw-r--r--fs/ext2/file.c4
-rw-r--r--fs/ext4/file.c16
-rw-r--r--fs/ext4/inode.c21
-rw-r--r--fs/hppfs/Makefile6
-rw-r--r--fs/hppfs/hppfs.c765
-rw-r--r--fs/seq_file.c1
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c281
-rw-r--r--fs/xfs/libxfs/xfs_alloc.h10
-rw-r--r--fs/xfs/libxfs/xfs_attr.c25
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c29
-rw-r--r--fs/xfs/libxfs/xfs_format.h65
-rw-r--r--fs/xfs/libxfs/xfs_fs.h1
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c542
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.h15
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.c93
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.h10
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c8
-rw-r--r--fs/xfs/libxfs/xfs_sb.c34
-rw-r--r--fs/xfs/libxfs/xfs_shared.h6
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.h4
-rw-r--r--fs/xfs/libxfs/xfs_trans_space.h2
-rw-r--r--fs/xfs/xfs_aops.c158
-rw-r--r--fs/xfs/xfs_aops.h7
-rw-r--r--fs/xfs/xfs_attr_inactive.c16
-rw-r--r--fs/xfs/xfs_bmap_util.c89
-rw-r--r--fs/xfs/xfs_buf.c6
-rw-r--r--fs/xfs/xfs_buf.h2
-rw-r--r--fs/xfs/xfs_dquot.c8
-rw-r--r--fs/xfs/xfs_error.c4
-rw-r--r--fs/xfs/xfs_error.h4
-rw-r--r--fs/xfs/xfs_extfree_item.c2
-rw-r--r--fs/xfs/xfs_file.c166
-rw-r--r--fs/xfs/xfs_filestream.c3
-rw-r--r--fs/xfs/xfs_fsops.c10
-rw-r--r--fs/xfs/xfs_inode.c204
-rw-r--r--fs/xfs/xfs_ioctl.c14
-rw-r--r--fs/xfs/xfs_iomap.c18
-rw-r--r--fs/xfs/xfs_iops.c48
-rw-r--r--fs/xfs/xfs_itable.c13
-rw-r--r--fs/xfs/xfs_linux.h14
-rw-r--r--fs/xfs/xfs_log.c51
-rw-r--r--fs/xfs/xfs_log.h13
-rw-r--r--fs/xfs/xfs_log_cil.c12
-rw-r--r--fs/xfs/xfs_log_priv.h2
-rw-r--r--fs/xfs/xfs_log_recover.c97
-rw-r--r--fs/xfs/xfs_mount.c16
-rw-r--r--fs/xfs/xfs_mount.h4
-rw-r--r--fs/xfs/xfs_pnfs.c4
-rw-r--r--fs/xfs/xfs_qm.c7
-rw-r--r--fs/xfs/xfs_qm_syscalls.c20
-rw-r--r--fs/xfs/xfs_quota.h1
-rw-r--r--fs/xfs/xfs_rtalloc.c16
-rw-r--r--fs/xfs/xfs_super.c25
-rw-r--r--fs/xfs/xfs_symlink.c19
-rw-r--r--fs/xfs/xfs_trace.h47
-rw-r--r--fs/xfs/xfs_trans.c91
-rw-r--r--fs/xfs/xfs_trans.h7
-rw-r--r--fs/xfs/xfs_trans_ail.c6
-rw-r--r--fs/xfs/xfs_trans_dquot.c32
-rw-r--r--fs/xfs/xfs_trans_priv.h2
-rw-r--r--include/linux/acpi.h5
-rw-r--r--include/linux/compiler.h2
-rw-r--r--include/linux/dma/pxa-dma.h27
-rw-r--r--include/linux/dmaengine.h76
-rw-r--r--include/linux/efi.h3
-rw-r--r--include/linux/fs.h9
-rw-r--r--include/linux/libnvdimm.h151
-rw-r--r--include/linux/nd.h151
-rw-r--r--include/linux/of_dma.h21
-rw-r--r--include/linux/platform_data/dma-rcar-audmapp.h34
-rw-r--r--include/linux/pmem.h152
-rw-r--r--include/linux/slab.h22
-rw-r--r--include/linux/wait.h13
-rw-r--r--include/trace/events/btrfs.h55
-rw-r--r--include/uapi/linux/Kbuild1
-rw-r--r--include/uapi/linux/ndctl.h197
-rw-r--r--lib/Kconfig3
-rw-r--r--lib/kobject.c1
-rw-r--r--mm/slab_common.c32
-rw-r--r--tools/testing/nvdimm/Kbuild40
-rw-r--r--tools/testing/nvdimm/Makefile7
-rw-r--r--tools/testing/nvdimm/config_check.c15
-rw-r--r--tools/testing/nvdimm/test/Kbuild8
-rw-r--r--tools/testing/nvdimm/test/iomap.c151
-rw-r--r--tools/testing/nvdimm/test/nfit.c1116
-rw-r--r--tools/testing/nvdimm/test/nfit_test.h29
-rw-r--r--tools/testing/selftests/Makefile8
-rw-r--r--tools/testing/selftests/exec/Makefile2
-rw-r--r--tools/testing/selftests/ftrace/Makefile1
-rw-r--r--tools/testing/selftests/futex/Makefile29
-rw-r--r--tools/testing/selftests/futex/README62
-rw-r--r--tools/testing/selftests/futex/functional/.gitignore7
-rw-r--r--tools/testing/selftests/futex/functional/Makefile25
-rw-r--r--tools/testing/selftests/futex/functional/futex_requeue_pi.c409
-rw-r--r--tools/testing/selftests/futex/functional/futex_requeue_pi_mismatched_ops.c135
-rw-r--r--tools/testing/selftests/futex/functional/futex_requeue_pi_signal_restart.c223
-rw-r--r--tools/testing/selftests/futex/functional/futex_wait_private_mapped_file.c125
-rw-r--r--tools/testing/selftests/futex/functional/futex_wait_timeout.c86
-rw-r--r--tools/testing/selftests/futex/functional/futex_wait_uninitialized_heap.c124
-rw-r--r--tools/testing/selftests/futex/functional/futex_wait_wouldblock.c79
-rwxr-xr-xtools/testing/selftests/futex/functional/run.sh79
-rw-r--r--tools/testing/selftests/futex/include/atomic.h83
-rw-r--r--tools/testing/selftests/futex/include/futextest.h266
-rw-r--r--tools/testing/selftests/futex/include/logging.h153
-rwxr-xr-xtools/testing/selftests/futex/run.sh33
-rw-r--r--tools/testing/selftests/kselftest.h17
-rw-r--r--tools/testing/selftests/lib.mk3
-rw-r--r--tools/testing/selftests/mount/Makefile7
-rw-r--r--tools/testing/selftests/seccomp/.gitignore1
-rw-r--r--tools/testing/selftests/seccomp/Makefile10
-rw-r--r--tools/testing/selftests/seccomp/seccomp_bpf.c2109
-rw-r--r--tools/testing/selftests/seccomp/test_harness.h537
-rw-r--r--tools/testing/selftests/timers/.gitignore18
-rw-r--r--tools/testing/selftests/timers/alarmtimer-suspend.c10
-rw-r--r--tools/testing/selftests/vm/Makefile7
-rw-r--r--tools/testing/selftests/vm/compaction_test.c225
-rwxr-xr-xtools/testing/selftests/vm/run_vmtests12
-rw-r--r--tools/testing/selftests/x86/trivial_64bit_program.c2
299 files changed, 26193 insertions, 4648 deletions
diff --git a/Documentation/devicetree/bindings/dma/dma.txt b/Documentation/devicetree/bindings/dma/dma.txt
index 82104271e754..6312fb00ce8d 100644
--- a/Documentation/devicetree/bindings/dma/dma.txt
+++ b/Documentation/devicetree/bindings/dma/dma.txt
@@ -31,6 +31,34 @@ Example:
dma-requests = <127>;
};
+* DMA router
+
+DMA routers are transparent IP blocks used to route DMA request lines from
+devices to the DMA controller. Some SoCs (like TI DRA7x) have more peripherals
+integrated with DMA requests than the DMA controller can handle directly.
+
+Required property:
+- dma-masters: phandle of the DMA controller or list of phandles for
+ the DMA controllers the router can direct the signal to.
+- #dma-cells: Must be at least 1. Used to provide DMA router specific
+ information. See DMA client binding below for more
+ details.
+
+Optional properties:
+- dma-requests: Number of incoming request lines the router can handle.
+- In the node pointed by the dma-masters:
+ - dma-requests: The router driver might need to look for this in order
+ to configure the routing.
+
+Example:
+ sdma_xbar: dma-router@4a002b78 {
+ compatible = "ti,dra7-dma-crossbar";
+ reg = <0x4a002b78 0xfc>;
+ #dma-cells = <1>;
+ dma-requests = <205>;
+ ti,dma-safe-map = <0>;
+ dma-masters = <&sdma>;
+ };
* DMA client
diff --git a/Documentation/devicetree/bindings/dma/mv-xor.txt b/Documentation/devicetree/bindings/dma/mv-xor.txt
index 7c6cb7fcecd2..cc29c35266e2 100644
--- a/Documentation/devicetree/bindings/dma/mv-xor.txt
+++ b/Documentation/devicetree/bindings/dma/mv-xor.txt
@@ -1,7 +1,7 @@
* Marvell XOR engines
Required properties:
-- compatible: Should be "marvell,orion-xor"
+- compatible: Should be "marvell,orion-xor" or "marvell,armada-380-xor"
- reg: Should contain registers location and length (two sets)
the first set is the low registers, the second set the high
registers for the XOR engine.
diff --git a/Documentation/devicetree/bindings/dma/sirfsoc-dma.txt b/Documentation/devicetree/bindings/dma/sirfsoc-dma.txt
index ecbc96ad36f8..ccd52d6a231a 100644
--- a/Documentation/devicetree/bindings/dma/sirfsoc-dma.txt
+++ b/Documentation/devicetree/bindings/dma/sirfsoc-dma.txt
@@ -3,7 +3,8 @@
See dma.txt first
Required properties:
-- compatible: Should be "sirf,prima2-dmac" or "sirf,marco-dmac"
+- compatible: Should be "sirf,prima2-dmac", "sirf,atlas7-dmac" or
+ "sirf,atlas7-dmac-v2"
- reg: Should contain DMA registers location and length.
- interrupts: Should contain one interrupt shared by all channel
- #dma-cells: must be <1>. used to represent the number of integer
diff --git a/Documentation/devicetree/bindings/dma/sun6i-dma.txt b/Documentation/devicetree/bindings/dma/sun6i-dma.txt
index 9cdcba24d7c3..d13c136cef8c 100644
--- a/Documentation/devicetree/bindings/dma/sun6i-dma.txt
+++ b/Documentation/devicetree/bindings/dma/sun6i-dma.txt
@@ -4,7 +4,10 @@ This driver follows the generic DMA bindings defined in dma.txt.
Required properties:
-- compatible: Must be "allwinner,sun6i-a31-dma" or "allwinner,sun8i-a23-dma"
+- compatible: Must be one of
+ "allwinner,sun6i-a31-dma"
+ "allwinner,sun8i-a23-dma"
+ "allwinner,sun8i-h3-dma"
- reg: Should contain the registers base address and length
- interrupts: Should contain a reference to the interrupt used by this device
- clocks: Should contain a reference to the parent AHB clock
diff --git a/Documentation/devicetree/bindings/dma/ti-dma-crossbar.txt b/Documentation/devicetree/bindings/dma/ti-dma-crossbar.txt
new file mode 100644
index 000000000000..63a48928f3a8
--- /dev/null
+++ b/Documentation/devicetree/bindings/dma/ti-dma-crossbar.txt
@@ -0,0 +1,52 @@
+Texas Instruments DMA Crossbar (DMA request router)
+
+Required properties:
+- compatible: "ti,dra7-dma-crossbar" for DRA7xx DMA crossbar
+- reg: Memory map for accessing module
+- #dma-cells: Should be set to <1>.
+ Clients should use the crossbar request number (input)
+- dma-requests: Number of DMA requests the crossbar can receive
+- dma-masters: phandle pointing to the DMA controller
+
+The DMA controller node needs to have the following properties:
+- dma-requests: Number of DMA requests the controller can handle
+
+Optional properties:
+- ti,dma-safe-map: Safe routing value for unused request lines
+
+Example:
+
+/* DMA controller */
+sdma: dma-controller@4a056000 {
+ compatible = "ti,omap4430-sdma";
+ reg = <0x4a056000 0x1000>;
+ interrupts = <GIC_SPI 7 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 8 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 9 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 10 IRQ_TYPE_LEVEL_HIGH>;
+ #dma-cells = <1>;
+ dma-channels = <32>;
+ dma-requests = <127>;
+};
+
+/* DMA crossbar */
+sdma_xbar: dma-router@4a002b78 {
+ compatible = "ti,dra7-dma-crossbar";
+ reg = <0x4a002b78 0xfc>;
+ #dma-cells = <1>;
+ dma-requests = <205>;
+ ti,dma-safe-map = <0>;
+ dma-masters = <&sdma>;
+};
+
+/* DMA client */
+uart1: serial@4806a000 {
+ compatible = "ti,omap4-uart";
+ reg = <0x4806a000 0x100>;
+ interrupts-extended = <&gic GIC_SPI 67 IRQ_TYPE_LEVEL_HIGH>;
+ ti,hwmods = "uart1";
+ clock-frequency = <48000000>;
+ status = "disabled";
+ dmas = <&sdma_xbar 49>, <&sdma_xbar 50>;
+ dma-names = "tx", "rx";
+};
diff --git a/Documentation/dmaengine/provider.txt b/Documentation/dmaengine/provider.txt
index 05d2280190f1..ca67b0f04c6e 100644
--- a/Documentation/dmaengine/provider.txt
+++ b/Documentation/dmaengine/provider.txt
@@ -345,11 +345,12 @@ where to put them)
that abstracts it away.
* DMA_CTRL_ACK
- - Undocumented feature
- - No one really has an idea of what it's about, besides being
- related to reusing the DMA transaction descriptors or having
- additional transactions added to it in the async-tx API
- - Useless in the case of the slave API
+ - If set, the transfer can be reused after being completed.
+ - There is a guarantee that the transfer won't be freed until it is acked
+ by async_tx_ack().
+ - As a consequence, if a device driver wants to skip the dma_map_sg() and
+ dma_unmap_sg() between two transfers, because the DMA'd data wasn't used,
+ it can resubmit the transfer right after its completion.
General Design Notes
--------------------
diff --git a/Documentation/dmaengine/pxa_dma.txt b/Documentation/dmaengine/pxa_dma.txt
new file mode 100644
index 000000000000..413ef9cfaa4d
--- /dev/null
+++ b/Documentation/dmaengine/pxa_dma.txt
@@ -0,0 +1,153 @@
+PXA/MMP - DMA Slave controller
+==============================
+
+Constraints
+-----------
+ a) Transfers hot queuing
+ A driver submitting a transfer and issuing it should be guaranteed that the
+ transfer is queued even on a running DMA channel.
+ This implies that the queuing doesn't wait for the previous transfer to end,
+ and that the descriptor chaining is not only done in the irq/tasklet code
+ triggered by the end of the transfer.
+ A transfer which is submitted and issued on a phy doesn't wait for the phy to
+ stop and restart, but is submitted on a "running channel". The other
+ drivers, especially mmp_pdma, waited for the phy to stop before relaunching
+ a new transfer.
+
+ b) All transfers having asked for confirmation should be signaled
+ Any issued transfer with DMA_PREP_INTERRUPT should trigger a callback call.
+ This implies that if an irq/tasklet is triggered by the end of tx1, and
+ at the time it runs tx2 is already finished, both tx1->complete() and
+ tx2->complete() should be called.
+
+ c) Channel running state
+ A driver should be able to query if a channel is running or not. For the
+ multimedia case, such as video capture, if a transfer is submitted and then
+ a check of the DMA channel reports a "stopped channel", the transfer should
+ not be issued until the next "start of frame interrupt", hence the need to
+ know if a channel is in running or stopped state.
+
+ d) Bandwidth guarantee
+ The PXA architecture has 4 levels of DMA priorities : high, normal, low.
+ The high priorities get twice as much bandwidth as the normal, which get twice
+ as much as the low priorities.
+ A driver should be able to request a priority, especially the real-time
+ ones such as pxa_camera with (big) throughputs.
+
+Design
+------
+ a) Virtual channels
+ Same concept as in the sa11x0 driver, i.e. a driver is assigned a "virtual
+ channel" linked to the requestor line, and the physical DMA channel is
+ assigned on the fly when the transfer is issued.
+
+ b) Transfer anatomy for a scatter-gather transfer
+ +------------+-----+---------------+----------------+-----------------+
+ | desc-sg[0] | ... | desc-sg[last] | status updater | finisher/linker |
+ +------------+-----+---------------+----------------+-----------------+
+
+ This structure is pointed to by dma->sg_cpu.
+ The descriptors are used as follows :
+ - desc-sg[i]: i-th descriptor, transferring the i-th sg
+ element to the video buffer scatter gather
+ - status updater
+ Transfers a single u32 to a well known dma coherent memory address to
+ leave a trace that this transfer is done. The "well known" address is
+ unique per physical channel, meaning that a read of this value will tell
+ which is the last finished transfer at that point in time.
+ - finisher: has ddadr=DADDR_STOP, dcmd=ENDIRQEN
+ - linker: has ddadr= desc-sg[0] of next transfer, dcmd=0
+
+ c) Transfers hot-chaining
+ Suppose the running chain is :
+ Buffer 1 Buffer 2
+ +---------+----+---+ +----+----+----+---+
+ | d0 | .. | dN | l | | d0 | .. | dN | f |
+ +---------+----+-|-+ ^----+----+----+---+
+ | |
+ +----+
+
+ After a call to dmaengine_submit(b3), the chain will look like :
+ Buffer 1 Buffer 2 Buffer 3
+ +---------+----+---+ +----+----+----+---+ +----+----+----+---+
+ | d0 | .. | dN | l | | d0 | .. | dN | l | | d0 | .. | dN | f |
+ +---------+----+-|-+ ^----+----+----+-|-+ ^----+----+----+---+
+ | | | |
+ +----+ +----+
+ new_link
+
+ If the DMA channel stopped while new_link was being created, it is _not_
+ restarted. Hot-chaining doesn't break the assumption that
+ dma_async_issue_pending() is to be used to ensure the transfer is actually started.
+
+ One exception to this rule :
+ - if Buffer1 and Buffer2 had all their addresses 8 bytes aligned
+ - and if Buffer3 has at least one address not 4 bytes aligned
+ - then hot-chaining cannot happen, as the channel must be stopped, the
+ "align bit" must be set, and the channel restarted. As a consequence,
+ such a transfer's tx_submit() will queue it on the submitted queue in
+ this specific case, if the DMA is already running in aligned mode.
+
+ d) Transfers completion updater
+ Each time a transfer is completed on a channel, an interrupt might be
+ generated or not, up to the client's request. But in each case, the last
+ descriptor of a transfer, the "status updater", will write the latest
+ transfer being completed into the physical channel's completion mark.
+
+ This will speed up residue calculation, for large transfers such as video
+ buffers which hold around 6k descriptors or more. This also allows the
+ latest completed transfer in a running DMA chain to be found without
+ taking any lock.
+
+ e) Transfers completion, irq and tasklet
+ When a transfer flagged as "DMA_PREP_INTERRUPT" is finished, the dma irq
+ is raised. Upon this interrupt, a tasklet is scheduled for the physical
+ channel.
+ The tasklet is responsible for :
+ - reading the physical channel's last updater mark
+ - calling all the transfer callbacks of finished transfers, based on
+ that mark and each transfer's flags.
+ If a transfer is completed while this handling is done, a dma irq will
+ be raised, and the tasklet will be scheduled once again, having a new
+ updater mark.
+
+ f) Residue
+ Residue granularity will be descriptor based. The issued but not completed
+ transfers will be scanned for all of their descriptors against the
+ currently running descriptor.
+
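+ As a hedged sketch of this scan (every name below is illustrative, not the
+ driver's actual code), the residue is the sum of the lengths of all
+ descriptors that come after the last completed one :
+
+   static u32 pxad_residue_sketch(struct pxad_desc_sw *sw_desc,
+                                  dma_addr_t last_completed)
+   {
+           u32 residue = 0;
+           bool passed = false;
+           int i;
+
+           for (i = 0; i < sw_desc->nb_desc; i++) {
+                   if (passed)
+                           residue += hw_desc_len(sw_desc, i);
+                   else if (hw_desc_addr(sw_desc, i) == last_completed)
+                           passed = true; /* later descriptors are still pending */
+           }
+           return residue;
+   }
+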
+ g) Most complicated case of driver's tx queues
+ The most tricky situation is when :
+ - there are not-yet-"acked" transfers (tx0)
+ - a driver submitted an aligned tx1, not chained
+ - a driver submitted an aligned tx2 => tx2 is cold chained to tx1
+ - a driver issued tx1+tx2 => channel is running in aligned mode
+ - a driver submitted an aligned tx3 => tx3 is hot-chained
+ - a driver submitted an unaligned tx4 => tx4 is put in submitted queue,
+ not chained
+ - a driver issued tx4 => tx4 is put in issued queue, not chained
+ - a driver submitted an aligned tx5 => tx5 is put in submitted queue, not
+ chained
+ - a driver submitted an aligned tx6 => tx6 is put in submitted queue,
+ cold chained to tx5
+
+ This translates into (after tx4 is issued) :
+ - issued queue
+ +-----+ +-----+ +-----+ +-----+
+ | tx1 | | tx2 | | tx3 | | tx4 |
+ +---|-+ ^---|-+ ^-----+ +-----+
+ | | | |
+ +---+ +---+
+ - submitted queue
+ +-----+ +-----+
+ | tx5 | | tx6 |
+ +---|-+ ^-----+
+ | |
+ +---+
+ - completed queue : empty
+ - allocated queue : tx0
+
+ It should be noted that after tx3 is completed, the channel is stopped, and
+ restarted in "unaligned mode" to handle tx4.
+
+Author: Robert Jarzmik <robert.jarzmik@free.fr>
diff --git a/Documentation/filesystems/xfs.txt b/Documentation/filesystems/xfs.txt
index 5a5a05582b58..8146e9fd5ffc 100644
--- a/Documentation/filesystems/xfs.txt
+++ b/Documentation/filesystems/xfs.txt
@@ -236,10 +236,10 @@ Removed Mount Options
Name Removed
---- -------
- delaylog/nodelaylog v3.20
- ihashsize v3.20
- irixsgid v3.20
- osyncisdsync/osyncisosync v3.20
+ delaylog/nodelaylog v4.0
+ ihashsize v4.0
+ irixsgid v4.0
+ osyncisdsync/osyncisosync v4.0
sysctls
@@ -346,5 +346,5 @@ Removed Sysctls
Name Removed
---- -------
- fs.xfs.xfsbufd_centisec v3.20
- fs.xfs.age_buffer_centisecs v3.20
+ fs.xfs.xfsbufd_centisec v4.0
+ fs.xfs.age_buffer_centisecs v4.0
diff --git a/Documentation/nvdimm/btt.txt b/Documentation/nvdimm/btt.txt
new file mode 100644
index 000000000000..b91443f577dc
--- /dev/null
+++ b/Documentation/nvdimm/btt.txt
@@ -0,0 +1,283 @@
+BTT - Block Translation Table
+=============================
+
+
+1. Introduction
+---------------
+
+Persistent memory based storage is able to perform IO at byte (or more
+accurately, cache line) granularity. However, we often want to expose such
+storage as traditional block devices. The block drivers for persistent memory
+will do exactly this. However, they do not provide any atomicity guarantees.
+Traditional SSDs typically provide protection against torn sectors in hardware,
+using stored energy in capacitors to complete in-flight block writes, or perhaps
+in firmware. We don't have this luxury with persistent memory - if a write is in
+progress, and we experience a power failure, the block will contain a mix of old
+and new data. Applications may not be prepared to handle such a scenario.
+
+The Block Translation Table (BTT) provides atomic sector update semantics for
+persistent memory devices, so that applications that rely on sector writes not
+being torn can continue to do so. The BTT manifests itself as a stacked block
+device, and reserves a portion of the underlying storage for its metadata. At
+the heart of it, is an indirection table that re-maps all the blocks on the
+volume. It can be thought of as an extremely simple file system that only
+provides atomic sector updates.
+
+
+2. Static Layout
+----------------
+
+The underlying storage on which a BTT can be laid out is not limited in any way.
+The BTT, however, splits the available space into chunks of up to 512 GiB,
+called "Arenas".
+
+Each arena follows the same layout for its metadata, and all references in an
+arena are internal to it (with the exception of one field that points to the
+next arena). The following depicts the "On-disk" metadata layout:
+
+
+ Backing Store +-------> Arena
++---------------+ | +------------------+
+| | | | Arena info block |
+| Arena 0 +---+ | 4K |
+| 512G | +------------------+
+| | | |
++---------------+ | |
+| | | |
+| Arena 1 | | Data Blocks |
+| 512G | | |
+| | | |
++---------------+ | |
+| . | | |
+| . | | |
+| . | | |
+| | | |
+| | | |
++---------------+ +------------------+
+ | |
+ | BTT Map |
+ | |
+ | |
+ +------------------+
+ | |
+ | BTT Flog |
+ | |
+ +------------------+
+ | Info block copy |
+ | 4K |
+ +------------------+
+
+
+3. Theory of Operation
+----------------------
+
+
+a. The BTT Map
+--------------
+
+The map is a simple lookup/indirection table that maps an LBA to an internal
+block. Each map entry is 32 bits. The two most significant bits are special
+flags, and the remaining bits form the internal block number.
+
+Bit Description
+31 - 30 : Error and Zero flags - Used in the following way:
+ Bit Description
+ 31 30
+ -----------------------------------------------------------------------
+ 00 Initial state. Reads return zeroes; Premap = Postmap
+ 01 Zero state: Reads return zeroes
+ 10 Error state: Reads fail; Writes clear 'E' bit
+ 11 Normal Block – has valid postmap
+
+
+29 - 0 : Mappings to internal 'postmap' blocks
+
+
+Some of the terminology that will be subsequently used:
+
+External LBA : LBA as made visible to upper layers.
+ABA : Arena Block Address - Block offset/number within an arena
+Premap ABA : The block offset into an arena, which was decided upon by range
+ checking the External LBA
+Postmap ABA : The block number in the "Data Blocks" area obtained after
+ indirection from the map
+nfree : The number of free blocks that are maintained at any given time.
+ This is the number of concurrent writes that can happen to the
+ arena.
+
+
+For example, after adding a BTT, we surface a disk of 1024G. We get a read for
+the external LBA at 768G. This falls into the second arena, and of the 512G
+worth of blocks that this arena contributes, this block is at 256G. Thus, the
+premap ABA is 256G. We now refer to the map, and find out the mapping for block
+'X' (256G) points to block 'Y', say '64'. Thus the postmap ABA is 64.
+
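+As an aside, the bit layout above can be decoded with a few lines of C. The
+following is a minimal, illustrative sketch; the names and helpers are not
+taken from the kernel sources:
+
+    #include <stdint.h>
+
+    #define MAP_ENT_POSTMAP_MASK  ((1u << 30) - 1)   /* bits 29:0 */
+
+    enum map_ent_state {
+            MAP_ENT_INITIAL = 0x0, /* 00: reads return zeroes, premap == postmap */
+            MAP_ENT_ZERO    = 0x1, /* 01: reads return zeroes */
+            MAP_ENT_ERROR   = 0x2, /* 10: reads fail */
+            MAP_ENT_NORMAL  = 0x3, /* 11: valid postmap */
+    };
+
+    static inline enum map_ent_state map_ent_flags(uint32_t ent)
+    {
+            return (enum map_ent_state)(ent >> 30);  /* bits 31:30 */
+    }
+
+    static inline uint32_t map_ent_postmap(uint32_t ent)
+    {
+            return ent & MAP_ENT_POSTMAP_MASK;
+    }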
+
+b. The BTT Flog
+---------------
+
+The BTT provides sector atomicity by making every write an "allocating write",
+i.e. Every write goes to a "free" block. A running list of free blocks is
+maintained in the form of the BTT flog. 'Flog' is a combination of the words
+"free list" and "log". The flog contains 'nfree' entries, and an entry contains:
+
+lba : The premap ABA that is being written to
+old_map : The old postmap ABA - after 'this' write completes, this will be a
+ free block.
+new_map : The new postmap ABA. The map will be updated to reflect this
+ lba->postmap_aba mapping, but we log it here in case we have to
+ recover.
+seq : Sequence number to mark which of the 2 sections of this flog entry is
+ valid/newest. It cycles between 01->10->11->01 (binary) under normal
+ operation, with 00 indicating an uninitialized state.
+lba' : alternate lba entry
+old_map': alternate old postmap entry
+new_map': alternate new postmap entry
+seq' : alternate sequence number.
+
+Each of the above fields is 32-bit, making one entry 32 bytes. Entries are also
+padded to 64 bytes to avoid cache line sharing or aliasing. Flog updates are
+done such that for any entry being written, it:
+a. overwrites the 'old' section in the entry based on sequence numbers
+b. writes the 'new' section such that the sequence number is written last.
+
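+For illustration, an entry laid out as described above (eight 32-bit fields,
+padded out to 64 bytes) could be expressed as the following C sketch; this
+mirrors the text, not the kernel's actual structure definition, and on-media
+fields would be little-endian in practice:
+
+    #include <stdint.h>
+
+    struct flog_entry_sketch {
+            /* primary section */
+            uint32_t lba;        /* premap ABA being written */
+            uint32_t old_map;    /* postmap ABA freed once this write completes */
+            uint32_t new_map;    /* postmap ABA the map will point to */
+            uint32_t seq;        /* 01 -> 10 -> 11 -> 01, 00 = uninitialized */
+            /* alternate section */
+            uint32_t lba_prime;
+            uint32_t old_map_prime;
+            uint32_t new_map_prime;
+            uint32_t seq_prime;
+            /* pad the 32-byte entry out to 64 bytes */
+            uint8_t  pad[32];
+    };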
+
+c. The concept of lanes
+-----------------------
+
+While 'nfree' describes the number of IOs an arena can process
+concurrently, 'nlanes' is the number of IOs the BTT device as a whole can
+process.
+ nlanes = min(nfree, num_cpus)
+A lane number is obtained at the start of any IO, and is used for indexing into
+all the on-disk and in-memory data structures for the duration of the IO. If
+there are more CPUs than the max number of available lanes, then lanes are
+protected by spinlocks.
+
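+A hedged sketch of lane selection as described above; the 'struct btt'
+members and helper names here are illustrative, not the in-kernel code:
+
+    static unsigned int btt_acquire_lane(struct btt *btt)
+    {
+            unsigned int cpu = get_cpu();          /* also disables preemption */
+            unsigned int lane = cpu % btt->nlanes; /* nlanes = min(nfree, num_cpus) */
+
+            if (btt->nlanes < num_online_cpus())
+                    spin_lock(&btt->lane_locks[lane]);
+            return lane;
+    }
+
+    static void btt_release_lane(struct btt *btt, unsigned int lane)
+    {
+            if (btt->nlanes < num_online_cpus())
+                    spin_unlock(&btt->lane_locks[lane]);
+            put_cpu();
+    }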
+
+d. In-memory data structure: Read Tracking Table (RTT)
+------------------------------------------------------
+
+Consider a case where we have two threads, one doing reads and the other,
+writes. We can hit a condition where the writer thread grabs a free block to do
+a new IO, but the (slow) reader thread is still reading from it. In other words,
+the reader consulted a map entry, and started reading the corresponding block. A
+writer started writing to the same external LBA, and finished the write updating
+the map for that external LBA to point to its new postmap ABA. At this point the
+internal, postmap block that the reader is (still) reading has been inserted
+into the list of free blocks. If another write comes in for the same LBA, it can
+grab this free block, and start writing to it, causing the reader to read
+incorrect data. To prevent this, we introduce the RTT.
+
+The RTT is a simple, per arena table with 'nfree' entries. Every reader inserts
+the postmap ABA it is reading into rtt[lane_number], and clears it after the
+read is complete. Every writer thread, after grabbing a free block, checks the
+RTT for its presence. If the postmap free block is in the RTT, it waits till the
+reader clears the RTT entry, and only then starts writing to it.
+
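+A hedged sketch of the RTT protocol described above; the 'struct arena'
+members, the RTT_INVALID sentinel and the helper names are illustrative only:
+
+    #define RTT_INVALID (~0U)   /* illustrative "no block" sentinel */
+
+    static void rtt_begin_read(struct arena *arena, unsigned int lane, u32 postmap)
+    {
+            WRITE_ONCE(arena->rtt[lane], postmap);
+    }
+
+    static void rtt_end_read(struct arena *arena, unsigned int lane)
+    {
+            WRITE_ONCE(arena->rtt[lane], RTT_INVALID);
+    }
+
+    /* writer side: after picking a free block, wait until no reader uses it */
+    static void rtt_wait_for_readers(struct arena *arena, u32 free_block)
+    {
+            unsigned int i;
+
+            for (i = 0; i < arena->nfree; i++)
+                    while (READ_ONCE(arena->rtt[i]) == free_block)
+                            cpu_relax();
+    }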
+
+e. In-memory data structure: map locks
+--------------------------------------
+
+Consider a case where two writer threads are writing to the same LBA. There can
+be a race in the following sequence of steps:
+
+free[lane] = map[premap_aba]
+map[premap_aba] = postmap_aba
+
+Both threads can update their respective free[lane] with the same old, freed
+postmap_aba. This has made the layout inconsistent by losing a free entry, and
+at the same time, duplicating another free entry for two lanes.
+
+To solve this, we could have a single map lock (per arena) that has to be taken
+before performing the above sequence, but we feel that could be too contentious.
+Instead we use an array of (nfree) map_locks that is indexed by
+(premap_aba modulo nfree).
+
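+A hedged sketch of the serialized two-step update; map_read()/map_write() and
+the 'struct arena' members are illustrative placeholders:
+
+    static void btt_map_update(struct arena *arena, unsigned int lane,
+                               u32 premap_aba, u32 postmap_aba)
+    {
+            spinlock_t *lock = &arena->map_locks[premap_aba % arena->nfree];
+
+            spin_lock(lock);
+            /* the racy two-step sequence above, now serialized per bucket */
+            arena->freelist[lane] = map_read(arena, premap_aba);
+            map_write(arena, premap_aba, postmap_aba);
+            spin_unlock(lock);
+    }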
+
+f. Reconstruction from the Flog
+-------------------------------
+
+On startup, we analyze the BTT flog to create our list of free blocks. We walk
+through all the entries, and for each lane, of the set of two possible
+'sections', we always look at the most recent one only (based on the sequence
+number). The reconstruction rules/steps are simple:
+- Read map[log_entry.lba].
+- If log_entry.new matches the map entry, then log_entry.old is free.
+- If log_entry.new does not match the map entry, then log_entry.new is free.
+ (This case can only be caused by power-fails/unsafe shutdowns)
+
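+A hedged sketch of these rules, reusing the illustrative map_ent_postmap()
+helper from above; flog_read_newest() and the data structures are
+placeholders, not the kernel's code:
+
+    static void btt_rebuild_freelist(struct arena *arena)
+    {
+            unsigned int lane;
+
+            for (lane = 0; lane < arena->nfree; lane++) {
+                    /* pick the newest of the two sections by sequence number */
+                    struct flog_section ent = flog_read_newest(arena, lane);
+                    u32 map_ent = map_ent_postmap(map_read(arena, ent.lba));
+
+                    if (map_ent == ent.new_map)
+                            /* the logged write completed: the old block is free */
+                            arena->freelist[lane] = ent.old_map;
+                    else
+                            /* interrupted write: the new block is free */
+                            arena->freelist[lane] = ent.new_map;
+            }
+    }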
+
+g. Summarizing - Read and Write flows
+-------------------------------------
+
+Read:
+
+1. Convert external LBA to arena number + pre-map ABA
+2. Get a lane (and take lane_lock)
+3. Read map to get the entry for this pre-map ABA
+4. Enter post-map ABA into RTT[lane]
+5. If TRIM flag set in map, return zeroes, and end IO (go to step 8)
+6. If ERROR flag set in map, end IO with EIO (go to step 8)
+7. Read data from this block
+8. Remove post-map ABA entry from RTT[lane]
+9. Release lane (and lane_lock)
+
+Write:
+
+1. Convert external LBA to Arena number + pre-map ABA
+2. Get a lane (and take lane_lock)
+3. Use lane to index into in-memory free list and obtain a new block, next flog
+ index, next sequence number
+4. Scan the RTT to check if free block is present, and spin/wait if it is.
+5. Write data to this free block
+6. Read map to get the existing post-map ABA entry for this pre-map ABA
+7. Write flog entry: [premap_aba / old postmap_aba / new postmap_aba / seq_num]
+8. Write new post-map ABA into map.
+9. Write old post-map entry into the free list
+10. Calculate next sequence number and write into the free list entry
+11. Release lane (and lane_lock)
+
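+The read flow, as a hedged C sketch that reuses the illustrative helpers from
+the earlier sections (every name below is a placeholder, not the kernel's
+implementation):
+
+    static int btt_read_block(struct btt *btt, u64 external_lba, void *buf)
+    {
+            struct arena *arena;
+            u32 entry, premap, postmap;
+            unsigned int lane;
+            int err = 0;
+
+            lba_to_arena(btt, external_lba, &arena, &premap);   /* step 1 */
+            lane = btt_acquire_lane(btt);                       /* step 2 */
+            entry = map_read(arena, premap);                    /* step 3 */
+            postmap = map_ent_postmap(entry);
+            rtt_begin_read(arena, lane, postmap);               /* step 4 */
+
+            if (map_ent_flags(entry) == MAP_ENT_INITIAL ||
+                map_ent_flags(entry) == MAP_ENT_ZERO)
+                    memset(buf, 0, arena->lbasize);             /* step 5 */
+            else if (map_ent_flags(entry) == MAP_ENT_ERROR)
+                    err = -EIO;                                 /* step 6 */
+            else
+                    err = data_read(arena, postmap, buf);       /* step 7 */
+
+            rtt_end_read(arena, lane);                          /* step 8 */
+            btt_release_lane(btt, lane);                        /* step 9 */
+            return err;
+    }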
+
+4. Error Handling
+=================
+
+An arena would be in an error state if any of the metadata is corrupted
+irrecoverably, either due to a bug or a media error. The following conditions
+indicate an error:
+- Info block checksum does not match (and recovering from the copy also fails)
+- All internal available blocks are not uniquely and entirely addressed by the
+ sum of mapped blocks and free blocks (from the BTT flog).
+- Rebuilding free list from the flog reveals missing/duplicate/impossible
+ entries
+- A map entry is out of bounds
+
+If any of these error conditions are encountered, the arena is put into a read
+only state using a flag in the info block.
+
+
+5. In-kernel usage
+==================
+
+Any block driver that supports byte granularity IO to the storage may register
+with the BTT. It will have to provide the rw_bytes interface in its
+block_device_operations struct:
+
+ int (*rw_bytes)(struct gendisk *, void *, size_t, off_t, int rw);
+
+It may register with the BTT after it adds its own gendisk, using btt_init:
+
+ struct btt *btt_init(struct gendisk *disk, unsigned long long rawsize,
+ u32 lbasize, u8 uuid[], int maxlane);
+
+Note that 'maxlane' is the maximum amount of concurrency the driver wishes to
+allow the BTT to use.
+
+The BTT 'disk' appears as a stacked block device that grabs the underlying block
+device in the O_EXCL mode.
+
+When the driver wishes to remove the backing disk, it should similarly call
+btt_fini using the same struct btt* handle that was provided to it by btt_init.
+
+ void btt_fini(struct btt *btt);
+
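+As a hedged sketch, a block driver exposing byte-granularity access might wire
+these up as follows; the 'my_pmem_dev' structure and its fields are
+illustrative only, and only the interfaces quoted above are assumed:
+
+    static int my_pmem_attach_btt(struct my_pmem_dev *dev)
+    {
+            /* rawsize, lbasize and uuid come from the driver's own metadata */
+            dev->btt = btt_init(dev->disk, dev->rawsize, dev->lbasize,
+                                dev->uuid, num_online_cpus());
+            if (!dev->btt)
+                    return -ENODEV;
+            return 0;
+    }
+
+    static void my_pmem_detach_btt(struct my_pmem_dev *dev)
+    {
+            btt_fini(dev->btt);
+    }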
diff --git a/Documentation/nvdimm/nvdimm.txt b/Documentation/nvdimm/nvdimm.txt
new file mode 100644
index 000000000000..197a0b6b0582
--- /dev/null
+++ b/Documentation/nvdimm/nvdimm.txt
@@ -0,0 +1,808 @@
+ LIBNVDIMM: Non-Volatile Devices
+ libnvdimm - kernel / libndctl - userspace helper library
+ linux-nvdimm@lists.01.org
+ v13
+
+
+ Glossary
+ Overview
+ Supporting Documents
+ Git Trees
+ LIBNVDIMM PMEM and BLK
+ Why BLK?
+ PMEM vs BLK
+ BLK-REGIONs, PMEM-REGIONs, Atomic Sectors, and DAX
+ Example NVDIMM Platform
+ LIBNVDIMM Kernel Device Model and LIBNDCTL Userspace API
+ LIBNDCTL: Context
+ libndctl: instantiate a new library context example
+ LIBNVDIMM/LIBNDCTL: Bus
+ libnvdimm: control class device in /sys/class
+ libnvdimm: bus
+ libndctl: bus enumeration example
+ LIBNVDIMM/LIBNDCTL: DIMM (NMEM)
+ libnvdimm: DIMM (NMEM)
+ libndctl: DIMM enumeration example
+ LIBNVDIMM/LIBNDCTL: Region
+ libnvdimm: region
+ libndctl: region enumeration example
+ Why Not Encode the Region Type into the Region Name?
+ How Do I Determine the Major Type of a Region?
+ LIBNVDIMM/LIBNDCTL: Namespace
+ libnvdimm: namespace
+ libndctl: namespace enumeration example
+ libndctl: namespace creation example
+ Why the Term "namespace"?
+ LIBNVDIMM/LIBNDCTL: Block Translation Table "btt"
+ libnvdimm: btt layout
+ libndctl: btt creation example
+ Summary LIBNDCTL Diagram
+
+
+Glossary
+--------
+
+PMEM: A system-physical-address range where writes are persistent. A
+block device composed of PMEM is capable of DAX. A PMEM address range
+may span an interleave of several DIMMs.
+
+BLK: A set of one or more programmable memory mapped apertures provided
+by a DIMM to access its media. This indirection precludes the
+performance benefit of interleaving, but enables DIMM-bounded failure
+modes.
+
+DPA: DIMM Physical Address, a DIMM-relative offset. With one DIMM in
+the system there would be a 1:1 system-physical-address:DPA association.
+Once more DIMMs are added a memory controller interleave must be
+decoded to determine the DPA associated with a given
+system-physical-address. BLK capacity always has a 1:1 relationship
+with a single-DIMM's DPA range.
+
+DAX: File system extensions to bypass the page cache and block layer to
+mmap persistent memory, from a PMEM block device, directly into a
+process address space.
+
+BTT: Block Translation Table: Persistent memory is byte addressable.
+Existing software may have an expectation that the power-fail-atomicity
+of writes is at least one sector, 512 bytes. The BTT is an indirection
+table with atomic update semantics to front a PMEM/BLK block device
+driver and present arbitrary atomic sector sizes.
+
+LABEL: Metadata stored on a DIMM device that partitions and identifies
+(persistently names) storage between PMEM and BLK. It also partitions
+BLK storage to host BTTs with different parameters per BLK-partition.
+Note that traditional partition tables, GPT/MBR, are layered on top of a
+BLK or PMEM device.
+
+
+Overview
+--------
+
+The LIBNVDIMM subsystem provides support for three types of NVDIMMs, namely,
+PMEM, BLK, and NVDIMM devices that can simultaneously support both PMEM
+and BLK mode access. These three modes of operation are described by
+the "NVDIMM Firmware Interface Table" (NFIT) in ACPI 6. While the LIBNVDIMM
+implementation is generic and supports pre-NFIT platforms, it was guided
+by the superset of capabilities needed to support this ACPI 6 definition
+for NVDIMM resources. The bulk of the kernel implementation is in place
+to handle the case where DPA accessible via PMEM is aliased with DPA
+accessible via BLK. When that occurs a LABEL is needed to reserve DPA
+for exclusive access via one mode at a time.
+
+Supporting Documents
+ACPI 6: http://www.uefi.org/sites/default/files/resources/ACPI_6.0.pdf
+NVDIMM Namespace: http://pmem.io/documents/NVDIMM_Namespace_Spec.pdf
+DSM Interface Example: http://pmem.io/documents/NVDIMM_DSM_Interface_Example.pdf
+Driver Writer's Guide: http://pmem.io/documents/NVDIMM_Driver_Writers_Guide.pdf
+
+Git Trees
+LIBNVDIMM: https://git.kernel.org/cgit/linux/kernel/git/djbw/nvdimm.git
+LIBNDCTL: https://github.com/pmem/ndctl.git
+PMEM: https://github.com/01org/prd
+
+
+LIBNVDIMM PMEM and BLK
+------------------
+
+Prior to the arrival of the NFIT, non-volatile memory was described to a
+system in various ad-hoc ways. Usually only the bare minimum was
+provided, namely, a single system-physical-address range where writes
+are expected to be durable after a system power loss. Now, the NFIT
+specification standardizes not only the description of PMEM, but also
+BLK and platform message-passing entry points for control and
+configuration.
+
+For each NVDIMM access method (PMEM, BLK), LIBNVDIMM provides a block
+device driver:
+
+ 1. PMEM (nd_pmem.ko): Drives a system-physical-address range. This
+ range is contiguous in system memory and may be interleaved (hardware
+ memory controller striped) across multiple DIMMs. When interleaved the
+ platform may optionally provide details of which DIMMs are participating
+ in the interleave.
+
+ Note that while LIBNVDIMM describes system-physical-address ranges that may
+ alias with BLK access as ND_NAMESPACE_PMEM ranges and those without
+ alias as ND_NAMESPACE_IO ranges, to the nd_pmem driver there is no
+ distinction. The different device-types are an implementation detail
+ that userspace can exploit to implement policies like "only interface
+ with address ranges from certain DIMMs". It is worth noting that when
+ aliasing is present and a DIMM lacks a label, then no block device can
+ be created by default as userspace needs to do at least one allocation
+ of DPA to the PMEM range. In contrast ND_NAMESPACE_IO ranges, once
+ registered, can be immediately attached to nd_pmem.
+
+ 2. BLK (nd_blk.ko): This driver performs I/O using a set of platform
+ defined apertures. A set of apertures will all access just one DIMM.
+ Multiple windows allow multiple concurrent accesses, much like
+ tagged-command-queuing, and would likely be used by different threads or
+ different CPUs.
+
+ The NFIT specification defines a standard format for a BLK-aperture, but
+ the spec also allows for vendor specific layouts, and non-NFIT BLK
+ implementations may use other designs for BLK I/O. For this reason "nd_blk"
+ calls back into platform-specific code to perform the I/O. One such
+ implementation is defined in the "Driver Writer's Guide" and "DSM
+ Interface Example".
+
+
+Why BLK?
+--------
+
+While PMEM provides direct byte-addressable CPU-load/store access to
+NVDIMM storage, it does not provide the best system RAS (recovery,
+availability, and serviceability) model. An access to a corrupted
+system-physical-address causes a CPU exception, while an access
+to a corrupted address through a BLK-aperture causes that block window
+to raise an error status in a register. The latter is more aligned with
+the standard error model that host-bus-adapter attached disks present.
+Also, if an administrator ever wants to replace memory it is easier to
+service a system at DIMM module boundaries. Compare this to PMEM where
+data could be interleaved in an opaque hardware specific manner across
+several DIMMs.
+
+PMEM vs BLK
+BLK-apertures solve this RAS problem, but their presence is also the
+major contributing factor to the complexity of the ND subsystem. They
+complicate the implementation because PMEM and BLK alias in DPA space.
+Any given DIMM's DPA-range may contribute to one or more
+system-physical-address sets of interleaved DIMMs, *and* may also be
+accessed in its entirety through its BLK-aperture. Accessing a DPA
+through a system-physical-address while simultaneously accessing the
+same DPA through a BLK-aperture has undefined results. For this reason,
+DIMMs with this dual interface configuration include a DSM function to
+store/retrieve a LABEL. The LABEL effectively partitions the DPA-space
+into exclusive system-physical-address and BLK-aperture accessible
+regions. For simplicity a DIMM is allowed one PMEM "region" per
+interleave set of which it is a member. The remaining DPA space can be
+carved into an arbitrary number of BLK devices with discontiguous
+extents.
+
+BLK-REGIONs, PMEM-REGIONs, Atomic Sectors, and DAX
+--------------------------------------------------
+
+One of the few
+reasons to allow multiple BLK namespaces per REGION is so that each
+BLK-namespace can be configured with a BTT with unique atomic sector
+sizes. While a PMEM device can host a BTT the LABEL specification does
+not provide for a sector size to be specified for a PMEM namespace.
+This is due to the expectation that the primary usage model for PMEM is
+via DAX, and the BTT is incompatible with DAX. However, for the cases
+where an application or filesystem still needs atomic sector update
+guarantees it can register a BTT on a PMEM device or partition. See
+LIBNVDIMM/LIBNDCTL: Block Translation Table "btt".
+
+
+Example NVDIMM Platform
+-----------------------
+
+For the remainder of this document the following diagram will be
+referenced for any example sysfs layouts.
+
+
+ (a) (b) DIMM BLK-REGION
+ +-------------------+--------+--------+--------+
++------+ | pm0.0 | blk2.0 | pm1.0 | blk2.1 | 0 region2
+| imc0 +--+- - - region0- - - +--------+ +--------+
++--+---+ | pm0.0 | blk3.0 | pm1.0 | blk3.1 | 1 region3
+ | +-------------------+--------v v--------+
++--+---+ | |
+| cpu0 | region1
++--+---+ | |
+ | +----------------------------^ ^--------+
++--+---+ | blk4.0 | pm1.0 | blk4.0 | 2 region4
+| imc1 +--+----------------------------| +--------+
++------+ | blk5.0 | pm1.0 | blk5.0 | 3 region5
+ +----------------------------+--------+--------+
+
+In this platform we have four DIMMs and two memory controllers in one
+socket. Each unique interface (BLK or PMEM) to DPA space is identified
+by a region device with a dynamically assigned id (REGION0 - REGION5).
+
+ 1. The first portion of DIMM0 and DIMM1 are interleaved as REGION0. A
+ single PMEM namespace is created in the REGION0-SPA-range that spans
+ DIMM0 and DIMM1 with a user-specified name of "pm0.0". Some of that
+ interleaved system-physical-address range is reclaimed as BLK-aperture
+ accessed space starting at DPA-offset (a) into each DIMM. In that
+ reclaimed space we create two BLK-aperture "namespaces" from REGION2 and
+ REGION3 where "blk2.0" and "blk3.0" are just human readable names that
+ could be set to any user-desired name in the LABEL.
+
+ 2. In the last portion of DIMM0 and DIMM1 we have an interleaved
+ system-physical-address range, REGION1, that spans those two DIMMs as
+ well as DIMM2 and DIMM3. Some of REGION1 is allocated to a PMEM namespace
+ named "pm1.0", the rest is reclaimed in 4 BLK-aperture namespaces (for
+ each DIMM in the interleave set), "blk2.1", "blk3.1", "blk4.0", and
+ "blk5.0".
+
+ 3. The portion of DIMM2 and DIMM3 that does not participate in the REGION1
+ interleaved system-physical-address range (i.e. the DPA addresses below
+ offset (b)) is also included in the "blk4.0" and "blk5.0" namespaces.
+ Note, that this example shows that BLK-aperture namespaces don't need to
+ be contiguous in DPA-space.
+
+ This bus is provided by the kernel under the device
+ /sys/devices/platform/nfit_test.0 when CONFIG_NFIT_TEST is enabled and
+ the nfit_test.ko module is loaded. This not only tests LIBNVDIMM but the
+ acpi_nfit.ko driver as well.
+
+
+LIBNVDIMM Kernel Device Model and LIBNDCTL Userspace API
+----------------------------------------------------
+
+What follows is a description of the LIBNVDIMM sysfs layout and a
+corresponding object hierarchy diagram as viewed through the LIBNDCTL
+api. The example sysfs paths and diagrams are relative to the Example
+NVDIMM Platform which is also the LIBNVDIMM bus used in the LIBNDCTL unit
+test.
+
+LIBNDCTL: Context
+Every api call in the LIBNDCTL library requires a context that holds the
+logging parameters and other library instance state. The library is
+based on the libabc template:
+https://git.kernel.org/cgit/linux/kernel/git/kay/libabc.git/
+
+LIBNDCTL: instantiate a new library context example
+
+ struct ndctl_ctx *ctx;
+
+ if (ndctl_new(&ctx) == 0)
+ return ctx;
+ else
+ return NULL;
+
+LIBNVDIMM/LIBNDCTL: Bus
+-------------------
+
+A bus has a 1:1 relationship with an NFIT. The current expectation for
+ACPI based systems is that there is only ever one platform-global NFIT.
+That said, it is trivial to register multiple NFITs, the specification
+does not preclude it. The infrastructure supports multiple busses and
+we use this capability to test multiple NFIT configurations in the
+unit test.
+
+LIBNVDIMM: control class device in /sys/class
+
+This character device accepts DSM messages to be passed to a DIMM
+identified by its NFIT handle.
+
+ /sys/class/nd/ndctl0
+ |-- dev
+ |-- device -> ../../../ndbus0
+ |-- subsystem -> ../../../../../../../class/nd
+
+
+
+LIBNVDIMM: bus
+
+ struct nvdimm_bus *nvdimm_bus_register(struct device *parent,
+ struct nvdimm_bus_descriptor *nfit_desc);
+
+ /sys/devices/platform/nfit_test.0/ndbus0
+ |-- commands
+ |-- nd
+ |-- nfit
+ |-- nmem0
+ |-- nmem1
+ |-- nmem2
+ |-- nmem3
+ |-- power
+ |-- provider
+ |-- region0
+ |-- region1
+ |-- region2
+ |-- region3
+ |-- region4
+ |-- region5
+ |-- uevent
+ `-- wait_probe
+
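+As a hedged sketch, a platform driver could register a bus as follows. Only
+nvdimm_bus_register() above is quoted from this document; the descriptor
+fields (provider_name, ndctl) and all "my_*" names are assumptions to be
+checked against include/linux/libnvdimm.h:
+
+    static struct nvdimm_bus_descriptor my_nd_desc = {
+            .provider_name = "my_platform",
+            .ndctl = my_ndctl_handler,   /* DSM/ioctl passthrough callback */
+    };
+
+    static struct nvdimm_bus *my_bus;
+
+    static int my_platform_probe(struct platform_device *pdev)
+    {
+            my_bus = nvdimm_bus_register(&pdev->dev, &my_nd_desc);
+            if (!my_bus)
+                    return -ENOMEM;
+            return 0;
+    }
+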
+LIBNDCTL: bus enumeration example
+Find the bus handle that describes the bus from Example NVDIMM Platform
+
+ static struct ndctl_bus *get_bus_by_provider(struct ndctl_ctx *ctx,
+ const char *provider)
+ {
+ struct ndctl_bus *bus;
+
+ ndctl_bus_foreach(ctx, bus)
+ if (strcmp(provider, ndctl_bus_get_provider(bus)) == 0)
+ return bus;
+
+ return NULL;
+ }
+
+ bus = get_bus_by_provider(ctx, "nfit_test.0");
+
+
+LIBNVDIMM/LIBNDCTL: DIMM (NMEM)
+---------------------------
+
+The DIMM device provides a character device for sending commands to
+hardware, and it is a container for LABELs. If the DIMM is defined by
+NFIT then an optional 'nfit' attribute sub-directory is available to add
+NFIT-specifics.
+
+Note that the kernel device name for "DIMMs" is "nmemX". The NFIT
+describes these devices via "Memory Device to System Physical Address
+Range Mapping Structure", and there is no requirement that they actually
+be physical DIMMs, so we use a more generic name.
+
+LIBNVDIMM: DIMM (NMEM)
+
+ struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data,
+ const struct attribute_group **groups, unsigned long flags,
+ unsigned long *dsm_mask);
+
+ /sys/devices/platform/nfit_test.0/ndbus0
+ |-- nmem0
+ | |-- available_slots
+ | |-- commands
+ | |-- dev
+ | |-- devtype
+ | |-- driver -> ../../../../../bus/nd/drivers/nvdimm
+ | |-- modalias
+ | |-- nfit
+ | | |-- device
+ | | |-- format
+ | | |-- handle
+ | | |-- phys_id
+ | | |-- rev_id
+ | | |-- serial
+ | | `-- vendor
+ | |-- state
+ | |-- subsystem -> ../../../../../bus/nd
+ | `-- uevent
+ |-- nmem1
+ [..]
+
+
+LIBNDCTL: DIMM enumeration example
+
+Note, in this example we are assuming NFIT-defined DIMMs which are
+identified by an "nfit_handle" a 32-bit value where:
+Bit 3:0 DIMM number within the memory channel
+Bit 7:4 memory channel number
+Bit 11:8 memory controller ID
+Bit 15:12 socket ID (within scope of a Node controller if node controller is present)
+Bit 27:16 Node Controller ID
+Bit 31:28 Reserved
+
+ static struct ndctl_dimm *get_dimm_by_handle(struct ndctl_bus *bus,
+ unsigned int handle)
+ {
+ struct ndctl_dimm *dimm;
+
+ ndctl_dimm_foreach(bus, dimm)
+ if (ndctl_dimm_get_handle(dimm) == handle)
+ return dimm;
+
+ return NULL;
+ }
+
+ #define DIMM_HANDLE(n, s, i, c, d) \
+ (((n & 0xfff) << 16) | ((s & 0xf) << 12) | ((i & 0xf) << 8) \
+ | ((c & 0xf) << 4) | (d & 0xf))
+
+ dimm = get_dimm_by_handle(bus, DIMM_HANDLE(0, 0, 0, 0, 0));
+
+LIBNVDIMM/LIBNDCTL: Region
+----------------------
+
+A generic REGION device is registered for each PMEM range or BLK-aperture
+set. Per the example there are 6 regions: 2 PMEM and 4 BLK-aperture
+sets on the "nfit_test.0" bus. The primary role of a region is to be a
+container of "mappings". A mapping is a tuple of <DIMM,
+DPA-start-offset, length>.
+
+LIBNVDIMM provides a built-in driver for these REGION devices. This driver
+is responsible for reconciling the aliased DPA mappings across all
+regions, parsing the LABEL, if present, and then emitting NAMESPACE
+devices with the resolved/exclusive DPA-boundaries for the nd_pmem or
+nd_blk device driver to consume.
+
+In addition to the generic attributes of "mapping"s, "interleave_ways"
+and "size" the REGION device also exports some convenience attributes.
+"nstype" indicates the integer type of namespace-device this region
+emits, "devtype" duplicates the DEVTYPE variable stored by udev at the
+'add' event, "modalias" duplicates the MODALIAS variable stored by udev
+at the 'add' event, and finally, the optional "spa_index" is provided in
+the case where the region is defined by a SPA.
+
+LIBNVDIMM: region
+
+ struct nd_region *nvdimm_pmem_region_create(struct nvdimm_bus *nvdimm_bus,
+ struct nd_region_desc *ndr_desc);
+ struct nd_region *nvdimm_blk_region_create(struct nvdimm_bus *nvdimm_bus,
+ struct nd_region_desc *ndr_desc);
+
+ /sys/devices/platform/nfit_test.0/ndbus0
+ |-- region0
+ | |-- available_size
+ | |-- btt0
+ | |-- btt_seed
+ | |-- devtype
+ | |-- driver -> ../../../../../bus/nd/drivers/nd_region
+ | |-- init_namespaces
+ | |-- mapping0
+ | |-- mapping1
+ | |-- mappings
+ | |-- modalias
+ | |-- namespace0.0
+ | |-- namespace_seed
+ | |-- numa_node
+ | |-- nfit
+ | | `-- spa_index
+ | |-- nstype
+ | |-- set_cookie
+ | |-- size
+ | |-- subsystem -> ../../../../../bus/nd
+ | `-- uevent
+ |-- region1
+ [..]
+
+LIBNDCTL: region enumeration example
+
+Sample region retrieval routines based on NFIT-unique data like
+"spa_index" (interleave set id) for PMEM and "nfit_handle" (dimm id) for
+BLK.
+
+ static struct ndctl_region *get_pmem_region_by_spa_index(struct ndctl_bus *bus,
+ unsigned int spa_index)
+ {
+ struct ndctl_region *region;
+
+ ndctl_region_foreach(bus, region) {
+ if (ndctl_region_get_type(region) != ND_DEVICE_REGION_PMEM)
+ continue;
+ if (ndctl_region_get_spa_index(region) == spa_index)
+ return region;
+ }
+ return NULL;
+ }
+
+ static struct ndctl_region *get_blk_region_by_dimm_handle(struct ndctl_bus *bus,
+ unsigned int handle)
+ {
+ struct ndctl_region *region;
+
+ ndctl_region_foreach(bus, region) {
+ struct ndctl_mapping *map;
+
+ if (ndctl_region_get_type(region) != ND_DEVICE_REGION_BLOCK)
+ continue;
+ ndctl_mapping_foreach(region, map) {
+ struct ndctl_dimm *dimm = ndctl_mapping_get_dimm(map);
+
+ if (ndctl_dimm_get_handle(dimm) == handle)
+ return region;
+ }
+ }
+ return NULL;
+ }
+
+
+Why Not Encode the Region Type into the Region Name?
+----------------------------------------------------
+
+At first glance it seems that, since NFIT defines just PMEM and BLK interface
+types, we should simply name REGION devices with something derived
+from those type names. However, the ND subsystem explicitly keeps the
+REGION name generic and expects userspace to always consider the
+region-attributes for 4 reasons:
+
+ 1. There are already more than two REGION and "namespace" types. For
+ PMEM there are two subtypes. As mentioned previously we have PMEM where
+ the constituent DIMM devices are known, and anonymous PMEM. For BLK
+ regions the NFIT specification already anticipates vendor specific
+ implementations. The exact distinction of what a region contains is in
+ the region-attributes not the region-name or the region-devtype.
+
+ 2. A region with zero child-namespaces is a possible configuration. For
+ example, the NFIT allows for a DCR to be published without a
+ corresponding BLK-aperture. This equates to a DIMM that can only accept
+ control/configuration messages, but no i/o through a descendant block
+ device. Again, this "type" is advertised in the attributes ('mappings'
+ == 0) and the name does not tell you much.
+
+ 3. What if a third major interface type arises in the future? Outside
+ of vendor specific implementations, it's not difficult to envision a
+ third class of interface type beyond BLK and PMEM. With a generic name
+ for the REGION level of the device-hierarchy old userspace
+ implementations can still make sense of new kernel advertised
+ region-types. Userspace can always rely on the generic region
+ attributes like "mappings", "size", etc. and the expected child devices
+ named "namespace". This generic format of the device-model hierarchy
+ allows the LIBNVDIMM and LIBNDCTL implementations to be more uniform and
+ future-proof.
+
+ 4. There are more robust mechanisms for determining the major type of a
+ region than a device name. See the next section, How Do I Determine the
+ Major Type of a Region?
+
+How Do I Determine the Major Type of a Region?
+----------------------------------------------
+
+Outside of the blanket recommendation of "use libndctl", or simply
+looking at the kernel header (/usr/include/linux/ndctl.h) to decode the
+"nstype" integer attribute, here are some other options.
+
+ 1. module alias lookup:
+
+ The whole point of region/namespace device type differentiation is to
+ decide which block-device driver will attach to a given LIBNVDIMM namespace.
+ One can simply use the modalias to look up the resulting module. It's
+ important to note that this method is robust in the presence of a
+ vendor-specific driver down the road. If a vendor-specific
+ implementation wants to supplant the standard nd_blk driver it can with
+ minimal impact to the rest of LIBNVDIMM.
+
+ In fact, a vendor may also want to have a vendor-specific region-driver
+ (outside of nd_region). For example, if a vendor defined its own LABEL
+ format it would need its own region driver to parse that LABEL and emit
+ the resulting namespaces. The output from module resolution is more
+ accurate than a region-name or region-devtype.
+
+ 2. udev:
+
+ The kernel "devtype" is registered in the udev database:
+ # udevadm info --path=/devices/platform/nfit_test.0/ndbus0/region0
+ P: /devices/platform/nfit_test.0/ndbus0/region0
+ E: DEVPATH=/devices/platform/nfit_test.0/ndbus0/region0
+ E: DEVTYPE=nd_pmem
+ E: MODALIAS=nd:t2
+ E: SUBSYSTEM=nd
+
+ # udevadm info --path=/devices/platform/nfit_test.0/ndbus0/region4
+ P: /devices/platform/nfit_test.0/ndbus0/region4
+ E: DEVPATH=/devices/platform/nfit_test.0/ndbus0/region4
+ E: DEVTYPE=nd_blk
+ E: MODALIAS=nd:t3
+ E: SUBSYSTEM=nd
+
+ ...and is available as a region attribute, but keep in mind that the
+ "devtype" does not indicate sub-type variations, so scripts should
+ really be examining the other attributes.
+
+ 3. type specific attributes:
+
+ As it currently stands a BLK-aperture region will never have an
+ "nfit/spa_index" attribute, but neither will a non-NFIT PMEM region. A
+ BLK region with a "mappings" value of 0 is, as mentioned above, a DIMM
+ that does not allow I/O. A PMEM region with a "mappings" value of zero
+ is a simple system-physical-address range; see the sketch below.
+
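+To make the last point concrete, here is a hedged sketch that classifies
+a region purely from its attributes, using only the libndctl accessors
+shown in the enumeration examples above (counting mappings with
+ndctl_mapping_foreach() stands in for reading the 'mappings' attribute
+directly):
+
+ static const char *classify_region(struct ndctl_region *region)
+ {
+         struct ndctl_mapping *map;
+         int mappings = 0;
+
+         /* equivalent to the sysfs 'mappings' attribute */
+         ndctl_mapping_foreach(region, map)
+                 mappings++;
+
+         if (ndctl_region_get_type(region) == ND_DEVICE_REGION_PMEM)
+                 return mappings ? "dimm-backed pmem" : "anonymous pmem";
+
+         /* BLK-aperture region; zero mappings => control-only dimm */
+         return mappings ? "blk aperture" : "control-only dimm";
+ }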
+
+LIBNVDIMM/LIBNDCTL: Namespace
+-----------------------------
+
+A REGION, after resolving DPA aliasing and LABEL specified boundaries,
+surfaces one or more "namespace" devices. The arrival of a "namespace"
+device currently triggers either the nd_blk or nd_pmem driver to load
+and register a disk/block device.
+
+LIBNVDIMM: namespace
+Here is a sample layout from the three major types of NAMESPACE, where
+namespace0.0 represents DIMM-info-backed PMEM (note that it has a 'uuid'
+attribute), namespace2.0 represents a BLK namespace (note it has a
+'sector_size' attribute), and namespace6.0 represents an anonymous PMEM
+namespace (note that it has no 'uuid' attribute because it does not
+support a LABEL).
+
+ /sys/devices/platform/nfit_test.0/ndbus0/region0/namespace0.0
+ |-- alt_name
+ |-- devtype
+ |-- dpa_extents
+ |-- force_raw
+ |-- modalias
+ |-- numa_node
+ |-- resource
+ |-- size
+ |-- subsystem -> ../../../../../../bus/nd
+ |-- type
+ |-- uevent
+ `-- uuid
+ /sys/devices/platform/nfit_test.0/ndbus0/region2/namespace2.0
+ |-- alt_name
+ |-- devtype
+ |-- dpa_extents
+ |-- force_raw
+ |-- modalias
+ |-- numa_node
+ |-- sector_size
+ |-- size
+ |-- subsystem -> ../../../../../../bus/nd
+ |-- type
+ |-- uevent
+ `-- uuid
+ /sys/devices/platform/nfit_test.1/ndbus1/region6/namespace6.0
+ |-- block
+ | `-- pmem0
+ |-- devtype
+ |-- driver -> ../../../../../../bus/nd/drivers/pmem
+ |-- force_raw
+ |-- modalias
+ |-- numa_node
+ |-- resource
+ |-- size
+ |-- subsystem -> ../../../../../../bus/nd
+ |-- type
+ `-- uevent
+
+LIBNDCTL: namespace enumeration example
+Namespaces are indexed relative to their parent region; see the example
+below. These indexes are mostly static from boot to boot, but the
+subsystem makes no guarantees in this regard. For a static namespace
+identifier use its 'uuid' attribute.
+
+static struct ndctl_namespace *get_namespace_by_id(struct ndctl_region *region,
+ unsigned int id)
+{
+ struct ndctl_namespace *ndns;
+
+ ndctl_namespace_foreach(region, ndns)
+ if (ndctl_namespace_get_id(ndns) == id)
+ return ndns;
+
+ return NULL;
+}
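+
+Following the 'uuid' advice above, a uuid-based lookup might look like
+the sketch below. It assumes an ndctl_namespace_get_uuid() accessor that
+fills a libuuid 'uuid_t' (so uuid_compare() from <uuid/uuid.h> can do
+the match); adjust to the accessor your libndctl version provides.
+
+static struct ndctl_namespace *get_namespace_by_uuid(struct ndctl_region *region,
+                uuid_t uuid)
+{
+        struct ndctl_namespace *ndns;
+        uuid_t ndns_uuid;
+
+        ndctl_namespace_foreach(region, ndns) {
+                ndctl_namespace_get_uuid(ndns, ndns_uuid);
+                if (uuid_compare(ndns_uuid, uuid) == 0)
+                        return ndns;
+        }
+
+        return NULL;
+}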
+
+LIBNDCTL: namespace creation example
+Idle namespaces are automatically created by the kernel if a given
+region has enough available capacity to create a new namespace.
+Namespace instantiation involves finding an idle namespace and
+configuring it. For the most part the setting of namespace attributes
+can occur in any order; the only constraint is that 'uuid' must be set
+before 'size'. This enables the kernel to track DPA allocations
+internally with a static identifier.
+
+static int configure_namespace(struct ndctl_region *region,
+                struct ndctl_namespace *ndns,
+                struct namespace_parameters *parameters)
+{
+        char devname[50];
+
+        snprintf(devname, sizeof(devname), "namespace%d.%d",
+                        ndctl_region_get_id(region), parameters->id);
+
+        ndctl_namespace_set_alt_name(ndns, devname);
+        /* 'uuid' must be set prior to setting size! */
+        ndctl_namespace_set_uuid(ndns, parameters->uuid);
+        ndctl_namespace_set_size(ndns, parameters->size);
+        /* unlike pmem namespaces, blk namespaces have a sector size */
+        if (parameters->lbasize)
+                ndctl_namespace_set_sector_size(ndns, parameters->lbasize);
+
+        return ndctl_namespace_enable(ndns);
+}
+
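+A caller might combine the pieces above roughly as follows. The idle
+(seed) namespace is assumed to report a size of zero, and the
+ndctl_namespace_get_size() accessor used for that check is an assumption
+of this sketch rather than something defined above; a zero return from
+configure_namespace() is treated as success.
+
+static struct ndctl_namespace *create_namespace(struct ndctl_region *region,
+                struct namespace_parameters *parameters)
+{
+        struct ndctl_namespace *ndns;
+
+        ndctl_namespace_foreach(region, ndns) {
+                /* an idle/seed namespace has no capacity assigned yet */
+                if (ndctl_namespace_get_size(ndns) != 0)
+                        continue;
+                if (configure_namespace(region, ndns, parameters) == 0)
+                        return ndns;
+                break;
+        }
+
+        return NULL;
+}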
+
+Why the Term "namespace"?
+
+ 1. Why not "volume", for instance? "volume" risked confusing ND with a
+ volume manager like device-mapper.
+
+ 2. The term originated to describe the sub-devices that can be created
+ within an NVME controller (see the nvme specification:
+ http://www.nvmexpress.org/specifications/), and NFIT namespaces are
+ meant to parallel the capabilities and configurability of
+ NVME-namespaces.
+
+
+LIBNVDIMM/LIBNDCTL: Block Translation Table "btt"
+-------------------------------------------------
+
+A BTT (design document: http://pmem.io/2014/09/23/btt.html) is a stacked
+block device driver that fronts either the whole block device or a
+partition of a block device emitted by either a PMEM or BLK NAMESPACE.
+
+LIBNVDIMM: btt layout
+Every region will start out with at least one BTT device, which is the
+seed device. To activate it, set the "namespace", "uuid", and
+"sector_size" attributes and then bind the device to the nd_pmem or
+nd_blk driver depending on the region type.
+
+ /sys/devices/platform/nfit_test.1/ndbus0/region0/btt0/
+ |-- namespace
+ |-- delete
+ |-- devtype
+ |-- modalias
+ |-- numa_node
+ |-- sector_size
+ |-- subsystem -> ../../../../../bus/nd
+ |-- uevent
+ `-- uuid
+
+LIBNDCTL: btt creation example
+Similar to namespaces, an idle BTT device is automatically created per
+region. Each time this "seed" btt device is configured and enabled a new
+seed is created. Creating a BTT configuration involves two steps:
+finding an idle BTT and assigning it to consume a PMEM or BLK namespace.
+
+ static struct ndctl_btt *get_idle_btt(struct ndctl_region *region)
+ {
+ struct ndctl_btt *btt;
+
+ ndctl_btt_foreach(region, btt)
+ if (!ndctl_btt_is_enabled(btt)
+ && !ndctl_btt_is_configured(btt))
+ return btt;
+
+ return NULL;
+ }
+
+ static int configure_btt(struct ndctl_region *region,
+                 struct btt_parameters *parameters)
+ {
+         struct ndctl_btt *btt = get_idle_btt(region);
+
+         if (!btt)
+                 return -ENXIO;
+
+         ndctl_btt_set_uuid(btt, parameters->uuid);
+         ndctl_btt_set_sector_size(btt, parameters->sector_size);
+         ndctl_btt_set_namespace(btt, parameters->ndns);
+         /* turn off raw mode device */
+         ndctl_namespace_disable(parameters->ndns);
+         /* turn on btt access */
+         return ndctl_btt_enable(btt);
+ }
+
+Once instantiated, a new inactive btt seed device will appear underneath
+the region.
+
+Once a "namespace" is removed from a BTT that instance of the BTT device
+will be deleted or otherwise reset to default values. This deletion is
+only at the device model level. In order to destroy a BTT the "info
+block" needs to be destroyed. Note, that to destroy a BTT the media
+needs to be written in raw mode. By default, the kernel will autodetect
+the presence of a BTT and disable raw mode. This autodetect behavior
+can be suppressed by enabling raw mode for the namespace via the
+ndctl_namespace_set_raw_mode() api.
+
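+As a hedged illustration of that last point, re-exposing the raw media
+might look like the following sketch (it assumes
+ndctl_namespace_set_raw_mode() takes the namespace and a boolean flag;
+the actual overwrite of the info block is ordinary block-device i/o and
+is only noted in a comment):
+
+ static int expose_raw_media(struct ndctl_namespace *ndns)
+ {
+         /* the namespace must be idle before changing its mode */
+         ndctl_namespace_disable(ndns);
+
+         /* suppress BTT autodetect so the raw device surfaces */
+         ndctl_namespace_set_raw_mode(ndns, 1);
+
+         /*
+          * re-enable: the resulting block device now fronts the raw
+          * media, where the BTT info block can be overwritten
+          */
+         return ndctl_namespace_enable(ndns);
+ }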
+
+Summary LIBNDCTL Diagram
+------------------------
+
+For the given example above, here is the view of the objects as seen by the LIBNDCTL api:
+ +---+
+ |CTX| +---------+ +--------------+ +---------------+
+ +-+-+ +-> REGION0 +---> NAMESPACE0.0 +--> PMEM8 "pm0.0" |
+ | | +---------+ +--------------+ +---------------+
++-------+ | | +---------+ +--------------+ +---------------+
+| DIMM0 <-+ | +-> REGION1 +---> NAMESPACE1.0 +--> PMEM6 "pm1.0" |
++-------+ | | | +---------+ +--------------+ +---------------+
+| DIMM1 <-+ +-v--+ | +---------+ +--------------+ +---------------+
++-------+ +-+BUS0+---> REGION2 +-+-> NAMESPACE2.0 +--> ND6 "blk2.0" |
+| DIMM2 <-+ +----+ | +---------+ | +--------------+ +----------------------+
++-------+ | | +-> NAMESPACE2.1 +--> ND5 "blk2.1" | BTT2 |
+| DIMM3 <-+ | +--------------+ +----------------------+
++-------+ | +---------+ +--------------+ +---------------+
+ +-> REGION3 +-+-> NAMESPACE3.0 +--> ND4 "blk3.0" |
+ | +---------+ | +--------------+ +----------------------+
+ | +-> NAMESPACE3.1 +--> ND3 "blk3.1" | BTT1 |
+ | +--------------+ +----------------------+
+ | +---------+ +--------------+ +---------------+
+ +-> REGION4 +---> NAMESPACE4.0 +--> ND2 "blk4.0" |
+ | +---------+ +--------------+ +---------------+
+ | +---------+ +--------------+ +----------------------+
+ +-> REGION5 +---> NAMESPACE5.0 +--> ND1 "blk5.0" | BTT0 |
+ +---------+ +--------------+ +---------------+------+
+
+
diff --git a/MAINTAINERS b/MAINTAINERS
index 6aedd5072323..0e6b09150aad 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6102,6 +6102,39 @@ M: Sasha Levin <sasha.levin@oracle.com>
S: Maintained
F: tools/lib/lockdep/
+LIBNVDIMM: NON-VOLATILE MEMORY DEVICE SUBSYSTEM
+M: Dan Williams <dan.j.williams@intel.com>
+L: linux-nvdimm@lists.01.org
+Q: https://patchwork.kernel.org/project/linux-nvdimm/list/
+S: Supported
+F: drivers/nvdimm/*
+F: include/linux/nd.h
+F: include/linux/libnvdimm.h
+F: include/uapi/linux/ndctl.h
+
+LIBNVDIMM BLK: MMIO-APERTURE DRIVER
+M: Ross Zwisler <ross.zwisler@linux.intel.com>
+L: linux-nvdimm@lists.01.org
+Q: https://patchwork.kernel.org/project/linux-nvdimm/list/
+S: Supported
+F: drivers/nvdimm/blk.c
+F: drivers/nvdimm/region_devs.c
+F: drivers/acpi/nfit*
+
+LIBNVDIMM BTT: BLOCK TRANSLATION TABLE
+M: Vishal Verma <vishal.l.verma@intel.com>
+L: linux-nvdimm@lists.01.org
+Q: https://patchwork.kernel.org/project/linux-nvdimm/list/
+S: Supported
+F: drivers/nvdimm/btt*
+
+LIBNVDIMM PMEM: PERSISTENT MEMORY DRIVER
+M: Ross Zwisler <ross.zwisler@linux.intel.com>
+L: linux-nvdimm@lists.01.org
+Q: https://patchwork.kernel.org/project/linux-nvdimm/list/
+S: Supported
+F: drivers/nvdimm/pmem.c
+
LINUX FOR IBM pSERIES (RS/6000)
M: Paul Mackerras <paulus@au.ibm.com>
W: http://www.ibm.com/linux/ltc/projects/ppc
@@ -8174,6 +8207,7 @@ T: git git://github.com/hzhuang1/linux.git
T: git git://github.com/rjarzmik/linux.git
S: Maintained
F: arch/arm/mach-pxa/
+F: drivers/dma/pxa*
F: drivers/pcmcia/pxa2xx*
F: drivers/spi/spi-pxa2xx*
F: drivers/usb/gadget/udc/pxa2*
@@ -8362,12 +8396,6 @@ S: Maintained
F: Documentation/blockdev/ramdisk.txt
F: drivers/block/brd.c
-PERSISTENT MEMORY DRIVER
-M: Ross Zwisler <ross.zwisler@linux.intel.com>
-L: linux-nvdimm@lists.01.org
-S: Supported
-F: drivers/block/pmem.c
-
RANDOM NUMBER DRIVER
M: "Theodore Ts'o" <tytso@mit.edu>
S: Maintained
@@ -8986,6 +9014,7 @@ S: Supported
F: kernel/seccomp.c
F: include/uapi/linux/seccomp.h
F: include/linux/seccomp.h
+F: tools/testing/selftests/seccomp/*
K: \bsecure_computing
K: \bTIF_SECCOMP\b
@@ -10389,11 +10418,15 @@ S: Maintained
F: Documentation/filesystems/ubifs.txt
F: fs/ubifs/
-UCLINUX (AND M68KNOMMU)
+UCLINUX (M68KNOMMU AND COLDFIRE)
M: Greg Ungerer <gerg@uclinux.org>
W: http://www.uclinux.org/
+L: linux-m68k@lists.linux-m68k.org
L: uclinux-dev@uclinux.org (subscribers-only)
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/gerg/m68knommu.git
S: Maintained
+F: arch/m68k/coldfire/
+F: arch/m68k/68*/
F: arch/m68k/*/*_no.*
F: arch/m68k/include/asm/*_no.*
@@ -10786,6 +10819,12 @@ F: drivers/vfio/
F: include/linux/vfio.h
F: include/uapi/linux/vfio.h
+VFIO PLATFORM DRIVER
+M: Baptiste Reynal <b.reynal@virtualopensystems.com>
+L: kvm@vger.kernel.org
+S: Maintained
+F: drivers/vfio/platform/
+
VIDEOBUF2 FRAMEWORK
M: Pawel Osciak <pawel@osciak.com>
M: Marek Szyprowski <m.szyprowski@samsung.com>
diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c
index ab21e0d58278..9d4aa18f2a82 100644
--- a/arch/arm64/kernel/efi.c
+++ b/arch/arm64/kernel/efi.c
@@ -158,6 +158,7 @@ static __init int is_reserve_region(efi_memory_desc_t *md)
case EFI_BOOT_SERVICES_CODE:
case EFI_BOOT_SERVICES_DATA:
case EFI_CONVENTIONAL_MEMORY:
+ case EFI_PERSISTENT_MEMORY:
return 0;
default:
break;
diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c
index 47e962f7ed5a..caae3f4e4341 100644
--- a/arch/ia64/kernel/efi.c
+++ b/arch/ia64/kernel/efi.c
@@ -1222,6 +1222,10 @@ efi_initialize_iomem_resources(struct resource *code_resource,
flags |= IORESOURCE_DISABLED;
break;
+ case EFI_PERSISTENT_MEMORY:
+ name = "Persistent Memory";
+ break;
+
case EFI_RESERVED_TYPE:
case EFI_RUNTIME_SERVICES_CODE:
case EFI_RUNTIME_SERVICES_DATA:
diff --git a/arch/ia64/kernel/ia64_ksyms.c b/arch/ia64/kernel/ia64_ksyms.c
index 5b7791dd3965..096731049538 100644
--- a/arch/ia64/kernel/ia64_ksyms.c
+++ b/arch/ia64/kernel/ia64_ksyms.c
@@ -96,3 +96,6 @@ EXPORT_SYMBOL(ia64_ivt);
/* mcount is defined in assembly */
EXPORT_SYMBOL(_mcount);
#endif
+
+#include <asm/cacheflush.h>
+EXPORT_SYMBOL_GPL(flush_icache_range);
diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c
index dd5801eb4c69..2889412e03eb 100644
--- a/arch/ia64/kernel/mca.c
+++ b/arch/ia64/kernel/mca.c
@@ -2117,8 +2117,7 @@ ia64_mca_late_init(void)
register_hotcpu_notifier(&mca_cpu_notifier);
/* Setup the CMCI/P vector and handler */
- init_timer(&cmc_poll_timer);
- cmc_poll_timer.function = ia64_mca_cmc_poll;
+ setup_timer(&cmc_poll_timer, ia64_mca_cmc_poll, 0UL);
/* Unmask/enable the vector */
cmc_polling_enabled = 0;
@@ -2129,8 +2128,7 @@ ia64_mca_late_init(void)
#ifdef CONFIG_ACPI
/* Setup the CPEI/P vector and handler */
cpe_vector = acpi_request_vector(ACPI_INTERRUPT_CPEI);
- init_timer(&cpe_poll_timer);
- cpe_poll_timer.function = ia64_mca_cpe_poll;
+ setup_timer(&cpe_poll_timer, ia64_mca_cpe_poll, 0UL);
{
unsigned int irq;
diff --git a/arch/m68k/68000/m68EZ328.c b/arch/m68k/68000/m68EZ328.c
index 21952906e9e2..e6ab321f93f8 100644
--- a/arch/m68k/68000/m68EZ328.c
+++ b/arch/m68k/68000/m68EZ328.c
@@ -62,8 +62,7 @@ void __init config_BSP(char *command, int len)
#ifdef CONFIG_UCSIMM
printk(KERN_INFO "uCsimm serial string [%s]\n",getserialnum());
p = cs8900a_hwaddr = gethwaddr(0);
- printk(KERN_INFO "uCsimm hwaddr %.2x:%.2x:%.2x:%.2x:%.2x:%.2x\n",
- p[0], p[1], p[2], p[3], p[4], p[5]);
+ printk(KERN_INFO "uCsimm hwaddr %pM\n", p);
p = getbenv("APPEND");
if (p) strcpy(p,command);
diff --git a/arch/m68k/68000/m68VZ328.c b/arch/m68k/68000/m68VZ328.c
index 0e5e5a10a021..1154bdb220a0 100644
--- a/arch/m68k/68000/m68VZ328.c
+++ b/arch/m68k/68000/m68VZ328.c
@@ -152,8 +152,7 @@ static void __init init_hardware(char *command, int size)
printk(KERN_INFO "uCdimm serial string [%s]\n", getserialnum());
p = cs8900a_hwaddr = gethwaddr(0);
- printk(KERN_INFO "uCdimm hwaddr %.2x:%.2x:%.2x:%.2x:%.2x:%.2x\n",
- p[0], p[1], p[2], p[3], p[4], p[5]);
+ printk(KERN_INFO "uCdimm hwaddr %pM\n", p);
p = getbenv("APPEND");
if (p)
strcpy(p, command);
diff --git a/arch/m68k/68360/config.c b/arch/m68k/68360/config.c
index fd1f948c7129..b65fe4eed38e 100644
--- a/arch/m68k/68360/config.c
+++ b/arch/m68k/68360/config.c
@@ -154,8 +154,7 @@ void __init config_BSP(char *command, int len)
#if defined(CONFIG_UCQUICC) && 0
printk(KERN_INFO "uCquicc serial string [%s]\n",getserialnum());
p = scc1_hwaddr = gethwaddr(0);
- printk(KERN_INFO "uCquicc hwaddr %.2x:%.2x:%.2x:%.2x:%.2x:%.2x\n",
- p[0], p[1], p[2], p[3], p[4], p[5]);
+ printk(KERN_INFO "uCquicc hwaddr %pM\n", p);
p = getbenv("APPEND");
if (p)
diff --git a/arch/um/Kconfig.um b/arch/um/Kconfig.um
index 6e67847f5272..28a9885e3a37 100644
--- a/arch/um/Kconfig.um
+++ b/arch/um/Kconfig.um
@@ -44,23 +44,9 @@ config HOSTFS
If you'd like to be able to work with files stored on the host,
say Y or M here; otherwise say N.
-config HPPFS
- tristate "HoneyPot ProcFS"
- depends on PROC_FS
- help
- hppfs (HoneyPot ProcFS) is a filesystem which allows UML /proc
- entries to be overridden, removed, or fabricated from the host.
- Its purpose is to allow a UML to appear to be a physical machine
- by removing or changing anything in /proc which gives away the
- identity of a UML.
-
- See <http://user-mode-linux.sf.net/old/hppfs.html> for more information.
-
- You only need this if you are setting up a UML honeypot. Otherwise,
- it is safe to say 'N' here.
-
config MCONSOLE
bool "Management console"
+ depends on PROC_FS
default y
help
The user mode linux management console is a low-level interface to
diff --git a/arch/um/Makefile b/arch/um/Makefile
index 17d4460b1af3..098ab3333e7c 100644
--- a/arch/um/Makefile
+++ b/arch/um/Makefile
@@ -68,9 +68,10 @@ KBUILD_CFLAGS += $(CFLAGS) $(CFLAGS-y) -D__arch_um__ \
KBUILD_AFLAGS += $(ARCH_INCLUDE)
-USER_CFLAGS = $(patsubst $(KERNEL_DEFINES),,$(patsubst -D__KERNEL__,,\
- $(patsubst -I%,,$(KBUILD_CFLAGS)))) $(ARCH_INCLUDE) $(MODE_INCLUDE) \
- $(filter -I%,$(CFLAGS)) -D_FILE_OFFSET_BITS=64 -idirafter include
+USER_CFLAGS = $(patsubst $(KERNEL_DEFINES),,$(patsubst -I%,,$(KBUILD_CFLAGS))) \
+ $(ARCH_INCLUDE) $(MODE_INCLUDE) $(filter -I%,$(CFLAGS)) \
+ -D_FILE_OFFSET_BITS=64 -idirafter include \
+ -D__KERNEL__ -D__UM_HOST__
#This will adjust *FLAGS accordingly to the platform.
include $(ARCH_DIR)/Makefile-os-$(OS)
diff --git a/arch/um/drivers/harddog_user.c b/arch/um/drivers/harddog_user.c
index f99b32a4dbff..3aa8b0d52a48 100644
--- a/arch/um/drivers/harddog_user.c
+++ b/arch/um/drivers/harddog_user.c
@@ -9,8 +9,8 @@
#include <os.h>
struct dog_data {
- int stdin;
- int stdout;
+ int stdin_fd;
+ int stdout_fd;
int close_me[2];
};
@@ -18,11 +18,11 @@ static void pre_exec(void *d)
{
struct dog_data *data = d;
- dup2(data->stdin, 0);
- dup2(data->stdout, 1);
- dup2(data->stdout, 2);
- close(data->stdin);
- close(data->stdout);
+ dup2(data->stdin_fd, 0);
+ dup2(data->stdout_fd, 1);
+ dup2(data->stdout_fd, 2);
+ close(data->stdin_fd);
+ close(data->stdout_fd);
close(data->close_me[0]);
close(data->close_me[1]);
}
@@ -49,8 +49,8 @@ int start_watchdog(int *in_fd_ret, int *out_fd_ret, char *sock)
goto out_close_in;
}
- data.stdin = out_fds[0];
- data.stdout = in_fds[1];
+ data.stdin_fd = out_fds[0];
+ data.stdout_fd = in_fds[1];
data.close_me[0] = out_fds[1];
data.close_me[1] = in_fds[0];
diff --git a/arch/um/drivers/mconsole.h b/arch/um/drivers/mconsole.h
index 8b22535c62ce..44af7379ea19 100644
--- a/arch/um/drivers/mconsole.h
+++ b/arch/um/drivers/mconsole.h
@@ -7,7 +7,7 @@
#ifndef __MCONSOLE_H__
#define __MCONSOLE_H__
-#ifndef __KERNEL__
+#ifdef __UM_HOST__
#include <stdint.h>
#define u32 uint32_t
#endif
diff --git a/arch/um/drivers/net_user.c b/arch/um/drivers/net_user.c
index cd14157b556d..e697a4136707 100644
--- a/arch/um/drivers/net_user.c
+++ b/arch/um/drivers/net_user.c
@@ -166,7 +166,7 @@ int net_sendto(int fd, void *buf, int len, void *to, int sock_len)
struct change_pre_exec_data {
int close_me;
- int stdout;
+ int stdout_fd;
};
static void change_pre_exec(void *arg)
@@ -174,7 +174,7 @@ static void change_pre_exec(void *arg)
struct change_pre_exec_data *data = arg;
close(data->close_me);
- dup2(data->stdout, 1);
+ dup2(data->stdout_fd, 1);
}
static int change_tramp(char **argv, char *output, int output_len)
@@ -189,7 +189,7 @@ static int change_tramp(char **argv, char *output, int output_len)
return err;
}
pe_data.close_me = fds[0];
- pe_data.stdout = fds[1];
+ pe_data.stdout_fd = fds[1];
pid = run_helper(change_pre_exec, &pe_data, argv);
if (pid > 0) /* Avoid hang as we won't get data in failure case. */
diff --git a/arch/um/drivers/slip_user.c b/arch/um/drivers/slip_user.c
index 55c290d925f3..0d6b66c64a81 100644
--- a/arch/um/drivers/slip_user.c
+++ b/arch/um/drivers/slip_user.c
@@ -55,8 +55,8 @@ static int set_up_tty(int fd)
}
struct slip_pre_exec_data {
- int stdin;
- int stdout;
+ int stdin_fd;
+ int stdout_fd;
int close_me;
};
@@ -64,9 +64,9 @@ static void slip_pre_exec(void *arg)
{
struct slip_pre_exec_data *data = arg;
- if (data->stdin >= 0)
- dup2(data->stdin, 0);
- dup2(data->stdout, 1);
+ if (data->stdin_fd >= 0)
+ dup2(data->stdin_fd, 0);
+ dup2(data->stdout_fd, 1);
if (data->close_me >= 0)
close(data->close_me);
}
@@ -85,8 +85,8 @@ static int slip_tramp(char **argv, int fd)
}
err = 0;
- pe_data.stdin = fd;
- pe_data.stdout = fds[1];
+ pe_data.stdin_fd = fd;
+ pe_data.stdout_fd = fds[1];
pe_data.close_me = fds[0];
err = run_helper(slip_pre_exec, &pe_data, argv);
if (err < 0)
diff --git a/arch/um/drivers/slirp_user.c b/arch/um/drivers/slirp_user.c
index c999d187abb9..98b6a41a254e 100644
--- a/arch/um/drivers/slirp_user.c
+++ b/arch/um/drivers/slirp_user.c
@@ -20,18 +20,18 @@ static int slirp_user_init(void *data, void *dev)
}
struct slirp_pre_exec_data {
- int stdin;
- int stdout;
+ int stdin_fd;
+ int stdout_fd;
};
static void slirp_pre_exec(void *arg)
{
struct slirp_pre_exec_data *data = arg;
- if (data->stdin != -1)
- dup2(data->stdin, 0);
- if (data->stdout != -1)
- dup2(data->stdout, 1);
+ if (data->stdin_fd != -1)
+ dup2(data->stdin_fd, 0);
+ if (data->stdout_fd != -1)
+ dup2(data->stdout_fd, 1);
}
static int slirp_tramp(char **argv, int fd)
@@ -39,8 +39,8 @@ static int slirp_tramp(char **argv, int fd)
struct slirp_pre_exec_data pe_data;
int pid;
- pe_data.stdin = fd;
- pe_data.stdout = fd;
+ pe_data.stdin_fd = fd;
+ pe_data.stdout_fd = fd;
pid = run_helper(slirp_pre_exec, &pe_data, argv);
return pid;
diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild
index b7df3ae9be51..3d63ff6f583f 100644
--- a/arch/um/include/asm/Kbuild
+++ b/arch/um/include/asm/Kbuild
@@ -21,7 +21,6 @@ generic-y += param.h
generic-y += pci.h
generic-y += percpu.h
generic-y += preempt.h
-generic-y += sections.h
generic-y += switch_to.h
generic-y += topology.h
generic-y += trace_clock.h
diff --git a/arch/um/include/asm/ptrace-generic.h b/arch/um/include/asm/ptrace-generic.h
index cb9b3c47ca8e..2966adbbdf6c 100644
--- a/arch/um/include/asm/ptrace-generic.h
+++ b/arch/um/include/asm/ptrace-generic.h
@@ -8,7 +8,6 @@
#ifndef __ASSEMBLY__
-#include <asm/ptrace-abi.h>
#include <sysdep/ptrace.h>
struct pt_regs {
@@ -37,7 +36,7 @@ extern int putreg(struct task_struct *child, int regno, unsigned long value);
extern int arch_copy_tls(struct task_struct *new);
extern void clear_flushed_tls(struct task_struct *task);
-extern void syscall_trace_enter(struct pt_regs *regs);
+extern int syscall_trace_enter(struct pt_regs *regs);
extern void syscall_trace_leave(struct pt_regs *regs);
#endif
diff --git a/arch/um/include/asm/sections.h b/arch/um/include/asm/sections.h
new file mode 100644
index 000000000000..cafcf684d947
--- /dev/null
+++ b/arch/um/include/asm/sections.h
@@ -0,0 +1,9 @@
+#ifndef __UM_SECTIONS_H
+#define __UM_SECTIONS_H
+
+#include <asm-generic/sections.h>
+
+extern char __binary_start[];
+extern char __syscall_stub_start[], __syscall_stub_end[];
+
+#endif
diff --git a/arch/um/include/asm/thread_info.h b/arch/um/include/asm/thread_info.h
index b30c85b141d9..53968aaf76f9 100644
--- a/arch/um/include/asm/thread_info.h
+++ b/arch/um/include/asm/thread_info.h
@@ -10,7 +10,7 @@
#include <asm/types.h>
#include <asm/page.h>
-#include <asm/uaccess.h>
+#include <asm/segment.h>
struct thread_info {
struct task_struct *task; /* main task structure */
diff --git a/arch/um/include/asm/uaccess.h b/arch/um/include/asm/uaccess.h
index 3f22fbf7ca1d..3705620ca298 100644
--- a/arch/um/include/asm/uaccess.h
+++ b/arch/um/include/asm/uaccess.h
@@ -1,178 +1,52 @@
/*
* Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
+ * Copyright (C) 2015 Richard Weinberger (richard@nod.at)
* Licensed under the GPL
*/
#ifndef __UM_UACCESS_H
#define __UM_UACCESS_H
-/* thread_info has a mm_segment_t in it, so put the definition up here */
-typedef struct {
- unsigned long seg;
-} mm_segment_t;
-
-#include <linux/thread_info.h>
-#include <linux/errno.h>
-#include <asm/processor.h>
+#include <asm/thread_info.h>
#include <asm/elf.h>
-#define VERIFY_READ 0
-#define VERIFY_WRITE 1
-
-/*
- * The fs value determines whether argument validity checking should be
- * performed or not. If get_fs() == USER_DS, checking is performed, with
- * get_fs() == KERNEL_DS, checking is bypassed.
- *
- * For historical reasons, these macros are grossly misnamed.
- */
-
-#define MAKE_MM_SEG(s) ((mm_segment_t) { (s) })
-
-#define KERNEL_DS MAKE_MM_SEG(0xFFFFFFFF)
-#define USER_DS MAKE_MM_SEG(TASK_SIZE)
-
-#define get_ds() (KERNEL_DS)
-#define get_fs() (current_thread_info()->addr_limit)
-#define set_fs(x) (current_thread_info()->addr_limit = (x))
-
-#define segment_eq(a, b) ((a).seg == (b).seg)
-
#define __under_task_size(addr, size) \
(((unsigned long) (addr) < TASK_SIZE) && \
(((unsigned long) (addr) + (size)) < TASK_SIZE))
-#define __access_ok_vsyscall(type, addr, size) \
- ((type == VERIFY_READ) && \
- ((unsigned long) (addr) >= FIXADDR_USER_START) && \
+#define __access_ok_vsyscall(addr, size) \
+ (((unsigned long) (addr) >= FIXADDR_USER_START) && \
((unsigned long) (addr) + (size) <= FIXADDR_USER_END) && \
((unsigned long) (addr) + (size) >= (unsigned long)(addr)))
#define __addr_range_nowrap(addr, size) \
((unsigned long) (addr) <= ((unsigned long) (addr) + (size)))
-#define access_ok(type, addr, size) \
- (__addr_range_nowrap(addr, size) && \
- (__under_task_size(addr, size) || \
- __access_ok_vsyscall(type, addr, size) || \
- segment_eq(get_fs(), KERNEL_DS)))
-
-extern int copy_from_user(void *to, const void __user *from, int n);
-extern int copy_to_user(void __user *to, const void *from, int n);
-
-/*
- * strncpy_from_user: - Copy a NUL terminated string from userspace.
- * @dst: Destination address, in kernel space. This buffer must be at
- * least @count bytes long.
- * @src: Source address, in user space.
- * @count: Maximum number of bytes to copy, including the trailing NUL.
- *
- * Copies a NUL-terminated string from userspace to kernel space.
- *
- * On success, returns the length of the string (not including the trailing
- * NUL).
- *
- * If access to userspace fails, returns -EFAULT (some data may have been
- * copied).
- *
- * If @count is smaller than the length of the string, copies @count bytes
- * and returns @count.
- */
-
-extern int strncpy_from_user(char *dst, const char __user *src, int count);
-
-/*
- * __clear_user: - Zero a block of memory in user space, with less checking.
- * @to: Destination address, in user space.
- * @n: Number of bytes to zero.
- *
- * Zero a block of memory in user space. Caller must check
- * the specified block with access_ok() before calling this function.
- *
- * Returns number of bytes that could not be cleared.
- * On success, this will be zero.
- */
-extern int __clear_user(void __user *mem, int len);
-
-/*
- * clear_user: - Zero a block of memory in user space.
- * @to: Destination address, in user space.
- * @n: Number of bytes to zero.
- *
- * Zero a block of memory in user space.
- *
- * Returns number of bytes that could not be cleared.
- * On success, this will be zero.
- */
-extern int clear_user(void __user *mem, int len);
-
-/*
- * strlen_user: - Get the size of a string in user space.
- * @str: The string to measure.
- * @n: The maximum valid length
- *
- * Get the size of a NUL-terminated string in user space.
- *
- * Returns the size of the string INCLUDING the terminating NUL.
- * On exception, returns 0.
- * If the string is too long, returns a value greater than @n.
- */
-extern int strnlen_user(const void __user *str, int len);
-
-#define __copy_from_user(to, from, n) copy_from_user(to, from, n)
-
-#define __copy_to_user(to, from, n) copy_to_user(to, from, n)
-
+extern long __copy_from_user(void *to, const void __user *from, unsigned long n);
+extern long __copy_to_user(void __user *to, const void *from, unsigned long n);
+extern long __strncpy_from_user(char *dst, const char __user *src, long count);
+extern long __strnlen_user(const void __user *str, long len);
+extern unsigned long __clear_user(void __user *mem, unsigned long len);
+static inline int __access_ok(unsigned long addr, unsigned long size);
+
+/* Teach asm-generic/uaccess.h that we have C functions for these. */
+#define __access_ok __access_ok
+#define __clear_user __clear_user
+#define __copy_to_user __copy_to_user
+#define __copy_from_user __copy_from_user
+#define __strnlen_user __strnlen_user
+#define __strncpy_from_user __strncpy_from_user
#define __copy_to_user_inatomic __copy_to_user
#define __copy_from_user_inatomic __copy_from_user
-#define __get_user(x, ptr) \
-({ \
- const __typeof__(*(ptr)) __user *__private_ptr = (ptr); \
- __typeof__(x) __private_val; \
- int __private_ret = -EFAULT; \
- (x) = (__typeof__(*(__private_ptr)))0; \
- if (__copy_from_user((__force void *)&__private_val, (__private_ptr),\
- sizeof(*(__private_ptr))) == 0) { \
- (x) = (__typeof__(*(__private_ptr))) __private_val; \
- __private_ret = 0; \
- } \
- __private_ret; \
-})
-
-#define get_user(x, ptr) \
-({ \
- const __typeof__((*(ptr))) __user *private_ptr = (ptr); \
- (access_ok(VERIFY_READ, private_ptr, sizeof(*private_ptr)) ? \
- __get_user(x, private_ptr) : ((x) = (__typeof__(*ptr))0, -EFAULT)); \
-})
-
-#define __put_user(x, ptr) \
-({ \
- __typeof__(*(ptr)) __user *__private_ptr = ptr; \
- __typeof__(*(__private_ptr)) __private_val; \
- int __private_ret = -EFAULT; \
- __private_val = (__typeof__(*(__private_ptr))) (x); \
- if (__copy_to_user((__private_ptr), &__private_val, \
- sizeof(*(__private_ptr))) == 0) { \
- __private_ret = 0; \
- } \
- __private_ret; \
-})
-
-#define put_user(x, ptr) \
-({ \
- __typeof__(*(ptr)) __user *private_ptr = (ptr); \
- (access_ok(VERIFY_WRITE, private_ptr, sizeof(*private_ptr)) ? \
- __put_user(x, private_ptr) : -EFAULT); \
-})
-
-#define strlen_user(str) strnlen_user(str, ~0U >> 1)
+#include <asm-generic/uaccess.h>
-struct exception_table_entry
+static inline int __access_ok(unsigned long addr, unsigned long size)
{
- unsigned long insn;
- unsigned long fixup;
-};
+ return __addr_range_nowrap(addr, size) &&
+ (__under_task_size(addr, size) ||
+ __access_ok_vsyscall(addr, size) ||
+ segment_eq(get_fs(), KERNEL_DS));
+}
#endif
diff --git a/arch/um/include/shared/init.h b/arch/um/include/shared/init.h
index b3906f860a87..233e2593eee0 100644
--- a/arch/um/include/shared/init.h
+++ b/arch/um/include/shared/init.h
@@ -40,28 +40,8 @@
typedef int (*initcall_t)(void);
typedef void (*exitcall_t)(void);
-#ifndef __KERNEL__
-#ifndef __section
-# define __section(S) __attribute__ ((__section__(#S)))
-#endif
-
-#if __GNUC__ == 3
-
-#if __GNUC_MINOR__ >= 3
-# define __used __attribute__((__used__))
-#else
-# define __used __attribute__((__unused__))
-#endif
-
-#else
-#if __GNUC__ == 4
-# define __used __attribute__((__used__))
-#endif
-#endif
-
-#else
#include <linux/compiler.h>
-#endif
+
/* These are for everybody (although not all archs will actually
discard it in modules) */
#define __init __section(.init.text)
@@ -131,7 +111,7 @@ extern struct uml_param __uml_setup_start, __uml_setup_end;
#define __uml_postsetup_call __used __section(.uml.postsetup.init)
#define __uml_exit_call __used __section(.uml.exitcall.exit)
-#ifndef __KERNEL__
+#ifdef __UM_HOST__
#define __define_initcall(level,fn) \
static initcall_t __initcall_##fn __used \
diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h
index d824528f6f62..ad3fa3ae6d34 100644
--- a/arch/um/include/shared/os.h
+++ b/arch/um/include/shared/os.h
@@ -301,4 +301,6 @@ extern int get_pty(void);
/* sys-$ARCH/task_size.c */
extern unsigned long os_get_top_address(void);
+long syscall(long number, ...);
+
#endif
diff --git a/arch/um/include/shared/user.h b/arch/um/include/shared/user.h
index cef068563336..4cff19f6207a 100644
--- a/arch/um/include/shared/user.h
+++ b/arch/um/include/shared/user.h
@@ -17,7 +17,7 @@
#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
/* This is to get size_t */
-#ifdef __KERNEL__
+#ifndef __UM_HOST__
#include <linux/types.h>
#else
#include <stddef.h>
diff --git a/arch/um/kernel/ksyms.c b/arch/um/kernel/ksyms.c
index 543c04756939..232b22307fdd 100644
--- a/arch/um/kernel/ksyms.c
+++ b/arch/um/kernel/ksyms.c
@@ -42,3 +42,5 @@ EXPORT_SYMBOL(os_makedev);
EXPORT_SYMBOL(add_sigio_fd);
EXPORT_SYMBOL(ignore_sigio_fd);
EXPORT_SYMBOL(sigio_broken);
+
+EXPORT_SYMBOL(syscall);
diff --git a/arch/um/kernel/physmem.c b/arch/um/kernel/physmem.c
index 9034fc8056b4..4c9861b421fd 100644
--- a/arch/um/kernel/physmem.c
+++ b/arch/um/kernel/physmem.c
@@ -8,6 +8,7 @@
#include <linux/mm.h>
#include <linux/pfn.h>
#include <asm/page.h>
+#include <asm/sections.h>
#include <as-layout.h>
#include <init.h>
#include <kern.h>
@@ -55,8 +56,6 @@ void map_memory(unsigned long virt, unsigned long phys, unsigned long len,
}
}
-extern int __syscall_stub_start;
-
/**
* setup_physmem() - Setup physical memory for UML
* @start: Start address of the physical kernel memory,
@@ -110,8 +109,8 @@ void __init setup_physmem(unsigned long start, unsigned long reserve_end,
* Special kludge - This page will be mapped in to userspace processes
* from physmem_fd, so it needs to be written out there.
*/
- os_seek_file(physmem_fd, __pa(&__syscall_stub_start));
- os_write_file(physmem_fd, &__syscall_stub_start, PAGE_SIZE);
+ os_seek_file(physmem_fd, __pa(__syscall_stub_start));
+ os_write_file(physmem_fd, __syscall_stub_start, PAGE_SIZE);
os_fsync_file(physmem_fd);
bootmap_size = init_bootmem(pfn, pfn + delta);
diff --git a/arch/um/kernel/ptrace.c b/arch/um/kernel/ptrace.c
index 174ee5017264..6a826cbb15c4 100644
--- a/arch/um/kernel/ptrace.c
+++ b/arch/um/kernel/ptrace.c
@@ -8,6 +8,7 @@
#include <linux/sched.h>
#include <linux/tracehook.h>
#include <asm/uaccess.h>
+#include <asm/ptrace-abi.h>
void user_enable_single_step(struct task_struct *child)
{
@@ -131,7 +132,7 @@ static void send_sigtrap(struct task_struct *tsk, struct uml_pt_regs *regs,
* XXX Check PT_DTRACE vs TIF_SINGLESTEP for singlestepping check and
* PT_PTRACED vs TIF_SYSCALL_TRACE for syscall tracing check
*/
-void syscall_trace_enter(struct pt_regs *regs)
+int syscall_trace_enter(struct pt_regs *regs)
{
audit_syscall_entry(UPT_SYSCALL_NR(&regs->regs),
UPT_SYSCALL_ARG1(&regs->regs),
@@ -140,9 +141,9 @@ void syscall_trace_enter(struct pt_regs *regs)
UPT_SYSCALL_ARG4(&regs->regs));
if (!test_thread_flag(TIF_SYSCALL_TRACE))
- return;
+ return 0;
- tracehook_report_syscall_entry(regs);
+ return tracehook_report_syscall_entry(regs);
}
void syscall_trace_leave(struct pt_regs *regs)
diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c
index 94abdcc1d6ad..fda1deba1757 100644
--- a/arch/um/kernel/skas/mmu.c
+++ b/arch/um/kernel/skas/mmu.c
@@ -8,12 +8,11 @@
#include <linux/slab.h>
#include <asm/pgalloc.h>
#include <asm/pgtable.h>
+#include <asm/sections.h>
#include <as-layout.h>
#include <os.h>
#include <skas.h>
-extern int __syscall_stub_start;
-
static int init_stub_pte(struct mm_struct *mm, unsigned long proc,
unsigned long kernel)
{
@@ -93,7 +92,7 @@ void uml_setup_stubs(struct mm_struct *mm)
int err, ret;
ret = init_stub_pte(mm, STUB_CODE,
- (unsigned long) &__syscall_stub_start);
+ (unsigned long) __syscall_stub_start);
if (ret)
goto out;
@@ -101,7 +100,7 @@ void uml_setup_stubs(struct mm_struct *mm)
if (ret)
goto out;
- mm->context.stub_pages[0] = virt_to_page(&__syscall_stub_start);
+ mm->context.stub_pages[0] = virt_to_page(__syscall_stub_start);
mm->context.stub_pages[1] = virt_to_page(mm->context.id.stack);
/* dup_mmap already holds mmap_sem */
diff --git a/arch/um/kernel/skas/syscall.c b/arch/um/kernel/skas/syscall.c
index c0681e097432..d9ec0068b623 100644
--- a/arch/um/kernel/skas/syscall.c
+++ b/arch/um/kernel/skas/syscall.c
@@ -18,7 +18,10 @@ void handle_syscall(struct uml_pt_regs *r)
long result;
int syscall;
- syscall_trace_enter(regs);
+ if (syscall_trace_enter(regs)) {
+ result = -ENOSYS;
+ goto out;
+ }
/*
* This should go in the declaration of syscall, but when I do that,
@@ -34,6 +37,7 @@ void handle_syscall(struct uml_pt_regs *r)
result = -ENOSYS;
else result = EXECUTE_SYSCALL(syscall, regs);
+out:
PT_REGS_SET_SYSCALL_RETURN(regs, result);
syscall_trace_leave(regs);
diff --git a/arch/um/kernel/skas/uaccess.c b/arch/um/kernel/skas/uaccess.c
index 4ffb644d6c07..85ac8adb069b 100644
--- a/arch/um/kernel/skas/uaccess.c
+++ b/arch/um/kernel/skas/uaccess.c
@@ -87,10 +87,10 @@ static int do_op_one_page(unsigned long addr, int len, int is_write,
return n;
}
-static int buffer_op(unsigned long addr, int len, int is_write,
- int (*op)(unsigned long, int, void *), void *arg)
+static long buffer_op(unsigned long addr, int len, int is_write,
+ int (*op)(unsigned long, int, void *), void *arg)
{
- int size, remain, n;
+ long size, remain, n;
size = min(PAGE_ALIGN(addr) - addr, (unsigned long) len);
remain = len;
@@ -139,18 +139,16 @@ static int copy_chunk_from_user(unsigned long from, int len, void *arg)
return 0;
}
-int copy_from_user(void *to, const void __user *from, int n)
+long __copy_from_user(void *to, const void __user *from, unsigned long n)
{
if (segment_eq(get_fs(), KERNEL_DS)) {
memcpy(to, (__force void*)from, n);
return 0;
}
- return access_ok(VERIFY_READ, from, n) ?
- buffer_op((unsigned long) from, n, 0, copy_chunk_from_user, &to):
- n;
+ return buffer_op((unsigned long) from, n, 0, copy_chunk_from_user, &to);
}
-EXPORT_SYMBOL(copy_from_user);
+EXPORT_SYMBOL(__copy_from_user);
static int copy_chunk_to_user(unsigned long to, int len, void *arg)
{
@@ -161,18 +159,16 @@ static int copy_chunk_to_user(unsigned long to, int len, void *arg)
return 0;
}
-int copy_to_user(void __user *to, const void *from, int n)
+long __copy_to_user(void __user *to, const void *from, unsigned long n)
{
if (segment_eq(get_fs(), KERNEL_DS)) {
memcpy((__force void *) to, from, n);
return 0;
}
- return access_ok(VERIFY_WRITE, to, n) ?
- buffer_op((unsigned long) to, n, 1, copy_chunk_to_user, &from) :
- n;
+ return buffer_op((unsigned long) to, n, 1, copy_chunk_to_user, &from);
}
-EXPORT_SYMBOL(copy_to_user);
+EXPORT_SYMBOL(__copy_to_user);
static int strncpy_chunk_from_user(unsigned long from, int len, void *arg)
{
@@ -188,9 +184,9 @@ static int strncpy_chunk_from_user(unsigned long from, int len, void *arg)
return 0;
}
-int strncpy_from_user(char *dst, const char __user *src, int count)
+long __strncpy_from_user(char *dst, const char __user *src, long count)
{
- int n;
+ long n;
char *ptr = dst;
if (segment_eq(get_fs(), KERNEL_DS)) {
@@ -198,16 +194,13 @@ int strncpy_from_user(char *dst, const char __user *src, int count)
return strnlen(dst, count);
}
- if (!access_ok(VERIFY_READ, src, 1))
- return -EFAULT;
-
n = buffer_op((unsigned long) src, count, 0, strncpy_chunk_from_user,
&ptr);
if (n != 0)
return -EFAULT;
return strnlen(dst, count);
}
-EXPORT_SYMBOL(strncpy_from_user);
+EXPORT_SYMBOL(__strncpy_from_user);
static int clear_chunk(unsigned long addr, int len, void *unused)
{
@@ -215,22 +208,16 @@ static int clear_chunk(unsigned long addr, int len, void *unused)
return 0;
}
-int __clear_user(void __user *mem, int len)
-{
- return buffer_op((unsigned long) mem, len, 1, clear_chunk, NULL);
-}
-
-int clear_user(void __user *mem, int len)
+unsigned long __clear_user(void __user *mem, unsigned long len)
{
if (segment_eq(get_fs(), KERNEL_DS)) {
memset((__force void*)mem, 0, len);
return 0;
}
- return access_ok(VERIFY_WRITE, mem, len) ?
- buffer_op((unsigned long) mem, len, 1, clear_chunk, NULL) : len;
+ return buffer_op((unsigned long) mem, len, 1, clear_chunk, NULL);
}
-EXPORT_SYMBOL(clear_user);
+EXPORT_SYMBOL(__clear_user);
static int strnlen_chunk(unsigned long str, int len, void *arg)
{
@@ -244,7 +231,7 @@ static int strnlen_chunk(unsigned long str, int len, void *arg)
return 0;
}
-int strnlen_user(const void __user *str, int len)
+long __strnlen_user(const void __user *str, long len)
{
int count = 0, n;
@@ -256,4 +243,4 @@ int strnlen_user(const void __user *str, int len)
return count + 1;
return 0;
}
-EXPORT_SYMBOL(strnlen_user);
+EXPORT_SYMBOL(__strnlen_user);
diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c
index 47ff9b7f3e5d..557232f758b6 100644
--- a/arch/um/kernel/trap.c
+++ b/arch/um/kernel/trap.c
@@ -220,6 +220,11 @@ unsigned long segv(struct faultinfo fi, unsigned long ip, int is_user,
show_regs(container_of(regs, struct pt_regs, regs));
panic("Segfault with no mm");
}
+ else if (!is_user && address < TASK_SIZE) {
+ show_regs(container_of(regs, struct pt_regs, regs));
+ panic("Kernel tried to access user memory at addr 0x%lx, ip 0x%lx",
+ address, ip);
+ }
if (SEGV_IS_FIXABLE(&fi))
err = handle_page_fault(address, ip, is_write, is_user,
diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c
index 07f798f4bcee..16630e75f056 100644
--- a/arch/um/kernel/um_arch.c
+++ b/arch/um/kernel/um_arch.c
@@ -248,8 +248,6 @@ EXPORT_SYMBOL(end_iomem);
#define MIN_VMALLOC (32 * 1024 * 1024)
-extern char __binary_start;
-
int __init linux_main(int argc, char **argv)
{
unsigned long avail, diff;
@@ -294,7 +292,7 @@ int __init linux_main(int argc, char **argv)
physmem_size += UML_ROUND_UP(brk_start) - UML_ROUND_UP(&_end);
}
- uml_physmem = (unsigned long) &__binary_start & PAGE_MASK;
+ uml_physmem = (unsigned long) __binary_start & PAGE_MASK;
/* Reserve up to 4M after the current brk */
uml_reserved = ROUND_4M(brk_start) + (1 << 22);
diff --git a/arch/um/os-Linux/drivers/tuntap_user.c b/arch/um/os-Linux/drivers/tuntap_user.c
index 14126d9176aa..c2e6e1dad876 100644
--- a/arch/um/os-Linux/drivers/tuntap_user.c
+++ b/arch/um/os-Linux/drivers/tuntap_user.c
@@ -47,7 +47,7 @@ static void tuntap_del_addr(unsigned char *addr, unsigned char *netmask,
}
struct tuntap_pre_exec_data {
- int stdout;
+ int stdout_fd;
int close_me;
};
@@ -55,7 +55,7 @@ static void tuntap_pre_exec(void *arg)
{
struct tuntap_pre_exec_data *data = arg;
- dup2(data->stdout, 1);
+ dup2(data->stdout_fd, 1);
close(data->close_me);
}
@@ -74,7 +74,7 @@ static int tuntap_open_tramp(char *gate, int *fd_out, int me, int remote,
sprintf(version_buf, "%d", UML_NET_VERSION);
- data.stdout = remote;
+ data.stdout_fd = remote;
data.close_me = me;
pid = run_helper(tuntap_pre_exec, &data, argv);
diff --git a/arch/um/os-Linux/file.c b/arch/um/os-Linux/file.c
index 08d90fba952c..26e0164895e4 100644
--- a/arch/um/os-Linux/file.c
+++ b/arch/um/os-Linux/file.c
@@ -13,6 +13,7 @@
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/un.h>
+#include <sys/types.h>
#include <os.h>
static void copy_stat(struct uml_stat *dst, const struct stat64 *src)
diff --git a/arch/um/os-Linux/signal.c b/arch/um/os-Linux/signal.c
index 7b605e4dfffa..036d0dbc7b52 100644
--- a/arch/um/os-Linux/signal.c
+++ b/arch/um/os-Linux/signal.c
@@ -112,9 +112,11 @@ void timer_init(void)
void set_sigstack(void *sig_stack, int size)
{
- stack_t stack = ((stack_t) { .ss_flags = 0,
- .ss_sp = (__ptr_t) sig_stack,
- .ss_size = size - sizeof(void *) });
+ stack_t stack = {
+ .ss_flags = 0,
+ .ss_sp = sig_stack,
+ .ss_size = size - sizeof(void *)
+ };
if (sigaltstack(&stack, NULL) != 0)
panic("enabling signal stack failed, errno = %d\n", errno);
diff --git a/arch/um/os-Linux/skas/mem.c b/arch/um/os-Linux/skas/mem.c
index e7f8c945a573..35015e3e1e87 100644
--- a/arch/um/os-Linux/skas/mem.c
+++ b/arch/um/os-Linux/skas/mem.c
@@ -18,7 +18,7 @@
#include <sysdep/ptrace.h>
#include <sysdep/stub.h>
-extern unsigned long batch_syscall_stub, __syscall_stub_start;
+extern char batch_syscall_stub[], __syscall_stub_start[];
extern void wait_stub_done(int pid);
@@ -38,8 +38,8 @@ static int __init init_syscall_regs(void)
{
get_safe_registers(syscall_regs, NULL);
syscall_regs[REGS_IP_INDEX] = STUB_CODE +
- ((unsigned long) &batch_syscall_stub -
- (unsigned long) &__syscall_stub_start);
+ ((unsigned long) batch_syscall_stub -
+ (unsigned long) __syscall_stub_start);
return 0;
}
diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c
index 7a9777570a62..3dddedba3a07 100644
--- a/arch/um/os-Linux/skas/process.c
+++ b/arch/um/os-Linux/skas/process.c
@@ -174,7 +174,7 @@ static void handle_trap(int pid, struct uml_pt_regs *regs,
handle_syscall(regs);
}
-extern int __syscall_stub_start;
+extern char __syscall_stub_start[];
static int userspace_tramp(void *stack)
{
@@ -197,7 +197,7 @@ static int userspace_tramp(void *stack)
* This has a pte, but it can't be mapped in with the usual
* tlb_flush mechanism because this is part of that mechanism
*/
- fd = phys_mapping(to_phys(&__syscall_stub_start), &offset);
+ fd = phys_mapping(to_phys(__syscall_stub_start), &offset);
addr = mmap64((void *) STUB_CODE, UM_KERN_PAGE_SIZE,
PROT_EXEC, MAP_FIXED | MAP_PRIVATE, fd, offset);
if (addr == MAP_FAILED) {
@@ -223,7 +223,7 @@ static int userspace_tramp(void *stack)
unsigned long v = STUB_CODE +
(unsigned long) stub_segv_handler -
- (unsigned long) &__syscall_stub_start;
+ (unsigned long) __syscall_stub_start;
set_sigstack((void *) STUB_DATA, UM_KERN_PAGE_SIZE);
sigemptyset(&sa.sa_mask);
@@ -447,7 +447,7 @@ static int __init init_thread_regs(void)
/* Set parent's instruction pointer to start of clone-stub */
thread_regs[REGS_IP_INDEX] = STUB_CODE +
(unsigned long) stub_clone_handler -
- (unsigned long) &__syscall_stub_start;
+ (unsigned long) __syscall_stub_start;
thread_regs[REGS_SP_INDEX] = STUB_DATA + UM_KERN_PAGE_SIZE -
sizeof(void *);
#ifdef __SIGNAL_FRAMESIZE
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 4fcf0ade7e91..d05a42357ef0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -27,6 +27,7 @@ config X86
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_FAST_MULTIPLIER
select ARCH_HAS_GCOV_PROFILE_ALL
+ select ARCH_HAS_PMEM_API
select ARCH_HAS_SG_CHAIN
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
@@ -1419,6 +1420,9 @@ source "mm/Kconfig"
config X86_PMEM_LEGACY
bool "Support non-standard NVDIMMs and ADR protected memory"
+ depends on PHYS_ADDR_T_64BIT
+ depends on BLK_DEV
+ select LIBNVDIMM
help
Treat memory marked using the non-standard e820 type of 12 as used
by the Intel Sandy Bridge-EP reference BIOS as protected memory.
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index 48304b89b601..2c82bd150d43 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -1224,6 +1224,10 @@ static efi_status_t setup_e820(struct boot_params *params,
e820_type = E820_NVS;
break;
+ case EFI_PERSISTENT_MEMORY:
+ e820_type = E820_PMEM;
+ break;
+
default:
continue;
}
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index b6f7457d12e4..9bf3ea14b9f0 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -4,6 +4,7 @@
/* Caches aren't brain-dead on the intel. */
#include <asm-generic/cacheflush.h>
#include <asm/special_insns.h>
+#include <asm/uaccess.h>
/*
* The set_memory_* API can be used to change various attributes of a virtual
@@ -108,4 +109,75 @@ static inline int rodata_test(void)
}
#endif
+#ifdef ARCH_HAS_NOCACHE_UACCESS
+
+/**
+ * arch_memcpy_to_pmem - copy data to persistent memory
+ * @dst: destination buffer for the copy
+ * @src: source buffer for the copy
+ * @n: length of the copy in bytes
+ *
+ * Copy data to persistent memory media via non-temporal stores so that
+ * a subsequent arch_wmb_pmem() can flush cpu and memory controller
+ * write buffers to guarantee durability.
+ */
+static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src,
+ size_t n)
+{
+ int unwritten;
+
+ /*
+ * We are copying between two kernel buffers, if
+ * __copy_from_user_inatomic_nocache() returns an error (page
+ * fault) we would have already reported a general protection fault
+ * before the WARN+BUG.
+ */
+ unwritten = __copy_from_user_inatomic_nocache((void __force *) dst,
+ (void __user *) src, n);
+ if (WARN(unwritten, "%s: fault copying %p <- %p unwritten: %d\n",
+ __func__, dst, src, unwritten))
+ BUG();
+}
+
+/**
+ * arch_wmb_pmem - synchronize writes to persistent memory
+ *
+ * After a series of arch_memcpy_to_pmem() operations this drains data
+ * from cpu write buffers and any platform (memory controller) buffers
+ * to ensure that written data is durable on persistent memory media.
+ */
+static inline void arch_wmb_pmem(void)
+{
+ /*
+ * wmb() to 'sfence' all previous writes such that they are
+ * architecturally visible to 'pcommit'. Note, that we've
+ * already arranged for pmem writes to avoid the cache via
+ * arch_memcpy_to_pmem().
+ */
+ wmb();
+ pcommit_sfence();
+}
+
+static inline bool __arch_has_wmb_pmem(void)
+{
+#ifdef CONFIG_X86_64
+ /*
+ * We require that wmb() be an 'sfence', that is only guaranteed on
+ * 64-bit builds
+ */
+ return static_cpu_has(X86_FEATURE_PCOMMIT);
+#else
+ return false;
+#endif
+}
+#else /* ARCH_HAS_NOCACHE_UACCESS i.e. ARCH=um */
+extern void arch_memcpy_to_pmem(void __pmem *dst, const void *src, size_t n);
+extern void arch_wmb_pmem(void);
+
+static inline bool __arch_has_wmb_pmem(void)
+{
+ return false;
+}
+#endif
+
#endif /* _ASM_X86_CACHEFLUSH_H */
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 83ec9b1d77cc..cc9c61bc1abe 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -248,6 +248,12 @@ static inline void flush_write_buffers(void)
#endif
}
+static inline void __pmem *arch_memremap_pmem(resource_size_t offset,
+ unsigned long size)
+{
+ return (void __force __pmem *) ioremap_cache(offset, size);
+}
+
#endif /* __KERNEL__ */
extern void native_io_delay(void);
diff --git a/arch/x86/include/uapi/asm/e820.h b/arch/x86/include/uapi/asm/e820.h
index 960a8a9dc4ab..0f457e6eab18 100644
--- a/arch/x86/include/uapi/asm/e820.h
+++ b/arch/x86/include/uapi/asm/e820.h
@@ -32,6 +32,7 @@
#define E820_ACPI 3
#define E820_NVS 4
#define E820_UNUSABLE 5
+#define E820_PMEM 7
/*
* This is a non-standardized way to represent ADR or NVDIMM regions that
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index c8dda42cb6a3..a102564d08eb 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -149,6 +149,7 @@ static void __init e820_print_type(u32 type)
case E820_UNUSABLE:
printk(KERN_CONT "unusable");
break;
+ case E820_PMEM:
case E820_PRAM:
printk(KERN_CONT "persistent (type %u)", type);
break;
@@ -918,11 +919,32 @@ static inline const char *e820_type_to_string(int e820_type)
case E820_ACPI: return "ACPI Tables";
case E820_NVS: return "ACPI Non-volatile Storage";
case E820_UNUSABLE: return "Unusable memory";
- case E820_PRAM: return "Persistent RAM";
+ case E820_PRAM: return "Persistent Memory (legacy)";
+ case E820_PMEM: return "Persistent Memory";
default: return "reserved";
}
}
+static bool do_mark_busy(u32 type, struct resource *res)
+{
+ /* this is the legacy bios/dos rom-shadow + mmio region */
+ if (res->start < (1ULL<<20))
+ return true;
+
+ /*
+ * Treat persistent memory like device memory, i.e. reserve it
+ * for exclusive use of a driver
+ */
+ switch (type) {
+ case E820_RESERVED:
+ case E820_PRAM:
+ case E820_PMEM:
+ return false;
+ default:
+ return true;
+ }
+}
+
/*
* Mark e820 reserved areas as busy for the resource manager.
*/
@@ -952,9 +974,7 @@ void __init e820_reserve_resources(void)
* pci device BAR resource and insert them later in
* pcibios_resource_survey()
*/
- if (((e820.map[i].type != E820_RESERVED) &&
- (e820.map[i].type != E820_PRAM)) ||
- res->start < (1ULL<<20)) {
+ if (do_mark_busy(e820.map[i].type, res)) {
res->flags |= IORESOURCE_BUSY;
insert_resource(&iomem_resource, res);
}
diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c
index 3420c874ddc5..64f90f53bb85 100644
--- a/arch/x86/kernel/pmem.c
+++ b/arch/x86/kernel/pmem.c
@@ -1,53 +1,82 @@
/*
* Copyright (c) 2015, Christoph Hellwig.
+ * Copyright (c) 2015, Intel Corporation.
*/
-#include <linux/memblock.h>
#include <linux/platform_device.h>
-#include <linux/slab.h>
+#include <linux/libnvdimm.h>
+#include <linux/module.h>
#include <asm/e820.h>
-#include <asm/page_types.h>
-#include <asm/setup.h>
-static __init void register_pmem_device(struct resource *res)
+static void e820_pmem_release(struct device *dev)
{
- struct platform_device *pdev;
- int error;
+ struct nvdimm_bus *nvdimm_bus = dev->platform_data;
- pdev = platform_device_alloc("pmem", PLATFORM_DEVID_AUTO);
- if (!pdev)
- return;
+ if (nvdimm_bus)
+ nvdimm_bus_unregister(nvdimm_bus);
+}
- error = platform_device_add_resources(pdev, res, 1);
- if (error)
- goto out_put_pdev;
+static struct platform_device e820_pmem = {
+ .name = "e820_pmem",
+ .id = -1,
+ .dev = {
+ .release = e820_pmem_release,
+ },
+};
- error = platform_device_add(pdev);
- if (error)
- goto out_put_pdev;
- return;
+static const struct attribute_group *e820_pmem_attribute_groups[] = {
+ &nvdimm_bus_attribute_group,
+ NULL,
+};
-out_put_pdev:
- dev_warn(&pdev->dev, "failed to add 'pmem' (persistent memory) device!\n");
- platform_device_put(pdev);
-}
+static const struct attribute_group *e820_pmem_region_attribute_groups[] = {
+ &nd_region_attribute_group,
+ &nd_device_attribute_group,
+ NULL,
+};
-static __init int register_pmem_devices(void)
+static __init int register_e820_pmem(void)
{
- int i;
+ static struct nvdimm_bus_descriptor nd_desc;
+ struct device *dev = &e820_pmem.dev;
+ struct nvdimm_bus *nvdimm_bus;
+ int rc, i;
+
+ rc = platform_device_register(&e820_pmem);
+ if (rc)
+ return rc;
+
+ nd_desc.attr_groups = e820_pmem_attribute_groups;
+ nd_desc.provider_name = "e820";
+ nvdimm_bus = nvdimm_bus_register(dev, &nd_desc);
+ if (!nvdimm_bus)
+ goto err;
+ dev->platform_data = nvdimm_bus;
for (i = 0; i < e820.nr_map; i++) {
struct e820entry *ei = &e820.map[i];
+ struct resource res = {
+ .flags = IORESOURCE_MEM,
+ .start = ei->addr,
+ .end = ei->addr + ei->size - 1,
+ };
+ struct nd_region_desc ndr_desc;
+
+ if (ei->type != E820_PRAM)
+ continue;
- if (ei->type == E820_PRAM) {
- struct resource res = {
- .flags = IORESOURCE_MEM,
- .start = ei->addr,
- .end = ei->addr + ei->size - 1,
- };
- register_pmem_device(&res);
- }
+ memset(&ndr_desc, 0, sizeof(ndr_desc));
+ ndr_desc.res = &res;
+ ndr_desc.attr_groups = e820_pmem_region_attribute_groups;
+ ndr_desc.numa_node = NUMA_NO_NODE;
+ if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc))
+ goto err;
}
return 0;
+
+ err:
+ dev_err(dev, "failed to register legacy persistent memory ranges\n");
+ platform_device_unregister(&e820_pmem);
+ return -ENXIO;
}
-device_initcall(register_pmem_devices);
+device_initcall(register_e820_pmem);
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index c1c382c58c60..cfba30f27392 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -174,6 +174,9 @@ static void __init do_add_efi_memmap(void)
case EFI_UNUSABLE_MEMORY:
e820_type = E820_UNUSABLE;
break;
+ case EFI_PERSISTENT_MEMORY:
+ e820_type = E820_PMEM;
+ break;
default:
/*
* EFI_RESERVED_TYPE EFI_RUNTIME_SERVICES_CODE
diff --git a/arch/x86/um/asm/checksum.h b/arch/x86/um/asm/checksum.h
index 4b181b74454f..ee940185e89f 100644
--- a/arch/x86/um/asm/checksum.h
+++ b/arch/x86/um/asm/checksum.h
@@ -3,6 +3,7 @@
#include <linux/string.h>
#include <linux/in6.h>
+#include <linux/uaccess.h>
/*
* computes the checksum of a memory block at buff, length len,
diff --git a/arch/x86/um/asm/elf.h b/arch/x86/um/asm/elf.h
index 0a656b727b1a..548197212a45 100644
--- a/arch/x86/um/asm/elf.h
+++ b/arch/x86/um/asm/elf.h
@@ -200,8 +200,6 @@ typedef elf_greg_t elf_gregset_t[ELF_NGREG];
typedef struct user_i387_struct elf_fpregset_t;
-#define task_pt_regs(t) (&(t)->thread.regs)
-
struct task_struct;
extern int elf_core_copy_fpregs(struct task_struct *t, elf_fpregset_t *fpu);
diff --git a/arch/x86/um/asm/processor.h b/arch/x86/um/asm/processor.h
index 2a206d2b14ab..233ee09c1ce8 100644
--- a/arch/x86/um/asm/processor.h
+++ b/arch/x86/um/asm/processor.h
@@ -28,6 +28,8 @@ static inline void rep_nop(void)
#define cpu_relax() rep_nop()
#define cpu_relax_lowlatency() cpu_relax()
+#define task_pt_regs(t) (&(t)->thread.regs)
+
#include <asm/processor-generic.h>
#endif
diff --git a/arch/x86/um/asm/segment.h b/arch/x86/um/asm/segment.h
index 45183fcd10b6..41dd5e1f3cd7 100644
--- a/arch/x86/um/asm/segment.h
+++ b/arch/x86/um/asm/segment.h
@@ -7,4 +7,12 @@ extern int host_gdt_entry_tls_min;
#define GDT_ENTRY_TLS_MIN host_gdt_entry_tls_min
#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
+typedef struct {
+ unsigned long seg;
+} mm_segment_t;
+
+#define MAKE_MM_SEG(s) ((mm_segment_t) { (s) })
+#define KERNEL_DS MAKE_MM_SEG(~0UL)
+#define USER_DS MAKE_MM_SEG(TASK_SIZE)
+
#endif
diff --git a/arch/x86/um/ldt.c b/arch/x86/um/ldt.c
index 5c0b711d2433..9701a4fd7bf2 100644
--- a/arch/x86/um/ldt.c
+++ b/arch/x86/um/ldt.c
@@ -6,6 +6,7 @@
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/slab.h>
+#include <linux/uaccess.h>
#include <asm/unistd.h>
#include <os.h>
#include <skas.h>
diff --git a/arch/x86/um/mem_32.c b/arch/x86/um/mem_32.c
index f40281e5d6a2..744afdc18cf3 100644
--- a/arch/x86/um/mem_32.c
+++ b/arch/x86/um/mem_32.c
@@ -7,8 +7,7 @@
*/
#include <linux/mm.h>
-#include <asm/page.h>
-#include <asm/mman.h>
+#include <asm/elf.h>
static struct vm_area_struct gate_vma;
diff --git a/arch/x86/um/mem_64.c b/arch/x86/um/mem_64.c
index f8fecaddcc0d..7642e2e2aa61 100644
--- a/arch/x86/um/mem_64.c
+++ b/arch/x86/um/mem_64.c
@@ -1,6 +1,5 @@
#include <linux/mm.h>
-#include <asm/page.h>
-#include <asm/mman.h>
+#include <asm/elf.h>
const char *arch_vma_name(struct vm_area_struct *vma)
{
diff --git a/arch/x86/um/ptrace_32.c b/arch/x86/um/ptrace_32.c
index ce3dd4f36f3f..a29756f2d940 100644
--- a/arch/x86/um/ptrace_32.c
+++ b/arch/x86/um/ptrace_32.c
@@ -6,6 +6,7 @@
#include <linux/mm.h>
#include <linux/sched.h>
#include <asm/uaccess.h>
+#include <asm/ptrace-abi.h>
#include <skas.h>
extern int arch_switch_tls(struct task_struct *to);
diff --git a/arch/x86/um/ptrace_64.c b/arch/x86/um/ptrace_64.c
index 3b52bf0b418a..a629694ee750 100644
--- a/arch/x86/um/ptrace_64.c
+++ b/arch/x86/um/ptrace_64.c
@@ -11,6 +11,7 @@
#define __FRAME_OFFSETS
#include <asm/ptrace.h>
#include <asm/uaccess.h>
+#include <asm/ptrace-abi.h>
/*
* determines which flags the user has access to.
diff --git a/arch/x86/um/shared/sysdep/tls.h b/arch/x86/um/shared/sysdep/tls.h
index 27cce00c6b30..a682db13df23 100644
--- a/arch/x86/um/shared/sysdep/tls.h
+++ b/arch/x86/um/shared/sysdep/tls.h
@@ -1,7 +1,7 @@
#ifndef _SYSDEP_TLS_H
#define _SYSDEP_TLS_H
-# ifndef __KERNEL__
+#ifdef __UM_HOST__
/* Change name to avoid conflicts with the original one from <asm/ldt.h>, which
* may be named user_desc (but in 2.4 and in header matching its API was named
@@ -22,11 +22,11 @@ typedef struct um_dup_user_desc {
#endif
} user_desc_t;
-# else /* __KERNEL__ */
+#else /* __UM_HOST__ */
typedef struct user_desc user_desc_t;
-# endif /* __KERNEL__ */
+#endif /* __UM_HOST__ */
extern int os_set_thread_area(user_desc_t *info, int pid);
extern int os_get_thread_area(user_desc_t *info, int pid);
diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c
index 592491d1d70d..06934a8a4872 100644
--- a/arch/x86/um/signal.c
+++ b/arch/x86/um/signal.c
@@ -541,7 +541,8 @@ int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig,
*/
/* x86-64 should always use SA_RESTORER. */
if (ksig->ka.sa.sa_flags & SA_RESTORER)
- err |= __put_user(ksig->ka.sa.sa_restorer, &frame->pretcode);
+ err |= __put_user((void *)ksig->ka.sa.sa_restorer,
+ &frame->pretcode);
else
/* could use a vstub here */
return err;
diff --git a/arch/x86/um/syscalls_64.c b/arch/x86/um/syscalls_64.c
index adb08eb5c22a..e6552275320b 100644
--- a/arch/x86/um/syscalls_64.c
+++ b/arch/x86/um/syscalls_64.c
@@ -6,6 +6,7 @@
*/
#include <linux/sched.h>
+#include <linux/uaccess.h>
#include <asm/prctl.h> /* XXX This should get the constants from libc */
#include <os.h>
diff --git a/arch/x86/um/tls_32.c b/arch/x86/um/tls_32.c
index 80ffa5b9982d..48e38584d5c1 100644
--- a/arch/x86/um/tls_32.c
+++ b/arch/x86/um/tls_32.c
@@ -7,6 +7,7 @@
#include <linux/sched.h>
#include <linux/syscalls.h>
#include <asm/uaccess.h>
+#include <asm/ptrace-abi.h>
#include <os.h>
#include <skas.h>
#include <sysdep/tls.h>
diff --git a/arch/x86/um/tls_64.c b/arch/x86/um/tls_64.c
index d22363cb854e..3ad714373d7f 100644
--- a/arch/x86/um/tls_64.c
+++ b/arch/x86/um/tls_64.c
@@ -1,4 +1,5 @@
#include <linux/sched.h>
+#include <asm/ptrace-abi.h>
void clear_flushed_tls(struct task_struct *task)
{
diff --git a/arch/x86/um/vdso/vma.c b/arch/x86/um/vdso/vma.c
index 916cda4cd5b4..237c6831e095 100644
--- a/arch/x86/um/vdso/vma.c
+++ b/arch/x86/um/vdso/vma.c
@@ -10,6 +10,7 @@
#include <linux/sched.h>
#include <linux/mm.h>
#include <asm/page.h>
+#include <asm/elf.h>
#include <linux/init.h>
static unsigned int __read_mostly vdso_enabled = 1;
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 55b6f15dac90..dda653ce7b24 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -326,8 +326,8 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
goto out_put_request;
}
- ret = -EFAULT;
- if (blk_fill_sghdr_rq(q, rq, hdr, mode))
+ ret = blk_fill_sghdr_rq(q, rq, hdr, mode);
+ if (ret < 0)
goto out_free_cdb;
ret = 0;
diff --git a/drivers/Kconfig b/drivers/Kconfig
index c0cc96bab9e7..6e973b8e3a3b 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -182,4 +182,6 @@ source "drivers/thunderbolt/Kconfig"
source "drivers/android/Kconfig"
+source "drivers/nvdimm/Kconfig"
+
endmenu
diff --git a/drivers/Makefile b/drivers/Makefile
index 9a02fb7c5106..b64b49f6e01b 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -64,6 +64,7 @@ obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/
obj-$(CONFIG_PARPORT) += parport/
obj-y += base/ block/ misc/ mfd/ nfc/
+obj-$(CONFIG_LIBNVDIMM) += nvdimm/
obj-$(CONFIG_DMA_SHARED_BUFFER) += dma-buf/
obj-$(CONFIG_NUBUS) += nubus/
obj-y += macintosh/
diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig
index 35da507411a0..f15db002be8e 100644
--- a/drivers/acpi/Kconfig
+++ b/drivers/acpi/Kconfig
@@ -386,6 +386,32 @@ config ACPI_REDUCED_HARDWARE_ONLY
If you are unsure what to do, do not enable this option.
+config ACPI_NFIT
+ tristate "ACPI NVDIMM Firmware Interface Table (NFIT)"
+ depends on PHYS_ADDR_T_64BIT
+ depends on BLK_DEV
+ select LIBNVDIMM
+ help
+ Infrastructure to probe ACPI 6 compliant platforms for
+ NVDIMMs (NFIT) and register a libnvdimm device tree. In
+ addition to storage devices, this also enables libnvdimm to pass
+ ACPI._DSM messages for platform/dimm configuration.
+
+ To compile this driver as a module, choose M here:
+ the module will be called nfit.
+
+config ACPI_NFIT_DEBUG
+ bool "NFIT DSM debug"
+ depends on ACPI_NFIT
+ depends on DYNAMIC_DEBUG
+ default n
+ help
+ Enabling this option causes the nfit driver to dump the
+ input and output buffers of _DSM operations on the ACPI0012
+ device and its children. This can be very verbose, so leave
+ it disabled unless you are debugging a hardware / firmware
+ issue.
+
source "drivers/acpi/apei/Kconfig"
config ACPI_EXTLOG
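A usage note on the ACPI_NFIT_DEBUG help above: the dumps are ordinary dynamic-debug sites (dev_dbg() plus print_hex_dump_debug()), so even with the option built in they still have to be enabled at runtime, typically by writing "module nfit +p" to /sys/kernel/debug/dynamic_debug/control (assuming debugfs is mounted there), and "module nfit -p" to quiet them again.
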
diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile
index 73d840bef455..8321430d7f24 100644
--- a/drivers/acpi/Makefile
+++ b/drivers/acpi/Makefile
@@ -68,6 +68,7 @@ obj-$(CONFIG_ACPI_PCI_SLOT) += pci_slot.o
obj-$(CONFIG_ACPI_PROCESSOR) += processor.o
obj-y += container.o
obj-$(CONFIG_ACPI_THERMAL) += thermal.o
+obj-$(CONFIG_ACPI_NFIT) += nfit.o
obj-y += acpi_memhotplug.o
obj-$(CONFIG_ACPI_HOTPLUG_IOAPIC) += ioapic.o
obj-$(CONFIG_ACPI_BATTERY) += battery.o
diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
new file mode 100644
index 000000000000..2161fa178c8d
--- /dev/null
+++ b/drivers/acpi/nfit.c
@@ -0,0 +1,1587 @@
+/*
+ * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/list_sort.h>
+#include <linux/libnvdimm.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/ndctl.h>
+#include <linux/list.h>
+#include <linux/acpi.h>
+#include <linux/sort.h>
+#include <linux/io.h>
+#include "nfit.h"
+
+/*
+ * For readq() and writeq() on 32-bit builds, the hi-lo, lo-hi order is
+ * irrelevant.
+ */
+#include <asm-generic/io-64-nonatomic-hi-lo.h>
+
+static bool force_enable_dimms;
+module_param(force_enable_dimms, bool, S_IRUGO|S_IWUSR);
+MODULE_PARM_DESC(force_enable_dimms, "Ignore _STA (ACPI DIMM device) status");
+
+static u8 nfit_uuid[NFIT_UUID_MAX][16];
+
+const u8 *to_nfit_uuid(enum nfit_uuids id)
+{
+ return nfit_uuid[id];
+}
+EXPORT_SYMBOL(to_nfit_uuid);
+
+static struct acpi_nfit_desc *to_acpi_nfit_desc(
+ struct nvdimm_bus_descriptor *nd_desc)
+{
+ return container_of(nd_desc, struct acpi_nfit_desc, nd_desc);
+}
+
+static struct acpi_device *to_acpi_dev(struct acpi_nfit_desc *acpi_desc)
+{
+ struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc;
+
+ /*
+ * If provider == 'ACPI.NFIT' we can assume 'dev' is a struct
+ * acpi_device.
+ */
+ if (!nd_desc->provider_name
+ || strcmp(nd_desc->provider_name, "ACPI.NFIT") != 0)
+ return NULL;
+
+ return to_acpi_device(acpi_desc->dev);
+}
+
+static int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc,
+ struct nvdimm *nvdimm, unsigned int cmd, void *buf,
+ unsigned int buf_len)
+{
+ struct acpi_nfit_desc *acpi_desc = to_acpi_nfit_desc(nd_desc);
+ const struct nd_cmd_desc *desc = NULL;
+ union acpi_object in_obj, in_buf, *out_obj;
+ struct device *dev = acpi_desc->dev;
+ const char *cmd_name, *dimm_name;
+ unsigned long dsm_mask;
+ acpi_handle handle;
+ const u8 *uuid;
+ u32 offset;
+ int rc, i;
+
+ if (nvdimm) {
+ struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm);
+ struct acpi_device *adev = nfit_mem->adev;
+
+ if (!adev)
+ return -ENOTTY;
+ dimm_name = nvdimm_name(nvdimm);
+ cmd_name = nvdimm_cmd_name(cmd);
+ dsm_mask = nfit_mem->dsm_mask;
+ desc = nd_cmd_dimm_desc(cmd);
+ uuid = to_nfit_uuid(NFIT_DEV_DIMM);
+ handle = adev->handle;
+ } else {
+ struct acpi_device *adev = to_acpi_dev(acpi_desc);
+
+ cmd_name = nvdimm_bus_cmd_name(cmd);
+ dsm_mask = nd_desc->dsm_mask;
+ desc = nd_cmd_bus_desc(cmd);
+ uuid = to_nfit_uuid(NFIT_DEV_BUS);
+ handle = adev->handle;
+ dimm_name = "bus";
+ }
+
+ if (!desc || (cmd && (desc->out_num + desc->in_num == 0)))
+ return -ENOTTY;
+
+ if (!test_bit(cmd, &dsm_mask))
+ return -ENOTTY;
+
+ in_obj.type = ACPI_TYPE_PACKAGE;
+ in_obj.package.count = 1;
+ in_obj.package.elements = &in_buf;
+ in_buf.type = ACPI_TYPE_BUFFER;
+ in_buf.buffer.pointer = buf;
+ in_buf.buffer.length = 0;
+
+ /* libnvdimm has already validated the input envelope */
+ for (i = 0; i < desc->in_num; i++)
+ in_buf.buffer.length += nd_cmd_in_size(nvdimm, cmd, desc,
+ i, buf);
+
+ if (IS_ENABLED(CONFIG_ACPI_NFIT_DEBUG)) {
+ dev_dbg(dev, "%s:%s cmd: %s input length: %d\n", __func__,
+ dimm_name, cmd_name, in_buf.buffer.length);
+ print_hex_dump_debug(cmd_name, DUMP_PREFIX_OFFSET, 4,
+ 4, in_buf.buffer.pointer, min_t(u32, 128,
+ in_buf.buffer.length), true);
+ }
+
+ out_obj = acpi_evaluate_dsm(handle, uuid, 1, cmd, &in_obj);
+ if (!out_obj) {
+ dev_dbg(dev, "%s:%s _DSM failed cmd: %s\n", __func__, dimm_name,
+ cmd_name);
+ return -EINVAL;
+ }
+
+ if (out_obj->package.type != ACPI_TYPE_BUFFER) {
+ dev_dbg(dev, "%s:%s unexpected output object type cmd: %s type: %d\n",
+ __func__, dimm_name, cmd_name, out_obj->type);
+ rc = -EINVAL;
+ goto out;
+ }
+
+ if (IS_ENABLED(CONFIG_ACPI_NFIT_DEBUG)) {
+ dev_dbg(dev, "%s:%s cmd: %s output length: %d\n", __func__,
+ dimm_name, cmd_name, out_obj->buffer.length);
+ print_hex_dump_debug(cmd_name, DUMP_PREFIX_OFFSET, 4,
+ 4, out_obj->buffer.pointer, min_t(u32, 128,
+ out_obj->buffer.length), true);
+ }
+
+ for (i = 0, offset = 0; i < desc->out_num; i++) {
+ u32 out_size = nd_cmd_out_size(nvdimm, cmd, desc, i, buf,
+ (u32 *) out_obj->buffer.pointer);
+
+ if (offset + out_size > out_obj->buffer.length) {
+ dev_dbg(dev, "%s:%s output object underflow cmd: %s field: %d\n",
+ __func__, dimm_name, cmd_name, i);
+ break;
+ }
+
+ if (in_buf.buffer.length + offset + out_size > buf_len) {
+ dev_dbg(dev, "%s:%s output overrun cmd: %s field: %d\n",
+ __func__, dimm_name, cmd_name, i);
+ rc = -ENXIO;
+ goto out;
+ }
+ memcpy(buf + in_buf.buffer.length + offset,
+ out_obj->buffer.pointer + offset, out_size);
+ offset += out_size;
+ }
+ if (offset + in_buf.buffer.length < buf_len) {
+ if (i >= 1) {
+ /*
+ * status valid, return the number of bytes left
+ * unfilled in the output buffer
+ */
+ rc = buf_len - offset - in_buf.buffer.length;
+ } else {
+ dev_err(dev, "%s:%s underrun cmd: %s buf_len: %d out_len: %d\n",
+ __func__, dimm_name, cmd_name, buf_len,
+ offset);
+ rc = -ENXIO;
+ }
+ } else
+ rc = 0;
+
+ out:
+ ACPI_FREE(out_obj);
+
+ return rc;
+}
+
+static const char *spa_type_name(u16 type)
+{
+ static const char *to_name[] = {
+ [NFIT_SPA_VOLATILE] = "volatile",
+ [NFIT_SPA_PM] = "pmem",
+ [NFIT_SPA_DCR] = "dimm-control-region",
+ [NFIT_SPA_BDW] = "block-data-window",
+ [NFIT_SPA_VDISK] = "volatile-disk",
+ [NFIT_SPA_VCD] = "volatile-cd",
+ [NFIT_SPA_PDISK] = "persistent-disk",
+ [NFIT_SPA_PCD] = "persistent-cd",
+
+ };
+
+ if (type > NFIT_SPA_PCD)
+ return "unknown";
+
+ return to_name[type];
+}
+
+static int nfit_spa_type(struct acpi_nfit_system_address *spa)
+{
+ int i;
+
+ for (i = 0; i < NFIT_UUID_MAX; i++)
+ if (memcmp(to_nfit_uuid(i), spa->range_guid, 16) == 0)
+ return i;
+ return -1;
+}
+
+static bool add_spa(struct acpi_nfit_desc *acpi_desc,
+ struct acpi_nfit_system_address *spa)
+{
+ struct device *dev = acpi_desc->dev;
+ struct nfit_spa *nfit_spa = devm_kzalloc(dev, sizeof(*nfit_spa),
+ GFP_KERNEL);
+
+ if (!nfit_spa)
+ return false;
+ INIT_LIST_HEAD(&nfit_spa->list);
+ nfit_spa->spa = spa;
+ list_add_tail(&nfit_spa->list, &acpi_desc->spas);
+ dev_dbg(dev, "%s: spa index: %d type: %s\n", __func__,
+ spa->range_index,
+ spa_type_name(nfit_spa_type(spa)));
+ return true;
+}
+
+static bool add_memdev(struct acpi_nfit_desc *acpi_desc,
+ struct acpi_nfit_memory_map *memdev)
+{
+ struct device *dev = acpi_desc->dev;
+ struct nfit_memdev *nfit_memdev = devm_kzalloc(dev,
+ sizeof(*nfit_memdev), GFP_KERNEL);
+
+ if (!nfit_memdev)
+ return false;
+ INIT_LIST_HEAD(&nfit_memdev->list);
+ nfit_memdev->memdev = memdev;
+ list_add_tail(&nfit_memdev->list, &acpi_desc->memdevs);
+ dev_dbg(dev, "%s: memdev handle: %#x spa: %d dcr: %d\n",
+ __func__, memdev->device_handle, memdev->range_index,
+ memdev->region_index);
+ return true;
+}
+
+static bool add_dcr(struct acpi_nfit_desc *acpi_desc,
+ struct acpi_nfit_control_region *dcr)
+{
+ struct device *dev = acpi_desc->dev;
+ struct nfit_dcr *nfit_dcr = devm_kzalloc(dev, sizeof(*nfit_dcr),
+ GFP_KERNEL);
+
+ if (!nfit_dcr)
+ return false;
+ INIT_LIST_HEAD(&nfit_dcr->list);
+ nfit_dcr->dcr = dcr;
+ list_add_tail(&nfit_dcr->list, &acpi_desc->dcrs);
+ dev_dbg(dev, "%s: dcr index: %d windows: %d\n", __func__,
+ dcr->region_index, dcr->windows);
+ return true;
+}
+
+static bool add_bdw(struct acpi_nfit_desc *acpi_desc,
+ struct acpi_nfit_data_region *bdw)
+{
+ struct device *dev = acpi_desc->dev;
+ struct nfit_bdw *nfit_bdw = devm_kzalloc(dev, sizeof(*nfit_bdw),
+ GFP_KERNEL);
+
+ if (!nfit_bdw)
+ return false;
+ INIT_LIST_HEAD(&nfit_bdw->list);
+ nfit_bdw->bdw = bdw;
+ list_add_tail(&nfit_bdw->list, &acpi_desc->bdws);
+ dev_dbg(dev, "%s: bdw dcr: %d windows: %d\n", __func__,
+ bdw->region_index, bdw->windows);
+ return true;
+}
+
+static bool add_idt(struct acpi_nfit_desc *acpi_desc,
+ struct acpi_nfit_interleave *idt)
+{
+ struct device *dev = acpi_desc->dev;
+ struct nfit_idt *nfit_idt = devm_kzalloc(dev, sizeof(*nfit_idt),
+ GFP_KERNEL);
+
+ if (!nfit_idt)
+ return false;
+ INIT_LIST_HEAD(&nfit_idt->list);
+ nfit_idt->idt = idt;
+ list_add_tail(&nfit_idt->list, &acpi_desc->idts);
+ dev_dbg(dev, "%s: idt index: %d num_lines: %d\n", __func__,
+ idt->interleave_index, idt->line_count);
+ return true;
+}
+
+static void *add_table(struct acpi_nfit_desc *acpi_desc, void *table,
+ const void *end)
+{
+ struct device *dev = acpi_desc->dev;
+ struct acpi_nfit_header *hdr;
+ void *err = ERR_PTR(-ENOMEM);
+
+ if (table >= end)
+ return NULL;
+
+ hdr = table;
+ switch (hdr->type) {
+ case ACPI_NFIT_TYPE_SYSTEM_ADDRESS:
+ if (!add_spa(acpi_desc, table))
+ return err;
+ break;
+ case ACPI_NFIT_TYPE_MEMORY_MAP:
+ if (!add_memdev(acpi_desc, table))
+ return err;
+ break;
+ case ACPI_NFIT_TYPE_CONTROL_REGION:
+ if (!add_dcr(acpi_desc, table))
+ return err;
+ break;
+ case ACPI_NFIT_TYPE_DATA_REGION:
+ if (!add_bdw(acpi_desc, table))
+ return err;
+ break;
+ case ACPI_NFIT_TYPE_INTERLEAVE:
+ if (!add_idt(acpi_desc, table))
+ return err;
+ break;
+ case ACPI_NFIT_TYPE_FLUSH_ADDRESS:
+ dev_dbg(dev, "%s: flush\n", __func__);
+ break;
+ case ACPI_NFIT_TYPE_SMBIOS:
+ dev_dbg(dev, "%s: smbios\n", __func__);
+ break;
+ default:
+ dev_err(dev, "unknown table '%d' parsing nfit\n", hdr->type);
+ break;
+ }
+
+ return table + hdr->length;
+}
+
+static void nfit_mem_find_spa_bdw(struct acpi_nfit_desc *acpi_desc,
+ struct nfit_mem *nfit_mem)
+{
+ u32 device_handle = __to_nfit_memdev(nfit_mem)->device_handle;
+ u16 dcr = nfit_mem->dcr->region_index;
+ struct nfit_spa *nfit_spa;
+
+ list_for_each_entry(nfit_spa, &acpi_desc->spas, list) {
+ u16 range_index = nfit_spa->spa->range_index;
+ int type = nfit_spa_type(nfit_spa->spa);
+ struct nfit_memdev *nfit_memdev;
+
+ if (type != NFIT_SPA_BDW)
+ continue;
+
+ list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) {
+ if (nfit_memdev->memdev->range_index != range_index)
+ continue;
+ if (nfit_memdev->memdev->device_handle != device_handle)
+ continue;
+ if (nfit_memdev->memdev->region_index != dcr)
+ continue;
+
+ nfit_mem->spa_bdw = nfit_spa->spa;
+ return;
+ }
+ }
+
+ dev_dbg(acpi_desc->dev, "SPA-BDW not found for SPA-DCR %d\n",
+ nfit_mem->spa_dcr->range_index);
+ nfit_mem->bdw = NULL;
+}
+
+static int nfit_mem_add(struct acpi_nfit_desc *acpi_desc,
+ struct nfit_mem *nfit_mem, struct acpi_nfit_system_address *spa)
+{
+ u16 dcr = __to_nfit_memdev(nfit_mem)->region_index;
+ struct nfit_memdev *nfit_memdev;
+ struct nfit_dcr *nfit_dcr;
+ struct nfit_bdw *nfit_bdw;
+ struct nfit_idt *nfit_idt;
+ u16 idt_idx, range_index;
+
+ list_for_each_entry(nfit_dcr, &acpi_desc->dcrs, list) {
+ if (nfit_dcr->dcr->region_index != dcr)
+ continue;
+ nfit_mem->dcr = nfit_dcr->dcr;
+ break;
+ }
+
+ if (!nfit_mem->dcr) {
+ dev_dbg(acpi_desc->dev, "SPA %d missing:%s%s\n",
+ spa->range_index, __to_nfit_memdev(nfit_mem)
+ ? "" : " MEMDEV", nfit_mem->dcr ? "" : " DCR");
+ return -ENODEV;
+ }
+
+ /*
+ * We've found enough to create an nvdimm, optionally
+ * find an associated BDW
+ */
+ list_add(&nfit_mem->list, &acpi_desc->dimms);
+
+ list_for_each_entry(nfit_bdw, &acpi_desc->bdws, list) {
+ if (nfit_bdw->bdw->region_index != dcr)
+ continue;
+ nfit_mem->bdw = nfit_bdw->bdw;
+ break;
+ }
+
+ if (!nfit_mem->bdw)
+ return 0;
+
+ nfit_mem_find_spa_bdw(acpi_desc, nfit_mem);
+
+ if (!nfit_mem->spa_bdw)
+ return 0;
+
+ range_index = nfit_mem->spa_bdw->range_index;
+ list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) {
+ if (nfit_memdev->memdev->range_index != range_index ||
+ nfit_memdev->memdev->region_index != dcr)
+ continue;
+ nfit_mem->memdev_bdw = nfit_memdev->memdev;
+ idt_idx = nfit_memdev->memdev->interleave_index;
+ list_for_each_entry(nfit_idt, &acpi_desc->idts, list) {
+ if (nfit_idt->idt->interleave_index != idt_idx)
+ continue;
+ nfit_mem->idt_bdw = nfit_idt->idt;
+ break;
+ }
+ break;
+ }
+
+ return 0;
+}
+
+static int nfit_mem_dcr_init(struct acpi_nfit_desc *acpi_desc,
+ struct acpi_nfit_system_address *spa)
+{
+ struct nfit_mem *nfit_mem, *found;
+ struct nfit_memdev *nfit_memdev;
+ int type = nfit_spa_type(spa);
+ u16 dcr;
+
+ switch (type) {
+ case NFIT_SPA_DCR:
+ case NFIT_SPA_PM:
+ break;
+ default:
+ return 0;
+ }
+
+ list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) {
+ int rc;
+
+ if (nfit_memdev->memdev->range_index != spa->range_index)
+ continue;
+ found = NULL;
+ dcr = nfit_memdev->memdev->region_index;
+ list_for_each_entry(nfit_mem, &acpi_desc->dimms, list)
+ if (__to_nfit_memdev(nfit_mem)->region_index == dcr) {
+ found = nfit_mem;
+ break;
+ }
+
+ if (found)
+ nfit_mem = found;
+ else {
+ nfit_mem = devm_kzalloc(acpi_desc->dev,
+ sizeof(*nfit_mem), GFP_KERNEL);
+ if (!nfit_mem)
+ return -ENOMEM;
+ INIT_LIST_HEAD(&nfit_mem->list);
+ }
+
+ if (type == NFIT_SPA_DCR) {
+ struct nfit_idt *nfit_idt;
+ u16 idt_idx;
+
+ /* multiple dimms may share a SPA when interleaved */
+ nfit_mem->spa_dcr = spa;
+ nfit_mem->memdev_dcr = nfit_memdev->memdev;
+ idt_idx = nfit_memdev->memdev->interleave_index;
+ list_for_each_entry(nfit_idt, &acpi_desc->idts, list) {
+ if (nfit_idt->idt->interleave_index != idt_idx)
+ continue;
+ nfit_mem->idt_dcr = nfit_idt->idt;
+ break;
+ }
+ } else {
+ /*
+ * A single dimm may belong to multiple SPA-PM
+ * ranges; record at least one in addition to
+ * any SPA-DCR range.
+ */
+ nfit_mem->memdev_pmem = nfit_memdev->memdev;
+ }
+
+ if (found)
+ continue;
+
+ rc = nfit_mem_add(acpi_desc, nfit_mem, spa);
+ if (rc)
+ return rc;
+ }
+
+ return 0;
+}
+
+static int nfit_mem_cmp(void *priv, struct list_head *_a, struct list_head *_b)
+{
+ struct nfit_mem *a = container_of(_a, typeof(*a), list);
+ struct nfit_mem *b = container_of(_b, typeof(*b), list);
+ u32 handleA, handleB;
+
+ handleA = __to_nfit_memdev(a)->device_handle;
+ handleB = __to_nfit_memdev(b)->device_handle;
+ if (handleA < handleB)
+ return -1;
+ else if (handleA > handleB)
+ return 1;
+ return 0;
+}
+
+static int nfit_mem_init(struct acpi_nfit_desc *acpi_desc)
+{
+ struct nfit_spa *nfit_spa;
+
+ /*
+ * For each SPA-DCR or SPA-PMEM address range find its
+ * corresponding MEMDEV(s). From each MEMDEV find the
+ * corresponding DCR. Then, if we're operating on a SPA-DCR,
+ * try to find a SPA-BDW and a corresponding BDW that references
+ * the DCR. Throw it all into an nfit_mem object. Note, that
+ * BDWs are optional.
+ */
+ list_for_each_entry(nfit_spa, &acpi_desc->spas, list) {
+ int rc;
+
+ rc = nfit_mem_dcr_init(acpi_desc, nfit_spa->spa);
+ if (rc)
+ return rc;
+ }
+
+ list_sort(NULL, &acpi_desc->dimms, nfit_mem_cmp);
+
+ return 0;
+}
+
+static ssize_t revision_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev);
+ struct nvdimm_bus_descriptor *nd_desc = to_nd_desc(nvdimm_bus);
+ struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc);
+
+ return sprintf(buf, "%d\n", acpi_desc->nfit->header.revision);
+}
+static DEVICE_ATTR_RO(revision);
+
+static struct attribute *acpi_nfit_attributes[] = {
+ &dev_attr_revision.attr,
+ NULL,
+};
+
+static struct attribute_group acpi_nfit_attribute_group = {
+ .name = "nfit",
+ .attrs = acpi_nfit_attributes,
+};
+
+const struct attribute_group *acpi_nfit_attribute_groups[] = {
+ &nvdimm_bus_attribute_group,
+ &acpi_nfit_attribute_group,
+ NULL,
+};
+EXPORT_SYMBOL_GPL(acpi_nfit_attribute_groups);
+
+static struct acpi_nfit_memory_map *to_nfit_memdev(struct device *dev)
+{
+ struct nvdimm *nvdimm = to_nvdimm(dev);
+ struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm);
+
+ return __to_nfit_memdev(nfit_mem);
+}
+
+static struct acpi_nfit_control_region *to_nfit_dcr(struct device *dev)
+{
+ struct nvdimm *nvdimm = to_nvdimm(dev);
+ struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm);
+
+ return nfit_mem->dcr;
+}
+
+static ssize_t handle_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct acpi_nfit_memory_map *memdev = to_nfit_memdev(dev);
+
+ return sprintf(buf, "%#x\n", memdev->device_handle);
+}
+static DEVICE_ATTR_RO(handle);
+
+static ssize_t phys_id_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct acpi_nfit_memory_map *memdev = to_nfit_memdev(dev);
+
+ return sprintf(buf, "%#x\n", memdev->physical_id);
+}
+static DEVICE_ATTR_RO(phys_id);
+
+static ssize_t vendor_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct acpi_nfit_control_region *dcr = to_nfit_dcr(dev);
+
+ return sprintf(buf, "%#x\n", dcr->vendor_id);
+}
+static DEVICE_ATTR_RO(vendor);
+
+static ssize_t rev_id_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct acpi_nfit_control_region *dcr = to_nfit_dcr(dev);
+
+ return sprintf(buf, "%#x\n", dcr->revision_id);
+}
+static DEVICE_ATTR_RO(rev_id);
+
+static ssize_t device_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct acpi_nfit_control_region *dcr = to_nfit_dcr(dev);
+
+ return sprintf(buf, "%#x\n", dcr->device_id);
+}
+static DEVICE_ATTR_RO(device);
+
+static ssize_t format_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct acpi_nfit_control_region *dcr = to_nfit_dcr(dev);
+
+ return sprintf(buf, "%#x\n", dcr->code);
+}
+static DEVICE_ATTR_RO(format);
+
+static ssize_t serial_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct acpi_nfit_control_region *dcr = to_nfit_dcr(dev);
+
+ return sprintf(buf, "%#x\n", dcr->serial_number);
+}
+static DEVICE_ATTR_RO(serial);
+
+static ssize_t flags_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ u16 flags = to_nfit_memdev(dev)->flags;
+
+ return sprintf(buf, "%s%s%s%s%s\n",
+ flags & ACPI_NFIT_MEM_SAVE_FAILED ? "save " : "",
+ flags & ACPI_NFIT_MEM_RESTORE_FAILED ? "restore " : "",
+ flags & ACPI_NFIT_MEM_FLUSH_FAILED ? "flush " : "",
+ flags & ACPI_NFIT_MEM_ARMED ? "arm " : "",
+ flags & ACPI_NFIT_MEM_HEALTH_OBSERVED ? "smart " : "");
+}
+static DEVICE_ATTR_RO(flags);
+
+static struct attribute *acpi_nfit_dimm_attributes[] = {
+ &dev_attr_handle.attr,
+ &dev_attr_phys_id.attr,
+ &dev_attr_vendor.attr,
+ &dev_attr_device.attr,
+ &dev_attr_format.attr,
+ &dev_attr_serial.attr,
+ &dev_attr_rev_id.attr,
+ &dev_attr_flags.attr,
+ NULL,
+};
+
+static umode_t acpi_nfit_dimm_attr_visible(struct kobject *kobj,
+ struct attribute *a, int n)
+{
+ struct device *dev = container_of(kobj, struct device, kobj);
+
+ if (to_nfit_dcr(dev))
+ return a->mode;
+ else
+ return 0;
+}
+
+static struct attribute_group acpi_nfit_dimm_attribute_group = {
+ .name = "nfit",
+ .attrs = acpi_nfit_dimm_attributes,
+ .is_visible = acpi_nfit_dimm_attr_visible,
+};
+
+static const struct attribute_group *acpi_nfit_dimm_attribute_groups[] = {
+ &nvdimm_attribute_group,
+ &nd_device_attribute_group,
+ &acpi_nfit_dimm_attribute_group,
+ NULL,
+};
+
+static struct nvdimm *acpi_nfit_dimm_by_handle(struct acpi_nfit_desc *acpi_desc,
+ u32 device_handle)
+{
+ struct nfit_mem *nfit_mem;
+
+ list_for_each_entry(nfit_mem, &acpi_desc->dimms, list)
+ if (__to_nfit_memdev(nfit_mem)->device_handle == device_handle)
+ return nfit_mem->nvdimm;
+
+ return NULL;
+}
+
+static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
+ struct nfit_mem *nfit_mem, u32 device_handle)
+{
+ struct acpi_device *adev, *adev_dimm;
+ struct device *dev = acpi_desc->dev;
+ const u8 *uuid = to_nfit_uuid(NFIT_DEV_DIMM);
+ unsigned long long sta;
+ int i, rc = -ENODEV;
+ acpi_status status;
+
+ nfit_mem->dsm_mask = acpi_desc->dimm_dsm_force_en;
+ adev = to_acpi_dev(acpi_desc);
+ if (!adev)
+ return 0;
+
+ adev_dimm = acpi_find_child_device(adev, device_handle, false);
+ nfit_mem->adev = adev_dimm;
+ if (!adev_dimm) {
+ dev_err(dev, "no ACPI.NFIT device with _ADR %#x, disabling...\n",
+ device_handle);
+ return force_enable_dimms ? 0 : -ENODEV;
+ }
+
+ status = acpi_evaluate_integer(adev_dimm->handle, "_STA", NULL, &sta);
+ if (status == AE_NOT_FOUND) {
+ dev_dbg(dev, "%s missing _STA, assuming enabled...\n",
+ dev_name(&adev_dimm->dev));
+ rc = 0;
+ } else if (ACPI_FAILURE(status))
+ dev_err(dev, "%s failed to retrieve_STA, disabling...\n",
+ dev_name(&adev_dimm->dev));
+ else if ((sta & ACPI_STA_DEVICE_ENABLED) == 0)
+ dev_info(dev, "%s disabled by firmware\n",
+ dev_name(&adev_dimm->dev));
+ else
+ rc = 0;
+
+ for (i = ND_CMD_SMART; i <= ND_CMD_VENDOR; i++)
+ if (acpi_check_dsm(adev_dimm->handle, uuid, 1, 1ULL << i))
+ set_bit(i, &nfit_mem->dsm_mask);
+
+ return force_enable_dimms ? 0 : rc;
+}
+
+static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc)
+{
+ struct nfit_mem *nfit_mem;
+ int dimm_count = 0;
+
+ list_for_each_entry(nfit_mem, &acpi_desc->dimms, list) {
+ struct nvdimm *nvdimm;
+ unsigned long flags = 0;
+ u32 device_handle;
+ u16 mem_flags;
+ int rc;
+
+ device_handle = __to_nfit_memdev(nfit_mem)->device_handle;
+ nvdimm = acpi_nfit_dimm_by_handle(acpi_desc, device_handle);
+ if (nvdimm) {
+ /*
+ * If for some reason we find multiple DCRs the
+ * first one wins
+ */
+ dev_err(acpi_desc->dev, "duplicate DCR detected: %s\n",
+ nvdimm_name(nvdimm));
+ continue;
+ }
+
+ if (nfit_mem->bdw && nfit_mem->memdev_pmem)
+ flags |= NDD_ALIASING;
+
+ mem_flags = __to_nfit_memdev(nfit_mem)->flags;
+ if (mem_flags & ACPI_NFIT_MEM_ARMED)
+ flags |= NDD_UNARMED;
+
+ rc = acpi_nfit_add_dimm(acpi_desc, nfit_mem, device_handle);
+ if (rc)
+ continue;
+
+ nvdimm = nvdimm_create(acpi_desc->nvdimm_bus, nfit_mem,
+ acpi_nfit_dimm_attribute_groups,
+ flags, &nfit_mem->dsm_mask);
+ if (!nvdimm)
+ return -ENOMEM;
+
+ nfit_mem->nvdimm = nvdimm;
+ dimm_count++;
+
+ if ((mem_flags & ACPI_NFIT_MEM_FAILED_MASK) == 0)
+ continue;
+
+ dev_info(acpi_desc->dev, "%s: failed: %s%s%s%s\n",
+ nvdimm_name(nvdimm),
+ mem_flags & ACPI_NFIT_MEM_SAVE_FAILED ? "save " : "",
+ mem_flags & ACPI_NFIT_MEM_RESTORE_FAILED ? "restore " : "",
+ mem_flags & ACPI_NFIT_MEM_FLUSH_FAILED ? "flush " : "",
+ mem_flags & ACPI_NFIT_MEM_ARMED ? "arm " : "");
+
+ }
+
+ return nvdimm_bus_check_dimm_count(acpi_desc->nvdimm_bus, dimm_count);
+}
+
+static void acpi_nfit_init_dsms(struct acpi_nfit_desc *acpi_desc)
+{
+ struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc;
+ const u8 *uuid = to_nfit_uuid(NFIT_DEV_BUS);
+ struct acpi_device *adev;
+ int i;
+
+ adev = to_acpi_dev(acpi_desc);
+ if (!adev)
+ return;
+
+ for (i = ND_CMD_ARS_CAP; i <= ND_CMD_ARS_STATUS; i++)
+ if (acpi_check_dsm(adev->handle, uuid, 1, 1ULL << i))
+ set_bit(i, &nd_desc->dsm_mask);
+}
+
+static ssize_t range_index_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_region *nd_region = to_nd_region(dev);
+ struct nfit_spa *nfit_spa = nd_region_provider_data(nd_region);
+
+ return sprintf(buf, "%d\n", nfit_spa->spa->range_index);
+}
+static DEVICE_ATTR_RO(range_index);
+
+static struct attribute *acpi_nfit_region_attributes[] = {
+ &dev_attr_range_index.attr,
+ NULL,
+};
+
+static struct attribute_group acpi_nfit_region_attribute_group = {
+ .name = "nfit",
+ .attrs = acpi_nfit_region_attributes,
+};
+
+static const struct attribute_group *acpi_nfit_region_attribute_groups[] = {
+ &nd_region_attribute_group,
+ &nd_mapping_attribute_group,
+ &nd_device_attribute_group,
+ &nd_numa_attribute_group,
+ &acpi_nfit_region_attribute_group,
+ NULL,
+};
+
+/* enough info to uniquely specify an interleave set */
+struct nfit_set_info {
+ struct nfit_set_info_map {
+ u64 region_offset;
+ u32 serial_number;
+ u32 pad;
+ } mapping[0];
+};
+
+static size_t sizeof_nfit_set_info(int num_mappings)
+{
+ return sizeof(struct nfit_set_info)
+ + num_mappings * sizeof(struct nfit_set_info_map);
+}
+
+static int cmp_map(const void *m0, const void *m1)
+{
+ const struct nfit_set_info_map *map0 = m0;
+ const struct nfit_set_info_map *map1 = m1;
+
+ return memcmp(&map0->region_offset, &map1->region_offset,
+ sizeof(u64));
+}
+
+/* Retrieve the nth entry referencing this spa */
+static struct acpi_nfit_memory_map *memdev_from_spa(
+ struct acpi_nfit_desc *acpi_desc, u16 range_index, int n)
+{
+ struct nfit_memdev *nfit_memdev;
+
+ list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list)
+ if (nfit_memdev->memdev->range_index == range_index)
+ if (n-- == 0)
+ return nfit_memdev->memdev;
+ return NULL;
+}
+
+static int acpi_nfit_init_interleave_set(struct acpi_nfit_desc *acpi_desc,
+ struct nd_region_desc *ndr_desc,
+ struct acpi_nfit_system_address *spa)
+{
+ int i, spa_type = nfit_spa_type(spa);
+ struct device *dev = acpi_desc->dev;
+ struct nd_interleave_set *nd_set;
+ u16 nr = ndr_desc->num_mappings;
+ struct nfit_set_info *info;
+
+ if (spa_type == NFIT_SPA_PM || spa_type == NFIT_SPA_VOLATILE)
+ /* pass */;
+ else
+ return 0;
+
+ nd_set = devm_kzalloc(dev, sizeof(*nd_set), GFP_KERNEL);
+ if (!nd_set)
+ return -ENOMEM;
+
+ info = devm_kzalloc(dev, sizeof_nfit_set_info(nr), GFP_KERNEL);
+ if (!info)
+ return -ENOMEM;
+ for (i = 0; i < nr; i++) {
+ struct nd_mapping *nd_mapping = &ndr_desc->nd_mapping[i];
+ struct nfit_set_info_map *map = &info->mapping[i];
+ struct nvdimm *nvdimm = nd_mapping->nvdimm;
+ struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm);
+ struct acpi_nfit_memory_map *memdev = memdev_from_spa(acpi_desc,
+ spa->range_index, i);
+
+ if (!memdev || !nfit_mem->dcr) {
+ dev_err(dev, "%s: failed to find DCR\n", __func__);
+ return -ENODEV;
+ }
+
+ map->region_offset = memdev->region_offset;
+ map->serial_number = nfit_mem->dcr->serial_number;
+ }
+
+ sort(&info->mapping[0], nr, sizeof(struct nfit_set_info_map),
+ cmp_map, NULL);
+ nd_set->cookie = nd_fletcher64(info, sizeof_nfit_set_info(nr), 0);
+ ndr_desc->nd_set = nd_set;
+ devm_kfree(dev, info);
+
+ return 0;
+}
+
+static u64 to_interleave_offset(u64 offset, struct nfit_blk_mmio *mmio)
+{
+ struct acpi_nfit_interleave *idt = mmio->idt;
+ u32 sub_line_offset, line_index, line_offset;
+ u64 line_no, table_skip_count, table_offset;
+
+ line_no = div_u64_rem(offset, mmio->line_size, &sub_line_offset);
+ table_skip_count = div_u64_rem(line_no, mmio->num_lines, &line_index);
+ line_offset = idt->line_offset[line_index]
+ * mmio->line_size;
+ table_offset = table_skip_count * mmio->table_size;
+
+ return mmio->base_offset + line_offset + table_offset + sub_line_offset;
+}
+
+static u64 read_blk_stat(struct nfit_blk *nfit_blk, unsigned int bw)
+{
+ struct nfit_blk_mmio *mmio = &nfit_blk->mmio[DCR];
+ u64 offset = nfit_blk->stat_offset + mmio->size * bw;
+
+ if (mmio->num_lines)
+ offset = to_interleave_offset(offset, mmio);
+
+ return readq(mmio->base + offset);
+}
+
+static void write_blk_ctl(struct nfit_blk *nfit_blk, unsigned int bw,
+ resource_size_t dpa, unsigned int len, unsigned int write)
+{
+ u64 cmd, offset;
+ struct nfit_blk_mmio *mmio = &nfit_blk->mmio[DCR];
+
+ enum {
+ BCW_OFFSET_MASK = (1ULL << 48)-1,
+ BCW_LEN_SHIFT = 48,
+ BCW_LEN_MASK = (1ULL << 8) - 1,
+ BCW_CMD_SHIFT = 56,
+ };
+
+ cmd = (dpa >> L1_CACHE_SHIFT) & BCW_OFFSET_MASK;
+ len = len >> L1_CACHE_SHIFT;
+ cmd |= ((u64) len & BCW_LEN_MASK) << BCW_LEN_SHIFT;
+ cmd |= ((u64) write) << BCW_CMD_SHIFT;
+
+ offset = nfit_blk->cmd_offset + mmio->size * bw;
+ if (mmio->num_lines)
+ offset = to_interleave_offset(offset, mmio);
+
+ writeq(cmd, mmio->base + offset);
+ /* FIXME: conditionally perform read-back if mandated by firmware */
+}
+
+static int acpi_nfit_blk_single_io(struct nfit_blk *nfit_blk,
+ resource_size_t dpa, void *iobuf, size_t len, int rw,
+ unsigned int lane)
+{
+ struct nfit_blk_mmio *mmio = &nfit_blk->mmio[BDW];
+ unsigned int copied = 0;
+ u64 base_offset;
+ int rc;
+
+ base_offset = nfit_blk->bdw_offset + dpa % L1_CACHE_BYTES
+ + lane * mmio->size;
+ /* TODO: non-temporal access, flush hints, cache management etc... */
+ write_blk_ctl(nfit_blk, lane, dpa, len, rw);
+ while (len) {
+ unsigned int c;
+ u64 offset;
+
+ if (mmio->num_lines) {
+ u32 line_offset;
+
+ offset = to_interleave_offset(base_offset + copied,
+ mmio);
+ div_u64_rem(offset, mmio->line_size, &line_offset);
+ c = min_t(size_t, len, mmio->line_size - line_offset);
+ } else {
+ offset = base_offset + nfit_blk->bdw_offset;
+ c = len;
+ }
+
+ if (rw)
+ memcpy(mmio->aperture + offset, iobuf + copied, c);
+ else
+ memcpy(iobuf + copied, mmio->aperture + offset, c);
+
+ copied += c;
+ len -= c;
+ }
+ rc = read_blk_stat(nfit_blk, lane) ? -EIO : 0;
+ return rc;
+}
+
+static int acpi_nfit_blk_region_do_io(struct nd_blk_region *ndbr,
+ resource_size_t dpa, void *iobuf, u64 len, int rw)
+{
+ struct nfit_blk *nfit_blk = nd_blk_region_provider_data(ndbr);
+ struct nfit_blk_mmio *mmio = &nfit_blk->mmio[BDW];
+ struct nd_region *nd_region = nfit_blk->nd_region;
+ unsigned int lane, copied = 0;
+ int rc = 0;
+
+ lane = nd_region_acquire_lane(nd_region);
+ while (len) {
+ u64 c = min(len, mmio->size);
+
+ rc = acpi_nfit_blk_single_io(nfit_blk, dpa + copied,
+ iobuf + copied, c, rw, lane);
+ if (rc)
+ break;
+
+ copied += c;
+ len -= c;
+ }
+ nd_region_release_lane(nd_region, lane);
+
+ return rc;
+}
+
+static void nfit_spa_mapping_release(struct kref *kref)
+{
+ struct nfit_spa_mapping *spa_map = to_spa_map(kref);
+ struct acpi_nfit_system_address *spa = spa_map->spa;
+ struct acpi_nfit_desc *acpi_desc = spa_map->acpi_desc;
+
+ WARN_ON(!mutex_is_locked(&acpi_desc->spa_map_mutex));
+ dev_dbg(acpi_desc->dev, "%s: SPA%d\n", __func__, spa->range_index);
+ iounmap(spa_map->iomem);
+ release_mem_region(spa->address, spa->length);
+ list_del(&spa_map->list);
+ kfree(spa_map);
+}
+
+static struct nfit_spa_mapping *find_spa_mapping(
+ struct acpi_nfit_desc *acpi_desc,
+ struct acpi_nfit_system_address *spa)
+{
+ struct nfit_spa_mapping *spa_map;
+
+ WARN_ON(!mutex_is_locked(&acpi_desc->spa_map_mutex));
+ list_for_each_entry(spa_map, &acpi_desc->spa_maps, list)
+ if (spa_map->spa == spa)
+ return spa_map;
+
+ return NULL;
+}
+
+static void nfit_spa_unmap(struct acpi_nfit_desc *acpi_desc,
+ struct acpi_nfit_system_address *spa)
+{
+ struct nfit_spa_mapping *spa_map;
+
+ mutex_lock(&acpi_desc->spa_map_mutex);
+ spa_map = find_spa_mapping(acpi_desc, spa);
+
+ if (spa_map)
+ kref_put(&spa_map->kref, nfit_spa_mapping_release);
+ mutex_unlock(&acpi_desc->spa_map_mutex);
+}
+
+static void __iomem *__nfit_spa_map(struct acpi_nfit_desc *acpi_desc,
+ struct acpi_nfit_system_address *spa)
+{
+ resource_size_t start = spa->address;
+ resource_size_t n = spa->length;
+ struct nfit_spa_mapping *spa_map;
+ struct resource *res;
+
+ WARN_ON(!mutex_is_locked(&acpi_desc->spa_map_mutex));
+
+ spa_map = find_spa_mapping(acpi_desc, spa);
+ if (spa_map) {
+ kref_get(&spa_map->kref);
+ return spa_map->iomem;
+ }
+
+ spa_map = kzalloc(sizeof(*spa_map), GFP_KERNEL);
+ if (!spa_map)
+ return NULL;
+
+ INIT_LIST_HEAD(&spa_map->list);
+ spa_map->spa = spa;
+ kref_init(&spa_map->kref);
+ spa_map->acpi_desc = acpi_desc;
+
+ res = request_mem_region(start, n, dev_name(acpi_desc->dev));
+ if (!res)
+ goto err_mem;
+
+ /* TODO: cacheability based on the spa type */
+ spa_map->iomem = ioremap_nocache(start, n);
+ if (!spa_map->iomem)
+ goto err_map;
+
+ list_add_tail(&spa_map->list, &acpi_desc->spa_maps);
+ return spa_map->iomem;
+
+ err_map:
+ release_mem_region(start, n);
+ err_mem:
+ kfree(spa_map);
+ return NULL;
+}
+
+/**
+ * nfit_spa_map - interleave-aware managed-mappings of acpi_nfit_system_address ranges
+ * @acpi_desc: NFIT-bus state that provided the spa table entry
+ * @spa: spa range (system physical address) table entry to map
+ *
+ * In the case where block-data-window apertures and
+ * dimm-control-regions are interleaved, they will end up sharing a
+ * single request_mem_region() + ioremap() for the address range. In
+ * the style of devm, nfit_spa_map() mappings are automatically dropped
+ * when all region devices referencing the same mapping are disabled /
+ * unbound.
+ */
+static void __iomem *nfit_spa_map(struct acpi_nfit_desc *acpi_desc,
+ struct acpi_nfit_system_address *spa)
+{
+ void __iomem *iomem;
+
+ mutex_lock(&acpi_desc->spa_map_mutex);
+ iomem = __nfit_spa_map(acpi_desc, spa);
+ mutex_unlock(&acpi_desc->spa_map_mutex);
+
+ return iomem;
+}
+
+static int nfit_blk_init_interleave(struct nfit_blk_mmio *mmio,
+ struct acpi_nfit_interleave *idt, u16 interleave_ways)
+{
+ if (idt) {
+ mmio->num_lines = idt->line_count;
+ mmio->line_size = idt->line_size;
+ if (interleave_ways == 0)
+ return -ENXIO;
+ mmio->table_size = mmio->num_lines * interleave_ways
+ * mmio->line_size;
+ }
+
+ return 0;
+}
+
+static int acpi_nfit_blk_region_enable(struct nvdimm_bus *nvdimm_bus,
+ struct device *dev)
+{
+ struct nvdimm_bus_descriptor *nd_desc = to_nd_desc(nvdimm_bus);
+ struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc);
+ struct nd_blk_region *ndbr = to_nd_blk_region(dev);
+ struct nfit_blk_mmio *mmio;
+ struct nfit_blk *nfit_blk;
+ struct nfit_mem *nfit_mem;
+ struct nvdimm *nvdimm;
+ int rc;
+
+ nvdimm = nd_blk_region_to_dimm(ndbr);
+ nfit_mem = nvdimm_provider_data(nvdimm);
+ if (!nfit_mem || !nfit_mem->dcr || !nfit_mem->bdw) {
+ dev_dbg(dev, "%s: missing%s%s%s\n", __func__,
+ nfit_mem ? "" : " nfit_mem",
+ (nfit_mem && nfit_mem->dcr) ? "" : " dcr",
+ (nfit_mem && nfit_mem->bdw) ? "" : " bdw");
+ return -ENXIO;
+ }
+
+ nfit_blk = devm_kzalloc(dev, sizeof(*nfit_blk), GFP_KERNEL);
+ if (!nfit_blk)
+ return -ENOMEM;
+ nd_blk_region_set_provider_data(ndbr, nfit_blk);
+ nfit_blk->nd_region = to_nd_region(dev);
+
+ /* map block aperture memory */
+ nfit_blk->bdw_offset = nfit_mem->bdw->offset;
+ mmio = &nfit_blk->mmio[BDW];
+ mmio->base = nfit_spa_map(acpi_desc, nfit_mem->spa_bdw);
+ if (!mmio->base) {
+ dev_dbg(dev, "%s: %s failed to map bdw\n", __func__,
+ nvdimm_name(nvdimm));
+ return -ENOMEM;
+ }
+ mmio->size = nfit_mem->bdw->size;
+ mmio->base_offset = nfit_mem->memdev_bdw->region_offset;
+ mmio->idt = nfit_mem->idt_bdw;
+ mmio->spa = nfit_mem->spa_bdw;
+ rc = nfit_blk_init_interleave(mmio, nfit_mem->idt_bdw,
+ nfit_mem->memdev_bdw->interleave_ways);
+ if (rc) {
+ dev_dbg(dev, "%s: %s failed to init bdw interleave\n",
+ __func__, nvdimm_name(nvdimm));
+ return rc;
+ }
+
+ /* map block control memory */
+ nfit_blk->cmd_offset = nfit_mem->dcr->command_offset;
+ nfit_blk->stat_offset = nfit_mem->dcr->status_offset;
+ mmio = &nfit_blk->mmio[DCR];
+ mmio->base = nfit_spa_map(acpi_desc, nfit_mem->spa_dcr);
+ if (!mmio->base) {
+ dev_dbg(dev, "%s: %s failed to map dcr\n", __func__,
+ nvdimm_name(nvdimm));
+ return -ENOMEM;
+ }
+ mmio->size = nfit_mem->dcr->window_size;
+ mmio->base_offset = nfit_mem->memdev_dcr->region_offset;
+ mmio->idt = nfit_mem->idt_dcr;
+ mmio->spa = nfit_mem->spa_dcr;
+ rc = nfit_blk_init_interleave(mmio, nfit_mem->idt_dcr,
+ nfit_mem->memdev_dcr->interleave_ways);
+ if (rc) {
+ dev_dbg(dev, "%s: %s failed to init dcr interleave\n",
+ __func__, nvdimm_name(nvdimm));
+ return rc;
+ }
+
+ if (mmio->line_size == 0)
+ return 0;
+
+ if ((u32) nfit_blk->cmd_offset % mmio->line_size
+ + 8 > mmio->line_size) {
+ dev_dbg(dev, "cmd_offset crosses interleave boundary\n");
+ return -ENXIO;
+ } else if ((u32) nfit_blk->stat_offset % mmio->line_size
+ + 8 > mmio->line_size) {
+ dev_dbg(dev, "stat_offset crosses interleave boundary\n");
+ return -ENXIO;
+ }
+
+ return 0;
+}
+
+static void acpi_nfit_blk_region_disable(struct nvdimm_bus *nvdimm_bus,
+ struct device *dev)
+{
+ struct nvdimm_bus_descriptor *nd_desc = to_nd_desc(nvdimm_bus);
+ struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc);
+ struct nd_blk_region *ndbr = to_nd_blk_region(dev);
+ struct nfit_blk *nfit_blk = nd_blk_region_provider_data(ndbr);
+ int i;
+
+ if (!nfit_blk)
+ return; /* never enabled */
+
+ /* auto-free BLK spa mappings */
+ for (i = 0; i < 2; i++) {
+ struct nfit_blk_mmio *mmio = &nfit_blk->mmio[i];
+
+ if (mmio->base)
+ nfit_spa_unmap(acpi_desc, mmio->spa);
+ }
+ nd_blk_region_set_provider_data(ndbr, NULL);
+ /* devm will free nfit_blk */
+}
+
+static int acpi_nfit_init_mapping(struct acpi_nfit_desc *acpi_desc,
+ struct nd_mapping *nd_mapping, struct nd_region_desc *ndr_desc,
+ struct acpi_nfit_memory_map *memdev,
+ struct acpi_nfit_system_address *spa)
+{
+ struct nvdimm *nvdimm = acpi_nfit_dimm_by_handle(acpi_desc,
+ memdev->device_handle);
+ struct nd_blk_region_desc *ndbr_desc;
+ struct nfit_mem *nfit_mem;
+ int blk_valid = 0;
+
+ if (!nvdimm) {
+ dev_err(acpi_desc->dev, "spa%d dimm: %#x not found\n",
+ spa->range_index, memdev->device_handle);
+ return -ENODEV;
+ }
+
+ nd_mapping->nvdimm = nvdimm;
+ switch (nfit_spa_type(spa)) {
+ case NFIT_SPA_PM:
+ case NFIT_SPA_VOLATILE:
+ nd_mapping->start = memdev->address;
+ nd_mapping->size = memdev->region_size;
+ break;
+ case NFIT_SPA_DCR:
+ nfit_mem = nvdimm_provider_data(nvdimm);
+ if (!nfit_mem || !nfit_mem->bdw) {
+ dev_dbg(acpi_desc->dev, "spa%d %s missing bdw\n",
+ spa->range_index, nvdimm_name(nvdimm));
+ } else {
+ nd_mapping->size = nfit_mem->bdw->capacity;
+ nd_mapping->start = nfit_mem->bdw->start_address;
+ ndr_desc->num_lanes = nfit_mem->bdw->windows;
+ blk_valid = 1;
+ }
+
+ ndr_desc->nd_mapping = nd_mapping;
+ ndr_desc->num_mappings = blk_valid;
+ ndbr_desc = to_blk_region_desc(ndr_desc);
+ ndbr_desc->enable = acpi_nfit_blk_region_enable;
+ ndbr_desc->disable = acpi_nfit_blk_region_disable;
+ ndbr_desc->do_io = acpi_desc->blk_do_io;
+ if (!nvdimm_blk_region_create(acpi_desc->nvdimm_bus, ndr_desc))
+ return -ENOMEM;
+ break;
+ }
+
+ return 0;
+}
+
+static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc,
+ struct nfit_spa *nfit_spa)
+{
+ static struct nd_mapping nd_mappings[ND_MAX_MAPPINGS];
+ struct acpi_nfit_system_address *spa = nfit_spa->spa;
+ struct nd_blk_region_desc ndbr_desc;
+ struct nd_region_desc *ndr_desc;
+ struct nfit_memdev *nfit_memdev;
+ struct nvdimm_bus *nvdimm_bus;
+ struct resource res;
+ int count = 0, rc;
+
+ if (spa->range_index == 0) {
+ dev_dbg(acpi_desc->dev, "%s: detected invalid spa index\n",
+ __func__);
+ return 0;
+ }
+
+ memset(&res, 0, sizeof(res));
+ memset(&nd_mappings, 0, sizeof(nd_mappings));
+ memset(&ndbr_desc, 0, sizeof(ndbr_desc));
+ res.start = spa->address;
+ res.end = res.start + spa->length - 1;
+ ndr_desc = &ndbr_desc.ndr_desc;
+ ndr_desc->res = &res;
+ ndr_desc->provider_data = nfit_spa;
+ ndr_desc->attr_groups = acpi_nfit_region_attribute_groups;
+ if (spa->flags & ACPI_NFIT_PROXIMITY_VALID)
+ ndr_desc->numa_node = acpi_map_pxm_to_online_node(
+ spa->proximity_domain);
+ else
+ ndr_desc->numa_node = NUMA_NO_NODE;
+
+ list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) {
+ struct acpi_nfit_memory_map *memdev = nfit_memdev->memdev;
+ struct nd_mapping *nd_mapping;
+
+ if (memdev->range_index != spa->range_index)
+ continue;
+ if (count >= ND_MAX_MAPPINGS) {
+ dev_err(acpi_desc->dev, "spa%d exceeds max mappings %d\n",
+ spa->range_index, ND_MAX_MAPPINGS);
+ return -ENXIO;
+ }
+ nd_mapping = &nd_mappings[count++];
+ rc = acpi_nfit_init_mapping(acpi_desc, nd_mapping, ndr_desc,
+ memdev, spa);
+ if (rc)
+ return rc;
+ }
+
+ ndr_desc->nd_mapping = nd_mappings;
+ ndr_desc->num_mappings = count;
+ rc = acpi_nfit_init_interleave_set(acpi_desc, ndr_desc, spa);
+ if (rc)
+ return rc;
+
+ nvdimm_bus = acpi_desc->nvdimm_bus;
+ if (nfit_spa_type(spa) == NFIT_SPA_PM) {
+ if (!nvdimm_pmem_region_create(nvdimm_bus, ndr_desc))
+ return -ENOMEM;
+ } else if (nfit_spa_type(spa) == NFIT_SPA_VOLATILE) {
+ if (!nvdimm_volatile_region_create(nvdimm_bus, ndr_desc))
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+static int acpi_nfit_register_regions(struct acpi_nfit_desc *acpi_desc)
+{
+ struct nfit_spa *nfit_spa;
+
+ list_for_each_entry(nfit_spa, &acpi_desc->spas, list) {
+ int rc = acpi_nfit_register_region(acpi_desc, nfit_spa);
+
+ if (rc)
+ return rc;
+ }
+ return 0;
+}
+
+int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, acpi_size sz)
+{
+ struct device *dev = acpi_desc->dev;
+ const void *end;
+ u8 *data;
+ int rc;
+
+ INIT_LIST_HEAD(&acpi_desc->spa_maps);
+ INIT_LIST_HEAD(&acpi_desc->spas);
+ INIT_LIST_HEAD(&acpi_desc->dcrs);
+ INIT_LIST_HEAD(&acpi_desc->bdws);
+ INIT_LIST_HEAD(&acpi_desc->idts);
+ INIT_LIST_HEAD(&acpi_desc->memdevs);
+ INIT_LIST_HEAD(&acpi_desc->dimms);
+ mutex_init(&acpi_desc->spa_map_mutex);
+
+ data = (u8 *) acpi_desc->nfit;
+ end = data + sz;
+ data += sizeof(struct acpi_table_nfit);
+ while (!IS_ERR_OR_NULL(data))
+ data = add_table(acpi_desc, data, end);
+
+ if (IS_ERR(data)) {
+ dev_dbg(dev, "%s: nfit table parsing error: %ld\n", __func__,
+ PTR_ERR(data));
+ return PTR_ERR(data);
+ }
+
+ if (nfit_mem_init(acpi_desc) != 0)
+ return -ENOMEM;
+
+ acpi_nfit_init_dsms(acpi_desc);
+
+ rc = acpi_nfit_register_dimms(acpi_desc);
+ if (rc)
+ return rc;
+
+ return acpi_nfit_register_regions(acpi_desc);
+}
+EXPORT_SYMBOL_GPL(acpi_nfit_init);
+
+static int acpi_nfit_add(struct acpi_device *adev)
+{
+ struct nvdimm_bus_descriptor *nd_desc;
+ struct acpi_nfit_desc *acpi_desc;
+ struct device *dev = &adev->dev;
+ struct acpi_table_header *tbl;
+ acpi_status status = AE_OK;
+ acpi_size sz;
+ int rc;
+
+ status = acpi_get_table_with_size("NFIT", 0, &tbl, &sz);
+ if (ACPI_FAILURE(status)) {
+ dev_err(dev, "failed to find NFIT\n");
+ return -ENXIO;
+ }
+
+ acpi_desc = devm_kzalloc(dev, sizeof(*acpi_desc), GFP_KERNEL);
+ if (!acpi_desc)
+ return -ENOMEM;
+
+ dev_set_drvdata(dev, acpi_desc);
+ acpi_desc->dev = dev;
+ acpi_desc->nfit = (struct acpi_table_nfit *) tbl;
+ acpi_desc->blk_do_io = acpi_nfit_blk_region_do_io;
+ nd_desc = &acpi_desc->nd_desc;
+ nd_desc->provider_name = "ACPI.NFIT";
+ nd_desc->ndctl = acpi_nfit_ctl;
+ nd_desc->attr_groups = acpi_nfit_attribute_groups;
+
+ acpi_desc->nvdimm_bus = nvdimm_bus_register(dev, nd_desc);
+ if (!acpi_desc->nvdimm_bus)
+ return -ENXIO;
+
+ rc = acpi_nfit_init(acpi_desc, sz);
+ if (rc) {
+ nvdimm_bus_unregister(acpi_desc->nvdimm_bus);
+ return rc;
+ }
+ return 0;
+}
+
+static int acpi_nfit_remove(struct acpi_device *adev)
+{
+ struct acpi_nfit_desc *acpi_desc = dev_get_drvdata(&adev->dev);
+
+ nvdimm_bus_unregister(acpi_desc->nvdimm_bus);
+ return 0;
+}
+
+static const struct acpi_device_id acpi_nfit_ids[] = {
+ { "ACPI0012", 0 },
+ { "", 0 },
+};
+MODULE_DEVICE_TABLE(acpi, acpi_nfit_ids);
+
+static struct acpi_driver acpi_nfit_driver = {
+ .name = KBUILD_MODNAME,
+ .ids = acpi_nfit_ids,
+ .ops = {
+ .add = acpi_nfit_add,
+ .remove = acpi_nfit_remove,
+ },
+};
+
+static __init int nfit_init(void)
+{
+ BUILD_BUG_ON(sizeof(struct acpi_table_nfit) != 40);
+ BUILD_BUG_ON(sizeof(struct acpi_nfit_system_address) != 56);
+ BUILD_BUG_ON(sizeof(struct acpi_nfit_memory_map) != 48);
+ BUILD_BUG_ON(sizeof(struct acpi_nfit_interleave) != 20);
+ BUILD_BUG_ON(sizeof(struct acpi_nfit_smbios) != 9);
+ BUILD_BUG_ON(sizeof(struct acpi_nfit_control_region) != 80);
+ BUILD_BUG_ON(sizeof(struct acpi_nfit_data_region) != 40);
+
+ acpi_str_to_uuid(UUID_VOLATILE_MEMORY, nfit_uuid[NFIT_SPA_VOLATILE]);
+ acpi_str_to_uuid(UUID_PERSISTENT_MEMORY, nfit_uuid[NFIT_SPA_PM]);
+ acpi_str_to_uuid(UUID_CONTROL_REGION, nfit_uuid[NFIT_SPA_DCR]);
+ acpi_str_to_uuid(UUID_DATA_REGION, nfit_uuid[NFIT_SPA_BDW]);
+ acpi_str_to_uuid(UUID_VOLATILE_VIRTUAL_DISK, nfit_uuid[NFIT_SPA_VDISK]);
+ acpi_str_to_uuid(UUID_VOLATILE_VIRTUAL_CD, nfit_uuid[NFIT_SPA_VCD]);
+ acpi_str_to_uuid(UUID_PERSISTENT_VIRTUAL_DISK, nfit_uuid[NFIT_SPA_PDISK]);
+ acpi_str_to_uuid(UUID_PERSISTENT_VIRTUAL_CD, nfit_uuid[NFIT_SPA_PCD]);
+ acpi_str_to_uuid(UUID_NFIT_BUS, nfit_uuid[NFIT_DEV_BUS]);
+ acpi_str_to_uuid(UUID_NFIT_DIMM, nfit_uuid[NFIT_DEV_DIMM]);
+
+ return acpi_bus_register_driver(&acpi_nfit_driver);
+}
+
+static __exit void nfit_exit(void)
+{
+ acpi_bus_unregister_driver(&acpi_nfit_driver);
+}
+
+module_init(nfit_init);
+module_exit(nfit_exit);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Intel Corporation");
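The interleave translation used by the block-window I/O path above (to_interleave_offset()) is plain integer arithmetic over the geometry described by the NFIT interleave (IDT) table. A minimal stand-alone sketch, with assumed, hypothetical geometry — 256-byte lines, two lines per dimm, two-way interleave, line_offset[] = {0, 2} — works one offset through the same steps:

#include <stdio.h>
#include <stdint.h>

/*
 * Mirrors the arithmetic of to_interleave_offset() with made-up values;
 * real geometry comes from the NFIT interleave table and memdev entries.
 */
int main(void)
{
	uint64_t line_size = 256, num_lines = 2, interleave_ways = 2;
	uint64_t table_size = num_lines * interleave_ways * line_size; /* 1024 */
	uint64_t line_offsets[] = { 0, 2 };	/* stands in for idt->line_offset[] */
	uint64_t base_offset = 0, offset = 1300;

	uint64_t sub_line_offset = offset % line_size;		/* 20 */
	uint64_t line_no = offset / line_size;			/* 5 */
	uint64_t line_index = line_no % num_lines;		/* 1 */
	uint64_t table_skip_count = line_no / num_lines;	/* 2 */
	uint64_t line_offset = line_offsets[line_index] * line_size;	/* 512 */
	uint64_t table_offset = table_skip_count * table_size;		/* 2048 */

	/* prints 2580: where offset 1300 lands after interleave translation */
	printf("%llu\n", (unsigned long long)(base_offset + line_offset
				+ table_offset + sub_line_offset));
	return 0;
}

The same translation is applied to the control-region and block-data-window apertures alike before any readq()/writeq() or memcpy() against the mapped range.
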
diff --git a/drivers/acpi/nfit.h b/drivers/acpi/nfit.h
new file mode 100644
index 000000000000..81f2e8c5a79c
--- /dev/null
+++ b/drivers/acpi/nfit.h
@@ -0,0 +1,158 @@
+/*
+ * NVDIMM Firmware Interface Table - NFIT
+ *
+ * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#ifndef __NFIT_H__
+#define __NFIT_H__
+#include <linux/libnvdimm.h>
+#include <linux/types.h>
+#include <linux/uuid.h>
+#include <linux/acpi.h>
+#include <acpi/acuuid.h>
+
+#define UUID_NFIT_BUS "2f10e7a4-9e91-11e4-89d3-123b93f75cba"
+#define UUID_NFIT_DIMM "4309ac30-0d11-11e4-9191-0800200c9a66"
+#define ACPI_NFIT_MEM_FAILED_MASK (ACPI_NFIT_MEM_SAVE_FAILED \
+ | ACPI_NFIT_MEM_RESTORE_FAILED | ACPI_NFIT_MEM_FLUSH_FAILED \
+ | ACPI_NFIT_MEM_ARMED)
+
+enum nfit_uuids {
+ NFIT_SPA_VOLATILE,
+ NFIT_SPA_PM,
+ NFIT_SPA_DCR,
+ NFIT_SPA_BDW,
+ NFIT_SPA_VDISK,
+ NFIT_SPA_VCD,
+ NFIT_SPA_PDISK,
+ NFIT_SPA_PCD,
+ NFIT_DEV_BUS,
+ NFIT_DEV_DIMM,
+ NFIT_UUID_MAX,
+};
+
+struct nfit_spa {
+ struct acpi_nfit_system_address *spa;
+ struct list_head list;
+};
+
+struct nfit_dcr {
+ struct acpi_nfit_control_region *dcr;
+ struct list_head list;
+};
+
+struct nfit_bdw {
+ struct acpi_nfit_data_region *bdw;
+ struct list_head list;
+};
+
+struct nfit_idt {
+ struct acpi_nfit_interleave *idt;
+ struct list_head list;
+};
+
+struct nfit_memdev {
+ struct acpi_nfit_memory_map *memdev;
+ struct list_head list;
+};
+
+/* assembled tables for a given dimm/memory-device */
+struct nfit_mem {
+ struct nvdimm *nvdimm;
+ struct acpi_nfit_memory_map *memdev_dcr;
+ struct acpi_nfit_memory_map *memdev_pmem;
+ struct acpi_nfit_memory_map *memdev_bdw;
+ struct acpi_nfit_control_region *dcr;
+ struct acpi_nfit_data_region *bdw;
+ struct acpi_nfit_system_address *spa_dcr;
+ struct acpi_nfit_system_address *spa_bdw;
+ struct acpi_nfit_interleave *idt_dcr;
+ struct acpi_nfit_interleave *idt_bdw;
+ struct list_head list;
+ struct acpi_device *adev;
+ unsigned long dsm_mask;
+};
+
+struct acpi_nfit_desc {
+ struct nvdimm_bus_descriptor nd_desc;
+ struct acpi_table_nfit *nfit;
+ struct mutex spa_map_mutex;
+ struct list_head spa_maps;
+ struct list_head memdevs;
+ struct list_head dimms;
+ struct list_head spas;
+ struct list_head dcrs;
+ struct list_head bdws;
+ struct list_head idts;
+ struct nvdimm_bus *nvdimm_bus;
+ struct device *dev;
+ unsigned long dimm_dsm_force_en;
+ int (*blk_do_io)(struct nd_blk_region *ndbr, resource_size_t dpa,
+ void *iobuf, u64 len, int rw);
+};
+
+enum nd_blk_mmio_selector {
+ BDW,
+ DCR,
+};
+
+struct nfit_blk {
+ struct nfit_blk_mmio {
+ union {
+ void __iomem *base;
+ void *aperture;
+ };
+ u64 size;
+ u64 base_offset;
+ u32 line_size;
+ u32 num_lines;
+ u32 table_size;
+ struct acpi_nfit_interleave *idt;
+ struct acpi_nfit_system_address *spa;
+ } mmio[2];
+ struct nd_region *nd_region;
+ u64 bdw_offset; /* post interleave offset */
+ u64 stat_offset;
+ u64 cmd_offset;
+};
+
+struct nfit_spa_mapping {
+ struct acpi_nfit_desc *acpi_desc;
+ struct acpi_nfit_system_address *spa;
+ struct list_head list;
+ struct kref kref;
+ void __iomem *iomem;
+};
+
+static inline struct nfit_spa_mapping *to_spa_map(struct kref *kref)
+{
+ return container_of(kref, struct nfit_spa_mapping, kref);
+}
+
+static inline struct acpi_nfit_memory_map *__to_nfit_memdev(
+ struct nfit_mem *nfit_mem)
+{
+ if (nfit_mem->memdev_dcr)
+ return nfit_mem->memdev_dcr;
+ return nfit_mem->memdev_pmem;
+}
+
+static inline struct acpi_nfit_desc *to_acpi_desc(
+ struct nvdimm_bus_descriptor *nd_desc)
+{
+ return container_of(nd_desc, struct acpi_nfit_desc, nd_desc);
+}
+
+const u8 *to_nfit_uuid(enum nfit_uuids id);
+int acpi_nfit_init(struct acpi_nfit_desc *nfit, acpi_size sz);
+extern const struct attribute_group *acpi_nfit_attribute_groups[];
+#endif /* __NFIT_H__ */
diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c
index 1333cbdc3ea2..acaa3b4ea504 100644
--- a/drivers/acpi/numa.c
+++ b/drivers/acpi/numa.c
@@ -29,6 +29,8 @@
#include <linux/errno.h>
#include <linux/acpi.h>
#include <linux/numa.h>
+#include <linux/nodemask.h>
+#include <linux/topology.h>
#define PREFIX "ACPI: "
@@ -70,7 +72,12 @@ static void __acpi_map_pxm_to_node(int pxm, int node)
int acpi_map_pxm_to_node(int pxm)
{
- int node = pxm_to_node_map[pxm];
+ int node;
+
+ if (pxm < 0 || pxm >= MAX_PXM_DOMAINS)
+ return NUMA_NO_NODE;
+
+ node = pxm_to_node_map[pxm];
if (node == NUMA_NO_NODE) {
if (nodes_weight(nodes_found_map) >= MAX_NUMNODES)
@@ -83,6 +90,45 @@ int acpi_map_pxm_to_node(int pxm)
return node;
}
+/**
+ * acpi_map_pxm_to_online_node - Map proximity ID to online node
+ * @pxm: ACPI proximity ID
+ *
+ * This is similar to acpi_map_pxm_to_node(), but always returns an online
+ * node. When the mapped node from a given proximity ID is offline, it
+ * looks up the node distance table and returns the nearest online node.
+ *
+ * ACPI device drivers, which are called after the NUMA initialization has
+ * completed in the kernel, can call this interface to obtain their device
+ * NUMA topology from ACPI tables. Such drivers do not have to deal with
+ * offline nodes. A node may be offline when the device proximity ID is
+ * unique, its SRAT memory entry does not exist, or NUMA is disabled, e.g.
+ * with "numa=off" on x86.
+ */
+int acpi_map_pxm_to_online_node(int pxm)
+{
+ int node, n, dist, min_dist;
+
+ node = acpi_map_pxm_to_node(pxm);
+
+ if (node == NUMA_NO_NODE)
+ node = 0;
+
+ if (!node_online(node)) {
+ min_dist = INT_MAX;
+ for_each_online_node(n) {
+ dist = node_distance(node, n);
+ if (dist < min_dist) {
+ min_dist = dist;
+ node = n;
+ }
+ }
+ }
+
+ return node;
+}
+EXPORT_SYMBOL(acpi_map_pxm_to_online_node);
+
static void __init
acpi_table_print_srat_entry(struct acpi_subtable_header *header)
{
@@ -328,8 +374,6 @@ int acpi_get_node(acpi_handle handle)
int pxm;
pxm = acpi_get_pxm(handle);
- if (pxm < 0 || pxm >= MAX_PXM_DOMAINS)
- return NUMA_NO_NODE;
return acpi_map_pxm_to_node(pxm);
}
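
As an illustration of how this new interface is meant to be consumed (a sketch, not part of the patch): the kernel-doc above promises an online node even when the proximity domain maps to an offline one, so a consumer such as the NFIT code could resolve its device node roughly as below. The SPA field names are ACPICA's struct acpi_nfit_system_address; example_spa_to_node() itself is hypothetical.

    /* Sketch: resolve a usable NUMA node for an NFIT SPA range. */
    static int example_spa_to_node(struct acpi_nfit_system_address *spa)
    {
            /* Firmware may mark the proximity domain as invalid. */
            if (!(spa->flags & ACPI_NFIT_PROXIMITY_VALID))
                    return NUMA_NO_NODE;

            /*
             * Always yields an online node: if the pxm's node is offline,
             * the nearest online node by node_distance() is returned.
             */
            return acpi_map_pxm_to_online_node(spa->proximity_domain);
    }
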
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 3ccef9eba6f9..1b8094d4d7af 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -404,18 +404,6 @@ config BLK_DEV_RAM_DAX
and will prevent RAM block device backing store memory from being
allocated from highmem (only a problem for highmem systems).
-config BLK_DEV_PMEM
- tristate "Persistent memory block device support"
- depends on HAS_IOMEM
- help
- Saying Y here will allow you to use a contiguous range of reserved
- memory as one or more persistent block devices.
-
- To compile this driver as a module, choose M here: the module will be
- called 'pmem'.
-
- If unsure, say N.
-
config CDROM_PKTCDVD
tristate "Packet writing on CD/DVD media"
depends on !UML
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 9cc6c18a1c7e..02b688d1438d 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -14,7 +14,6 @@ obj-$(CONFIG_PS3_VRAM) += ps3vram.o
obj-$(CONFIG_ATARI_FLOPPY) += ataflop.o
obj-$(CONFIG_AMIGA_Z2RAM) += z2ram.o
obj-$(CONFIG_BLK_DEV_RAM) += brd.o
-obj-$(CONFIG_BLK_DEV_PMEM) += pmem.o
obj-$(CONFIG_BLK_DEV_LOOP) += loop.o
obj-$(CONFIG_BLK_CPQ_DA) += cpqarray.o
obj-$(CONFIG_BLK_CPQ_CISS_DA) += cciss.o
diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index e5112714188f..34338d7438f5 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -193,6 +193,13 @@ static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
return 0;
}
+static void nvme_admin_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
+{
+ struct nvme_queue *nvmeq = hctx->driver_data;
+
+ nvmeq->tags = NULL;
+}
+
static int nvme_admin_init_request(void *data, struct request *req,
unsigned int hctx_idx, unsigned int rq_idx,
unsigned int numa_node)
@@ -606,7 +613,10 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx,
return;
}
if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
- req->errors = status;
+ if (cmd_rq->ctx == CMD_CTX_CANCELLED)
+ req->errors = -EINTR;
+ else
+ req->errors = status;
} else {
req->errors = nvme_error_status(status);
}
@@ -1161,12 +1171,13 @@ static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
int nvme_identify_ctrl(struct nvme_dev *dev, struct nvme_id_ctrl **id)
{
- struct nvme_command c = {
- .identify.opcode = nvme_admin_identify,
- .identify.cns = cpu_to_le32(1),
- };
+ struct nvme_command c = { };
int error;
+ /* gcc-4.4.4 (at least) has issues with initializers and anon unions */
+ c.identify.opcode = nvme_admin_identify;
+ c.identify.cns = cpu_to_le32(1);
+
*id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
if (!*id)
return -ENOMEM;
@@ -1181,12 +1192,13 @@ int nvme_identify_ctrl(struct nvme_dev *dev, struct nvme_id_ctrl **id)
int nvme_identify_ns(struct nvme_dev *dev, unsigned nsid,
struct nvme_id_ns **id)
{
- struct nvme_command c = {
- .identify.opcode = nvme_admin_identify,
- .identify.nsid = cpu_to_le32(nsid),
- };
+ struct nvme_command c = { };
int error;
+ /* gcc-4.4.4 (at least) has issues with initializers and anon unions */
+ c.identify.opcode = nvme_admin_identify,
+ c.identify.nsid = cpu_to_le32(nsid),
+
*id = kmalloc(sizeof(struct nvme_id_ns), GFP_KERNEL);
if (!*id)
return -ENOMEM;
@@ -1230,14 +1242,14 @@ int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11,
int nvme_get_log_page(struct nvme_dev *dev, struct nvme_smart_log **log)
{
- struct nvme_command c = {
- .common.opcode = nvme_admin_get_log_page,
- .common.nsid = cpu_to_le32(0xFFFFFFFF),
- .common.cdw10[0] = cpu_to_le32(
+ struct nvme_command c = { };
+ int error;
+
+ c.common.opcode = nvme_admin_get_log_page,
+ c.common.nsid = cpu_to_le32(0xFFFFFFFF),
+ c.common.cdw10[0] = cpu_to_le32(
(((sizeof(struct nvme_smart_log) / 4) - 1) << 16) |
NVME_LOG_SMART),
- };
- int error;
*log = kmalloc(sizeof(struct nvme_smart_log), GFP_KERNEL);
if (!*log)
@@ -1606,6 +1618,7 @@ static struct blk_mq_ops nvme_mq_admin_ops = {
.queue_rq = nvme_queue_rq,
.map_queue = blk_mq_map_queue,
.init_hctx = nvme_admin_init_hctx,
+ .exit_hctx = nvme_admin_exit_hctx,
.init_request = nvme_admin_init_request,
.timeout = nvme_timeout,
};
@@ -1648,6 +1661,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev)
}
if (!blk_get_queue(dev->admin_q)) {
nvme_dev_remove_admin(dev);
+ dev->admin_q = NULL;
return -ENODEV;
}
} else
@@ -2349,19 +2363,20 @@ static int nvme_dev_add(struct nvme_dev *dev)
}
kfree(ctrl);
- dev->tagset.ops = &nvme_mq_ops;
- dev->tagset.nr_hw_queues = dev->online_queues - 1;
- dev->tagset.timeout = NVME_IO_TIMEOUT;
- dev->tagset.numa_node = dev_to_node(dev->dev);
- dev->tagset.queue_depth =
+ if (!dev->tagset.tags) {
+ dev->tagset.ops = &nvme_mq_ops;
+ dev->tagset.nr_hw_queues = dev->online_queues - 1;
+ dev->tagset.timeout = NVME_IO_TIMEOUT;
+ dev->tagset.numa_node = dev_to_node(dev->dev);
+ dev->tagset.queue_depth =
min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
- dev->tagset.cmd_size = nvme_cmd_size(dev);
- dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
- dev->tagset.driver_data = dev;
-
- if (blk_mq_alloc_tag_set(&dev->tagset))
- return 0;
+ dev->tagset.cmd_size = nvme_cmd_size(dev);
+ dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
+ dev->tagset.driver_data = dev;
+ if (blk_mq_alloc_tag_set(&dev->tagset))
+ return 0;
+ }
schedule_work(&dev->scan_work);
return 0;
}
@@ -2734,8 +2749,10 @@ static void nvme_free_dev(struct kref *kref)
put_device(dev->device);
nvme_free_namespaces(dev);
nvme_release_instance(dev);
- blk_mq_free_tag_set(&dev->tagset);
- blk_put_queue(dev->admin_q);
+ if (dev->tagset.tags)
+ blk_mq_free_tag_set(&dev->tagset);
+ if (dev->admin_q)
+ blk_put_queue(dev->admin_q);
kfree(dev->queues);
kfree(dev->entry);
kfree(dev);
@@ -2866,6 +2883,9 @@ static int nvme_dev_start(struct nvme_dev *dev)
free_tags:
nvme_dev_remove_admin(dev);
+ blk_put_queue(dev->admin_q);
+ dev->admin_q = NULL;
+ dev->queues[0]->tags = NULL;
disable:
nvme_disable_queue(dev, 0);
nvme_dev_list_remove(dev);
@@ -2907,25 +2927,43 @@ static int nvme_dev_resume(struct nvme_dev *dev)
spin_unlock(&dev_list_lock);
} else {
nvme_unfreeze_queues(dev);
- schedule_work(&dev->scan_work);
+ nvme_dev_add(dev);
nvme_set_irq_hints(dev);
}
return 0;
}
+static void nvme_dead_ctrl(struct nvme_dev *dev)
+{
+ dev_warn(dev->dev, "Device failed to resume\n");
+ kref_get(&dev->kref);
+ if (IS_ERR(kthread_run(nvme_remove_dead_ctrl, dev, "nvme%d",
+ dev->instance))) {
+ dev_err(dev->dev,
+ "Failed to start controller remove task\n");
+ kref_put(&dev->kref, nvme_free_dev);
+ }
+}
+
static void nvme_dev_reset(struct nvme_dev *dev)
{
+ bool in_probe = work_busy(&dev->probe_work);
+
nvme_dev_shutdown(dev);
- if (nvme_dev_resume(dev)) {
- dev_warn(dev->dev, "Device failed to resume\n");
- kref_get(&dev->kref);
- if (IS_ERR(kthread_run(nvme_remove_dead_ctrl, dev, "nvme%d",
- dev->instance))) {
- dev_err(dev->dev,
- "Failed to start controller remove task\n");
- kref_put(&dev->kref, nvme_free_dev);
- }
+
+ /* Synchronize with device probe so that the probe work will see the
+ * failure status and exit gracefully without scheduling another reset */
+ flush_work(&dev->probe_work);
+
+ /* Fail this device if a reset occurred during probe to avoid
+ * infinite initialization loops. */
+ if (in_probe) {
+ nvme_dead_ctrl(dev);
+ return;
}
+ /* Schedule device resume asynchronously so the reset work is available
+ * to clean up errors that may occur during reinitialization */
+ schedule_work(&dev->probe_work);
}
static void nvme_reset_failed_dev(struct work_struct *ws)
@@ -2957,6 +2995,7 @@ static int nvme_reset(struct nvme_dev *dev)
if (!ret) {
flush_work(&dev->reset_work);
+ flush_work(&dev->probe_work);
return 0;
}
@@ -3053,26 +3092,9 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
static void nvme_async_probe(struct work_struct *work)
{
struct nvme_dev *dev = container_of(work, struct nvme_dev, probe_work);
- int result;
- result = nvme_dev_start(dev);
- if (result)
- goto reset;
-
- if (dev->online_queues > 1)
- result = nvme_dev_add(dev);
- if (result)
- goto reset;
-
- nvme_set_irq_hints(dev);
- return;
- reset:
- spin_lock(&dev_list_lock);
- if (!work_busy(&dev->reset_work)) {
- dev->reset_workfn = nvme_reset_failed_dev;
- queue_work(nvme_workq, &dev->reset_work);
- }
- spin_unlock(&dev_list_lock);
+ if (nvme_dev_resume(dev) && !work_busy(&dev->reset_work))
+ nvme_dead_ctrl(dev);
}
static void nvme_reset_notify(struct pci_dev *pdev, bool prepare)
@@ -3104,8 +3126,8 @@ static void nvme_remove(struct pci_dev *pdev)
flush_work(&dev->reset_work);
flush_work(&dev->scan_work);
device_remove_file(dev->device, &dev_attr_reset_controller);
- nvme_dev_shutdown(dev);
nvme_dev_remove(dev);
+ nvme_dev_shutdown(dev);
nvme_dev_remove_admin(dev);
device_destroy(nvme_class, MKDEV(nvme_char_major, dev->instance));
nvme_free_queues(dev, 0);
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index 713fc9ff1149..2126842fb6e8 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -84,6 +84,13 @@ MODULE_PARM_DESC(max_persistent_grants,
"Maximum number of grants to map persistently");
/*
+ * Maximum order of pages to be used for the shared ring between front and
+ * backend; 4KB page granularity is used.
+ */
+unsigned int xen_blkif_max_ring_order = XENBUS_MAX_RING_PAGE_ORDER;
+module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, S_IRUGO);
+MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the shared ring");
+/*
* The LRU mechanism to clean the lists of persistent grants needs to
* be executed periodically. The time interval between consecutive executions
* of the purge mechanism is set in ms.
@@ -1438,6 +1445,12 @@ static int __init xen_blkif_init(void)
if (!xen_domain())
return -ENODEV;
+ if (xen_blkif_max_ring_order > XENBUS_MAX_RING_PAGE_ORDER) {
+ pr_info("Invalid max_ring_order (%d), will use default max: %d.\n",
+ xen_blkif_max_ring_order, XENBUS_MAX_RING_PAGE_ORDER);
+ xen_blkif_max_ring_order = XENBUS_MAX_RING_PAGE_ORDER;
+ }
+
rc = xen_blkif_interface_init();
if (rc)
goto failed_init;
diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
index f620b5d3f77c..8ccc49d01c8e 100644
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -44,6 +44,7 @@
#include <xen/interface/io/blkif.h>
#include <xen/interface/io/protocols.h>
+extern unsigned int xen_blkif_max_ring_order;
/*
* This is the maximum number of segments that would be allowed in indirect
* requests. This value will also be passed to the frontend.
@@ -248,7 +249,7 @@ struct backend_info;
#define PERSISTENT_GNT_WAS_ACTIVE 1
/* Number of requests that we can fit in a ring */
-#define XEN_BLKIF_REQS 32
+#define XEN_BLKIF_REQS_PER_PAGE 32
struct persistent_gnt {
struct page *page;
@@ -320,6 +321,7 @@ struct xen_blkif {
struct work_struct free_work;
/* Thread shutdown wait queue. */
wait_queue_head_t shutdown_wq;
+ unsigned int nr_ring_pages;
};
struct seg_buf {
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index 6ab69ad61ee1..deb3f001791f 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -25,6 +25,7 @@
/* Enlarge the array size in order to fully show blkback name. */
#define BLKBACK_NAME_LEN (20)
+#define RINGREF_NAME_LEN (20)
struct backend_info {
struct xenbus_device *dev;
@@ -124,8 +125,6 @@ static void xen_update_blkif_status(struct xen_blkif *blkif)
static struct xen_blkif *xen_blkif_alloc(domid_t domid)
{
struct xen_blkif *blkif;
- struct pending_req *req, *n;
- int i, j;
BUILD_BUG_ON(MAX_INDIRECT_PAGES > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST);
@@ -151,55 +150,15 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid)
INIT_LIST_HEAD(&blkif->pending_free);
INIT_WORK(&blkif->free_work, xen_blkif_deferred_free);
-
- for (i = 0; i < XEN_BLKIF_REQS; i++) {
- req = kzalloc(sizeof(*req), GFP_KERNEL);
- if (!req)
- goto fail;
- list_add_tail(&req->free_list,
- &blkif->pending_free);
- for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) {
- req->segments[j] = kzalloc(sizeof(*req->segments[0]),
- GFP_KERNEL);
- if (!req->segments[j])
- goto fail;
- }
- for (j = 0; j < MAX_INDIRECT_PAGES; j++) {
- req->indirect_pages[j] = kzalloc(sizeof(*req->indirect_pages[0]),
- GFP_KERNEL);
- if (!req->indirect_pages[j])
- goto fail;
- }
- }
spin_lock_init(&blkif->pending_free_lock);
init_waitqueue_head(&blkif->pending_free_wq);
init_waitqueue_head(&blkif->shutdown_wq);
return blkif;
-
-fail:
- list_for_each_entry_safe(req, n, &blkif->pending_free, free_list) {
- list_del(&req->free_list);
- for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) {
- if (!req->segments[j])
- break;
- kfree(req->segments[j]);
- }
- for (j = 0; j < MAX_INDIRECT_PAGES; j++) {
- if (!req->indirect_pages[j])
- break;
- kfree(req->indirect_pages[j]);
- }
- kfree(req);
- }
-
- kmem_cache_free(xen_blkif_cachep, blkif);
-
- return ERR_PTR(-ENOMEM);
}
-static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t gref,
- unsigned int evtchn)
+static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref,
+ unsigned int nr_grefs, unsigned int evtchn)
{
int err;
@@ -207,7 +166,7 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t gref,
if (blkif->irq)
return 0;
- err = xenbus_map_ring_valloc(blkif->be->dev, &gref, 1,
+ err = xenbus_map_ring_valloc(blkif->be->dev, gref, nr_grefs,
&blkif->blk_ring);
if (err < 0)
return err;
@@ -217,21 +176,21 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t gref,
{
struct blkif_sring *sring;
sring = (struct blkif_sring *)blkif->blk_ring;
- BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE);
+ BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE * nr_grefs);
break;
}
case BLKIF_PROTOCOL_X86_32:
{
struct blkif_x86_32_sring *sring_x86_32;
sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring;
- BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE);
+ BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE * nr_grefs);
break;
}
case BLKIF_PROTOCOL_X86_64:
{
struct blkif_x86_64_sring *sring_x86_64;
sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring;
- BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE);
+ BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE * nr_grefs);
break;
}
default:
@@ -312,7 +271,7 @@ static void xen_blkif_free(struct xen_blkif *blkif)
i++;
}
- WARN_ON(i != XEN_BLKIF_REQS);
+ WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages));
kmem_cache_free(xen_blkif_cachep, blkif);
}
@@ -597,6 +556,11 @@ static int xen_blkbk_probe(struct xenbus_device *dev,
if (err)
goto fail;
+ err = xenbus_printf(XBT_NIL, dev->nodename, "max-ring-page-order", "%u",
+ xen_blkif_max_ring_order);
+ if (err)
+ pr_warn("%s write out 'max-ring-page-order' failed\n", __func__);
+
err = xenbus_switch_state(dev, XenbusStateInitWait);
if (err)
goto fail;
@@ -860,22 +824,66 @@ again:
static int connect_ring(struct backend_info *be)
{
struct xenbus_device *dev = be->dev;
- unsigned long ring_ref;
- unsigned int evtchn;
+ unsigned int ring_ref[XENBUS_MAX_RING_PAGES];
+ unsigned int evtchn, nr_grefs, ring_page_order;
unsigned int pers_grants;
char protocol[64] = "";
- int err;
+ struct pending_req *req, *n;
+ int err, i, j;
pr_debug("%s %s\n", __func__, dev->otherend);
- err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu",
- &ring_ref, "event-channel", "%u", &evtchn, NULL);
- if (err) {
- xenbus_dev_fatal(dev, err,
- "reading %s/ring-ref and event-channel",
+ err = xenbus_scanf(XBT_NIL, dev->otherend, "event-channel", "%u",
+ &evtchn);
+ if (err != 1) {
+ err = -EINVAL;
+ xenbus_dev_fatal(dev, err, "reading %s/event-channel",
dev->otherend);
return err;
}
+ pr_info("event-channel %u\n", evtchn);
+
+ err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-page-order", "%u",
+ &ring_page_order);
+ if (err != 1) {
+ err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-ref",
+ "%u", &ring_ref[0]);
+ if (err != 1) {
+ err = -EINVAL;
+ xenbus_dev_fatal(dev, err, "reading %s/ring-ref",
+ dev->otherend);
+ return err;
+ }
+ nr_grefs = 1;
+ pr_info("%s:using single page: ring-ref %d\n", dev->otherend,
+ ring_ref[0]);
+ } else {
+ unsigned int i;
+
+ if (ring_page_order > xen_blkif_max_ring_order) {
+ err = -EINVAL;
+ xenbus_dev_fatal(dev, err, "%s/request %d ring page order exceed max:%d",
+ dev->otherend, ring_page_order,
+ xen_blkif_max_ring_order);
+ return err;
+ }
+
+ nr_grefs = 1 << ring_page_order;
+ for (i = 0; i < nr_grefs; i++) {
+ char ring_ref_name[RINGREF_NAME_LEN];
+
+ snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i);
+ err = xenbus_scanf(XBT_NIL, dev->otherend, ring_ref_name,
+ "%u", &ring_ref[i]);
+ if (err != 1) {
+ err = -EINVAL;
+ xenbus_dev_fatal(dev, err, "reading %s/%s",
+ dev->otherend, ring_ref_name);
+ return err;
+ }
+ pr_info("ring-ref%u: %u\n", i, ring_ref[i]);
+ }
+ }
be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT;
err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
@@ -900,20 +908,55 @@ static int connect_ring(struct backend_info *be)
be->blkif->vbd.feature_gnt_persistent = pers_grants;
be->blkif->vbd.overflow_max_grants = 0;
+ be->blkif->nr_ring_pages = nr_grefs;
- pr_info("ring-ref %ld, event-channel %d, protocol %d (%s) %s\n",
- ring_ref, evtchn, be->blkif->blk_protocol, protocol,
+ pr_info("ring-pages:%d, event-channel %d, protocol %d (%s) %s\n",
+ nr_grefs, evtchn, be->blkif->blk_protocol, protocol,
pers_grants ? "persistent grants" : "");
+ for (i = 0; i < nr_grefs * XEN_BLKIF_REQS_PER_PAGE; i++) {
+ req = kzalloc(sizeof(*req), GFP_KERNEL);
+ if (!req)
+ goto fail;
+ list_add_tail(&req->free_list, &be->blkif->pending_free);
+ for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) {
+ req->segments[j] = kzalloc(sizeof(*req->segments[0]), GFP_KERNEL);
+ if (!req->segments[j])
+ goto fail;
+ }
+ for (j = 0; j < MAX_INDIRECT_PAGES; j++) {
+ req->indirect_pages[j] = kzalloc(sizeof(*req->indirect_pages[0]),
+ GFP_KERNEL);
+ if (!req->indirect_pages[j])
+ goto fail;
+ }
+ }
+
/* Map the shared frame, irq etc. */
- err = xen_blkif_map(be->blkif, ring_ref, evtchn);
+ err = xen_blkif_map(be->blkif, ring_ref, nr_grefs, evtchn);
if (err) {
- xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u",
- ring_ref, evtchn);
+ xenbus_dev_fatal(dev, err, "mapping ring-ref port %u", evtchn);
return err;
}
return 0;
+
+fail:
+ list_for_each_entry_safe(req, n, &be->blkif->pending_free, free_list) {
+ list_del(&req->free_list);
+ for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) {
+ if (!req->segments[j])
+ break;
+ kfree(req->segments[j]);
+ }
+ for (j = 0; j < MAX_INDIRECT_PAGES; j++) {
+ if (!req->indirect_pages[j])
+ break;
+ kfree(req->indirect_pages[j]);
+ }
+ kfree(req);
+ }
+ return -ENOMEM;
}
static const struct xenbus_device_id xen_blkbk_ids[] = {
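
For reference, the negotiation implemented above ends up as a small xenbus protocol: the backend advertises "max-ring-page-order" in its own directory, and the frontend replies in its directory with either a legacy single "ring-ref" or a multi-page layout. With ring-page-order 2, the frontend keys would look roughly like this (values are illustrative, not taken from the patch):

    ring-page-order = "2"
    ring-ref0 = "8"
    ring-ref1 = "9"
    ring-ref2 = "10"
    ring-ref3 = "11"
    event-channel = "35"

connect_ring() reads these keys back, maps nr_grefs = 1 << ring-page-order grant references, and sizes the request pool to XEN_BLKIF_REQS_PER_PAGE * nr_grefs.
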
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 2c61cf8c6f61..fc770b7d3beb 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -98,7 +98,21 @@ static unsigned int xen_blkif_max_segments = 32;
module_param_named(max, xen_blkif_max_segments, int, S_IRUGO);
MODULE_PARM_DESC(max, "Maximum amount of segments in indirect requests (default is 32)");
-#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE)
+/*
+ * Maximum order of pages to be used for the shared ring between front and
+ * backend; 4KB page granularity is used.
+ */
+static unsigned int xen_blkif_max_ring_order;
+module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, S_IRUGO);
+MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the shared ring");
+
+#define BLK_RING_SIZE(info) __CONST_RING_SIZE(blkif, PAGE_SIZE * (info)->nr_ring_pages)
+#define BLK_MAX_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE * XENBUS_MAX_RING_PAGES)
+/*
+ * ring-ref%i i=(-1UL) would take 11 characters + 'ring-ref' is 8, so 19
+ * characters are enough. Define to 20 to keep it consistent with the backend.
+ */
+#define RINGREF_NAME_LEN (20)
/*
* We have one of these per vbd, whether ide, scsi or 'other'. They
@@ -114,13 +128,14 @@ struct blkfront_info
int vdevice;
blkif_vdev_t handle;
enum blkif_state connected;
- int ring_ref;
+ int ring_ref[XENBUS_MAX_RING_PAGES];
+ unsigned int nr_ring_pages;
struct blkif_front_ring ring;
unsigned int evtchn, irq;
struct request_queue *rq;
struct work_struct work;
struct gnttab_free_callback callback;
- struct blk_shadow shadow[BLK_RING_SIZE];
+ struct blk_shadow shadow[BLK_MAX_RING_SIZE];
struct list_head grants;
struct list_head indirect_pages;
unsigned int persistent_gnts_c;
@@ -139,8 +154,6 @@ static unsigned int nr_minors;
static unsigned long *minors;
static DEFINE_SPINLOCK(minor_lock);
-#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
- (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
#define GRANT_INVALID_REF 0
#define PARTS_PER_DISK 16
@@ -170,7 +183,7 @@ static int blkfront_setup_indirect(struct blkfront_info *info);
static int get_id_from_freelist(struct blkfront_info *info)
{
unsigned long free = info->shadow_free;
- BUG_ON(free >= BLK_RING_SIZE);
+ BUG_ON(free >= BLK_RING_SIZE(info));
info->shadow_free = info->shadow[free].req.u.rw.id;
info->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */
return free;
@@ -983,7 +996,7 @@ static void blkif_free(struct blkfront_info *info, int suspend)
}
}
- for (i = 0; i < BLK_RING_SIZE; i++) {
+ for (i = 0; i < BLK_RING_SIZE(info); i++) {
/*
* Clear persistent grants present in requests already
* on the shared ring
@@ -1033,12 +1046,15 @@ free_shadow:
flush_work(&info->work);
/* Free resources associated with old device channel. */
- if (info->ring_ref != GRANT_INVALID_REF) {
- gnttab_end_foreign_access(info->ring_ref, 0,
- (unsigned long)info->ring.sring);
- info->ring_ref = GRANT_INVALID_REF;
- info->ring.sring = NULL;
+ for (i = 0; i < info->nr_ring_pages; i++) {
+ if (info->ring_ref[i] != GRANT_INVALID_REF) {
+ gnttab_end_foreign_access(info->ring_ref[i], 0, 0);
+ info->ring_ref[i] = GRANT_INVALID_REF;
+ }
}
+ free_pages((unsigned long)info->ring.sring, get_order(info->nr_ring_pages * PAGE_SIZE));
+ info->ring.sring = NULL;
+
if (info->irq)
unbind_from_irqhandler(info->irq, info);
info->evtchn = info->irq = 0;
@@ -1157,7 +1173,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
* never have given to it (we stamp it up to BLK_RING_SIZE -
* look in get_id_from_freelist.
*/
- if (id >= BLK_RING_SIZE) {
+ if (id >= BLK_RING_SIZE(info)) {
WARN(1, "%s: response to %s has incorrect id (%ld)\n",
info->gd->disk_name, op_name(bret->operation), id);
/* We can't safely get the 'struct request' as
@@ -1245,26 +1261,30 @@ static int setup_blkring(struct xenbus_device *dev,
struct blkfront_info *info)
{
struct blkif_sring *sring;
- grant_ref_t gref;
- int err;
+ int err, i;
+ unsigned long ring_size = info->nr_ring_pages * PAGE_SIZE;
+ grant_ref_t gref[XENBUS_MAX_RING_PAGES];
- info->ring_ref = GRANT_INVALID_REF;
+ for (i = 0; i < info->nr_ring_pages; i++)
+ info->ring_ref[i] = GRANT_INVALID_REF;
- sring = (struct blkif_sring *)__get_free_page(GFP_NOIO | __GFP_HIGH);
+ sring = (struct blkif_sring *)__get_free_pages(GFP_NOIO | __GFP_HIGH,
+ get_order(ring_size));
if (!sring) {
xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
return -ENOMEM;
}
SHARED_RING_INIT(sring);
- FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
+ FRONT_RING_INIT(&info->ring, sring, ring_size);
- err = xenbus_grant_ring(dev, info->ring.sring, 1, &gref);
+ err = xenbus_grant_ring(dev, info->ring.sring, info->nr_ring_pages, gref);
if (err < 0) {
- free_page((unsigned long)sring);
+ free_pages((unsigned long)sring, get_order(ring_size));
info->ring.sring = NULL;
goto fail;
}
- info->ring_ref = gref;
+ for (i = 0; i < info->nr_ring_pages; i++)
+ info->ring_ref[i] = gref[i];
err = xenbus_alloc_evtchn(dev, &info->evtchn);
if (err)
@@ -1292,7 +1312,18 @@ static int talk_to_blkback(struct xenbus_device *dev,
{
const char *message = NULL;
struct xenbus_transaction xbt;
- int err;
+ int err, i;
+ unsigned int max_page_order = 0;
+ unsigned int ring_page_order = 0;
+
+ err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
+ "max-ring-page-order", "%u", &max_page_order);
+ if (err != 1)
+ info->nr_ring_pages = 1;
+ else {
+ ring_page_order = min(xen_blkif_max_ring_order, max_page_order);
+ info->nr_ring_pages = 1 << ring_page_order;
+ }
/* Create shared ring, alloc event channel. */
err = setup_blkring(dev, info);
@@ -1306,11 +1337,32 @@ again:
goto destroy_blkring;
}
- err = xenbus_printf(xbt, dev->nodename,
- "ring-ref", "%u", info->ring_ref);
- if (err) {
- message = "writing ring-ref";
- goto abort_transaction;
+ if (info->nr_ring_pages == 1) {
+ err = xenbus_printf(xbt, dev->nodename,
+ "ring-ref", "%u", info->ring_ref[0]);
+ if (err) {
+ message = "writing ring-ref";
+ goto abort_transaction;
+ }
+ } else {
+ err = xenbus_printf(xbt, dev->nodename,
+ "ring-page-order", "%u", ring_page_order);
+ if (err) {
+ message = "writing ring-page-order";
+ goto abort_transaction;
+ }
+
+ for (i = 0; i < info->nr_ring_pages; i++) {
+ char ring_ref_name[RINGREF_NAME_LEN];
+
+ snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i);
+ err = xenbus_printf(xbt, dev->nodename, ring_ref_name,
+ "%u", info->ring_ref[i]);
+ if (err) {
+ message = "writing ring-ref";
+ goto abort_transaction;
+ }
+ }
}
err = xenbus_printf(xbt, dev->nodename,
"event-channel", "%u", info->evtchn);
@@ -1338,6 +1390,9 @@ again:
goto destroy_blkring;
}
+ for (i = 0; i < BLK_RING_SIZE(info); i++)
+ info->shadow[i].req.u.rw.id = i+1;
+ info->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff;
xenbus_switch_state(dev, XenbusStateInitialised);
return 0;
@@ -1361,7 +1416,7 @@ again:
static int blkfront_probe(struct xenbus_device *dev,
const struct xenbus_device_id *id)
{
- int err, vdevice, i;
+ int err, vdevice;
struct blkfront_info *info;
/* FIXME: Use dynamic device id if this is not set. */
@@ -1422,21 +1477,10 @@ static int blkfront_probe(struct xenbus_device *dev,
info->connected = BLKIF_STATE_DISCONNECTED;
INIT_WORK(&info->work, blkif_restart_queue);
- for (i = 0; i < BLK_RING_SIZE; i++)
- info->shadow[i].req.u.rw.id = i+1;
- info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
-
/* Front end dir is a number, which is used as the id. */
info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
dev_set_drvdata(&dev->dev, info);
- err = talk_to_blkback(dev, info);
- if (err) {
- kfree(info);
- dev_set_drvdata(&dev->dev, NULL);
- return err;
- }
-
return 0;
}
@@ -1476,10 +1520,10 @@ static int blkif_recover(struct blkfront_info *info)
/* Stage 2: Set up free list. */
memset(&info->shadow, 0, sizeof(info->shadow));
- for (i = 0; i < BLK_RING_SIZE; i++)
+ for (i = 0; i < BLK_RING_SIZE(info); i++)
info->shadow[i].req.u.rw.id = i+1;
info->shadow_free = info->ring.req_prod_pvt;
- info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
+ info->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff;
rc = blkfront_setup_indirect(info);
if (rc) {
@@ -1491,7 +1535,7 @@ static int blkif_recover(struct blkfront_info *info)
blk_queue_max_segments(info->rq, segs);
bio_list_init(&bio_list);
INIT_LIST_HEAD(&requests);
- for (i = 0; i < BLK_RING_SIZE; i++) {
+ for (i = 0; i < BLK_RING_SIZE(info); i++) {
/* Not in use? */
if (!copy[i].request)
continue;
@@ -1697,7 +1741,7 @@ static int blkfront_setup_indirect(struct blkfront_info *info)
segs = info->max_indirect_segments;
}
- err = fill_grant_buffer(info, (segs + INDIRECT_GREFS(segs)) * BLK_RING_SIZE);
+ err = fill_grant_buffer(info, (segs + INDIRECT_GREFS(segs)) * BLK_RING_SIZE(info));
if (err)
goto out_of_memory;
@@ -1707,7 +1751,7 @@ static int blkfront_setup_indirect(struct blkfront_info *info)
* grants, we need to allocate a set of pages that can be
* used for mapping indirect grefs
*/
- int num = INDIRECT_GREFS(segs) * BLK_RING_SIZE;
+ int num = INDIRECT_GREFS(segs) * BLK_RING_SIZE(info);
BUG_ON(!list_empty(&info->indirect_pages));
for (i = 0; i < num; i++) {
@@ -1718,7 +1762,7 @@ static int blkfront_setup_indirect(struct blkfront_info *info)
}
}
- for (i = 0; i < BLK_RING_SIZE; i++) {
+ for (i = 0; i < BLK_RING_SIZE(info); i++) {
info->shadow[i].grants_used = kzalloc(
sizeof(info->shadow[i].grants_used[0]) * segs,
GFP_NOIO);
@@ -1740,7 +1784,7 @@ static int blkfront_setup_indirect(struct blkfront_info *info)
return 0;
out_of_memory:
- for (i = 0; i < BLK_RING_SIZE; i++) {
+ for (i = 0; i < BLK_RING_SIZE(info); i++) {
kfree(info->shadow[i].grants_used);
info->shadow[i].grants_used = NULL;
kfree(info->shadow[i].sg);
@@ -1906,8 +1950,15 @@ static void blkback_changed(struct xenbus_device *dev,
dev_dbg(&dev->dev, "blkfront:blkback_changed to state %d.\n", backend_state);
switch (backend_state) {
- case XenbusStateInitialising:
case XenbusStateInitWait:
+ if (dev->state != XenbusStateInitialising)
+ break;
+ if (talk_to_blkback(dev, info)) {
+ kfree(info);
+ dev_set_drvdata(&dev->dev, NULL);
+ break;
+ }
+ case XenbusStateInitialising:
case XenbusStateInitialised:
case XenbusStateReconfiguring:
case XenbusStateReconfigured:
@@ -2091,6 +2142,12 @@ static int __init xlblk_init(void)
if (!xen_domain())
return -ENODEV;
+ if (xen_blkif_max_ring_order > XENBUS_MAX_RING_PAGE_ORDER) {
+ pr_info("Invalid max_ring_order (%d), will use default max: %d.\n",
+ xen_blkif_max_ring_order, XENBUS_MAX_RING_PAGE_ORDER);
+ xen_blkif_max_ring_order = 0;
+ }
+
if (!xen_has_pv_disk_devices())
return -ENODEV;
diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index bda2cb06dc7a..88d474b78076 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -162,6 +162,17 @@ config MX3_IPU_IRQS
To avoid bloating the irq_desc[] array we allocate a sufficient
number of IRQ slots and map them dynamically to specific sources.
+config PXA_DMA
+ bool "PXA DMA support"
+ depends on (ARCH_MMP || ARCH_PXA)
+ select DMA_ENGINE
+ select DMA_VIRTUAL_CHANNELS
+ help
+ Support the DMA engine for PXA. It is also compatible with the MMP
+ PDMA platform. The internal DMA IP of all PXA variants is supported, with
+ 16 to 32 channels for peripheral to memory or memory to memory
+ transfers.
+
config TXX9_DMAC
tristate "Toshiba TXx9 SoC DMA support"
depends on MACH_TX49XX || MACH_TX39XX
@@ -245,6 +256,9 @@ config TI_EDMA
Enable support for the TI EDMA controller. This DMA
engine is found on TI DaVinci and AM33xx parts.
+config TI_DMA_CROSSBAR
+ bool
+
config ARCH_HAS_ASYNC_TX_FIND_CHANNEL
bool
@@ -330,6 +344,7 @@ config DMA_OMAP
depends on ARCH_OMAP
select DMA_ENGINE
select DMA_VIRTUAL_CHANNELS
+ select TI_DMA_CROSSBAR if SOC_DRA7XX
config DMA_BCM2835
tristate "BCM2835 DMA engine support"
diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile
index 69f77d5ba53b..6a4d6f2827da 100644
--- a/drivers/dma/Makefile
+++ b/drivers/dma/Makefile
@@ -25,6 +25,7 @@ obj-$(CONFIG_AMCC_PPC440SPE_ADMA) += ppc4xx/
obj-$(CONFIG_IMX_SDMA) += imx-sdma.o
obj-$(CONFIG_IMX_DMA) += imx-dma.o
obj-$(CONFIG_MXS_DMA) += mxs-dma.o
+obj-$(CONFIG_PXA_DMA) += pxa_dma.o
obj-$(CONFIG_TIMB_DMA) += timb_dma.o
obj-$(CONFIG_SIRF_DMA) += sirf-dma.o
obj-$(CONFIG_TI_EDMA) += edma.o
@@ -38,6 +39,7 @@ obj-$(CONFIG_EP93XX_DMA) += ep93xx_dma.o
obj-$(CONFIG_DMA_SA11X0) += sa11x0-dma.o
obj-$(CONFIG_MMP_TDMA) += mmp_tdma.o
obj-$(CONFIG_DMA_OMAP) += omap-dma.o
+obj-$(CONFIG_TI_DMA_CROSSBAR) += ti-dma-crossbar.o
obj-$(CONFIG_DMA_BCM2835) += bcm2835-dma.o
obj-$(CONFIG_MMP_PDMA) += mmp_pdma.o
obj-$(CONFIG_DMA_JZ4740) += dma-jz4740.o
diff --git a/drivers/dma/amba-pl08x.c b/drivers/dma/amba-pl08x.c
index 49d396ec06e5..5de3cf453f35 100644
--- a/drivers/dma/amba-pl08x.c
+++ b/drivers/dma/amba-pl08x.c
@@ -474,7 +474,7 @@ static void pl08x_terminate_phy_chan(struct pl08x_driver_data *pl08x,
u32 val = readl(ch->reg_config);
val &= ~(PL080_CONFIG_ENABLE | PL080_CONFIG_ERR_IRQ_MASK |
- PL080_CONFIG_TC_IRQ_MASK);
+ PL080_CONFIG_TC_IRQ_MASK);
writel(val, ch->reg_config);
diff --git a/drivers/dma/at_hdmac.c b/drivers/dma/at_hdmac.c
index 57b2141ddddc..59892126d175 100644
--- a/drivers/dma/at_hdmac.c
+++ b/drivers/dma/at_hdmac.c
@@ -247,6 +247,10 @@ static void atc_dostart(struct at_dma_chan *atchan, struct at_desc *first)
channel_writel(atchan, CTRLA, 0);
channel_writel(atchan, CTRLB, 0);
channel_writel(atchan, DSCR, first->txd.phys);
+ channel_writel(atchan, SPIP, ATC_SPIP_HOLE(first->src_hole) |
+ ATC_SPIP_BOUNDARY(first->boundary));
+ channel_writel(atchan, DPIP, ATC_DPIP_HOLE(first->dst_hole) |
+ ATC_DPIP_BOUNDARY(first->boundary));
dma_writel(atdma, CHER, atchan->mask);
vdbg_dump_regs(atchan);
@@ -635,6 +639,104 @@ static dma_cookie_t atc_tx_submit(struct dma_async_tx_descriptor *tx)
}
/**
+ * atc_prep_dma_interleaved - prepare memory to memory interleaved operation
+ * @chan: the channel to prepare operation on
+ * @xt: Interleaved transfer template
+ * @flags: tx descriptor status flags
+ */
+static struct dma_async_tx_descriptor *
+atc_prep_dma_interleaved(struct dma_chan *chan,
+ struct dma_interleaved_template *xt,
+ unsigned long flags)
+{
+ struct at_dma_chan *atchan = to_at_dma_chan(chan);
+ struct data_chunk *first = xt->sgl;
+ struct at_desc *desc = NULL;
+ size_t xfer_count;
+ unsigned int dwidth;
+ u32 ctrla;
+ u32 ctrlb;
+ size_t len = 0;
+ int i;
+
+ dev_info(chan2dev(chan),
+ "%s: src=0x%08x, dest=0x%08x, numf=%d, frame_size=%d, flags=0x%lx\n",
+ __func__, xt->src_start, xt->dst_start, xt->numf,
+ xt->frame_size, flags);
+
+ if (unlikely(!xt || xt->numf != 1 || !xt->frame_size))
+ return NULL;
+
+ /*
+ * The controller can only "skip" X bytes every Y bytes, so we
+ * need to make sure we are given a template that fits that
+ * description, i.e. a template whose chunks all have the
+ * same size and the same ICGs.
+ */
+ for (i = 0; i < xt->frame_size; i++) {
+ struct data_chunk *chunk = xt->sgl + i;
+
+ if ((chunk->size != xt->sgl->size) ||
+ (dmaengine_get_dst_icg(xt, chunk) != dmaengine_get_dst_icg(xt, first)) ||
+ (dmaengine_get_src_icg(xt, chunk) != dmaengine_get_src_icg(xt, first))) {
+ dev_err(chan2dev(chan),
+ "%s: the controller can transfer only identical chunks\n",
+ __func__);
+ return NULL;
+ }
+
+ len += chunk->size;
+ }
+
+ dwidth = atc_get_xfer_width(xt->src_start,
+ xt->dst_start, len);
+
+ xfer_count = len >> dwidth;
+ if (xfer_count > ATC_BTSIZE_MAX) {
+ dev_err(chan2dev(chan), "%s: buffer is too big\n", __func__);
+ return NULL;
+ }
+
+ ctrla = ATC_SRC_WIDTH(dwidth) |
+ ATC_DST_WIDTH(dwidth);
+
+ ctrlb = ATC_DEFAULT_CTRLB | ATC_IEN
+ | ATC_SRC_ADDR_MODE_INCR
+ | ATC_DST_ADDR_MODE_INCR
+ | ATC_SRC_PIP
+ | ATC_DST_PIP
+ | ATC_FC_MEM2MEM;
+
+ /* create the transfer */
+ desc = atc_desc_get(atchan);
+ if (!desc) {
+ dev_err(chan2dev(chan),
+ "%s: couldn't allocate our descriptor\n", __func__);
+ return NULL;
+ }
+
+ desc->lli.saddr = xt->src_start;
+ desc->lli.daddr = xt->dst_start;
+ desc->lli.ctrla = ctrla | xfer_count;
+ desc->lli.ctrlb = ctrlb;
+
+ desc->boundary = first->size >> dwidth;
+ desc->dst_hole = (dmaengine_get_dst_icg(xt, first) >> dwidth) + 1;
+ desc->src_hole = (dmaengine_get_src_icg(xt, first) >> dwidth) + 1;
+
+ desc->txd.cookie = -EBUSY;
+ desc->total_len = desc->len = len;
+ desc->tx_width = dwidth;
+
+ /* set end-of-link on the last link descriptor of the list */
+ set_desc_eol(desc);
+
+ desc->txd.flags = flags; /* client is in control of this ack */
+
+ return &desc->txd;
+}
+
+/**
* atc_prep_dma_memcpy - prepare a memcpy operation
* @chan: the channel to prepare operation on
* @dest: operation virtual destination address
@@ -1609,6 +1711,7 @@ static int __init at_dma_probe(struct platform_device *pdev)
/* setup platform data for each SoC */
dma_cap_set(DMA_MEMCPY, at91sam9rl_config.cap_mask);
dma_cap_set(DMA_SG, at91sam9rl_config.cap_mask);
+ dma_cap_set(DMA_INTERLEAVE, at91sam9g45_config.cap_mask);
dma_cap_set(DMA_MEMCPY, at91sam9g45_config.cap_mask);
dma_cap_set(DMA_SLAVE, at91sam9g45_config.cap_mask);
dma_cap_set(DMA_SG, at91sam9g45_config.cap_mask);
@@ -1713,6 +1816,9 @@ static int __init at_dma_probe(struct platform_device *pdev)
atdma->dma_common.dev = &pdev->dev;
/* set prep routines based on capability */
+ if (dma_has_cap(DMA_INTERLEAVE, atdma->dma_common.cap_mask))
+ atdma->dma_common.device_prep_interleaved_dma = atc_prep_dma_interleaved;
+
if (dma_has_cap(DMA_MEMCPY, atdma->dma_common.cap_mask))
atdma->dma_common.device_prep_dma_memcpy = atc_prep_dma_memcpy;
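
To make the "identical chunks" constraint in atc_prep_dma_interleaved() concrete, here is a client-side sketch (not part of the patch) of a template the new callback accepts: a single frame whose chunks all share one size and one ICG. The channel, addresses and sizes are assumptions, and error handling/cleanup is trimmed.

    struct dma_async_tx_descriptor *tx;
    struct dma_interleaved_template *xt;

    xt = kzalloc(sizeof(*xt) + 2 * sizeof(struct data_chunk), GFP_KERNEL);
    if (!xt)
            return -ENOMEM;

    xt->src_start = src_phys;       /* assumed DMA addresses */
    xt->dst_start = dst_phys;
    xt->dir = DMA_MEM_TO_MEM;
    xt->src_inc = true;
    xt->dst_inc = true;
    xt->src_sgl = true;             /* honor the source ICG */
    xt->dst_sgl = false;            /* pack the destination */
    xt->numf = 1;                   /* the driver requires a single frame */
    xt->frame_size = 2;
    xt->sgl[0].size = 512;          /* chunks must be identical ... */
    xt->sgl[0].icg = 64;            /* ... with identical ICGs */
    xt->sgl[1] = xt->sgl[0];

    tx = dmaengine_prep_interleaved_dma(chan, xt, DMA_PREP_INTERRUPT);
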
diff --git a/drivers/dma/at_hdmac_regs.h b/drivers/dma/at_hdmac_regs.h
index 2727ca560572..bc8d5ebedd19 100644
--- a/drivers/dma/at_hdmac_regs.h
+++ b/drivers/dma/at_hdmac_regs.h
@@ -196,6 +196,11 @@ struct at_desc {
size_t len;
u32 tx_width;
size_t total_len;
+
+ /* Interleaved data */
+ size_t boundary;
+ size_t dst_hole;
+ size_t src_hole;
};
static inline struct at_desc *
diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c
index 7992164ea9ec..cf1213de7865 100644
--- a/drivers/dma/at_xdmac.c
+++ b/drivers/dma/at_xdmac.c
@@ -235,6 +235,10 @@ struct at_xdmac_lld {
dma_addr_t mbr_sa; /* Source Address Member */
dma_addr_t mbr_da; /* Destination Address Member */
u32 mbr_cfg; /* Configuration Register */
+ u32 mbr_bc; /* Block Control Register */
+ u32 mbr_ds; /* Data Stride Register */
+ u32 mbr_sus; /* Source Microblock Stride Register */
+ u32 mbr_dus; /* Destination Microblock Stride Register */
};
@@ -358,6 +362,8 @@ static void at_xdmac_start_xfer(struct at_xdmac_chan *atchan,
if (at_xdmac_chan_is_cyclic(atchan)) {
reg = AT_XDMAC_CNDC_NDVIEW_NDV1;
at_xdmac_chan_write(atchan, AT_XDMAC_CC, first->lld.mbr_cfg);
+ } else if (first->lld.mbr_ubc & AT_XDMAC_MBR_UBC_NDV3) {
+ reg = AT_XDMAC_CNDC_NDVIEW_NDV3;
} else {
/*
* No need to write AT_XDMAC_CC reg, it will be done when the
@@ -465,6 +471,33 @@ static struct at_xdmac_desc *at_xdmac_get_desc(struct at_xdmac_chan *atchan)
return desc;
}
+static void at_xdmac_queue_desc(struct dma_chan *chan,
+ struct at_xdmac_desc *prev,
+ struct at_xdmac_desc *desc)
+{
+ if (!prev || !desc)
+ return;
+
+ prev->lld.mbr_nda = desc->tx_dma_desc.phys;
+ prev->lld.mbr_ubc |= AT_XDMAC_MBR_UBC_NDE;
+
+ dev_dbg(chan2dev(chan), "%s: chain lld: prev=0x%p, mbr_nda=%pad\n",
+ __func__, prev, &prev->lld.mbr_nda);
+}
+
+static inline void at_xdmac_increment_block_count(struct dma_chan *chan,
+ struct at_xdmac_desc *desc)
+{
+ if (!desc)
+ return;
+
+ desc->lld.mbr_bc++;
+
+ dev_dbg(chan2dev(chan),
+ "%s: incrementing the block count of the desc 0x%p\n",
+ __func__, desc);
+}
+
static struct dma_chan *at_xdmac_xlate(struct of_phandle_args *dma_spec,
struct of_dma *of_dma)
{
@@ -656,19 +689,14 @@ at_xdmac_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
desc->lld.mbr_ubc = AT_XDMAC_MBR_UBC_NDV2 /* next descriptor view */
| AT_XDMAC_MBR_UBC_NDEN /* next descriptor dst parameter update */
| AT_XDMAC_MBR_UBC_NSEN /* next descriptor src parameter update */
- | (i == sg_len - 1 ? 0 : AT_XDMAC_MBR_UBC_NDE) /* descriptor fetch */
| (len >> fixed_dwidth); /* microblock length */
dev_dbg(chan2dev(chan),
"%s: lld: mbr_sa=%pad, mbr_da=%pad, mbr_ubc=0x%08x\n",
__func__, &desc->lld.mbr_sa, &desc->lld.mbr_da, desc->lld.mbr_ubc);
/* Chain lld. */
- if (prev) {
- prev->lld.mbr_nda = desc->tx_dma_desc.phys;
- dev_dbg(chan2dev(chan),
- "%s: chain lld: prev=0x%p, mbr_nda=%pad\n",
- __func__, prev, &prev->lld.mbr_nda);
- }
+ if (prev)
+ at_xdmac_queue_desc(chan, prev, desc);
prev = desc;
if (!first)
@@ -748,7 +776,6 @@ at_xdmac_prep_dma_cyclic(struct dma_chan *chan, dma_addr_t buf_addr,
desc->lld.mbr_ubc = AT_XDMAC_MBR_UBC_NDV1
| AT_XDMAC_MBR_UBC_NDEN
| AT_XDMAC_MBR_UBC_NSEN
- | AT_XDMAC_MBR_UBC_NDE
| period_len >> at_xdmac_get_dwidth(desc->lld.mbr_cfg);
dev_dbg(chan2dev(chan),
@@ -756,12 +783,8 @@ at_xdmac_prep_dma_cyclic(struct dma_chan *chan, dma_addr_t buf_addr,
__func__, &desc->lld.mbr_sa, &desc->lld.mbr_da, desc->lld.mbr_ubc);
/* Chain lld. */
- if (prev) {
- prev->lld.mbr_nda = desc->tx_dma_desc.phys;
- dev_dbg(chan2dev(chan),
- "%s: chain lld: prev=0x%p, mbr_nda=%pad\n",
- __func__, prev, &prev->lld.mbr_nda);
- }
+ if (prev)
+ at_xdmac_queue_desc(chan, prev, desc);
prev = desc;
if (!first)
@@ -783,6 +806,215 @@ at_xdmac_prep_dma_cyclic(struct dma_chan *chan, dma_addr_t buf_addr,
return &first->tx_dma_desc;
}
+static inline u32 at_xdmac_align_width(struct dma_chan *chan, dma_addr_t addr)
+{
+ u32 width;
+
+ /*
+ * Check address alignment to select the greater data width we
+ * can use.
+ *
+ * Some XDMAC implementations don't provide dword transfers; in
+ * that case selecting dword has the same behavior as
+ * selecting word transfers.
+ */
+ if (!(addr & 7)) {
+ width = AT_XDMAC_CC_DWIDTH_DWORD;
+ dev_dbg(chan2dev(chan), "%s: dwidth: double word\n", __func__);
+ } else if (!(addr & 3)) {
+ width = AT_XDMAC_CC_DWIDTH_WORD;
+ dev_dbg(chan2dev(chan), "%s: dwidth: word\n", __func__);
+ } else if (!(addr & 1)) {
+ width = AT_XDMAC_CC_DWIDTH_HALFWORD;
+ dev_dbg(chan2dev(chan), "%s: dwidth: half word\n", __func__);
+ } else {
+ width = AT_XDMAC_CC_DWIDTH_BYTE;
+ dev_dbg(chan2dev(chan), "%s: dwidth: byte\n", __func__);
+ }
+
+ return width;
+}
+
+static struct at_xdmac_desc *
+at_xdmac_interleaved_queue_desc(struct dma_chan *chan,
+ struct at_xdmac_chan *atchan,
+ struct at_xdmac_desc *prev,
+ dma_addr_t src, dma_addr_t dst,
+ struct dma_interleaved_template *xt,
+ struct data_chunk *chunk)
+{
+ struct at_xdmac_desc *desc;
+ u32 dwidth;
+ unsigned long flags;
+ size_t ublen;
+ /*
+ * WARNING: The channel configuration is set here since there is no
+ * dmaengine_slave_config call in this case. Moreover, we don't know the
+ * direction, which means we can't dynamically set the source and dest
+ * interfaces, so we have to use the same one. Only interface 0 allows EBI
+ * access. Fortunately we can access DDR through both ports (at least on
+ * SAMA5D4x), so using the same interface for source and dest also works
+ * around the unknown direction.
+ */
+ u32 chan_cc = AT_XDMAC_CC_DIF(0)
+ | AT_XDMAC_CC_SIF(0)
+ | AT_XDMAC_CC_MBSIZE_SIXTEEN
+ | AT_XDMAC_CC_TYPE_MEM_TRAN;
+
+ dwidth = at_xdmac_align_width(chan, src | dst | chunk->size);
+ if (chunk->size >= (AT_XDMAC_MBR_UBC_UBLEN_MAX << dwidth)) {
+ dev_dbg(chan2dev(chan),
+ "%s: chunk too big (%d, max size %lu)...\n",
+ __func__, chunk->size,
+ AT_XDMAC_MBR_UBC_UBLEN_MAX << dwidth);
+ return NULL;
+ }
+
+ if (prev)
+ dev_dbg(chan2dev(chan),
+ "Adding items at the end of desc 0x%p\n", prev);
+
+ if (xt->src_inc) {
+ if (xt->src_sgl)
+ chan_cc |= AT_XDMAC_CC_SAM_UBS_DS_AM;
+ else
+ chan_cc |= AT_XDMAC_CC_SAM_INCREMENTED_AM;
+ }
+
+ if (xt->dst_inc) {
+ if (xt->dst_sgl)
+ chan_cc |= AT_XDMAC_CC_DAM_UBS_DS_AM;
+ else
+ chan_cc |= AT_XDMAC_CC_DAM_INCREMENTED_AM;
+ }
+
+ spin_lock_irqsave(&atchan->lock, flags);
+ desc = at_xdmac_get_desc(atchan);
+ spin_unlock_irqrestore(&atchan->lock, flags);
+ if (!desc) {
+ dev_err(chan2dev(chan), "can't get descriptor\n");
+ return NULL;
+ }
+
+ chan_cc |= AT_XDMAC_CC_DWIDTH(dwidth);
+
+ ublen = chunk->size >> dwidth;
+
+ desc->lld.mbr_sa = src;
+ desc->lld.mbr_da = dst;
+ desc->lld.mbr_sus = dmaengine_get_src_icg(xt, chunk);
+ desc->lld.mbr_dus = dmaengine_get_dst_icg(xt, chunk);
+
+ desc->lld.mbr_ubc = AT_XDMAC_MBR_UBC_NDV3
+ | AT_XDMAC_MBR_UBC_NDEN
+ | AT_XDMAC_MBR_UBC_NSEN
+ | ublen;
+ desc->lld.mbr_cfg = chan_cc;
+
+ dev_dbg(chan2dev(chan),
+ "%s: lld: mbr_sa=0x%08x, mbr_da=0x%08x, mbr_ubc=0x%08x, mbr_cfg=0x%08x\n",
+ __func__, desc->lld.mbr_sa, desc->lld.mbr_da,
+ desc->lld.mbr_ubc, desc->lld.mbr_cfg);
+
+ /* Chain lld. */
+ if (prev)
+ at_xdmac_queue_desc(chan, prev, desc);
+
+ return desc;
+}
+
+static struct dma_async_tx_descriptor *
+at_xdmac_prep_interleaved(struct dma_chan *chan,
+ struct dma_interleaved_template *xt,
+ unsigned long flags)
+{
+ struct at_xdmac_chan *atchan = to_at_xdmac_chan(chan);
+ struct at_xdmac_desc *prev = NULL, *first = NULL;
+ struct data_chunk *chunk, *prev_chunk = NULL;
+ dma_addr_t dst_addr, src_addr;
+ size_t dst_skip, src_skip, len = 0;
+ size_t prev_dst_icg = 0, prev_src_icg = 0;
+ int i;
+
+ if (!xt || (xt->numf != 1) || (xt->dir != DMA_MEM_TO_MEM))
+ return NULL;
+
+ dev_dbg(chan2dev(chan), "%s: src=0x%08x, dest=0x%08x, numf=%d, frame_size=%d, flags=0x%lx\n",
+ __func__, xt->src_start, xt->dst_start, xt->numf,
+ xt->frame_size, flags);
+
+ src_addr = xt->src_start;
+ dst_addr = xt->dst_start;
+
+ for (i = 0; i < xt->frame_size; i++) {
+ struct at_xdmac_desc *desc;
+ size_t src_icg, dst_icg;
+
+ chunk = xt->sgl + i;
+
+ dst_icg = dmaengine_get_dst_icg(xt, chunk);
+ src_icg = dmaengine_get_src_icg(xt, chunk);
+
+ src_skip = chunk->size + src_icg;
+ dst_skip = chunk->size + dst_icg;
+
+ dev_dbg(chan2dev(chan),
+ "%s: chunk size=%d, src icg=%d, dst icg=%d\n",
+ __func__, chunk->size, src_icg, dst_icg);
+
+ /*
+ * Handle the case where we just have the same
+ * transfer to set up: we can just increase the
+ * block number and reuse the same descriptor.
+ */
+ if (prev_chunk && prev &&
+ (prev_chunk->size == chunk->size) &&
+ (prev_src_icg == src_icg) &&
+ (prev_dst_icg == dst_icg)) {
+ dev_dbg(chan2dev(chan),
+ "%s: same configuration that the previous chunk, merging the descriptors...\n",
+ __func__);
+ at_xdmac_increment_block_count(chan, prev);
+ continue;
+ }
+
+ desc = at_xdmac_interleaved_queue_desc(chan, atchan,
+ prev,
+ src_addr, dst_addr,
+ xt, chunk);
+ if (!desc) {
+ list_splice_init(&first->descs_list,
+ &atchan->free_descs_list);
+ return NULL;
+ }
+
+ if (!first)
+ first = desc;
+
+ dev_dbg(chan2dev(chan), "%s: add desc 0x%p to descs_list 0x%p\n",
+ __func__, desc, first);
+ list_add_tail(&desc->desc_node, &first->descs_list);
+
+ if (xt->src_sgl)
+ src_addr += src_skip;
+
+ if (xt->dst_sgl)
+ dst_addr += dst_skip;
+
+ len += chunk->size;
+ prev_chunk = chunk;
+ prev_dst_icg = dst_icg;
+ prev_src_icg = src_icg;
+ prev = desc;
+ }
+
+ first->tx_dma_desc.cookie = -EBUSY;
+ first->tx_dma_desc.flags = flags;
+ first->xfer_size = len;
+
+ return &first->tx_dma_desc;
+}
+
static struct dma_async_tx_descriptor *
at_xdmac_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
size_t len, unsigned long flags)
@@ -814,24 +1046,7 @@ at_xdmac_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
if (unlikely(!len))
return NULL;
- /*
- * Check address alignment to select the greater data width we can use.
- * Some XDMAC implementations don't provide dword transfer, in this
- * case selecting dword has the same behavior as selecting word transfers.
- */
- if (!((src_addr | dst_addr) & 7)) {
- dwidth = AT_XDMAC_CC_DWIDTH_DWORD;
- dev_dbg(chan2dev(chan), "%s: dwidth: double word\n", __func__);
- } else if (!((src_addr | dst_addr) & 3)) {
- dwidth = AT_XDMAC_CC_DWIDTH_WORD;
- dev_dbg(chan2dev(chan), "%s: dwidth: word\n", __func__);
- } else if (!((src_addr | dst_addr) & 1)) {
- dwidth = AT_XDMAC_CC_DWIDTH_HALFWORD;
- dev_dbg(chan2dev(chan), "%s: dwidth: half word\n", __func__);
- } else {
- dwidth = AT_XDMAC_CC_DWIDTH_BYTE;
- dev_dbg(chan2dev(chan), "%s: dwidth: byte\n", __func__);
- }
+ dwidth = at_xdmac_align_width(chan, src_addr | dst_addr);
/* Prepare descriptors. */
while (remaining_size) {
@@ -861,19 +1076,8 @@ at_xdmac_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
dev_dbg(chan2dev(chan), "%s: xfer_size=%zu\n", __func__, xfer_size);
/* Check remaining length and change data width if needed. */
- if (!((src_addr | dst_addr | xfer_size) & 7)) {
- dwidth = AT_XDMAC_CC_DWIDTH_DWORD;
- dev_dbg(chan2dev(chan), "%s: dwidth: double word\n", __func__);
- } else if (!((src_addr | dst_addr | xfer_size) & 3)) {
- dwidth = AT_XDMAC_CC_DWIDTH_WORD;
- dev_dbg(chan2dev(chan), "%s: dwidth: word\n", __func__);
- } else if (!((src_addr | dst_addr | xfer_size) & 1)) {
- dwidth = AT_XDMAC_CC_DWIDTH_HALFWORD;
- dev_dbg(chan2dev(chan), "%s: dwidth: half word\n", __func__);
- } else if ((src_addr | dst_addr | xfer_size) & 1) {
- dwidth = AT_XDMAC_CC_DWIDTH_BYTE;
- dev_dbg(chan2dev(chan), "%s: dwidth: byte\n", __func__);
- }
+ dwidth = at_xdmac_align_width(chan,
+ src_addr | dst_addr | xfer_size);
chan_cc |= AT_XDMAC_CC_DWIDTH(dwidth);
ublen = xfer_size >> dwidth;
@@ -884,7 +1088,6 @@ at_xdmac_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
desc->lld.mbr_ubc = AT_XDMAC_MBR_UBC_NDV2
| AT_XDMAC_MBR_UBC_NDEN
| AT_XDMAC_MBR_UBC_NSEN
- | (remaining_size ? AT_XDMAC_MBR_UBC_NDE : 0)
| ublen;
desc->lld.mbr_cfg = chan_cc;
@@ -893,12 +1096,8 @@ at_xdmac_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
__func__, &desc->lld.mbr_sa, &desc->lld.mbr_da, desc->lld.mbr_ubc, desc->lld.mbr_cfg);
/* Chain lld. */
- if (prev) {
- prev->lld.mbr_nda = desc->tx_dma_desc.phys;
- dev_dbg(chan2dev(chan),
- "%s: chain lld: prev=0x%p, mbr_nda=0x%08x\n",
- __func__, prev, prev->lld.mbr_nda);
- }
+ if (prev)
+ at_xdmac_queue_desc(chan, prev, desc);
prev = desc;
if (!first)
@@ -915,6 +1114,93 @@ at_xdmac_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
return &first->tx_dma_desc;
}
+static struct at_xdmac_desc *at_xdmac_memset_create_desc(struct dma_chan *chan,
+ struct at_xdmac_chan *atchan,
+ dma_addr_t dst_addr,
+ size_t len,
+ int value)
+{
+ struct at_xdmac_desc *desc;
+ unsigned long flags;
+ size_t ublen;
+ u32 dwidth;
+ /*
+ * WARNING: The channel configuration is set here since there is no
+ * dmaengine_slave_config call in this case. Moreover, we don't know the
+ * direction, which means we can't dynamically set the source and dest
+ * interfaces, so we have to use the same one. Only interface 0 allows EBI
+ * access. Fortunately we can access DDR through both ports (at least on
+ * SAMA5D4x), so using the same interface for source and dest also works
+ * around the unknown direction.
+ */
+ u32 chan_cc = AT_XDMAC_CC_DAM_INCREMENTED_AM
+ | AT_XDMAC_CC_SAM_INCREMENTED_AM
+ | AT_XDMAC_CC_DIF(0)
+ | AT_XDMAC_CC_SIF(0)
+ | AT_XDMAC_CC_MBSIZE_SIXTEEN
+ | AT_XDMAC_CC_MEMSET_HW_MODE
+ | AT_XDMAC_CC_TYPE_MEM_TRAN;
+
+ dwidth = at_xdmac_align_width(chan, dst_addr);
+
+ if (len >= (AT_XDMAC_MBR_UBC_UBLEN_MAX << dwidth)) {
+ dev_err(chan2dev(chan),
+ "%s: Transfer too large, aborting...\n",
+ __func__);
+ return NULL;
+ }
+
+ spin_lock_irqsave(&atchan->lock, flags);
+ desc = at_xdmac_get_desc(atchan);
+ spin_unlock_irqrestore(&atchan->lock, flags);
+ if (!desc) {
+ dev_err(chan2dev(chan), "can't get descriptor\n");
+ return NULL;
+ }
+
+ chan_cc |= AT_XDMAC_CC_DWIDTH(dwidth);
+
+ ublen = len >> dwidth;
+
+ desc->lld.mbr_da = dst_addr;
+ desc->lld.mbr_ds = value;
+ desc->lld.mbr_ubc = AT_XDMAC_MBR_UBC_NDV3
+ | AT_XDMAC_MBR_UBC_NDEN
+ | AT_XDMAC_MBR_UBC_NSEN
+ | ublen;
+ desc->lld.mbr_cfg = chan_cc;
+
+ dev_dbg(chan2dev(chan),
+ "%s: lld: mbr_da=0x%08x, mbr_ds=0x%08x, mbr_ubc=0x%08x, mbr_cfg=0x%08x\n",
+ __func__, desc->lld.mbr_da, desc->lld.mbr_ds, desc->lld.mbr_ubc,
+ desc->lld.mbr_cfg);
+
+ return desc;
+}
+
+struct dma_async_tx_descriptor *
+at_xdmac_prep_dma_memset(struct dma_chan *chan, dma_addr_t dest, int value,
+ size_t len, unsigned long flags)
+{
+ struct at_xdmac_chan *atchan = to_at_xdmac_chan(chan);
+ struct at_xdmac_desc *desc;
+
+ dev_dbg(chan2dev(chan), "%s: dest=0x%08x, len=%d, pattern=0x%x, flags=0x%lx\n",
+ __func__, dest, len, value, flags);
+
+ if (unlikely(!len))
+ return NULL;
+
+ desc = at_xdmac_memset_create_desc(chan, atchan, dest, len, value);
+ list_add_tail(&desc->desc_node, &desc->descs_list);
+
+ desc->tx_dma_desc.cookie = -EBUSY;
+ desc->tx_dma_desc.flags = flags;
+ desc->xfer_size = len;
+
+ return &desc->tx_dma_desc;
+}
+
static enum dma_status
at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
struct dma_tx_state *txstate)
@@ -1445,7 +1731,9 @@ static int at_xdmac_probe(struct platform_device *pdev)
}
dma_cap_set(DMA_CYCLIC, atxdmac->dma.cap_mask);
+ dma_cap_set(DMA_INTERLEAVE, atxdmac->dma.cap_mask);
dma_cap_set(DMA_MEMCPY, atxdmac->dma.cap_mask);
+ dma_cap_set(DMA_MEMSET, atxdmac->dma.cap_mask);
dma_cap_set(DMA_SLAVE, atxdmac->dma.cap_mask);
/*
* Without DMA_PRIVATE the driver is not able to allocate more than
@@ -1458,7 +1746,9 @@ static int at_xdmac_probe(struct platform_device *pdev)
atxdmac->dma.device_tx_status = at_xdmac_tx_status;
atxdmac->dma.device_issue_pending = at_xdmac_issue_pending;
atxdmac->dma.device_prep_dma_cyclic = at_xdmac_prep_dma_cyclic;
+ atxdmac->dma.device_prep_interleaved_dma = at_xdmac_prep_interleaved;
atxdmac->dma.device_prep_dma_memcpy = at_xdmac_prep_dma_memcpy;
+ atxdmac->dma.device_prep_dma_memset = at_xdmac_prep_dma_memset;
atxdmac->dma.device_prep_slave_sg = at_xdmac_prep_slave_sg;
atxdmac->dma.device_config = at_xdmac_device_config;
atxdmac->dma.device_pause = at_xdmac_device_pause;
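
Since the probe hunk above now advertises DMA_MEMSET next to the interleaved support, a client could exercise the new callback roughly as in this sketch (not part of the patch; 'chan' and 'buf_phys' are assumptions and error handling is trimmed):

    struct dma_async_tx_descriptor *tx;
    dma_cookie_t cookie;

    /* Ask the engine to fill PAGE_SIZE bytes at buf_phys with zeroes. */
    tx = chan->device->device_prep_dma_memset(chan, buf_phys, 0, PAGE_SIZE,
                                              DMA_PREP_INTERRUPT);
    if (!tx)
            return -ENOMEM;

    cookie = dmaengine_submit(tx);
    dma_async_issue_pending(chan);
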
diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
index 3ddfd1f6c23c..4a4cce15f25d 100644
--- a/drivers/dma/dmaengine.c
+++ b/drivers/dma/dmaengine.c
@@ -267,6 +267,13 @@ static void dma_chan_put(struct dma_chan *chan)
/* This channel is not in use anymore, free it */
if (!chan->client_count && chan->device->device_free_chan_resources)
chan->device->device_free_chan_resources(chan);
+
+ /* If the channel is used via a DMA request router, free the mapping */
+ if (chan->router && chan->router->route_free) {
+ chan->router->route_free(chan->router->dev, chan->route_data);
+ chan->router = NULL;
+ chan->route_data = NULL;
+ }
}
enum dma_status dma_sync_wait(struct dma_chan *chan, dma_cookie_t cookie)
@@ -536,7 +543,7 @@ static struct dma_chan *private_candidate(const dma_cap_mask_t *mask,
}
/**
- * dma_request_slave_channel - try to get specific channel exclusively
+ * dma_get_slave_channel - try to get specific channel exclusively
* @chan: target channel
*/
struct dma_chan *dma_get_slave_channel(struct dma_chan *chan)
@@ -648,7 +655,7 @@ struct dma_chan *__dma_request_channel(const dma_cap_mask_t *mask,
EXPORT_SYMBOL_GPL(__dma_request_channel);
/**
- * dma_request_slave_channel - try to allocate an exclusive slave channel
+ * dma_request_slave_channel_reason - try to allocate an exclusive slave channel
* @dev: pointer to client device structure
* @name: slave channel name
*
@@ -836,6 +843,8 @@ int dma_async_device_register(struct dma_device *device)
!device->device_prep_dma_pq);
BUG_ON(dma_has_cap(DMA_PQ_VAL, device->cap_mask) &&
!device->device_prep_dma_pq_val);
+ BUG_ON(dma_has_cap(DMA_MEMSET, device->cap_mask) &&
+ !device->device_prep_dma_memset);
BUG_ON(dma_has_cap(DMA_INTERRUPT, device->cap_mask) &&
!device->device_prep_dma_interrupt);
BUG_ON(dma_has_cap(DMA_SG, device->cap_mask) &&
diff --git a/drivers/dma/ep93xx_dma.c b/drivers/dma/ep93xx_dma.c
index 24e5290faa32..57ff46284f15 100644
--- a/drivers/dma/ep93xx_dma.c
+++ b/drivers/dma/ep93xx_dma.c
@@ -1364,7 +1364,7 @@ static int __init ep93xx_dma_probe(struct platform_device *pdev)
return ret;
}
-static struct platform_device_id ep93xx_dma_driver_ids[] = {
+static const struct platform_device_id ep93xx_dma_driver_ids[] = {
{ "ep93xx-dma-m2p", 0 },
{ "ep93xx-dma-m2m", 1 },
{ },
diff --git a/drivers/dma/fsl-edma.c b/drivers/dma/fsl-edma.c
index 09e2842d15ec..915eec3cc279 100644
--- a/drivers/dma/fsl-edma.c
+++ b/drivers/dma/fsl-edma.c
@@ -881,10 +881,6 @@ static int fsl_edma_probe(struct platform_device *pdev)
}
- ret = fsl_edma_irq_init(pdev, fsl_edma);
- if (ret)
- return ret;
-
fsl_edma->big_endian = of_property_read_bool(np, "big-endian");
INIT_LIST_HEAD(&fsl_edma->dma_dev.channels);
@@ -900,6 +896,11 @@ static int fsl_edma_probe(struct platform_device *pdev)
fsl_edma_chan_mux(fsl_chan, 0, false);
}
+ edma_writel(fsl_edma, ~0, fsl_edma->membase + EDMA_INTR);
+ ret = fsl_edma_irq_init(pdev, fsl_edma);
+ if (ret)
+ return ret;
+
dma_cap_set(DMA_PRIVATE, fsl_edma->dma_dev.cap_mask);
dma_cap_set(DMA_SLAVE, fsl_edma->dma_dev.cap_mask);
dma_cap_set(DMA_CYCLIC, fsl_edma->dma_dev.cap_mask);
diff --git a/drivers/dma/imx-dma.c b/drivers/dma/imx-dma.c
index eed405976ea9..865501fcc67d 100644
--- a/drivers/dma/imx-dma.c
+++ b/drivers/dma/imx-dma.c
@@ -193,7 +193,7 @@ struct imxdma_filter_data {
int request;
};
-static struct platform_device_id imx_dma_devtype[] = {
+static const struct platform_device_id imx_dma_devtype[] = {
{
.name = "imx1-dma",
.driver_data = IMX1_DMA,
diff --git a/drivers/dma/imx-sdma.c b/drivers/dma/imx-sdma.c
index 62bbd79338e0..77b6aab04f47 100644
--- a/drivers/dma/imx-sdma.c
+++ b/drivers/dma/imx-sdma.c
@@ -420,7 +420,7 @@ static struct sdma_driver_data sdma_imx6q = {
.script_addrs = &sdma_script_imx6q,
};
-static struct platform_device_id sdma_devtypes[] = {
+static const struct platform_device_id sdma_devtypes[] = {
{
.name = "imx25-sdma",
.driver_data = (unsigned long)&sdma_imx25,
diff --git a/drivers/dma/mv_xor.c b/drivers/dma/mv_xor.c
index 1c56001df676..fbaf1ead2597 100644
--- a/drivers/dma/mv_xor.c
+++ b/drivers/dma/mv_xor.c
@@ -19,6 +19,7 @@
#include <linux/dma-mapping.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>
+#include <linux/of_device.h>
#include <linux/platform_device.h>
#include <linux/memory.h>
#include <linux/clk.h>
@@ -30,6 +31,11 @@
#include "dmaengine.h"
#include "mv_xor.h"
+enum mv_xor_mode {
+ XOR_MODE_IN_REG,
+ XOR_MODE_IN_DESC,
+};
+
static void mv_xor_issue_pending(struct dma_chan *chan);
#define to_mv_xor_chan(chan) \
@@ -56,18 +62,30 @@ static void mv_desc_init(struct mv_xor_desc_slot *desc,
hw_desc->byte_count = byte_count;
}
-static void mv_desc_set_next_desc(struct mv_xor_desc_slot *desc,
- u32 next_desc_addr)
+static void mv_desc_set_mode(struct mv_xor_desc_slot *desc)
{
struct mv_xor_desc *hw_desc = desc->hw_desc;
- BUG_ON(hw_desc->phy_next_desc);
- hw_desc->phy_next_desc = next_desc_addr;
+
+ switch (desc->type) {
+ case DMA_XOR:
+ case DMA_INTERRUPT:
+ hw_desc->desc_command |= XOR_DESC_OPERATION_XOR;
+ break;
+ case DMA_MEMCPY:
+ hw_desc->desc_command |= XOR_DESC_OPERATION_MEMCPY;
+ break;
+ default:
+ BUG();
+ return;
+ }
}
-static void mv_desc_clear_next_desc(struct mv_xor_desc_slot *desc)
+static void mv_desc_set_next_desc(struct mv_xor_desc_slot *desc,
+ u32 next_desc_addr)
{
struct mv_xor_desc *hw_desc = desc->hw_desc;
- hw_desc->phy_next_desc = 0;
+ BUG_ON(hw_desc->phy_next_desc);
+ hw_desc->phy_next_desc = next_desc_addr;
}
static void mv_desc_set_src_addr(struct mv_xor_desc_slot *desc,
@@ -104,7 +122,7 @@ static u32 mv_chan_get_intr_cause(struct mv_xor_chan *chan)
return intr_cause;
}
-static void mv_xor_device_clear_eoc_cause(struct mv_xor_chan *chan)
+static void mv_chan_clear_eoc_cause(struct mv_xor_chan *chan)
{
u32 val;
@@ -114,14 +132,14 @@ static void mv_xor_device_clear_eoc_cause(struct mv_xor_chan *chan)
writel_relaxed(val, XOR_INTR_CAUSE(chan));
}
-static void mv_xor_device_clear_err_status(struct mv_xor_chan *chan)
+static void mv_chan_clear_err_status(struct mv_xor_chan *chan)
{
u32 val = 0xFFFF0000 >> (chan->idx * 16);
writel_relaxed(val, XOR_INTR_CAUSE(chan));
}
-static void mv_set_mode(struct mv_xor_chan *chan,
- enum dma_transaction_type type)
+static void mv_chan_set_mode(struct mv_xor_chan *chan,
+ enum dma_transaction_type type)
{
u32 op_mode;
u32 config = readl_relaxed(XOR_CONFIG(chan));
@@ -144,6 +162,25 @@ static void mv_set_mode(struct mv_xor_chan *chan,
config &= ~0x7;
config |= op_mode;
+ if (IS_ENABLED(__BIG_ENDIAN))
+ config |= XOR_DESCRIPTOR_SWAP;
+ else
+ config &= ~XOR_DESCRIPTOR_SWAP;
+
+ writel_relaxed(config, XOR_CONFIG(chan));
+ chan->current_type = type;
+}
+
+static void mv_chan_set_mode_to_desc(struct mv_xor_chan *chan)
+{
+ u32 op_mode;
+ u32 config = readl_relaxed(XOR_CONFIG(chan));
+
+ op_mode = XOR_OPERATION_MODE_IN_DESC;
+
+ config &= ~0x7;
+ config |= op_mode;
+
#if defined(__BIG_ENDIAN)
config |= XOR_DESCRIPTOR_SWAP;
#else
@@ -151,7 +188,6 @@ static void mv_set_mode(struct mv_xor_chan *chan,
#endif
writel_relaxed(config, XOR_CONFIG(chan));
- chan->current_type = type;
}
static void mv_chan_activate(struct mv_xor_chan *chan)
@@ -171,28 +207,13 @@ static char mv_chan_is_busy(struct mv_xor_chan *chan)
return (state == 1) ? 1 : 0;
}
-/**
- * mv_xor_free_slots - flags descriptor slots for reuse
- * @slot: Slot to free
- * Caller must hold &mv_chan->lock while calling this function
- */
-static void mv_xor_free_slots(struct mv_xor_chan *mv_chan,
- struct mv_xor_desc_slot *slot)
-{
- dev_dbg(mv_chan_to_devp(mv_chan), "%s %d slot %p\n",
- __func__, __LINE__, slot);
-
- slot->slot_used = 0;
-
-}
-
/*
- * mv_xor_start_new_chain - program the engine to operate on new chain headed by
- * sw_desc
+ * mv_chan_start_new_chain - program the engine to operate on new
+ * chain headed by sw_desc
* Caller must hold &mv_chan->lock while calling this function
*/
-static void mv_xor_start_new_chain(struct mv_xor_chan *mv_chan,
- struct mv_xor_desc_slot *sw_desc)
+static void mv_chan_start_new_chain(struct mv_xor_chan *mv_chan,
+ struct mv_xor_desc_slot *sw_desc)
{
dev_dbg(mv_chan_to_devp(mv_chan), "%s %d: sw_desc %p\n",
__func__, __LINE__, sw_desc);
@@ -205,8 +226,9 @@ static void mv_xor_start_new_chain(struct mv_xor_chan *mv_chan,
}
static dma_cookie_t
-mv_xor_run_tx_complete_actions(struct mv_xor_desc_slot *desc,
- struct mv_xor_chan *mv_chan, dma_cookie_t cookie)
+mv_desc_run_tx_complete_actions(struct mv_xor_desc_slot *desc,
+ struct mv_xor_chan *mv_chan,
+ dma_cookie_t cookie)
{
BUG_ON(desc->async_tx.cookie < 0);
@@ -230,93 +252,110 @@ mv_xor_run_tx_complete_actions(struct mv_xor_desc_slot *desc,
}
static int
-mv_xor_clean_completed_slots(struct mv_xor_chan *mv_chan)
+mv_chan_clean_completed_slots(struct mv_xor_chan *mv_chan)
{
struct mv_xor_desc_slot *iter, *_iter;
dev_dbg(mv_chan_to_devp(mv_chan), "%s %d\n", __func__, __LINE__);
list_for_each_entry_safe(iter, _iter, &mv_chan->completed_slots,
- completed_node) {
+ node) {
- if (async_tx_test_ack(&iter->async_tx)) {
- list_del(&iter->completed_node);
- mv_xor_free_slots(mv_chan, iter);
- }
+ if (async_tx_test_ack(&iter->async_tx))
+ list_move_tail(&iter->node, &mv_chan->free_slots);
}
return 0;
}
static int
-mv_xor_clean_slot(struct mv_xor_desc_slot *desc,
- struct mv_xor_chan *mv_chan)
+mv_desc_clean_slot(struct mv_xor_desc_slot *desc,
+ struct mv_xor_chan *mv_chan)
{
dev_dbg(mv_chan_to_devp(mv_chan), "%s %d: desc %p flags %d\n",
__func__, __LINE__, desc, desc->async_tx.flags);
- list_del(&desc->chain_node);
+
/* the client is allowed to attach dependent operations
* until 'ack' is set
*/
- if (!async_tx_test_ack(&desc->async_tx)) {
+ if (!async_tx_test_ack(&desc->async_tx))
/* move this slot to the completed_slots */
- list_add_tail(&desc->completed_node, &mv_chan->completed_slots);
- return 0;
- }
+ list_move_tail(&desc->node, &mv_chan->completed_slots);
+ else
+ list_move_tail(&desc->node, &mv_chan->free_slots);
- mv_xor_free_slots(mv_chan, desc);
return 0;
}
/* This function must be called with the mv_xor_chan spinlock held */
-static void mv_xor_slot_cleanup(struct mv_xor_chan *mv_chan)
+static void mv_chan_slot_cleanup(struct mv_xor_chan *mv_chan)
{
struct mv_xor_desc_slot *iter, *_iter;
dma_cookie_t cookie = 0;
int busy = mv_chan_is_busy(mv_chan);
u32 current_desc = mv_chan_get_current_desc(mv_chan);
- int seen_current = 0;
+ int current_cleaned = 0;
+ struct mv_xor_desc *hw_desc;
dev_dbg(mv_chan_to_devp(mv_chan), "%s %d\n", __func__, __LINE__);
dev_dbg(mv_chan_to_devp(mv_chan), "current_desc %x\n", current_desc);
- mv_xor_clean_completed_slots(mv_chan);
+ mv_chan_clean_completed_slots(mv_chan);
/* free completed slots from the chain starting with
* the oldest descriptor
*/
list_for_each_entry_safe(iter, _iter, &mv_chan->chain,
- chain_node) {
- prefetch(_iter);
- prefetch(&_iter->async_tx);
+ node) {
- /* do not advance past the current descriptor loaded into the
- * hardware channel, subsequent descriptors are either in
- * process or have not been submitted
- */
- if (seen_current)
- break;
+ /* clean finished descriptors */
+ hw_desc = iter->hw_desc;
+ if (hw_desc->status & XOR_DESC_SUCCESS) {
+ cookie = mv_desc_run_tx_complete_actions(iter, mv_chan,
+ cookie);
- /* stop the search if we reach the current descriptor and the
- * channel is busy
- */
- if (iter->async_tx.phys == current_desc) {
- seen_current = 1;
- if (busy)
+ /* done processing desc, clean slot */
+ mv_desc_clean_slot(iter, mv_chan);
+
+ /* break if we cleaned the current descriptor */
+ if (iter->async_tx.phys == current_desc) {
+ current_cleaned = 1;
break;
+ }
+ } else {
+ if (iter->async_tx.phys == current_desc) {
+ current_cleaned = 0;
+ break;
+ }
}
-
- cookie = mv_xor_run_tx_complete_actions(iter, mv_chan, cookie);
-
- if (mv_xor_clean_slot(iter, mv_chan))
- break;
}
if ((busy == 0) && !list_empty(&mv_chan->chain)) {
- struct mv_xor_desc_slot *chain_head;
- chain_head = list_entry(mv_chan->chain.next,
- struct mv_xor_desc_slot,
- chain_node);
-
- mv_xor_start_new_chain(mv_chan, chain_head);
+ if (current_cleaned) {
+ /*
+ * current descriptor cleaned and removed, run
+ * from list head
+ */
+ iter = list_entry(mv_chan->chain.next,
+ struct mv_xor_desc_slot,
+ node);
+ mv_chan_start_new_chain(mv_chan, iter);
+ } else {
+ if (!list_is_last(&iter->node, &mv_chan->chain)) {
+ /*
+ * descriptors are still waiting after
+ * current, trigger them
+ */
+ iter = list_entry(iter->node.next,
+ struct mv_xor_desc_slot,
+ node);
+ mv_chan_start_new_chain(mv_chan, iter);
+ } else {
+ /*
+ * some descriptors are still waiting
+ * to be cleaned
+ */
+ tasklet_schedule(&mv_chan->irq_tasklet);
+ }
+ }
}
if (cookie > 0)
@@ -328,56 +367,35 @@ static void mv_xor_tasklet(unsigned long data)
struct mv_xor_chan *chan = (struct mv_xor_chan *) data;
spin_lock_bh(&chan->lock);
- mv_xor_slot_cleanup(chan);
+ mv_chan_slot_cleanup(chan);
spin_unlock_bh(&chan->lock);
}
static struct mv_xor_desc_slot *
-mv_xor_alloc_slot(struct mv_xor_chan *mv_chan)
+mv_chan_alloc_slot(struct mv_xor_chan *mv_chan)
{
- struct mv_xor_desc_slot *iter, *_iter;
- int retry = 0;
+ struct mv_xor_desc_slot *iter;
- /* start search from the last allocated descrtiptor
- * if a contiguous allocation can not be found start searching
- * from the beginning of the list
- */
-retry:
- if (retry == 0)
- iter = mv_chan->last_used;
- else
- iter = list_entry(&mv_chan->all_slots,
- struct mv_xor_desc_slot,
- slot_node);
-
- list_for_each_entry_safe_continue(
- iter, _iter, &mv_chan->all_slots, slot_node) {
-
- prefetch(_iter);
- prefetch(&_iter->async_tx);
- if (iter->slot_used) {
- /* give up after finding the first busy slot
- * on the second pass through the list
- */
- if (retry)
- break;
- continue;
- }
+ spin_lock_bh(&mv_chan->lock);
+
+ if (!list_empty(&mv_chan->free_slots)) {
+ iter = list_first_entry(&mv_chan->free_slots,
+ struct mv_xor_desc_slot,
+ node);
+
+ list_move_tail(&iter->node, &mv_chan->allocated_slots);
+
+ spin_unlock_bh(&mv_chan->lock);
/* pre-ack descriptor */
async_tx_ack(&iter->async_tx);
-
- iter->slot_used = 1;
- INIT_LIST_HEAD(&iter->chain_node);
iter->async_tx.cookie = -EBUSY;
- mv_chan->last_used = iter;
- mv_desc_clear_next_desc(iter);
return iter;
}
- if (!retry++)
- goto retry;
+
+ spin_unlock_bh(&mv_chan->lock);
/* try to free some slots if the allocation fails */
tasklet_schedule(&mv_chan->irq_tasklet);
@@ -403,14 +421,14 @@ mv_xor_tx_submit(struct dma_async_tx_descriptor *tx)
cookie = dma_cookie_assign(tx);
if (list_empty(&mv_chan->chain))
- list_add_tail(&sw_desc->chain_node, &mv_chan->chain);
+ list_move_tail(&sw_desc->node, &mv_chan->chain);
else {
new_hw_chain = 0;
old_chain_tail = list_entry(mv_chan->chain.prev,
struct mv_xor_desc_slot,
- chain_node);
- list_add_tail(&sw_desc->chain_node, &mv_chan->chain);
+ node);
+ list_move_tail(&sw_desc->node, &mv_chan->chain);
dev_dbg(mv_chan_to_devp(mv_chan), "Append to last desc %pa\n",
&old_chain_tail->async_tx.phys);
@@ -431,7 +449,7 @@ mv_xor_tx_submit(struct dma_async_tx_descriptor *tx)
}
if (new_hw_chain)
- mv_xor_start_new_chain(mv_chan, sw_desc);
+ mv_chan_start_new_chain(mv_chan, sw_desc);
spin_unlock_bh(&mv_chan->lock);
@@ -463,26 +481,20 @@ static int mv_xor_alloc_chan_resources(struct dma_chan *chan)
dma_async_tx_descriptor_init(&slot->async_tx, chan);
slot->async_tx.tx_submit = mv_xor_tx_submit;
- INIT_LIST_HEAD(&slot->chain_node);
- INIT_LIST_HEAD(&slot->slot_node);
+ INIT_LIST_HEAD(&slot->node);
dma_desc = mv_chan->dma_desc_pool;
slot->async_tx.phys = dma_desc + idx * MV_XOR_SLOT_SIZE;
slot->idx = idx++;
spin_lock_bh(&mv_chan->lock);
mv_chan->slots_allocated = idx;
- list_add_tail(&slot->slot_node, &mv_chan->all_slots);
+ list_add_tail(&slot->node, &mv_chan->free_slots);
spin_unlock_bh(&mv_chan->lock);
}
- if (mv_chan->slots_allocated && !mv_chan->last_used)
- mv_chan->last_used = list_entry(mv_chan->all_slots.next,
- struct mv_xor_desc_slot,
- slot_node);
-
dev_dbg(mv_chan_to_devp(mv_chan),
- "allocated %d descriptor slots last_used: %p\n",
- mv_chan->slots_allocated, mv_chan->last_used);
+ "allocated %d descriptor slots\n",
+ mv_chan->slots_allocated);
return mv_chan->slots_allocated ? : -ENOMEM;
}
@@ -503,16 +515,17 @@ mv_xor_prep_dma_xor(struct dma_chan *chan, dma_addr_t dest, dma_addr_t *src,
"%s src_cnt: %d len: %u dest %pad flags: %ld\n",
__func__, src_cnt, len, &dest, flags);
- spin_lock_bh(&mv_chan->lock);
- sw_desc = mv_xor_alloc_slot(mv_chan);
+ sw_desc = mv_chan_alloc_slot(mv_chan);
if (sw_desc) {
sw_desc->type = DMA_XOR;
sw_desc->async_tx.flags = flags;
mv_desc_init(sw_desc, dest, len, flags);
+ if (mv_chan->op_in_desc == XOR_MODE_IN_DESC)
+ mv_desc_set_mode(sw_desc);
while (src_cnt--)
mv_desc_set_src_addr(sw_desc, src_cnt, src[src_cnt]);
}
- spin_unlock_bh(&mv_chan->lock);
+
dev_dbg(mv_chan_to_devp(mv_chan),
"%s sw_desc %p async_tx %p \n",
__func__, sw_desc, &sw_desc->async_tx);
@@ -556,25 +569,29 @@ static void mv_xor_free_chan_resources(struct dma_chan *chan)
spin_lock_bh(&mv_chan->lock);
- mv_xor_slot_cleanup(mv_chan);
+ mv_chan_slot_cleanup(mv_chan);
list_for_each_entry_safe(iter, _iter, &mv_chan->chain,
- chain_node) {
+ node) {
in_use_descs++;
- list_del(&iter->chain_node);
+ list_move_tail(&iter->node, &mv_chan->free_slots);
}
list_for_each_entry_safe(iter, _iter, &mv_chan->completed_slots,
- completed_node) {
+ node) {
in_use_descs++;
- list_del(&iter->completed_node);
+ list_move_tail(&iter->node, &mv_chan->free_slots);
+ }
+ list_for_each_entry_safe(iter, _iter, &mv_chan->allocated_slots,
+ node) {
+ in_use_descs++;
+ list_move_tail(&iter->node, &mv_chan->free_slots);
}
list_for_each_entry_safe_reverse(
- iter, _iter, &mv_chan->all_slots, slot_node) {
- list_del(&iter->slot_node);
+ iter, _iter, &mv_chan->free_slots, node) {
+ list_del(&iter->node);
kfree(iter);
mv_chan->slots_allocated--;
}
- mv_chan->last_used = NULL;
dev_dbg(mv_chan_to_devp(mv_chan), "%s slots_allocated %d\n",
__func__, mv_chan->slots_allocated);
@@ -603,13 +620,13 @@ static enum dma_status mv_xor_status(struct dma_chan *chan,
return ret;
spin_lock_bh(&mv_chan->lock);
- mv_xor_slot_cleanup(mv_chan);
+ mv_chan_slot_cleanup(mv_chan);
spin_unlock_bh(&mv_chan->lock);
return dma_cookie_status(chan, cookie, txstate);
}
-static void mv_dump_xor_regs(struct mv_xor_chan *chan)
+static void mv_chan_dump_regs(struct mv_xor_chan *chan)
{
u32 val;
@@ -632,8 +649,8 @@ static void mv_dump_xor_regs(struct mv_xor_chan *chan)
dev_err(mv_chan_to_devp(chan), "error addr 0x%08x\n", val);
}
-static void mv_xor_err_interrupt_handler(struct mv_xor_chan *chan,
- u32 intr_cause)
+static void mv_chan_err_interrupt_handler(struct mv_xor_chan *chan,
+ u32 intr_cause)
{
if (intr_cause & XOR_INT_ERR_DECODE) {
dev_dbg(mv_chan_to_devp(chan), "ignoring address decode error\n");
@@ -643,7 +660,7 @@ static void mv_xor_err_interrupt_handler(struct mv_xor_chan *chan,
dev_err(mv_chan_to_devp(chan), "error on chan %d. intr cause 0x%08x\n",
chan->idx, intr_cause);
- mv_dump_xor_regs(chan);
+ mv_chan_dump_regs(chan);
WARN_ON(1);
}
@@ -655,11 +672,11 @@ static irqreturn_t mv_xor_interrupt_handler(int irq, void *data)
dev_dbg(mv_chan_to_devp(chan), "intr cause %x\n", intr_cause);
if (intr_cause & XOR_INTR_ERRORS)
- mv_xor_err_interrupt_handler(chan, intr_cause);
+ mv_chan_err_interrupt_handler(chan, intr_cause);
tasklet_schedule(&chan->irq_tasklet);
- mv_xor_device_clear_eoc_cause(chan);
+ mv_chan_clear_eoc_cause(chan);
return IRQ_HANDLED;
}
@@ -678,7 +695,7 @@ static void mv_xor_issue_pending(struct dma_chan *chan)
* Perform a transaction to verify the HW works.
*/
-static int mv_xor_memcpy_self_test(struct mv_xor_chan *mv_chan)
+static int mv_chan_memcpy_self_test(struct mv_xor_chan *mv_chan)
{
int i, ret;
void *src, *dest;
@@ -787,7 +804,7 @@ out:
#define MV_XOR_NUM_SRC_TEST 4 /* must be <= 15 */
static int
-mv_xor_xor_self_test(struct mv_xor_chan *mv_chan)
+mv_chan_xor_self_test(struct mv_xor_chan *mv_chan)
{
int i, src_idx, ret;
struct page *dest;
@@ -951,7 +968,7 @@ static int mv_xor_channel_remove(struct mv_xor_chan *mv_chan)
static struct mv_xor_chan *
mv_xor_channel_add(struct mv_xor_device *xordev,
struct platform_device *pdev,
- int idx, dma_cap_mask_t cap_mask, int irq)
+ int idx, dma_cap_mask_t cap_mask, int irq, int op_in_desc)
{
int ret = 0;
struct mv_xor_chan *mv_chan;
@@ -963,6 +980,7 @@ mv_xor_channel_add(struct mv_xor_device *xordev,
mv_chan->idx = idx;
mv_chan->irq = irq;
+ mv_chan->op_in_desc = op_in_desc;
dma_dev = &mv_chan->dmadev;
@@ -1014,7 +1032,7 @@ mv_xor_channel_add(struct mv_xor_device *xordev,
mv_chan);
/* clear errors before enabling interrupts */
- mv_xor_device_clear_err_status(mv_chan);
+ mv_chan_clear_err_status(mv_chan);
ret = request_irq(mv_chan->irq, mv_xor_interrupt_handler,
0, dev_name(&pdev->dev), mv_chan);
@@ -1023,32 +1041,37 @@ mv_xor_channel_add(struct mv_xor_device *xordev,
mv_chan_unmask_interrupts(mv_chan);
- mv_set_mode(mv_chan, DMA_XOR);
+ if (mv_chan->op_in_desc == XOR_MODE_IN_DESC)
+ mv_chan_set_mode_to_desc(mv_chan);
+ else
+ mv_chan_set_mode(mv_chan, DMA_XOR);
spin_lock_init(&mv_chan->lock);
INIT_LIST_HEAD(&mv_chan->chain);
INIT_LIST_HEAD(&mv_chan->completed_slots);
- INIT_LIST_HEAD(&mv_chan->all_slots);
+ INIT_LIST_HEAD(&mv_chan->free_slots);
+ INIT_LIST_HEAD(&mv_chan->allocated_slots);
mv_chan->dmachan.device = dma_dev;
dma_cookie_init(&mv_chan->dmachan);
list_add_tail(&mv_chan->dmachan.device_node, &dma_dev->channels);
if (dma_has_cap(DMA_MEMCPY, dma_dev->cap_mask)) {
- ret = mv_xor_memcpy_self_test(mv_chan);
+ ret = mv_chan_memcpy_self_test(mv_chan);
dev_dbg(&pdev->dev, "memcpy self test returned %d\n", ret);
if (ret)
goto err_free_irq;
}
if (dma_has_cap(DMA_XOR, dma_dev->cap_mask)) {
- ret = mv_xor_xor_self_test(mv_chan);
+ ret = mv_chan_xor_self_test(mv_chan);
dev_dbg(&pdev->dev, "xor self test returned %d\n", ret);
if (ret)
goto err_free_irq;
}
- dev_info(&pdev->dev, "Marvell XOR: ( %s%s%s)\n",
+ dev_info(&pdev->dev, "Marvell XOR (%s): ( %s%s%s)\n",
+ mv_chan->op_in_desc ? "Descriptor Mode" : "Registers Mode",
dma_has_cap(DMA_XOR, dma_dev->cap_mask) ? "xor " : "",
dma_has_cap(DMA_MEMCPY, dma_dev->cap_mask) ? "cpy " : "",
dma_has_cap(DMA_INTERRUPT, dma_dev->cap_mask) ? "intr " : "");
@@ -1097,6 +1120,13 @@ mv_xor_conf_mbus_windows(struct mv_xor_device *xordev,
writel(0, base + WINDOW_OVERRIDE_CTRL(1));
}
+static const struct of_device_id mv_xor_dt_ids[] = {
+ { .compatible = "marvell,orion-xor", .data = (void *)XOR_MODE_IN_REG },
+ { .compatible = "marvell,armada-380-xor", .data = (void *)XOR_MODE_IN_DESC },
+ {},
+};
+MODULE_DEVICE_TABLE(of, mv_xor_dt_ids);
+
static int mv_xor_probe(struct platform_device *pdev)
{
const struct mbus_dram_target_info *dram;
@@ -1104,6 +1134,7 @@ static int mv_xor_probe(struct platform_device *pdev)
struct mv_xor_platform_data *pdata = dev_get_platdata(&pdev->dev);
struct resource *res;
int i, ret;
+ int op_in_desc;
dev_notice(&pdev->dev, "Marvell shared XOR driver\n");
@@ -1148,11 +1179,15 @@ static int mv_xor_probe(struct platform_device *pdev)
if (pdev->dev.of_node) {
struct device_node *np;
int i = 0;
+ const struct of_device_id *of_id =
+ of_match_device(mv_xor_dt_ids,
+ &pdev->dev);
for_each_child_of_node(pdev->dev.of_node, np) {
struct mv_xor_chan *chan;
dma_cap_mask_t cap_mask;
int irq;
+ op_in_desc = (int)of_id->data;
dma_cap_zero(cap_mask);
if (of_property_read_bool(np, "dmacap,memcpy"))
@@ -1169,7 +1204,7 @@ static int mv_xor_probe(struct platform_device *pdev)
}
chan = mv_xor_channel_add(xordev, pdev, i,
- cap_mask, irq);
+ cap_mask, irq, op_in_desc);
if (IS_ERR(chan)) {
ret = PTR_ERR(chan);
irq_dispose_mapping(irq);
@@ -1198,7 +1233,8 @@ static int mv_xor_probe(struct platform_device *pdev)
}
chan = mv_xor_channel_add(xordev, pdev, i,
- cd->cap_mask, irq);
+ cd->cap_mask, irq,
+ XOR_MODE_IN_REG);
if (IS_ERR(chan)) {
ret = PTR_ERR(chan);
goto err_channel_add;
@@ -1244,14 +1280,6 @@ static int mv_xor_remove(struct platform_device *pdev)
return 0;
}
-#ifdef CONFIG_OF
-static const struct of_device_id mv_xor_dt_ids[] = {
- { .compatible = "marvell,orion-xor", },
- {},
-};
-MODULE_DEVICE_TABLE(of, mv_xor_dt_ids);
-#endif
-
static struct platform_driver mv_xor_driver = {
.probe = mv_xor_probe,
.remove = mv_xor_remove,
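
With "descriptor mode" the operation (XOR, memcpy, CRC32C) is encoded in each hardware descriptor instead of once per channel in XOR_CONFIG, which is what allows a single channel to serve both XOR and memcpy requests. The mode is chosen from the matched compatible string via the of_device_id .data convention used in the hunk; a hedged sketch of that selection, assuming the device matched:

	/* Hedged sketch: deriving the operation mode from DT (enum from above). */
	static const struct of_device_id example_xor_dt_ids[] = {
		{ .compatible = "marvell,orion-xor",      .data = (void *)XOR_MODE_IN_REG  },
		{ .compatible = "marvell,armada-380-xor", .data = (void *)XOR_MODE_IN_DESC },
		{ /* sentinel */ },
	};

	static int example_op_mode(struct platform_device *pdev)
	{
		const struct of_device_id *of_id =
			of_match_device(example_xor_dt_ids, &pdev->dev);

		/*
		 * XOR_MODE_IN_DESC: XOR_CONFIG is programmed once with
		 * XOR_OPERATION_MODE_IN_DESC and every hardware descriptor
		 * carries its own XOR_DESC_OPERATION_{XOR,MEMCPY} bits.
		 * XOR_MODE_IN_REG: legacy behaviour, one operation per channel.
		 */
		return (int)(unsigned long)of_id->data;
	}
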
diff --git a/drivers/dma/mv_xor.h b/drivers/dma/mv_xor.h
index 91958dba39a2..b7455b42137b 100644
--- a/drivers/dma/mv_xor.h
+++ b/drivers/dma/mv_xor.h
@@ -19,7 +19,7 @@
#include <linux/dmaengine.h>
#include <linux/interrupt.h>
-#define MV_XOR_POOL_SIZE PAGE_SIZE
+#define MV_XOR_POOL_SIZE (MV_XOR_SLOT_SIZE * 3072)
#define MV_XOR_SLOT_SIZE 64
#define MV_XOR_THRESHOLD 1
#define MV_XOR_MAX_CHANNELS 2
@@ -30,7 +30,13 @@
/* Values for the XOR_CONFIG register */
#define XOR_OPERATION_MODE_XOR 0
#define XOR_OPERATION_MODE_MEMCPY 2
+#define XOR_OPERATION_MODE_IN_DESC 7
#define XOR_DESCRIPTOR_SWAP BIT(14)
+#define XOR_DESC_SUCCESS 0x40000000
+
+#define XOR_DESC_OPERATION_XOR (0 << 24)
+#define XOR_DESC_OPERATION_CRC32C (1 << 24)
+#define XOR_DESC_OPERATION_MEMCPY (2 << 24)
#define XOR_DESC_DMA_OWNED BIT(31)
#define XOR_DESC_EOD_INT_EN BIT(31)
@@ -88,13 +94,14 @@ struct mv_xor_device {
* @mmr_base: memory mapped register base
* @idx: the index of the xor channel
* @chain: device chain view of the descriptors
+ * @free_slots: free slots usable by the channel
+ * @allocated_slots: slots allocated by the driver
* @completed_slots: slots completed by HW but still need to be acked
* @device: parent device
* @common: common dmaengine channel object members
- * @last_used: place holder for allocation to continue from where it left off
- * @all_slots: complete domain of slots usable by the channel
* @slots_allocated: records the actual size of the descriptor slot pool
* @irq_tasklet: bottom half where mv_xor_slot_cleanup runs
+ * @op_in_desc: new mode of the driver, each op is written to its descriptor.
*/
struct mv_xor_chan {
int pending;
@@ -105,16 +112,17 @@ struct mv_xor_chan {
int irq;
enum dma_transaction_type current_type;
struct list_head chain;
+ struct list_head free_slots;
+ struct list_head allocated_slots;
struct list_head completed_slots;
dma_addr_t dma_desc_pool;
void *dma_desc_pool_virt;
size_t pool_size;
struct dma_device dmadev;
struct dma_chan dmachan;
- struct mv_xor_desc_slot *last_used;
- struct list_head all_slots;
int slots_allocated;
struct tasklet_struct irq_tasklet;
+ int op_in_desc;
char dummy_src[MV_XOR_MIN_BYTE_COUNT];
char dummy_dst[MV_XOR_MIN_BYTE_COUNT];
dma_addr_t dummy_src_addr, dummy_dst_addr;
@@ -122,9 +130,7 @@ struct mv_xor_chan {
/**
* struct mv_xor_desc_slot - software descriptor
- * @slot_node: node on the mv_xor_chan.all_slots list
- * @chain_node: node on the mv_xor_chan.chain list
- * @completed_node: node on the mv_xor_chan.completed_slots list
+ * @node: node on the mv_xor_chan lists
* @hw_desc: virtual address of the hardware descriptor chain
* @phys: hardware address of the hardware descriptor chain
* @slot_used: slot in use or not
@@ -133,12 +139,9 @@ struct mv_xor_chan {
* @async_tx: support for the async_tx api
*/
struct mv_xor_desc_slot {
- struct list_head slot_node;
- struct list_head chain_node;
- struct list_head completed_node;
+ struct list_head node;
enum dma_transaction_type type;
void *hw_desc;
- u16 slot_used;
u16 idx;
struct dma_async_tx_descriptor async_tx;
};
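
The header change replaces the three per-purpose list heads in the software descriptor with a single node, and adds free_slots/allocated_slots lists to the channel; a slot now simply migrates between the channel lists with list_move_tail() over its lifetime. A hedged sketch of that lifecycle (helper names from the mv_xor.c hunks above):

	/* Hedged sketch of the single-node slot lifecycle. */
	static struct mv_xor_desc_slot *example_grab_slot(struct mv_xor_chan *mv_chan)
	{
		struct mv_xor_desc_slot *slot = NULL;

		spin_lock_bh(&mv_chan->lock);
		if (!list_empty(&mv_chan->free_slots)) {
			slot = list_first_entry(&mv_chan->free_slots,
						struct mv_xor_desc_slot, node);
			/* free_slots -> allocated_slots (mv_chan_alloc_slot) */
			list_move_tail(&slot->node, &mv_chan->allocated_slots);
		}
		spin_unlock_bh(&mv_chan->lock);

		/*
		 * Later transitions, all on the same node:
		 *   allocated_slots -> chain             (mv_xor_tx_submit)
		 *   chain -> completed_slots             (done, not yet acked)
		 *   chain/completed_slots -> free_slots  (acked: mv_desc_clean_slot,
		 *                                         mv_chan_clean_completed_slots)
		 */
		return slot;
	}
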
diff --git a/drivers/dma/mxs-dma.c b/drivers/dma/mxs-dma.c
index 829ec686dac3..60de35251da5 100644
--- a/drivers/dma/mxs-dma.c
+++ b/drivers/dma/mxs-dma.c
@@ -170,7 +170,7 @@ static struct mxs_dma_type mxs_dma_types[] = {
}
};
-static struct platform_device_id mxs_dma_ids[] = {
+static const struct platform_device_id mxs_dma_ids[] = {
{
.name = "imx23-dma-apbh",
.driver_data = (kernel_ulong_t) &mxs_dma_types[0],
diff --git a/drivers/dma/nbpfaxi.c b/drivers/dma/nbpfaxi.c
index 88b77c98365d..2b5a198ac77e 100644
--- a/drivers/dma/nbpfaxi.c
+++ b/drivers/dma/nbpfaxi.c
@@ -1455,7 +1455,7 @@ static int nbpf_remove(struct platform_device *pdev)
return 0;
}
-static struct platform_device_id nbpf_ids[] = {
+static const struct platform_device_id nbpf_ids[] = {
{"nbpfaxi64dmac1b4", (kernel_ulong_t)&nbpf_cfg[NBPF1B4]},
{"nbpfaxi64dmac1b8", (kernel_ulong_t)&nbpf_cfg[NBPF1B8]},
{"nbpfaxi64dmac1b16", (kernel_ulong_t)&nbpf_cfg[NBPF1B16]},
diff --git a/drivers/dma/of-dma.c b/drivers/dma/of-dma.c
index cbd4a8aff120..1e1f2986eba8 100644
--- a/drivers/dma/of-dma.c
+++ b/drivers/dma/of-dma.c
@@ -45,6 +45,50 @@ static struct of_dma *of_dma_find_controller(struct of_phandle_args *dma_spec)
}
/**
+ * of_dma_router_xlate - translation function for router devices
+ * @dma_spec: pointer to DMA specifier as found in the device tree
+ * @of_dma: pointer to DMA controller data (router information)
+ *
+ * The function creates a new dma_spec to be passed to the router driver's
+ * of_dma_route_allocate() function, which prepares a dma_spec that will be
+ * used to request a channel from the real DMA controller.
+ */
+static struct dma_chan *of_dma_router_xlate(struct of_phandle_args *dma_spec,
+ struct of_dma *ofdma)
+{
+ struct dma_chan *chan;
+ struct of_dma *ofdma_target;
+ struct of_phandle_args dma_spec_target;
+ void *route_data;
+
+ /* translate the request for the real DMA controller */
+ memcpy(&dma_spec_target, dma_spec, sizeof(dma_spec_target));
+ route_data = ofdma->of_dma_route_allocate(&dma_spec_target, ofdma);
+ if (IS_ERR(route_data))
+ return NULL;
+
+ ofdma_target = of_dma_find_controller(&dma_spec_target);
+ if (!ofdma_target)
+ return NULL;
+
+ chan = ofdma_target->of_dma_xlate(&dma_spec_target, ofdma_target);
+ if (chan) {
+ chan->router = ofdma->dma_router;
+ chan->route_data = route_data;
+ } else {
+ ofdma->dma_router->route_free(ofdma->dma_router->dev,
+ route_data);
+ }
+
+ /*
+ * Need to put the node back since the ofdma->of_dma_route_allocate
+ * has taken it for generating the new, translated dma_spec
+ */
+ of_node_put(dma_spec_target.np);
+ return chan;
+}
+
+/**
* of_dma_controller_register - Register a DMA controller to DT DMA helpers
* @np: device node of DMA controller
* @of_dma_xlate: translation function which converts a phandle
@@ -110,6 +154,51 @@ void of_dma_controller_free(struct device_node *np)
EXPORT_SYMBOL_GPL(of_dma_controller_free);
/**
+ * of_dma_router_register - Register a DMA router to DT DMA helpers as a
+ * controller
+ * @np: device node of DMA router
+ * @of_dma_route_allocate: setup function for the router which needs to
+ * modify the dma_spec for the DMA controller to
+ * use and to set up the requested route.
+ * @dma_router: pointer to dma_router structure to be used when
+ * the route needs to be freed.
+ *
+ * Returns 0 on success or appropriate errno value on error.
+ *
+ * Allocated memory should be freed with appropriate of_dma_controller_free()
+ * call.
+ */
+int of_dma_router_register(struct device_node *np,
+ void *(*of_dma_route_allocate)
+ (struct of_phandle_args *, struct of_dma *),
+ struct dma_router *dma_router)
+{
+ struct of_dma *ofdma;
+
+ if (!np || !of_dma_route_allocate || !dma_router) {
+ pr_err("%s: not enough information provided\n", __func__);
+ return -EINVAL;
+ }
+
+ ofdma = kzalloc(sizeof(*ofdma), GFP_KERNEL);
+ if (!ofdma)
+ return -ENOMEM;
+
+ ofdma->of_node = np;
+ ofdma->of_dma_xlate = of_dma_router_xlate;
+ ofdma->of_dma_route_allocate = of_dma_route_allocate;
+ ofdma->dma_router = dma_router;
+
+ /* Now queue of_dma controller structure in list */
+ mutex_lock(&of_dma_lock);
+ list_add_tail(&ofdma->of_dma_controllers, &of_dma_list);
+ mutex_unlock(&of_dma_lock);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(of_dma_router_register);
+
+/**
* of_dma_match_channel - Check if a DMA specifier matches name
* @np: device node to look for DMA channels
* @name: channel name to be matched
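
of_dma_router_register() lets a DMA crossbar/request router present itself as a DT DMA controller: its of_dma_route_allocate() callback rewrites the incoming dma_spec so of_dma_router_xlate() can forward the request to the real controller, and the returned route_data is what dma_chan_put() later hands back to route_free(). A hedged sketch of a hypothetical router driver's registration (all "foo" names are made up):

	/* Hedged sketch: hypothetical crossbar driver using the new API. */
	static void foo_xbar_route_free(struct device *dev, void *route_data)
	{
		/* release the crossbar line recorded in route_data */
	}

	static void *foo_xbar_route_allocate(struct of_phandle_args *dma_spec,
					     struct of_dma *ofdma)
	{
		/*
		 * Pick a free crossbar line, program it, then rewrite
		 * dma_spec->np and dma_spec->args[] to point at the real
		 * DMA controller.
		 */
		return ERR_PTR(-ENODEV);	/* placeholder */
	}

	static int foo_xbar_probe(struct platform_device *pdev)
	{
		struct dma_router *router;

		router = devm_kzalloc(&pdev->dev, sizeof(*router), GFP_KERNEL);
		if (!router)
			return -ENOMEM;

		router->dev = &pdev->dev;
		router->route_free = foo_xbar_route_free;

		return of_dma_router_register(pdev->dev.of_node,
					      foo_xbar_route_allocate, router);
	}
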
diff --git a/drivers/dma/omap-dma.c b/drivers/dma/omap-dma.c
index 167dbaf65742..249445c8a4c6 100644
--- a/drivers/dma/omap-dma.c
+++ b/drivers/dma/omap-dma.c
@@ -22,6 +22,9 @@
#include "virt-dma.h"
+#define OMAP_SDMA_REQUESTS 127
+#define OMAP_SDMA_CHANNELS 32
+
struct omap_dmadev {
struct dma_device ddev;
spinlock_t lock;
@@ -31,9 +34,10 @@ struct omap_dmadev {
const struct omap_dma_reg *reg_map;
struct omap_system_dma_plat_info *plat;
bool legacy;
+ unsigned dma_requests;
spinlock_t irq_lock;
uint32_t irq_enable_mask;
- struct omap_chan *lch_map[32];
+ struct omap_chan *lch_map[OMAP_SDMA_CHANNELS];
};
struct omap_chan {
@@ -362,7 +366,7 @@ static void omap_dma_start_sg(struct omap_chan *c, struct omap_desc *d,
struct omap_sg *sg = d->sg + idx;
unsigned cxsa, cxei, cxfi;
- if (d->dir == DMA_DEV_TO_MEM) {
+ if (d->dir == DMA_DEV_TO_MEM || d->dir == DMA_MEM_TO_MEM) {
cxsa = CDSA;
cxei = CDEI;
cxfi = CDFI;
@@ -408,7 +412,7 @@ static void omap_dma_start_desc(struct omap_chan *c)
if (dma_omap1())
omap_dma_chan_write(c, CCR2, d->ccr >> 16);
- if (d->dir == DMA_DEV_TO_MEM) {
+ if (d->dir == DMA_DEV_TO_MEM || d->dir == DMA_MEM_TO_MEM) {
cxsa = CSSA;
cxei = CSEI;
cxfi = CSFI;
@@ -589,6 +593,7 @@ static void omap_dma_free_chan_resources(struct dma_chan *chan)
omap_free_dma(c->dma_ch);
dev_dbg(od->ddev.dev, "freeing channel for %u\n", c->dma_sig);
+ c->dma_sig = 0;
}
static size_t omap_dma_sg_size(struct omap_sg *sg)
@@ -948,6 +953,51 @@ static struct dma_async_tx_descriptor *omap_dma_prep_dma_cyclic(
return vchan_tx_prep(&c->vc, &d->vd, flags);
}
+static struct dma_async_tx_descriptor *omap_dma_prep_dma_memcpy(
+ struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
+ size_t len, unsigned long tx_flags)
+{
+ struct omap_chan *c = to_omap_dma_chan(chan);
+ struct omap_desc *d;
+ uint8_t data_type;
+
+ d = kzalloc(sizeof(*d) + sizeof(d->sg[0]), GFP_ATOMIC);
+ if (!d)
+ return NULL;
+
+ data_type = __ffs((src | dest | len));
+ if (data_type > CSDP_DATA_TYPE_32)
+ data_type = CSDP_DATA_TYPE_32;
+
+ d->dir = DMA_MEM_TO_MEM;
+ d->dev_addr = src;
+ d->fi = 0;
+ d->es = data_type;
+ d->sg[0].en = len / BIT(data_type);
+ d->sg[0].fn = 1;
+ d->sg[0].addr = dest;
+ d->sglen = 1;
+ d->ccr = c->ccr;
+ d->ccr |= CCR_DST_AMODE_POSTINC | CCR_SRC_AMODE_POSTINC;
+
+ d->cicr = CICR_DROP_IE;
+ if (tx_flags & DMA_PREP_INTERRUPT)
+ d->cicr |= CICR_FRAME_IE;
+
+ d->csdp = data_type;
+
+ if (dma_omap1()) {
+ d->cicr |= CICR_TOUT_IE;
+ d->csdp |= CSDP_DST_PORT_EMIFF | CSDP_SRC_PORT_EMIFF;
+ } else {
+ d->csdp |= CSDP_DST_PACKED | CSDP_SRC_PACKED;
+ d->cicr |= CICR_MISALIGNED_ERR_IE | CICR_TRANS_ERR_IE;
+ d->csdp |= CSDP_DST_BURST_64 | CSDP_SRC_BURST_64;
+ }
+
+ return vchan_tx_prep(&c->vc, &d->vd, tx_flags);
+}
+
static int omap_dma_slave_config(struct dma_chan *chan, struct dma_slave_config *cfg)
{
struct omap_chan *c = to_omap_dma_chan(chan);
@@ -1037,7 +1087,7 @@ static int omap_dma_resume(struct dma_chan *chan)
return 0;
}
-static int omap_dma_chan_init(struct omap_dmadev *od, int dma_sig)
+static int omap_dma_chan_init(struct omap_dmadev *od)
{
struct omap_chan *c;
@@ -1046,7 +1096,6 @@ static int omap_dma_chan_init(struct omap_dmadev *od, int dma_sig)
return -ENOMEM;
c->reg_map = od->reg_map;
- c->dma_sig = dma_sig;
c->vc.desc_free = omap_dma_desc_free;
vchan_init(&c->vc, &od->ddev);
INIT_LIST_HEAD(&c->node);
@@ -1094,12 +1143,14 @@ static int omap_dma_probe(struct platform_device *pdev)
dma_cap_set(DMA_SLAVE, od->ddev.cap_mask);
dma_cap_set(DMA_CYCLIC, od->ddev.cap_mask);
+ dma_cap_set(DMA_MEMCPY, od->ddev.cap_mask);
od->ddev.device_alloc_chan_resources = omap_dma_alloc_chan_resources;
od->ddev.device_free_chan_resources = omap_dma_free_chan_resources;
od->ddev.device_tx_status = omap_dma_tx_status;
od->ddev.device_issue_pending = omap_dma_issue_pending;
od->ddev.device_prep_slave_sg = omap_dma_prep_slave_sg;
od->ddev.device_prep_dma_cyclic = omap_dma_prep_dma_cyclic;
+ od->ddev.device_prep_dma_memcpy = omap_dma_prep_dma_memcpy;
od->ddev.device_config = omap_dma_slave_config;
od->ddev.device_pause = omap_dma_pause;
od->ddev.device_resume = omap_dma_resume;
@@ -1116,8 +1167,17 @@ static int omap_dma_probe(struct platform_device *pdev)
tasklet_init(&od->task, omap_dma_sched, (unsigned long)od);
- for (i = 0; i < 127; i++) {
- rc = omap_dma_chan_init(od, i);
+ od->dma_requests = OMAP_SDMA_REQUESTS;
+ if (pdev->dev.of_node && of_property_read_u32(pdev->dev.of_node,
+ "dma-requests",
+ &od->dma_requests)) {
+ dev_info(&pdev->dev,
+ "Missing dma-requests property, using %u.\n",
+ OMAP_SDMA_REQUESTS);
+ }
+
+ for (i = 0; i < OMAP_SDMA_CHANNELS; i++) {
+ rc = omap_dma_chan_init(od);
if (rc) {
omap_dma_free(od);
return rc;
@@ -1208,10 +1268,14 @@ static struct platform_driver omap_dma_driver = {
bool omap_dma_filter_fn(struct dma_chan *chan, void *param)
{
if (chan->device->dev->driver == &omap_dma_driver.driver) {
+ struct omap_dmadev *od = to_omap_dma_dev(chan->device);
struct omap_chan *c = to_omap_dma_chan(chan);
unsigned req = *(unsigned *)param;
- return req == c->dma_sig;
+ if (req <= od->dma_requests) {
+ c->dma_sig = req;
+ return true;
+ }
}
return false;
}
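
With DMA_MEMCPY now advertised, clients can use the sDMA engine for plain memory-to-memory copies through the generic dmaengine API; the new prep callback builds a single-element transfer whose element size is derived from the alignment of src, dst and len. A hedged sketch of such a client (generic dmaengine code, not omap-specific; dst_dma, src_dma and len are assumed to be DMA-mapped by the caller):

	/* Hedged sketch: generic dmaengine memcpy client. */
	static int example_memcpy(dma_addr_t dst_dma, dma_addr_t src_dma, size_t len)
	{
		dma_cap_mask_t mask;
		struct dma_chan *chan;
		struct dma_async_tx_descriptor *tx;
		dma_cookie_t cookie;
		int ret = 0;

		dma_cap_zero(mask);
		dma_cap_set(DMA_MEMCPY, mask);
		chan = dma_request_channel(mask, NULL, NULL);
		if (!chan)
			return -ENODEV;

		tx = chan->device->device_prep_dma_memcpy(chan, dst_dma, src_dma,
					len, DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
		if (!tx) {
			ret = -ENOMEM;
			goto out;
		}

		cookie = dmaengine_submit(tx);
		dma_async_issue_pending(chan);
		if (dma_sync_wait(chan, cookie) != DMA_COMPLETE)
			ret = -EIO;
	out:
		dma_release_channel(chan);
		return ret;
	}
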
diff --git a/drivers/dma/pl330.c b/drivers/dma/pl330.c
index 340f9e607cd8..f513f77b1d85 100644
--- a/drivers/dma/pl330.c
+++ b/drivers/dma/pl330.c
@@ -1424,8 +1424,8 @@ static int pl330_submit_req(struct pl330_thread *thrd,
goto xfer_exit;
if (ret > pl330->mcbufsz / 2) {
- dev_info(pl330->ddma.dev, "%s:%d Trying increasing mcbufsz\n",
- __func__, __LINE__);
+ dev_info(pl330->ddma.dev, "%s:%d Try increasing mcbufsz (%i/%i)\n",
+ __func__, __LINE__, ret, pl330->mcbufsz / 2);
ret = -ENOMEM;
goto xfer_exit;
}
@@ -2584,12 +2584,14 @@ pl330_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dst,
{
struct dma_pl330_desc *desc;
struct dma_pl330_chan *pch = to_pchan(chan);
- struct pl330_dmac *pl330 = pch->dmac;
+ struct pl330_dmac *pl330;
int burst;
if (unlikely(!pch || !len))
return NULL;
+ pl330 = pch->dmac;
+
desc = __pl330_prep_dma_memcpy(pch, dst, src, len);
if (!desc)
return NULL;
diff --git a/drivers/dma/pxa_dma.c b/drivers/dma/pxa_dma.c
new file mode 100644
index 000000000000..ddcbbf5cd9e9
--- /dev/null
+++ b/drivers/dma/pxa_dma.c
@@ -0,0 +1,1467 @@
+/*
+ * Copyright 2015 Robert Jarzmik <robert.jarzmik@free.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/interrupt.h>
+#include <linux/dma-mapping.h>
+#include <linux/slab.h>
+#include <linux/dmaengine.h>
+#include <linux/platform_device.h>
+#include <linux/device.h>
+#include <linux/platform_data/mmp_dma.h>
+#include <linux/dmapool.h>
+#include <linux/of_device.h>
+#include <linux/of_dma.h>
+#include <linux/of.h>
+#include <linux/dma/pxa-dma.h>
+
+#include "dmaengine.h"
+#include "virt-dma.h"
+
+#define DCSR(n) (0x0000 + ((n) << 2))
+#define DALGN(n) 0x00a0
+#define DINT 0x00f0
+#define DDADR(n) (0x0200 + ((n) << 4))
+#define DSADR(n) (0x0204 + ((n) << 4))
+#define DTADR(n) (0x0208 + ((n) << 4))
+#define DCMD(n) (0x020c + ((n) << 4))
+
+#define PXA_DCSR_RUN BIT(31) /* Run Bit (read / write) */
+#define PXA_DCSR_NODESC BIT(30) /* No-Descriptor Fetch (read / write) */
+#define PXA_DCSR_STOPIRQEN BIT(29) /* Stop Interrupt Enable (R/W) */
+#define PXA_DCSR_REQPEND BIT(8) /* Request Pending (read-only) */
+#define PXA_DCSR_STOPSTATE BIT(3) /* Stop State (read-only) */
+#define PXA_DCSR_ENDINTR BIT(2) /* End Interrupt (read / write) */
+#define PXA_DCSR_STARTINTR BIT(1) /* Start Interrupt (read / write) */
+#define PXA_DCSR_BUSERR BIT(0) /* Bus Error Interrupt (read / write) */
+
+#define PXA_DCSR_EORIRQEN BIT(28) /* End of Receive IRQ Enable (R/W) */
+#define PXA_DCSR_EORJMPEN BIT(27) /* Jump to next descriptor on EOR */
+#define PXA_DCSR_EORSTOPEN BIT(26) /* STOP on an EOR */
+#define PXA_DCSR_SETCMPST BIT(25) /* Set Descriptor Compare Status */
+#define PXA_DCSR_CLRCMPST BIT(24) /* Clear Descriptor Compare Status */
+#define PXA_DCSR_CMPST BIT(10) /* The Descriptor Compare Status */
+#define PXA_DCSR_EORINTR BIT(9) /* The end of Receive */
+
+#define DRCMR_MAPVLD BIT(7) /* Map Valid (read / write) */
+#define DRCMR_CHLNUM 0x1f /* mask for Channel Number (read / write) */
+
+#define DDADR_DESCADDR 0xfffffff0 /* Address of next descriptor (mask) */
+#define DDADR_STOP BIT(0) /* Stop (read / write) */
+
+#define PXA_DCMD_INCSRCADDR BIT(31) /* Source Address Increment Setting. */
+#define PXA_DCMD_INCTRGADDR BIT(30) /* Target Address Increment Setting. */
+#define PXA_DCMD_FLOWSRC BIT(29) /* Flow Control by the source. */
+#define PXA_DCMD_FLOWTRG BIT(28) /* Flow Control by the target. */
+#define PXA_DCMD_STARTIRQEN BIT(22) /* Start Interrupt Enable */
+#define PXA_DCMD_ENDIRQEN BIT(21) /* End Interrupt Enable */
+#define PXA_DCMD_ENDIAN BIT(18) /* Device Endian-ness. */
+#define PXA_DCMD_BURST8 (1 << 16) /* 8 byte burst */
+#define PXA_DCMD_BURST16 (2 << 16) /* 16 byte burst */
+#define PXA_DCMD_BURST32 (3 << 16) /* 32 byte burst */
+#define PXA_DCMD_WIDTH1 (1 << 14) /* 1 byte width */
+#define PXA_DCMD_WIDTH2 (2 << 14) /* 2 byte width (HalfWord) */
+#define PXA_DCMD_WIDTH4 (3 << 14) /* 4 byte width (Word) */
+#define PXA_DCMD_LENGTH 0x01fff /* length mask (max = 8K - 1) */
+
+#define PDMA_ALIGNMENT 3
+#define PDMA_MAX_DESC_BYTES (PXA_DCMD_LENGTH & ~((1 << PDMA_ALIGNMENT) - 1))
+
+struct pxad_desc_hw {
+ u32 ddadr; /* Points to the next descriptor + flags */
+ u32 dsadr; /* DSADR value for the current transfer */
+ u32 dtadr; /* DTADR value for the current transfer */
+ u32 dcmd; /* DCMD value for the current transfer */
+} __aligned(16);
+
+struct pxad_desc_sw {
+ struct virt_dma_desc vd; /* Virtual descriptor */
+ int nb_desc; /* Number of hw. descriptors */
+ size_t len; /* Number of bytes xfered */
+ dma_addr_t first; /* First descriptor's addr */
+
+ /* At least one descriptor has an src/dst address not multiple of 8 */
+ bool misaligned;
+ bool cyclic;
+ struct dma_pool *desc_pool; /* Channel's used allocator */
+
+ struct pxad_desc_hw *hw_desc[]; /* DMA coherent descriptors */
+};
+
+struct pxad_phy {
+ int idx;
+ void __iomem *base;
+ struct pxad_chan *vchan;
+};
+
+struct pxad_chan {
+ struct virt_dma_chan vc; /* Virtual channel */
+ u32 drcmr; /* Requestor of the channel */
+ enum pxad_chan_prio prio; /* Required priority of phy */
+ /*
+ * At least one desc_sw in submitted or issued transfers on this channel
+ * has one address such as: addr % 8 != 0. This implies the DALGN
+ * setting on the phy.
+ */
+ bool misaligned;
+ struct dma_slave_config cfg; /* Runtime config */
+
+ /* protected by vc->lock */
+ struct pxad_phy *phy;
+ struct dma_pool *desc_pool; /* Descriptors pool */
+};
+
+struct pxad_device {
+ struct dma_device slave;
+ int nr_chans;
+ void __iomem *base;
+ struct pxad_phy *phys;
+ spinlock_t phy_lock; /* Phy association */
+#ifdef CONFIG_DEBUG_FS
+ struct dentry *dbgfs_root;
+ struct dentry *dbgfs_state;
+ struct dentry **dbgfs_chan;
+#endif
+};
+
+#define tx_to_pxad_desc(tx) \
+ container_of(tx, struct pxad_desc_sw, async_tx)
+#define to_pxad_chan(dchan) \
+ container_of(dchan, struct pxad_chan, vc.chan)
+#define to_pxad_dev(dmadev) \
+ container_of(dmadev, struct pxad_device, slave)
+#define to_pxad_sw_desc(_vd) \
+ container_of((_vd), struct pxad_desc_sw, vd)
+
+#define _phy_readl_relaxed(phy, _reg) \
+ readl_relaxed((phy)->base + _reg((phy)->idx))
+#define phy_readl_relaxed(phy, _reg) \
+ ({ \
+ u32 _v; \
+ _v = readl_relaxed((phy)->base + _reg((phy)->idx)); \
+ dev_vdbg(&phy->vchan->vc.chan.dev->device, \
+ "%s(): readl(%s): 0x%08x\n", __func__, #_reg, \
+ _v); \
+ _v; \
+ })
+#define phy_writel(phy, val, _reg) \
+ do { \
+ writel((val), (phy)->base + _reg((phy)->idx)); \
+ dev_vdbg(&phy->vchan->vc.chan.dev->device, \
+ "%s(): writel(0x%08x, %s)\n", \
+ __func__, (u32)(val), #_reg); \
+ } while (0)
+#define phy_writel_relaxed(phy, val, _reg) \
+ do { \
+ writel_relaxed((val), (phy)->base + _reg((phy)->idx)); \
+ dev_vdbg(&phy->vchan->vc.chan.dev->device, \
+ "%s(): writel_relaxed(0x%08x, %s)\n", \
+ __func__, (u32)(val), #_reg); \
+ } while (0)
+
+static unsigned int pxad_drcmr(unsigned int line)
+{
+ if (line < 64)
+ return 0x100 + line * 4;
+ return 0x1000 + line * 4;
+}
+
+/*
+ * Debug fs
+ */
+#ifdef CONFIG_DEBUG_FS
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/seq_file.h>
+
+static int dbg_show_requester_chan(struct seq_file *s, void *p)
+{
+ int pos = 0;
+ struct pxad_phy *phy = s->private;
+ int i;
+ u32 drcmr;
+
+ pos += seq_printf(s, "DMA channel %d requester :\n", phy->idx);
+ for (i = 0; i < 70; i++) {
+ drcmr = readl_relaxed(phy->base + pxad_drcmr(i));
+ if ((drcmr & DRCMR_CHLNUM) == phy->idx)
+ pos += seq_printf(s, "\tRequester %d (MAPVLD=%d)\n", i,
+ !!(drcmr & DRCMR_MAPVLD));
+ }
+ return pos;
+}
+
+static inline int dbg_burst_from_dcmd(u32 dcmd)
+{
+ int burst = (dcmd >> 16) & 0x3;
+
+ return burst ? 4 << burst : 0;
+}
+
+static int is_phys_valid(unsigned long addr)
+{
+ return pfn_valid(__phys_to_pfn(addr));
+}
+
+#define PXA_DCSR_STR(flag) (dcsr & PXA_DCSR_##flag ? #flag" " : "")
+#define PXA_DCMD_STR(flag) (dcmd & PXA_DCMD_##flag ? #flag" " : "")
+
+static int dbg_show_descriptors(struct seq_file *s, void *p)
+{
+ struct pxad_phy *phy = s->private;
+ int i, max_show = 20, burst, width;
+ u32 dcmd;
+ unsigned long phys_desc, ddadr;
+ struct pxad_desc_hw *desc;
+
+ phys_desc = ddadr = _phy_readl_relaxed(phy, DDADR);
+
+ seq_printf(s, "DMA channel %d descriptors :\n", phy->idx);
+ seq_printf(s, "[%03d] First descriptor unknown\n", 0);
+ for (i = 1; i < max_show && is_phys_valid(phys_desc); i++) {
+ desc = phys_to_virt(phys_desc);
+ dcmd = desc->dcmd;
+ burst = dbg_burst_from_dcmd(dcmd);
+ width = (1 << ((dcmd >> 14) & 0x3)) >> 1;
+
+ seq_printf(s, "[%03d] Desc at %08lx(virt %p)\n",
+ i, phys_desc, desc);
+ seq_printf(s, "\tDDADR = %08x\n", desc->ddadr);
+ seq_printf(s, "\tDSADR = %08x\n", desc->dsadr);
+ seq_printf(s, "\tDTADR = %08x\n", desc->dtadr);
+ seq_printf(s, "\tDCMD = %08x (%s%s%s%s%s%s%sburst=%d width=%d len=%d)\n",
+ dcmd,
+ PXA_DCMD_STR(INCSRCADDR), PXA_DCMD_STR(INCTRGADDR),
+ PXA_DCMD_STR(FLOWSRC), PXA_DCMD_STR(FLOWTRG),
+ PXA_DCMD_STR(STARTIRQEN), PXA_DCMD_STR(ENDIRQEN),
+ PXA_DCMD_STR(ENDIAN), burst, width,
+ dcmd & PXA_DCMD_LENGTH);
+ phys_desc = desc->ddadr;
+ }
+ if (i == max_show)
+ seq_printf(s, "[%03d] Desc at %08lx ... max display reached\n",
+ i, phys_desc);
+ else
+ seq_printf(s, "[%03d] Desc at %08lx is %s\n",
+ i, phys_desc, phys_desc == DDADR_STOP ?
+ "DDADR_STOP" : "invalid");
+
+ return 0;
+}
+
+static int dbg_show_chan_state(struct seq_file *s, void *p)
+{
+ struct pxad_phy *phy = s->private;
+ u32 dcsr, dcmd;
+ int burst, width;
+ static const char * const str_prio[] = {
+ "high", "normal", "low", "invalid"
+ };
+
+ dcsr = _phy_readl_relaxed(phy, DCSR);
+ dcmd = _phy_readl_relaxed(phy, DCMD);
+ burst = dbg_burst_from_dcmd(dcmd);
+ width = (1 << ((dcmd >> 14) & 0x3)) >> 1;
+
+ seq_printf(s, "DMA channel %d\n", phy->idx);
+ seq_printf(s, "\tPriority : %s\n",
+ str_prio[(phy->idx & 0xf) / 4]);
+ seq_printf(s, "\tUnaligned transfer bit: %s\n",
+ _phy_readl_relaxed(phy, DALGN) & BIT(phy->idx) ?
+ "yes" : "no");
+ seq_printf(s, "\tDCSR = %08x (%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s)\n",
+ dcsr, PXA_DCSR_STR(RUN), PXA_DCSR_STR(NODESC),
+ PXA_DCSR_STR(STOPIRQEN), PXA_DCSR_STR(EORIRQEN),
+ PXA_DCSR_STR(EORJMPEN), PXA_DCSR_STR(EORSTOPEN),
+ PXA_DCSR_STR(SETCMPST), PXA_DCSR_STR(CLRCMPST),
+ PXA_DCSR_STR(CMPST), PXA_DCSR_STR(EORINTR),
+ PXA_DCSR_STR(REQPEND), PXA_DCSR_STR(STOPSTATE),
+ PXA_DCSR_STR(ENDINTR), PXA_DCSR_STR(STARTINTR),
+ PXA_DCSR_STR(BUSERR));
+
+ seq_printf(s, "\tDCMD = %08x (%s%s%s%s%s%s%sburst=%d width=%d len=%d)\n",
+ dcmd,
+ PXA_DCMD_STR(INCSRCADDR), PXA_DCMD_STR(INCTRGADDR),
+ PXA_DCMD_STR(FLOWSRC), PXA_DCMD_STR(FLOWTRG),
+ PXA_DCMD_STR(STARTIRQEN), PXA_DCMD_STR(ENDIRQEN),
+ PXA_DCMD_STR(ENDIAN), burst, width, dcmd & PXA_DCMD_LENGTH);
+ seq_printf(s, "\tDSADR = %08x\n", _phy_readl_relaxed(phy, DSADR));
+ seq_printf(s, "\tDTADR = %08x\n", _phy_readl_relaxed(phy, DTADR));
+ seq_printf(s, "\tDDADR = %08x\n", _phy_readl_relaxed(phy, DDADR));
+
+ return 0;
+}
+
+static int dbg_show_state(struct seq_file *s, void *p)
+{
+ struct pxad_device *pdev = s->private;
+
+ /* basic device status */
+ seq_puts(s, "DMA engine status\n");
+ seq_printf(s, "\tChannel number: %d\n", pdev->nr_chans);
+
+ return 0;
+}
+
+#define DBGFS_FUNC_DECL(name) \
+static int dbg_open_##name(struct inode *inode, struct file *file) \
+{ \
+ return single_open(file, dbg_show_##name, inode->i_private); \
+} \
+static const struct file_operations dbg_fops_##name = { \
+ .owner = THIS_MODULE, \
+ .open = dbg_open_##name, \
+ .llseek = seq_lseek, \
+ .read = seq_read, \
+ .release = single_release, \
+}
+
+DBGFS_FUNC_DECL(state);
+DBGFS_FUNC_DECL(chan_state);
+DBGFS_FUNC_DECL(descriptors);
+DBGFS_FUNC_DECL(requester_chan);
+
+static struct dentry *pxad_dbg_alloc_chan(struct pxad_device *pdev,
+ int ch, struct dentry *chandir)
+{
+ char chan_name[11];
+ struct dentry *chan, *chan_state = NULL, *chan_descr = NULL;
+ struct dentry *chan_reqs = NULL;
+ void *dt;
+
+ scnprintf(chan_name, sizeof(chan_name), "%d", ch);
+ chan = debugfs_create_dir(chan_name, chandir);
+ dt = (void *)&pdev->phys[ch];
+
+ if (chan)
+ chan_state = debugfs_create_file("state", 0400, chan, dt,
+ &dbg_fops_chan_state);
+ if (chan_state)
+ chan_descr = debugfs_create_file("descriptors", 0400, chan, dt,
+ &dbg_fops_descriptors);
+ if (chan_descr)
+ chan_reqs = debugfs_create_file("requesters", 0400, chan, dt,
+ &dbg_fops_requester_chan);
+ if (!chan_reqs)
+ goto err_state;
+
+ return chan;
+
+err_state:
+ debugfs_remove_recursive(chan);
+ return NULL;
+}
+
+static void pxad_init_debugfs(struct pxad_device *pdev)
+{
+ int i;
+ struct dentry *chandir;
+
+ pdev->dbgfs_root = debugfs_create_dir(dev_name(pdev->slave.dev), NULL);
+ if (IS_ERR(pdev->dbgfs_root) || !pdev->dbgfs_root)
+ goto err_root;
+
+ pdev->dbgfs_state = debugfs_create_file("state", 0400, pdev->dbgfs_root,
+ pdev, &dbg_fops_state);
+ if (!pdev->dbgfs_state)
+ goto err_state;
+
+ pdev->dbgfs_chan =
+ kmalloc_array(pdev->nr_chans, sizeof(*pdev->dbgfs_state),
+ GFP_KERNEL);
+ if (!pdev->dbgfs_chan)
+ goto err_alloc;
+
+ chandir = debugfs_create_dir("channels", pdev->dbgfs_root);
+ if (!chandir)
+ goto err_chandir;
+
+ for (i = 0; i < pdev->nr_chans; i++) {
+ pdev->dbgfs_chan[i] = pxad_dbg_alloc_chan(pdev, i, chandir);
+ if (!pdev->dbgfs_chan[i])
+ goto err_chans;
+ }
+
+ return;
+err_chans:
+err_chandir:
+ kfree(pdev->dbgfs_chan);
+err_alloc:
+err_state:
+ debugfs_remove_recursive(pdev->dbgfs_root);
+err_root:
+ pr_err("pxad: debugfs is not available\n");
+}
+
+static void pxad_cleanup_debugfs(struct pxad_device *pdev)
+{
+ debugfs_remove_recursive(pdev->dbgfs_root);
+}
+#else
+static inline void pxad_init_debugfs(struct pxad_device *pdev) {}
+static inline void pxad_cleanup_debugfs(struct pxad_device *pdev) {}
+#endif
+
+/*
+ * In the transition phase where legacy pxa handling is done at the same time as
+ * mmp_dma, the DMA physical channel split between the 2 DMA providers is done
+ * through legacy_reserved. Legacy code reserves DMA channels by setting
+ * corresponding bits in legacy_reserved.
+ */
+static u32 legacy_reserved;
+static u32 legacy_unavailable;
+
+static struct pxad_phy *lookup_phy(struct pxad_chan *pchan)
+{
+ int prio, i;
+ struct pxad_device *pdev = to_pxad_dev(pchan->vc.chan.device);
+ struct pxad_phy *phy, *found = NULL;
+ unsigned long flags;
+
+ /*
+ * dma channel priorities
+ * ch 0 - 3, 16 - 19 <--> (0)
+ * ch 4 - 7, 20 - 23 <--> (1)
+ * ch 8 - 11, 24 - 27 <--> (2)
+ * ch 12 - 15, 28 - 31 <--> (3)
+ */
+
+ spin_lock_irqsave(&pdev->phy_lock, flags);
+ for (prio = pchan->prio; prio >= PXAD_PRIO_HIGHEST; prio--) {
+ for (i = 0; i < pdev->nr_chans; i++) {
+ if (prio != (i & 0xf) >> 2)
+ continue;
+ if ((i < 32) && (legacy_reserved & BIT(i)))
+ continue;
+ phy = &pdev->phys[i];
+ if (!phy->vchan) {
+ phy->vchan = pchan;
+ found = phy;
+ if (i < 32)
+ legacy_unavailable |= BIT(i);
+ goto out_unlock;
+ }
+ }
+ }
+
+out_unlock:
+ spin_unlock_irqrestore(&pdev->phy_lock, flags);
+ dev_dbg(&pchan->vc.chan.dev->device,
+ "%s(): phy=%p(%d)\n", __func__, found,
+ found ? found->idx : -1);
+
+ return found;
+}
+
+static void pxad_free_phy(struct pxad_chan *chan)
+{
+ struct pxad_device *pdev = to_pxad_dev(chan->vc.chan.device);
+ unsigned long flags;
+ u32 reg;
+ int i;
+
+ dev_dbg(&chan->vc.chan.dev->device,
+ "%s(): freeing\n", __func__);
+ if (!chan->phy)
+ return;
+
+ /* clear the channel mapping in DRCMR */
+ reg = pxad_drcmr(chan->drcmr);
+ writel_relaxed(0, chan->phy->base + reg);
+
+ spin_lock_irqsave(&pdev->phy_lock, flags);
+ for (i = 0; i < 32; i++)
+ if (chan->phy == &pdev->phys[i])
+ legacy_unavailable &= ~BIT(i);
+ chan->phy->vchan = NULL;
+ chan->phy = NULL;
+ spin_unlock_irqrestore(&pdev->phy_lock, flags);
+}
+
+static bool is_chan_running(struct pxad_chan *chan)
+{
+ u32 dcsr;
+ struct pxad_phy *phy = chan->phy;
+
+ if (!phy)
+ return false;
+ dcsr = phy_readl_relaxed(phy, DCSR);
+ return dcsr & PXA_DCSR_RUN;
+}
+
+static bool is_running_chan_misaligned(struct pxad_chan *chan)
+{
+ u32 dalgn;
+
+ BUG_ON(!chan->phy);
+ dalgn = phy_readl_relaxed(chan->phy, DALGN);
+ return dalgn & (BIT(chan->phy->idx));
+}
+
+static void phy_enable(struct pxad_phy *phy, bool misaligned)
+{
+ u32 reg, dalgn;
+
+ if (!phy->vchan)
+ return;
+
+ dev_dbg(&phy->vchan->vc.chan.dev->device,
+ "%s(); phy=%p(%d) misaligned=%d\n", __func__,
+ phy, phy->idx, misaligned);
+
+ reg = pxad_drcmr(phy->vchan->drcmr);
+ writel_relaxed(DRCMR_MAPVLD | phy->idx, phy->base + reg);
+
+ dalgn = phy_readl_relaxed(phy, DALGN);
+ if (misaligned)
+ dalgn |= BIT(phy->idx);
+ else
+ dalgn &= ~BIT(phy->idx);
+ phy_writel_relaxed(phy, dalgn, DALGN);
+
+ phy_writel(phy, PXA_DCSR_STOPIRQEN | PXA_DCSR_ENDINTR |
+ PXA_DCSR_BUSERR | PXA_DCSR_RUN, DCSR);
+}
+
+static void phy_disable(struct pxad_phy *phy)
+{
+ u32 dcsr;
+
+ if (!phy)
+ return;
+
+ dcsr = phy_readl_relaxed(phy, DCSR);
+ dev_dbg(&phy->vchan->vc.chan.dev->device,
+ "%s(): phy=%p(%d)\n", __func__, phy, phy->idx);
+ phy_writel(phy, dcsr & ~PXA_DCSR_RUN & ~PXA_DCSR_STOPIRQEN, DCSR);
+}
+
+static void pxad_launch_chan(struct pxad_chan *chan,
+ struct pxad_desc_sw *desc)
+{
+ dev_dbg(&chan->vc.chan.dev->device,
+ "%s(): desc=%p\n", __func__, desc);
+ if (!chan->phy) {
+ chan->phy = lookup_phy(chan);
+ if (!chan->phy) {
+ dev_dbg(&chan->vc.chan.dev->device,
+ "%s(): no free dma channel\n", __func__);
+ return;
+ }
+ }
+
+ /*
+ * Program the descriptor's address into the DMA controller,
+ * then start the DMA transaction
+ */
+ phy_writel(chan->phy, desc->first, DDADR);
+ phy_enable(chan->phy, chan->misaligned);
+}
+
+static void set_updater_desc(struct pxad_desc_sw *sw_desc,
+ unsigned long flags)
+{
+ struct pxad_desc_hw *updater =
+ sw_desc->hw_desc[sw_desc->nb_desc - 1];
+ dma_addr_t dma = sw_desc->hw_desc[sw_desc->nb_desc - 2]->ddadr;
+
+ updater->ddadr = DDADR_STOP;
+ updater->dsadr = dma;
+ updater->dtadr = dma + 8;
+ updater->dcmd = PXA_DCMD_WIDTH4 | PXA_DCMD_BURST32 |
+ (PXA_DCMD_LENGTH & sizeof(u32));
+ if (flags & DMA_PREP_INTERRUPT)
+ updater->dcmd |= PXA_DCMD_ENDIRQEN;
+}
+
+static bool is_desc_completed(struct virt_dma_desc *vd)
+{
+ struct pxad_desc_sw *sw_desc = to_pxad_sw_desc(vd);
+ struct pxad_desc_hw *updater =
+ sw_desc->hw_desc[sw_desc->nb_desc - 1];
+
+ return updater->dtadr != (updater->dsadr + 8);
+}
+
+static void pxad_desc_chain(struct virt_dma_desc *vd1,
+ struct virt_dma_desc *vd2)
+{
+ struct pxad_desc_sw *desc1 = to_pxad_sw_desc(vd1);
+ struct pxad_desc_sw *desc2 = to_pxad_sw_desc(vd2);
+ dma_addr_t dma_to_chain;
+
+ dma_to_chain = desc2->first;
+ desc1->hw_desc[desc1->nb_desc - 1]->ddadr = dma_to_chain;
+}
+
+static bool pxad_try_hotchain(struct virt_dma_chan *vc,
+ struct virt_dma_desc *vd)
+{
+ struct virt_dma_desc *vd_last_issued = NULL;
+ struct pxad_chan *chan = to_pxad_chan(&vc->chan);
+
+ /*
+ * Attempt to hot chain the tx if the phy is still running. This is
+ * considered successful only if either the channel is still running
+ * after the chaining, or if the chained transfer is completed after
+ * having been hot chained.
+ * A change of alignment is not allowed, and forbids hotchaining.
+ */
+ if (is_chan_running(chan)) {
+ BUG_ON(list_empty(&vc->desc_issued));
+
+ if (!is_running_chan_misaligned(chan) &&
+ to_pxad_sw_desc(vd)->misaligned)
+ return false;
+
+ vd_last_issued = list_entry(vc->desc_issued.prev,
+ struct virt_dma_desc, node);
+ pxad_desc_chain(vd_last_issued, vd);
+ if (is_chan_running(chan) || is_desc_completed(vd_last_issued))
+ return true;
+ }
+
+ return false;
+}
+
+static unsigned int clear_chan_irq(struct pxad_phy *phy)
+{
+ u32 dcsr;
+ u32 dint = readl(phy->base + DINT);
+
+ if (!(dint & BIT(phy->idx)))
+ return PXA_DCSR_RUN;
+
+ /* clear irq */
+ dcsr = phy_readl_relaxed(phy, DCSR);
+ phy_writel(phy, dcsr, DCSR);
+ if ((dcsr & PXA_DCSR_BUSERR) && (phy->vchan))
+ dev_warn(&phy->vchan->vc.chan.dev->device,
+ "%s(chan=%p): PXA_DCSR_BUSERR\n",
+ __func__, &phy->vchan);
+
+ return dcsr & ~PXA_DCSR_RUN;
+}
+
+static irqreturn_t pxad_chan_handler(int irq, void *dev_id)
+{
+ struct pxad_phy *phy = dev_id;
+ struct pxad_chan *chan = phy->vchan;
+ struct virt_dma_desc *vd, *tmp;
+ unsigned int dcsr;
+ unsigned long flags;
+
+ BUG_ON(!chan);
+
+ dcsr = clear_chan_irq(phy);
+ if (dcsr & PXA_DCSR_RUN)
+ return IRQ_NONE;
+
+ spin_lock_irqsave(&chan->vc.lock, flags);
+ list_for_each_entry_safe(vd, tmp, &chan->vc.desc_issued, node) {
+ dev_dbg(&chan->vc.chan.dev->device,
+ "%s(): checking txd %p[%x]: completed=%d\n",
+ __func__, vd, vd->tx.cookie, is_desc_completed(vd));
+ if (is_desc_completed(vd)) {
+ list_del(&vd->node);
+ vchan_cookie_complete(vd);
+ } else {
+ break;
+ }
+ }
+
+ if (dcsr & PXA_DCSR_STOPSTATE) {
+ dev_dbg(&chan->vc.chan.dev->device,
+ "%s(): channel stopped, submitted_empty=%d issued_empty=%d",
+ __func__,
+ list_empty(&chan->vc.desc_submitted),
+ list_empty(&chan->vc.desc_issued));
+ phy_writel_relaxed(phy, dcsr & ~PXA_DCSR_STOPIRQEN, DCSR);
+
+ if (list_empty(&chan->vc.desc_issued)) {
+ chan->misaligned =
+ !list_empty(&chan->vc.desc_submitted);
+ } else {
+ vd = list_first_entry(&chan->vc.desc_issued,
+ struct virt_dma_desc, node);
+ pxad_launch_chan(chan, to_pxad_sw_desc(vd));
+ }
+ }
+ spin_unlock_irqrestore(&chan->vc.lock, flags);
+
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t pxad_int_handler(int irq, void *dev_id)
+{
+ struct pxad_device *pdev = dev_id;
+ struct pxad_phy *phy;
+ u32 dint = readl(pdev->base + DINT);
+ int i, ret = IRQ_NONE;
+
+ while (dint) {
+ i = __ffs(dint);
+ dint &= (dint - 1);
+ phy = &pdev->phys[i];
+ if ((i < 32) && (legacy_reserved & BIT(i)))
+ continue;
+ if (pxad_chan_handler(irq, phy) == IRQ_HANDLED)
+ ret = IRQ_HANDLED;
+ }
+
+ return ret;
+}
+
+static int pxad_alloc_chan_resources(struct dma_chan *dchan)
+{
+ struct pxad_chan *chan = to_pxad_chan(dchan);
+ struct pxad_device *pdev = to_pxad_dev(chan->vc.chan.device);
+
+ if (chan->desc_pool)
+ return 1;
+
+ chan->desc_pool = dma_pool_create(dma_chan_name(dchan),
+ pdev->slave.dev,
+ sizeof(struct pxad_desc_hw),
+ __alignof__(struct pxad_desc_hw),
+ 0);
+ if (!chan->desc_pool) {
+ dev_err(&chan->vc.chan.dev->device,
+ "%s(): unable to allocate descriptor pool\n",
+ __func__);
+ return -ENOMEM;
+ }
+
+ return 1;
+}
+
+static void pxad_free_chan_resources(struct dma_chan *dchan)
+{
+ struct pxad_chan *chan = to_pxad_chan(dchan);
+
+ vchan_free_chan_resources(&chan->vc);
+ dma_pool_destroy(chan->desc_pool);
+ chan->desc_pool = NULL;
+
+}
+
+static void pxad_free_desc(struct virt_dma_desc *vd)
+{
+ int i;
+ dma_addr_t dma;
+ struct pxad_desc_sw *sw_desc = to_pxad_sw_desc(vd);
+
+ BUG_ON(sw_desc->nb_desc == 0);
+ for (i = sw_desc->nb_desc - 1; i >= 0; i--) {
+ if (i > 0)
+ dma = sw_desc->hw_desc[i - 1]->ddadr;
+ else
+ dma = sw_desc->first;
+ dma_pool_free(sw_desc->desc_pool,
+ sw_desc->hw_desc[i], dma);
+ }
+ sw_desc->nb_desc = 0;
+ kfree(sw_desc);
+}
+
+static struct pxad_desc_sw *
+pxad_alloc_desc(struct pxad_chan *chan, unsigned int nb_hw_desc)
+{
+ struct pxad_desc_sw *sw_desc;
+ dma_addr_t dma;
+ int i;
+
+ sw_desc = kzalloc(sizeof(*sw_desc) +
+ nb_hw_desc * sizeof(struct pxad_desc_hw *),
+ GFP_NOWAIT);
+ if (!sw_desc)
+ return NULL;
+ sw_desc->desc_pool = chan->desc_pool;
+
+ for (i = 0; i < nb_hw_desc; i++) {
+ sw_desc->hw_desc[i] = dma_pool_alloc(sw_desc->desc_pool,
+ GFP_NOWAIT, &dma);
+ if (!sw_desc->hw_desc[i]) {
+ dev_err(&chan->vc.chan.dev->device,
+ "%s(): Couldn't allocate the %dth hw_desc from dma_pool %p\n",
+ __func__, i, sw_desc->desc_pool);
+ goto err;
+ }
+
+ if (i == 0)
+ sw_desc->first = dma;
+ else
+ sw_desc->hw_desc[i - 1]->ddadr = dma;
+ sw_desc->nb_desc++;
+ }
+
+ return sw_desc;
+err:
+ pxad_free_desc(&sw_desc->vd);
+ return NULL;
+}
+
+static dma_cookie_t pxad_tx_submit(struct dma_async_tx_descriptor *tx)
+{
+ struct virt_dma_chan *vc = to_virt_chan(tx->chan);
+ struct pxad_chan *chan = to_pxad_chan(&vc->chan);
+ struct virt_dma_desc *vd_chained = NULL,
+ *vd = container_of(tx, struct virt_dma_desc, tx);
+ dma_cookie_t cookie;
+ unsigned long flags;
+
+ set_updater_desc(to_pxad_sw_desc(vd), tx->flags);
+
+ spin_lock_irqsave(&vc->lock, flags);
+ cookie = dma_cookie_assign(tx);
+
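+ /*
+ * If nothing is waiting in the submitted queue, try to append the
+ * descriptor directly to the chain the hardware is running
+ * ("hot" chaining).
+ */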
+ if (list_empty(&vc->desc_submitted) && pxad_try_hotchain(vc, vd)) {
+ list_move_tail(&vd->node, &vc->desc_issued);
+ dev_dbg(&chan->vc.chan.dev->device,
+ "%s(): txd %p[%x]: submitted (hot linked)\n",
+ __func__, vd, cookie);
+ goto out;
+ }
+
+ /*
+ * Fallback to placing the tx in the submitted queue
+ */
+ if (!list_empty(&vc->desc_submitted)) {
+ vd_chained = list_entry(vc->desc_submitted.prev,
+ struct virt_dma_desc, node);
+ /*
+ * Only chain the descriptors if no new misalignment is
+ * introduced. If a new misalignment is chained, let the channel
+ * stop, and be relaunched in misalign mode from the irq
+ * handler.
+ */
+ if (chan->misaligned || !to_pxad_sw_desc(vd)->misaligned)
+ pxad_desc_chain(vd_chained, vd);
+ else
+ vd_chained = NULL;
+ }
+ dev_dbg(&chan->vc.chan.dev->device,
+ "%s(): txd %p[%x]: submitted (%s linked)\n",
+ __func__, vd, cookie, vd_chained ? "cold" : "not");
+ list_move_tail(&vd->node, &vc->desc_submitted);
+ chan->misaligned |= to_pxad_sw_desc(vd)->misaligned;
+
+out:
+ spin_unlock_irqrestore(&vc->lock, flags);
+ return cookie;
+}
+
+static void pxad_issue_pending(struct dma_chan *dchan)
+{
+ struct pxad_chan *chan = to_pxad_chan(dchan);
+ struct virt_dma_desc *vd_first;
+ unsigned long flags;
+
+ spin_lock_irqsave(&chan->vc.lock, flags);
+ if (list_empty(&chan->vc.desc_submitted))
+ goto out;
+
+ vd_first = list_first_entry(&chan->vc.desc_submitted,
+ struct virt_dma_desc, node);
+ dev_dbg(&chan->vc.chan.dev->device,
+ "%s(): txd %p[%x]", __func__, vd_first, vd_first->tx.cookie);
+
+ vchan_issue_pending(&chan->vc);
+ if (!pxad_try_hotchain(&chan->vc, vd_first))
+ pxad_launch_chan(chan, to_pxad_sw_desc(vd_first));
+out:
+ spin_unlock_irqrestore(&chan->vc.lock, flags);
+}
+
+static inline struct dma_async_tx_descriptor *
+pxad_tx_prep(struct virt_dma_chan *vc, struct virt_dma_desc *vd,
+ unsigned long tx_flags)
+{
+ struct dma_async_tx_descriptor *tx;
+ struct pxad_chan *chan = container_of(vc, struct pxad_chan, vc);
+
+ tx = vchan_tx_prep(vc, vd, tx_flags);
+ tx->tx_submit = pxad_tx_submit;
+ dev_dbg(&chan->vc.chan.dev->device,
+ "%s(): vc=%p txd=%p[%x] flags=0x%lx\n", __func__,
+ vc, vd, vd->tx.cookie,
+ tx_flags);
+
+ return tx;
+}
+
+static void pxad_get_config(struct pxad_chan *chan,
+ enum dma_transfer_direction dir,
+ u32 *dcmd, u32 *dev_src, u32 *dev_dst)
+{
+ u32 maxburst = 0, dev_addr = 0;
+ enum dma_slave_buswidth width = DMA_SLAVE_BUSWIDTH_UNDEFINED;
+
+ *dcmd = 0;
+ if (chan->cfg.direction == DMA_DEV_TO_MEM) {
+ maxburst = chan->cfg.src_maxburst;
+ width = chan->cfg.src_addr_width;
+ dev_addr = chan->cfg.src_addr;
+ *dev_src = dev_addr;
+ *dcmd |= PXA_DCMD_INCTRGADDR | PXA_DCMD_FLOWSRC;
+ }
+ if (chan->cfg.direction == DMA_MEM_TO_DEV) {
+ maxburst = chan->cfg.dst_maxburst;
+ width = chan->cfg.dst_addr_width;
+ dev_addr = chan->cfg.dst_addr;
+ *dev_dst = dev_addr;
+ *dcmd |= PXA_DCMD_INCSRCADDR | PXA_DCMD_FLOWTRG;
+ }
+ if (chan->cfg.direction == DMA_MEM_TO_MEM)
+ *dcmd |= PXA_DCMD_BURST32 | PXA_DCMD_INCTRGADDR |
+ PXA_DCMD_INCSRCADDR;
+
+ dev_dbg(&chan->vc.chan.dev->device,
+ "%s(): dev_addr=0x%x maxburst=%d width=%d dir=%d\n",
+ __func__, dev_addr, maxburst, width, dir);
+
+ if (width == DMA_SLAVE_BUSWIDTH_1_BYTE)
+ *dcmd |= PXA_DCMD_WIDTH1;
+ else if (width == DMA_SLAVE_BUSWIDTH_2_BYTES)
+ *dcmd |= PXA_DCMD_WIDTH2;
+ else if (width == DMA_SLAVE_BUSWIDTH_4_BYTES)
+ *dcmd |= PXA_DCMD_WIDTH4;
+
+ if (maxburst == 8)
+ *dcmd |= PXA_DCMD_BURST8;
+ else if (maxburst == 16)
+ *dcmd |= PXA_DCMD_BURST16;
+ else if (maxburst == 32)
+ *dcmd |= PXA_DCMD_BURST32;
+
+ /* FIXME: drivers should be ported over to use the filter
+ * function. Once that's done, the following two lines can
+ * be removed.
+ */
+ if (chan->cfg.slave_id)
+ chan->drcmr = chan->cfg.slave_id;
+}
+
+static struct dma_async_tx_descriptor *
+pxad_prep_memcpy(struct dma_chan *dchan,
+ dma_addr_t dma_dst, dma_addr_t dma_src,
+ size_t len, unsigned long flags)
+{
+ struct pxad_chan *chan = to_pxad_chan(dchan);
+ struct pxad_desc_sw *sw_desc;
+ struct pxad_desc_hw *hw_desc;
+ u32 dcmd;
+ unsigned int i, nb_desc = 0;
+ size_t copy;
+
+ if (!dchan || !len)
+ return NULL;
+
+ dev_dbg(&chan->vc.chan.dev->device,
+ "%s(): dma_dst=0x%lx dma_src=0x%lx len=%zu flags=%lx\n",
+ __func__, (unsigned long)dma_dst, (unsigned long)dma_src,
+ len, flags);
+ pxad_get_config(chan, DMA_MEM_TO_MEM, &dcmd, NULL, NULL);
+
+ nb_desc = DIV_ROUND_UP(len, PDMA_MAX_DESC_BYTES);
+ sw_desc = pxad_alloc_desc(chan, nb_desc + 1);
+ if (!sw_desc)
+ return NULL;
+ sw_desc->len = len;
+
+ if (!IS_ALIGNED(dma_src, 1 << PDMA_ALIGNMENT) ||
+ !IS_ALIGNED(dma_dst, 1 << PDMA_ALIGNMENT))
+ sw_desc->misaligned = true;
+
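+ /* Split the copy into PDMA_MAX_DESC_BYTES sized hardware descriptors. */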
+ i = 0;
+ do {
+ hw_desc = sw_desc->hw_desc[i++];
+ copy = min_t(size_t, len, PDMA_MAX_DESC_BYTES);
+ hw_desc->dcmd = dcmd | (PXA_DCMD_LENGTH & copy);
+ hw_desc->dsadr = dma_src;
+ hw_desc->dtadr = dma_dst;
+ len -= copy;
+ dma_src += copy;
+ dma_dst += copy;
+ } while (len);
+ set_updater_desc(sw_desc, flags);
+
+ return pxad_tx_prep(&chan->vc, &sw_desc->vd, flags);
+}
+
+static struct dma_async_tx_descriptor *
+pxad_prep_slave_sg(struct dma_chan *dchan, struct scatterlist *sgl,
+ unsigned int sg_len, enum dma_transfer_direction dir,
+ unsigned long flags, void *context)
+{
+ struct pxad_chan *chan = to_pxad_chan(dchan);
+ struct pxad_desc_sw *sw_desc;
+ size_t len, avail;
+ struct scatterlist *sg;
+ dma_addr_t dma;
+ u32 dcmd, dsadr = 0, dtadr = 0;
+ unsigned int nb_desc = 0, i, j = 0;
+
+ if ((sgl == NULL) || (sg_len == 0))
+ return NULL;
+
+ pxad_get_config(chan, dir, &dcmd, &dsadr, &dtadr);
+ dev_dbg(&chan->vc.chan.dev->device,
+ "%s(): dir=%d flags=%lx\n", __func__, dir, flags);
+
+ for_each_sg(sgl, sg, sg_len, i)
+ nb_desc += DIV_ROUND_UP(sg_dma_len(sg), PDMA_MAX_DESC_BYTES);
+ sw_desc = pxad_alloc_desc(chan, nb_desc + 1);
+ if (!sw_desc)
+ return NULL;
+
+ for_each_sg(sgl, sg, sg_len, i) {
+ dma = sg_dma_address(sg);
+ avail = sg_dma_len(sg);
+ sw_desc->len += avail;
+
+ do {
+ len = min_t(size_t, avail, PDMA_MAX_DESC_BYTES);
+ if (dma & 0x7)
+ sw_desc->misaligned = true;
+
+ sw_desc->hw_desc[j]->dcmd =
+ dcmd | (PXA_DCMD_LENGTH & len);
+ sw_desc->hw_desc[j]->dsadr = dsadr ? dsadr : dma;
+ sw_desc->hw_desc[j++]->dtadr = dtadr ? dtadr : dma;
+
+ dma += len;
+ avail -= len;
+ } while (avail);
+ }
+ set_updater_desc(sw_desc, flags);
+
+ return pxad_tx_prep(&chan->vc, &sw_desc->vd, flags);
+}
+
+static struct dma_async_tx_descriptor *
+pxad_prep_dma_cyclic(struct dma_chan *dchan,
+ dma_addr_t buf_addr, size_t len, size_t period_len,
+ enum dma_transfer_direction dir, unsigned long flags)
+{
+ struct pxad_chan *chan = to_pxad_chan(dchan);
+ struct pxad_desc_sw *sw_desc;
+ struct pxad_desc_hw **phw_desc;
+ dma_addr_t dma;
+ u32 dcmd, dsadr = 0, dtadr = 0;
+ unsigned int nb_desc = 0;
+
+ if (!dchan || !len || !period_len)
+ return NULL;
+ if ((dir != DMA_DEV_TO_MEM) && (dir != DMA_MEM_TO_DEV)) {
+ dev_err(&chan->vc.chan.dev->device,
+ "Unsupported direction for cyclic DMA\n");
+ return NULL;
+ }
+ /* the buffer length must be a multiple of period_len */
+ if (len % period_len != 0 || period_len > PDMA_MAX_DESC_BYTES ||
+ !IS_ALIGNED(period_len, 1 << PDMA_ALIGNMENT))
+ return NULL;
+
+ pxad_get_config(chan, dir, &dcmd, &dsadr, &dtadr);
+ dcmd |= PXA_DCMD_ENDIRQEN | (PXA_DCMD_LENGTH & period_len);
+ dev_dbg(&chan->vc.chan.dev->device,
+ "%s(): buf_addr=0x%lx len=%zu period=%zu dir=%d flags=%lx\n",
+ __func__, (unsigned long)buf_addr, len, period_len, dir, flags);
+
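+ /* One hw descriptor per period chunk, for each period in the buffer. */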
+ nb_desc = DIV_ROUND_UP(period_len, PDMA_MAX_DESC_BYTES);
+ nb_desc *= DIV_ROUND_UP(len, period_len);
+ sw_desc = pxad_alloc_desc(chan, nb_desc + 1);
+ if (!sw_desc)
+ return NULL;
+ sw_desc->cyclic = true;
+ sw_desc->len = len;
+
+ phw_desc = sw_desc->hw_desc;
+ dma = buf_addr;
+ do {
+ phw_desc[0]->dsadr = dsadr ? dsadr : dma;
+ phw_desc[0]->dtadr = dtadr ? dtadr : dma;
+ phw_desc[0]->dcmd = dcmd;
+ phw_desc++;
+ dma += period_len;
+ len -= period_len;
+ } while (len);
+ set_updater_desc(sw_desc, flags);
+
+ return pxad_tx_prep(&chan->vc, &sw_desc->vd, flags);
+}
+
+static int pxad_config(struct dma_chan *dchan,
+ struct dma_slave_config *cfg)
+{
+ struct pxad_chan *chan = to_pxad_chan(dchan);
+
+ if (!dchan)
+ return -EINVAL;
+
+ chan->cfg = *cfg;
+ return 0;
+}
+
+static int pxad_terminate_all(struct dma_chan *dchan)
+{
+ struct pxad_chan *chan = to_pxad_chan(dchan);
+ struct pxad_device *pdev = to_pxad_dev(chan->vc.chan.device);
+ struct virt_dma_desc *vd = NULL;
+ unsigned long flags;
+ struct pxad_phy *phy;
+ LIST_HEAD(head);
+
+ dev_dbg(&chan->vc.chan.dev->device,
+ "%s(): vchan %p: terminate all\n", __func__, &chan->vc);
+
+ spin_lock_irqsave(&chan->vc.lock, flags);
+ vchan_get_all_descriptors(&chan->vc, &head);
+
+ list_for_each_entry(vd, &head, node) {
+ dev_dbg(&chan->vc.chan.dev->device,
+ "%s(): cancelling txd %p[%x] (completed=%d)", __func__,
+ vd, vd->tx.cookie, is_desc_completed(vd));
+ }
+
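+ /* Stop the physical channel and detach it from this virtual channel. */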
+ phy = chan->phy;
+ if (phy) {
+ phy_disable(chan->phy);
+ pxad_free_phy(chan);
+ chan->phy = NULL;
+ spin_lock(&pdev->phy_lock);
+ phy->vchan = NULL;
+ spin_unlock(&pdev->phy_lock);
+ }
+ spin_unlock_irqrestore(&chan->vc.lock, flags);
+ vchan_dma_desc_free_list(&chan->vc, &head);
+
+ return 0;
+}
+
+static unsigned int pxad_residue(struct pxad_chan *chan,
+ dma_cookie_t cookie)
+{
+ struct virt_dma_desc *vd = NULL;
+ struct pxad_desc_sw *sw_desc = NULL;
+ struct pxad_desc_hw *hw_desc = NULL;
+ u32 curr, start, len, end, residue = 0;
+ unsigned long flags;
+ bool passed = false;
+ int i;
+
+ /*
+ * If the channel does not have a phy pointer anymore, it has already
+ * been completed. Therefore, its residue is 0.
+ */
+ if (!chan->phy)
+ return 0;
+
+ spin_lock_irqsave(&chan->vc.lock, flags);
+
+ vd = vchan_find_desc(&chan->vc, cookie);
+ if (!vd)
+ goto out;
+
+ sw_desc = to_pxad_sw_desc(vd);
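+ /*
+ * The pointer that advances through memory is the source address
+ * when the source is incrementing, the target address otherwise.
+ */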
+ if (sw_desc->hw_desc[0]->dcmd & PXA_DCMD_INCSRCADDR)
+ curr = phy_readl_relaxed(chan->phy, DSADR);
+ else
+ curr = phy_readl_relaxed(chan->phy, DTADR);
+
+ for (i = 0; i < sw_desc->nb_desc - 1; i++) {
+ hw_desc = sw_desc->hw_desc[i];
+ if (sw_desc->hw_desc[0]->dcmd & PXA_DCMD_INCSRCADDR)
+ start = hw_desc->dsadr;
+ else
+ start = hw_desc->dtadr;
+ len = hw_desc->dcmd & PXA_DCMD_LENGTH;
+ end = start + len;
+
+ /*
+ * 'passed' is latched once we find the descriptor whose
+ * boundaries contain the current transfer pointer. All
+ * descriptors that occur in the list _after_ that partially
+ * handled descriptor are still to be processed and are hence
+ * added to the residual bytes counter.
+ */
+
+ if (passed) {
+ residue += len;
+ } else if (curr >= start && curr <= end) {
+ residue += end - curr;
+ passed = true;
+ }
+ }
+ if (!passed)
+ residue = sw_desc->len;
+
+out:
+ spin_unlock_irqrestore(&chan->vc.lock, flags);
+ dev_dbg(&chan->vc.chan.dev->device,
+ "%s(): txd %p[%x] sw_desc=%p: %d\n",
+ __func__, vd, cookie, sw_desc, residue);
+ return residue;
+}
+
+static enum dma_status pxad_tx_status(struct dma_chan *dchan,
+ dma_cookie_t cookie,
+ struct dma_tx_state *txstate)
+{
+ struct pxad_chan *chan = to_pxad_chan(dchan);
+ enum dma_status ret;
+
+ ret = dma_cookie_status(dchan, cookie, txstate);
+ if (likely(txstate && (ret != DMA_ERROR)))
+ dma_set_residue(txstate, pxad_residue(chan, cookie));
+
+ return ret;
+}
+
+static void pxad_free_channels(struct dma_device *dmadev)
+{
+ struct pxad_chan *c, *cn;
+
+ list_for_each_entry_safe(c, cn, &dmadev->channels,
+ vc.chan.device_node) {
+ list_del(&c->vc.chan.device_node);
+ tasklet_kill(&c->vc.task);
+ }
+}
+
+static int pxad_remove(struct platform_device *op)
+{
+ struct pxad_device *pdev = platform_get_drvdata(op);
+
+ pxad_cleanup_debugfs(pdev);
+ pxad_free_channels(&pdev->slave);
+ dma_async_device_unregister(&pdev->slave);
+ return 0;
+}
+
+static int pxad_init_phys(struct platform_device *op,
+ struct pxad_device *pdev,
+ unsigned int nb_phy_chans)
+{
+ int irq0, irq, nr_irq = 0, i, ret = 0;
+ struct pxad_phy *phy;
+
+ irq0 = platform_get_irq(op, 0);
+ if (irq0 < 0)
+ return irq0;
+
+ pdev->phys = devm_kcalloc(&op->dev, nb_phy_chans,
+ sizeof(pdev->phys[0]), GFP_KERNEL);
+ if (!pdev->phys)
+ return -ENOMEM;
+
+ for (i = 0; i < nb_phy_chans; i++)
+ if (platform_get_irq(op, i) > 0)
+ nr_irq++;
+
+ for (i = 0; i < nb_phy_chans; i++) {
+ phy = &pdev->phys[i];
+ phy->base = pdev->base;
+ phy->idx = i;
+ irq = platform_get_irq(op, i);
+ if ((nr_irq > 1) && (irq > 0))
+ ret = devm_request_irq(&op->dev, irq,
+ pxad_chan_handler,
+ IRQF_SHARED, "pxa-dma", phy);
+ if ((nr_irq == 1) && (i == 0))
+ ret = devm_request_irq(&op->dev, irq0,
+ pxad_int_handler,
+ IRQF_SHARED, "pxa-dma", pdev);
+ if (ret) {
+ dev_err(pdev->slave.dev,
+ "%s(): can't request irq %d:%d\n", __func__,
+ irq, ret);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static const struct of_device_id pxad_dt_ids[] = {
+ { .compatible = "marvell,pdma-1.0", },
+ {}
+};
+MODULE_DEVICE_TABLE(of, pxad_dt_ids);
+
+static struct dma_chan *pxad_dma_xlate(struct of_phandle_args *dma_spec,
+ struct of_dma *ofdma)
+{
+ struct pxad_device *d = ofdma->of_dma_data;
+ struct dma_chan *chan;
+
+ chan = dma_get_any_slave_channel(&d->slave);
+ if (!chan)
+ return NULL;
+
+ to_pxad_chan(chan)->drcmr = dma_spec->args[0];
+ to_pxad_chan(chan)->prio = dma_spec->args[1];
+
+ return chan;
+}
+
+static int pxad_init_dmadev(struct platform_device *op,
+ struct pxad_device *pdev,
+ unsigned int nr_phy_chans)
+{
+ int ret;
+ unsigned int i;
+ struct pxad_chan *c;
+
+ pdev->nr_chans = nr_phy_chans;
+ INIT_LIST_HEAD(&pdev->slave.channels);
+ pdev->slave.device_alloc_chan_resources = pxad_alloc_chan_resources;
+ pdev->slave.device_free_chan_resources = pxad_free_chan_resources;
+ pdev->slave.device_tx_status = pxad_tx_status;
+ pdev->slave.device_issue_pending = pxad_issue_pending;
+ pdev->slave.device_config = pxad_config;
+ pdev->slave.device_terminate_all = pxad_terminate_all;
+
+ if (op->dev.coherent_dma_mask)
+ dma_set_mask(&op->dev, op->dev.coherent_dma_mask);
+ else
+ dma_set_mask(&op->dev, DMA_BIT_MASK(32));
+
+ ret = pxad_init_phys(op, pdev, nr_phy_chans);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < nr_phy_chans; i++) {
+ c = devm_kzalloc(&op->dev, sizeof(*c), GFP_KERNEL);
+ if (!c)
+ return -ENOMEM;
+ c->vc.desc_free = pxad_free_desc;
+ vchan_init(&c->vc, &pdev->slave);
+ }
+
+ return dma_async_device_register(&pdev->slave);
+}
+
+static int pxad_probe(struct platform_device *op)
+{
+ struct pxad_device *pdev;
+ const struct of_device_id *of_id;
+ struct mmp_dma_platdata *pdata = dev_get_platdata(&op->dev);
+ struct resource *iores;
+ int ret, dma_channels = 0;
+ const enum dma_slave_buswidth widths =
+ DMA_SLAVE_BUSWIDTH_1_BYTE | DMA_SLAVE_BUSWIDTH_2_BYTES |
+ DMA_SLAVE_BUSWIDTH_4_BYTES;
+
+ pdev = devm_kzalloc(&op->dev, sizeof(*pdev), GFP_KERNEL);
+ if (!pdev)
+ return -ENOMEM;
+
+ spin_lock_init(&pdev->phy_lock);
+
+ iores = platform_get_resource(op, IORESOURCE_MEM, 0);
+ pdev->base = devm_ioremap_resource(&op->dev, iores);
+ if (IS_ERR(pdev->base))
+ return PTR_ERR(pdev->base);
+
+ of_id = of_match_device(pxad_dt_ids, &op->dev);
+ if (of_id)
+ of_property_read_u32(op->dev.of_node, "#dma-channels",
+ &dma_channels);
+ else if (pdata && pdata->dma_channels)
+ dma_channels = pdata->dma_channels;
+ else
+ dma_channels = 32; /* default 32 channel */
+
+ dma_cap_set(DMA_SLAVE, pdev->slave.cap_mask);
+ dma_cap_set(DMA_MEMCPY, pdev->slave.cap_mask);
+ dma_cap_set(DMA_CYCLIC, pdev->slave.cap_mask);
+ dma_cap_set(DMA_PRIVATE, pdev->slave.cap_mask);
+ pdev->slave.device_prep_dma_memcpy = pxad_prep_memcpy;
+ pdev->slave.device_prep_slave_sg = pxad_prep_slave_sg;
+ pdev->slave.device_prep_dma_cyclic = pxad_prep_dma_cyclic;
+
+ pdev->slave.copy_align = PDMA_ALIGNMENT;
+ pdev->slave.src_addr_widths = widths;
+ pdev->slave.dst_addr_widths = widths;
+ pdev->slave.directions = BIT(DMA_MEM_TO_DEV) | BIT(DMA_DEV_TO_MEM);
+ pdev->slave.residue_granularity = DMA_RESIDUE_GRANULARITY_DESCRIPTOR;
+
+ pdev->slave.dev = &op->dev;
+ ret = pxad_init_dmadev(op, pdev, dma_channels);
+ if (ret) {
+ dev_err(pdev->slave.dev, "unable to register\n");
+ return ret;
+ }
+
+ if (op->dev.of_node) {
+ /* Device-tree DMA controller registration */
+ ret = of_dma_controller_register(op->dev.of_node,
+ pxad_dma_xlate, pdev);
+ if (ret < 0) {
+ dev_err(pdev->slave.dev,
+ "of_dma_controller_register failed\n");
+ return ret;
+ }
+ }
+
+ platform_set_drvdata(op, pdev);
+ pxad_init_debugfs(pdev);
+ dev_info(pdev->slave.dev, "initialized %d channels\n", dma_channels);
+ return 0;
+}
+
+static const struct platform_device_id pxad_id_table[] = {
+ { "pxa-dma", },
+ { },
+};
+
+static struct platform_driver pxad_driver = {
+ .driver = {
+ .name = "pxa-dma",
+ .of_match_table = pxad_dt_ids,
+ },
+ .id_table = pxad_id_table,
+ .probe = pxad_probe,
+ .remove = pxad_remove,
+};
+
+bool pxad_filter_fn(struct dma_chan *chan, void *param)
+{
+ struct pxad_chan *c = to_pxad_chan(chan);
+ struct pxad_param *p = param;
+
+ if (chan->device->dev->driver != &pxad_driver.driver)
+ return false;
+
+ c->drcmr = p->drcmr;
+ c->prio = p->prio;
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(pxad_filter_fn);
+
+int pxad_toggle_reserved_channel(int legacy_channel)
+{
+ if (legacy_unavailable & (BIT(legacy_channel)))
+ return -EBUSY;
+ legacy_reserved ^= BIT(legacy_channel);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(pxad_toggle_reserved_channel);
+
+module_platform_driver(pxad_driver);
+
+MODULE_DESCRIPTION("Marvell PXA Peripheral DMA Driver");
+MODULE_AUTHOR("Robert Jarzmik <robert.jarzmik@free.fr>");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/dma/s3c24xx-dma.c b/drivers/dma/s3c24xx-dma.c
index 01dcaf21b988..17ccdfd28f37 100644
--- a/drivers/dma/s3c24xx-dma.c
+++ b/drivers/dma/s3c24xx-dma.c
@@ -1168,7 +1168,7 @@ static struct soc_data soc_s3c2443 = {
.has_clocks = true,
};
-static struct platform_device_id s3c24xx_dma_driver_ids[] = {
+static const struct platform_device_id s3c24xx_dma_driver_ids[] = {
{
.name = "s3c2410-dma",
.driver_data = (kernel_ulong_t)&soc_s3c2410,
diff --git a/drivers/dma/sh/rcar-dmac.c b/drivers/dma/sh/rcar-dmac.c
index e0302c784ba4..7820d07e7bee 100644
--- a/drivers/dma/sh/rcar-dmac.c
+++ b/drivers/dma/sh/rcar-dmac.c
@@ -183,7 +183,7 @@ struct rcar_dmac {
unsigned int n_channels;
struct rcar_dmac_chan *channels;
- unsigned long modules[256 / BITS_PER_LONG];
+ DECLARE_BITMAP(modules, 256);
};
#define to_rcar_dmac(d) container_of(d, struct rcar_dmac, engine)
diff --git a/drivers/dma/sh/shdma-r8a73a4.c b/drivers/dma/sh/shdma-r8a73a4.c
index 4fb99970a3ea..96ea3828c3eb 100644
--- a/drivers/dma/sh/shdma-r8a73a4.c
+++ b/drivers/dma/sh/shdma-r8a73a4.c
@@ -11,7 +11,7 @@
#include "shdma-arm.h"
-const unsigned int dma_ts_shift[] = SH_DMAE_TS_SHIFT;
+static const unsigned int dma_ts_shift[] = SH_DMAE_TS_SHIFT;
static const struct sh_dmae_slave_config dma_slaves[] = {
{
diff --git a/drivers/dma/sirf-dma.c b/drivers/dma/sirf-dma.c
index a1afda43b8ef..8c5186cc9f63 100644
--- a/drivers/dma/sirf-dma.c
+++ b/drivers/dma/sirf-dma.c
@@ -23,8 +23,13 @@
#include "dmaengine.h"
+#define SIRFSOC_DMA_VER_A7V1 1
+#define SIRFSOC_DMA_VER_A7V2 2
+#define SIRFSOC_DMA_VER_A6 4
+
#define SIRFSOC_DMA_DESCRIPTORS 16
#define SIRFSOC_DMA_CHANNELS 16
+#define SIRFSOC_DMA_TABLE_NUM 256
#define SIRFSOC_DMA_CH_ADDR 0x00
#define SIRFSOC_DMA_CH_XLEN 0x04
@@ -35,15 +40,44 @@
#define SIRFSOC_DMA_CH_VALID 0x140
#define SIRFSOC_DMA_CH_INT 0x144
#define SIRFSOC_DMA_INT_EN 0x148
-#define SIRFSOC_DMA_INT_EN_CLR 0x14C
+#define SIRFSOC_DMA_INT_EN_CLR 0x14C
#define SIRFSOC_DMA_CH_LOOP_CTRL 0x150
-#define SIRFSOC_DMA_CH_LOOP_CTRL_CLR 0x15C
+#define SIRFSOC_DMA_CH_LOOP_CTRL_CLR 0x154
+#define SIRFSOC_DMA_WIDTH_ATLAS7 0x10
+#define SIRFSOC_DMA_VALID_ATLAS7 0x14
+#define SIRFSOC_DMA_INT_ATLAS7 0x18
+#define SIRFSOC_DMA_INT_EN_ATLAS7 0x1c
+#define SIRFSOC_DMA_LOOP_CTRL_ATLAS7 0x20
+#define SIRFSOC_DMA_CUR_DATA_ADDR 0x34
+#define SIRFSOC_DMA_MUL_ATLAS7 0x38
+#define SIRFSOC_DMA_CH_LOOP_CTRL_ATLAS7 0x158
+#define SIRFSOC_DMA_CH_LOOP_CTRL_CLR_ATLAS7 0x15C
+#define SIRFSOC_DMA_IOBG_SCMD_EN 0x800
+#define SIRFSOC_DMA_EARLY_RESP_SET 0x818
+#define SIRFSOC_DMA_EARLY_RESP_CLR 0x81C
#define SIRFSOC_DMA_MODE_CTRL_BIT 4
#define SIRFSOC_DMA_DIR_CTRL_BIT 5
+#define SIRFSOC_DMA_MODE_CTRL_BIT_ATLAS7 2
+#define SIRFSOC_DMA_CHAIN_CTRL_BIT_ATLAS7 3
+#define SIRFSOC_DMA_DIR_CTRL_BIT_ATLAS7 4
+#define SIRFSOC_DMA_TAB_NUM_ATLAS7 7
+#define SIRFSOC_DMA_CHAIN_INT_BIT_ATLAS7 5
+#define SIRFSOC_DMA_CHAIN_FLAG_SHIFT_ATLAS7 25
+#define SIRFSOC_DMA_CHAIN_ADDR_SHIFT 32
+
+#define SIRFSOC_DMA_INT_FINI_INT_ATLAS7 BIT(0)
+#define SIRFSOC_DMA_INT_CNT_INT_ATLAS7 BIT(1)
+#define SIRFSOC_DMA_INT_PAU_INT_ATLAS7 BIT(2)
+#define SIRFSOC_DMA_INT_LOOP_INT_ATLAS7 BIT(3)
+#define SIRFSOC_DMA_INT_INV_INT_ATLAS7 BIT(4)
+#define SIRFSOC_DMA_INT_END_INT_ATLAS7 BIT(5)
+#define SIRFSOC_DMA_INT_ALL_ATLAS7 0x3F
/* xlen and dma_width register is in 4 bytes boundary */
#define SIRFSOC_DMA_WORD_LEN 4
+#define SIRFSOC_DMA_XLEN_MAX_V1 0x800
+#define SIRFSOC_DMA_XLEN_MAX_V2 0x1000
struct sirfsoc_dma_desc {
struct dma_async_tx_descriptor desc;
@@ -56,7 +90,9 @@ struct sirfsoc_dma_desc {
int width; /* DMA width */
int dir;
bool cyclic; /* is loop DMA? */
+ bool chain; /* is chain DMA? */
u32 addr; /* DMA buffer address */
+ u64 chain_table[SIRFSOC_DMA_TABLE_NUM]; /* chain tbl */
};
struct sirfsoc_dma_chan {
@@ -87,10 +123,25 @@ struct sirfsoc_dma {
void __iomem *base;
int irq;
struct clk *clk;
- bool is_marco;
+ int type;
+ void (*exec_desc)(struct sirfsoc_dma_desc *sdesc,
+ int cid, int burst_mode, void __iomem *base);
struct sirfsoc_dma_regs regs_save;
};
+struct sirfsoc_dmadata {
+ void (*exec)(struct sirfsoc_dma_desc *sdesc,
+ int cid, int burst_mode, void __iomem *base);
+ int type;
+};
+
+enum sirfsoc_dma_chain_flag {
+ SIRFSOC_DMA_CHAIN_NORMAL = 0x01,
+ SIRFSOC_DMA_CHAIN_PAUSE = 0x02,
+ SIRFSOC_DMA_CHAIN_LOOP = 0x03,
+ SIRFSOC_DMA_CHAIN_END = 0x04
+};
+
#define DRV_NAME "sirfsoc_dma"
static int sirfsoc_dma_runtime_suspend(struct device *dev);
@@ -109,48 +160,105 @@ static inline struct sirfsoc_dma *dma_chan_to_sirfsoc_dma(struct dma_chan *c)
return container_of(schan, struct sirfsoc_dma, channels[c->chan_id]);
}
+static void sirfsoc_dma_execute_hw_a7v2(struct sirfsoc_dma_desc *sdesc,
+ int cid, int burst_mode, void __iomem *base)
+{
+ if (sdesc->chain) {
+ /* DMA v2 HW chain mode */
+ writel_relaxed((sdesc->dir << SIRFSOC_DMA_DIR_CTRL_BIT_ATLAS7) |
+ (sdesc->chain <<
+ SIRFSOC_DMA_CHAIN_CTRL_BIT_ATLAS7) |
+ (0x8 << SIRFSOC_DMA_TAB_NUM_ATLAS7) | 0x3,
+ base + SIRFSOC_DMA_CH_CTRL);
+ } else {
+ /* DMA v2 legacy mode */
+ writel_relaxed(sdesc->xlen, base + SIRFSOC_DMA_CH_XLEN);
+ writel_relaxed(sdesc->ylen, base + SIRFSOC_DMA_CH_YLEN);
+ writel_relaxed(sdesc->width, base + SIRFSOC_DMA_WIDTH_ATLAS7);
+ writel_relaxed((sdesc->width*((sdesc->ylen+1)>>1)),
+ base + SIRFSOC_DMA_MUL_ATLAS7);
+ writel_relaxed((sdesc->dir << SIRFSOC_DMA_DIR_CTRL_BIT_ATLAS7) |
+ (sdesc->chain <<
+ SIRFSOC_DMA_CHAIN_CTRL_BIT_ATLAS7) |
+ 0x3, base + SIRFSOC_DMA_CH_CTRL);
+ }
+ writel_relaxed(sdesc->chain ? SIRFSOC_DMA_INT_END_INT_ATLAS7 :
+ (SIRFSOC_DMA_INT_FINI_INT_ATLAS7 |
+ SIRFSOC_DMA_INT_LOOP_INT_ATLAS7),
+ base + SIRFSOC_DMA_INT_EN_ATLAS7);
+ writel(sdesc->addr, base + SIRFSOC_DMA_CH_ADDR);
+ if (sdesc->cyclic)
+ writel(0x10001, base + SIRFSOC_DMA_LOOP_CTRL_ATLAS7);
+}
+
+static void sirfsoc_dma_execute_hw_a7v1(struct sirfsoc_dma_desc *sdesc,
+ int cid, int burst_mode, void __iomem *base)
+{
+ writel_relaxed(1, base + SIRFSOC_DMA_IOBG_SCMD_EN);
+ writel_relaxed((1 << cid), base + SIRFSOC_DMA_EARLY_RESP_SET);
+ writel_relaxed(sdesc->width, base + SIRFSOC_DMA_WIDTH_0 + cid * 4);
+ writel_relaxed(cid | (burst_mode << SIRFSOC_DMA_MODE_CTRL_BIT) |
+ (sdesc->dir << SIRFSOC_DMA_DIR_CTRL_BIT),
+ base + cid * 0x10 + SIRFSOC_DMA_CH_CTRL);
+ writel_relaxed(sdesc->xlen, base + cid * 0x10 + SIRFSOC_DMA_CH_XLEN);
+ writel_relaxed(sdesc->ylen, base + cid * 0x10 + SIRFSOC_DMA_CH_YLEN);
+ writel_relaxed(readl_relaxed(base + SIRFSOC_DMA_INT_EN) |
+ (1 << cid), base + SIRFSOC_DMA_INT_EN);
+ writel(sdesc->addr >> 2, base + cid * 0x10 + SIRFSOC_DMA_CH_ADDR);
+ if (sdesc->cyclic) {
+ writel((1 << cid) | 1 << (cid + 16) |
+ readl_relaxed(base + SIRFSOC_DMA_CH_LOOP_CTRL_ATLAS7),
+ base + SIRFSOC_DMA_CH_LOOP_CTRL_ATLAS7);
+ }
+
+}
+
+static void sirfsoc_dma_execute_hw_a6(struct sirfsoc_dma_desc *sdesc,
+ int cid, int burst_mode, void __iomem *base)
+{
+ writel_relaxed(sdesc->width, base + SIRFSOC_DMA_WIDTH_0 + cid * 4);
+ writel_relaxed(cid | (burst_mode << SIRFSOC_DMA_MODE_CTRL_BIT) |
+ (sdesc->dir << SIRFSOC_DMA_DIR_CTRL_BIT),
+ base + cid * 0x10 + SIRFSOC_DMA_CH_CTRL);
+ writel_relaxed(sdesc->xlen, base + cid * 0x10 + SIRFSOC_DMA_CH_XLEN);
+ writel_relaxed(sdesc->ylen, base + cid * 0x10 + SIRFSOC_DMA_CH_YLEN);
+ writel_relaxed(readl_relaxed(base + SIRFSOC_DMA_INT_EN) |
+ (1 << cid), base + SIRFSOC_DMA_INT_EN);
+ writel(sdesc->addr >> 2, base + cid * 0x10 + SIRFSOC_DMA_CH_ADDR);
+ if (sdesc->cyclic) {
+ writel((1 << cid) | 1 << (cid + 16) |
+ readl_relaxed(base + SIRFSOC_DMA_CH_LOOP_CTRL),
+ base + SIRFSOC_DMA_CH_LOOP_CTRL);
+ }
+
+}
+
/* Execute all queued DMA descriptors */
static void sirfsoc_dma_execute(struct sirfsoc_dma_chan *schan)
{
struct sirfsoc_dma *sdma = dma_chan_to_sirfsoc_dma(&schan->chan);
int cid = schan->chan.chan_id;
struct sirfsoc_dma_desc *sdesc = NULL;
+ void __iomem *base;
/*
* lock has been held by functions calling this, so we don't hold
* lock again
*/
-
+ base = sdma->base;
sdesc = list_first_entry(&schan->queued, struct sirfsoc_dma_desc,
- node);
+ node);
/* Move the first queued descriptor to active list */
list_move_tail(&sdesc->node, &schan->active);
- /* Start the DMA transfer */
- writel_relaxed(sdesc->width, sdma->base + SIRFSOC_DMA_WIDTH_0 +
- cid * 4);
- writel_relaxed(cid | (schan->mode << SIRFSOC_DMA_MODE_CTRL_BIT) |
- (sdesc->dir << SIRFSOC_DMA_DIR_CTRL_BIT),
- sdma->base + cid * 0x10 + SIRFSOC_DMA_CH_CTRL);
- writel_relaxed(sdesc->xlen, sdma->base + cid * 0x10 +
- SIRFSOC_DMA_CH_XLEN);
- writel_relaxed(sdesc->ylen, sdma->base + cid * 0x10 +
- SIRFSOC_DMA_CH_YLEN);
- writel_relaxed(readl_relaxed(sdma->base + SIRFSOC_DMA_INT_EN) |
- (1 << cid), sdma->base + SIRFSOC_DMA_INT_EN);
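+ /*
+ * The Atlas7 v2 controller drives a single channel, so its
+ * registers are not indexed by cid.
+ */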
+ if (sdma->type == SIRFSOC_DMA_VER_A7V2)
+ cid = 0;
- /*
- * writel has an implict memory write barrier to make sure data is
- * flushed into memory before starting DMA
- */
- writel(sdesc->addr >> 2, sdma->base + cid * 0x10 + SIRFSOC_DMA_CH_ADDR);
+ /* Start the DMA transfer */
+ sdma->exec_desc(sdesc, cid, schan->mode, base);
- if (sdesc->cyclic) {
- writel((1 << cid) | 1 << (cid + 16) |
- readl_relaxed(sdma->base + SIRFSOC_DMA_CH_LOOP_CTRL),
- sdma->base + SIRFSOC_DMA_CH_LOOP_CTRL);
+ if (sdesc->cyclic)
schan->happened_cyclic = schan->completed_cyclic = 0;
- }
}
/* Interrupt handler */
@@ -160,27 +268,65 @@ static irqreturn_t sirfsoc_dma_irq(int irq, void *data)
struct sirfsoc_dma_chan *schan;
struct sirfsoc_dma_desc *sdesc = NULL;
u32 is;
+ bool chain;
int ch;
+ void __iomem *reg;
+
+ switch (sdma->type) {
+ case SIRFSOC_DMA_VER_A6:
+ case SIRFSOC_DMA_VER_A7V1:
+ is = readl(sdma->base + SIRFSOC_DMA_CH_INT);
+ reg = sdma->base + SIRFSOC_DMA_CH_INT;
+ while ((ch = fls(is) - 1) >= 0) {
+ is &= ~(1 << ch);
+ writel_relaxed(1 << ch, reg);
+ schan = &sdma->channels[ch];
+ spin_lock(&schan->lock);
+ sdesc = list_first_entry(&schan->active,
+ struct sirfsoc_dma_desc, node);
+ if (!sdesc->cyclic) {
+ /* Execute queued descriptors */
+ list_splice_tail_init(&schan->active,
+ &schan->completed);
+ dma_cookie_complete(&sdesc->desc);
+ if (!list_empty(&schan->queued))
+ sirfsoc_dma_execute(schan);
+ } else
+ schan->happened_cyclic++;
+ spin_unlock(&schan->lock);
+ }
+ break;
- is = readl(sdma->base + SIRFSOC_DMA_CH_INT);
- while ((ch = fls(is) - 1) >= 0) {
- is &= ~(1 << ch);
- writel_relaxed(1 << ch, sdma->base + SIRFSOC_DMA_CH_INT);
- schan = &sdma->channels[ch];
+ case SIRFSOC_DMA_VER_A7V2:
+ is = readl(sdma->base + SIRFSOC_DMA_INT_ATLAS7);
+ reg = sdma->base + SIRFSOC_DMA_INT_ATLAS7;
+ writel_relaxed(SIRFSOC_DMA_INT_ALL_ATLAS7, reg);
+ schan = &sdma->channels[0];
spin_lock(&schan->lock);
-
- sdesc = list_first_entry(&schan->active, struct sirfsoc_dma_desc,
- node);
+ sdesc = list_first_entry(&schan->active,
+ struct sirfsoc_dma_desc, node);
if (!sdesc->cyclic) {
- /* Execute queued descriptors */
- list_splice_tail_init(&schan->active, &schan->completed);
- if (!list_empty(&schan->queued))
- sirfsoc_dma_execute(schan);
- } else
+ chain = sdesc->chain;
+ if ((chain && (is & SIRFSOC_DMA_INT_END_INT_ATLAS7)) ||
+ (!chain &&
+ (is & SIRFSOC_DMA_INT_FINI_INT_ATLAS7))) {
+ /* Execute queued descriptors */
+ list_splice_tail_init(&schan->active,
+ &schan->completed);
+ dma_cookie_complete(&sdesc->desc);
+ if (!list_empty(&schan->queued))
+ sirfsoc_dma_execute(schan);
+ }
+ } else if (sdesc->cyclic && (is &
+ SIRFSOC_DMA_INT_LOOP_INT_ATLAS7))
schan->happened_cyclic++;
spin_unlock(&schan->lock);
+ break;
+
+ default:
+ break;
}
/* Schedule tasklet */
@@ -227,16 +373,15 @@ static void sirfsoc_dma_process_completed(struct sirfsoc_dma *sdma)
schan->chan.completed_cookie = last_cookie;
spin_unlock_irqrestore(&schan->lock, flags);
} else {
- /* for cyclic channel, desc is always in active list */
- sdesc = list_first_entry(&schan->active, struct sirfsoc_dma_desc,
- node);
-
- if (!sdesc || (sdesc && !sdesc->cyclic)) {
- /* without active cyclic DMA */
+ if (list_empty(&schan->active)) {
spin_unlock_irqrestore(&schan->lock, flags);
continue;
}
+ /* for cyclic channel, desc is always in active list */
+ sdesc = list_first_entry(&schan->active,
+ struct sirfsoc_dma_desc, node);
+
/* cyclic DMA */
happened_cyclic = schan->happened_cyclic;
spin_unlock_irqrestore(&schan->lock, flags);
@@ -307,20 +452,32 @@ static int sirfsoc_dma_terminate_all(struct dma_chan *chan)
spin_lock_irqsave(&schan->lock, flags);
- if (!sdma->is_marco) {
- writel_relaxed(readl_relaxed(sdma->base + SIRFSOC_DMA_INT_EN) &
- ~(1 << cid), sdma->base + SIRFSOC_DMA_INT_EN);
- writel_relaxed(readl_relaxed(sdma->base + SIRFSOC_DMA_CH_LOOP_CTRL)
- & ~((1 << cid) | 1 << (cid + 16)),
- sdma->base + SIRFSOC_DMA_CH_LOOP_CTRL);
- } else {
+ switch (sdma->type) {
+ case SIRFSOC_DMA_VER_A7V1:
writel_relaxed(1 << cid, sdma->base + SIRFSOC_DMA_INT_EN_CLR);
writel_relaxed((1 << cid) | 1 << (cid + 16),
- sdma->base + SIRFSOC_DMA_CH_LOOP_CTRL_CLR);
+ sdma->base +
+ SIRFSOC_DMA_CH_LOOP_CTRL_CLR_ATLAS7);
+ writel_relaxed(1 << cid, sdma->base + SIRFSOC_DMA_CH_VALID);
+ break;
+ case SIRFSOC_DMA_VER_A7V2:
+ writel_relaxed(0, sdma->base + SIRFSOC_DMA_INT_EN_ATLAS7);
+ writel_relaxed(0, sdma->base + SIRFSOC_DMA_LOOP_CTRL_ATLAS7);
+ writel_relaxed(0, sdma->base + SIRFSOC_DMA_VALID_ATLAS7);
+ break;
+ case SIRFSOC_DMA_VER_A6:
+ writel_relaxed(readl_relaxed(sdma->base + SIRFSOC_DMA_INT_EN) &
+ ~(1 << cid), sdma->base + SIRFSOC_DMA_INT_EN);
+ writel_relaxed(readl_relaxed(sdma->base +
+ SIRFSOC_DMA_CH_LOOP_CTRL) &
+ ~((1 << cid) | 1 << (cid + 16)),
+ sdma->base + SIRFSOC_DMA_CH_LOOP_CTRL);
+ writel_relaxed(1 << cid, sdma->base + SIRFSOC_DMA_CH_VALID);
+ break;
+ default:
+ break;
}
- writel_relaxed(1 << cid, sdma->base + SIRFSOC_DMA_CH_VALID);
-
list_splice_tail_init(&schan->active, &schan->free);
list_splice_tail_init(&schan->queued, &schan->free);
@@ -338,13 +495,25 @@ static int sirfsoc_dma_pause_chan(struct dma_chan *chan)
spin_lock_irqsave(&schan->lock, flags);
- if (!sdma->is_marco)
- writel_relaxed(readl_relaxed(sdma->base + SIRFSOC_DMA_CH_LOOP_CTRL)
- & ~((1 << cid) | 1 << (cid + 16)),
- sdma->base + SIRFSOC_DMA_CH_LOOP_CTRL);
- else
+ switch (sdma->type) {
+ case SIRFSOC_DMA_VER_A7V1:
writel_relaxed((1 << cid) | 1 << (cid + 16),
- sdma->base + SIRFSOC_DMA_CH_LOOP_CTRL_CLR);
+ sdma->base +
+ SIRFSOC_DMA_CH_LOOP_CTRL_CLR_ATLAS7);
+ break;
+ case SIRFSOC_DMA_VER_A7V2:
+ writel_relaxed(0, sdma->base + SIRFSOC_DMA_LOOP_CTRL_ATLAS7);
+ break;
+ case SIRFSOC_DMA_VER_A6:
+ writel_relaxed(readl_relaxed(sdma->base +
+ SIRFSOC_DMA_CH_LOOP_CTRL) &
+ ~((1 << cid) | 1 << (cid + 16)),
+ sdma->base + SIRFSOC_DMA_CH_LOOP_CTRL);
+ break;
+
+ default:
+ break;
+ }
spin_unlock_irqrestore(&schan->lock, flags);
@@ -359,14 +528,25 @@ static int sirfsoc_dma_resume_chan(struct dma_chan *chan)
unsigned long flags;
spin_lock_irqsave(&schan->lock, flags);
-
- if (!sdma->is_marco)
- writel_relaxed(readl_relaxed(sdma->base + SIRFSOC_DMA_CH_LOOP_CTRL)
- | ((1 << cid) | 1 << (cid + 16)),
- sdma->base + SIRFSOC_DMA_CH_LOOP_CTRL);
- else
+ switch (sdma->type) {
+ case SIRFSOC_DMA_VER_A7V1:
writel_relaxed((1 << cid) | 1 << (cid + 16),
- sdma->base + SIRFSOC_DMA_CH_LOOP_CTRL);
+ sdma->base + SIRFSOC_DMA_CH_LOOP_CTRL_ATLAS7);
+ break;
+ case SIRFSOC_DMA_VER_A7V2:
+ writel_relaxed(0x10001,
+ sdma->base + SIRFSOC_DMA_LOOP_CTRL_ATLAS7);
+ break;
+ case SIRFSOC_DMA_VER_A6:
+ writel_relaxed(readl_relaxed(sdma->base +
+ SIRFSOC_DMA_CH_LOOP_CTRL) |
+ ((1 << cid) | 1 << (cid + 16)),
+ sdma->base + SIRFSOC_DMA_CH_LOOP_CTRL);
+ break;
+
+ default:
+ break;
+ }
spin_unlock_irqrestore(&schan->lock, flags);
@@ -473,14 +653,31 @@ sirfsoc_dma_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
spin_lock_irqsave(&schan->lock, flags);
- sdesc = list_first_entry(&schan->active, struct sirfsoc_dma_desc,
- node);
- dma_request_bytes = (sdesc->xlen + 1) * (sdesc->ylen + 1) *
- (sdesc->width * SIRFSOC_DMA_WORD_LEN);
+ if (list_empty(&schan->active)) {
+ ret = dma_cookie_status(chan, cookie, txstate);
+ dma_set_residue(txstate, 0);
+ spin_unlock_irqrestore(&schan->lock, flags);
+ return ret;
+ }
+ sdesc = list_first_entry(&schan->active, struct sirfsoc_dma_desc, node);
+ if (sdesc->cyclic)
+ dma_request_bytes = (sdesc->xlen + 1) * (sdesc->ylen + 1) *
+ (sdesc->width * SIRFSOC_DMA_WORD_LEN);
+ else
+ dma_request_bytes = sdesc->xlen * SIRFSOC_DMA_WORD_LEN;
ret = dma_cookie_status(chan, cookie, txstate);
- dma_pos = readl_relaxed(sdma->base + cid * 0x10 + SIRFSOC_DMA_CH_ADDR)
- << 2;
+
+ if (sdma->type == SIRFSOC_DMA_VER_A7V2)
+ cid = 0;
+
+ if (sdma->type == SIRFSOC_DMA_VER_A7V2) {
+ dma_pos = readl_relaxed(sdma->base + SIRFSOC_DMA_CUR_DATA_ADDR);
+ } else {
+ dma_pos = readl_relaxed(
+ sdma->base + cid * 0x10 + SIRFSOC_DMA_CH_ADDR) << 2;
+ }
+
residue = dma_request_bytes - (dma_pos - sdesc->addr);
dma_set_residue(txstate, residue);
@@ -647,6 +844,7 @@ static int sirfsoc_dma_probe(struct platform_device *op)
struct dma_device *dma;
struct sirfsoc_dma *sdma;
struct sirfsoc_dma_chan *schan;
+ struct sirfsoc_dmadata *data;
struct resource res;
ulong regs_start, regs_size;
u32 id;
@@ -657,9 +855,11 @@ static int sirfsoc_dma_probe(struct platform_device *op)
dev_err(dev, "Memory exhausted!\n");
return -ENOMEM;
}
-
- if (of_device_is_compatible(dn, "sirf,marco-dmac"))
- sdma->is_marco = true;
+ data = (struct sirfsoc_dmadata *)
+ (of_match_device(op->dev.driver->of_match_table,
+ &op->dev)->data);
+ sdma->exec_desc = data->exec;
+ sdma->type = data->type;
if (of_property_read_u32(dn, "cell-index", &id)) {
dev_err(dev, "Fail to get DMAC index\n");
@@ -816,6 +1016,8 @@ static int sirfsoc_dma_pm_suspend(struct device *dev)
struct sirfsoc_dma_chan *schan;
int ch;
int ret;
+ int count;
+ u32 int_offset;
/*
* if we were runtime-suspended before, resume to enable clock
@@ -827,11 +1029,19 @@ static int sirfsoc_dma_pm_suspend(struct device *dev)
return ret;
}
+ if (sdma->type == SIRFSOC_DMA_VER_A7V2) {
+ count = 1;
+ int_offset = SIRFSOC_DMA_INT_EN_ATLAS7;
+ } else {
+ count = SIRFSOC_DMA_CHANNELS;
+ int_offset = SIRFSOC_DMA_INT_EN;
+ }
+
/*
* DMA controller will lose all registers while suspending
* so we need to save registers for active channels
*/
- for (ch = 0; ch < SIRFSOC_DMA_CHANNELS; ch++) {
+ for (ch = 0; ch < count; ch++) {
schan = &sdma->channels[ch];
if (list_empty(&schan->active))
continue;
@@ -841,7 +1051,7 @@ static int sirfsoc_dma_pm_suspend(struct device *dev)
save->ctrl[ch] = readl_relaxed(sdma->base +
ch * 0x10 + SIRFSOC_DMA_CH_CTRL);
}
- save->interrupt_en = readl_relaxed(sdma->base + SIRFSOC_DMA_INT_EN);
+ save->interrupt_en = readl_relaxed(sdma->base + int_offset);
/* Disable clock */
sirfsoc_dma_runtime_suspend(dev);
@@ -857,14 +1067,27 @@ static int sirfsoc_dma_pm_resume(struct device *dev)
struct sirfsoc_dma_chan *schan;
int ch;
int ret;
+ int count;
+ u32 int_offset;
+ u32 width_offset;
/* Enable clock before accessing register */
ret = sirfsoc_dma_runtime_resume(dev);
if (ret < 0)
return ret;
- writel_relaxed(save->interrupt_en, sdma->base + SIRFSOC_DMA_INT_EN);
- for (ch = 0; ch < SIRFSOC_DMA_CHANNELS; ch++) {
+ if (sdma->type == SIRFSOC_DMA_VER_A7V2) {
+ count = 1;
+ int_offset = SIRFSOC_DMA_INT_EN_ATLAS7;
+ width_offset = SIRFSOC_DMA_WIDTH_ATLAS7;
+ } else {
+ count = SIRFSOC_DMA_CHANNELS;
+ int_offset = SIRFSOC_DMA_INT_EN;
+ width_offset = SIRFSOC_DMA_WIDTH_0;
+ }
+
+ writel_relaxed(save->interrupt_en, sdma->base + int_offset);
+ for (ch = 0; ch < count; ch++) {
schan = &sdma->channels[ch];
if (list_empty(&schan->active))
continue;
@@ -872,15 +1095,21 @@ static int sirfsoc_dma_pm_resume(struct device *dev)
struct sirfsoc_dma_desc,
node);
writel_relaxed(sdesc->width,
- sdma->base + SIRFSOC_DMA_WIDTH_0 + ch * 4);
+ sdma->base + width_offset + ch * 4);
writel_relaxed(sdesc->xlen,
sdma->base + ch * 0x10 + SIRFSOC_DMA_CH_XLEN);
writel_relaxed(sdesc->ylen,
sdma->base + ch * 0x10 + SIRFSOC_DMA_CH_YLEN);
writel_relaxed(save->ctrl[ch],
sdma->base + ch * 0x10 + SIRFSOC_DMA_CH_CTRL);
- writel_relaxed(sdesc->addr >> 2,
- sdma->base + ch * 0x10 + SIRFSOC_DMA_CH_ADDR);
+ if (sdma->type == SIRFSOC_DMA_VER_A7V2) {
+ writel_relaxed(sdesc->addr,
+ sdma->base + SIRFSOC_DMA_CH_ADDR);
+ } else {
+ writel_relaxed(sdesc->addr >> 2,
+ sdma->base + ch * 0x10 + SIRFSOC_DMA_CH_ADDR);
+
+ }
}
/* if we were runtime-suspended before, suspend again */
@@ -896,9 +1125,25 @@ static const struct dev_pm_ops sirfsoc_dma_pm_ops = {
SET_SYSTEM_SLEEP_PM_OPS(sirfsoc_dma_pm_suspend, sirfsoc_dma_pm_resume)
};
+struct sirfsoc_dmadata sirfsoc_dmadata_a6 = {
+ .exec = sirfsoc_dma_execute_hw_a6,
+ .type = SIRFSOC_DMA_VER_A6,
+};
+
+struct sirfsoc_dmadata sirfsoc_dmadata_a7v1 = {
+ .exec = sirfsoc_dma_execute_hw_a7v1,
+ .type = SIRFSOC_DMA_VER_A7V1,
+};
+
+struct sirfsoc_dmadata sirfsoc_dmadata_a7v2 = {
+ .exec = sirfsoc_dma_execute_hw_a7v2,
+ .type = SIRFSOC_DMA_VER_A7V2,
+};
+
static const struct of_device_id sirfsoc_dma_match[] = {
- { .compatible = "sirf,prima2-dmac", },
- { .compatible = "sirf,marco-dmac", },
+ { .compatible = "sirf,prima2-dmac", .data = &sirfsoc_dmadata_a6,},
+ { .compatible = "sirf,atlas7-dmac", .data = &sirfsoc_dmadata_a7v1,},
+ { .compatible = "sirf,atlas7-dmac-v2", .data = &sirfsoc_dmadata_a7v2,},
{},
};
@@ -925,7 +1170,7 @@ static void __exit sirfsoc_dma_exit(void)
subsys_initcall(sirfsoc_dma_init);
module_exit(sirfsoc_dma_exit);
-MODULE_AUTHOR("Rongjun Ying <rongjun.ying@csr.com>, "
- "Barry Song <baohua.song@csr.com>");
+MODULE_AUTHOR("Rongjun Ying <rongjun.ying@csr.com>");
+MODULE_AUTHOR("Barry Song <baohua.song@csr.com>");
MODULE_DESCRIPTION("SIRFSOC DMA control driver");
MODULE_LICENSE("GPL v2");
diff --git a/drivers/dma/sun6i-dma.c b/drivers/dma/sun6i-dma.c
index 11e536586812..842ff97c2cfb 100644
--- a/drivers/dma/sun6i-dma.c
+++ b/drivers/dma/sun6i-dma.c
@@ -891,9 +891,21 @@ static struct sun6i_dma_config sun8i_a23_dma_cfg = {
.nr_max_vchans = 37,
};
+/*
+ * The H3 has 12 physical channels, a maximum DRQ port id of 27,
+ * and a total of 34 usable source and destination endpoints.
+ */
+
+static struct sun6i_dma_config sun8i_h3_dma_cfg = {
+ .nr_max_channels = 12,
+ .nr_max_requests = 27,
+ .nr_max_vchans = 34,
+};
+
static const struct of_device_id sun6i_dma_match[] = {
{ .compatible = "allwinner,sun6i-a31-dma", .data = &sun6i_a31_dma_cfg },
{ .compatible = "allwinner,sun8i-a23-dma", .data = &sun8i_a23_dma_cfg },
+ { .compatible = "allwinner,sun8i-h3-dma", .data = &sun8i_h3_dma_cfg },
{ /* sentinel */ }
};
diff --git a/drivers/dma/ti-dma-crossbar.c b/drivers/dma/ti-dma-crossbar.c
new file mode 100644
index 000000000000..24f5ca2356bf
--- /dev/null
+++ b/drivers/dma/ti-dma-crossbar.c
@@ -0,0 +1,188 @@
+/*
+ * Copyright (C) 2015 Texas Instruments Incorporated - http://www.ti.com
+ * Author: Peter Ujfalusi <peter.ujfalusi@ti.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/io.h>
+#include <linux/idr.h>
+#include <linux/of_address.h>
+#include <linux/of_device.h>
+#include <linux/of_dma.h>
+
+#define TI_XBAR_OUTPUTS 127
+#define TI_XBAR_INPUTS 256
+
+static DEFINE_IDR(map_idr);
+
+struct ti_dma_xbar_data {
+ void __iomem *iomem;
+
+ struct dma_router dmarouter;
+
+ u16 safe_val; /* Value used to reset the crossbar lines */
+ u32 xbar_requests; /* number of DMA requests connected to XBAR */
+ u32 dma_requests; /* number of DMA requests forwarded to DMA */
+};
+
+struct ti_dma_xbar_map {
+ u16 xbar_in;
+ int xbar_out;
+};
+
+static inline void ti_dma_xbar_write(void __iomem *iomem, int xbar, u16 val)
+{
+ writew_relaxed(val, iomem + (xbar * 2));
+}
+
+static void ti_dma_xbar_free(struct device *dev, void *route_data)
+{
+ struct ti_dma_xbar_data *xbar = dev_get_drvdata(dev);
+ struct ti_dma_xbar_map *map = route_data;
+
+ dev_dbg(dev, "Unmapping XBAR%u (was routed to %d)\n",
+ map->xbar_in, map->xbar_out);
+
+ ti_dma_xbar_write(xbar->iomem, map->xbar_out, xbar->safe_val);
+ idr_remove(&map_idr, map->xbar_out);
+ kfree(map);
+}
+
+static void *ti_dma_xbar_route_allocate(struct of_phandle_args *dma_spec,
+ struct of_dma *ofdma)
+{
+ struct platform_device *pdev = of_find_device_by_node(ofdma->of_node);
+ struct ti_dma_xbar_data *xbar = platform_get_drvdata(pdev);
+ struct ti_dma_xbar_map *map;
+
+ if (dma_spec->args[0] >= xbar->xbar_requests) {
+ dev_err(&pdev->dev, "Invalid XBAR request number: %d\n",
+ dma_spec->args[0]);
+ return ERR_PTR(-EINVAL);
+ }
+
+ /* The of_node_put() will be done in the core for the node */
+ dma_spec->np = of_parse_phandle(ofdma->of_node, "dma-masters", 0);
+ if (!dma_spec->np) {
+ dev_err(&pdev->dev, "Can't get DMA master\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ map = kzalloc(sizeof(*map), GFP_KERNEL);
+ if (!map) {
+ of_node_put(dma_spec->np);
+ return ERR_PTR(-ENOMEM);
+ }
+
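+ /*
+ * Reserve a free DMA request line on the controller side and
+ * route the peripheral's crossbar input onto it.
+ */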
+ map->xbar_out = idr_alloc(&map_idr, NULL, 0, xbar->dma_requests,
+ GFP_KERNEL);
+ map->xbar_in = (u16)dma_spec->args[0];
+
+ /* The DMA request is 1 based in sDMA */
+ dma_spec->args[0] = map->xbar_out + 1;
+
+ dev_dbg(&pdev->dev, "Mapping XBAR%u to DMA%d\n",
+ map->xbar_in, map->xbar_out);
+
+ ti_dma_xbar_write(xbar->iomem, map->xbar_out, map->xbar_in);
+
+ return map;
+}
+
+static int ti_dma_xbar_probe(struct platform_device *pdev)
+{
+ struct device_node *node = pdev->dev.of_node;
+ struct device_node *dma_node;
+ struct ti_dma_xbar_data *xbar;
+ struct resource *res;
+ u32 safe_val;
+ void __iomem *iomem;
+ int i, ret;
+
+ if (!node)
+ return -ENODEV;
+
+ xbar = devm_kzalloc(&pdev->dev, sizeof(*xbar), GFP_KERNEL);
+ if (!xbar)
+ return -ENOMEM;
+
+ dma_node = of_parse_phandle(node, "dma-masters", 0);
+ if (!dma_node) {
+ dev_err(&pdev->dev, "Can't get DMA master node\n");
+ return -ENODEV;
+ }
+
+ if (of_property_read_u32(dma_node, "dma-requests",
+ &xbar->dma_requests)) {
+ dev_info(&pdev->dev,
+ "Missing XBAR output information, using %u.\n",
+ TI_XBAR_OUTPUTS);
+ xbar->dma_requests = TI_XBAR_OUTPUTS;
+ }
+ of_node_put(dma_node);
+
+ if (of_property_read_u32(node, "dma-requests", &xbar->xbar_requests)) {
+ dev_info(&pdev->dev,
+ "Missing XBAR input information, using %u.\n",
+ TI_XBAR_INPUTS);
+ xbar->xbar_requests = TI_XBAR_INPUTS;
+ }
+
+ if (!of_property_read_u32(node, "ti,dma-safe-map", &safe_val))
+ xbar->safe_val = (u16)safe_val;
+
+ res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+ if (!res)
+ return -ENODEV;
+
+ iomem = devm_ioremap_resource(&pdev->dev, res);
+ if (IS_ERR(iomem))
+ return PTR_ERR(iomem);
+
+ xbar->iomem = iomem;
+
+ xbar->dmarouter.dev = &pdev->dev;
+ xbar->dmarouter.route_free = ti_dma_xbar_free;
+
+ platform_set_drvdata(pdev, xbar);
+
+ /* Reset the crossbar */
+ for (i = 0; i < xbar->dma_requests; i++)
+ ti_dma_xbar_write(xbar->iomem, i, xbar->safe_val);
+
+ ret = of_dma_router_register(node, ti_dma_xbar_route_allocate,
+ &xbar->dmarouter);
+ if (ret) {
+ /* Restore the defaults for the crossbar */
+ for (i = 0; i < xbar->dma_requests; i++)
+ ti_dma_xbar_write(xbar->iomem, i, i);
+ }
+
+ return ret;
+}
+
+static const struct of_device_id ti_dma_xbar_match[] = {
+ { .compatible = "ti,dra7-dma-crossbar" },
+ {},
+};
+
+static struct platform_driver ti_dma_xbar_driver = {
+ .driver = {
+ .name = "ti-dma-crossbar",
+ .of_match_table = of_match_ptr(ti_dma_xbar_match),
+ },
+ .probe = ti_dma_xbar_probe,
+};
+
+int omap_dmaxbar_init(void)
+{
+ return platform_driver_register(&ti_dma_xbar_driver);
+}
+arch_initcall(omap_dmaxbar_init);
diff --git a/drivers/dma/virt-dma.c b/drivers/dma/virt-dma.c
index 6f80432a3f0a..7d2c17d8d30f 100644
--- a/drivers/dma/virt-dma.c
+++ b/drivers/dma/virt-dma.c
@@ -29,7 +29,7 @@ dma_cookie_t vchan_tx_submit(struct dma_async_tx_descriptor *tx)
spin_lock_irqsave(&vc->lock, flags);
cookie = dma_cookie_assign(tx);
- list_add_tail(&vd->node, &vc->desc_submitted);
+ list_move_tail(&vd->node, &vc->desc_submitted);
spin_unlock_irqrestore(&vc->lock, flags);
dev_dbg(vc->chan.device->dev, "vchan %p: txd %p[%x]: submitted\n",
@@ -83,8 +83,10 @@ static void vchan_complete(unsigned long arg)
cb_data = vd->tx.callback_param;
list_del(&vd->node);
-
- vc->desc_free(vd);
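+ /*
+ * Keep ACKed descriptors on the allocated list so they can be
+ * resubmitted later instead of being freed.
+ */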
+ if (async_tx_test_ack(&vd->tx))
+ list_add(&vd->node, &vc->desc_allocated);
+ else
+ vc->desc_free(vd);
if (cb)
cb(cb_data);
@@ -96,9 +98,13 @@ void vchan_dma_desc_free_list(struct virt_dma_chan *vc, struct list_head *head)
while (!list_empty(head)) {
struct virt_dma_desc *vd = list_first_entry(head,
struct virt_dma_desc, node);
- list_del(&vd->node);
- dev_dbg(vc->chan.device->dev, "txd %p: freeing\n", vd);
- vc->desc_free(vd);
+ if (async_tx_test_ack(&vd->tx)) {
+ list_move_tail(&vd->node, &vc->desc_allocated);
+ } else {
+ dev_dbg(vc->chan.device->dev, "txd %p: freeing\n", vd);
+ list_del(&vd->node);
+ vc->desc_free(vd);
+ }
}
}
EXPORT_SYMBOL_GPL(vchan_dma_desc_free_list);
@@ -108,6 +114,7 @@ void vchan_init(struct virt_dma_chan *vc, struct dma_device *dmadev)
dma_cookie_init(&vc->chan);
spin_lock_init(&vc->lock);
+ INIT_LIST_HEAD(&vc->desc_allocated);
INIT_LIST_HEAD(&vc->desc_submitted);
INIT_LIST_HEAD(&vc->desc_issued);
INIT_LIST_HEAD(&vc->desc_completed);
diff --git a/drivers/dma/virt-dma.h b/drivers/dma/virt-dma.h
index 181b95267866..189e75dbcb15 100644
--- a/drivers/dma/virt-dma.h
+++ b/drivers/dma/virt-dma.h
@@ -29,6 +29,7 @@ struct virt_dma_chan {
spinlock_t lock;
/* protected by vc.lock */
+ struct list_head desc_allocated;
struct list_head desc_submitted;
struct list_head desc_issued;
struct list_head desc_completed;
@@ -55,11 +56,16 @@ static inline struct dma_async_tx_descriptor *vchan_tx_prep(struct virt_dma_chan
struct virt_dma_desc *vd, unsigned long tx_flags)
{
extern dma_cookie_t vchan_tx_submit(struct dma_async_tx_descriptor *);
+ unsigned long flags;
dma_async_tx_descriptor_init(&vd->tx, &vc->chan);
vd->tx.flags = tx_flags;
vd->tx.tx_submit = vchan_tx_submit;
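+ /* Track the descriptor from allocation time so terminate_all can reclaim it. */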
+ spin_lock_irqsave(&vc->lock, flags);
+ list_add_tail(&vd->node, &vc->desc_allocated);
+ spin_unlock_irqrestore(&vc->lock, flags);
+
return &vd->tx;
}
@@ -122,7 +128,8 @@ static inline struct virt_dma_desc *vchan_next_desc(struct virt_dma_chan *vc)
}
/**
- * vchan_get_all_descriptors - obtain all submitted and issued descriptors
+ * vchan_get_all_descriptors - obtain all allocated, submitted and issued
+ * descriptors
* vc: virtual channel to get descriptors from
* head: list of descriptors found
*
@@ -134,6 +141,7 @@ static inline struct virt_dma_desc *vchan_next_desc(struct virt_dma_chan *vc)
static inline void vchan_get_all_descriptors(struct virt_dma_chan *vc,
struct list_head *head)
{
+ list_splice_tail_init(&vc->desc_allocated, head);
list_splice_tail_init(&vc->desc_submitted, head);
list_splice_tail_init(&vc->desc_issued, head);
list_splice_tail_init(&vc->desc_completed, head);
@@ -141,11 +149,14 @@ static inline void vchan_get_all_descriptors(struct virt_dma_chan *vc,
static inline void vchan_free_chan_resources(struct virt_dma_chan *vc)
{
+ struct virt_dma_desc *vd;
unsigned long flags;
LIST_HEAD(head);
spin_lock_irqsave(&vc->lock, flags);
vchan_get_all_descriptors(vc, &head);
+ list_for_each_entry(vd, &head, node)
+ async_tx_clear_ack(&vd->tx);
spin_unlock_irqrestore(&vc->lock, flags);
vchan_dma_desc_free_list(vc, &head);
diff --git a/drivers/dma/xgene-dma.c b/drivers/dma/xgene-dma.c
index f52e37502254..620fd55ec766 100755..100644
--- a/drivers/dma/xgene-dma.c
+++ b/drivers/dma/xgene-dma.c
@@ -124,32 +124,8 @@
#define XGENE_DMA_DESC_ELERR_POS 46
#define XGENE_DMA_DESC_RTYPE_POS 56
#define XGENE_DMA_DESC_LERR_POS 60
-#define XGENE_DMA_DESC_FLYBY_POS 4
#define XGENE_DMA_DESC_BUFLEN_POS 48
#define XGENE_DMA_DESC_HOENQ_NUM_POS 48
-
-#define XGENE_DMA_DESC_NV_SET(m) \
- (((u64 *)(m))[0] |= XGENE_DMA_DESC_NV_BIT)
-#define XGENE_DMA_DESC_IN_SET(m) \
- (((u64 *)(m))[0] |= XGENE_DMA_DESC_IN_BIT)
-#define XGENE_DMA_DESC_RTYPE_SET(m, v) \
- (((u64 *)(m))[0] |= ((u64)(v) << XGENE_DMA_DESC_RTYPE_POS))
-#define XGENE_DMA_DESC_BUFADDR_SET(m, v) \
- (((u64 *)(m))[0] |= (v))
-#define XGENE_DMA_DESC_BUFLEN_SET(m, v) \
- (((u64 *)(m))[0] |= ((u64)(v) << XGENE_DMA_DESC_BUFLEN_POS))
-#define XGENE_DMA_DESC_C_SET(m) \
- (((u64 *)(m))[1] |= XGENE_DMA_DESC_C_BIT)
-#define XGENE_DMA_DESC_FLYBY_SET(m, v) \
- (((u64 *)(m))[2] |= ((v) << XGENE_DMA_DESC_FLYBY_POS))
-#define XGENE_DMA_DESC_MULTI_SET(m, v, i) \
- (((u64 *)(m))[2] |= ((u64)(v) << (((i) + 1) * 8)))
-#define XGENE_DMA_DESC_DR_SET(m) \
- (((u64 *)(m))[2] |= XGENE_DMA_DESC_DR_BIT)
-#define XGENE_DMA_DESC_DST_ADDR_SET(m, v) \
- (((u64 *)(m))[3] |= (v))
-#define XGENE_DMA_DESC_H0ENQ_NUM_SET(m, v) \
- (((u64 *)(m))[3] |= ((u64)(v) << XGENE_DMA_DESC_HOENQ_NUM_POS))
#define XGENE_DMA_DESC_ELERR_RD(m) \
(((m) >> XGENE_DMA_DESC_ELERR_POS) & 0x3)
#define XGENE_DMA_DESC_LERR_RD(m) \
@@ -158,14 +134,7 @@
(((elerr) << 4) | (lerr))
/* X-Gene DMA descriptor empty s/w signature */
-#define XGENE_DMA_DESC_EMPTY_INDEX 0
#define XGENE_DMA_DESC_EMPTY_SIGNATURE ~0ULL
-#define XGENE_DMA_DESC_SET_EMPTY(m) \
- (((u64 *)(m))[XGENE_DMA_DESC_EMPTY_INDEX] = \
- XGENE_DMA_DESC_EMPTY_SIGNATURE)
-#define XGENE_DMA_DESC_IS_EMPTY(m) \
- (((u64 *)(m))[XGENE_DMA_DESC_EMPTY_INDEX] == \
- XGENE_DMA_DESC_EMPTY_SIGNATURE)
/* X-Gene DMA configurable parameters defines */
#define XGENE_DMA_RING_NUM 512
@@ -184,7 +153,7 @@
#define XGENE_DMA_XOR_ALIGNMENT 6 /* 64 Bytes */
#define XGENE_DMA_MAX_XOR_SRC 5
#define XGENE_DMA_16K_BUFFER_LEN_CODE 0x0
-#define XGENE_DMA_INVALID_LEN_CODE 0x7800
+#define XGENE_DMA_INVALID_LEN_CODE 0x7800000000000000ULL
/* X-Gene DMA descriptor error codes */
#define ERR_DESC_AXI 0x01
@@ -214,10 +183,10 @@
#define ERR_DESC_SRC_INT 0xB
/* X-Gene DMA flyby operation code */
-#define FLYBY_2SRC_XOR 0x8
-#define FLYBY_3SRC_XOR 0x9
-#define FLYBY_4SRC_XOR 0xA
-#define FLYBY_5SRC_XOR 0xB
+#define FLYBY_2SRC_XOR 0x80
+#define FLYBY_3SRC_XOR 0x90
+#define FLYBY_4SRC_XOR 0xA0
+#define FLYBY_5SRC_XOR 0xB0
/* X-Gene DMA SW descriptor flags */
#define XGENE_DMA_FLAG_64B_DESC BIT(0)
@@ -238,10 +207,10 @@
dev_err(chan->dev, "%s: " fmt, chan->name, ##arg)
struct xgene_dma_desc_hw {
- u64 m0;
- u64 m1;
- u64 m2;
- u64 m3;
+ __le64 m0;
+ __le64 m1;
+ __le64 m2;
+ __le64 m3;
};
enum xgene_dma_ring_cfgsize {
@@ -388,18 +357,11 @@ static bool is_pq_enabled(struct xgene_dma *pdma)
return !(val & XGENE_DMA_PQ_DISABLE_MASK);
}
-static void xgene_dma_cpu_to_le64(u64 *desc, int count)
-{
- int i;
-
- for (i = 0; i < count; i++)
- desc[i] = cpu_to_le64(desc[i]);
-}
-
-static u16 xgene_dma_encode_len(u32 len)
+static u64 xgene_dma_encode_len(size_t len)
{
return (len < XGENE_DMA_MAX_BYTE_CNT) ?
- len : XGENE_DMA_16K_BUFFER_LEN_CODE;
+ ((u64)len << XGENE_DMA_DESC_BUFLEN_POS) :
+ XGENE_DMA_16K_BUFFER_LEN_CODE;
}
static u8 xgene_dma_encode_xor_flyby(u32 src_cnt)
@@ -424,34 +386,50 @@ static u32 xgene_dma_ring_desc_cnt(struct xgene_dma_ring *ring)
return XGENE_DMA_RING_DESC_CNT(ring_state);
}
-static void xgene_dma_set_src_buffer(void *ext8, size_t *len,
+static void xgene_dma_set_src_buffer(__le64 *ext8, size_t *len,
dma_addr_t *paddr)
{
size_t nbytes = (*len < XGENE_DMA_MAX_BYTE_CNT) ?
*len : XGENE_DMA_MAX_BYTE_CNT;
- XGENE_DMA_DESC_BUFADDR_SET(ext8, *paddr);
- XGENE_DMA_DESC_BUFLEN_SET(ext8, xgene_dma_encode_len(nbytes));
+ *ext8 |= cpu_to_le64(*paddr);
+ *ext8 |= cpu_to_le64(xgene_dma_encode_len(nbytes));
*len -= nbytes;
*paddr += nbytes;
}
-static void xgene_dma_invalidate_buffer(void *ext8)
+static void xgene_dma_invalidate_buffer(__le64 *ext8)
{
- XGENE_DMA_DESC_BUFLEN_SET(ext8, XGENE_DMA_INVALID_LEN_CODE);
+ *ext8 |= cpu_to_le64(XGENE_DMA_INVALID_LEN_CODE);
}
-static void *xgene_dma_lookup_ext8(u64 *desc, int idx)
+static __le64 *xgene_dma_lookup_ext8(struct xgene_dma_desc_hw *desc, int idx)
{
- return (idx % 2) ? (desc + idx - 1) : (desc + idx + 1);
+ switch (idx) {
+ case 0:
+ return &desc->m1;
+ case 1:
+ return &desc->m0;
+ case 2:
+ return &desc->m3;
+ case 3:
+ return &desc->m2;
+ default:
+ pr_err("Invalid dma descriptor index\n");
+ }
+
+ return NULL;
}
-static void xgene_dma_init_desc(void *desc, u16 dst_ring_num)
+static void xgene_dma_init_desc(struct xgene_dma_desc_hw *desc,
+ u16 dst_ring_num)
{
- XGENE_DMA_DESC_C_SET(desc); /* Coherent IO */
- XGENE_DMA_DESC_IN_SET(desc);
- XGENE_DMA_DESC_H0ENQ_NUM_SET(desc, dst_ring_num);
- XGENE_DMA_DESC_RTYPE_SET(desc, XGENE_DMA_RING_OWNER_DMA);
+ desc->m0 |= cpu_to_le64(XGENE_DMA_DESC_IN_BIT);
+ desc->m0 |= cpu_to_le64((u64)XGENE_DMA_RING_OWNER_DMA <<
+ XGENE_DMA_DESC_RTYPE_POS);
+ desc->m1 |= cpu_to_le64(XGENE_DMA_DESC_C_BIT);
+ desc->m3 |= cpu_to_le64((u64)dst_ring_num <<
+ XGENE_DMA_DESC_HOENQ_NUM_POS);
}
static void xgene_dma_prep_cpy_desc(struct xgene_dma_chan *chan,
@@ -459,7 +437,7 @@ static void xgene_dma_prep_cpy_desc(struct xgene_dma_chan *chan,
dma_addr_t dst, dma_addr_t src,
size_t len)
{
- void *desc1, *desc2;
+ struct xgene_dma_desc_hw *desc1, *desc2;
int i;
/* Get 1st descriptor */
@@ -467,23 +445,21 @@ static void xgene_dma_prep_cpy_desc(struct xgene_dma_chan *chan,
xgene_dma_init_desc(desc1, chan->tx_ring.dst_ring_num);
/* Set destination address */
- XGENE_DMA_DESC_DR_SET(desc1);
- XGENE_DMA_DESC_DST_ADDR_SET(desc1, dst);
+ desc1->m2 |= cpu_to_le64(XGENE_DMA_DESC_DR_BIT);
+ desc1->m3 |= cpu_to_le64(dst);
/* Set 1st source address */
- xgene_dma_set_src_buffer(desc1 + 8, &len, &src);
+ xgene_dma_set_src_buffer(&desc1->m1, &len, &src);
- if (len <= 0) {
- desc2 = NULL;
- goto skip_additional_src;
- }
+ if (!len)
+ return;
/*
* We need to split this source buffer,
* and need to use 2nd descriptor
*/
desc2 = &desc_sw->desc2;
- XGENE_DMA_DESC_NV_SET(desc1);
+ desc1->m0 |= cpu_to_le64(XGENE_DMA_DESC_NV_BIT);
/* Set 2nd to 5th source address */
for (i = 0; i < 4 && len; i++)
@@ -496,12 +472,6 @@ static void xgene_dma_prep_cpy_desc(struct xgene_dma_chan *chan,
/* Updated flag that we have prepared 64B descriptor */
desc_sw->flags |= XGENE_DMA_FLAG_64B_DESC;
-
-skip_additional_src:
- /* Hardware stores descriptor in little endian format */
- xgene_dma_cpu_to_le64(desc1, 4);
- if (desc2)
- xgene_dma_cpu_to_le64(desc2, 4);
}
static void xgene_dma_prep_xor_desc(struct xgene_dma_chan *chan,
@@ -510,7 +480,7 @@ static void xgene_dma_prep_xor_desc(struct xgene_dma_chan *chan,
u32 src_cnt, size_t *nbytes,
const u8 *scf)
{
- void *desc1, *desc2;
+ struct xgene_dma_desc_hw *desc1, *desc2;
size_t len = *nbytes;
int i;
@@ -521,28 +491,24 @@ static void xgene_dma_prep_xor_desc(struct xgene_dma_chan *chan,
xgene_dma_init_desc(desc1, chan->tx_ring.dst_ring_num);
/* Set destination address */
- XGENE_DMA_DESC_DR_SET(desc1);
- XGENE_DMA_DESC_DST_ADDR_SET(desc1, *dst);
+ desc1->m2 |= cpu_to_le64(XGENE_DMA_DESC_DR_BIT);
+ desc1->m3 |= cpu_to_le64(*dst);
/* We have multiple source addresses, so need to set NV bit*/
- XGENE_DMA_DESC_NV_SET(desc1);
+ desc1->m0 |= cpu_to_le64(XGENE_DMA_DESC_NV_BIT);
/* Set flyby opcode */
- XGENE_DMA_DESC_FLYBY_SET(desc1, xgene_dma_encode_xor_flyby(src_cnt));
+ desc1->m2 |= cpu_to_le64(xgene_dma_encode_xor_flyby(src_cnt));
/* Set 1st to 5th source addresses */
for (i = 0; i < src_cnt; i++) {
len = *nbytes;
- xgene_dma_set_src_buffer((i == 0) ? (desc1 + 8) :
+ xgene_dma_set_src_buffer((i == 0) ? &desc1->m1 :
xgene_dma_lookup_ext8(desc2, i - 1),
&len, &src[i]);
- XGENE_DMA_DESC_MULTI_SET(desc1, scf[i], i);
+ desc1->m2 |= cpu_to_le64((scf[i] << ((i + 1) * 8)));
}
- /* Hardware stores descriptor in little endian format */
- xgene_dma_cpu_to_le64(desc1, 4);
- xgene_dma_cpu_to_le64(desc2, 4);
-
/* Update meta data */
*nbytes = len;
*dst += XGENE_DMA_MAX_BYTE_CNT;
@@ -738,7 +704,7 @@ static int xgene_chan_xfer_request(struct xgene_dma_ring *ring,
* xgene_chan_xfer_ld_pending - push any pending transactions to hw
* @chan : X-Gene DMA channel
*
- * LOCKING: must hold chan->desc_lock
+ * LOCKING: must hold chan->lock
*/
static void xgene_chan_xfer_ld_pending(struct xgene_dma_chan *chan)
{
@@ -808,7 +774,8 @@ static void xgene_dma_cleanup_descriptors(struct xgene_dma_chan *chan)
desc_hw = &ring->desc_hw[ring->head];
/* Check if this descriptor has been completed */
- if (unlikely(XGENE_DMA_DESC_IS_EMPTY(desc_hw)))
+ if (unlikely(le64_to_cpu(desc_hw->m0) ==
+ XGENE_DMA_DESC_EMPTY_SIGNATURE))
break;
if (++ring->head == ring->slots)
@@ -842,7 +809,7 @@ static void xgene_dma_cleanup_descriptors(struct xgene_dma_chan *chan)
iowrite32(-1, ring->cmd);
/* Mark this hw descriptor as processed */
- XGENE_DMA_DESC_SET_EMPTY(desc_hw);
+ desc_hw->m0 = cpu_to_le64(XGENE_DMA_DESC_EMPTY_SIGNATURE);
xgene_dma_run_tx_complete_actions(chan, desc_sw);
@@ -889,7 +856,7 @@ static int xgene_dma_alloc_chan_resources(struct dma_chan *dchan)
* @chan: X-Gene DMA channel
* @list: the list to free
*
- * LOCKING: must hold chan->desc_lock
+ * LOCKING: must hold chan->lock
*/
static void xgene_dma_free_desc_list(struct xgene_dma_chan *chan,
struct list_head *list)
@@ -900,15 +867,6 @@ static void xgene_dma_free_desc_list(struct xgene_dma_chan *chan,
xgene_dma_clean_descriptor(chan, desc);
}
-static void xgene_dma_free_tx_desc_list(struct xgene_dma_chan *chan,
- struct list_head *list)
-{
- struct xgene_dma_desc_sw *desc, *_desc;
-
- list_for_each_entry_safe(desc, _desc, list, node)
- xgene_dma_clean_descriptor(chan, desc);
-}
-
static void xgene_dma_free_chan_resources(struct dma_chan *dchan)
{
struct xgene_dma_chan *chan = to_dma_chan(dchan);
@@ -985,7 +943,7 @@ fail:
if (!first)
return NULL;
- xgene_dma_free_tx_desc_list(chan, &first->tx_list);
+ xgene_dma_free_desc_list(chan, &first->tx_list);
return NULL;
}
@@ -1093,7 +1051,7 @@ fail:
if (!first)
return NULL;
- xgene_dma_free_tx_desc_list(chan, &first->tx_list);
+ xgene_dma_free_desc_list(chan, &first->tx_list);
return NULL;
}
@@ -1141,7 +1099,7 @@ fail:
if (!first)
return NULL;
- xgene_dma_free_tx_desc_list(chan, &first->tx_list);
+ xgene_dma_free_desc_list(chan, &first->tx_list);
return NULL;
}
@@ -1218,7 +1176,7 @@ fail:
if (!first)
return NULL;
- xgene_dma_free_tx_desc_list(chan, &first->tx_list);
+ xgene_dma_free_desc_list(chan, &first->tx_list);
return NULL;
}
@@ -1316,7 +1274,6 @@ static void xgene_dma_setup_ring(struct xgene_dma_ring *ring)
{
void *ring_cfg = ring->state;
u64 addr = ring->desc_paddr;
- void *desc;
u32 i, val;
ring->slots = ring->size / XGENE_DMA_RING_WQ_DESC_SIZE;
@@ -1358,8 +1315,10 @@ static void xgene_dma_setup_ring(struct xgene_dma_ring *ring)
/* Set empty signature to DMA Rx ring descriptors */
for (i = 0; i < ring->slots; i++) {
+ struct xgene_dma_desc_hw *desc;
+
desc = &ring->desc_hw[i];
- XGENE_DMA_DESC_SET_EMPTY(desc);
+ desc->m0 = cpu_to_le64(XGENE_DMA_DESC_EMPTY_SIGNATURE);
}
/* Enable DMA Rx ring interrupt */
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8d9f89b4519d..df92d30ca054 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2628,13 +2628,14 @@ errors_show(struct md_rdev *rdev, char *page)
static ssize_t
errors_store(struct md_rdev *rdev, const char *buf, size_t len)
{
- char *e;
- unsigned long n = simple_strtoul(buf, &e, 10);
- if (*buf && (*e == 0 || *e == '\n')) {
- atomic_set(&rdev->corrected_errors, n);
- return len;
- }
- return -EINVAL;
+ unsigned int n;
+ int rv;
+
+ rv = kstrtouint(buf, 10, &n);
+ if (rv < 0)
+ return rv;
+ atomic_set(&rdev->corrected_errors, n);
+ return len;
}
static struct rdev_sysfs_entry rdev_errors =
__ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
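The md sysfs hunks below all repeat this conversion from open-coded simple_strtoul() parsing to the kstrto*() helpers. As an illustration only (not the kernel implementation), a userspace approximation of what the stricter helpers provide is sketched here: reject empty input, reject trailing garbage apart from a single newline, and detect overflow.

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

static int parse_uint(const char *buf, unsigned int *out)
{
        char *end;
        unsigned long val;

        if (buf[0] == '-')              /* no negative values */
                return -EINVAL;
        errno = 0;
        val = strtoul(buf, &end, 10);
        if (end == buf)                 /* no digits at all */
                return -EINVAL;
        if (*end == '\n')               /* tolerate a single trailing newline */
                end++;
        if (*end != '\0')               /* anything else is garbage */
                return -EINVAL;
        if (errno == ERANGE || val > UINT_MAX)
                return -ERANGE;
        *out = (unsigned int)val;
        return 0;
}

int main(void)
{
        unsigned int n;

        printf("\"42\\n\" -> %d\n", parse_uint("42\n", &n)); /* 0, n == 42 */
        printf("\"42x\"  -> %d\n", parse_uint("42x", &n));   /* -EINVAL */
        return 0;
}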
@@ -2651,13 +2652,16 @@ slot_show(struct md_rdev *rdev, char *page)
static ssize_t
slot_store(struct md_rdev *rdev, const char *buf, size_t len)
{
- char *e;
+ int slot;
int err;
- int slot = simple_strtoul(buf, &e, 10);
+
if (strncmp(buf, "none", 4)==0)
slot = -1;
- else if (e==buf || (*e && *e!= '\n'))
- return -EINVAL;
+ else {
+ err = kstrtouint(buf, 10, (unsigned int *)&slot);
+ if (err < 0)
+ return err;
+ }
if (rdev->mddev->pers && slot == -1) {
/* Setting 'slot' on an active array requires also
* updating the 'rd%d' link, and communicating
@@ -3542,12 +3546,12 @@ layout_show(struct mddev *mddev, char *page)
static ssize_t
layout_store(struct mddev *mddev, const char *buf, size_t len)
{
- char *e;
- unsigned long n = simple_strtoul(buf, &e, 10);
+ unsigned int n;
int err;
- if (!*buf || (*e && *e != '\n'))
- return -EINVAL;
+ err = kstrtouint(buf, 10, &n);
+ if (err < 0)
+ return err;
err = mddev_lock(mddev);
if (err)
return err;
@@ -3591,12 +3595,12 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks);
static ssize_t
raid_disks_store(struct mddev *mddev, const char *buf, size_t len)
{
- char *e;
+ unsigned int n;
int err;
- unsigned long n = simple_strtoul(buf, &e, 10);
- if (!*buf || (*e && *e != '\n'))
- return -EINVAL;
+ err = kstrtouint(buf, 10, &n);
+ if (err < 0)
+ return err;
err = mddev_lock(mddev);
if (err)
@@ -3643,12 +3647,12 @@ chunk_size_show(struct mddev *mddev, char *page)
static ssize_t
chunk_size_store(struct mddev *mddev, const char *buf, size_t len)
{
+ unsigned long n;
int err;
- char *e;
- unsigned long n = simple_strtoul(buf, &e, 10);
- if (!*buf || (*e && *e != '\n'))
- return -EINVAL;
+ err = kstrtoul(buf, 10, &n);
+ if (err < 0)
+ return err;
err = mddev_lock(mddev);
if (err)
@@ -3686,19 +3690,24 @@ resync_start_show(struct mddev *mddev, char *page)
static ssize_t
resync_start_store(struct mddev *mddev, const char *buf, size_t len)
{
+ unsigned long long n;
int err;
- char *e;
- unsigned long long n = simple_strtoull(buf, &e, 10);
+
+ if (cmd_match(buf, "none"))
+ n = MaxSector;
+ else {
+ err = kstrtoull(buf, 10, &n);
+ if (err < 0)
+ return err;
+ if (n != (sector_t)n)
+ return -EINVAL;
+ }
err = mddev_lock(mddev);
if (err)
return err;
if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
err = -EBUSY;
- else if (cmd_match(buf, "none"))
- n = MaxSector;
- else if (!*buf || (*e && *e != '\n'))
- err = -EINVAL;
if (!err) {
mddev->recovery_cp = n;
@@ -3934,14 +3943,14 @@ max_corrected_read_errors_show(struct mddev *mddev, char *page) {
static ssize_t
max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len)
{
- char *e;
- unsigned long n = simple_strtoul(buf, &e, 10);
+ unsigned int n;
+ int rv;
- if (*buf && (*e == 0 || *e == '\n')) {
- atomic_set(&mddev->max_corr_read_errors, n);
- return len;
- }
- return -EINVAL;
+ rv = kstrtouint(buf, 10, &n);
+ if (rv < 0)
+ return rv;
+ atomic_set(&mddev->max_corr_read_errors, n);
+ return len;
}
static struct md_sysfs_entry max_corr_read_errors =
@@ -4003,8 +4012,10 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len)
else
rdev = md_import_device(dev, -1, -1);
- if (IS_ERR(rdev))
+ if (IS_ERR(rdev)) {
+ mddev_unlock(mddev);
return PTR_ERR(rdev);
+ }
err = bind_rdev_to_array(rdev, mddev);
out:
if (err)
@@ -4298,15 +4309,18 @@ sync_min_show(struct mddev *mddev, char *page)
static ssize_t
sync_min_store(struct mddev *mddev, const char *buf, size_t len)
{
- int min;
- char *e;
+ unsigned int min;
+ int rv;
+
if (strncmp(buf, "system", 6)==0) {
- mddev->sync_speed_min = 0;
- return len;
+ min = 0;
+ } else {
+ rv = kstrtouint(buf, 10, &min);
+ if (rv < 0)
+ return rv;
+ if (min == 0)
+ return -EINVAL;
}
- min = simple_strtoul(buf, &e, 10);
- if (buf == e || (*e && *e != '\n') || min <= 0)
- return -EINVAL;
mddev->sync_speed_min = min;
return len;
}
@@ -4324,15 +4338,18 @@ sync_max_show(struct mddev *mddev, char *page)
static ssize_t
sync_max_store(struct mddev *mddev, const char *buf, size_t len)
{
- int max;
- char *e;
+ unsigned int max;
+ int rv;
+
if (strncmp(buf, "system", 6)==0) {
- mddev->sync_speed_max = 0;
- return len;
+ max = 0;
+ } else {
+ rv = kstrtouint(buf, 10, &max);
+ if (rv < 0)
+ return rv;
+ if (max == 0)
+ return -EINVAL;
}
- max = simple_strtoul(buf, &e, 10);
- if (buf == e || (*e && *e != '\n') || max <= 0)
- return -EINVAL;
mddev->sync_speed_max = max;
return len;
}
@@ -4515,12 +4532,13 @@ suspend_lo_show(struct mddev *mddev, char *page)
static ssize_t
suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
{
- char *e;
- unsigned long long new = simple_strtoull(buf, &e, 10);
- unsigned long long old;
+ unsigned long long old, new;
int err;
- if (buf == e || (*e && *e != '\n'))
+ err = kstrtoull(buf, 10, &new);
+ if (err < 0)
+ return err;
+ if (new != (sector_t)new)
return -EINVAL;
err = mddev_lock(mddev);
@@ -4557,12 +4575,13 @@ suspend_hi_show(struct mddev *mddev, char *page)
static ssize_t
suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
{
- char *e;
- unsigned long long new = simple_strtoull(buf, &e, 10);
- unsigned long long old;
+ unsigned long long old, new;
int err;
- if (buf == e || (*e && *e != '\n'))
+ err = kstrtoull(buf, 10, &new);
+ if (err < 0)
+ return err;
+ if (new != (sector_t)new)
return -EINVAL;
err = mddev_lock(mddev);
@@ -4604,11 +4623,13 @@ static ssize_t
reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
{
struct md_rdev *rdev;
- char *e;
+ unsigned long long new;
int err;
- unsigned long long new = simple_strtoull(buf, &e, 10);
- if (buf == e || (*e && *e != '\n'))
+ err = kstrtoull(buf, 10, &new);
+ if (err < 0)
+ return err;
+ if (new != (sector_t)new)
return -EINVAL;
err = mddev_lock(mddev);
if (err)
@@ -5157,6 +5178,7 @@ int md_run(struct mddev *mddev)
mddev_detach(mddev);
if (mddev->private)
pers->free(mddev, mddev->private);
+ mddev->private = NULL;
module_put(pers->owner);
bitmap_destroy(mddev);
return err;
@@ -5292,6 +5314,7 @@ static void md_clean(struct mddev *mddev)
mddev->changed = 0;
mddev->degraded = 0;
mddev->safemode = 0;
+ mddev->private = NULL;
mddev->merge_check_needed = 0;
mddev->bitmap_info.offset = 0;
mddev->bitmap_info.default_offset = 0;
@@ -5364,6 +5387,7 @@ static void __md_stop(struct mddev *mddev)
mddev->pers = NULL;
spin_unlock(&mddev->lock);
pers->free(mddev, mddev->private);
+ mddev->private = NULL;
if (pers->sync_request && mddev->to_remove == NULL)
mddev->to_remove = &md_redundancy_group;
module_put(pers->owner);
@@ -6373,7 +6397,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
mddev->ctime != info->ctime ||
mddev->level != info->level ||
/* mddev->layout != info->layout || */
- !mddev->persistent != info->not_persistent||
+ mddev->persistent != !info->not_persistent ||
mddev->chunk_sectors != info->chunk_size >> 9 ||
/* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
((state^info->state) & 0xfffffe00)
@@ -8104,6 +8128,15 @@ void md_check_recovery(struct mddev *mddev)
int spares = 0;
if (mddev->ro) {
+ struct md_rdev *rdev;
+ if (!mddev->external && mddev->in_sync)
+ /* 'Blocked' flag not needed as failed devices
+ * will be recorded if the array is switched to read/write.
+ * Leaving it set will prevent the device
+ * from being removed.
+ */
+ rdev_for_each(rdev, mddev)
+ clear_bit(Blocked, &rdev->flags);
/* On a read-only array we can:
* - remove failed devices
* - add already-in_sync devices if the array itself
@@ -9011,13 +9044,7 @@ static int get_ro(char *buffer, struct kernel_param *kp)
}
static int set_ro(const char *val, struct kernel_param *kp)
{
- char *e;
- int num = simple_strtoul(val, &e, 10);
- if (*val && (*e == '\0' || *e == '\n')) {
- start_readonly = num;
- return 0;
- }
- return -EINVAL;
+ return kstrtouint(val, 10, (unsigned int *)&start_readonly);
}
module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 188d8e9a6bdc..940f2f365461 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2099,17 +2099,10 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
tbio->bi_rw = WRITE;
tbio->bi_private = r10_bio;
tbio->bi_iter.bi_sector = r10_bio->devs[i].addr;
-
- for (j=0; j < vcnt ; j++) {
- tbio->bi_io_vec[j].bv_offset = 0;
- tbio->bi_io_vec[j].bv_len = PAGE_SIZE;
-
- memcpy(page_address(tbio->bi_io_vec[j].bv_page),
- page_address(fbio->bi_io_vec[j].bv_page),
- PAGE_SIZE);
- }
tbio->bi_end_io = end_sync_write;
+ bio_copy_data(tbio, fbio);
+
d = r10_bio->devs[i].devnum;
atomic_inc(&conf->mirrors[d].rdev->nr_pending);
atomic_inc(&r10_bio->remaining);
@@ -2124,17 +2117,14 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
* that are active
*/
for (i = 0; i < conf->copies; i++) {
- int j, d;
+ int d;
tbio = r10_bio->devs[i].repl_bio;
if (!tbio || !tbio->bi_end_io)
continue;
if (r10_bio->devs[i].bio->bi_end_io != end_sync_write
&& r10_bio->devs[i].bio != fbio)
- for (j = 0; j < vcnt; j++)
- memcpy(page_address(tbio->bi_io_vec[j].bv_page),
- page_address(fbio->bi_io_vec[j].bv_page),
- PAGE_SIZE);
+ bio_copy_data(tbio, fbio);
d = r10_bio->devs[i].devnum;
atomic_inc(&r10_bio->remaining);
md_sync_acct(conf->mirrors[d].replacement->bdev,
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b6793d2e051f..59e44e99eef3 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -344,7 +344,8 @@ static void release_inactive_stripe_list(struct r5conf *conf,
int hash)
{
int size;
- bool do_wakeup = false;
+ unsigned long do_wakeup = 0;
+ int i = 0;
unsigned long flags;
if (hash == NR_STRIPE_HASH_LOCKS) {
@@ -365,15 +366,21 @@ static void release_inactive_stripe_list(struct r5conf *conf,
!list_empty(list))
atomic_dec(&conf->empty_inactive_list_nr);
list_splice_tail_init(list, conf->inactive_list + hash);
- do_wakeup = true;
+ do_wakeup |= 1 << hash;
spin_unlock_irqrestore(conf->hash_locks + hash, flags);
}
size--;
hash--;
}
+ for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) {
+ if (do_wakeup & (1 << i))
+ wake_up(&conf->wait_for_stripe[i]);
+ }
+
if (do_wakeup) {
- wake_up(&conf->wait_for_stripe);
+ if (atomic_read(&conf->active_stripes) == 0)
+ wake_up(&conf->wait_for_quiescent);
if (conf->retry_read_aligned)
md_wakeup_thread(conf->mddev->thread);
}
@@ -667,15 +674,15 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
spin_lock_irq(conf->hash_locks + hash);
do {
- wait_event_lock_irq(conf->wait_for_stripe,
+ wait_event_lock_irq(conf->wait_for_quiescent,
conf->quiesce == 0 || noquiesce,
*(conf->hash_locks + hash));
sh = __find_stripe(conf, sector, conf->generation - previous);
if (!sh) {
if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) {
sh = get_free_stripe(conf, hash);
- if (!sh && llist_empty(&conf->released_stripes) &&
- !test_bit(R5_DID_ALLOC, &conf->cache_state))
+ if (!sh && !test_bit(R5_DID_ALLOC,
+ &conf->cache_state))
set_bit(R5_ALLOC_MORE,
&conf->cache_state);
}
@@ -684,14 +691,15 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
if (!sh) {
set_bit(R5_INACTIVE_BLOCKED,
&conf->cache_state);
- wait_event_lock_irq(
- conf->wait_for_stripe,
+ wait_event_exclusive_cmd(
+ conf->wait_for_stripe[hash],
!list_empty(conf->inactive_list + hash) &&
(atomic_read(&conf->active_stripes)
< (conf->max_nr_stripes * 3 / 4)
|| !test_bit(R5_INACTIVE_BLOCKED,
&conf->cache_state)),
- *(conf->hash_locks + hash));
+ spin_unlock_irq(conf->hash_locks + hash),
+ spin_lock_irq(conf->hash_locks + hash));
clear_bit(R5_INACTIVE_BLOCKED,
&conf->cache_state);
} else {
@@ -716,6 +724,9 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
}
} while (sh == NULL);
+ if (!list_empty(conf->inactive_list + hash))
+ wake_up(&conf->wait_for_stripe[hash]);
+
spin_unlock_irq(conf->hash_locks + hash);
return sh;
}
@@ -2177,7 +2188,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
cnt = 0;
list_for_each_entry(nsh, &newstripes, lru) {
lock_device_hash_lock(conf, hash);
- wait_event_cmd(conf->wait_for_stripe,
+ wait_event_exclusive_cmd(conf->wait_for_stripe[hash],
!list_empty(conf->inactive_list + hash),
unlock_device_hash_lock(conf, hash),
lock_device_hash_lock(conf, hash));
@@ -4760,7 +4771,7 @@ static void raid5_align_endio(struct bio *bi, int error)
raid_bi, 0);
bio_endio(raid_bi, 0);
if (atomic_dec_and_test(&conf->active_aligned_reads))
- wake_up(&conf->wait_for_stripe);
+ wake_up(&conf->wait_for_quiescent);
return;
}
@@ -4855,7 +4866,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
align_bi->bi_iter.bi_sector += rdev->data_offset;
spin_lock_irq(&conf->device_lock);
- wait_event_lock_irq(conf->wait_for_stripe,
+ wait_event_lock_irq(conf->wait_for_quiescent,
conf->quiesce == 0,
conf->device_lock);
atomic_inc(&conf->active_aligned_reads);
@@ -5699,7 +5710,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
bio_endio(raid_bio, 0);
}
if (atomic_dec_and_test(&conf->active_aligned_reads))
- wake_up(&conf->wait_for_stripe);
+ wake_up(&conf->wait_for_quiescent);
return handled;
}
@@ -6433,7 +6444,10 @@ static struct r5conf *setup_conf(struct mddev *mddev)
goto abort;
spin_lock_init(&conf->device_lock);
seqcount_init(&conf->gen_lock);
- init_waitqueue_head(&conf->wait_for_stripe);
+ init_waitqueue_head(&conf->wait_for_quiescent);
+ for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) {
+ init_waitqueue_head(&conf->wait_for_stripe[i]);
+ }
init_waitqueue_head(&conf->wait_for_overlap);
INIT_LIST_HEAD(&conf->handle_list);
INIT_LIST_HEAD(&conf->hold_list);
@@ -7466,7 +7480,7 @@ static void raid5_quiesce(struct mddev *mddev, int state)
* active stripes can drain
*/
conf->quiesce = 2;
- wait_event_cmd(conf->wait_for_stripe,
+ wait_event_cmd(conf->wait_for_quiescent,
atomic_read(&conf->active_stripes) == 0 &&
atomic_read(&conf->active_aligned_reads) == 0,
unlock_all_device_hash_locks_irq(conf),
@@ -7480,7 +7494,7 @@ static void raid5_quiesce(struct mddev *mddev, int state)
case 0: /* re-enable writes */
lock_all_device_hash_locks_irq(conf);
conf->quiesce = 0;
- wake_up(&conf->wait_for_stripe);
+ wake_up(&conf->wait_for_quiescent);
wake_up(&conf->wait_for_overlap);
unlock_all_device_hash_locks_irq(conf);
break;
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 896d603ad0da..02c3bf8fbfe7 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -511,7 +511,8 @@ struct r5conf {
struct list_head inactive_list[NR_STRIPE_HASH_LOCKS];
atomic_t empty_inactive_list_nr;
struct llist_head released_stripes;
- wait_queue_head_t wait_for_stripe;
+ wait_queue_head_t wait_for_quiescent;
+ wait_queue_head_t wait_for_stripe[NR_STRIPE_HASH_LOCKS];
wait_queue_head_t wait_for_overlap;
unsigned long cache_state;
#define R5_INACTIVE_BLOCKED 1 /* release of inactive stripes blocked,
diff --git a/drivers/nvdimm/Kconfig b/drivers/nvdimm/Kconfig
new file mode 100644
index 000000000000..72226acb5c0f
--- /dev/null
+++ b/drivers/nvdimm/Kconfig
@@ -0,0 +1,68 @@
+menuconfig LIBNVDIMM
+ tristate "NVDIMM (Non-Volatile Memory Device) Support"
+ depends on PHYS_ADDR_T_64BIT
+ depends on BLK_DEV
+ help
+ Generic support for non-volatile memory devices including
+ ACPI-6-NFIT defined resources. On platforms that define an
+ NFIT, or otherwise can discover NVDIMM resources, a libnvdimm
+ bus is registered to advertise PMEM (persistent memory)
+ namespaces (/dev/pmemX) and BLK (sliding mmio window(s))
+ namespaces (/dev/ndblkX.Y). A PMEM namespace refers to a
+ memory resource that may span multiple DIMMs and support DAX
+ (see CONFIG_DAX). A BLK namespace refers to an NVDIMM control
+ region which exposes an mmio register set for windowed access
+ mode to non-volatile memory.
+
+if LIBNVDIMM
+
+config BLK_DEV_PMEM
+ tristate "PMEM: Persistent memory block device support"
+ default LIBNVDIMM
+ depends on HAS_IOMEM
+ select ND_BTT if BTT
+ help
+ Memory ranges for PMEM are described by either an NFIT
+ (NVDIMM Firmware Interface Table, see CONFIG_NFIT_ACPI), a
+ non-standard OEM-specific E820 memory type (type-12, see
+ CONFIG_X86_PMEM_LEGACY), or manually specified via the
+ 'memmap=nn[KMG]!ss[KMG]' kernel command line option (see
+ Documentation/kernel-parameters.txt). This driver converts
+ these persistent memory ranges into block devices that are
+ capable of DAX (direct-access) file system mappings. See
+ Documentation/nvdimm/nvdimm.txt for more details.
+
+ Say Y if you want to use an NVDIMM
+
+config ND_BLK
+ tristate "BLK: Block data window (aperture) device support"
+ default LIBNVDIMM
+ select ND_BTT if BTT
+ help
+ Support NVDIMMs, or other devices, that implement a BLK-mode
+ access capability. BLK-mode access uses memory-mapped-i/o
+ apertures to access persistent media.
+
+ Say Y if your platform firmware emits an ACPI.NFIT table
+ (CONFIG_ACPI_NFIT), or otherwise exposes BLK-mode
+ capabilities.
+
+config ND_BTT
+ tristate
+
+config BTT
+ bool "BTT: Block Translation Table (atomic sector updates)"
+ default y if LIBNVDIMM
+ help
+ The Block Translation Table (BTT) provides atomic sector
+ update semantics for persistent memory devices, so that
+ applications that rely on sector writes not being torn (a
+ guarantee that typical disks provide) can continue to do so.
+ The BTT manifests itself as an alternate personality for an
+ NVDIMM namespace, i.e. a namespace can be in raw mode (pmemX,
+ ndblkX.Y, etc...), or 'sectored' mode (pmemXs, ndblkX.Ys,
+ etc...).
+
+ Select Y if unsure
+
+endif
diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile
new file mode 100644
index 000000000000..594bb97c867a
--- /dev/null
+++ b/drivers/nvdimm/Makefile
@@ -0,0 +1,20 @@
+obj-$(CONFIG_LIBNVDIMM) += libnvdimm.o
+obj-$(CONFIG_BLK_DEV_PMEM) += nd_pmem.o
+obj-$(CONFIG_ND_BTT) += nd_btt.o
+obj-$(CONFIG_ND_BLK) += nd_blk.o
+
+nd_pmem-y := pmem.o
+
+nd_btt-y := btt.o
+
+nd_blk-y := blk.o
+
+libnvdimm-y := core.o
+libnvdimm-y += bus.o
+libnvdimm-y += dimm_devs.o
+libnvdimm-y += dimm.o
+libnvdimm-y += region_devs.o
+libnvdimm-y += region.o
+libnvdimm-y += namespace_devs.o
+libnvdimm-y += label.o
+libnvdimm-$(CONFIG_BTT) += btt_devs.o
diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c
new file mode 100644
index 000000000000..4f97b248c236
--- /dev/null
+++ b/drivers/nvdimm/blk.c
@@ -0,0 +1,384 @@
+/*
+ * NVDIMM Block Window Driver
+ * Copyright (c) 2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/blkdev.h>
+#include <linux/fs.h>
+#include <linux/genhd.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/nd.h>
+#include <linux/sizes.h>
+#include "nd.h"
+
+struct nd_blk_device {
+ struct request_queue *queue;
+ struct gendisk *disk;
+ struct nd_namespace_blk *nsblk;
+ struct nd_blk_region *ndbr;
+ size_t disk_size;
+ u32 sector_size;
+ u32 internal_lbasize;
+};
+
+static int nd_blk_major;
+
+static u32 nd_blk_meta_size(struct nd_blk_device *blk_dev)
+{
+ return blk_dev->nsblk->lbasize - blk_dev->sector_size;
+}
+
+static resource_size_t to_dev_offset(struct nd_namespace_blk *nsblk,
+ resource_size_t ns_offset, unsigned int len)
+{
+ int i;
+
+ for (i = 0; i < nsblk->num_resources; i++) {
+ if (ns_offset < resource_size(nsblk->res[i])) {
+ if (ns_offset + len > resource_size(nsblk->res[i])) {
+ dev_WARN_ONCE(&nsblk->common.dev, 1,
+ "illegal request\n");
+ return SIZE_MAX;
+ }
+ return nsblk->res[i]->start + ns_offset;
+ }
+ ns_offset -= resource_size(nsblk->res[i]);
+ }
+
+ dev_WARN_ONCE(&nsblk->common.dev, 1, "request out of range\n");
+ return SIZE_MAX;
+}
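A standalone sketch of the walk performed by to_dev_offset() above, using a plain array of {start, size} ranges in place of the namespace resources; requests that cross a range boundary or run past the end fail, mirroring the SIZE_MAX return. The numbers below are made up.

#include <stdint.h>
#include <stdio.h>

struct ex_range { uint64_t start; uint64_t size; };

static uint64_t ex_to_dev_offset(const struct ex_range *res, int nres,
                                 uint64_t ns_off, uint64_t len)
{
        int i;

        for (i = 0; i < nres; i++) {
                if (ns_off < res[i].size) {
                        if (ns_off + len > res[i].size)
                                return UINT64_MAX;      /* crosses a boundary */
                        return res[i].start + ns_off;
                }
                ns_off -= res[i].size;
        }
        return UINT64_MAX;                              /* out of range */
}

int main(void)
{
        const struct ex_range res[] = { { 0x1000, 0x100 }, { 0x9000, 0x100 } };

        /* offset 0x120 lands 0x20 bytes into the second range -> 0x9020 */
        printf("0x%llx\n",
               (unsigned long long)ex_to_dev_offset(res, 2, 0x120, 8));
        return 0;
}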
+
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+static int nd_blk_rw_integrity(struct nd_blk_device *blk_dev,
+ struct bio_integrity_payload *bip, u64 lba,
+ int rw)
+{
+ unsigned int len = nd_blk_meta_size(blk_dev);
+ resource_size_t dev_offset, ns_offset;
+ struct nd_namespace_blk *nsblk;
+ struct nd_blk_region *ndbr;
+ int err = 0;
+
+ nsblk = blk_dev->nsblk;
+ ndbr = blk_dev->ndbr;
+ ns_offset = lba * blk_dev->internal_lbasize + blk_dev->sector_size;
+ dev_offset = to_dev_offset(nsblk, ns_offset, len);
+ if (dev_offset == SIZE_MAX)
+ return -EIO;
+
+ while (len) {
+ unsigned int cur_len;
+ struct bio_vec bv;
+ void *iobuf;
+
+ bv = bvec_iter_bvec(bip->bip_vec, bip->bip_iter);
+ /*
+ * The 'bv' obtained from bvec_iter_bvec has its .bv_len and
+ * .bv_offset already adjusted for iter->bi_bvec_done, and we
+ * can use those directly
+ */
+
+ cur_len = min(len, bv.bv_len);
+ iobuf = kmap_atomic(bv.bv_page);
+ err = ndbr->do_io(ndbr, dev_offset, iobuf + bv.bv_offset,
+ cur_len, rw);
+ kunmap_atomic(iobuf);
+ if (err)
+ return err;
+
+ len -= cur_len;
+ dev_offset += cur_len;
+ bvec_iter_advance(bip->bip_vec, &bip->bip_iter, cur_len);
+ }
+
+ return err;
+}
+
+#else /* CONFIG_BLK_DEV_INTEGRITY */
+static int nd_blk_rw_integrity(struct nd_blk_device *blk_dev,
+ struct bio_integrity_payload *bip, u64 lba,
+ int rw)
+{
+ return 0;
+}
+#endif
+
+static int nd_blk_do_bvec(struct nd_blk_device *blk_dev,
+ struct bio_integrity_payload *bip, struct page *page,
+ unsigned int len, unsigned int off, int rw,
+ sector_t sector)
+{
+ struct nd_blk_region *ndbr = blk_dev->ndbr;
+ resource_size_t dev_offset, ns_offset;
+ int err = 0;
+ void *iobuf;
+ u64 lba;
+
+ while (len) {
+ unsigned int cur_len;
+
+ /*
+ * If we don't have an integrity payload, we don't have to
+ * split the bvec into sectors, as this would cause unnecessary
+ * Block Window setup/move steps. The do_io routine is capable
+ * of handling len <= PAGE_SIZE.
+ */
+ cur_len = bip ? min(len, blk_dev->sector_size) : len;
+
+ lba = div_u64(sector << SECTOR_SHIFT, blk_dev->sector_size);
+ ns_offset = lba * blk_dev->internal_lbasize;
+ dev_offset = to_dev_offset(blk_dev->nsblk, ns_offset, cur_len);
+ if (dev_offset == SIZE_MAX)
+ return -EIO;
+
+ iobuf = kmap_atomic(page);
+ err = ndbr->do_io(ndbr, dev_offset, iobuf + off, cur_len, rw);
+ kunmap_atomic(iobuf);
+ if (err)
+ return err;
+
+ if (bip) {
+ err = nd_blk_rw_integrity(blk_dev, bip, lba, rw);
+ if (err)
+ return err;
+ }
+ len -= cur_len;
+ off += cur_len;
+ sector += blk_dev->sector_size >> SECTOR_SHIFT;
+ }
+
+ return err;
+}
+
+static void nd_blk_make_request(struct request_queue *q, struct bio *bio)
+{
+ struct block_device *bdev = bio->bi_bdev;
+ struct gendisk *disk = bdev->bd_disk;
+ struct bio_integrity_payload *bip;
+ struct nd_blk_device *blk_dev;
+ struct bvec_iter iter;
+ unsigned long start;
+ struct bio_vec bvec;
+ int err = 0, rw;
+ bool do_acct;
+
+ /*
+ * bio_integrity_enabled also checks if the bio already has an
+ * integrity payload attached. If it does, we *don't* do a
+ * bio_integrity_prep here - the payload has been generated by
+ * another kernel subsystem, and we just pass it through.
+ */
+ if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
+ err = -EIO;
+ goto out;
+ }
+
+ bip = bio_integrity(bio);
+ blk_dev = disk->private_data;
+ rw = bio_data_dir(bio);
+ do_acct = nd_iostat_start(bio, &start);
+ bio_for_each_segment(bvec, bio, iter) {
+ unsigned int len = bvec.bv_len;
+
+ BUG_ON(len > PAGE_SIZE);
+ err = nd_blk_do_bvec(blk_dev, bip, bvec.bv_page, len,
+ bvec.bv_offset, rw, iter.bi_sector);
+ if (err) {
+ dev_info(&blk_dev->nsblk->common.dev,
+ "io error in %s sector %lld, len %d,\n",
+ (rw == READ) ? "READ" : "WRITE",
+ (unsigned long long) iter.bi_sector, len);
+ break;
+ }
+ }
+ if (do_acct)
+ nd_iostat_end(bio, start);
+
+ out:
+ bio_endio(bio, err);
+}
+
+static int nd_blk_rw_bytes(struct nd_namespace_common *ndns,
+ resource_size_t offset, void *iobuf, size_t n, int rw)
+{
+ struct nd_blk_device *blk_dev = dev_get_drvdata(ndns->claim);
+ struct nd_namespace_blk *nsblk = blk_dev->nsblk;
+ struct nd_blk_region *ndbr = blk_dev->ndbr;
+ resource_size_t dev_offset;
+
+ dev_offset = to_dev_offset(nsblk, offset, n);
+
+ if (unlikely(offset + n > blk_dev->disk_size)) {
+ dev_WARN_ONCE(&ndns->dev, 1, "request out of range\n");
+ return -EFAULT;
+ }
+
+ if (dev_offset == SIZE_MAX)
+ return -EIO;
+
+ return ndbr->do_io(ndbr, dev_offset, iobuf, n, rw);
+}
+
+static const struct block_device_operations nd_blk_fops = {
+ .owner = THIS_MODULE,
+ .revalidate_disk = nvdimm_revalidate_disk,
+};
+
+static int nd_blk_attach_disk(struct nd_namespace_common *ndns,
+ struct nd_blk_device *blk_dev)
+{
+ resource_size_t available_disk_size;
+ struct gendisk *disk;
+ u64 internal_nlba;
+
+ internal_nlba = div_u64(blk_dev->disk_size, blk_dev->internal_lbasize);
+ available_disk_size = internal_nlba * blk_dev->sector_size;
+
+ blk_dev->queue = blk_alloc_queue(GFP_KERNEL);
+ if (!blk_dev->queue)
+ return -ENOMEM;
+
+ blk_queue_make_request(blk_dev->queue, nd_blk_make_request);
+ blk_queue_max_hw_sectors(blk_dev->queue, UINT_MAX);
+ blk_queue_bounce_limit(blk_dev->queue, BLK_BOUNCE_ANY);
+ blk_queue_logical_block_size(blk_dev->queue, blk_dev->sector_size);
+ queue_flag_set_unlocked(QUEUE_FLAG_NONROT, blk_dev->queue);
+
+ disk = blk_dev->disk = alloc_disk(0);
+ if (!disk) {
+ blk_cleanup_queue(blk_dev->queue);
+ return -ENOMEM;
+ }
+
+ disk->driverfs_dev = &ndns->dev;
+ disk->major = nd_blk_major;
+ disk->first_minor = 0;
+ disk->fops = &nd_blk_fops;
+ disk->private_data = blk_dev;
+ disk->queue = blk_dev->queue;
+ disk->flags = GENHD_FL_EXT_DEVT;
+ nvdimm_namespace_disk_name(ndns, disk->disk_name);
+ set_capacity(disk, 0);
+ add_disk(disk);
+
+ if (nd_blk_meta_size(blk_dev)) {
+ int rc = nd_integrity_init(disk, nd_blk_meta_size(blk_dev));
+
+ if (rc) {
+ del_gendisk(disk);
+ put_disk(disk);
+ blk_cleanup_queue(blk_dev->queue);
+ return rc;
+ }
+ }
+
+ set_capacity(disk, available_disk_size >> SECTOR_SHIFT);
+ revalidate_disk(disk);
+ return 0;
+}
+
+static int nd_blk_probe(struct device *dev)
+{
+ struct nd_namespace_common *ndns;
+ struct nd_namespace_blk *nsblk;
+ struct nd_blk_device *blk_dev;
+ int rc;
+
+ ndns = nvdimm_namespace_common_probe(dev);
+ if (IS_ERR(ndns))
+ return PTR_ERR(ndns);
+
+ blk_dev = kzalloc(sizeof(*blk_dev), GFP_KERNEL);
+ if (!blk_dev)
+ return -ENOMEM;
+
+ nsblk = to_nd_namespace_blk(&ndns->dev);
+ blk_dev->disk_size = nvdimm_namespace_capacity(ndns);
+ blk_dev->ndbr = to_nd_blk_region(dev->parent);
+ blk_dev->nsblk = to_nd_namespace_blk(&ndns->dev);
+ blk_dev->internal_lbasize = roundup(nsblk->lbasize,
+ INT_LBASIZE_ALIGNMENT);
+ blk_dev->sector_size = ((nsblk->lbasize >= 4096) ? 4096 : 512);
+ dev_set_drvdata(dev, blk_dev);
+
+ ndns->rw_bytes = nd_blk_rw_bytes;
+ if (is_nd_btt(dev))
+ rc = nvdimm_namespace_attach_btt(ndns);
+ else if (nd_btt_probe(ndns, blk_dev) == 0) {
+ /* we'll come back as btt-blk */
+ rc = -ENXIO;
+ } else
+ rc = nd_blk_attach_disk(ndns, blk_dev);
+ if (rc)
+ kfree(blk_dev);
+ return rc;
+}
+
+static void nd_blk_detach_disk(struct nd_blk_device *blk_dev)
+{
+ del_gendisk(blk_dev->disk);
+ put_disk(blk_dev->disk);
+ blk_cleanup_queue(blk_dev->queue);
+}
+
+static int nd_blk_remove(struct device *dev)
+{
+ struct nd_blk_device *blk_dev = dev_get_drvdata(dev);
+
+ if (is_nd_btt(dev))
+ nvdimm_namespace_detach_btt(to_nd_btt(dev)->ndns);
+ else
+ nd_blk_detach_disk(blk_dev);
+ kfree(blk_dev);
+
+ return 0;
+}
+
+static struct nd_device_driver nd_blk_driver = {
+ .probe = nd_blk_probe,
+ .remove = nd_blk_remove,
+ .drv = {
+ .name = "nd_blk",
+ },
+ .type = ND_DRIVER_NAMESPACE_BLK,
+};
+
+static int __init nd_blk_init(void)
+{
+ int rc;
+
+ rc = register_blkdev(0, "nd_blk");
+ if (rc < 0)
+ return rc;
+
+ nd_blk_major = rc;
+ rc = nd_driver_register(&nd_blk_driver);
+
+ if (rc < 0)
+ unregister_blkdev(nd_blk_major, "nd_blk");
+
+ return rc;
+}
+
+static void __exit nd_blk_exit(void)
+{
+ driver_unregister(&nd_blk_driver.drv);
+ unregister_blkdev(nd_blk_major, "nd_blk");
+}
+
+MODULE_AUTHOR("Ross Zwisler <ross.zwisler@linux.intel.com>");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_BLK);
+module_init(nd_blk_init);
+module_exit(nd_blk_exit);
diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
new file mode 100644
index 000000000000..411c7b2bb37a
--- /dev/null
+++ b/drivers/nvdimm/btt.c
@@ -0,0 +1,1479 @@
+/*
+ * Block Translation Table
+ * Copyright (c) 2014-2015, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+#include <linux/highmem.h>
+#include <linux/debugfs.h>
+#include <linux/blkdev.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/mutex.h>
+#include <linux/hdreg.h>
+#include <linux/genhd.h>
+#include <linux/sizes.h>
+#include <linux/ndctl.h>
+#include <linux/fs.h>
+#include <linux/nd.h>
+#include "btt.h"
+#include "nd.h"
+
+enum log_ent_request {
+ LOG_NEW_ENT = 0,
+ LOG_OLD_ENT
+};
+
+static int btt_major;
+
+static int arena_read_bytes(struct arena_info *arena, resource_size_t offset,
+ void *buf, size_t n)
+{
+ struct nd_btt *nd_btt = arena->nd_btt;
+ struct nd_namespace_common *ndns = nd_btt->ndns;
+
+ /* arena offsets are 4K from the base of the device */
+ offset += SZ_4K;
+ return nvdimm_read_bytes(ndns, offset, buf, n);
+}
+
+static int arena_write_bytes(struct arena_info *arena, resource_size_t offset,
+ void *buf, size_t n)
+{
+ struct nd_btt *nd_btt = arena->nd_btt;
+ struct nd_namespace_common *ndns = nd_btt->ndns;
+
+ /* arena offsets are 4K from the base of the device */
+ offset += SZ_4K;
+ return nvdimm_write_bytes(ndns, offset, buf, n);
+}
+
+static int btt_info_write(struct arena_info *arena, struct btt_sb *super)
+{
+ int ret;
+
+ ret = arena_write_bytes(arena, arena->info2off, super,
+ sizeof(struct btt_sb));
+ if (ret)
+ return ret;
+
+ return arena_write_bytes(arena, arena->infooff, super,
+ sizeof(struct btt_sb));
+}
+
+static int btt_info_read(struct arena_info *arena, struct btt_sb *super)
+{
+ WARN_ON(!super);
+ return arena_read_bytes(arena, arena->infooff, super,
+ sizeof(struct btt_sb));
+}
+
+/*
+ * 'raw' version of btt_map write
+ * Assumptions:
+ * mapping is in little-endian
+ * mapping contains 'E' and 'Z' flags as desired
+ */
+static int __btt_map_write(struct arena_info *arena, u32 lba, __le32 mapping)
+{
+ u64 ns_off = arena->mapoff + (lba * MAP_ENT_SIZE);
+
+ WARN_ON(lba >= arena->external_nlba);
+ return arena_write_bytes(arena, ns_off, &mapping, MAP_ENT_SIZE);
+}
+
+static int btt_map_write(struct arena_info *arena, u32 lba, u32 mapping,
+ u32 z_flag, u32 e_flag)
+{
+ u32 ze;
+ __le32 mapping_le;
+
+ /*
+ * This 'mapping' is supposed to be just the LBA mapping, without
+ * any flags set, so strip the flag bits.
+ */
+ mapping &= MAP_LBA_MASK;
+
+ ze = (z_flag << 1) + e_flag;
+ switch (ze) {
+ case 0:
+ /*
+ * We want to set neither of the Z or E flags, and
+ * in the actual layout, this means setting the bit
+ * positions of both to '1' to indicate a 'normal'
+ * map entry
+ */
+ mapping |= MAP_ENT_NORMAL;
+ break;
+ case 1:
+ mapping |= (1 << MAP_ERR_SHIFT);
+ break;
+ case 2:
+ mapping |= (1 << MAP_TRIM_SHIFT);
+ break;
+ default:
+ /*
+ * The case where Z and E are both sent in as '1' could be
+ * construed as a valid 'normal' case, but we decide not to,
+ * to avoid confusion
+ */
+ WARN_ONCE(1, "Invalid use of Z and E flags\n");
+ return -EIO;
+ }
+
+ mapping_le = cpu_to_le32(mapping);
+ return __btt_map_write(arena, lba, mapping_le);
+}
+
+static int btt_map_read(struct arena_info *arena, u32 lba, u32 *mapping,
+ int *trim, int *error)
+{
+ int ret;
+ __le32 in;
+ u32 raw_mapping, postmap, ze, z_flag, e_flag;
+ u64 ns_off = arena->mapoff + (lba * MAP_ENT_SIZE);
+
+ WARN_ON(lba >= arena->external_nlba);
+
+ ret = arena_read_bytes(arena, ns_off, &in, MAP_ENT_SIZE);
+ if (ret)
+ return ret;
+
+ raw_mapping = le32_to_cpu(in);
+
+ z_flag = (raw_mapping & MAP_TRIM_MASK) >> MAP_TRIM_SHIFT;
+ e_flag = (raw_mapping & MAP_ERR_MASK) >> MAP_ERR_SHIFT;
+ ze = (z_flag << 1) + e_flag;
+ postmap = raw_mapping & MAP_LBA_MASK;
+
+ /* Reuse the {z,e}_flag variables for *trim and *error */
+ z_flag = 0;
+ e_flag = 0;
+
+ switch (ze) {
+ case 0:
+ /* Initial state. Return postmap = premap */
+ *mapping = lba;
+ break;
+ case 1:
+ *mapping = postmap;
+ e_flag = 1;
+ break;
+ case 2:
+ *mapping = postmap;
+ z_flag = 1;
+ break;
+ case 3:
+ *mapping = postmap;
+ break;
+ default:
+ return -EIO;
+ }
+
+ if (trim)
+ *trim = z_flag;
+ if (error)
+ *error = e_flag;
+
+ return ret;
+}
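A standalone sketch of the map-entry flag encoding handled by btt_map_write()/btt_map_read() above. The bit positions used here (error and zero/trim flags in the top two bits) are assumed for illustration; the authoritative definitions live in btt.h. Both flag bits set marks a normal entry, while both clear means the block was never written and the mapping is the identity.

#include <stdint.h>
#include <stdio.h>

#define EX_TRIM_SHIFT 31        /* assumed position of the 'Z' flag */
#define EX_ERR_SHIFT  30        /* assumed position of the 'E' flag */
#define EX_LBA_MASK   (~((1u << EX_TRIM_SHIFT) | (1u << EX_ERR_SHIFT)))
#define EX_ENT_NORMAL ((1u << EX_TRIM_SHIFT) | (1u << EX_ERR_SHIFT))

static const char *decode(uint32_t raw, uint32_t premap, uint32_t *postmap)
{
        uint32_t z = (raw >> EX_TRIM_SHIFT) & 1;
        uint32_t e = (raw >> EX_ERR_SHIFT) & 1;

        *postmap = raw & EX_LBA_MASK;
        switch ((z << 1) + e) {
        case 0: *postmap = premap; return "initial (identity)";
        case 1: return "error";
        case 2: return "zero/trim";
        default: return "normal";
        }
}

int main(void)
{
        uint32_t post;

        printf("%s -> %u\n", decode(0, 7, &post), post);  /* identity, 7 */
        printf("%s -> %u\n", decode(EX_ENT_NORMAL | 42, 7, &post), post); /* normal, 42 */
        return 0;
}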
+
+static int btt_log_read_pair(struct arena_info *arena, u32 lane,
+ struct log_entry *ent)
+{
+ WARN_ON(!ent);
+ return arena_read_bytes(arena,
+ arena->logoff + (2 * lane * LOG_ENT_SIZE), ent,
+ 2 * LOG_ENT_SIZE);
+}
+
+static struct dentry *debugfs_root;
+
+static void arena_debugfs_init(struct arena_info *a, struct dentry *parent,
+ int idx)
+{
+ char dirname[32];
+ struct dentry *d;
+
+ /* If, for some reason, the parent bttN was not created, exit */
+ if (!parent)
+ return;
+
+ snprintf(dirname, 32, "arena%d", idx);
+ d = debugfs_create_dir(dirname, parent);
+ if (IS_ERR_OR_NULL(d))
+ return;
+ a->debugfs_dir = d;
+
+ debugfs_create_x64("size", S_IRUGO, d, &a->size);
+ debugfs_create_x64("external_lba_start", S_IRUGO, d,
+ &a->external_lba_start);
+ debugfs_create_x32("internal_nlba", S_IRUGO, d, &a->internal_nlba);
+ debugfs_create_u32("internal_lbasize", S_IRUGO, d,
+ &a->internal_lbasize);
+ debugfs_create_x32("external_nlba", S_IRUGO, d, &a->external_nlba);
+ debugfs_create_u32("external_lbasize", S_IRUGO, d,
+ &a->external_lbasize);
+ debugfs_create_u32("nfree", S_IRUGO, d, &a->nfree);
+ debugfs_create_u16("version_major", S_IRUGO, d, &a->version_major);
+ debugfs_create_u16("version_minor", S_IRUGO, d, &a->version_minor);
+ debugfs_create_x64("nextoff", S_IRUGO, d, &a->nextoff);
+ debugfs_create_x64("infooff", S_IRUGO, d, &a->infooff);
+ debugfs_create_x64("dataoff", S_IRUGO, d, &a->dataoff);
+ debugfs_create_x64("mapoff", S_IRUGO, d, &a->mapoff);
+ debugfs_create_x64("logoff", S_IRUGO, d, &a->logoff);
+ debugfs_create_x64("info2off", S_IRUGO, d, &a->info2off);
+ debugfs_create_x32("flags", S_IRUGO, d, &a->flags);
+}
+
+static void btt_debugfs_init(struct btt *btt)
+{
+ int i = 0;
+ struct arena_info *arena;
+
+ btt->debugfs_dir = debugfs_create_dir(dev_name(&btt->nd_btt->dev),
+ debugfs_root);
+ if (IS_ERR_OR_NULL(btt->debugfs_dir))
+ return;
+
+ list_for_each_entry(arena, &btt->arena_list, list) {
+ arena_debugfs_init(arena, btt->debugfs_dir, i);
+ i++;
+ }
+}
+
+/*
+ * This function accepts two log entries, and uses the
+ * sequence number to find the 'older' entry.
+ * It also initializes the sequence number of the first entry
+ * if it has never been written (sequence number zero).
+ * Finally, it returns which of the entries was the older one.
+ *
+ * TODO: The logic feels a bit kludgey; make it better.
+ */
+static int btt_log_get_old(struct log_entry *ent)
+{
+ int old;
+
+ /*
+ * The first time this is ever seen, the entry goes into [0];
+ * the next time, the following logic works out to put this
+ * (next) entry into [1].
+ */
+ if (ent[0].seq == 0) {
+ ent[0].seq = cpu_to_le32(1);
+ return 0;
+ }
+
+ if (ent[0].seq == ent[1].seq)
+ return -EINVAL;
+ if (le32_to_cpu(ent[0].seq) + le32_to_cpu(ent[1].seq) > 5)
+ return -EINVAL;
+
+ if (le32_to_cpu(ent[0].seq) < le32_to_cpu(ent[1].seq)) {
+ if (le32_to_cpu(ent[1].seq) - le32_to_cpu(ent[0].seq) == 1)
+ old = 0;
+ else
+ old = 1;
+ } else {
+ if (le32_to_cpu(ent[0].seq) - le32_to_cpu(ent[1].seq) == 1)
+ old = 1;
+ else
+ old = 0;
+ }
+
+ return old;
+}
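A standalone illustration of the cyclic sequence-number rule implemented above: valid sequence numbers cycle 1 -> 2 -> 3 -> 1, and of two valid entries the older one is the one the other was incremented from. Plain integers are used here instead of the on-media log_entry struct.

#include <stdint.h>
#include <stdio.h>

/* return 0 or 1 for the older slot, -1 for an invalid pair */
static int older_slot(uint32_t seq0, uint32_t seq1)
{
        if (seq0 == seq1)
                return -1;
        if (seq0 + seq1 > 5)    /* valid pairs sum to at most 2 + 3 */
                return -1;
        if (seq0 < seq1)
                return (seq1 - seq0 == 1) ? 0 : 1;
        return (seq0 - seq1 == 1) ? 1 : 0;
}

int main(void)
{
        printf("(1,2) older: %d\n", older_slot(1, 2)); /* 0 */
        printf("(3,1) older: %d\n", older_slot(3, 1)); /* 0: 1 follows 3 in the cycle */
        printf("(2,2) older: %d\n", older_slot(2, 2)); /* -1 */
        return 0;
}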
+
+static struct device *to_dev(struct arena_info *arena)
+{
+ return &arena->nd_btt->dev;
+}
+
+/*
+ * This function copies the desired (old/new) log entry into ent if
+ * it is not NULL. It returns the sub-slot number (0 or 1)
+ * where the desired log entry was found. Negative return values
+ * indicate errors.
+ */
+static int btt_log_read(struct arena_info *arena, u32 lane,
+ struct log_entry *ent, int old_flag)
+{
+ int ret;
+ int old_ent, ret_ent;
+ struct log_entry log[2];
+
+ ret = btt_log_read_pair(arena, lane, log);
+ if (ret)
+ return -EIO;
+
+ old_ent = btt_log_get_old(log);
+ if (old_ent < 0 || old_ent > 1) {
+ dev_info(to_dev(arena),
+ "log corruption (%d): lane %d seq [%d, %d]\n",
+ old_ent, lane, log[0].seq, log[1].seq);
+ /* TODO set error state? */
+ return -EIO;
+ }
+
+ ret_ent = (old_flag ? old_ent : (1 - old_ent));
+
+ if (ent != NULL)
+ memcpy(ent, &log[ret_ent], LOG_ENT_SIZE);
+
+ return ret_ent;
+}
+
+/*
+ * This function commits a log entry to media
+ * It does _not_ prepare the freelist entry for the next write
+ * btt_flog_write is the wrapper for updating the freelist elements
+ */
+static int __btt_log_write(struct arena_info *arena, u32 lane,
+ u32 sub, struct log_entry *ent)
+{
+ int ret;
+ /*
+ * Ignore the padding in log_entry for calculating log_half.
+ * The entry is 'committed' when we write the sequence number,
+ * and we want to ensure that that is the last thing written.
+ * We don't bother writing the padding as that would be extra
+ * media wear and write amplification
+ */
+ unsigned int log_half = (LOG_ENT_SIZE - 2 * sizeof(u64)) / 2;
+ u64 ns_off = arena->logoff + (((2 * lane) + sub) * LOG_ENT_SIZE);
+ void *src = ent;
+
+ /* split the 16B write into atomic, durable halves */
+ ret = arena_write_bytes(arena, ns_off, src, log_half);
+ if (ret)
+ return ret;
+
+ ns_off += log_half;
+ src += log_half;
+ return arena_write_bytes(arena, ns_off, src, log_half);
+}
+
+static int btt_flog_write(struct arena_info *arena, u32 lane, u32 sub,
+ struct log_entry *ent)
+{
+ int ret;
+
+ ret = __btt_log_write(arena, lane, sub, ent);
+ if (ret)
+ return ret;
+
+ /* prepare the next free entry */
+ arena->freelist[lane].sub = 1 - arena->freelist[lane].sub;
+ if (++(arena->freelist[lane].seq) == 4)
+ arena->freelist[lane].seq = 1;
+ arena->freelist[lane].block = le32_to_cpu(ent->old_map);
+
+ return ret;
+}
+
+/*
+ * This function initializes the BTT map to the initial state, which is
+ * all-zeroes, and indicates an identity mapping
+ */
+static int btt_map_init(struct arena_info *arena)
+{
+ int ret = -EINVAL;
+ void *zerobuf;
+ size_t offset = 0;
+ size_t chunk_size = SZ_2M;
+ size_t mapsize = arena->logoff - arena->mapoff;
+
+ zerobuf = kzalloc(chunk_size, GFP_KERNEL);
+ if (!zerobuf)
+ return -ENOMEM;
+
+ while (mapsize) {
+ size_t size = min(mapsize, chunk_size);
+
+ ret = arena_write_bytes(arena, arena->mapoff + offset, zerobuf,
+ size);
+ if (ret)
+ goto free;
+
+ offset += size;
+ mapsize -= size;
+ cond_resched();
+ }
+
+ free:
+ kfree(zerobuf);
+ return ret;
+}
+
+/*
+ * This function initializes the BTT log with 'fake' entries that
+ * mark the initial reserved set of blocks as free
+ */
+static int btt_log_init(struct arena_info *arena)
+{
+ int ret;
+ u32 i;
+ struct log_entry log, zerolog;
+
+ memset(&zerolog, 0, sizeof(zerolog));
+
+ for (i = 0; i < arena->nfree; i++) {
+ log.lba = cpu_to_le32(i);
+ log.old_map = cpu_to_le32(arena->external_nlba + i);
+ log.new_map = cpu_to_le32(arena->external_nlba + i);
+ log.seq = cpu_to_le32(LOG_SEQ_INIT);
+ ret = __btt_log_write(arena, i, 0, &log);
+ if (ret)
+ return ret;
+ ret = __btt_log_write(arena, i, 1, &zerolog);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int btt_freelist_init(struct arena_info *arena)
+{
+ int old, new, ret;
+ u32 i, map_entry;
+ struct log_entry log_new, log_old;
+
+ arena->freelist = kcalloc(arena->nfree, sizeof(struct free_entry),
+ GFP_KERNEL);
+ if (!arena->freelist)
+ return -ENOMEM;
+
+ for (i = 0; i < arena->nfree; i++) {
+ old = btt_log_read(arena, i, &log_old, LOG_OLD_ENT);
+ if (old < 0)
+ return old;
+
+ new = btt_log_read(arena, i, &log_new, LOG_NEW_ENT);
+ if (new < 0)
+ return new;
+
+ /* sub points to the next one to be overwritten */
+ arena->freelist[i].sub = 1 - new;
+ arena->freelist[i].seq = nd_inc_seq(le32_to_cpu(log_new.seq));
+ arena->freelist[i].block = le32_to_cpu(log_new.old_map);
+
+ /* This implies a newly created or untouched flog entry */
+ if (log_new.old_map == log_new.new_map)
+ continue;
+
+ /* Check if map recovery is needed */
+ ret = btt_map_read(arena, le32_to_cpu(log_new.lba), &map_entry,
+ NULL, NULL);
+ if (ret)
+ return ret;
+ if ((le32_to_cpu(log_new.new_map) != map_entry) &&
+ (le32_to_cpu(log_new.old_map) == map_entry)) {
+ /*
+ * Last transaction wrote the flog, but wasn't able
+ * to complete the map write. So fix up the map.
+ */
+ ret = btt_map_write(arena, le32_to_cpu(log_new.lba),
+ le32_to_cpu(log_new.new_map), 0, 0);
+ if (ret)
+ return ret;
+ }
+
+ }
+
+ return 0;
+}
+
+static int btt_rtt_init(struct arena_info *arena)
+{
+ arena->rtt = kcalloc(arena->nfree, sizeof(u32), GFP_KERNEL);
+ if (arena->rtt == NULL)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static int btt_maplocks_init(struct arena_info *arena)
+{
+ u32 i;
+
+ arena->map_locks = kcalloc(arena->nfree, sizeof(struct aligned_lock),
+ GFP_KERNEL);
+ if (!arena->map_locks)
+ return -ENOMEM;
+
+ for (i = 0; i < arena->nfree; i++)
+ spin_lock_init(&arena->map_locks[i].lock);
+
+ return 0;
+}
+
+static struct arena_info *alloc_arena(struct btt *btt, size_t size,
+ size_t start, size_t arena_off)
+{
+ struct arena_info *arena;
+ u64 logsize, mapsize, datasize;
+ u64 available = size;
+
+ arena = kzalloc(sizeof(struct arena_info), GFP_KERNEL);
+ if (!arena)
+ return NULL;
+ arena->nd_btt = btt->nd_btt;
+
+ if (!size)
+ return arena;
+
+ arena->size = size;
+ arena->external_lba_start = start;
+ arena->external_lbasize = btt->lbasize;
+ arena->internal_lbasize = roundup(arena->external_lbasize,
+ INT_LBASIZE_ALIGNMENT);
+ arena->nfree = BTT_DEFAULT_NFREE;
+ arena->version_major = 1;
+ arena->version_minor = 1;
+
+ if (available % BTT_PG_SIZE)
+ available -= (available % BTT_PG_SIZE);
+
+ /* Two pages are reserved for the super block and its copy */
+ available -= 2 * BTT_PG_SIZE;
+
+ /* The log takes a fixed amount of space based on nfree */
+ logsize = roundup(2 * arena->nfree * sizeof(struct log_entry),
+ BTT_PG_SIZE);
+ available -= logsize;
+
+ /* Calculate optimal split between map and data area */
+ arena->internal_nlba = div_u64(available - BTT_PG_SIZE,
+ arena->internal_lbasize + MAP_ENT_SIZE);
+ arena->external_nlba = arena->internal_nlba - arena->nfree;
+
+ mapsize = roundup((arena->external_nlba * MAP_ENT_SIZE), BTT_PG_SIZE);
+ datasize = available - mapsize;
+
+ /* 'Absolute' values, relative to start of storage space */
+ arena->infooff = arena_off;
+ arena->dataoff = arena->infooff + BTT_PG_SIZE;
+ arena->mapoff = arena->dataoff + datasize;
+ arena->logoff = arena->mapoff + mapsize;
+ arena->info2off = arena->logoff + logsize;
+ return arena;
+}
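A worked example of the layout arithmetic in alloc_arena() above, as a standalone program. The constants used here (4K pages, 4-byte map entries, 32-byte log entries, nfree of 256, 512-byte blocks, a 1 GiB arena) are assumed for illustration; the real values come from btt.h and the namespace geometry.

#include <inttypes.h>
#include <stdio.h>

int main(void)
{
        uint64_t size = 1ULL << 30;     /* assumed 1 GiB arena */
        uint64_t pg = 4096, map_ent = 4, log_ent = 32, nfree = 256;
        uint64_t lbasize = 512;
        uint64_t available = size - (size % pg);
        uint64_t logsize, mapsize, datasize, internal_nlba, external_nlba;

        available -= 2 * pg;                            /* info block + its copy */
        logsize = (2 * nfree * log_ent + pg - 1) / pg * pg; /* round up to a page */
        available -= logsize;
        internal_nlba = (available - pg) / (lbasize + map_ent);
        external_nlba = internal_nlba - nfree;
        mapsize = (external_nlba * map_ent + pg - 1) / pg * pg;
        datasize = available - mapsize;

        printf("log %" PRIu64 ", map %" PRIu64 ", data %" PRIu64
               ", external blocks %" PRIu64 "\n",
               logsize, mapsize, datasize, external_nlba);
        return 0;
}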
+
+static void free_arenas(struct btt *btt)
+{
+ struct arena_info *arena, *next;
+
+ list_for_each_entry_safe(arena, next, &btt->arena_list, list) {
+ list_del(&arena->list);
+ kfree(arena->rtt);
+ kfree(arena->map_locks);
+ kfree(arena->freelist);
+ debugfs_remove_recursive(arena->debugfs_dir);
+ kfree(arena);
+ }
+}
+
+/*
+ * This function checks if the metadata layout is valid and error free
+ */
+static int arena_is_valid(struct arena_info *arena, struct btt_sb *super,
+ u8 *uuid, u32 lbasize)
+{
+ u64 checksum;
+
+ if (memcmp(super->uuid, uuid, 16))
+ return 0;
+
+ checksum = le64_to_cpu(super->checksum);
+ super->checksum = 0;
+ if (checksum != nd_btt_sb_checksum(super))
+ return 0;
+ super->checksum = cpu_to_le64(checksum);
+
+ if (lbasize != le32_to_cpu(super->external_lbasize))
+ return 0;
+
+ /* TODO: figure out action for this */
+ if ((le32_to_cpu(super->flags) & IB_FLAG_ERROR_MASK) != 0)
+ dev_info(to_dev(arena), "Found arena with an error flag\n");
+
+ return 1;
+}
+
+/*
+ * This function reads an existing valid btt superblock and
+ * populates the corresponding arena_info struct
+ */
+static void parse_arena_meta(struct arena_info *arena, struct btt_sb *super,
+ u64 arena_off)
+{
+ arena->internal_nlba = le32_to_cpu(super->internal_nlba);
+ arena->internal_lbasize = le32_to_cpu(super->internal_lbasize);
+ arena->external_nlba = le32_to_cpu(super->external_nlba);
+ arena->external_lbasize = le32_to_cpu(super->external_lbasize);
+ arena->nfree = le32_to_cpu(super->nfree);
+ arena->version_major = le16_to_cpu(super->version_major);
+ arena->version_minor = le16_to_cpu(super->version_minor);
+
+ arena->nextoff = (super->nextoff == 0) ? 0 : (arena_off +
+ le64_to_cpu(super->nextoff));
+ arena->infooff = arena_off;
+ arena->dataoff = arena_off + le64_to_cpu(super->dataoff);
+ arena->mapoff = arena_off + le64_to_cpu(super->mapoff);
+ arena->logoff = arena_off + le64_to_cpu(super->logoff);
+ arena->info2off = arena_off + le64_to_cpu(super->info2off);
+
+ arena->size = (super->nextoff > 0) ? (le64_to_cpu(super->nextoff)) :
+ (arena->info2off - arena->infooff + BTT_PG_SIZE);
+
+ arena->flags = le32_to_cpu(super->flags);
+}
+
+static int discover_arenas(struct btt *btt)
+{
+ int ret = 0;
+ struct arena_info *arena;
+ struct btt_sb *super;
+ size_t remaining = btt->rawsize;
+ u64 cur_nlba = 0;
+ size_t cur_off = 0;
+ int num_arenas = 0;
+
+ super = kzalloc(sizeof(*super), GFP_KERNEL);
+ if (!super)
+ return -ENOMEM;
+
+ while (remaining) {
+ /* Alloc memory for arena */
+ arena = alloc_arena(btt, 0, 0, 0);
+ if (!arena) {
+ ret = -ENOMEM;
+ goto out_super;
+ }
+
+ arena->infooff = cur_off;
+ ret = btt_info_read(arena, super);
+ if (ret)
+ goto out;
+
+ if (!arena_is_valid(arena, super, btt->nd_btt->uuid,
+ btt->lbasize)) {
+ if (remaining == btt->rawsize) {
+ btt->init_state = INIT_NOTFOUND;
+ dev_info(to_dev(arena), "No existing arenas\n");
+ goto out;
+ } else {
+ dev_info(to_dev(arena),
+ "Found corrupted metadata!\n");
+ ret = -ENODEV;
+ goto out;
+ }
+ }
+
+ arena->external_lba_start = cur_nlba;
+ parse_arena_meta(arena, super, cur_off);
+
+ ret = btt_freelist_init(arena);
+ if (ret)
+ goto out;
+
+ ret = btt_rtt_init(arena);
+ if (ret)
+ goto out;
+
+ ret = btt_maplocks_init(arena);
+ if (ret)
+ goto out;
+
+ list_add_tail(&arena->list, &btt->arena_list);
+
+ remaining -= arena->size;
+ cur_off += arena->size;
+ cur_nlba += arena->external_nlba;
+ num_arenas++;
+
+ if (arena->nextoff == 0)
+ break;
+ }
+ btt->num_arenas = num_arenas;
+ btt->nlba = cur_nlba;
+ btt->init_state = INIT_READY;
+
+ kfree(super);
+ return ret;
+
+ out:
+ kfree(arena);
+ free_arenas(btt);
+ out_super:
+ kfree(super);
+ return ret;
+}
+
+static int create_arenas(struct btt *btt)
+{
+ size_t remaining = btt->rawsize;
+ size_t cur_off = 0;
+
+ while (remaining) {
+ struct arena_info *arena;
+ size_t arena_size = min_t(u64, ARENA_MAX_SIZE, remaining);
+
+ remaining -= arena_size;
+ if (arena_size < ARENA_MIN_SIZE)
+ break;
+
+ arena = alloc_arena(btt, arena_size, btt->nlba, cur_off);
+ if (!arena) {
+ free_arenas(btt);
+ return -ENOMEM;
+ }
+ btt->nlba += arena->external_nlba;
+ if (remaining >= ARENA_MIN_SIZE)
+ arena->nextoff = arena->size;
+ else
+ arena->nextoff = 0;
+ cur_off += arena_size;
+ list_add_tail(&arena->list, &btt->arena_list);
+ }
+
+ return 0;
+}
+
+/*
+ * This function completes arena initialization by writing
+ * all the metadata.
+ * It is only called for an uninitialized arena when a write
+ * to that arena occurs for the first time.
+ */
+static int btt_arena_write_layout(struct arena_info *arena, u8 *uuid)
+{
+ int ret;
+ struct btt_sb *super;
+
+ ret = btt_map_init(arena);
+ if (ret)
+ return ret;
+
+ ret = btt_log_init(arena);
+ if (ret)
+ return ret;
+
+ super = kzalloc(sizeof(struct btt_sb), GFP_NOIO);
+ if (!super)
+ return -ENOMEM;
+
+ strncpy(super->signature, BTT_SIG, BTT_SIG_LEN);
+ memcpy(super->uuid, uuid, 16);
+ super->flags = cpu_to_le32(arena->flags);
+ super->version_major = cpu_to_le16(arena->version_major);
+ super->version_minor = cpu_to_le16(arena->version_minor);
+ super->external_lbasize = cpu_to_le32(arena->external_lbasize);
+ super->external_nlba = cpu_to_le32(arena->external_nlba);
+ super->internal_lbasize = cpu_to_le32(arena->internal_lbasize);
+ super->internal_nlba = cpu_to_le32(arena->internal_nlba);
+ super->nfree = cpu_to_le32(arena->nfree);
+ super->infosize = cpu_to_le32(sizeof(struct btt_sb));
+ super->nextoff = cpu_to_le64(arena->nextoff);
+ /*
+ * Subtract arena->infooff (arena start) so numbers are relative
+ * to 'this' arena
+ */
+ super->dataoff = cpu_to_le64(arena->dataoff - arena->infooff);
+ super->mapoff = cpu_to_le64(arena->mapoff - arena->infooff);
+ super->logoff = cpu_to_le64(arena->logoff - arena->infooff);
+ super->info2off = cpu_to_le64(arena->info2off - arena->infooff);
+
+ super->flags = 0;
+ super->checksum = cpu_to_le64(nd_btt_sb_checksum(super));
+
+ ret = btt_info_write(arena, super);
+
+ kfree(super);
+ return ret;
+}
+
+/*
+ * This function completes the initialization for the BTT namespace
+ * such that it is ready to accept IOs
+ */
+static int btt_meta_init(struct btt *btt)
+{
+ int ret = 0;
+ struct arena_info *arena;
+
+ mutex_lock(&btt->init_lock);
+ list_for_each_entry(arena, &btt->arena_list, list) {
+ ret = btt_arena_write_layout(arena, btt->nd_btt->uuid);
+ if (ret)
+ goto unlock;
+
+ ret = btt_freelist_init(arena);
+ if (ret)
+ goto unlock;
+
+ ret = btt_rtt_init(arena);
+ if (ret)
+ goto unlock;
+
+ ret = btt_maplocks_init(arena);
+ if (ret)
+ goto unlock;
+ }
+
+ btt->init_state = INIT_READY;
+
+ unlock:
+ mutex_unlock(&btt->init_lock);
+ return ret;
+}
+
+static u32 btt_meta_size(struct btt *btt)
+{
+ return btt->lbasize - btt->sector_size;
+}
+
+/*
+ * This function calculates the arena in which the given LBA lies
+ * by doing a linear walk. This is acceptable since we expect only
+ * a few arenas. If we have backing devices that get much larger,
+ * we can construct a balanced binary tree of arenas at init time
+ * so that this range search becomes faster.
+ */
+static int lba_to_arena(struct btt *btt, sector_t sector, __u32 *premap,
+ struct arena_info **arena)
+{
+ struct arena_info *arena_list;
+ __u64 lba = div_u64(sector << SECTOR_SHIFT, btt->sector_size);
+
+ list_for_each_entry(arena_list, &btt->arena_list, list) {
+ if (lba < arena_list->external_nlba) {
+ *arena = arena_list;
+ *premap = lba;
+ return 0;
+ }
+ lba -= arena_list->external_nlba;
+ }
+
+ return -EIO;
+}
+
+/*
+ * The following (lock_map, unlock_map) are mostly just to improve
+ * readability, since they index into an array of locks
+ */
+static void lock_map(struct arena_info *arena, u32 premap)
+ __acquires(&arena->map_locks[idx].lock)
+{
+ u32 idx = (premap * MAP_ENT_SIZE / L1_CACHE_BYTES) % arena->nfree;
+
+ spin_lock(&arena->map_locks[idx].lock);
+}
+
+static void unlock_map(struct arena_info *arena, u32 premap)
+ __releases(&arena->map_locks[idx].lock)
+{
+ u32 idx = (premap * MAP_ENT_SIZE / L1_CACHE_BYTES) % arena->nfree;
+
+ spin_unlock(&arena->map_locks[idx].lock);
+}
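A minimal sketch of the striped-lock indexing used by lock_map()/unlock_map() above: map entries that share a cache line hash to the same lock, and the stripe count bounds the number of locks. pthread mutexes stand in for the kernel spinlocks, and the map-entry size, cache line size, and nfree values are assumed.

#include <pthread.h>
#include <stdio.h>

#define EX_MAP_ENT_SIZE 4       /* assumed: 4-byte map entries */
#define EX_CACHE_LINE   64      /* assumed: 64-byte cache lines */
#define EX_NFREE        256     /* assumed stripe count */

static pthread_mutex_t map_locks[EX_NFREE];

static unsigned int lock_idx(unsigned int premap)
{
        return (premap * EX_MAP_ENT_SIZE / EX_CACHE_LINE) % EX_NFREE;
}

int main(void)
{
        unsigned int i;

        for (i = 0; i < EX_NFREE; i++)
                pthread_mutex_init(&map_locks[i], NULL);

        /* entries 0..15 share a cache line, so they share a lock */
        printf("idx(0)=%u idx(15)=%u idx(16)=%u\n",
               lock_idx(0), lock_idx(15), lock_idx(16));

        pthread_mutex_lock(&map_locks[lock_idx(100)]);
        pthread_mutex_unlock(&map_locks[lock_idx(100)]);
        return 0;
}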
+
+static u64 to_namespace_offset(struct arena_info *arena, u64 lba)
+{
+ return arena->dataoff + ((u64)lba * arena->internal_lbasize);
+}
+
+static int btt_data_read(struct arena_info *arena, struct page *page,
+ unsigned int off, u32 lba, u32 len)
+{
+ int ret;
+ u64 nsoff = to_namespace_offset(arena, lba);
+ void *mem = kmap_atomic(page);
+
+ ret = arena_read_bytes(arena, nsoff, mem + off, len);
+ kunmap_atomic(mem);
+
+ return ret;
+}
+
+static int btt_data_write(struct arena_info *arena, u32 lba,
+ struct page *page, unsigned int off, u32 len)
+{
+ int ret;
+ u64 nsoff = to_namespace_offset(arena, lba);
+ void *mem = kmap_atomic(page);
+
+ ret = arena_write_bytes(arena, nsoff, mem + off, len);
+ kunmap_atomic(mem);
+
+ return ret;
+}
+
+static void zero_fill_data(struct page *page, unsigned int off, u32 len)
+{
+ void *mem = kmap_atomic(page);
+
+ memset(mem + off, 0, len);
+ kunmap_atomic(mem);
+}
+
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+static int btt_rw_integrity(struct btt *btt, struct bio_integrity_payload *bip,
+ struct arena_info *arena, u32 postmap, int rw)
+{
+ unsigned int len = btt_meta_size(btt);
+ u64 meta_nsoff;
+ int ret = 0;
+
+ if (bip == NULL)
+ return 0;
+
+ meta_nsoff = to_namespace_offset(arena, postmap) + btt->sector_size;
+
+ while (len) {
+ unsigned int cur_len;
+ struct bio_vec bv;
+ void *mem;
+
+ bv = bvec_iter_bvec(bip->bip_vec, bip->bip_iter);
+ /*
+ * The 'bv' obtained from bvec_iter_bvec has its .bv_len and
+ * .bv_offset already adjusted for iter->bi_bvec_done, and we
+ * can use those directly
+ */
+
+ cur_len = min(len, bv.bv_len);
+ mem = kmap_atomic(bv.bv_page);
+ if (rw)
+ ret = arena_write_bytes(arena, meta_nsoff,
+ mem + bv.bv_offset, cur_len);
+ else
+ ret = arena_read_bytes(arena, meta_nsoff,
+ mem + bv.bv_offset, cur_len);
+
+ kunmap_atomic(mem);
+ if (ret)
+ return ret;
+
+ len -= cur_len;
+ meta_nsoff += cur_len;
+ bvec_iter_advance(bip->bip_vec, &bip->bip_iter, cur_len);
+ }
+
+ return ret;
+}
+
+#else /* CONFIG_BLK_DEV_INTEGRITY */
+static int btt_rw_integrity(struct btt *btt, struct bio_integrity_payload *bip,
+ struct arena_info *arena, u32 postmap, int rw)
+{
+ return 0;
+}
+#endif
+
+static int btt_read_pg(struct btt *btt, struct bio_integrity_payload *bip,
+ struct page *page, unsigned int off, sector_t sector,
+ unsigned int len)
+{
+ int ret = 0;
+ int t_flag, e_flag;
+ struct arena_info *arena = NULL;
+ u32 lane = 0, premap, postmap;
+
+ while (len) {
+ u32 cur_len;
+
+ lane = nd_region_acquire_lane(btt->nd_region);
+
+ ret = lba_to_arena(btt, sector, &premap, &arena);
+ if (ret)
+ goto out_lane;
+
+ cur_len = min(btt->sector_size, len);
+
+ ret = btt_map_read(arena, premap, &postmap, &t_flag, &e_flag);
+ if (ret)
+ goto out_lane;
+
+ /*
+ * We loop to make sure that the post map LBA didn't change
+ * from under us between writing the RTT and doing the actual
+ * read.
+ */
+ while (1) {
+ u32 new_map;
+
+ if (t_flag) {
+ zero_fill_data(page, off, cur_len);
+ goto out_lane;
+ }
+
+ if (e_flag) {
+ ret = -EIO;
+ goto out_lane;
+ }
+
+ arena->rtt[lane] = RTT_VALID | postmap;
+ /*
+ * Barrier to make sure this write is not reordered
+ * to do the verification map_read before the RTT store
+ */
+ barrier();
+
+ ret = btt_map_read(arena, premap, &new_map, &t_flag,
+ &e_flag);
+ if (ret)
+ goto out_rtt;
+
+ if (postmap == new_map)
+ break;
+
+ postmap = new_map;
+ }
+
+ ret = btt_data_read(arena, page, off, postmap, cur_len);
+ if (ret)
+ goto out_rtt;
+
+ if (bip) {
+ ret = btt_rw_integrity(btt, bip, arena, postmap, READ);
+ if (ret)
+ goto out_rtt;
+ }
+
+ arena->rtt[lane] = RTT_INVALID;
+ nd_region_release_lane(btt->nd_region, lane);
+
+ len -= cur_len;
+ off += cur_len;
+ sector += btt->sector_size >> SECTOR_SHIFT;
+ }
+
+ return 0;
+
+ out_rtt:
+ arena->rtt[lane] = RTT_INVALID;
+ out_lane:
+ nd_region_release_lane(btt->nd_region, lane);
+ return ret;
+}
+
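+/*
+ * Write path summary: acquire a lane, take that lane's free block, wait
+ * for any reader tracking the block in the RTT, write the data and
+ * integrity bytes, then, under the map lock, record a log entry of
+ * (premap, old postmap, new postmap, seq) before updating the map entry.
+ */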
+static int btt_write_pg(struct btt *btt, struct bio_integrity_payload *bip,
+ sector_t sector, struct page *page, unsigned int off,
+ unsigned int len)
+{
+ int ret = 0;
+ struct arena_info *arena = NULL;
+ u32 premap = 0, old_postmap, new_postmap, lane = 0, i;
+ struct log_entry log;
+ int sub;
+
+ while (len) {
+ u32 cur_len;
+
+ lane = nd_region_acquire_lane(btt->nd_region);
+
+ ret = lba_to_arena(btt, sector, &premap, &arena);
+ if (ret)
+ goto out_lane;
+ cur_len = min(btt->sector_size, len);
+
+ if ((arena->flags & IB_FLAG_ERROR_MASK) != 0) {
+ ret = -EIO;
+ goto out_lane;
+ }
+
+ new_postmap = arena->freelist[lane].block;
+
+ /* Wait if the new block is being read from */
+ for (i = 0; i < arena->nfree; i++)
+ while (arena->rtt[i] == (RTT_VALID | new_postmap))
+ cpu_relax();
+
+ if (new_postmap >= arena->internal_nlba) {
+ ret = -EIO;
+ goto out_lane;
+ }
+
+ ret = btt_data_write(arena, new_postmap, page, off, cur_len);
+ if (ret)
+ goto out_lane;
+
+ if (bip) {
+ ret = btt_rw_integrity(btt, bip, arena, new_postmap,
+ WRITE);
+ if (ret)
+ goto out_lane;
+ }
+
+ lock_map(arena, premap);
+ ret = btt_map_read(arena, premap, &old_postmap, NULL, NULL);
+ if (ret)
+ goto out_map;
+ if (old_postmap >= arena->internal_nlba) {
+ ret = -EIO;
+ goto out_map;
+ }
+
+ log.lba = cpu_to_le32(premap);
+ log.old_map = cpu_to_le32(old_postmap);
+ log.new_map = cpu_to_le32(new_postmap);
+ log.seq = cpu_to_le32(arena->freelist[lane].seq);
+ sub = arena->freelist[lane].sub;
+ ret = btt_flog_write(arena, lane, sub, &log);
+ if (ret)
+ goto out_map;
+
+ ret = btt_map_write(arena, premap, new_postmap, 0, 0);
+ if (ret)
+ goto out_map;
+
+ unlock_map(arena, premap);
+ nd_region_release_lane(btt->nd_region, lane);
+
+ len -= cur_len;
+ off += cur_len;
+ sector += btt->sector_size >> SECTOR_SHIFT;
+ }
+
+ return 0;
+
+ out_map:
+ unlock_map(arena, premap);
+ out_lane:
+ nd_region_release_lane(btt->nd_region, lane);
+ return ret;
+}
+
+static int btt_do_bvec(struct btt *btt, struct bio_integrity_payload *bip,
+ struct page *page, unsigned int len, unsigned int off,
+ int rw, sector_t sector)
+{
+ int ret;
+
+ if (rw == READ) {
+ ret = btt_read_pg(btt, bip, page, off, sector, len);
+ flush_dcache_page(page);
+ } else {
+ flush_dcache_page(page);
+ ret = btt_write_pg(btt, bip, sector, page, off, len);
+ }
+
+ return ret;
+}
+
+static void btt_make_request(struct request_queue *q, struct bio *bio)
+{
+ struct bio_integrity_payload *bip = bio_integrity(bio);
+ struct btt *btt = q->queuedata;
+ struct bvec_iter iter;
+ unsigned long start;
+ struct bio_vec bvec;
+ int err = 0, rw;
+ bool do_acct;
+
+ /*
+ * bio_integrity_enabled also checks if the bio already has an
+ * integrity payload attached. If it does, we *don't* do a
+ * bio_integrity_prep here - the payload has been generated by
+ * another kernel subsystem, and we just pass it through.
+ */
+ if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
+ err = -EIO;
+ goto out;
+ }
+
+ do_acct = nd_iostat_start(bio, &start);
+ rw = bio_data_dir(bio);
+ bio_for_each_segment(bvec, bio, iter) {
+ unsigned int len = bvec.bv_len;
+
+ BUG_ON(len > PAGE_SIZE);
+ /* Make sure len is in multiples of sector size. */
+ /* XXX is this right? */
+ BUG_ON(len < btt->sector_size);
+ BUG_ON(len % btt->sector_size);
+
+ err = btt_do_bvec(btt, bip, bvec.bv_page, len, bvec.bv_offset,
+ rw, iter.bi_sector);
+ if (err) {
+ dev_info(&btt->nd_btt->dev,
+ "io error in %s sector %lld, len %d,\n",
+ (rw == READ) ? "READ" : "WRITE",
+ (unsigned long long) iter.bi_sector, len);
+ break;
+ }
+ }
+ if (do_acct)
+ nd_iostat_end(bio, start);
+
+out:
+ bio_endio(bio, err);
+}
+
+static int btt_rw_page(struct block_device *bdev, sector_t sector,
+ struct page *page, int rw)
+{
+ struct btt *btt = bdev->bd_disk->private_data;
+
+ btt_do_bvec(btt, NULL, page, PAGE_CACHE_SIZE, 0, rw, sector);
+ page_endio(page, rw & WRITE, 0);
+ return 0;
+}
+
+static int btt_getgeo(struct block_device *bd, struct hd_geometry *geo)
+{
+ /* some standard values */
+ geo->heads = 1 << 6;
+ geo->sectors = 1 << 5;
+ geo->cylinders = get_capacity(bd->bd_disk) >> 11;
+ return 0;
+}
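+
+/*
+ * With 64 heads and 32 sectors per track the fake geometry packs
+ * 2048 sectors per cylinder, hence the capacity >> 11 above.
+ */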
+
+static const struct block_device_operations btt_fops = {
+ .owner = THIS_MODULE,
+ .rw_page = btt_rw_page,
+ .getgeo = btt_getgeo,
+ .revalidate_disk = nvdimm_revalidate_disk,
+};
+
+static int btt_blk_init(struct btt *btt)
+{
+ struct nd_btt *nd_btt = btt->nd_btt;
+ struct nd_namespace_common *ndns = nd_btt->ndns;
+
+ /* create a new disk and request queue for btt */
+ btt->btt_queue = blk_alloc_queue(GFP_KERNEL);
+ if (!btt->btt_queue)
+ return -ENOMEM;
+
+ btt->btt_disk = alloc_disk(0);
+ if (!btt->btt_disk) {
+ blk_cleanup_queue(btt->btt_queue);
+ return -ENOMEM;
+ }
+
+ nvdimm_namespace_disk_name(ndns, btt->btt_disk->disk_name);
+ btt->btt_disk->driverfs_dev = &btt->nd_btt->dev;
+ btt->btt_disk->major = btt_major;
+ btt->btt_disk->first_minor = 0;
+ btt->btt_disk->fops = &btt_fops;
+ btt->btt_disk->private_data = btt;
+ btt->btt_disk->queue = btt->btt_queue;
+ btt->btt_disk->flags = GENHD_FL_EXT_DEVT;
+
+ blk_queue_make_request(btt->btt_queue, btt_make_request);
+ blk_queue_logical_block_size(btt->btt_queue, btt->sector_size);
+ blk_queue_max_hw_sectors(btt->btt_queue, UINT_MAX);
+ blk_queue_bounce_limit(btt->btt_queue, BLK_BOUNCE_ANY);
+ queue_flag_set_unlocked(QUEUE_FLAG_NONROT, btt->btt_queue);
+ btt->btt_queue->queuedata = btt;
+
+ set_capacity(btt->btt_disk, 0);
+ add_disk(btt->btt_disk);
+ if (btt_meta_size(btt)) {
+ int rc = nd_integrity_init(btt->btt_disk, btt_meta_size(btt));
+
+ if (rc) {
+ del_gendisk(btt->btt_disk);
+ put_disk(btt->btt_disk);
+ blk_cleanup_queue(btt->btt_queue);
+ return rc;
+ }
+ }
+ set_capacity(btt->btt_disk, btt->nlba * btt->sector_size >> 9);
+ revalidate_disk(btt->btt_disk);
+
+ return 0;
+}
+
+static void btt_blk_cleanup(struct btt *btt)
+{
+ blk_integrity_unregister(btt->btt_disk);
+ del_gendisk(btt->btt_disk);
+ put_disk(btt->btt_disk);
+ blk_cleanup_queue(btt->btt_queue);
+}
+
+/**
+ * btt_init - initialize a block translation table for the given device
+ * @nd_btt: device with BTT geometry and backing device info
+ * @rawsize: raw size in bytes of the backing device
+ * @lbasize: lba size of the backing device
+ * @uuid: A uuid for the backing device - this is stored on media
+ * @nd_region: parent region containing this btt; provides the IO lanes
+ *
+ * Initialize a Block Translation Table on a backing device to provide
+ * single sector power fail atomicity.
+ *
+ * Context:
+ * Might sleep.
+ *
+ * Returns:
+ * Pointer to a new struct btt on success, NULL on failure.
+ */
+static struct btt *btt_init(struct nd_btt *nd_btt, unsigned long long rawsize,
+ u32 lbasize, u8 *uuid, struct nd_region *nd_region)
+{
+ int ret;
+ struct btt *btt;
+ struct device *dev = &nd_btt->dev;
+
+ btt = kzalloc(sizeof(struct btt), GFP_KERNEL);
+ if (!btt)
+ return NULL;
+
+ btt->nd_btt = nd_btt;
+ btt->rawsize = rawsize;
+ btt->lbasize = lbasize;
+ btt->sector_size = ((lbasize >= 4096) ? 4096 : 512);
+ INIT_LIST_HEAD(&btt->arena_list);
+ mutex_init(&btt->init_lock);
+ btt->nd_region = nd_region;
+
+ ret = discover_arenas(btt);
+ if (ret) {
+ dev_err(dev, "init: error in arena_discover: %d\n", ret);
+ goto out_free;
+ }
+
+ if (btt->init_state != INIT_READY && nd_region->ro) {
+ dev_info(dev, "%s is read-only, unable to init btt metadata\n",
+ dev_name(&nd_region->dev));
+ goto out_free;
+ } else if (btt->init_state != INIT_READY) {
+ btt->num_arenas = (rawsize / ARENA_MAX_SIZE) +
+ ((rawsize % ARENA_MAX_SIZE) ? 1 : 0);
+ dev_dbg(dev, "init: %d arenas for %llu rawsize\n",
+ btt->num_arenas, rawsize);
+
+ ret = create_arenas(btt);
+ if (ret) {
+ dev_info(dev, "init: create_arenas: %d\n", ret);
+ goto out_free;
+ }
+
+ ret = btt_meta_init(btt);
+ if (ret) {
+ dev_err(dev, "init: error in meta_init: %d\n", ret);
+ goto out_free;
+ }
+ }
+
+ ret = btt_blk_init(btt);
+ if (ret) {
+ dev_err(dev, "init: error in blk_init: %d\n", ret);
+ goto out_free;
+ }
+
+ btt_debugfs_init(btt);
+
+ return btt;
+
+ out_free:
+ kfree(btt);
+ return NULL;
+}
+
+/**
+ * btt_fini - de-initialize a BTT
+ * @btt: the BTT handle that was generated by btt_init
+ *
+ * De-initialize a Block Translation Table on device removal
+ *
+ * Context:
+ * Might sleep.
+ */
+static void btt_fini(struct btt *btt)
+{
+ if (btt) {
+ btt_blk_cleanup(btt);
+ free_arenas(btt);
+ debugfs_remove_recursive(btt->debugfs_dir);
+ kfree(btt);
+ }
+}
+
+int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns)
+{
+ struct nd_btt *nd_btt = to_nd_btt(ndns->claim);
+ struct nd_region *nd_region;
+ struct btt *btt;
+ size_t rawsize;
+
+ if (!nd_btt->uuid || !nd_btt->ndns || !nd_btt->lbasize)
+ return -ENODEV;
+
+ rawsize = nvdimm_namespace_capacity(ndns) - SZ_4K;
+ if (rawsize < ARENA_MIN_SIZE)
+ return -ENXIO;
+ nd_region = to_nd_region(nd_btt->dev.parent);
+ btt = btt_init(nd_btt, rawsize, nd_btt->lbasize, nd_btt->uuid,
+ nd_region);
+ if (!btt)
+ return -ENOMEM;
+ nd_btt->btt = btt;
+
+ return 0;
+}
+EXPORT_SYMBOL(nvdimm_namespace_attach_btt);
+
+int nvdimm_namespace_detach_btt(struct nd_namespace_common *ndns)
+{
+ struct nd_btt *nd_btt = to_nd_btt(ndns->claim);
+ struct btt *btt = nd_btt->btt;
+
+ btt_fini(btt);
+ nd_btt->btt = NULL;
+
+ return 0;
+}
+EXPORT_SYMBOL(nvdimm_namespace_detach_btt);
+
+static int __init nd_btt_init(void)
+{
+ int rc;
+
+ BUILD_BUG_ON(sizeof(struct btt_sb) != SZ_4K);
+
+ btt_major = register_blkdev(0, "btt");
+ if (btt_major < 0)
+ return btt_major;
+
+ debugfs_root = debugfs_create_dir("btt", NULL);
+ if (IS_ERR_OR_NULL(debugfs_root)) {
+ rc = -ENXIO;
+ goto err_debugfs;
+ }
+
+ return 0;
+
+ err_debugfs:
+ unregister_blkdev(btt_major, "btt");
+
+ return rc;
+}
+
+static void __exit nd_btt_exit(void)
+{
+ debugfs_remove_recursive(debugfs_root);
+ unregister_blkdev(btt_major, "btt");
+}
+
+MODULE_ALIAS_ND_DEVICE(ND_DEVICE_BTT);
+MODULE_AUTHOR("Vishal Verma <vishal.l.verma@linux.intel.com>");
+MODULE_LICENSE("GPL v2");
+module_init(nd_btt_init);
+module_exit(nd_btt_exit);
diff --git a/drivers/nvdimm/btt.h b/drivers/nvdimm/btt.h
new file mode 100644
index 000000000000..75b0d80a6bd9
--- /dev/null
+++ b/drivers/nvdimm/btt.h
@@ -0,0 +1,185 @@
+/*
+ * Block Translation Table library
+ * Copyright (c) 2014-2015, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef _LINUX_BTT_H
+#define _LINUX_BTT_H
+
+#include <linux/types.h>
+
+#define BTT_SIG_LEN 16
+#define BTT_SIG "BTT_ARENA_INFO\0"
+#define MAP_ENT_SIZE 4
+#define MAP_TRIM_SHIFT 31
+#define MAP_TRIM_MASK (1 << MAP_TRIM_SHIFT)
+#define MAP_ERR_SHIFT 30
+#define MAP_ERR_MASK (1 << MAP_ERR_SHIFT)
+#define MAP_LBA_MASK (~((1 << MAP_TRIM_SHIFT) | (1 << MAP_ERR_SHIFT)))
+#define MAP_ENT_NORMAL 0xC0000000
+#define LOG_ENT_SIZE sizeof(struct log_entry)
+#define ARENA_MIN_SIZE (1UL << 24) /* 16 MB */
+#define ARENA_MAX_SIZE (1ULL << 39) /* 512 GB */
+#define RTT_VALID (1UL << 31)
+#define RTT_INVALID 0
+#define BTT_PG_SIZE 4096
+#define BTT_DEFAULT_NFREE ND_MAX_LANES
+#define LOG_SEQ_INIT 1
+
+#define IB_FLAG_ERROR 0x00000001
+#define IB_FLAG_ERROR_MASK 0x00000001
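+
+/*
+ * A map entry is a 32-bit word: bit 31 carries the trim/zero flag,
+ * bit 30 the error flag, and the low 30 bits (MAP_LBA_MASK) hold the
+ * postmap block. MAP_ENT_NORMAL sets both flag bits, the encoding for
+ * an ordinary mapped block.
+ */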
+
+enum btt_init_state {
+ INIT_UNCHECKED = 0,
+ INIT_NOTFOUND,
+ INIT_READY
+};
+
+struct log_entry {
+ __le32 lba;
+ __le32 old_map;
+ __le32 new_map;
+ __le32 seq;
+ __le64 padding[2];
+};
+
+struct btt_sb {
+ u8 signature[BTT_SIG_LEN];
+ u8 uuid[16];
+ u8 parent_uuid[16];
+ __le32 flags;
+ __le16 version_major;
+ __le16 version_minor;
+ __le32 external_lbasize;
+ __le32 external_nlba;
+ __le32 internal_lbasize;
+ __le32 internal_nlba;
+ __le32 nfree;
+ __le32 infosize;
+ __le64 nextoff;
+ __le64 dataoff;
+ __le64 mapoff;
+ __le64 logoff;
+ __le64 info2off;
+ u8 padding[3968];
+ __le64 checksum;
+};
+
+struct free_entry {
+ u32 block;
+ u8 sub;
+ u8 seq;
+};
+
+struct aligned_lock {
+ union {
+ spinlock_t lock;
+ u8 cacheline_padding[L1_CACHE_BYTES];
+ };
+};
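+
+/*
+ * Padding each map lock out to a cache line keeps lanes that take
+ * adjacent locks from false-sharing the same line.
+ */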
+
+/**
+ * struct arena_info - handle for an arena
+ * @size: Size in bytes this arena occupies on the raw device.
+ * This includes arena metadata.
+ * @external_lba_start: The first external LBA in this arena.
+ * @internal_nlba: Number of internal blocks available in the arena
+ * including nfree reserved blocks
+ * @internal_lbasize: Internal and external lba sizes may be different as
+ * we can round up 'odd' external lbasizes such as 520B
+ * to be aligned.
+ * @external_nlba: Number of blocks contributed by the arena to the number
+ * reported to upper layers. (internal_nlba - nfree)
+ * @external_lbasize: LBA size as exposed to upper layers.
+ * @nfree: A reserve number of 'free' blocks that is used to
+ * handle incoming writes.
+ * @version_major: Metadata layout version major.
+ * @version_minor: Metadata layout version minor.
+ * @nextoff: Offset in bytes to the start of the next arena.
+ * @infooff: Offset in bytes to the info block of this arena.
+ * @dataoff: Offset in bytes to the data area of this arena.
+ * @mapoff: Offset in bytes to the map area of this arena.
+ * @logoff: Offset in bytes to the log area of this arena.
+ * @info2off: Offset in bytes to the backup info block of this arena.
+ * @freelist: Pointer to in-memory list of free blocks
+ * @rtt: Pointer to in-memory "Read Tracking Table"
+ * @map_locks: Spinlocks protecting concurrent map writes
+ * @nd_btt: Pointer to parent nd_btt structure.
+ * @list: List head for list of arenas
+ * @debugfs_dir: Debugfs dentry
+ * @flags: Arena flags - may signify error states.
+ *
+ * arena_info is a per-arena handle. Once an arena is narrowed down for an
+ * IO, this struct is passed around for the duration of the IO.
+ */
+struct arena_info {
+ u64 size; /* Total bytes for this arena */
+ u64 external_lba_start;
+ u32 internal_nlba;
+ u32 internal_lbasize;
+ u32 external_nlba;
+ u32 external_lbasize;
+ u32 nfree;
+ u16 version_major;
+ u16 version_minor;
+ /* Byte offsets to the different on-media structures */
+ u64 nextoff;
+ u64 infooff;
+ u64 dataoff;
+ u64 mapoff;
+ u64 logoff;
+ u64 info2off;
+ /* Pointers to other in-memory structures for this arena */
+ struct free_entry *freelist;
+ u32 *rtt;
+ struct aligned_lock *map_locks;
+ struct nd_btt *nd_btt;
+ struct list_head list;
+ struct dentry *debugfs_dir;
+ /* Arena flags */
+ u32 flags;
+};
+
+/**
+ * struct btt - handle for a BTT instance
+ * @btt_disk: Pointer to the gendisk for BTT device
+ * @btt_queue: Pointer to the request queue for the BTT device
+ * @arena_list: Head of the list of arenas
+ * @debugfs_dir: Debugfs dentry
+ * @nd_btt: Parent nd_btt struct
+ * @nlba: Number of logical blocks exposed to the upper layers
+ * after removing the amount of space needed by metadata
+ * @rawsize: Total size in bytes of the available backing device
+ * @lbasize: LBA size as requested and presented to upper layers.
+ * This is sector_size + size of any metadata.
+ * @sector_size: The Linux sector size - 512 or 4096
+ * @nd_region: Parent nd_region; IO lanes are acquired from it
+ * @init_lock: Mutex used for the BTT initialization
+ * @init_state: Flag describing the initialization state for the BTT
+ * @num_arenas: Number of arenas in the BTT instance
+ */
+struct btt {
+ struct gendisk *btt_disk;
+ struct request_queue *btt_queue;
+ struct list_head arena_list;
+ struct dentry *debugfs_dir;
+ struct nd_btt *nd_btt;
+ u64 nlba;
+ unsigned long long rawsize;
+ u32 lbasize;
+ u32 sector_size;
+ struct nd_region *nd_region;
+ struct mutex init_lock;
+ int init_state;
+ int num_arenas;
+};
+#endif
diff --git a/drivers/nvdimm/btt_devs.c b/drivers/nvdimm/btt_devs.c
new file mode 100644
index 000000000000..6ac8c0fea3ec
--- /dev/null
+++ b/drivers/nvdimm/btt_devs.c
@@ -0,0 +1,425 @@
+/*
+ * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/blkdev.h>
+#include <linux/device.h>
+#include <linux/genhd.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include "nd-core.h"
+#include "btt.h"
+#include "nd.h"
+
+static void __nd_btt_detach_ndns(struct nd_btt *nd_btt)
+{
+ struct nd_namespace_common *ndns = nd_btt->ndns;
+
+ dev_WARN_ONCE(&nd_btt->dev, !mutex_is_locked(&ndns->dev.mutex)
+ || ndns->claim != &nd_btt->dev,
+ "%s: invalid claim\n", __func__);
+ ndns->claim = NULL;
+ nd_btt->ndns = NULL;
+ put_device(&ndns->dev);
+}
+
+static void nd_btt_detach_ndns(struct nd_btt *nd_btt)
+{
+ struct nd_namespace_common *ndns = nd_btt->ndns;
+
+ if (!ndns)
+ return;
+ get_device(&ndns->dev);
+ device_lock(&ndns->dev);
+ __nd_btt_detach_ndns(nd_btt);
+ device_unlock(&ndns->dev);
+ put_device(&ndns->dev);
+}
+
+static bool __nd_btt_attach_ndns(struct nd_btt *nd_btt,
+ struct nd_namespace_common *ndns)
+{
+ if (ndns->claim)
+ return false;
+ dev_WARN_ONCE(&nd_btt->dev, !mutex_is_locked(&ndns->dev.mutex)
+ || nd_btt->ndns,
+ "%s: invalid claim\n", __func__);
+ ndns->claim = &nd_btt->dev;
+ nd_btt->ndns = ndns;
+ get_device(&ndns->dev);
+ return true;
+}
+
+static bool nd_btt_attach_ndns(struct nd_btt *nd_btt,
+ struct nd_namespace_common *ndns)
+{
+ bool claimed;
+
+ device_lock(&ndns->dev);
+ claimed = __nd_btt_attach_ndns(nd_btt, ndns);
+ device_unlock(&ndns->dev);
+ return claimed;
+}
+
+static void nd_btt_release(struct device *dev)
+{
+ struct nd_region *nd_region = to_nd_region(dev->parent);
+ struct nd_btt *nd_btt = to_nd_btt(dev);
+
+ dev_dbg(dev, "%s\n", __func__);
+ nd_btt_detach_ndns(nd_btt);
+ ida_simple_remove(&nd_region->btt_ida, nd_btt->id);
+ kfree(nd_btt->uuid);
+ kfree(nd_btt);
+}
+
+static struct device_type nd_btt_device_type = {
+ .name = "nd_btt",
+ .release = nd_btt_release,
+};
+
+bool is_nd_btt(struct device *dev)
+{
+ return dev->type == &nd_btt_device_type;
+}
+EXPORT_SYMBOL(is_nd_btt);
+
+struct nd_btt *to_nd_btt(struct device *dev)
+{
+ struct nd_btt *nd_btt = container_of(dev, struct nd_btt, dev);
+
+ WARN_ON(!is_nd_btt(dev));
+ return nd_btt;
+}
+EXPORT_SYMBOL(to_nd_btt);
+
+static const unsigned long btt_lbasize_supported[] = { 512, 520, 528,
+ 4096, 4104, 4160, 4224, 0 };
+
+static ssize_t sector_size_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_btt *nd_btt = to_nd_btt(dev);
+
+ return nd_sector_size_show(nd_btt->lbasize, btt_lbasize_supported, buf);
+}
+
+static ssize_t sector_size_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+ struct nd_btt *nd_btt = to_nd_btt(dev);
+ ssize_t rc;
+
+ device_lock(dev);
+ nvdimm_bus_lock(dev);
+ rc = nd_sector_size_store(dev, buf, &nd_btt->lbasize,
+ btt_lbasize_supported);
+ dev_dbg(dev, "%s: result: %zd wrote: %s%s", __func__,
+ rc, buf, buf[len - 1] == '\n' ? "" : "\n");
+ nvdimm_bus_unlock(dev);
+ device_unlock(dev);
+
+ return rc ? rc : len;
+}
+static DEVICE_ATTR_RW(sector_size);
+
+static ssize_t uuid_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_btt *nd_btt = to_nd_btt(dev);
+
+ if (nd_btt->uuid)
+ return sprintf(buf, "%pUb\n", nd_btt->uuid);
+ return sprintf(buf, "\n");
+}
+
+static ssize_t uuid_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+ struct nd_btt *nd_btt = to_nd_btt(dev);
+ ssize_t rc;
+
+ device_lock(dev);
+ rc = nd_uuid_store(dev, &nd_btt->uuid, buf, len);
+ dev_dbg(dev, "%s: result: %zd wrote: %s%s", __func__,
+ rc, buf, buf[len - 1] == '\n' ? "" : "\n");
+ device_unlock(dev);
+
+ return rc ? rc : len;
+}
+static DEVICE_ATTR_RW(uuid);
+
+static ssize_t namespace_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_btt *nd_btt = to_nd_btt(dev);
+ ssize_t rc;
+
+ nvdimm_bus_lock(dev);
+ rc = sprintf(buf, "%s\n", nd_btt->ndns
+ ? dev_name(&nd_btt->ndns->dev) : "");
+ nvdimm_bus_unlock(dev);
+ return rc;
+}
+
+static int namespace_match(struct device *dev, void *data)
+{
+ char *name = data;
+
+ return strcmp(name, dev_name(dev)) == 0;
+}
+
+static bool is_nd_btt_idle(struct device *dev)
+{
+ struct nd_region *nd_region = to_nd_region(dev->parent);
+ struct nd_btt *nd_btt = to_nd_btt(dev);
+
+ if (nd_region->btt_seed == dev || nd_btt->ndns || dev->driver)
+ return false;
+ return true;
+}
+
+static ssize_t __namespace_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+ struct nd_btt *nd_btt = to_nd_btt(dev);
+ struct nd_namespace_common *ndns;
+ struct device *found;
+ char *name;
+
+ if (dev->driver) {
+ dev_dbg(dev, "%s: -EBUSY\n", __func__);
+ return -EBUSY;
+ }
+
+ name = kstrndup(buf, len, GFP_KERNEL);
+ if (!name)
+ return -ENOMEM;
+ strim(name);
+
+ if (strncmp(name, "namespace", 9) == 0 || strcmp(name, "") == 0)
+ /* pass */;
+ else {
+ len = -EINVAL;
+ goto out;
+ }
+
+ ndns = nd_btt->ndns;
+ if (strcmp(name, "") == 0) {
+ /* detach the namespace and destroy / reset the btt device */
+ nd_btt_detach_ndns(nd_btt);
+ if (is_nd_btt_idle(dev))
+ nd_device_unregister(dev, ND_ASYNC);
+ else {
+ nd_btt->lbasize = 0;
+ kfree(nd_btt->uuid);
+ nd_btt->uuid = NULL;
+ }
+ goto out;
+ } else if (ndns) {
+ dev_dbg(dev, "namespace already set to: %s\n",
+ dev_name(&ndns->dev));
+ len = -EBUSY;
+ goto out;
+ }
+
+ found = device_find_child(dev->parent, name, namespace_match);
+ if (!found) {
+ dev_dbg(dev, "'%s' not found under %s\n", name,
+ dev_name(dev->parent));
+ len = -ENODEV;
+ goto out;
+ }
+
+ ndns = to_ndns(found);
+ if (__nvdimm_namespace_capacity(ndns) < SZ_16M) {
+ dev_dbg(dev, "%s too small to host btt\n", name);
+ len = -ENXIO;
+ goto out_attach;
+ }
+
+ WARN_ON_ONCE(!is_nvdimm_bus_locked(&nd_btt->dev));
+ if (!nd_btt_attach_ndns(nd_btt, ndns)) {
+ dev_dbg(dev, "%s already claimed\n",
+ dev_name(&ndns->dev));
+ len = -EBUSY;
+ }
+
+ out_attach:
+ put_device(&ndns->dev); /* from device_find_child */
+ out:
+ kfree(name);
+ return len;
+}
+
+static ssize_t namespace_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+ ssize_t rc;
+
+ nvdimm_bus_lock(dev);
+ device_lock(dev);
+ rc = __namespace_store(dev, attr, buf, len);
+ dev_dbg(dev, "%s: result: %zd wrote: %s%s", __func__,
+ rc, buf, buf[len - 1] == '\n' ? "" : "\n");
+ device_unlock(dev);
+ nvdimm_bus_unlock(dev);
+
+ return rc;
+}
+static DEVICE_ATTR_RW(namespace);
+
+static struct attribute *nd_btt_attributes[] = {
+ &dev_attr_sector_size.attr,
+ &dev_attr_namespace.attr,
+ &dev_attr_uuid.attr,
+ NULL,
+};
+
+static struct attribute_group nd_btt_attribute_group = {
+ .attrs = nd_btt_attributes,
+};
+
+static const struct attribute_group *nd_btt_attribute_groups[] = {
+ &nd_btt_attribute_group,
+ &nd_device_attribute_group,
+ &nd_numa_attribute_group,
+ NULL,
+};
+
+static struct device *__nd_btt_create(struct nd_region *nd_region,
+ unsigned long lbasize, u8 *uuid,
+ struct nd_namespace_common *ndns)
+{
+ struct nd_btt *nd_btt;
+ struct device *dev;
+
+ nd_btt = kzalloc(sizeof(*nd_btt), GFP_KERNEL);
+ if (!nd_btt)
+ return NULL;
+
+ nd_btt->id = ida_simple_get(&nd_region->btt_ida, 0, 0, GFP_KERNEL);
+ if (nd_btt->id < 0) {
+ kfree(nd_btt);
+ return NULL;
+ }
+
+ nd_btt->lbasize = lbasize;
+ if (uuid)
+ uuid = kmemdup(uuid, 16, GFP_KERNEL);
+ nd_btt->uuid = uuid;
+ dev = &nd_btt->dev;
+ dev_set_name(dev, "btt%d.%d", nd_region->id, nd_btt->id);
+ dev->parent = &nd_region->dev;
+ dev->type = &nd_btt_device_type;
+ dev->groups = nd_btt_attribute_groups;
+ device_initialize(&nd_btt->dev);
+ if (ndns && !__nd_btt_attach_ndns(nd_btt, ndns)) {
+ dev_dbg(&ndns->dev, "%s failed, already claimed by %s\n",
+ __func__, dev_name(ndns->claim));
+ put_device(dev);
+ return NULL;
+ }
+ return dev;
+}
+
+struct device *nd_btt_create(struct nd_region *nd_region)
+{
+ struct device *dev = __nd_btt_create(nd_region, 0, NULL, NULL);
+
+ if (dev)
+ __nd_device_register(dev);
+ return dev;
+}
+
+/*
+ * nd_btt_sb_checksum: compute checksum for btt info block
+ *
+ * Returns a fletcher64 checksum of the given info block, computed with
+ * the checksum field itself zeroed (since that is where the result lives).
+ */
+u64 nd_btt_sb_checksum(struct btt_sb *btt_sb)
+{
+ u64 sum;
+ __le64 sum_save;
+
+ sum_save = btt_sb->checksum;
+ btt_sb->checksum = 0;
+ sum = nd_fletcher64(btt_sb, sizeof(*btt_sb), 1);
+ btt_sb->checksum = sum_save;
+ return sum;
+}
+EXPORT_SYMBOL(nd_btt_sb_checksum);
+
+static int __nd_btt_probe(struct nd_btt *nd_btt,
+ struct nd_namespace_common *ndns, struct btt_sb *btt_sb)
+{
+ u64 checksum;
+
+ if (!btt_sb || !ndns || !nd_btt)
+ return -ENODEV;
+
+ if (nvdimm_read_bytes(ndns, SZ_4K, btt_sb, sizeof(*btt_sb)))
+ return -ENXIO;
+
+ if (nvdimm_namespace_capacity(ndns) < SZ_16M)
+ return -ENXIO;
+
+ if (memcmp(btt_sb->signature, BTT_SIG, BTT_SIG_LEN) != 0)
+ return -ENODEV;
+
+ checksum = le64_to_cpu(btt_sb->checksum);
+ btt_sb->checksum = 0;
+ if (checksum != nd_btt_sb_checksum(btt_sb))
+ return -ENODEV;
+ btt_sb->checksum = cpu_to_le64(checksum);
+
+ nd_btt->lbasize = le32_to_cpu(btt_sb->external_lbasize);
+ nd_btt->uuid = kmemdup(btt_sb->uuid, 16, GFP_KERNEL);
+ if (!nd_btt->uuid)
+ return -ENOMEM;
+
+ __nd_device_register(&nd_btt->dev);
+
+ return 0;
+}
+
+int nd_btt_probe(struct nd_namespace_common *ndns, void *drvdata)
+{
+ int rc;
+ struct device *dev;
+ struct btt_sb *btt_sb;
+ struct nd_region *nd_region = to_nd_region(ndns->dev.parent);
+
+ if (ndns->force_raw)
+ return -ENODEV;
+
+ nvdimm_bus_lock(&ndns->dev);
+ dev = __nd_btt_create(nd_region, 0, NULL, ndns);
+ nvdimm_bus_unlock(&ndns->dev);
+ if (!dev)
+ return -ENOMEM;
+ dev_set_drvdata(dev, drvdata);
+ btt_sb = kzalloc(sizeof(*btt_sb), GFP_KERNEL);
+ rc = __nd_btt_probe(to_nd_btt(dev), ndns, btt_sb);
+ kfree(btt_sb);
+ dev_dbg(&ndns->dev, "%s: btt: %s\n", __func__,
+ rc == 0 ? dev_name(dev) : "<none>");
+ if (rc < 0) {
+ __nd_btt_detach_ndns(to_nd_btt(dev));
+ put_device(dev);
+ }
+
+ return rc;
+}
+EXPORT_SYMBOL(nd_btt_probe);
diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
new file mode 100644
index 000000000000..8eb22c0ca7ce
--- /dev/null
+++ b/drivers/nvdimm/bus.c
@@ -0,0 +1,730 @@
+/*
+ * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/vmalloc.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/blkdev.h>
+#include <linux/fcntl.h>
+#include <linux/async.h>
+#include <linux/genhd.h>
+#include <linux/ndctl.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/io.h>
+#include <linux/mm.h>
+#include <linux/nd.h>
+#include "nd-core.h"
+#include "nd.h"
+
+int nvdimm_major;
+static int nvdimm_bus_major;
+static struct class *nd_class;
+
+static int to_nd_device_type(struct device *dev)
+{
+ if (is_nvdimm(dev))
+ return ND_DEVICE_DIMM;
+ else if (is_nd_pmem(dev))
+ return ND_DEVICE_REGION_PMEM;
+ else if (is_nd_blk(dev))
+ return ND_DEVICE_REGION_BLK;
+ else if (is_nd_pmem(dev->parent) || is_nd_blk(dev->parent))
+ return nd_region_to_nstype(to_nd_region(dev->parent));
+
+ return 0;
+}
+
+static int nvdimm_bus_uevent(struct device *dev, struct kobj_uevent_env *env)
+{
+ /*
+ * Ensure that region devices always have their numa node set as
+ * early as possible.
+ */
+ if (is_nd_pmem(dev) || is_nd_blk(dev))
+ set_dev_node(dev, to_nd_region(dev)->numa_node);
+ return add_uevent_var(env, "MODALIAS=" ND_DEVICE_MODALIAS_FMT,
+ to_nd_device_type(dev));
+}
+
+static int nvdimm_bus_match(struct device *dev, struct device_driver *drv)
+{
+ struct nd_device_driver *nd_drv = to_nd_device_driver(drv);
+
+ return test_bit(to_nd_device_type(dev), &nd_drv->type);
+}
+
+static struct module *to_bus_provider(struct device *dev)
+{
+ /* pin bus providers while regions are enabled */
+ if (is_nd_pmem(dev) || is_nd_blk(dev)) {
+ struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
+
+ return nvdimm_bus->module;
+ }
+ return NULL;
+}
+
+static void nvdimm_bus_probe_start(struct nvdimm_bus *nvdimm_bus)
+{
+ nvdimm_bus_lock(&nvdimm_bus->dev);
+ nvdimm_bus->probe_active++;
+ nvdimm_bus_unlock(&nvdimm_bus->dev);
+}
+
+static void nvdimm_bus_probe_end(struct nvdimm_bus *nvdimm_bus)
+{
+ nvdimm_bus_lock(&nvdimm_bus->dev);
+ if (--nvdimm_bus->probe_active == 0)
+ wake_up(&nvdimm_bus->probe_wait);
+ nvdimm_bus_unlock(&nvdimm_bus->dev);
+}
+
+static int nvdimm_bus_probe(struct device *dev)
+{
+ struct nd_device_driver *nd_drv = to_nd_device_driver(dev->driver);
+ struct module *provider = to_bus_provider(dev);
+ struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
+ int rc;
+
+ if (!try_module_get(provider))
+ return -ENXIO;
+
+ nvdimm_bus_probe_start(nvdimm_bus);
+ rc = nd_drv->probe(dev);
+ if (rc == 0)
+ nd_region_probe_success(nvdimm_bus, dev);
+ else
+ nd_region_disable(nvdimm_bus, dev);
+ nvdimm_bus_probe_end(nvdimm_bus);
+
+ dev_dbg(&nvdimm_bus->dev, "%s.probe(%s) = %d\n", dev->driver->name,
+ dev_name(dev), rc);
+
+ if (rc != 0)
+ module_put(provider);
+ return rc;
+}
+
+static int nvdimm_bus_remove(struct device *dev)
+{
+ struct nd_device_driver *nd_drv = to_nd_device_driver(dev->driver);
+ struct module *provider = to_bus_provider(dev);
+ struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
+ int rc;
+
+ rc = nd_drv->remove(dev);
+ nd_region_disable(nvdimm_bus, dev);
+
+ dev_dbg(&nvdimm_bus->dev, "%s.remove(%s) = %d\n", dev->driver->name,
+ dev_name(dev), rc);
+ module_put(provider);
+ return rc;
+}
+
+static struct bus_type nvdimm_bus_type = {
+ .name = "nd",
+ .uevent = nvdimm_bus_uevent,
+ .match = nvdimm_bus_match,
+ .probe = nvdimm_bus_probe,
+ .remove = nvdimm_bus_remove,
+};
+
+static ASYNC_DOMAIN_EXCLUSIVE(nd_async_domain);
+
+void nd_synchronize(void)
+{
+ async_synchronize_full_domain(&nd_async_domain);
+}
+EXPORT_SYMBOL_GPL(nd_synchronize);
+
+static void nd_async_device_register(void *d, async_cookie_t cookie)
+{
+ struct device *dev = d;
+
+ if (device_add(dev) != 0) {
+ dev_err(dev, "%s: failed\n", __func__);
+ put_device(dev);
+ }
+ put_device(dev);
+}
+
+static void nd_async_device_unregister(void *d, async_cookie_t cookie)
+{
+ struct device *dev = d;
+
+ /* flush bus operations before delete */
+ nvdimm_bus_lock(dev);
+ nvdimm_bus_unlock(dev);
+
+ device_unregister(dev);
+ put_device(dev);
+}
+
+void __nd_device_register(struct device *dev)
+{
+ dev->bus = &nvdimm_bus_type;
+ get_device(dev);
+ async_schedule_domain(nd_async_device_register, dev,
+ &nd_async_domain);
+}
+
+void nd_device_register(struct device *dev)
+{
+ device_initialize(dev);
+ __nd_device_register(dev);
+}
+EXPORT_SYMBOL(nd_device_register);
+
+void nd_device_unregister(struct device *dev, enum nd_async_mode mode)
+{
+ switch (mode) {
+ case ND_ASYNC:
+ get_device(dev);
+ async_schedule_domain(nd_async_device_unregister, dev,
+ &nd_async_domain);
+ break;
+ case ND_SYNC:
+ nd_synchronize();
+ device_unregister(dev);
+ break;
+ }
+}
+EXPORT_SYMBOL(nd_device_unregister);
+
+/**
+ * __nd_driver_register() - register a region or a namespace driver
+ * @nd_drv: driver to register
+ * @owner: automatically set by nd_driver_register() macro
+ * @mod_name: automatically set by nd_driver_register() macro
+ */
+int __nd_driver_register(struct nd_device_driver *nd_drv, struct module *owner,
+ const char *mod_name)
+{
+ struct device_driver *drv = &nd_drv->drv;
+
+ if (!nd_drv->type) {
+ pr_debug("driver type bitmask not set (%pf)\n",
+ __builtin_return_address(0));
+ return -EINVAL;
+ }
+
+ if (!nd_drv->probe || !nd_drv->remove) {
+ pr_debug("->probe() and ->remove() must be specified\n");
+ return -EINVAL;
+ }
+
+ drv->bus = &nvdimm_bus_type;
+ drv->owner = owner;
+ drv->mod_name = mod_name;
+
+ return driver_register(drv);
+}
+EXPORT_SYMBOL(__nd_driver_register);
+
+int nvdimm_revalidate_disk(struct gendisk *disk)
+{
+ struct device *dev = disk->driverfs_dev;
+ struct nd_region *nd_region = to_nd_region(dev->parent);
+ const char *pol = nd_region->ro ? "only" : "write";
+
+ if (nd_region->ro == get_disk_ro(disk))
+ return 0;
+
+ dev_info(dev, "%s read-%s, marking %s read-%s\n",
+ dev_name(&nd_region->dev), pol, disk->disk_name, pol);
+ set_disk_ro(disk, nd_region->ro);
+
+ return 0;
+}
+EXPORT_SYMBOL(nvdimm_revalidate_disk);
+
+static ssize_t modalias_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, ND_DEVICE_MODALIAS_FMT "\n",
+ to_nd_device_type(dev));
+}
+static DEVICE_ATTR_RO(modalias);
+
+static ssize_t devtype_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%s\n", dev->type->name);
+}
+static DEVICE_ATTR_RO(devtype);
+
+static struct attribute *nd_device_attributes[] = {
+ &dev_attr_modalias.attr,
+ &dev_attr_devtype.attr,
+ NULL,
+};
+
+/**
+ * nd_device_attribute_group - generic attributes for all devices on an nd bus
+ */
+struct attribute_group nd_device_attribute_group = {
+ .attrs = nd_device_attributes,
+};
+EXPORT_SYMBOL_GPL(nd_device_attribute_group);
+
+static ssize_t numa_node_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%d\n", dev_to_node(dev));
+}
+static DEVICE_ATTR_RO(numa_node);
+
+static struct attribute *nd_numa_attributes[] = {
+ &dev_attr_numa_node.attr,
+ NULL,
+};
+
+static umode_t nd_numa_attr_visible(struct kobject *kobj, struct attribute *a,
+ int n)
+{
+ if (!IS_ENABLED(CONFIG_NUMA))
+ return 0;
+
+ return a->mode;
+}
+
+/**
+ * nd_numa_attribute_group - NUMA attributes for all devices on an nd bus
+ */
+struct attribute_group nd_numa_attribute_group = {
+ .attrs = nd_numa_attributes,
+ .is_visible = nd_numa_attr_visible,
+};
+EXPORT_SYMBOL_GPL(nd_numa_attribute_group);
+
+int nvdimm_bus_create_ndctl(struct nvdimm_bus *nvdimm_bus)
+{
+ dev_t devt = MKDEV(nvdimm_bus_major, nvdimm_bus->id);
+ struct device *dev;
+
+ dev = device_create(nd_class, &nvdimm_bus->dev, devt, nvdimm_bus,
+ "ndctl%d", nvdimm_bus->id);
+
+ if (IS_ERR(dev)) {
+ dev_dbg(&nvdimm_bus->dev, "failed to register ndctl%d: %ld\n",
+ nvdimm_bus->id, PTR_ERR(dev));
+ return PTR_ERR(dev);
+ }
+ return 0;
+}
+
+void nvdimm_bus_destroy_ndctl(struct nvdimm_bus *nvdimm_bus)
+{
+ device_destroy(nd_class, MKDEV(nvdimm_bus_major, nvdimm_bus->id));
+}
+
+static const struct nd_cmd_desc __nd_cmd_dimm_descs[] = {
+ [ND_CMD_IMPLEMENTED] = { },
+ [ND_CMD_SMART] = {
+ .out_num = 2,
+ .out_sizes = { 4, 8, },
+ },
+ [ND_CMD_SMART_THRESHOLD] = {
+ .out_num = 2,
+ .out_sizes = { 4, 8, },
+ },
+ [ND_CMD_DIMM_FLAGS] = {
+ .out_num = 2,
+ .out_sizes = { 4, 4 },
+ },
+ [ND_CMD_GET_CONFIG_SIZE] = {
+ .out_num = 3,
+ .out_sizes = { 4, 4, 4, },
+ },
+ [ND_CMD_GET_CONFIG_DATA] = {
+ .in_num = 2,
+ .in_sizes = { 4, 4, },
+ .out_num = 2,
+ .out_sizes = { 4, UINT_MAX, },
+ },
+ [ND_CMD_SET_CONFIG_DATA] = {
+ .in_num = 3,
+ .in_sizes = { 4, 4, UINT_MAX, },
+ .out_num = 1,
+ .out_sizes = { 4, },
+ },
+ [ND_CMD_VENDOR] = {
+ .in_num = 3,
+ .in_sizes = { 4, 4, UINT_MAX, },
+ .out_num = 3,
+ .out_sizes = { 4, 4, UINT_MAX, },
+ },
+};
+
+const struct nd_cmd_desc *nd_cmd_dimm_desc(int cmd)
+{
+ if (cmd < ARRAY_SIZE(__nd_cmd_dimm_descs))
+ return &__nd_cmd_dimm_descs[cmd];
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(nd_cmd_dimm_desc);
+
+static const struct nd_cmd_desc __nd_cmd_bus_descs[] = {
+ [ND_CMD_IMPLEMENTED] = { },
+ [ND_CMD_ARS_CAP] = {
+ .in_num = 2,
+ .in_sizes = { 8, 8, },
+ .out_num = 2,
+ .out_sizes = { 4, 4, },
+ },
+ [ND_CMD_ARS_START] = {
+ .in_num = 4,
+ .in_sizes = { 8, 8, 2, 6, },
+ .out_num = 1,
+ .out_sizes = { 4, },
+ },
+ [ND_CMD_ARS_STATUS] = {
+ .out_num = 2,
+ .out_sizes = { 4, UINT_MAX, },
+ },
+};
+
+const struct nd_cmd_desc *nd_cmd_bus_desc(int cmd)
+{
+ if (cmd < ARRAY_SIZE(__nd_cmd_bus_descs))
+ return &__nd_cmd_bus_descs[cmd];
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(nd_cmd_bus_desc);
+
+u32 nd_cmd_in_size(struct nvdimm *nvdimm, int cmd,
+ const struct nd_cmd_desc *desc, int idx, void *buf)
+{
+ if (idx >= desc->in_num)
+ return UINT_MAX;
+
+ if (desc->in_sizes[idx] < UINT_MAX)
+ return desc->in_sizes[idx];
+
+ if (nvdimm && cmd == ND_CMD_SET_CONFIG_DATA && idx == 2) {
+ struct nd_cmd_set_config_hdr *hdr = buf;
+
+ return hdr->in_length;
+ } else if (nvdimm && cmd == ND_CMD_VENDOR && idx == 2) {
+ struct nd_cmd_vendor_hdr *hdr = buf;
+
+ return hdr->in_length;
+ }
+
+ return UINT_MAX;
+}
+EXPORT_SYMBOL_GPL(nd_cmd_in_size);
+
+u32 nd_cmd_out_size(struct nvdimm *nvdimm, int cmd,
+ const struct nd_cmd_desc *desc, int idx, const u32 *in_field,
+ const u32 *out_field)
+{
+ if (idx >= desc->out_num)
+ return UINT_MAX;
+
+ if (desc->out_sizes[idx] < UINT_MAX)
+ return desc->out_sizes[idx];
+
+ if (nvdimm && cmd == ND_CMD_GET_CONFIG_DATA && idx == 1)
+ return in_field[1];
+ else if (nvdimm && cmd == ND_CMD_VENDOR && idx == 2)
+ return out_field[1];
+ else if (!nvdimm && cmd == ND_CMD_ARS_STATUS && idx == 1)
+ return ND_CMD_ARS_STATUS_MAX;
+
+ return UINT_MAX;
+}
+EXPORT_SYMBOL_GPL(nd_cmd_out_size);
+
+void wait_nvdimm_bus_probe_idle(struct device *dev)
+{
+ struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
+
+ do {
+ if (nvdimm_bus->probe_active == 0)
+ break;
+ nvdimm_bus_unlock(&nvdimm_bus->dev);
+ wait_event(nvdimm_bus->probe_wait,
+ nvdimm_bus->probe_active == 0);
+ nvdimm_bus_lock(&nvdimm_bus->dev);
+ } while (true);
+}
+
+/* set_config requires an idle interleave set */
+static int nd_cmd_clear_to_send(struct nvdimm *nvdimm, unsigned int cmd)
+{
+ struct nvdimm_bus *nvdimm_bus;
+
+ if (!nvdimm || cmd != ND_CMD_SET_CONFIG_DATA)
+ return 0;
+
+ nvdimm_bus = walk_to_nvdimm_bus(&nvdimm->dev);
+ wait_nvdimm_bus_probe_idle(&nvdimm_bus->dev);
+
+ if (atomic_read(&nvdimm->busy))
+ return -EBUSY;
+ return 0;
+}
+
+static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
+ int read_only, unsigned int ioctl_cmd, unsigned long arg)
+{
+ struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc;
+ size_t buf_len = 0, in_len = 0, out_len = 0;
+ static char out_env[ND_CMD_MAX_ENVELOPE];
+ static char in_env[ND_CMD_MAX_ENVELOPE];
+ const struct nd_cmd_desc *desc = NULL;
+ unsigned int cmd = _IOC_NR(ioctl_cmd);
+ void __user *p = (void __user *) arg;
+ struct device *dev = &nvdimm_bus->dev;
+ const char *cmd_name, *dimm_name;
+ unsigned long dsm_mask;
+ void *buf;
+ int rc, i;
+
+ if (nvdimm) {
+ desc = nd_cmd_dimm_desc(cmd);
+ cmd_name = nvdimm_cmd_name(cmd);
+ dsm_mask = nvdimm->dsm_mask ? *(nvdimm->dsm_mask) : 0;
+ dimm_name = dev_name(&nvdimm->dev);
+ } else {
+ desc = nd_cmd_bus_desc(cmd);
+ cmd_name = nvdimm_bus_cmd_name(cmd);
+ dsm_mask = nd_desc->dsm_mask;
+ dimm_name = "bus";
+ }
+
+ if (!desc || (desc->out_num + desc->in_num == 0) ||
+ !test_bit(cmd, &dsm_mask))
+ return -ENOTTY;
+
+ /* fail write commands (when read-only) */
+ if (read_only)
+ switch (ioctl_cmd) {
+ case ND_IOCTL_VENDOR:
+ case ND_IOCTL_SET_CONFIG_DATA:
+ case ND_IOCTL_ARS_START:
+ dev_dbg(&nvdimm_bus->dev, "'%s' command while read-only.\n",
+ nvdimm ? nvdimm_cmd_name(cmd)
+ : nvdimm_bus_cmd_name(cmd));
+ return -EPERM;
+ default:
+ break;
+ }
+
+ /* process an input envelope */
+ for (i = 0; i < desc->in_num; i++) {
+ u32 in_size, copy;
+
+ in_size = nd_cmd_in_size(nvdimm, cmd, desc, i, in_env);
+ if (in_size == UINT_MAX) {
+ dev_err(dev, "%s:%s unknown input size cmd: %s field: %d\n",
+ __func__, dimm_name, cmd_name, i);
+ return -ENXIO;
+ }
+ if (!access_ok(VERIFY_READ, p + in_len, in_size))
+ return -EFAULT;
+ if (in_len < sizeof(in_env))
+ copy = min_t(u32, sizeof(in_env) - in_len, in_size);
+ else
+ copy = 0;
+ if (copy && copy_from_user(&in_env[in_len], p + in_len, copy))
+ return -EFAULT;
+ in_len += in_size;
+ }
+
+ /* process an output envelope */
+ for (i = 0; i < desc->out_num; i++) {
+ u32 out_size = nd_cmd_out_size(nvdimm, cmd, desc, i,
+ (u32 *) in_env, (u32 *) out_env);
+ u32 copy;
+
+ if (out_size == UINT_MAX) {
+ dev_dbg(dev, "%s:%s unknown output size cmd: %s field: %d\n",
+ __func__, dimm_name, cmd_name, i);
+ return -EFAULT;
+ }
+ if (!access_ok(VERIFY_WRITE, p + in_len + out_len, out_size))
+ return -EFAULT;
+ if (out_len < sizeof(out_env))
+ copy = min_t(u32, sizeof(out_env) - out_len, out_size);
+ else
+ copy = 0;
+ if (copy && copy_from_user(&out_env[out_len],
+ p + in_len + out_len, copy))
+ return -EFAULT;
+ out_len += out_size;
+ }
+
+ buf_len = out_len + in_len;
+ if (!access_ok(VERIFY_WRITE, p, sizeof(buf_len)))
+ return -EFAULT;
+
+ if (buf_len > ND_IOCTL_MAX_BUFLEN) {
+ dev_dbg(dev, "%s:%s cmd: %s buf_len: %zu > %d\n", __func__,
+ dimm_name, cmd_name, buf_len,
+ ND_IOCTL_MAX_BUFLEN);
+ return -EINVAL;
+ }
+
+ buf = vmalloc(buf_len);
+ if (!buf)
+ return -ENOMEM;
+
+ if (copy_from_user(buf, p, buf_len)) {
+ rc = -EFAULT;
+ goto out;
+ }
+
+ nvdimm_bus_lock(&nvdimm_bus->dev);
+ rc = nd_cmd_clear_to_send(nvdimm, cmd);
+ if (rc)
+ goto out_unlock;
+
+ rc = nd_desc->ndctl(nd_desc, nvdimm, cmd, buf, buf_len);
+ if (rc < 0)
+ goto out_unlock;
+ if (copy_to_user(p, buf, buf_len))
+ rc = -EFAULT;
+ out_unlock:
+ nvdimm_bus_unlock(&nvdimm_bus->dev);
+ out:
+ vfree(buf);
+ return rc;
+}
+
+static long nd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ long id = (long) file->private_data;
+ int rc = -ENXIO, read_only;
+ struct nvdimm_bus *nvdimm_bus;
+
+ read_only = (O_RDWR != (file->f_flags & O_ACCMODE));
+ mutex_lock(&nvdimm_bus_list_mutex);
+ list_for_each_entry(nvdimm_bus, &nvdimm_bus_list, list) {
+ if (nvdimm_bus->id == id) {
+ rc = __nd_ioctl(nvdimm_bus, NULL, read_only, cmd, arg);
+ break;
+ }
+ }
+ mutex_unlock(&nvdimm_bus_list_mutex);
+
+ return rc;
+}
+
+static int match_dimm(struct device *dev, void *data)
+{
+ long id = (long) data;
+
+ if (is_nvdimm(dev)) {
+ struct nvdimm *nvdimm = to_nvdimm(dev);
+
+ return nvdimm->id == id;
+ }
+
+ return 0;
+}
+
+static long nvdimm_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ int rc = -ENXIO, read_only;
+ struct nvdimm_bus *nvdimm_bus;
+
+ read_only = (O_RDWR != (file->f_flags & O_ACCMODE));
+ mutex_lock(&nvdimm_bus_list_mutex);
+ list_for_each_entry(nvdimm_bus, &nvdimm_bus_list, list) {
+ struct device *dev = device_find_child(&nvdimm_bus->dev,
+ file->private_data, match_dimm);
+ struct nvdimm *nvdimm;
+
+ if (!dev)
+ continue;
+
+ nvdimm = to_nvdimm(dev);
+ rc = __nd_ioctl(nvdimm_bus, nvdimm, read_only, cmd, arg);
+ put_device(dev);
+ break;
+ }
+ mutex_unlock(&nvdimm_bus_list_mutex);
+
+ return rc;
+}
+
+static int nd_open(struct inode *inode, struct file *file)
+{
+ long minor = iminor(inode);
+
+ file->private_data = (void *) minor;
+ return 0;
+}
+
+static const struct file_operations nvdimm_bus_fops = {
+ .owner = THIS_MODULE,
+ .open = nd_open,
+ .unlocked_ioctl = nd_ioctl,
+ .compat_ioctl = nd_ioctl,
+ .llseek = noop_llseek,
+};
+
+static const struct file_operations nvdimm_fops = {
+ .owner = THIS_MODULE,
+ .open = nd_open,
+ .unlocked_ioctl = nvdimm_ioctl,
+ .compat_ioctl = nvdimm_ioctl,
+ .llseek = noop_llseek,
+};
+
+int __init nvdimm_bus_init(void)
+{
+ int rc;
+
+ rc = bus_register(&nvdimm_bus_type);
+ if (rc)
+ return rc;
+
+ rc = register_chrdev(0, "ndctl", &nvdimm_bus_fops);
+ if (rc < 0)
+ goto err_bus_chrdev;
+ nvdimm_bus_major = rc;
+
+ rc = register_chrdev(0, "dimmctl", &nvdimm_fops);
+ if (rc < 0)
+ goto err_dimm_chrdev;
+ nvdimm_major = rc;
+
+ nd_class = class_create(THIS_MODULE, "nd");
+ if (IS_ERR(nd_class)) {
+ rc = PTR_ERR(nd_class);
+ goto err_class;
+ }
+
+ return 0;
+
+ err_class:
+ unregister_chrdev(nvdimm_major, "dimmctl");
+ err_dimm_chrdev:
+ unregister_chrdev(nvdimm_bus_major, "ndctl");
+ err_bus_chrdev:
+ bus_unregister(&nvdimm_bus_type);
+
+ return rc;
+}
+
+void nvdimm_bus_exit(void)
+{
+ class_destroy(nd_class);
+ unregister_chrdev(nvdimm_bus_major, "ndctl");
+ unregister_chrdev(nvdimm_major, "dimmctl");
+ bus_unregister(&nvdimm_bus_type);
+}
diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c
new file mode 100644
index 000000000000..cb62ec6a12d0
--- /dev/null
+++ b/drivers/nvdimm/core.c
@@ -0,0 +1,465 @@
+/*
+ * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/libnvdimm.h>
+#include <linux/export.h>
+#include <linux/module.h>
+#include <linux/blkdev.h>
+#include <linux/device.h>
+#include <linux/ctype.h>
+#include <linux/ndctl.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include "nd-core.h"
+#include "nd.h"
+
+LIST_HEAD(nvdimm_bus_list);
+DEFINE_MUTEX(nvdimm_bus_list_mutex);
+static DEFINE_IDA(nd_ida);
+
+void nvdimm_bus_lock(struct device *dev)
+{
+ struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
+
+ if (!nvdimm_bus)
+ return;
+ mutex_lock(&nvdimm_bus->reconfig_mutex);
+}
+EXPORT_SYMBOL(nvdimm_bus_lock);
+
+void nvdimm_bus_unlock(struct device *dev)
+{
+ struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
+
+ if (!nvdimm_bus)
+ return;
+ mutex_unlock(&nvdimm_bus->reconfig_mutex);
+}
+EXPORT_SYMBOL(nvdimm_bus_unlock);
+
+bool is_nvdimm_bus_locked(struct device *dev)
+{
+ struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
+
+ if (!nvdimm_bus)
+ return false;
+ return mutex_is_locked(&nvdimm_bus->reconfig_mutex);
+}
+EXPORT_SYMBOL(is_nvdimm_bus_locked);
+
+u64 nd_fletcher64(void *addr, size_t len, bool le)
+{
+ u32 *buf = addr;
+ u32 lo32 = 0;
+ u64 hi32 = 0;
+ int i;
+
+ for (i = 0; i < len / sizeof(u32); i++) {
+ lo32 += le ? le32_to_cpu((__le32) buf[i]) : buf[i];
+ hi32 += lo32;
+ }
+
+ return hi32 << 32 | lo32;
+}
+EXPORT_SYMBOL_GPL(nd_fletcher64);
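+
+/*
+ * The fold keeps two running sums: lo32 wraps naturally at 32 bits
+ * while hi32 accumulates every intermediate lo32 value. For a two-word
+ * buffer { 1, 2 } this gives lo32 = 3 and hi32 = 1 + 3 = 4, i.e. a
+ * checksum of (4ULL << 32) | 3.
+ */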
+
+static void nvdimm_bus_release(struct device *dev)
+{
+ struct nvdimm_bus *nvdimm_bus;
+
+ nvdimm_bus = container_of(dev, struct nvdimm_bus, dev);
+ ida_simple_remove(&nd_ida, nvdimm_bus->id);
+ kfree(nvdimm_bus);
+}
+
+struct nvdimm_bus *to_nvdimm_bus(struct device *dev)
+{
+ struct nvdimm_bus *nvdimm_bus;
+
+ nvdimm_bus = container_of(dev, struct nvdimm_bus, dev);
+ WARN_ON(nvdimm_bus->dev.release != nvdimm_bus_release);
+ return nvdimm_bus;
+}
+EXPORT_SYMBOL_GPL(to_nvdimm_bus);
+
+struct nvdimm_bus_descriptor *to_nd_desc(struct nvdimm_bus *nvdimm_bus)
+{
+ /* struct nvdimm_bus definition is private to libnvdimm */
+ return nvdimm_bus->nd_desc;
+}
+EXPORT_SYMBOL_GPL(to_nd_desc);
+
+struct nvdimm_bus *walk_to_nvdimm_bus(struct device *nd_dev)
+{
+ struct device *dev;
+
+ for (dev = nd_dev; dev; dev = dev->parent)
+ if (dev->release == nvdimm_bus_release)
+ break;
+ dev_WARN_ONCE(nd_dev, !dev, "invalid dev, not on nd bus\n");
+ if (dev)
+ return to_nvdimm_bus(dev);
+ return NULL;
+}
+
+static bool is_uuid_sep(char sep)
+{
+ if (sep == '\n' || sep == '-' || sep == ':' || sep == '\0')
+ return true;
+ return false;
+}
+
+static int nd_uuid_parse(struct device *dev, u8 *uuid_out, const char *buf,
+ size_t len)
+{
+ const char *str = buf;
+ u8 uuid[16];
+ int i;
+
+ for (i = 0; i < 16; i++) {
+ if (!isxdigit(str[0]) || !isxdigit(str[1])) {
+ dev_dbg(dev, "%s: pos: %d buf[%zd]: %c buf[%zd]: %c\n",
+ __func__, i, str - buf, str[0],
+ str + 1 - buf, str[1]);
+ return -EINVAL;
+ }
+
+ uuid[i] = (hex_to_bin(str[0]) << 4) | hex_to_bin(str[1]);
+ str += 2;
+ if (is_uuid_sep(*str))
+ str++;
+ }
+
+ memcpy(uuid_out, uuid, sizeof(uuid));
+ return 0;
+}
+
+/**
+ * nd_uuid_store: common implementation for writing 'uuid' sysfs attributes
+ * @dev: container device for the uuid property
+ * @uuid_out: uuid buffer to replace
+ * @buf: raw sysfs buffer to parse
+ *
+ * Enforce that uuids can only be changed while the device is disabled
+ * (driver detached)
+ * LOCKING: expects device_lock() is held on entry
+ */
+int nd_uuid_store(struct device *dev, u8 **uuid_out, const char *buf,
+ size_t len)
+{
+ u8 uuid[16];
+ int rc;
+
+ if (dev->driver)
+ return -EBUSY;
+
+ rc = nd_uuid_parse(dev, uuid, buf, len);
+ if (rc)
+ return rc;
+
+ kfree(*uuid_out);
+ *uuid_out = kmemdup(uuid, sizeof(uuid), GFP_KERNEL);
+ if (!(*uuid_out))
+ return -ENOMEM;
+
+ return 0;
+}
+
+ssize_t nd_sector_size_show(unsigned long current_lbasize,
+ const unsigned long *supported, char *buf)
+{
+ ssize_t len = 0;
+ int i;
+
+ for (i = 0; supported[i]; i++)
+ if (current_lbasize == supported[i])
+ len += sprintf(buf + len, "[%ld] ", supported[i]);
+ else
+ len += sprintf(buf + len, "%ld ", supported[i]);
+ len += sprintf(buf + len, "\n");
+ return len;
+}
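+
+/*
+ * Example output for supported sizes { 512, 4096 } with a current
+ * lbasize of 4096: "512 [4096] " followed by a newline -- the active
+ * size is bracketed.
+ */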
+
+ssize_t nd_sector_size_store(struct device *dev, const char *buf,
+ unsigned long *current_lbasize, const unsigned long *supported)
+{
+ unsigned long lbasize;
+ int rc, i;
+
+ if (dev->driver)
+ return -EBUSY;
+
+ rc = kstrtoul(buf, 0, &lbasize);
+ if (rc)
+ return rc;
+
+ for (i = 0; supported[i]; i++)
+ if (lbasize == supported[i])
+ break;
+
+ if (supported[i]) {
+ *current_lbasize = lbasize;
+ return 0;
+ } else {
+ return -EINVAL;
+ }
+}
+
+void __nd_iostat_start(struct bio *bio, unsigned long *start)
+{
+ struct gendisk *disk = bio->bi_bdev->bd_disk;
+ const int rw = bio_data_dir(bio);
+ int cpu = part_stat_lock();
+
+ *start = jiffies;
+ part_round_stats(cpu, &disk->part0);
+ part_stat_inc(cpu, &disk->part0, ios[rw]);
+ part_stat_add(cpu, &disk->part0, sectors[rw], bio_sectors(bio));
+ part_inc_in_flight(&disk->part0, rw);
+ part_stat_unlock();
+}
+EXPORT_SYMBOL(__nd_iostat_start);
+
+void nd_iostat_end(struct bio *bio, unsigned long start)
+{
+ struct gendisk *disk = bio->bi_bdev->bd_disk;
+ unsigned long duration = jiffies - start;
+ const int rw = bio_data_dir(bio);
+ int cpu = part_stat_lock();
+
+ part_stat_add(cpu, &disk->part0, ticks[rw], duration);
+ part_round_stats(cpu, &disk->part0);
+ part_dec_in_flight(&disk->part0, rw);
+ part_stat_unlock();
+}
+EXPORT_SYMBOL(nd_iostat_end);
+
+static ssize_t commands_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ int cmd, len = 0;
+ struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev);
+ struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc;
+
+ for_each_set_bit(cmd, &nd_desc->dsm_mask, BITS_PER_LONG)
+ len += sprintf(buf + len, "%s ", nvdimm_bus_cmd_name(cmd));
+ len += sprintf(buf + len, "\n");
+ return len;
+}
+static DEVICE_ATTR_RO(commands);
+
+static const char *nvdimm_bus_provider(struct nvdimm_bus *nvdimm_bus)
+{
+ struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc;
+ struct device *parent = nvdimm_bus->dev.parent;
+
+ if (nd_desc->provider_name)
+ return nd_desc->provider_name;
+ else if (parent)
+ return dev_name(parent);
+ else
+ return "unknown";
+}
+
+static ssize_t provider_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev);
+
+ return sprintf(buf, "%s\n", nvdimm_bus_provider(nvdimm_bus));
+}
+static DEVICE_ATTR_RO(provider);
+
+static int flush_namespaces(struct device *dev, void *data)
+{
+ device_lock(dev);
+ device_unlock(dev);
+ return 0;
+}
+
+static int flush_regions_dimms(struct device *dev, void *data)
+{
+ device_lock(dev);
+ device_unlock(dev);
+ device_for_each_child(dev, NULL, flush_namespaces);
+ return 0;
+}
+
+static ssize_t wait_probe_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ nd_synchronize();
+ device_for_each_child(dev, NULL, flush_regions_dimms);
+ return sprintf(buf, "1\n");
+}
+static DEVICE_ATTR_RO(wait_probe);
+
+static struct attribute *nvdimm_bus_attributes[] = {
+ &dev_attr_commands.attr,
+ &dev_attr_wait_probe.attr,
+ &dev_attr_provider.attr,
+ NULL,
+};
+
+struct attribute_group nvdimm_bus_attribute_group = {
+ .attrs = nvdimm_bus_attributes,
+};
+EXPORT_SYMBOL_GPL(nvdimm_bus_attribute_group);
+
+struct nvdimm_bus *__nvdimm_bus_register(struct device *parent,
+ struct nvdimm_bus_descriptor *nd_desc, struct module *module)
+{
+ struct nvdimm_bus *nvdimm_bus;
+ int rc;
+
+ nvdimm_bus = kzalloc(sizeof(*nvdimm_bus), GFP_KERNEL);
+ if (!nvdimm_bus)
+ return NULL;
+ INIT_LIST_HEAD(&nvdimm_bus->list);
+ init_waitqueue_head(&nvdimm_bus->probe_wait);
+ nvdimm_bus->id = ida_simple_get(&nd_ida, 0, 0, GFP_KERNEL);
+ mutex_init(&nvdimm_bus->reconfig_mutex);
+ if (nvdimm_bus->id < 0) {
+ kfree(nvdimm_bus);
+ return NULL;
+ }
+ nvdimm_bus->nd_desc = nd_desc;
+ nvdimm_bus->module = module;
+ nvdimm_bus->dev.parent = parent;
+ nvdimm_bus->dev.release = nvdimm_bus_release;
+ nvdimm_bus->dev.groups = nd_desc->attr_groups;
+ dev_set_name(&nvdimm_bus->dev, "ndbus%d", nvdimm_bus->id);
+ rc = device_register(&nvdimm_bus->dev);
+ if (rc) {
+ dev_dbg(&nvdimm_bus->dev, "registration failed: %d\n", rc);
+ goto err;
+ }
+
+ rc = nvdimm_bus_create_ndctl(nvdimm_bus);
+ if (rc)
+ goto err;
+
+ mutex_lock(&nvdimm_bus_list_mutex);
+ list_add_tail(&nvdimm_bus->list, &nvdimm_bus_list);
+ mutex_unlock(&nvdimm_bus_list_mutex);
+
+ return nvdimm_bus;
+ err:
+ put_device(&nvdimm_bus->dev);
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(__nvdimm_bus_register);
+
+static int child_unregister(struct device *dev, void *data)
+{
+ /*
+ * the singular ndctl class device per bus needs to be
+ * "device_destroy"ed, so skip it here
+ *
+ * i.e. remove classless children
+ */
+ if (dev->class)
+ /* pass */;
+ else
+ nd_device_unregister(dev, ND_SYNC);
+ return 0;
+}
+
+void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus)
+{
+ if (!nvdimm_bus)
+ return;
+
+ mutex_lock(&nvdimm_bus_list_mutex);
+ list_del_init(&nvdimm_bus->list);
+ mutex_unlock(&nvdimm_bus_list_mutex);
+
+ nd_synchronize();
+ device_for_each_child(&nvdimm_bus->dev, NULL, child_unregister);
+ nvdimm_bus_destroy_ndctl(nvdimm_bus);
+
+ device_unregister(&nvdimm_bus->dev);
+}
+EXPORT_SYMBOL_GPL(nvdimm_bus_unregister);
+
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+static int nd_pi_nop_generate_verify(struct blk_integrity_iter *iter)
+{
+ return 0;
+}
+
+int nd_integrity_init(struct gendisk *disk, unsigned long meta_size)
+{
+ struct blk_integrity integrity = {
+ .name = "ND-PI-NOP",
+ .generate_fn = nd_pi_nop_generate_verify,
+ .verify_fn = nd_pi_nop_generate_verify,
+ .tuple_size = meta_size,
+ .tag_size = meta_size,
+ };
+ int ret;
+
+ if (meta_size == 0)
+ return 0;
+
+ ret = blk_integrity_register(disk, &integrity);
+ if (ret)
+ return ret;
+
+ blk_queue_max_integrity_segments(disk->queue, 1);
+
+ return 0;
+}
+EXPORT_SYMBOL(nd_integrity_init);
+
+#else /* CONFIG_BLK_DEV_INTEGRITY */
+int nd_integrity_init(struct gendisk *disk, unsigned long meta_size)
+{
+ return 0;
+}
+EXPORT_SYMBOL(nd_integrity_init);
+
+#endif
+
+static __init int libnvdimm_init(void)
+{
+ int rc;
+
+ rc = nvdimm_bus_init();
+ if (rc)
+ return rc;
+ rc = nvdimm_init();
+ if (rc)
+ goto err_dimm;
+ rc = nd_region_init();
+ if (rc)
+ goto err_region;
+ return 0;
+ err_region:
+ nvdimm_exit();
+ err_dimm:
+ nvdimm_bus_exit();
+ return rc;
+}
+
+static __exit void libnvdimm_exit(void)
+{
+ WARN_ON(!list_empty(&nvdimm_bus_list));
+ nd_region_exit();
+ nvdimm_exit();
+ nvdimm_bus_exit();
+}
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Intel Corporation");
+subsys_initcall(libnvdimm_init);
+module_exit(libnvdimm_exit);
diff --git a/drivers/nvdimm/dimm.c b/drivers/nvdimm/dimm.c
new file mode 100644
index 000000000000..71d12bb67339
--- /dev/null
+++ b/drivers/nvdimm/dimm.c
@@ -0,0 +1,102 @@
+/*
+ * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/vmalloc.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/sizes.h>
+#include <linux/ndctl.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/nd.h>
+#include "label.h"
+#include "nd.h"
+
+static int nvdimm_probe(struct device *dev)
+{
+ struct nvdimm_drvdata *ndd;
+ int rc;
+
+ ndd = kzalloc(sizeof(*ndd), GFP_KERNEL);
+ if (!ndd)
+ return -ENOMEM;
+
+ dev_set_drvdata(dev, ndd);
+ ndd->dpa.name = dev_name(dev);
+ ndd->ns_current = -1;
+ ndd->ns_next = -1;
+ ndd->dpa.start = 0;
+ ndd->dpa.end = -1;
+ ndd->dev = dev;
+ get_device(dev);
+ kref_init(&ndd->kref);
+
+ rc = nvdimm_init_nsarea(ndd);
+ if (rc)
+ goto err;
+
+ rc = nvdimm_init_config_data(ndd);
+ if (rc)
+ goto err;
+
+ dev_dbg(dev, "config data size: %d\n", ndd->nsarea.config_size);
+
+ nvdimm_bus_lock(dev);
+ ndd->ns_current = nd_label_validate(ndd);
+ ndd->ns_next = nd_label_next_nsindex(ndd->ns_current);
+ nd_label_copy(ndd, to_next_namespace_index(ndd),
+ to_current_namespace_index(ndd));
+ rc = nd_label_reserve_dpa(ndd);
+ nvdimm_bus_unlock(dev);
+
+ if (rc)
+ goto err;
+
+ return 0;
+
+ err:
+ put_ndd(ndd);
+ return rc;
+}
+
+static int nvdimm_remove(struct device *dev)
+{
+ struct nvdimm_drvdata *ndd = dev_get_drvdata(dev);
+
+ nvdimm_bus_lock(dev);
+ dev_set_drvdata(dev, NULL);
+ nvdimm_bus_unlock(dev);
+ put_ndd(ndd);
+
+ return 0;
+}
+
+static struct nd_device_driver nvdimm_driver = {
+ .probe = nvdimm_probe,
+ .remove = nvdimm_remove,
+ .drv = {
+ .name = "nvdimm",
+ },
+ .type = ND_DRIVER_DIMM,
+};
+
+int __init nvdimm_init(void)
+{
+ return nd_driver_register(&nvdimm_driver);
+}
+
+void nvdimm_exit(void)
+{
+ driver_unregister(&nvdimm_driver.drv);
+}
+
+MODULE_ALIAS_ND_DEVICE(ND_DEVICE_DIMM);
diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c
new file mode 100644
index 000000000000..c05eb807d674
--- /dev/null
+++ b/drivers/nvdimm/dimm_devs.c
@@ -0,0 +1,551 @@
+/*
+ * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/vmalloc.h>
+#include <linux/device.h>
+#include <linux/ndctl.h>
+#include <linux/slab.h>
+#include <linux/io.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include "nd-core.h"
+#include "label.h"
+#include "nd.h"
+
+static DEFINE_IDA(dimm_ida);
+
+/*
+ * Retrieve the bus and dimm handle and check whether this bus supports
+ * get_config_data commands.
+ */
+static int __validate_dimm(struct nvdimm_drvdata *ndd)
+{
+ struct nvdimm *nvdimm;
+
+ if (!ndd)
+ return -EINVAL;
+
+ nvdimm = to_nvdimm(ndd->dev);
+
+ if (!nvdimm->dsm_mask)
+ return -ENXIO;
+ if (!test_bit(ND_CMD_GET_CONFIG_DATA, nvdimm->dsm_mask))
+ return -ENXIO;
+
+ return 0;
+}
+
+static int validate_dimm(struct nvdimm_drvdata *ndd)
+{
+ int rc = __validate_dimm(ndd);
+
+ if (rc && ndd)
+ dev_dbg(ndd->dev, "%pf: %s error: %d\n",
+ __builtin_return_address(0), __func__, rc);
+ return rc;
+}
+
+/**
+ * nvdimm_init_nsarea - determine the geometry of a dimm's namespace area
+ * @ndd: dimm driver data to initialize
+ */
+int nvdimm_init_nsarea(struct nvdimm_drvdata *ndd)
+{
+ struct nd_cmd_get_config_size *cmd = &ndd->nsarea;
+ struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(ndd->dev);
+ struct nvdimm_bus_descriptor *nd_desc;
+ int rc = validate_dimm(ndd);
+
+ if (rc)
+ return rc;
+
+ if (cmd->config_size)
+ return 0; /* already valid */
+
+ memset(cmd, 0, sizeof(*cmd));
+ nd_desc = nvdimm_bus->nd_desc;
+ return nd_desc->ndctl(nd_desc, to_nvdimm(ndd->dev),
+ ND_CMD_GET_CONFIG_SIZE, cmd, sizeof(*cmd));
+}
+
+int nvdimm_init_config_data(struct nvdimm_drvdata *ndd)
+{
+ struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(ndd->dev);
+ struct nd_cmd_get_config_data_hdr *cmd;
+ struct nvdimm_bus_descriptor *nd_desc;
+ int rc = validate_dimm(ndd);
+ u32 max_cmd_size, config_size;
+ size_t offset;
+
+ if (rc)
+ return rc;
+
+ if (ndd->data)
+ return 0;
+
+ if (ndd->nsarea.status || ndd->nsarea.max_xfer == 0
+ || ndd->nsarea.config_size < ND_LABEL_MIN_SIZE) {
+ dev_dbg(ndd->dev, "failed to init config data area: (%d:%d)\n",
+ ndd->nsarea.max_xfer, ndd->nsarea.config_size);
+ return -ENXIO;
+ }
+
+ ndd->data = kmalloc(ndd->nsarea.config_size, GFP_KERNEL);
+ if (!ndd->data)
+ ndd->data = vmalloc(ndd->nsarea.config_size);
+
+ if (!ndd->data)
+ return -ENOMEM;
+
+ max_cmd_size = min_t(u32, PAGE_SIZE, ndd->nsarea.max_xfer);
+ cmd = kzalloc(max_cmd_size + sizeof(*cmd), GFP_KERNEL);
+ if (!cmd)
+ return -ENOMEM;
+
+ nd_desc = nvdimm_bus->nd_desc;
+ for (config_size = ndd->nsarea.config_size, offset = 0;
+ config_size; config_size -= cmd->in_length,
+ offset += cmd->in_length) {
+ cmd->in_length = min(config_size, max_cmd_size);
+ cmd->in_offset = offset;
+ rc = nd_desc->ndctl(nd_desc, to_nvdimm(ndd->dev),
+ ND_CMD_GET_CONFIG_DATA, cmd,
+ cmd->in_length + sizeof(*cmd));
+ if (rc || cmd->status) {
+ rc = -ENXIO;
+ break;
+ }
+ memcpy(ndd->data + offset, cmd->out_buf, cmd->in_length);
+ }
+ dev_dbg(ndd->dev, "%s: len: %zu rc: %d\n", __func__, offset, rc);
+ kfree(cmd);
+
+ return rc;
+}
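The loop above fetches the label area in pieces no larger than min(PAGE_SIZE, max_xfer). A standalone sketch of that chunking pattern, with memcpy() standing in for the ND_CMD_GET_CONFIG_DATA call and all names hypothetical:

	#include <stdio.h>
	#include <string.h>

	static void read_chunked(const char *media, char *dst, size_t size, size_t max_xfer)
	{
		size_t offset, len;

		for (offset = 0; offset < size; offset += len) {
			len = size - offset < max_xfer ? size - offset : max_xfer;
			memcpy(dst + offset, media + offset, len); /* stand-in for the bus ndctl call */
		}
	}

	int main(void)
	{
		char media[300], dst[300];

		memset(media, 0xa5, sizeof(media));
		read_chunked(media, dst, sizeof(media), 128); /* three transfers: 128 + 128 + 44 */
		printf("match: %d\n", memcmp(media, dst, sizeof(media)) == 0);
		return 0;
	}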
+
+int nvdimm_set_config_data(struct nvdimm_drvdata *ndd, size_t offset,
+ void *buf, size_t len)
+{
+ int rc = validate_dimm(ndd);
+ size_t max_cmd_size, buf_offset;
+ struct nd_cmd_set_config_hdr *cmd;
+ struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(ndd->dev);
+ struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc;
+
+ if (rc)
+ return rc;
+
+ if (!ndd->data)
+ return -ENXIO;
+
+ if (offset + len > ndd->nsarea.config_size)
+ return -ENXIO;
+
+ max_cmd_size = min_t(u32, PAGE_SIZE, len);
+ max_cmd_size = min_t(u32, max_cmd_size, ndd->nsarea.max_xfer);
+ cmd = kzalloc(max_cmd_size + sizeof(*cmd) + sizeof(u32), GFP_KERNEL);
+ if (!cmd)
+ return -ENOMEM;
+
+ for (buf_offset = 0; len; len -= cmd->in_length,
+ buf_offset += cmd->in_length) {
+ size_t cmd_size;
+ u32 *status;
+
+ cmd->in_offset = offset + buf_offset;
+ cmd->in_length = min(max_cmd_size, len);
+ memcpy(cmd->in_buf, buf + buf_offset, cmd->in_length);
+
+ /* status is output in the last 4 bytes of the command buffer */
+ cmd_size = sizeof(*cmd) + cmd->in_length + sizeof(u32);
+ status = ((void *) cmd) + cmd_size - sizeof(u32);
+
+ rc = nd_desc->ndctl(nd_desc, to_nvdimm(ndd->dev),
+ ND_CMD_SET_CONFIG_DATA, cmd, cmd_size);
+ if (rc || *status) {
+ rc = rc ? rc : -ENXIO;
+ break;
+ }
+ }
+ kfree(cmd);
+
+ return rc;
+}
+
+static void nvdimm_release(struct device *dev)
+{
+ struct nvdimm *nvdimm = to_nvdimm(dev);
+
+ ida_simple_remove(&dimm_ida, nvdimm->id);
+ kfree(nvdimm);
+}
+
+static struct device_type nvdimm_device_type = {
+ .name = "nvdimm",
+ .release = nvdimm_release,
+};
+
+bool is_nvdimm(struct device *dev)
+{
+ return dev->type == &nvdimm_device_type;
+}
+
+struct nvdimm *to_nvdimm(struct device *dev)
+{
+ struct nvdimm *nvdimm = container_of(dev, struct nvdimm, dev);
+
+ WARN_ON(!is_nvdimm(dev));
+ return nvdimm;
+}
+EXPORT_SYMBOL_GPL(to_nvdimm);
+
+struct nvdimm *nd_blk_region_to_dimm(struct nd_blk_region *ndbr)
+{
+ struct nd_region *nd_region = &ndbr->nd_region;
+ struct nd_mapping *nd_mapping = &nd_region->mapping[0];
+
+ return nd_mapping->nvdimm;
+}
+EXPORT_SYMBOL_GPL(nd_blk_region_to_dimm);
+
+struct nvdimm_drvdata *to_ndd(struct nd_mapping *nd_mapping)
+{
+ struct nvdimm *nvdimm = nd_mapping->nvdimm;
+
+ WARN_ON_ONCE(!is_nvdimm_bus_locked(&nvdimm->dev));
+
+ return dev_get_drvdata(&nvdimm->dev);
+}
+EXPORT_SYMBOL(to_ndd);
+
+void nvdimm_drvdata_release(struct kref *kref)
+{
+ struct nvdimm_drvdata *ndd = container_of(kref, typeof(*ndd), kref);
+ struct device *dev = ndd->dev;
+ struct resource *res, *_r;
+
+ dev_dbg(dev, "%s\n", __func__);
+
+ nvdimm_bus_lock(dev);
+ for_each_dpa_resource_safe(ndd, res, _r)
+ nvdimm_free_dpa(ndd, res);
+ nvdimm_bus_unlock(dev);
+
+ if (ndd->data && is_vmalloc_addr(ndd->data))
+ vfree(ndd->data);
+ else
+ kfree(ndd->data);
+ kfree(ndd);
+ put_device(dev);
+}
+
+void get_ndd(struct nvdimm_drvdata *ndd)
+{
+ kref_get(&ndd->kref);
+}
+
+void put_ndd(struct nvdimm_drvdata *ndd)
+{
+ if (ndd)
+ kref_put(&ndd->kref, nvdimm_drvdata_release);
+}
+
+const char *nvdimm_name(struct nvdimm *nvdimm)
+{
+ return dev_name(&nvdimm->dev);
+}
+EXPORT_SYMBOL_GPL(nvdimm_name);
+
+void *nvdimm_provider_data(struct nvdimm *nvdimm)
+{
+ if (nvdimm)
+ return nvdimm->provider_data;
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(nvdimm_provider_data);
+
+static ssize_t commands_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nvdimm *nvdimm = to_nvdimm(dev);
+ int cmd, len = 0;
+
+ if (!nvdimm->dsm_mask)
+ return sprintf(buf, "\n");
+
+ for_each_set_bit(cmd, nvdimm->dsm_mask, BITS_PER_LONG)
+ len += sprintf(buf + len, "%s ", nvdimm_cmd_name(cmd));
+ len += sprintf(buf + len, "\n");
+ return len;
+}
+static DEVICE_ATTR_RO(commands);
+
+static ssize_t state_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct nvdimm *nvdimm = to_nvdimm(dev);
+
+ /*
+ * The state may be in the process of changing; userspace should
+ * quiesce probing if it wants a static answer.
+ */
+ nvdimm_bus_lock(dev);
+ nvdimm_bus_unlock(dev);
+ return sprintf(buf, "%s\n", atomic_read(&nvdimm->busy)
+ ? "active" : "idle");
+}
+static DEVICE_ATTR_RO(state);
+
+static ssize_t available_slots_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nvdimm_drvdata *ndd = dev_get_drvdata(dev);
+ ssize_t rc;
+ u32 nfree;
+
+ if (!ndd)
+ return -ENXIO;
+
+ nvdimm_bus_lock(dev);
+ nfree = nd_label_nfree(ndd);
+ if (nfree - 1 > nfree) {
+ dev_WARN_ONCE(dev, 1, "we ate our last label?\n");
+ nfree = 0;
+ } else
+ nfree--;
+ rc = sprintf(buf, "%d\n", nfree);
+ nvdimm_bus_unlock(dev);
+ return rc;
+}
+static DEVICE_ATTR_RO(available_slots);
+
+static struct attribute *nvdimm_attributes[] = {
+ &dev_attr_state.attr,
+ &dev_attr_commands.attr,
+ &dev_attr_available_slots.attr,
+ NULL,
+};
+
+struct attribute_group nvdimm_attribute_group = {
+ .attrs = nvdimm_attributes,
+};
+EXPORT_SYMBOL_GPL(nvdimm_attribute_group);
+
+struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data,
+ const struct attribute_group **groups, unsigned long flags,
+ unsigned long *dsm_mask)
+{
+ struct nvdimm *nvdimm = kzalloc(sizeof(*nvdimm), GFP_KERNEL);
+ struct device *dev;
+
+ if (!nvdimm)
+ return NULL;
+
+ nvdimm->id = ida_simple_get(&dimm_ida, 0, 0, GFP_KERNEL);
+ if (nvdimm->id < 0) {
+ kfree(nvdimm);
+ return NULL;
+ }
+ nvdimm->provider_data = provider_data;
+ nvdimm->flags = flags;
+ nvdimm->dsm_mask = dsm_mask;
+ atomic_set(&nvdimm->busy, 0);
+ dev = &nvdimm->dev;
+ dev_set_name(dev, "nmem%d", nvdimm->id);
+ dev->parent = &nvdimm_bus->dev;
+ dev->type = &nvdimm_device_type;
+ dev->devt = MKDEV(nvdimm_major, nvdimm->id);
+ dev->groups = groups;
+ nd_device_register(dev);
+
+ return nvdimm;
+}
+EXPORT_SYMBOL_GPL(nvdimm_create);
+
+/**
+ * nd_blk_available_dpa - account for the unused dpa of a BLK region
+ * @nd_mapping: container of dpa-resource-root + labels
+ *
+ * Unlike PMEM, BLK namespaces can occupy discontiguous DPA ranges.
+ */
+resource_size_t nd_blk_available_dpa(struct nd_mapping *nd_mapping)
+{
+ struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+ resource_size_t map_end, busy = 0, available;
+ struct resource *res;
+
+ if (!ndd)
+ return 0;
+
+ map_end = nd_mapping->start + nd_mapping->size - 1;
+ for_each_dpa_resource(ndd, res)
+ if (res->start >= nd_mapping->start && res->start < map_end) {
+ resource_size_t end = min(map_end, res->end);
+
+ busy += end - res->start + 1;
+ } else if (res->end >= nd_mapping->start
+ && res->end <= map_end) {
+ busy += res->end - nd_mapping->start;
+ } else if (nd_mapping->start > res->start
+ && nd_mapping->start < res->end) {
+ /* total eclipse of the BLK region mapping */
+ busy += nd_mapping->size;
+ }
+
+ available = map_end - nd_mapping->start + 1;
+ if (busy < available)
+ return available - busy;
+ return 0;
+}
+
+/**
+ * nd_pmem_available_dpa - for the given dimm+region account unallocated dpa
+ * @nd_region: constrain available space check to this reference region
+ * @nd_mapping: container of dpa-resource-root + labels
+ * @overlap: calculate available space assuming this level of overlap
+ *
+ * Validate that a PMEM label, if present, aligns with the start of an
+ * interleave set and truncate the available size at the lowest BLK
+ * overlap point.
+ *
+ * The expectation is that this routine is called multiple times as it
+ * probes for the largest BLK encroachment for any single member DIMM of
+ * the interleave set. Once that value is determined the PMEM-limit for
+ * the set can be established.
+ */
+resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region,
+ struct nd_mapping *nd_mapping, resource_size_t *overlap)
+{
+ resource_size_t map_start, map_end, busy = 0, available, blk_start;
+ struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+ struct resource *res;
+ const char *reason;
+
+ if (!ndd)
+ return 0;
+
+ map_start = nd_mapping->start;
+ map_end = map_start + nd_mapping->size - 1;
+ blk_start = max(map_start, map_end + 1 - *overlap);
+ for_each_dpa_resource(ndd, res)
+ if (res->start >= map_start && res->start < map_end) {
+ if (strncmp(res->name, "blk", 3) == 0)
+ blk_start = min(blk_start, res->start);
+ else if (res->start != map_start) {
+ reason = "misaligned to iset";
+ goto err;
+ } else {
+ if (busy) {
+ reason = "duplicate overlapping PMEM reservations?";
+ goto err;
+ }
+ busy += resource_size(res);
+ continue;
+ }
+ } else if (res->end >= map_start && res->end <= map_end) {
+ if (strncmp(res->name, "blk", 3) == 0) {
+ /*
+ * If a BLK allocation overlaps the start of
+ * PMEM the entire interleave set may now only
+ * be used for BLK.
+ */
+ blk_start = map_start;
+ } else {
+ reason = "misaligned to iset";
+ goto err;
+ }
+ } else if (map_start > res->start && map_start < res->end) {
+ /* total eclipse of the mapping */
+ busy += nd_mapping->size;
+ blk_start = map_start;
+ }
+
+ *overlap = map_end + 1 - blk_start;
+ available = blk_start - map_start;
+ if (busy < available)
+ return available - busy;
+ return 0;
+
+ err:
+ /*
+ * Something is wrong, PMEM must align with the start of the
+ * interleave set, and there can only be one allocation per set.
+ */
+ nd_dbg_dpa(nd_region, ndd, res, "%s\n", reason);
+ return 0;
+}
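A hypothetical worked example of the accounting above: for a 1 GiB DIMM mapping whose only existing allocation is a 256 MiB "blk" range at the top of the mapping, the routine reports 256 MiB of overlap and leaves 768 MiB available for PMEM. The numbers below are illustrative only:

	#include <stdio.h>

	int main(void)
	{
		unsigned long long map_start = 0, map_size = 1ULL << 30;   /* 1 GiB mapping */
		unsigned long long map_end = map_start + map_size - 1;
		unsigned long long blk_res_start = map_start + map_size - (256ULL << 20);
		unsigned long long blk_start = blk_res_start;               /* lowest BLK encroachment */
		unsigned long long overlap = map_end + 1 - blk_start;       /* 256 MiB */
		unsigned long long available = blk_start - map_start;       /* 768 MiB left for PMEM */

		printf("overlap: %lluM available: %lluM\n", overlap >> 20, available >> 20);
		return 0;
	}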
+
+void nvdimm_free_dpa(struct nvdimm_drvdata *ndd, struct resource *res)
+{
+ WARN_ON_ONCE(!is_nvdimm_bus_locked(ndd->dev));
+ kfree(res->name);
+ __release_region(&ndd->dpa, res->start, resource_size(res));
+}
+
+struct resource *nvdimm_allocate_dpa(struct nvdimm_drvdata *ndd,
+ struct nd_label_id *label_id, resource_size_t start,
+ resource_size_t n)
+{
+ char *name = kmemdup(label_id, sizeof(*label_id), GFP_KERNEL);
+ struct resource *res;
+
+ if (!name)
+ return NULL;
+
+ WARN_ON_ONCE(!is_nvdimm_bus_locked(ndd->dev));
+ res = __request_region(&ndd->dpa, start, n, name, 0);
+ if (!res)
+ kfree(name);
+ return res;
+}
+
+/**
+ * nvdimm_allocated_dpa - sum up the dpa currently allocated to this label_id
+ * @ndd: container of dpa-resource-root + labels
+ * @label_id: dpa resource name of the form {pmem|blk}-<human readable uuid>
+ */
+resource_size_t nvdimm_allocated_dpa(struct nvdimm_drvdata *ndd,
+ struct nd_label_id *label_id)
+{
+ resource_size_t allocated = 0;
+ struct resource *res;
+
+ for_each_dpa_resource(ndd, res)
+ if (strcmp(res->name, label_id->id) == 0)
+ allocated += resource_size(res);
+
+ return allocated;
+}
+
+static int count_dimms(struct device *dev, void *c)
+{
+ int *count = c;
+
+ if (is_nvdimm(dev))
+ (*count)++;
+ return 0;
+}
+
+int nvdimm_bus_check_dimm_count(struct nvdimm_bus *nvdimm_bus, int dimm_count)
+{
+ int count = 0;
+ /* Flush any possible dimm registration failures */
+ nd_synchronize();
+
+ device_for_each_child(&nvdimm_bus->dev, &count, count_dimms);
+ dev_dbg(&nvdimm_bus->dev, "%s: count: %d\n", __func__, count);
+ if (count != dimm_count)
+ return -ENXIO;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nvdimm_bus_check_dimm_count);
diff --git a/drivers/nvdimm/label.c b/drivers/nvdimm/label.c
new file mode 100644
index 000000000000..96526dcfdd37
--- /dev/null
+++ b/drivers/nvdimm/label.c
@@ -0,0 +1,927 @@
+/*
+ * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/device.h>
+#include <linux/ndctl.h>
+#include <linux/slab.h>
+#include <linux/io.h>
+#include <linux/nd.h>
+#include "nd-core.h"
+#include "label.h"
+#include "nd.h"
+
+static u32 best_seq(u32 a, u32 b)
+{
+ a &= NSINDEX_SEQ_MASK;
+ b &= NSINDEX_SEQ_MASK;
+
+ if (a == 0 || a == b)
+ return b;
+ else if (b == 0)
+ return a;
+ else if (nd_inc_seq(a) == b)
+ return b;
+ else
+ return a;
+}
+
+size_t sizeof_namespace_index(struct nvdimm_drvdata *ndd)
+{
+ u32 index_span;
+
+ if (ndd->nsindex_size)
+ return ndd->nsindex_size;
+
+ /*
+ * The minimum index space is 512 bytes, with that amount of
+ * index we can describe ~1400 labels, which is less than a byte
+ * of overhead per label. Round up to a byte of overhead per
+ * label and determine the size of the index region. Yes, this
+ * starts to waste space at larger config_sizes, but it's
+ * unlikely we'll ever see anything but 128K.
+ */
+ index_span = ndd->nsarea.config_size / 129;
+ index_span /= NSINDEX_ALIGN * 2;
+ ndd->nsindex_size = index_span * NSINDEX_ALIGN;
+
+ return ndd->nsindex_size;
+}
+
+int nvdimm_num_label_slots(struct nvdimm_drvdata *ndd)
+{
+ return ndd->nsarea.config_size / 129;
+}
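A standalone sketch of the sizing math used by sizeof_namespace_index() and nvdimm_num_label_slots() above, for the expected 128K label area and with NSINDEX_ALIGN taken as 256 per label.h later in this patch (illustrative only):

	#include <stdio.h>

	int main(void)
	{
		unsigned int config_size = 128 * 1024;        /* the common 128K label area */
		unsigned int nslots = config_size / 129;      /* 1016: one byte of overhead per 128-byte label */
		unsigned int index_span = nslots / (256 * 2); /* split across the two index blocks */

		printf("slots: %u per-index bytes: %u\n", nslots, index_span * 256); /* 1016 and 256 */
		return 0;
	}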
+
+int nd_label_validate(struct nvdimm_drvdata *ndd)
+{
+ /*
+ * The on-media label format consists of two index blocks followed
+ * by an array of labels. None of these structures are ever
+ * updated in place. A sequence number tracks the current
+ * active index and the next one to write, while labels are
+ * written to free slots.
+ *
+ * +------------+
+ * | |
+ * | nsindex0 |
+ * | |
+ * +------------+
+ * | |
+ * | nsindex1 |
+ * | |
+ * +------------+
+ * | label0 |
+ * +------------+
+ * | label1 |
+ * +------------+
+ * | |
+ * ....nslot...
+ * | |
+ * +------------+
+ * | labelN |
+ * +------------+
+ */
+ struct nd_namespace_index *nsindex[] = {
+ to_namespace_index(ndd, 0),
+ to_namespace_index(ndd, 1),
+ };
+ const int num_index = ARRAY_SIZE(nsindex);
+ struct device *dev = ndd->dev;
+ bool valid[2] = { 0 };
+ int i, num_valid = 0;
+ u32 seq;
+
+ for (i = 0; i < num_index; i++) {
+ u32 nslot;
+ u8 sig[NSINDEX_SIG_LEN];
+ u64 sum_save, sum, size;
+
+ memcpy(sig, nsindex[i]->sig, NSINDEX_SIG_LEN);
+ if (memcmp(sig, NSINDEX_SIGNATURE, NSINDEX_SIG_LEN) != 0) {
+ dev_dbg(dev, "%s: nsindex%d signature invalid\n",
+ __func__, i);
+ continue;
+ }
+ sum_save = __le64_to_cpu(nsindex[i]->checksum);
+ nsindex[i]->checksum = __cpu_to_le64(0);
+ sum = nd_fletcher64(nsindex[i], sizeof_namespace_index(ndd), 1);
+ nsindex[i]->checksum = __cpu_to_le64(sum_save);
+ if (sum != sum_save) {
+ dev_dbg(dev, "%s: nsindex%d checksum invalid\n",
+ __func__, i);
+ continue;
+ }
+
+ seq = __le32_to_cpu(nsindex[i]->seq);
+ if ((seq & NSINDEX_SEQ_MASK) == 0) {
+ dev_dbg(dev, "%s: nsindex%d sequence: %#x invalid\n",
+ __func__, i, seq);
+ continue;
+ }
+
+ /* sanity check the index against expected values */
+ if (__le64_to_cpu(nsindex[i]->myoff)
+ != i * sizeof_namespace_index(ndd)) {
+ dev_dbg(dev, "%s: nsindex%d myoff: %#llx invalid\n",
+ __func__, i, (unsigned long long)
+ __le64_to_cpu(nsindex[i]->myoff));
+ continue;
+ }
+ if (__le64_to_cpu(nsindex[i]->otheroff)
+ != (!i) * sizeof_namespace_index(ndd)) {
+ dev_dbg(dev, "%s: nsindex%d otheroff: %#llx invalid\n",
+ __func__, i, (unsigned long long)
+ __le64_to_cpu(nsindex[i]->otheroff));
+ continue;
+ }
+
+ size = __le64_to_cpu(nsindex[i]->mysize);
+ if (size > sizeof_namespace_index(ndd)
+ || size < sizeof(struct nd_namespace_index)) {
+ dev_dbg(dev, "%s: nsindex%d mysize: %#llx invalid\n",
+ __func__, i, size);
+ continue;
+ }
+
+ nslot = __le32_to_cpu(nsindex[i]->nslot);
+ if (nslot * sizeof(struct nd_namespace_label)
+ + 2 * sizeof_namespace_index(ndd)
+ > ndd->nsarea.config_size) {
+ dev_dbg(dev, "%s: nsindex%d nslot: %u invalid, config_size: %#x\n",
+ __func__, i, nslot,
+ ndd->nsarea.config_size);
+ continue;
+ }
+ valid[i] = true;
+ num_valid++;
+ }
+
+ switch (num_valid) {
+ case 0:
+ break;
+ case 1:
+ for (i = 0; i < num_index; i++)
+ if (valid[i])
+ return i;
+ /* can't have num_valid > 0 but valid[] = { false, false } */
+ WARN_ON(1);
+ break;
+ default:
+ /* pick the best index... */
+ seq = best_seq(__le32_to_cpu(nsindex[0]->seq),
+ __le32_to_cpu(nsindex[1]->seq));
+ if (seq == (__le32_to_cpu(nsindex[1]->seq) & NSINDEX_SEQ_MASK))
+ return 1;
+ else
+ return 0;
+ break;
+ }
+
+ return -1;
+}
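The index selection above leans on a 2-bit sequence number. A minimal sketch of that rotation, assuming the nd_inc_seq() helper (defined elsewhere in this patch) cycles 1 -> 2 -> 3 -> 1, so the index whose sequence is the successor of the other's is the newer one even across the wrap:

	#include <stdio.h>

	/* stand-in for nd_inc_seq(): valid 2-bit sequence values cycle 1 -> 2 -> 3 -> 1 */
	static unsigned int inc_seq(unsigned int seq)
	{
		static const unsigned int next[] = { 0, 2, 3, 1 };

		return next[seq & 3];
	}

	int main(void)
	{
		unsigned int a = 3, b = inc_seq(a); /* b == 1: wrapped, yet newer than a */

		printf("seq %u then %u, newer index has seq %u\n", a, b, inc_seq(a) == b ? b : a);
		return 0;
	}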
+
+void nd_label_copy(struct nvdimm_drvdata *ndd, struct nd_namespace_index *dst,
+ struct nd_namespace_index *src)
+{
+ if (dst && src)
+ /* pass */;
+ else
+ return;
+
+ memcpy(dst, src, sizeof_namespace_index(ndd));
+}
+
+static struct nd_namespace_label *nd_label_base(struct nvdimm_drvdata *ndd)
+{
+ void *base = to_namespace_index(ndd, 0);
+
+ return base + 2 * sizeof_namespace_index(ndd);
+}
+
+static int to_slot(struct nvdimm_drvdata *ndd,
+ struct nd_namespace_label *nd_label)
+{
+ return nd_label - nd_label_base(ndd);
+}
+
+#define for_each_clear_bit_le(bit, addr, size) \
+ for ((bit) = find_next_zero_bit_le((addr), (size), 0); \
+ (bit) < (size); \
+ (bit) = find_next_zero_bit_le((addr), (size), (bit) + 1))
+
+/**
+ * preamble_index - common variable initialization for nd_label_* routines
+ * @ndd: dimm container for the relevant label set
+ * @idx: namespace_index index
+ * @nsindex_out: on return set to the namespace index at @idx
+ * @free: on return set to the free label bitmap in the index
+ * @nslot: on return set to the number of slots in the label space
+ */
+static bool preamble_index(struct nvdimm_drvdata *ndd, int idx,
+ struct nd_namespace_index **nsindex_out,
+ unsigned long **free, u32 *nslot)
+{
+ struct nd_namespace_index *nsindex;
+
+ nsindex = to_namespace_index(ndd, idx);
+ if (nsindex == NULL)
+ return false;
+
+ *free = (unsigned long *) nsindex->free;
+ *nslot = __le32_to_cpu(nsindex->nslot);
+ *nsindex_out = nsindex;
+
+ return true;
+}
+
+char *nd_label_gen_id(struct nd_label_id *label_id, u8 *uuid, u32 flags)
+{
+ if (!label_id || !uuid)
+ return NULL;
+ snprintf(label_id->id, ND_LABEL_ID_SIZE, "%s-%pUb",
+ flags & NSLABEL_FLAG_LOCAL ? "blk" : "pmem", uuid);
+ return label_id->id;
+}
+
+static bool preamble_current(struct nvdimm_drvdata *ndd,
+ struct nd_namespace_index **nsindex,
+ unsigned long **free, u32 *nslot)
+{
+ return preamble_index(ndd, ndd->ns_current, nsindex,
+ free, nslot);
+}
+
+static bool preamble_next(struct nvdimm_drvdata *ndd,
+ struct nd_namespace_index **nsindex,
+ unsigned long **free, u32 *nslot)
+{
+ return preamble_index(ndd, ndd->ns_next, nsindex,
+ free, nslot);
+}
+
+static bool slot_valid(struct nd_namespace_label *nd_label, u32 slot)
+{
+ /* check that this label was written to the slot we expect */
+ if (slot != __le32_to_cpu(nd_label->slot))
+ return false;
+
+ /* check that DPA allocations are page aligned */
+ if ((__le64_to_cpu(nd_label->dpa)
+ | __le64_to_cpu(nd_label->rawsize)) % SZ_4K)
+ return false;
+
+ return true;
+}
+
+int nd_label_reserve_dpa(struct nvdimm_drvdata *ndd)
+{
+ struct nd_namespace_index *nsindex;
+ unsigned long *free;
+ u32 nslot, slot;
+
+ if (!preamble_current(ndd, &nsindex, &free, &nslot))
+ return 0; /* no label, nothing to reserve */
+
+ for_each_clear_bit_le(slot, free, nslot) {
+ struct nd_namespace_label *nd_label;
+ struct nd_region *nd_region = NULL;
+ u8 label_uuid[NSLABEL_UUID_LEN];
+ struct nd_label_id label_id;
+ struct resource *res;
+ u32 flags;
+
+ nd_label = nd_label_base(ndd) + slot;
+
+ if (!slot_valid(nd_label, slot))
+ continue;
+
+ memcpy(label_uuid, nd_label->uuid, NSLABEL_UUID_LEN);
+ flags = __le32_to_cpu(nd_label->flags);
+ nd_label_gen_id(&label_id, label_uuid, flags);
+ res = nvdimm_allocate_dpa(ndd, &label_id,
+ __le64_to_cpu(nd_label->dpa),
+ __le64_to_cpu(nd_label->rawsize));
+ nd_dbg_dpa(nd_region, ndd, res, "reserve\n");
+ if (!res)
+ return -EBUSY;
+ }
+
+ return 0;
+}
+
+int nd_label_active_count(struct nvdimm_drvdata *ndd)
+{
+ struct nd_namespace_index *nsindex;
+ unsigned long *free;
+ u32 nslot, slot;
+ int count = 0;
+
+ if (!preamble_current(ndd, &nsindex, &free, &nslot))
+ return 0;
+
+ for_each_clear_bit_le(slot, free, nslot) {
+ struct nd_namespace_label *nd_label;
+
+ nd_label = nd_label_base(ndd) + slot;
+
+ if (!slot_valid(nd_label, slot)) {
+ u32 label_slot = __le32_to_cpu(nd_label->slot);
+ u64 size = __le64_to_cpu(nd_label->rawsize);
+ u64 dpa = __le64_to_cpu(nd_label->dpa);
+
+ dev_dbg(ndd->dev,
+ "%s: slot%d invalid slot: %d dpa: %llx size: %llx\n",
+ __func__, slot, label_slot, dpa, size);
+ continue;
+ }
+ count++;
+ }
+ return count;
+}
+
+struct nd_namespace_label *nd_label_active(struct nvdimm_drvdata *ndd, int n)
+{
+ struct nd_namespace_index *nsindex;
+ unsigned long *free;
+ u32 nslot, slot;
+
+ if (!preamble_current(ndd, &nsindex, &free, &nslot))
+ return NULL;
+
+ for_each_clear_bit_le(slot, free, nslot) {
+ struct nd_namespace_label *nd_label;
+
+ nd_label = nd_label_base(ndd) + slot;
+ if (!slot_valid(nd_label, slot))
+ continue;
+
+ if (n-- == 0)
+ return nd_label_base(ndd) + slot;
+ }
+
+ return NULL;
+}
+
+u32 nd_label_alloc_slot(struct nvdimm_drvdata *ndd)
+{
+ struct nd_namespace_index *nsindex;
+ unsigned long *free;
+ u32 nslot, slot;
+
+ if (!preamble_next(ndd, &nsindex, &free, &nslot))
+ return UINT_MAX;
+
+ WARN_ON(!is_nvdimm_bus_locked(ndd->dev));
+
+ slot = find_next_bit_le(free, nslot, 0);
+ if (slot == nslot)
+ return UINT_MAX;
+
+ clear_bit_le(slot, free);
+
+ return slot;
+}
+
+bool nd_label_free_slot(struct nvdimm_drvdata *ndd, u32 slot)
+{
+ struct nd_namespace_index *nsindex;
+ unsigned long *free;
+ u32 nslot;
+
+ if (!preamble_next(ndd, &nsindex, &free, &nslot))
+ return false;
+
+ WARN_ON(!is_nvdimm_bus_locked(ndd->dev));
+
+ if (slot < nslot)
+ return !test_and_set_bit_le(slot, free);
+ return false;
+}
+
+u32 nd_label_nfree(struct nvdimm_drvdata *ndd)
+{
+ struct nd_namespace_index *nsindex;
+ unsigned long *free;
+ u32 nslot;
+
+ WARN_ON(!is_nvdimm_bus_locked(ndd->dev));
+
+ if (!preamble_next(ndd, &nsindex, &free, &nslot))
+ return nvdimm_num_label_slots(ndd);
+
+ return bitmap_weight(free, nslot);
+}
+
+static int nd_label_write_index(struct nvdimm_drvdata *ndd, int index, u32 seq,
+ unsigned long flags)
+{
+ struct nd_namespace_index *nsindex;
+ unsigned long offset;
+ u64 checksum;
+ u32 nslot;
+ int rc;
+
+ nsindex = to_namespace_index(ndd, index);
+ if (flags & ND_NSINDEX_INIT)
+ nslot = nvdimm_num_label_slots(ndd);
+ else
+ nslot = __le32_to_cpu(nsindex->nslot);
+
+ memcpy(nsindex->sig, NSINDEX_SIGNATURE, NSINDEX_SIG_LEN);
+ nsindex->flags = __cpu_to_le32(0);
+ nsindex->seq = __cpu_to_le32(seq);
+ offset = (unsigned long) nsindex
+ - (unsigned long) to_namespace_index(ndd, 0);
+ nsindex->myoff = __cpu_to_le64(offset);
+ nsindex->mysize = __cpu_to_le64(sizeof_namespace_index(ndd));
+ offset = (unsigned long) to_namespace_index(ndd,
+ nd_label_next_nsindex(index))
+ - (unsigned long) to_namespace_index(ndd, 0);
+ nsindex->otheroff = __cpu_to_le64(offset);
+ offset = (unsigned long) nd_label_base(ndd)
+ - (unsigned long) to_namespace_index(ndd, 0);
+ nsindex->labeloff = __cpu_to_le64(offset);
+ nsindex->nslot = __cpu_to_le32(nslot);
+ nsindex->major = __cpu_to_le16(1);
+ nsindex->minor = __cpu_to_le16(1);
+ nsindex->checksum = __cpu_to_le64(0);
+ if (flags & ND_NSINDEX_INIT) {
+ unsigned long *free = (unsigned long *) nsindex->free;
+ u32 nfree = ALIGN(nslot, BITS_PER_LONG);
+ int last_bits, i;
+
+ memset(nsindex->free, 0xff, nfree / 8);
+ for (i = 0, last_bits = nfree - nslot; i < last_bits; i++)
+ clear_bit_le(nslot + i, free);
+ }
+ checksum = nd_fletcher64(nsindex, sizeof_namespace_index(ndd), 1);
+ nsindex->checksum = __cpu_to_le64(checksum);
+ rc = nvdimm_set_config_data(ndd, __le64_to_cpu(nsindex->myoff),
+ nsindex, sizeof_namespace_index(ndd));
+ if (rc < 0)
+ return rc;
+
+ if (flags & ND_NSINDEX_INIT)
+ return 0;
+
+ /* copy the index we just wrote to the new 'next' */
+ WARN_ON(index != ndd->ns_next);
+ nd_label_copy(ndd, to_current_namespace_index(ndd), nsindex);
+ ndd->ns_current = nd_label_next_nsindex(ndd->ns_current);
+ ndd->ns_next = nd_label_next_nsindex(ndd->ns_next);
+ WARN_ON(ndd->ns_current == ndd->ns_next);
+
+ return 0;
+}
+
+static unsigned long nd_label_offset(struct nvdimm_drvdata *ndd,
+ struct nd_namespace_label *nd_label)
+{
+ return (unsigned long) nd_label
+ - (unsigned long) to_namespace_index(ndd, 0);
+}
+
+static int __pmem_label_update(struct nd_region *nd_region,
+ struct nd_mapping *nd_mapping, struct nd_namespace_pmem *nspm,
+ int pos)
+{
+ u64 cookie = nd_region_interleave_set_cookie(nd_region), rawsize;
+ struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+ struct nd_namespace_label *victim_label;
+ struct nd_namespace_label *nd_label;
+ struct nd_namespace_index *nsindex;
+ unsigned long *free;
+ u32 nslot, slot;
+ size_t offset;
+ int rc;
+
+ if (!preamble_next(ndd, &nsindex, &free, &nslot))
+ return -ENXIO;
+
+ /* allocate and write the label to the staging (next) index */
+ slot = nd_label_alloc_slot(ndd);
+ if (slot == UINT_MAX)
+ return -ENXIO;
+ dev_dbg(ndd->dev, "%s: allocated: %d\n", __func__, slot);
+
+ nd_label = nd_label_base(ndd) + slot;
+ memset(nd_label, 0, sizeof(struct nd_namespace_label));
+ memcpy(nd_label->uuid, nspm->uuid, NSLABEL_UUID_LEN);
+ if (nspm->alt_name)
+ memcpy(nd_label->name, nspm->alt_name, NSLABEL_NAME_LEN);
+ nd_label->flags = __cpu_to_le32(NSLABEL_FLAG_UPDATING);
+ nd_label->nlabel = __cpu_to_le16(nd_region->ndr_mappings);
+ nd_label->position = __cpu_to_le16(pos);
+ nd_label->isetcookie = __cpu_to_le64(cookie);
+ rawsize = div_u64(resource_size(&nspm->nsio.res),
+ nd_region->ndr_mappings);
+ nd_label->rawsize = __cpu_to_le64(rawsize);
+ nd_label->dpa = __cpu_to_le64(nd_mapping->start);
+ nd_label->slot = __cpu_to_le32(slot);
+
+ /* update label */
+ offset = nd_label_offset(ndd, nd_label);
+ rc = nvdimm_set_config_data(ndd, offset, nd_label,
+ sizeof(struct nd_namespace_label));
+ if (rc < 0)
+ return rc;
+
+ /* Garbage collect the previous label */
+ victim_label = nd_mapping->labels[0];
+ if (victim_label) {
+ slot = to_slot(ndd, victim_label);
+ nd_label_free_slot(ndd, slot);
+ dev_dbg(ndd->dev, "%s: free: %d\n", __func__, slot);
+ }
+
+ /* update index */
+ rc = nd_label_write_index(ndd, ndd->ns_next,
+ nd_inc_seq(__le32_to_cpu(nsindex->seq)), 0);
+ if (rc < 0)
+ return rc;
+
+ nd_mapping->labels[0] = nd_label;
+
+ return 0;
+}
+
+static void del_label(struct nd_mapping *nd_mapping, int l)
+{
+ struct nd_namespace_label *next_label, *nd_label;
+ struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+ unsigned int slot;
+ int j;
+
+ nd_label = nd_mapping->labels[l];
+ slot = to_slot(ndd, nd_label);
+ dev_vdbg(ndd->dev, "%s: clear: %d\n", __func__, slot);
+
+ for (j = l; (next_label = nd_mapping->labels[j + 1]); j++)
+ nd_mapping->labels[j] = next_label;
+ nd_mapping->labels[j] = NULL;
+}
+
+static bool is_old_resource(struct resource *res, struct resource **list, int n)
+{
+ int i;
+
+ if (res->flags & DPA_RESOURCE_ADJUSTED)
+ return false;
+ for (i = 0; i < n; i++)
+ if (res == list[i])
+ return true;
+ return false;
+}
+
+static struct resource *to_resource(struct nvdimm_drvdata *ndd,
+ struct nd_namespace_label *nd_label)
+{
+ struct resource *res;
+
+ for_each_dpa_resource(ndd, res) {
+ if (res->start != __le64_to_cpu(nd_label->dpa))
+ continue;
+ if (resource_size(res) != __le64_to_cpu(nd_label->rawsize))
+ continue;
+ return res;
+ }
+
+ return NULL;
+}
+
+/*
+ * 1/ Account all the labels that can be freed after this update
+ * 2/ Allocate and write the label to the staging (next) index
+ * 3/ Record the resources in the namespace device
+ */
+static int __blk_label_update(struct nd_region *nd_region,
+ struct nd_mapping *nd_mapping, struct nd_namespace_blk *nsblk,
+ int num_labels)
+{
+ int i, l, alloc, victims, nfree, old_num_resources, nlabel, rc = -ENXIO;
+ struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+ struct nd_namespace_label *nd_label;
+ struct nd_namespace_index *nsindex;
+ unsigned long *free, *victim_map = NULL;
+ struct resource *res, **old_res_list;
+ struct nd_label_id label_id;
+ u8 uuid[NSLABEL_UUID_LEN];
+ u32 nslot, slot;
+
+ if (!preamble_next(ndd, &nsindex, &free, &nslot))
+ return -ENXIO;
+
+ old_res_list = nsblk->res;
+ nfree = nd_label_nfree(ndd);
+ old_num_resources = nsblk->num_resources;
+ nd_label_gen_id(&label_id, nsblk->uuid, NSLABEL_FLAG_LOCAL);
+
+ /*
+ * We need to loop over the old resources a few times, which seems a
+ * bit inefficient, but we need to know that we have enough label
+ * space before we start mutating the tracking structures.
+ * Otherwise the recovery method of last resort for userspace is to
+ * disable and re-enable the parent region.
+ */
+ alloc = 0;
+ for_each_dpa_resource(ndd, res) {
+ if (strcmp(res->name, label_id.id) != 0)
+ continue;
+ if (!is_old_resource(res, old_res_list, old_num_resources))
+ alloc++;
+ }
+
+ victims = 0;
+ if (old_num_resources) {
+ /* convert old local-label-map to dimm-slot victim-map */
+ victim_map = kcalloc(BITS_TO_LONGS(nslot), sizeof(long),
+ GFP_KERNEL);
+ if (!victim_map)
+ return -ENOMEM;
+
+ /* mark unused labels for garbage collection */
+ for_each_clear_bit_le(slot, free, nslot) {
+ nd_label = nd_label_base(ndd) + slot;
+ memcpy(uuid, nd_label->uuid, NSLABEL_UUID_LEN);
+ if (memcmp(uuid, nsblk->uuid, NSLABEL_UUID_LEN) != 0)
+ continue;
+ res = to_resource(ndd, nd_label);
+ if (res && is_old_resource(res, old_res_list,
+ old_num_resources))
+ continue;
+ slot = to_slot(ndd, nd_label);
+ set_bit(slot, victim_map);
+ victims++;
+ }
+ }
+
+ /* don't allow updates that consume the last label */
+ if (nfree - alloc < 0 || nfree - alloc + victims < 1) {
+ dev_info(&nsblk->common.dev, "insufficient label space\n");
+ kfree(victim_map);
+ return -ENOSPC;
+ }
+ /* from here on we need to abort on error */
+
+
+ /* assign all resources to the namespace before writing the labels */
+ nsblk->res = NULL;
+ nsblk->num_resources = 0;
+ for_each_dpa_resource(ndd, res) {
+ if (strcmp(res->name, label_id.id) != 0)
+ continue;
+ if (!nsblk_add_resource(nd_region, ndd, nsblk, res->start)) {
+ rc = -ENOMEM;
+ goto abort;
+ }
+ }
+
+ for (i = 0; i < nsblk->num_resources; i++) {
+ size_t offset;
+
+ res = nsblk->res[i];
+ if (is_old_resource(res, old_res_list, old_num_resources))
+ continue; /* carry-over */
+ slot = nd_label_alloc_slot(ndd);
+ if (slot == UINT_MAX)
+ goto abort;
+ dev_dbg(ndd->dev, "%s: allocated: %d\n", __func__, slot);
+
+ nd_label = nd_label_base(ndd) + slot;
+ memset(nd_label, 0, sizeof(struct nd_namespace_label));
+ memcpy(nd_label->uuid, nsblk->uuid, NSLABEL_UUID_LEN);
+ if (nsblk->alt_name)
+ memcpy(nd_label->name, nsblk->alt_name,
+ NSLABEL_NAME_LEN);
+ nd_label->flags = __cpu_to_le32(NSLABEL_FLAG_LOCAL);
+ nd_label->nlabel = __cpu_to_le16(0); /* N/A */
+ nd_label->position = __cpu_to_le16(0); /* N/A */
+ nd_label->isetcookie = __cpu_to_le64(0); /* N/A */
+ nd_label->dpa = __cpu_to_le64(res->start);
+ nd_label->rawsize = __cpu_to_le64(resource_size(res));
+ nd_label->lbasize = __cpu_to_le64(nsblk->lbasize);
+ nd_label->slot = __cpu_to_le32(slot);
+
+ /* update label */
+ offset = nd_label_offset(ndd, nd_label);
+ rc = nvdimm_set_config_data(ndd, offset, nd_label,
+ sizeof(struct nd_namespace_label));
+ if (rc < 0)
+ goto abort;
+ }
+
+ /* free up now unused slots in the new index */
+ for_each_set_bit(slot, victim_map, victim_map ? nslot : 0) {
+ dev_dbg(ndd->dev, "%s: free: %d\n", __func__, slot);
+ nd_label_free_slot(ndd, slot);
+ }
+
+ /* update index */
+ rc = nd_label_write_index(ndd, ndd->ns_next,
+ nd_inc_seq(__le32_to_cpu(nsindex->seq)), 0);
+ if (rc)
+ goto abort;
+
+ /*
+ * Now that the on-dimm labels are up to date, fix up the tracking
+ * entries in nd_mapping->labels
+ */
+ nlabel = 0;
+ for_each_label(l, nd_label, nd_mapping->labels) {
+ nlabel++;
+ memcpy(uuid, nd_label->uuid, NSLABEL_UUID_LEN);
+ if (memcmp(uuid, nsblk->uuid, NSLABEL_UUID_LEN) != 0)
+ continue;
+ nlabel--;
+ del_label(nd_mapping, l);
+ l--; /* retry with the new label at this index */
+ }
+ if (nlabel + nsblk->num_resources > num_labels) {
+ /*
+ * Bug, we can't end up with more resources than
+ * available labels
+ */
+ WARN_ON_ONCE(1);
+ rc = -ENXIO;
+ goto out;
+ }
+
+ for_each_clear_bit_le(slot, free, nslot) {
+ nd_label = nd_label_base(ndd) + slot;
+ memcpy(uuid, nd_label->uuid, NSLABEL_UUID_LEN);
+ if (memcmp(uuid, nsblk->uuid, NSLABEL_UUID_LEN) != 0)
+ continue;
+ res = to_resource(ndd, nd_label);
+ res->flags &= ~DPA_RESOURCE_ADJUSTED;
+ dev_vdbg(&nsblk->common.dev, "assign label[%d] slot: %d\n",
+ l, slot);
+ nd_mapping->labels[l++] = nd_label;
+ }
+ nd_mapping->labels[l] = NULL;
+
+ out:
+ kfree(old_res_list);
+ kfree(victim_map);
+ return rc;
+
+ abort:
+ /*
+ * 1/ repair the allocated label bitmap in the index
+ * 2/ restore the resource list
+ */
+ nd_label_copy(ndd, nsindex, to_current_namespace_index(ndd));
+ kfree(nsblk->res);
+ nsblk->res = old_res_list;
+ nsblk->num_resources = old_num_resources;
+ old_res_list = NULL;
+ goto out;
+}
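A hypothetical worked example of the "don't consume the last label" check in the function above: with two free slots, two new labels to write, and one victim label to garbage collect, one slot is still free after the update, so it proceeds. The values are illustrative only:

	#include <stdio.h>

	int main(void)
	{
		int nfree = 2, alloc = 2, victims = 1; /* illustrative values only */

		if (nfree - alloc < 0 || nfree - alloc + victims < 1)
			puts("insufficient label space");
		else
			puts("update allowed"); /* 2 - 2 + 1 == 1 slot remains free */
		return 0;
	}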
+
+static int init_labels(struct nd_mapping *nd_mapping, int num_labels)
+{
+ int i, l, old_num_labels = 0;
+ struct nd_namespace_index *nsindex;
+ struct nd_namespace_label *nd_label;
+ struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+ size_t size = (num_labels + 1) * sizeof(struct nd_namespace_label *);
+
+ for_each_label(l, nd_label, nd_mapping->labels)
+ old_num_labels++;
+
+ /*
+ * We need to preserve all the old labels for the mapping so
+ * they can be garbage collected after writing the new labels.
+ */
+ if (num_labels > old_num_labels) {
+ struct nd_namespace_label **labels;
+
+ labels = krealloc(nd_mapping->labels, size, GFP_KERNEL);
+ if (!labels)
+ return -ENOMEM;
+ nd_mapping->labels = labels;
+ }
+ if (!nd_mapping->labels)
+ return -ENOMEM;
+
+ for (i = old_num_labels; i <= num_labels; i++)
+ nd_mapping->labels[i] = NULL;
+
+ if (ndd->ns_current == -1 || ndd->ns_next == -1)
+ /* pass */;
+ else
+ return max(num_labels, old_num_labels);
+
+ nsindex = to_namespace_index(ndd, 0);
+ memset(nsindex, 0, ndd->nsarea.config_size);
+ for (i = 0; i < 2; i++) {
+ int rc = nd_label_write_index(ndd, i, i*2, ND_NSINDEX_INIT);
+
+ if (rc)
+ return rc;
+ }
+ ndd->ns_next = 1;
+ ndd->ns_current = 0;
+
+ return max(num_labels, old_num_labels);
+}
+
+static int del_labels(struct nd_mapping *nd_mapping, u8 *uuid)
+{
+ struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+ struct nd_namespace_label *nd_label;
+ struct nd_namespace_index *nsindex;
+ u8 label_uuid[NSLABEL_UUID_LEN];
+ int l, num_freed = 0;
+ unsigned long *free;
+ u32 nslot, slot;
+
+ if (!uuid)
+ return 0;
+
+ /* no index || no labels == nothing to delete */
+ if (!preamble_next(ndd, &nsindex, &free, &nslot)
+ || !nd_mapping->labels)
+ return 0;
+
+ for_each_label(l, nd_label, nd_mapping->labels) {
+ memcpy(label_uuid, nd_label->uuid, NSLABEL_UUID_LEN);
+ if (memcmp(label_uuid, uuid, NSLABEL_UUID_LEN) != 0)
+ continue;
+ slot = to_slot(ndd, nd_label);
+ nd_label_free_slot(ndd, slot);
+ dev_dbg(ndd->dev, "%s: free: %d\n", __func__, slot);
+ del_label(nd_mapping, l);
+ num_freed++;
+ l--; /* retry with new label at this index */
+ }
+
+ if (num_freed > l) {
+ /*
+ * num_freed will only ever be > l when we delete the last
+ * label
+ */
+ kfree(nd_mapping->labels);
+ nd_mapping->labels = NULL;
+ dev_dbg(ndd->dev, "%s: no more labels\n", __func__);
+ }
+
+ return nd_label_write_index(ndd, ndd->ns_next,
+ nd_inc_seq(__le32_to_cpu(nsindex->seq)), 0);
+}
+
+int nd_pmem_namespace_label_update(struct nd_region *nd_region,
+ struct nd_namespace_pmem *nspm, resource_size_t size)
+{
+ int i;
+
+ for (i = 0; i < nd_region->ndr_mappings; i++) {
+ struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+ int rc;
+
+ if (size == 0) {
+ rc = del_labels(nd_mapping, nspm->uuid);
+ if (rc)
+ return rc;
+ continue;
+ }
+
+ rc = init_labels(nd_mapping, 1);
+ if (rc < 0)
+ return rc;
+
+ rc = __pmem_label_update(nd_region, nd_mapping, nspm, i);
+ if (rc)
+ return rc;
+ }
+
+ return 0;
+}
+
+int nd_blk_namespace_label_update(struct nd_region *nd_region,
+ struct nd_namespace_blk *nsblk, resource_size_t size)
+{
+ struct nd_mapping *nd_mapping = &nd_region->mapping[0];
+ struct resource *res;
+ int count = 0;
+
+ if (size == 0)
+ return del_labels(nd_mapping, nsblk->uuid);
+
+ for_each_dpa_resource(to_ndd(nd_mapping), res)
+ count++;
+
+ count = init_labels(nd_mapping, count);
+ if (count < 0)
+ return count;
+
+ return __blk_label_update(nd_region, nd_mapping, nsblk, count);
+}
diff --git a/drivers/nvdimm/label.h b/drivers/nvdimm/label.h
new file mode 100644
index 000000000000..a59ef6eef2a3
--- /dev/null
+++ b/drivers/nvdimm/label.h
@@ -0,0 +1,141 @@
+/*
+ * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#ifndef __LABEL_H__
+#define __LABEL_H__
+
+#include <linux/ndctl.h>
+#include <linux/sizes.h>
+#include <linux/io.h>
+
+enum {
+ NSINDEX_SIG_LEN = 16,
+ NSINDEX_ALIGN = 256,
+ NSINDEX_SEQ_MASK = 0x3,
+ NSLABEL_UUID_LEN = 16,
+ NSLABEL_NAME_LEN = 64,
+ NSLABEL_FLAG_ROLABEL = 0x1, /* read-only label */
+ NSLABEL_FLAG_LOCAL = 0x2, /* DIMM-local namespace */
+ NSLABEL_FLAG_BTT = 0x4, /* namespace contains a BTT */
+ NSLABEL_FLAG_UPDATING = 0x8, /* label being updated */
+ BTT_ALIGN = 4096, /* all btt structures */
+ BTTINFO_SIG_LEN = 16,
+ BTTINFO_UUID_LEN = 16,
+ BTTINFO_FLAG_ERROR = 0x1, /* error state (read-only) */
+ BTTINFO_MAJOR_VERSION = 1,
+ ND_LABEL_MIN_SIZE = 512 * 129, /* see sizeof_namespace_index() */
+ ND_LABEL_ID_SIZE = 50,
+ ND_NSINDEX_INIT = 0x1,
+};
+
+static const char NSINDEX_SIGNATURE[] = "NAMESPACE_INDEX\0";
+
+/**
+ * struct nd_namespace_index - label set superblock
+ * @sig: NAMESPACE_INDEX\0
+ * @flags: placeholder
+ * @seq: sequence number for this index
+ * @myoff: offset of this index in label area
+ * @mysize: size of this index struct
+ * @otheroff: offset of other index
+ * @labeloff: offset of first label slot
+ * @nslot: total number of label slots
+ * @major: label area major version
+ * @minor: label area minor version
+ * @checksum: fletcher64 of all fields
+ * @free[0]: bitmap, nlabel bits
+ *
+ * The size of free[] is rounded up so the total struct size is a
+ * multiple of NSINDEX_ALIGN bytes. Any bits this allocates beyond
+ * nlabel bits must be zero.
+ */
+struct nd_namespace_index {
+ u8 sig[NSINDEX_SIG_LEN];
+ __le32 flags;
+ __le32 seq;
+ __le64 myoff;
+ __le64 mysize;
+ __le64 otheroff;
+ __le64 labeloff;
+ __le32 nslot;
+ __le16 major;
+ __le16 minor;
+ __le64 checksum;
+ u8 free[0];
+};
+
+/**
+ * struct nd_namespace_label - namespace superblock
+ * @uuid: UUID per RFC 4122
+ * @name: optional name (NULL-terminated)
+ * @flags: see NSLABEL_FLAG_*
+ * @nlabel: num labels to describe this ns
+ * @position: labels position in set
+ * @isetcookie: interleave set cookie
+ * @lbasize: LBA size in bytes or 0 for pmem
+ * @dpa: DPA of NVM range on this DIMM
+ * @rawsize: size of namespace
+ * @slot: slot of this label in label area
+ * @unused: must be zero
+ */
+struct nd_namespace_label {
+ u8 uuid[NSLABEL_UUID_LEN];
+ u8 name[NSLABEL_NAME_LEN];
+ __le32 flags;
+ __le16 nlabel;
+ __le16 position;
+ __le64 isetcookie;
+ __le64 lbasize;
+ __le64 dpa;
+ __le64 rawsize;
+ __le32 slot;
+ __le32 unused;
+};
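As a sanity check on the layouts documented above, this userspace sketch mirrors the two structures with fixed-width stand-ins for the __le types; with natural alignment it reports 72 bytes of fixed index header plus the free[] bitmap, and a 128-byte label, which is the label size the "/129" budget in label.c is built around. This is an illustrative model, with renamed fields, rather than the kernel definitions:

	#include <stdint.h>
	#include <stdio.h>

	struct index_hdr {		/* models nd_namespace_index minus free[] */
		uint8_t sig[16];
		uint32_t flags, seq;
		uint64_t myoff, mysize, otheroff, labeloff;
		uint32_t nslot;
		uint16_t major_ver, minor_ver;
		uint64_t checksum;
	};

	struct label {			/* models nd_namespace_label */
		uint8_t uuid[16], name[64];
		uint32_t flags;
		uint16_t nlabel, position;
		uint64_t isetcookie, lbasize, dpa, rawsize;
		uint32_t slot, unused;
	};

	int main(void)
	{
		printf("index header: %zu bytes, label: %zu bytes\n",
				sizeof(struct index_hdr), sizeof(struct label));
		return 0;
	}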
+
+/**
+ * struct nd_label_id - identifier string for dpa allocation
+ * @id: "{blk|pmem}-<namespace uuid>"
+ */
+struct nd_label_id {
+ char id[ND_LABEL_ID_SIZE];
+};
+
+/*
+ * If the 'best' index is invalid, so is the 'next' index. Otherwise,
+ * the next index is MOD(index+1, 2)
+ */
+static inline int nd_label_next_nsindex(int index)
+{
+ if (index < 0)
+ return -1;
+
+ return (index + 1) % 2;
+}
+
+struct nvdimm_drvdata;
+int nd_label_validate(struct nvdimm_drvdata *ndd);
+void nd_label_copy(struct nvdimm_drvdata *ndd, struct nd_namespace_index *dst,
+ struct nd_namespace_index *src);
+size_t sizeof_namespace_index(struct nvdimm_drvdata *ndd);
+int nd_label_active_count(struct nvdimm_drvdata *ndd);
+struct nd_namespace_label *nd_label_active(struct nvdimm_drvdata *ndd, int n);
+u32 nd_label_alloc_slot(struct nvdimm_drvdata *ndd);
+bool nd_label_free_slot(struct nvdimm_drvdata *ndd, u32 slot);
+u32 nd_label_nfree(struct nvdimm_drvdata *ndd);
+struct nd_region;
+struct nd_namespace_pmem;
+struct nd_namespace_blk;
+int nd_pmem_namespace_label_update(struct nd_region *nd_region,
+ struct nd_namespace_pmem *nspm, resource_size_t size);
+int nd_blk_namespace_label_update(struct nd_region *nd_region,
+ struct nd_namespace_blk *nsblk, resource_size_t size);
+#endif /* __LABEL_H__ */
diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
new file mode 100644
index 000000000000..fef0dd80d4ad
--- /dev/null
+++ b/drivers/nvdimm/namespace_devs.c
@@ -0,0 +1,1870 @@
+/*
+ * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/nd.h>
+#include "nd-core.h"
+#include "nd.h"
+
+static void namespace_io_release(struct device *dev)
+{
+ struct nd_namespace_io *nsio = to_nd_namespace_io(dev);
+
+ kfree(nsio);
+}
+
+static void namespace_pmem_release(struct device *dev)
+{
+ struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
+
+ kfree(nspm->alt_name);
+ kfree(nspm->uuid);
+ kfree(nspm);
+}
+
+static void namespace_blk_release(struct device *dev)
+{
+ struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
+ struct nd_region *nd_region = to_nd_region(dev->parent);
+
+ if (nsblk->id >= 0)
+ ida_simple_remove(&nd_region->ns_ida, nsblk->id);
+ kfree(nsblk->alt_name);
+ kfree(nsblk->uuid);
+ kfree(nsblk->res);
+ kfree(nsblk);
+}
+
+static struct device_type namespace_io_device_type = {
+ .name = "nd_namespace_io",
+ .release = namespace_io_release,
+};
+
+static struct device_type namespace_pmem_device_type = {
+ .name = "nd_namespace_pmem",
+ .release = namespace_pmem_release,
+};
+
+static struct device_type namespace_blk_device_type = {
+ .name = "nd_namespace_blk",
+ .release = namespace_blk_release,
+};
+
+static bool is_namespace_pmem(struct device *dev)
+{
+ return dev ? dev->type == &namespace_pmem_device_type : false;
+}
+
+static bool is_namespace_blk(struct device *dev)
+{
+ return dev ? dev->type == &namespace_blk_device_type : false;
+}
+
+static bool is_namespace_io(struct device *dev)
+{
+ return dev ? dev->type == &namespace_io_device_type : false;
+}
+
+const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns,
+ char *name)
+{
+ struct nd_region *nd_region = to_nd_region(ndns->dev.parent);
+ const char *suffix = "";
+
+ if (ndns->claim && is_nd_btt(ndns->claim))
+ suffix = "s";
+
+ if (is_namespace_pmem(&ndns->dev) || is_namespace_io(&ndns->dev))
+ sprintf(name, "pmem%d%s", nd_region->id, suffix);
+ else if (is_namespace_blk(&ndns->dev)) {
+ struct nd_namespace_blk *nsblk;
+
+ nsblk = to_nd_namespace_blk(&ndns->dev);
+ sprintf(name, "ndblk%d.%d%s", nd_region->id, nsblk->id, suffix);
+ } else {
+ return NULL;
+ }
+
+ return name;
+}
+EXPORT_SYMBOL(nvdimm_namespace_disk_name);
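A small illustration of the names the helper above produces, using hypothetical region and namespace ids; the "s" suffix only appears when the namespace is claimed by a BTT:

	#include <stdio.h>

	int main(void)
	{
		char name[32];

		snprintf(name, sizeof(name), "pmem%d%s", 0, "");        /* "pmem0": raw pmem namespace in region 0 */
		puts(name);
		snprintf(name, sizeof(name), "pmem%d%s", 0, "s");       /* "pmem0s": same namespace claimed by a BTT */
		puts(name);
		snprintf(name, sizeof(name), "ndblk%d.%d%s", 0, 1, ""); /* "ndblk0.1": blk namespace id 1 in region 0 */
		puts(name);
		return 0;
	}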
+
+static ssize_t nstype_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_region *nd_region = to_nd_region(dev->parent);
+
+ return sprintf(buf, "%d\n", nd_region_to_nstype(nd_region));
+}
+static DEVICE_ATTR_RO(nstype);
+
+static ssize_t __alt_name_store(struct device *dev, const char *buf,
+ const size_t len)
+{
+ char *input, *pos, *alt_name, **ns_altname;
+ ssize_t rc;
+
+ if (is_namespace_pmem(dev)) {
+ struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
+
+ ns_altname = &nspm->alt_name;
+ } else if (is_namespace_blk(dev)) {
+ struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
+
+ ns_altname = &nsblk->alt_name;
+ } else
+ return -ENXIO;
+
+ if (dev->driver || to_ndns(dev)->claim)
+ return -EBUSY;
+
+ input = kmemdup(buf, len + 1, GFP_KERNEL);
+ if (!input)
+ return -ENOMEM;
+
+ input[len] = '\0';
+ pos = strim(input);
+ if (strlen(pos) + 1 > NSLABEL_NAME_LEN) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ alt_name = kzalloc(NSLABEL_NAME_LEN, GFP_KERNEL);
+ if (!alt_name) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ kfree(*ns_altname);
+ *ns_altname = alt_name;
+ sprintf(*ns_altname, "%s", pos);
+ rc = len;
+
+out:
+ kfree(input);
+ return rc;
+}
+
+static resource_size_t nd_namespace_blk_size(struct nd_namespace_blk *nsblk)
+{
+ struct nd_region *nd_region = to_nd_region(nsblk->common.dev.parent);
+ struct nd_mapping *nd_mapping = &nd_region->mapping[0];
+ struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+ struct nd_label_id label_id;
+ resource_size_t size = 0;
+ struct resource *res;
+
+ if (!nsblk->uuid)
+ return 0;
+ nd_label_gen_id(&label_id, nsblk->uuid, NSLABEL_FLAG_LOCAL);
+ for_each_dpa_resource(ndd, res)
+ if (strcmp(res->name, label_id.id) == 0)
+ size += resource_size(res);
+ return size;
+}
+
+static bool __nd_namespace_blk_validate(struct nd_namespace_blk *nsblk)
+{
+ struct nd_region *nd_region = to_nd_region(nsblk->common.dev.parent);
+ struct nd_mapping *nd_mapping = &nd_region->mapping[0];
+ struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+ struct nd_label_id label_id;
+ struct resource *res;
+ int count, i;
+
+ if (!nsblk->uuid || !nsblk->lbasize || !ndd)
+ return false;
+
+ count = 0;
+ nd_label_gen_id(&label_id, nsblk->uuid, NSLABEL_FLAG_LOCAL);
+ for_each_dpa_resource(ndd, res) {
+ if (strcmp(res->name, label_id.id) != 0)
+ continue;
+ /*
+ * Resources with unacknowledged adjustments indicate a
+ * failure to update labels.
+ */
+ if (res->flags & DPA_RESOURCE_ADJUSTED)
+ return false;
+ count++;
+ }
+
+ /* These values match after a successful label update */
+ if (count != nsblk->num_resources)
+ return false;
+
+ for (i = 0; i < nsblk->num_resources; i++) {
+ struct resource *found = NULL;
+
+ for_each_dpa_resource(ndd, res)
+ if (res == nsblk->res[i]) {
+ found = res;
+ break;
+ }
+ /* stale resource */
+ if (!found)
+ return false;
+ }
+
+ return true;
+}
+
+resource_size_t nd_namespace_blk_validate(struct nd_namespace_blk *nsblk)
+{
+ resource_size_t size;
+
+ nvdimm_bus_lock(&nsblk->common.dev);
+ size = __nd_namespace_blk_validate(nsblk);
+ nvdimm_bus_unlock(&nsblk->common.dev);
+
+ return size;
+}
+EXPORT_SYMBOL(nd_namespace_blk_validate);
+
+
+static int nd_namespace_label_update(struct nd_region *nd_region,
+ struct device *dev)
+{
+ dev_WARN_ONCE(dev, dev->driver || to_ndns(dev)->claim,
+ "namespace must be idle during label update\n");
+ if (dev->driver || to_ndns(dev)->claim)
+ return 0;
+
+ /*
+ * Only allow label writes that will result in a valid namespace
+ * or deletion of an existing namespace.
+ */
+ if (is_namespace_pmem(dev)) {
+ struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
+ resource_size_t size = resource_size(&nspm->nsio.res);
+
+ if (size == 0 && nspm->uuid)
+ /* delete allocation */;
+ else if (!nspm->uuid)
+ return 0;
+
+ return nd_pmem_namespace_label_update(nd_region, nspm, size);
+ } else if (is_namespace_blk(dev)) {
+ struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
+ resource_size_t size = nd_namespace_blk_size(nsblk);
+
+ if (size == 0 && nsblk->uuid)
+ /* delete allocation */;
+ else if (!nsblk->uuid || !nsblk->lbasize)
+ return 0;
+
+ return nd_blk_namespace_label_update(nd_region, nsblk, size);
+ } else
+ return -ENXIO;
+}
+
+static ssize_t alt_name_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+ struct nd_region *nd_region = to_nd_region(dev->parent);
+ ssize_t rc;
+
+ device_lock(dev);
+ nvdimm_bus_lock(dev);
+ wait_nvdimm_bus_probe_idle(dev);
+ rc = __alt_name_store(dev, buf, len);
+ if (rc >= 0)
+ rc = nd_namespace_label_update(nd_region, dev);
+ dev_dbg(dev, "%s: %s(%zd)\n", __func__, rc < 0 ? "fail " : "", rc);
+ nvdimm_bus_unlock(dev);
+ device_unlock(dev);
+
+ return rc < 0 ? rc : len;
+}
+
+static ssize_t alt_name_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ char *ns_altname;
+
+ if (is_namespace_pmem(dev)) {
+ struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
+
+ ns_altname = nspm->alt_name;
+ } else if (is_namespace_blk(dev)) {
+ struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
+
+ ns_altname = nsblk->alt_name;
+ } else
+ return -ENXIO;
+
+ return sprintf(buf, "%s\n", ns_altname ? ns_altname : "");
+}
+static DEVICE_ATTR_RW(alt_name);
+
+static int scan_free(struct nd_region *nd_region,
+ struct nd_mapping *nd_mapping, struct nd_label_id *label_id,
+ resource_size_t n)
+{
+ bool is_blk = strncmp(label_id->id, "blk", 3) == 0;
+ struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+ int rc = 0;
+
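+ /*
+ * Free from the label's last (highest-dpa) allocation first,
+ * deleting whole resources until the remainder can be satisfied
+ * by shrinking a single resource.
+ */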
+ while (n) {
+ struct resource *res, *last;
+ resource_size_t new_start;
+
+ last = NULL;
+ for_each_dpa_resource(ndd, res)
+ if (strcmp(res->name, label_id->id) == 0)
+ last = res;
+ res = last;
+ if (!res)
+ return 0;
+
+ if (n >= resource_size(res)) {
+ n -= resource_size(res);
+ nd_dbg_dpa(nd_region, ndd, res, "delete %d\n", rc);
+ nvdimm_free_dpa(ndd, res);
+ /* retry with last resource deleted */
+ continue;
+ }
+
+ /*
+ * Keep BLK allocations relegated to high DPA as much as
+ * possible
+ */
+ if (is_blk)
+ new_start = res->start + n;
+ else
+ new_start = res->start;
+
+ rc = adjust_resource(res, new_start, resource_size(res) - n);
+ if (rc == 0)
+ res->flags |= DPA_RESOURCE_ADJUSTED;
+ nd_dbg_dpa(nd_region, ndd, res, "shrink %d\n", rc);
+ break;
+ }
+
+ return rc;
+}
+
+/**
+ * shrink_dpa_allocation - for each dimm in region free n bytes for label_id
+ * @nd_region: the set of dimms to reclaim @n bytes from
+ * @label_id: unique identifier for the namespace consuming this dpa range
+ * @n: number of bytes per-dimm to release
+ *
+ * Assumes resources are ordered. Starting from the end try to
+ * adjust_resource() the allocation to @n, but if @n is larger than the
+ * allocation delete it and find the 'new' last allocation in the label
+ * set.
+ */
+static int shrink_dpa_allocation(struct nd_region *nd_region,
+ struct nd_label_id *label_id, resource_size_t n)
+{
+ int i;
+
+ for (i = 0; i < nd_region->ndr_mappings; i++) {
+ struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+ int rc;
+
+ rc = scan_free(nd_region, nd_mapping, label_id, n);
+ if (rc)
+ return rc;
+ }
+
+ return 0;
+}
+
+static resource_size_t init_dpa_allocation(struct nd_label_id *label_id,
+ struct nd_region *nd_region, struct nd_mapping *nd_mapping,
+ resource_size_t n)
+{
+ bool is_blk = strncmp(label_id->id, "blk", 3) == 0;
+ struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+ resource_size_t first_dpa;
+ struct resource *res;
+ int rc = 0;
+
+ /* allocate blk from highest dpa first */
+ if (is_blk)
+ first_dpa = nd_mapping->start + nd_mapping->size - n;
+ else
+ first_dpa = nd_mapping->start;
+
+ /* first resource allocation for this label-id or dimm */
+ res = nvdimm_allocate_dpa(ndd, label_id, first_dpa, n);
+ if (!res)
+ rc = -EBUSY;
+
+ nd_dbg_dpa(nd_region, ndd, res, "init %d\n", rc);
+ return rc ? n : 0;
+}
+
+static bool space_valid(bool is_pmem, bool is_reserve,
+ struct nd_label_id *label_id, struct resource *res)
+{
+ /*
+ * For BLK-space any free range is valid; for PMEM-space it must be
+ * contiguous with an existing allocation unless we are
+ * reserving pmem.
+ */
+ if (is_reserve || !is_pmem)
+ return true;
+ if (!res || strcmp(res->name, label_id->id) == 0)
+ return true;
+ return false;
+}
+
+enum alloc_loc {
+ ALLOC_ERR = 0, ALLOC_BEFORE, ALLOC_MID, ALLOC_AFTER,
+};
+
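+/*
+ * scan_allocate() - try to satisfy @n bytes of dpa for @label_id on one dimm
+ *
+ * Walk the free space before, between, and after the dimm's existing
+ * allocations, preferring to grow an adjacent allocation for this
+ * label over inserting a new resource.  Returns the number of bytes
+ * that could not be allocated.
+ */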
+static resource_size_t scan_allocate(struct nd_region *nd_region,
+ struct nd_mapping *nd_mapping, struct nd_label_id *label_id,
+ resource_size_t n)
+{
+ resource_size_t mapping_end = nd_mapping->start + nd_mapping->size - 1;
+ bool is_reserve = strcmp(label_id->id, "pmem-reserve") == 0;
+ bool is_pmem = strncmp(label_id->id, "pmem", 4) == 0;
+ struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+ const resource_size_t to_allocate = n;
+ struct resource *res;
+ int first;
+
+ retry:
+ first = 0;
+ for_each_dpa_resource(ndd, res) {
+ resource_size_t allocate, available = 0, free_start, free_end;
+ struct resource *next = res->sibling, *new_res = NULL;
+ enum alloc_loc loc = ALLOC_ERR;
+ const char *action;
+ int rc = 0;
+
+ /* ignore resources outside this nd_mapping */
+ if (res->start > mapping_end)
+ continue;
+ if (res->end < nd_mapping->start)
+ continue;
+
+ /* space at the beginning of the mapping */
+ if (!first++ && res->start > nd_mapping->start) {
+ free_start = nd_mapping->start;
+ available = res->start - free_start;
+ if (space_valid(is_pmem, is_reserve, label_id, NULL))
+ loc = ALLOC_BEFORE;
+ }
+
+ /* space between allocations */
+ if (!loc && next) {
+ free_start = res->start + resource_size(res);
+ free_end = min(mapping_end, next->start - 1);
+ if (space_valid(is_pmem, is_reserve, label_id, res)
+ && free_start < free_end) {
+ available = free_end + 1 - free_start;
+ loc = ALLOC_MID;
+ }
+ }
+
+ /* space at the end of the mapping */
+ if (!loc && !next) {
+ free_start = res->start + resource_size(res);
+ free_end = mapping_end;
+ if (space_valid(is_pmem, is_reserve, label_id, res)
+ && free_start < free_end) {
+ available = free_end + 1 - free_start;
+ loc = ALLOC_AFTER;
+ }
+ }
+
+ if (!loc || !available)
+ continue;
+ allocate = min(available, n);
+ switch (loc) {
+ case ALLOC_BEFORE:
+ if (strcmp(res->name, label_id->id) == 0) {
+ /* adjust current resource up */
+ if (is_pmem && !is_reserve)
+ return n;
+ rc = adjust_resource(res, res->start - allocate,
+ resource_size(res) + allocate);
+ action = "cur grow up";
+ } else
+ action = "allocate";
+ break;
+ case ALLOC_MID:
+ if (strcmp(next->name, label_id->id) == 0) {
+ /* adjust next resource up */
+ if (is_pmem && !is_reserve)
+ return n;
+ rc = adjust_resource(next, next->start
+ - allocate, resource_size(next)
+ + allocate);
+ new_res = next;
+ action = "next grow up";
+ } else if (strcmp(res->name, label_id->id) == 0) {
+ action = "grow down";
+ } else
+ action = "allocate";
+ break;
+ case ALLOC_AFTER:
+ if (strcmp(res->name, label_id->id) == 0)
+ action = "grow down";
+ else
+ action = "allocate";
+ break;
+ default:
+ return n;
+ }
+
+ if (strcmp(action, "allocate") == 0) {
+ /* BLK allocate bottom up */
+ if (!is_pmem)
+ free_start += available - allocate;
+ else if (!is_reserve && free_start != nd_mapping->start)
+ return n;
+
+ new_res = nvdimm_allocate_dpa(ndd, label_id,
+ free_start, allocate);
+ if (!new_res)
+ rc = -EBUSY;
+ } else if (strcmp(action, "grow down") == 0) {
+ /* adjust current resource down */
+ rc = adjust_resource(res, res->start, resource_size(res)
+ + allocate);
+ if (rc == 0)
+ res->flags |= DPA_RESOURCE_ADJUSTED;
+ }
+
+ if (!new_res)
+ new_res = res;
+
+ nd_dbg_dpa(nd_region, ndd, new_res, "%s(%d) %d\n",
+ action, loc, rc);
+
+ if (rc)
+ return n;
+
+ n -= allocate;
+ if (n) {
+ /*
+ * Retry scan with newly inserted resources.
+ * For example, if we did an ALLOC_BEFORE
+ * insertion there may also have been space
+ * available for an ALLOC_AFTER insertion, so we
+ * need to check this same resource again
+ */
+ goto retry;
+ } else
+ return 0;
+ }
+
+ /*
+ * If we allocated nothing in the BLK case it may be because we are in
+ * an initial "pmem-reserve pass". Only do an initial BLK allocation
+ * when none of the DPA space is reserved.
+ */
+ if ((is_pmem || !ndd->dpa.child) && n == to_allocate)
+ return init_dpa_allocation(label_id, nd_region, nd_mapping, n);
+ return n;
+}
+
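+/*
+ * Coalesce adjacent dpa allocations that carry the same blk label into
+ * a single resource; pmem allocations are skipped since scan_allocate()
+ * already keeps them contiguous.
+ */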
+static int merge_dpa(struct nd_region *nd_region,
+ struct nd_mapping *nd_mapping, struct nd_label_id *label_id)
+{
+ struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+ struct resource *res;
+
+ if (strncmp("pmem", label_id->id, 4) == 0)
+ return 0;
+ retry:
+ for_each_dpa_resource(ndd, res) {
+ int rc;
+ struct resource *next = res->sibling;
+ resource_size_t end = res->start + resource_size(res);
+
+ if (!next || strcmp(res->name, label_id->id) != 0
+ || strcmp(next->name, label_id->id) != 0
+ || end != next->start)
+ continue;
+ end += resource_size(next);
+ nvdimm_free_dpa(ndd, next);
+ rc = adjust_resource(res, res->start, end - res->start);
+ nd_dbg_dpa(nd_region, ndd, res, "merge %d\n", rc);
+ if (rc)
+ return rc;
+ res->flags |= DPA_RESOURCE_ADJUSTED;
+ goto retry;
+ }
+
+ return 0;
+}
+
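+/*
+ * Claim all currently-free pmem capacity on the target dimm under a
+ * temporary "pmem-reserve" label so that a subsequent blk allocation
+ * pass is steered to blk-only dpa space first.
+ */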
+static int __reserve_free_pmem(struct device *dev, void *data)
+{
+ struct nvdimm *nvdimm = data;
+ struct nd_region *nd_region;
+ struct nd_label_id label_id;
+ int i;
+
+ if (!is_nd_pmem(dev))
+ return 0;
+
+ nd_region = to_nd_region(dev);
+ if (nd_region->ndr_mappings == 0)
+ return 0;
+
+ memset(&label_id, 0, sizeof(label_id));
+ strcat(label_id.id, "pmem-reserve");
+ for (i = 0; i < nd_region->ndr_mappings; i++) {
+ struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+ resource_size_t n, rem = 0;
+
+ if (nd_mapping->nvdimm != nvdimm)
+ continue;
+
+ n = nd_pmem_available_dpa(nd_region, nd_mapping, &rem);
+ if (n == 0)
+ return 0;
+ rem = scan_allocate(nd_region, nd_mapping, &label_id, n);
+ dev_WARN_ONCE(&nd_region->dev, rem,
+ "pmem reserve underrun: %#llx of %#llx bytes\n",
+ (unsigned long long) n - rem,
+ (unsigned long long) n);
+ return rem ? -ENXIO : 0;
+ }
+
+ return 0;
+}
+
+static void release_free_pmem(struct nvdimm_bus *nvdimm_bus,
+ struct nd_mapping *nd_mapping)
+{
+ struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+ struct resource *res, *_res;
+
+ for_each_dpa_resource_safe(ndd, res, _res)
+ if (strcmp(res->name, "pmem-reserve") == 0)
+ nvdimm_free_dpa(ndd, res);
+}
+
+static int reserve_free_pmem(struct nvdimm_bus *nvdimm_bus,
+ struct nd_mapping *nd_mapping)
+{
+ struct nvdimm *nvdimm = nd_mapping->nvdimm;
+ int rc;
+
+ rc = device_for_each_child(&nvdimm_bus->dev, nvdimm,
+ __reserve_free_pmem);
+ if (rc)
+ release_free_pmem(nvdimm_bus, nd_mapping);
+ return rc;
+}
+
+/**
+ * grow_dpa_allocation - for each dimm allocate n bytes for @label_id
+ * @nd_region: the set of dimms to allocate @n more bytes from
+ * @label_id: unique identifier for the namespace consuming this dpa range
+ * @n: number of bytes per-dimm to add to the existing allocation
+ *
+ * Assumes resources are ordered. For BLK regions, first consume
+ * BLK-only available DPA free space, then consume PMEM-aliased DPA
+ * space starting at the highest DPA. For PMEM regions start
+ * allocations from the start of an interleave set and end at the first
+ * BLK allocation or the end of the interleave set, whichever comes
+ * first.
+ */
+static int grow_dpa_allocation(struct nd_region *nd_region,
+ struct nd_label_id *label_id, resource_size_t n)
+{
+ struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev);
+ bool is_pmem = strncmp(label_id->id, "pmem", 4) == 0;
+ int i;
+
+ for (i = 0; i < nd_region->ndr_mappings; i++) {
+ struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+ resource_size_t rem = n;
+ int rc, j;
+
+ /*
+ * In the BLK case try once with all unallocated PMEM
+ * reserved, and once without
+ */
+ for (j = is_pmem; j < 2; j++) {
+ bool blk_only = j == 0;
+
+ if (blk_only) {
+ rc = reserve_free_pmem(nvdimm_bus, nd_mapping);
+ if (rc)
+ return rc;
+ }
+ rem = scan_allocate(nd_region, nd_mapping,
+ label_id, rem);
+ if (blk_only)
+ release_free_pmem(nvdimm_bus, nd_mapping);
+
+ /* if the blk-only pass fell short, retry and allow encroachment into PMEM */
+ if (rem == 0)
+ break;
+ }
+
+ dev_WARN_ONCE(&nd_region->dev, rem,
+ "allocation underrun: %#llx of %#llx bytes\n",
+ (unsigned long long) n - rem,
+ (unsigned long long) n);
+ if (rem)
+ return -ENXIO;
+
+ rc = merge_dpa(nd_region, nd_mapping, label_id);
+ if (rc)
+ return rc;
+ }
+
+ return 0;
+}
+
+static void nd_namespace_pmem_set_size(struct nd_region *nd_region,
+ struct nd_namespace_pmem *nspm, resource_size_t size)
+{
+ struct resource *res = &nspm->nsio.res;
+
+ res->start = nd_region->ndr_start;
+ res->end = nd_region->ndr_start + size - 1;
+}
+
+static ssize_t __size_store(struct device *dev, unsigned long long val)
+{
+ resource_size_t allocated = 0, available = 0;
+ struct nd_region *nd_region = to_nd_region(dev->parent);
+ struct nd_mapping *nd_mapping;
+ struct nvdimm_drvdata *ndd;
+ struct nd_label_id label_id;
+ u32 flags = 0, remainder;
+ u8 *uuid = NULL;
+ int rc, i;
+
+ if (dev->driver || to_ndns(dev)->claim)
+ return -EBUSY;
+
+ if (is_namespace_pmem(dev)) {
+ struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
+
+ uuid = nspm->uuid;
+ } else if (is_namespace_blk(dev)) {
+ struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
+
+ uuid = nsblk->uuid;
+ flags = NSLABEL_FLAG_LOCAL;
+ }
+
+ /*
+ * We need a uuid for the allocation-label and dimm(s) on which
+ * to store the label.
+ */
+ if (!uuid || nd_region->ndr_mappings == 0)
+ return -ENXIO;
+
+ div_u64_rem(val, SZ_4K * nd_region->ndr_mappings, &remainder);
+ if (remainder) {
+ dev_dbg(dev, "%llu is not %dK aligned\n", val,
+ (SZ_4K * nd_region->ndr_mappings) / SZ_1K);
+ return -EINVAL;
+ }
+
+ nd_label_gen_id(&label_id, uuid, flags);
+ for (i = 0; i < nd_region->ndr_mappings; i++) {
+ nd_mapping = &nd_region->mapping[i];
+ ndd = to_ndd(nd_mapping);
+
+ /*
+ * All dimms in an interleave set, or the base dimm for a blk
+ * region, need to be enabled for the size to be changed.
+ */
+ if (!ndd)
+ return -ENXIO;
+
+ allocated += nvdimm_allocated_dpa(ndd, &label_id);
+ }
+ available = nd_region_available_dpa(nd_region);
+
+ if (val > available + allocated)
+ return -ENOSPC;
+
+ if (val == allocated)
+ return 0;
+
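+ /* convert the totals to per-dimm quantities before resizing */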
+ val = div_u64(val, nd_region->ndr_mappings);
+ allocated = div_u64(allocated, nd_region->ndr_mappings);
+ if (val < allocated)
+ rc = shrink_dpa_allocation(nd_region, &label_id,
+ allocated - val);
+ else
+ rc = grow_dpa_allocation(nd_region, &label_id, val - allocated);
+
+ if (rc)
+ return rc;
+
+ if (is_namespace_pmem(dev)) {
+ struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
+
+ nd_namespace_pmem_set_size(nd_region, nspm,
+ val * nd_region->ndr_mappings);
+ } else if (is_namespace_blk(dev)) {
+ struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
+
+ /*
+ * Try to delete the namespace if we deleted all of its
+ * allocation, this is not the seed device for the
+ * region, and it is not actively claimed by a btt
+ * instance.
+ */
+ if (val == 0 && nd_region->ns_seed != dev
+ && !nsblk->common.claim)
+ nd_device_unregister(dev, ND_ASYNC);
+ }
+
+ return rc;
+}
+
+static ssize_t size_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+ struct nd_region *nd_region = to_nd_region(dev->parent);
+ unsigned long long val;
+ u8 **uuid = NULL;
+ int rc;
+
+ rc = kstrtoull(buf, 0, &val);
+ if (rc)
+ return rc;
+
+ device_lock(dev);
+ nvdimm_bus_lock(dev);
+ wait_nvdimm_bus_probe_idle(dev);
+ rc = __size_store(dev, val);
+ if (rc >= 0)
+ rc = nd_namespace_label_update(nd_region, dev);
+
+ if (is_namespace_pmem(dev)) {
+ struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
+
+ uuid = &nspm->uuid;
+ } else if (is_namespace_blk(dev)) {
+ struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
+
+ uuid = &nsblk->uuid;
+ }
+
+ if (rc == 0 && val == 0 && uuid) {
+ /* setting size zero == 'delete namespace' */
+ kfree(*uuid);
+ *uuid = NULL;
+ }
+
+ dev_dbg(dev, "%s: %llx %s (%d)\n", __func__, val, rc < 0
+ ? "fail" : "success", rc);
+
+ nvdimm_bus_unlock(dev);
+ device_unlock(dev);
+
+ return rc < 0 ? rc : len;
+}
+
+resource_size_t __nvdimm_namespace_capacity(struct nd_namespace_common *ndns)
+{
+ struct device *dev = &ndns->dev;
+
+ if (is_namespace_pmem(dev)) {
+ struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
+
+ return resource_size(&nspm->nsio.res);
+ } else if (is_namespace_blk(dev)) {
+ return nd_namespace_blk_size(to_nd_namespace_blk(dev));
+ } else if (is_namespace_io(dev)) {
+ struct nd_namespace_io *nsio = to_nd_namespace_io(dev);
+
+ return resource_size(&nsio->res);
+ } else
+ WARN_ONCE(1, "unknown namespace type\n");
+ return 0;
+}
+
+resource_size_t nvdimm_namespace_capacity(struct nd_namespace_common *ndns)
+{
+ resource_size_t size;
+
+ nvdimm_bus_lock(&ndns->dev);
+ size = __nvdimm_namespace_capacity(ndns);
+ nvdimm_bus_unlock(&ndns->dev);
+
+ return size;
+}
+EXPORT_SYMBOL(nvdimm_namespace_capacity);
+
+static ssize_t size_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%llu\n", (unsigned long long)
+ nvdimm_namespace_capacity(to_ndns(dev)));
+}
+static DEVICE_ATTR(size, S_IRUGO, size_show, size_store);
+
+static ssize_t uuid_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ u8 *uuid;
+
+ if (is_namespace_pmem(dev)) {
+ struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
+
+ uuid = nspm->uuid;
+ } else if (is_namespace_blk(dev)) {
+ struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
+
+ uuid = nsblk->uuid;
+ } else
+ return -ENXIO;
+
+ if (uuid)
+ return sprintf(buf, "%pUb\n", uuid);
+ return sprintf(buf, "\n");
+}
+
+/**
+ * namespace_update_uuid - check for a unique uuid and whether we're "renaming"
+ * @nd_region: parent region so we can update all dimms in the set
+ * @dev: namespace type for generating label_id
+ * @new_uuid: incoming uuid
+ * @old_uuid: reference to the uuid storage location in the namespace object
+ */
+static int namespace_update_uuid(struct nd_region *nd_region,
+ struct device *dev, u8 *new_uuid, u8 **old_uuid)
+{
+ u32 flags = is_namespace_blk(dev) ? NSLABEL_FLAG_LOCAL : 0;
+ struct nd_label_id old_label_id;
+ struct nd_label_id new_label_id;
+ int i;
+
+ if (!nd_is_uuid_unique(dev, new_uuid))
+ return -EINVAL;
+
+ if (*old_uuid == NULL)
+ goto out;
+
+ /*
+ * If we've already written a label with this uuid, then it's
+ * too late to rename because we can't reliably update the uuid
+ * without losing the old namespace. Userspace must delete this
+ * namespace to abandon the old uuid.
+ */
+ for (i = 0; i < nd_region->ndr_mappings; i++) {
+ struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+
+ /*
+ * This check by itself is sufficient because old_uuid
+ * would be NULL above if this uuid did not exist in the
+ * currently written set.
+ *
+ * FIXME: can we delete uuid with zero dpa allocated?
+ */
+ if (nd_mapping->labels)
+ return -EBUSY;
+ }
+
+ nd_label_gen_id(&old_label_id, *old_uuid, flags);
+ nd_label_gen_id(&new_label_id, new_uuid, flags);
+ for (i = 0; i < nd_region->ndr_mappings; i++) {
+ struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+ struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+ struct resource *res;
+
+ for_each_dpa_resource(ndd, res)
+ if (strcmp(res->name, old_label_id.id) == 0)
+ sprintf((void *) res->name, "%s",
+ new_label_id.id);
+ }
+ kfree(*old_uuid);
+ out:
+ *old_uuid = new_uuid;
+ return 0;
+}
+
+static ssize_t uuid_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+ struct nd_region *nd_region = to_nd_region(dev->parent);
+ u8 *uuid = NULL;
+ ssize_t rc = 0;
+ u8 **ns_uuid;
+
+ if (is_namespace_pmem(dev)) {
+ struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
+
+ ns_uuid = &nspm->uuid;
+ } else if (is_namespace_blk(dev)) {
+ struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
+
+ ns_uuid = &nsblk->uuid;
+ } else
+ return -ENXIO;
+
+ device_lock(dev);
+ nvdimm_bus_lock(dev);
+ wait_nvdimm_bus_probe_idle(dev);
+ if (to_ndns(dev)->claim)
+ rc = -EBUSY;
+ if (rc >= 0)
+ rc = nd_uuid_store(dev, &uuid, buf, len);
+ if (rc >= 0)
+ rc = namespace_update_uuid(nd_region, dev, uuid, ns_uuid);
+ if (rc >= 0)
+ rc = nd_namespace_label_update(nd_region, dev);
+ else
+ kfree(uuid);
+ dev_dbg(dev, "%s: result: %zd wrote: %s%s", __func__,
+ rc, buf, buf[len - 1] == '\n' ? "" : "\n");
+ nvdimm_bus_unlock(dev);
+ device_unlock(dev);
+
+ return rc < 0 ? rc : len;
+}
+static DEVICE_ATTR_RW(uuid);
+
+static ssize_t resource_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct resource *res;
+
+ if (is_namespace_pmem(dev)) {
+ struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
+
+ res = &nspm->nsio.res;
+ } else if (is_namespace_io(dev)) {
+ struct nd_namespace_io *nsio = to_nd_namespace_io(dev);
+
+ res = &nsio->res;
+ } else
+ return -ENXIO;
+
+ /* no address to convey if the namespace has no allocation */
+ if (resource_size(res) == 0)
+ return -ENXIO;
+ return sprintf(buf, "%#llx\n", (unsigned long long) res->start);
+}
+static DEVICE_ATTR_RO(resource);
+
+static const unsigned long ns_lbasize_supported[] = { 512, 520, 528,
+ 4096, 4104, 4160, 4224, 0 };
+
+static ssize_t sector_size_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
+
+ if (!is_namespace_blk(dev))
+ return -ENXIO;
+
+ return nd_sector_size_show(nsblk->lbasize, ns_lbasize_supported, buf);
+}
+
+static ssize_t sector_size_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+ struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
+ struct nd_region *nd_region = to_nd_region(dev->parent);
+ ssize_t rc = 0;
+
+ if (!is_namespace_blk(dev))
+ return -ENXIO;
+
+ device_lock(dev);
+ nvdimm_bus_lock(dev);
+ if (to_ndns(dev)->claim)
+ rc = -EBUSY;
+ if (rc >= 0)
+ rc = nd_sector_size_store(dev, buf, &nsblk->lbasize,
+ ns_lbasize_supported);
+ if (rc >= 0)
+ rc = nd_namespace_label_update(nd_region, dev);
+ dev_dbg(dev, "%s: result: %zd %s: %s%s", __func__,
+ rc, rc < 0 ? "tried" : "wrote", buf,
+ buf[len - 1] == '\n' ? "" : "\n");
+ nvdimm_bus_unlock(dev);
+ device_unlock(dev);
+
+ return rc ? rc : len;
+}
+static DEVICE_ATTR_RW(sector_size);
+
+static ssize_t dpa_extents_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_region *nd_region = to_nd_region(dev->parent);
+ struct nd_label_id label_id;
+ int count = 0, i;
+ u8 *uuid = NULL;
+ u32 flags = 0;
+
+ nvdimm_bus_lock(dev);
+ if (is_namespace_pmem(dev)) {
+ struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
+
+ uuid = nspm->uuid;
+ flags = 0;
+ } else if (is_namespace_blk(dev)) {
+ struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
+
+ uuid = nsblk->uuid;
+ flags = NSLABEL_FLAG_LOCAL;
+ }
+
+ if (!uuid)
+ goto out;
+
+ nd_label_gen_id(&label_id, uuid, flags);
+ for (i = 0; i < nd_region->ndr_mappings; i++) {
+ struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+ struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+ struct resource *res;
+
+ for_each_dpa_resource(ndd, res)
+ if (strcmp(res->name, label_id.id) == 0)
+ count++;
+ }
+ out:
+ nvdimm_bus_unlock(dev);
+
+ return sprintf(buf, "%d\n", count);
+}
+static DEVICE_ATTR_RO(dpa_extents);
+
+static ssize_t holder_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_namespace_common *ndns = to_ndns(dev);
+ ssize_t rc;
+
+ device_lock(dev);
+ rc = sprintf(buf, "%s\n", ndns->claim ? dev_name(ndns->claim) : "");
+ device_unlock(dev);
+
+ return rc;
+}
+static DEVICE_ATTR_RO(holder);
+
+static ssize_t force_raw_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+ bool force_raw;
+ int rc = strtobool(buf, &force_raw);
+
+ if (rc)
+ return rc;
+
+ to_ndns(dev)->force_raw = force_raw;
+ return len;
+}
+
+static ssize_t force_raw_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%d\n", to_ndns(dev)->force_raw);
+}
+static DEVICE_ATTR_RW(force_raw);
+
+static struct attribute *nd_namespace_attributes[] = {
+ &dev_attr_nstype.attr,
+ &dev_attr_size.attr,
+ &dev_attr_uuid.attr,
+ &dev_attr_holder.attr,
+ &dev_attr_resource.attr,
+ &dev_attr_alt_name.attr,
+ &dev_attr_force_raw.attr,
+ &dev_attr_sector_size.attr,
+ &dev_attr_dpa_extents.attr,
+ NULL,
+};
+
+static umode_t namespace_visible(struct kobject *kobj,
+ struct attribute *a, int n)
+{
+ struct device *dev = container_of(kobj, struct device, kobj);
+
+ if (a == &dev_attr_resource.attr) {
+ if (is_namespace_blk(dev))
+ return 0;
+ return a->mode;
+ }
+
+ if (is_namespace_pmem(dev) || is_namespace_blk(dev)) {
+ if (a == &dev_attr_size.attr)
+ return S_IWUSR | S_IRUGO;
+
+ if (is_namespace_pmem(dev) && a == &dev_attr_sector_size.attr)
+ return 0;
+
+ return a->mode;
+ }
+
+ if (a == &dev_attr_nstype.attr || a == &dev_attr_size.attr
+ || a == &dev_attr_holder.attr
+ || a == &dev_attr_force_raw.attr)
+ return a->mode;
+
+ return 0;
+}
+
+static struct attribute_group nd_namespace_attribute_group = {
+ .attrs = nd_namespace_attributes,
+ .is_visible = namespace_visible,
+};
+
+static const struct attribute_group *nd_namespace_attribute_groups[] = {
+ &nd_device_attribute_group,
+ &nd_namespace_attribute_group,
+ &nd_numa_attribute_group,
+ NULL,
+};
+
+struct nd_namespace_common *nvdimm_namespace_common_probe(struct device *dev)
+{
+ struct nd_btt *nd_btt = is_nd_btt(dev) ? to_nd_btt(dev) : NULL;
+ struct nd_namespace_common *ndns;
+ resource_size_t size;
+
+ if (nd_btt) {
+ ndns = nd_btt->ndns;
+ if (!ndns)
+ return ERR_PTR(-ENODEV);
+
+ /*
+ * Flush any in-progress probes / removals in the driver
+ * for the raw personality of this namespace.
+ */
+ device_lock(&ndns->dev);
+ device_unlock(&ndns->dev);
+ if (ndns->dev.driver) {
+ dev_dbg(&ndns->dev, "is active, can't bind %s\n",
+ dev_name(&nd_btt->dev));
+ return ERR_PTR(-EBUSY);
+ }
+ if (dev_WARN_ONCE(&ndns->dev, ndns->claim != &nd_btt->dev,
+ "host (%s) vs claim (%s) mismatch\n",
+ dev_name(&nd_btt->dev),
+ dev_name(ndns->claim)))
+ return ERR_PTR(-ENXIO);
+ } else {
+ ndns = to_ndns(dev);
+ if (ndns->claim) {
+ dev_dbg(dev, "claimed by %s, failing probe\n",
+ dev_name(ndns->claim));
+
+ return ERR_PTR(-ENXIO);
+ }
+ }
+
+ size = nvdimm_namespace_capacity(ndns);
+ if (size < ND_MIN_NAMESPACE_SIZE) {
+ dev_dbg(&ndns->dev, "%pa, too small must be at least %#x\n",
+ &size, ND_MIN_NAMESPACE_SIZE);
+ return ERR_PTR(-ENODEV);
+ }
+
+ if (is_namespace_pmem(&ndns->dev)) {
+ struct nd_namespace_pmem *nspm;
+
+ nspm = to_nd_namespace_pmem(&ndns->dev);
+ if (!nspm->uuid) {
+ dev_dbg(&ndns->dev, "%s: uuid not set\n", __func__);
+ return ERR_PTR(-ENODEV);
+ }
+ } else if (is_namespace_blk(&ndns->dev)) {
+ struct nd_namespace_blk *nsblk;
+
+ nsblk = to_nd_namespace_blk(&ndns->dev);
+ if (!nd_namespace_blk_validate(nsblk))
+ return ERR_PTR(-ENODEV);
+ }
+
+ return ndns;
+}
+EXPORT_SYMBOL(nvdimm_namespace_common_probe);
+
+static struct device **create_namespace_io(struct nd_region *nd_region)
+{
+ struct nd_namespace_io *nsio;
+ struct device *dev, **devs;
+ struct resource *res;
+
+ nsio = kzalloc(sizeof(*nsio), GFP_KERNEL);
+ if (!nsio)
+ return NULL;
+
+ devs = kcalloc(2, sizeof(struct device *), GFP_KERNEL);
+ if (!devs) {
+ kfree(nsio);
+ return NULL;
+ }
+
+ dev = &nsio->common.dev;
+ dev->type = &namespace_io_device_type;
+ dev->parent = &nd_region->dev;
+ res = &nsio->res;
+ res->name = dev_name(&nd_region->dev);
+ res->flags = IORESOURCE_MEM;
+ res->start = nd_region->ndr_start;
+ res->end = res->start + nd_region->ndr_size - 1;
+
+ devs[0] = dev;
+ return devs;
+}
+
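+/*
+ * Check whether a dimm in @nd_region carries a label with @uuid and
+ * interleave-set @cookie claiming position @pos; a dimm holding two
+ * labels for the same uuid disqualifies the set.
+ */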
+static bool has_uuid_at_pos(struct nd_region *nd_region, u8 *uuid,
+ u64 cookie, u16 pos)
+{
+ struct nd_namespace_label *found = NULL;
+ int i;
+
+ for (i = 0; i < nd_region->ndr_mappings; i++) {
+ struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+ struct nd_namespace_label *nd_label;
+ bool found_uuid = false;
+ int l;
+
+ for_each_label(l, nd_label, nd_mapping->labels) {
+ u64 isetcookie = __le64_to_cpu(nd_label->isetcookie);
+ u16 position = __le16_to_cpu(nd_label->position);
+ u16 nlabel = __le16_to_cpu(nd_label->nlabel);
+
+ if (isetcookie != cookie)
+ continue;
+
+ if (memcmp(nd_label->uuid, uuid, NSLABEL_UUID_LEN) != 0)
+ continue;
+
+ if (found_uuid) {
+ dev_dbg(to_ndd(nd_mapping)->dev,
+ "%s duplicate entry for uuid\n",
+ __func__);
+ return false;
+ }
+ found_uuid = true;
+ if (nlabel != nd_region->ndr_mappings)
+ continue;
+ if (position != pos)
+ continue;
+ found = nd_label;
+ break;
+ }
+ if (found)
+ break;
+ }
+ return found != NULL;
+}
+
+static int select_pmem_id(struct nd_region *nd_region, u8 *pmem_id)
+{
+ struct nd_namespace_label *select = NULL;
+ int i;
+
+ if (!pmem_id)
+ return -ENODEV;
+
+ for (i = 0; i < nd_region->ndr_mappings; i++) {
+ struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+ struct nd_namespace_label *nd_label;
+ u64 hw_start, hw_end, pmem_start, pmem_end;
+ int l;
+
+ for_each_label(l, nd_label, nd_mapping->labels)
+ if (memcmp(nd_label->uuid, pmem_id, NSLABEL_UUID_LEN) == 0)
+ break;
+
+ if (!nd_label) {
+ WARN_ON(1);
+ return -EINVAL;
+ }
+
+ select = nd_label;
+ /*
+ * Check that this label is compliant with the dpa
+ * range published in NFIT
+ */
+ hw_start = nd_mapping->start;
+ hw_end = hw_start + nd_mapping->size;
+ pmem_start = __le64_to_cpu(select->dpa);
+ pmem_end = pmem_start + __le64_to_cpu(select->rawsize);
+ if (pmem_start == hw_start && pmem_end <= hw_end)
+ /* pass */;
+ else
+ return -EINVAL;
+
+ nd_mapping->labels[0] = select;
+ nd_mapping->labels[1] = NULL;
+ }
+ return 0;
+}
+
+/**
+ * find_pmem_label_set - validate interleave set labelling, retrieve label0
+ * @nd_region: region with mappings to validate
+ */
+static int find_pmem_label_set(struct nd_region *nd_region,
+ struct nd_namespace_pmem *nspm)
+{
+ u64 cookie = nd_region_interleave_set_cookie(nd_region);
+ struct nd_namespace_label *nd_label;
+ u8 select_id[NSLABEL_UUID_LEN];
+ resource_size_t size = 0;
+ u8 *pmem_id = NULL;
+ int rc = -ENODEV, l;
+ u16 i;
+
+ if (cookie == 0)
+ return -ENXIO;
+
+ /*
+ * Find a complete set of labels by uuid. By definition we can start
+ * with any mapping as the reference label.
+ */
+ for_each_label(l, nd_label, nd_region->mapping[0].labels) {
+ u64 isetcookie = __le64_to_cpu(nd_label->isetcookie);
+
+ if (isetcookie != cookie)
+ continue;
+
+ for (i = 0; i < nd_region->ndr_mappings; i++)
+ if (!has_uuid_at_pos(nd_region, nd_label->uuid,
+ cookie, i))
+ break;
+ if (i < nd_region->ndr_mappings) {
+ /*
+ * Give up if we don't find an instance of a
+ * uuid at each position (from 0 to
+ * nd_region->ndr_mappings - 1), or if we find a
+ * dimm with two instances of the same uuid.
+ */
+ rc = -EINVAL;
+ goto err;
+ } else if (pmem_id) {
+ /*
+ * If there is more than one valid uuid set, we
+ * need userspace to clean this up.
+ */
+ rc = -EBUSY;
+ goto err;
+ }
+ memcpy(select_id, nd_label->uuid, NSLABEL_UUID_LEN);
+ pmem_id = select_id;
+ }
+
+ /*
+ * Fix up each mapping's 'labels' to have the validated pmem label for
+ * that position at labels[0], and NULL at labels[1]. In the process,
+ * check that the namespace aligns with the interleave set. We know
+ * that it does not overlap with any blk namespaces by virtue of
+ * the dimm being enabled (i.e. nd_label_reserve_dpa()
+ * succeeded).
+ */
+ rc = select_pmem_id(nd_region, pmem_id);
+ if (rc)
+ goto err;
+
+ /* Calculate total size and populate namespace properties from label0 */
+ for (i = 0; i < nd_region->ndr_mappings; i++) {
+ struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+ struct nd_namespace_label *label0 = nd_mapping->labels[0];
+
+ size += __le64_to_cpu(label0->rawsize);
+ if (__le16_to_cpu(label0->position) != 0)
+ continue;
+ WARN_ON(nspm->alt_name || nspm->uuid);
+ nspm->alt_name = kmemdup((void __force *) label0->name,
+ NSLABEL_NAME_LEN, GFP_KERNEL);
+ nspm->uuid = kmemdup((void __force *) label0->uuid,
+ NSLABEL_UUID_LEN, GFP_KERNEL);
+ }
+
+ if (!nspm->alt_name || !nspm->uuid) {
+ rc = -ENOMEM;
+ goto err;
+ }
+
+ nd_namespace_pmem_set_size(nd_region, nspm, size);
+
+ return 0;
+ err:
+ switch (rc) {
+ case -EINVAL:
+ dev_dbg(&nd_region->dev, "%s: invalid label(s)\n", __func__);
+ break;
+ case -ENODEV:
+ dev_dbg(&nd_region->dev, "%s: label not found\n", __func__);
+ break;
+ default:
+ dev_dbg(&nd_region->dev, "%s: unexpected err: %d\n",
+ __func__, rc);
+ break;
+ }
+ return rc;
+}
+
+static struct device **create_namespace_pmem(struct nd_region *nd_region)
+{
+ struct nd_namespace_pmem *nspm;
+ struct device *dev, **devs;
+ struct resource *res;
+ int rc;
+
+ nspm = kzalloc(sizeof(*nspm), GFP_KERNEL);
+ if (!nspm)
+ return NULL;
+
+ dev = &nspm->nsio.common.dev;
+ dev->type = &namespace_pmem_device_type;
+ dev->parent = &nd_region->dev;
+ res = &nspm->nsio.res;
+ res->name = dev_name(&nd_region->dev);
+ res->flags = IORESOURCE_MEM;
+ rc = find_pmem_label_set(nd_region, nspm);
+ if (rc == -ENODEV) {
+ int i;
+
+ /* Pass, try to permit namespace creation... */
+ for (i = 0; i < nd_region->ndr_mappings; i++) {
+ struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+
+ kfree(nd_mapping->labels);
+ nd_mapping->labels = NULL;
+ }
+
+ /* Publish a zero-sized namespace for userspace to configure. */
+ nd_namespace_pmem_set_size(nd_region, nspm, 0);
+
+ rc = 0;
+ } else if (rc)
+ goto err;
+
+ devs = kcalloc(2, sizeof(struct device *), GFP_KERNEL);
+ if (!devs)
+ goto err;
+
+ devs[0] = dev;
+ return devs;
+
+ err:
+ namespace_pmem_release(&nspm->nsio.common.dev);
+ return NULL;
+}
+
+struct resource *nsblk_add_resource(struct nd_region *nd_region,
+ struct nvdimm_drvdata *ndd, struct nd_namespace_blk *nsblk,
+ resource_size_t start)
+{
+ struct nd_label_id label_id;
+ struct resource *res;
+
+ nd_label_gen_id(&label_id, nsblk->uuid, NSLABEL_FLAG_LOCAL);
+ res = krealloc(nsblk->res,
+ sizeof(void *) * (nsblk->num_resources + 1),
+ GFP_KERNEL);
+ if (!res)
+ return NULL;
+ nsblk->res = (struct resource **) res;
+ for_each_dpa_resource(ndd, res)
+ if (strcmp(res->name, label_id.id) == 0
+ && res->start == start) {
+ nsblk->res[nsblk->num_resources++] = res;
+ return res;
+ }
+ return NULL;
+}
+
+static struct device *nd_namespace_blk_create(struct nd_region *nd_region)
+{
+ struct nd_namespace_blk *nsblk;
+ struct device *dev;
+
+ if (!is_nd_blk(&nd_region->dev))
+ return NULL;
+
+ nsblk = kzalloc(sizeof(*nsblk), GFP_KERNEL);
+ if (!nsblk)
+ return NULL;
+
+ dev = &nsblk->common.dev;
+ dev->type = &namespace_blk_device_type;
+ nsblk->id = ida_simple_get(&nd_region->ns_ida, 0, 0, GFP_KERNEL);
+ if (nsblk->id < 0) {
+ kfree(nsblk);
+ return NULL;
+ }
+ dev_set_name(dev, "namespace%d.%d", nd_region->id, nsblk->id);
+ dev->parent = &nd_region->dev;
+ dev->groups = nd_namespace_attribute_groups;
+
+ return &nsblk->common.dev;
+}
+
+void nd_region_create_blk_seed(struct nd_region *nd_region)
+{
+ WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev));
+ nd_region->ns_seed = nd_namespace_blk_create(nd_region);
+ /*
+ * Seed creation failures are not fatal; provisioning is simply
+ * disabled until memory becomes available
+ */
+ if (!nd_region->ns_seed)
+ dev_err(&nd_region->dev, "failed to create blk namespace\n");
+ else
+ nd_device_register(nd_region->ns_seed);
+}
+
+void nd_region_create_btt_seed(struct nd_region *nd_region)
+{
+ WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev));
+ nd_region->btt_seed = nd_btt_create(nd_region);
+ /*
+ * Seed creation failures are not fatal; provisioning is simply
+ * disabled until memory becomes available
+ */
+ if (!nd_region->btt_seed)
+ dev_err(&nd_region->dev, "failed to create btt namespace\n");
+}
+
+static struct device **create_namespace_blk(struct nd_region *nd_region)
+{
+ struct nd_mapping *nd_mapping = &nd_region->mapping[0];
+ struct nd_namespace_label *nd_label;
+ struct device *dev, **devs = NULL;
+ struct nd_namespace_blk *nsblk;
+ struct nvdimm_drvdata *ndd;
+ int i, l, count = 0;
+ struct resource *res;
+
+ if (nd_region->ndr_mappings == 0)
+ return NULL;
+
+ ndd = to_ndd(nd_mapping);
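+ /*
+ * Walk the dimm's local (blk) labels; labels that share a uuid are
+ * aggregated into a single namespace with multiple dpa resources.
+ */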
+ for_each_label(l, nd_label, nd_mapping->labels) {
+ u32 flags = __le32_to_cpu(nd_label->flags);
+ char name[NSLABEL_NAME_LEN];
+ struct device **__devs;
+
+ if (flags & NSLABEL_FLAG_LOCAL)
+ /* pass */;
+ else
+ continue;
+
+ for (i = 0; i < count; i++) {
+ nsblk = to_nd_namespace_blk(devs[i]);
+ if (memcmp(nsblk->uuid, nd_label->uuid,
+ NSLABEL_UUID_LEN) == 0) {
+ res = nsblk_add_resource(nd_region, ndd, nsblk,
+ __le64_to_cpu(nd_label->dpa));
+ if (!res)
+ goto err;
+ nd_dbg_dpa(nd_region, ndd, res, "%s assign\n",
+ dev_name(&nsblk->common.dev));
+ break;
+ }
+ }
+ if (i < count)
+ continue;
+ __devs = kcalloc(count + 2, sizeof(dev), GFP_KERNEL);
+ if (!__devs)
+ goto err;
+ memcpy(__devs, devs, sizeof(dev) * count);
+ kfree(devs);
+ devs = __devs;
+
+ nsblk = kzalloc(sizeof(*nsblk), GFP_KERNEL);
+ if (!nsblk)
+ goto err;
+ dev = &nsblk->common.dev;
+ dev->type = &namespace_blk_device_type;
+ dev->parent = &nd_region->dev;
+ dev_set_name(dev, "namespace%d.%d", nd_region->id, count);
+ devs[count++] = dev;
+ nsblk->id = -1;
+ nsblk->lbasize = __le64_to_cpu(nd_label->lbasize);
+ nsblk->uuid = kmemdup(nd_label->uuid, NSLABEL_UUID_LEN,
+ GFP_KERNEL);
+ if (!nsblk->uuid)
+ goto err;
+ memcpy(name, nd_label->name, NSLABEL_NAME_LEN);
+ if (name[0])
+ nsblk->alt_name = kmemdup(name, NSLABEL_NAME_LEN,
+ GFP_KERNEL);
+ res = nsblk_add_resource(nd_region, ndd, nsblk,
+ __le64_to_cpu(nd_label->dpa));
+ if (!res)
+ goto err;
+ nd_dbg_dpa(nd_region, ndd, res, "%s assign\n",
+ dev_name(&nsblk->common.dev));
+ }
+
+ dev_dbg(&nd_region->dev, "%s: discovered %d blk namespace%s\n",
+ __func__, count, count == 1 ? "" : "s");
+
+ if (count == 0) {
+ /* Publish a zero-sized namespace for userspace to configure. */
+ for (i = 0; i < nd_region->ndr_mappings; i++) {
+ struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+
+ kfree(nd_mapping->labels);
+ nd_mapping->labels = NULL;
+ }
+
+ devs = kcalloc(2, sizeof(dev), GFP_KERNEL);
+ if (!devs)
+ goto err;
+ nsblk = kzalloc(sizeof(*nsblk), GFP_KERNEL);
+ if (!nsblk)
+ goto err;
+ dev = &nsblk->common.dev;
+ dev->type = &namespace_blk_device_type;
+ dev->parent = &nd_region->dev;
+ devs[count++] = dev;
+ }
+
+ return devs;
+
+err:
+ for (i = 0; i < count; i++) {
+ nsblk = to_nd_namespace_blk(devs[i]);
+ namespace_blk_release(&nsblk->common.dev);
+ }
+ kfree(devs);
+ return NULL;
+}
+
+static int init_active_labels(struct nd_region *nd_region)
+{
+ int i;
+
+ for (i = 0; i < nd_region->ndr_mappings; i++) {
+ struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+ struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+ struct nvdimm *nvdimm = nd_mapping->nvdimm;
+ int count, j;
+
+ /*
+ * If the dimm is disabled then prevent the region from
+ * being activated if it aliases DPA.
+ */
+ if (!ndd) {
+ if ((nvdimm->flags & NDD_ALIASING) == 0)
+ return 0;
+ dev_dbg(&nd_region->dev, "%s: is disabled, failing probe\n",
+ dev_name(&nd_mapping->nvdimm->dev));
+ return -ENXIO;
+ }
+ nd_mapping->ndd = ndd;
+ atomic_inc(&nvdimm->busy);
+ get_ndd(ndd);
+
+ count = nd_label_active_count(ndd);
+ dev_dbg(ndd->dev, "%s: %d\n", __func__, count);
+ if (!count)
+ continue;
+ nd_mapping->labels = kcalloc(count + 1, sizeof(void *),
+ GFP_KERNEL);
+ if (!nd_mapping->labels)
+ return -ENOMEM;
+ for (j = 0; j < count; j++) {
+ struct nd_namespace_label *label;
+
+ label = nd_label_active(ndd, j);
+ nd_mapping->labels[j] = label;
+ }
+ }
+
+ return 0;
+}
+
+int nd_region_register_namespaces(struct nd_region *nd_region, int *err)
+{
+ struct device **devs = NULL;
+ int i, rc = 0, type;
+
+ *err = 0;
+ nvdimm_bus_lock(&nd_region->dev);
+ rc = init_active_labels(nd_region);
+ if (rc) {
+ nvdimm_bus_unlock(&nd_region->dev);
+ return rc;
+ }
+
+ type = nd_region_to_nstype(nd_region);
+ switch (type) {
+ case ND_DEVICE_NAMESPACE_IO:
+ devs = create_namespace_io(nd_region);
+ break;
+ case ND_DEVICE_NAMESPACE_PMEM:
+ devs = create_namespace_pmem(nd_region);
+ break;
+ case ND_DEVICE_NAMESPACE_BLK:
+ devs = create_namespace_blk(nd_region);
+ break;
+ default:
+ break;
+ }
+ nvdimm_bus_unlock(&nd_region->dev);
+
+ if (!devs)
+ return -ENODEV;
+
+ for (i = 0; devs[i]; i++) {
+ struct device *dev = devs[i];
+ int id;
+
+ if (type == ND_DEVICE_NAMESPACE_BLK) {
+ struct nd_namespace_blk *nsblk;
+
+ nsblk = to_nd_namespace_blk(dev);
+ id = ida_simple_get(&nd_region->ns_ida, 0, 0,
+ GFP_KERNEL);
+ nsblk->id = id;
+ } else
+ id = i;
+
+ if (id < 0)
+ break;
+ dev_set_name(dev, "namespace%d.%d", nd_region->id, id);
+ dev->groups = nd_namespace_attribute_groups;
+ nd_device_register(dev);
+ }
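+ /* the first namespace device doubles as the region's provisioning seed */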
+ if (i)
+ nd_region->ns_seed = devs[0];
+
+ if (devs[i]) {
+ int j;
+
+ for (j = i; devs[j]; j++) {
+ struct device *dev = devs[j];
+
+ device_initialize(dev);
+ put_device(dev);
+ }
+ *err = j - i;
+ /*
+ * All of the namespaces we tried to register failed, so
+ * fail region activation.
+ */
+ if (*err == 0)
+ rc = -ENODEV;
+ }
+ kfree(devs);
+
+ if (rc == -ENODEV)
+ return rc;
+
+ return i;
+}
diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h
new file mode 100644
index 000000000000..e1970c71ad1c
--- /dev/null
+++ b/drivers/nvdimm/nd-core.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#ifndef __ND_CORE_H__
+#define __ND_CORE_H__
+#include <linux/libnvdimm.h>
+#include <linux/device.h>
+#include <linux/sizes.h>
+#include <linux/mutex.h>
+#include <linux/nd.h>
+
+extern struct list_head nvdimm_bus_list;
+extern struct mutex nvdimm_bus_list_mutex;
+extern int nvdimm_major;
+
+struct nvdimm_bus {
+ struct nvdimm_bus_descriptor *nd_desc;
+ wait_queue_head_t probe_wait;
+ struct module *module;
+ struct list_head list;
+ struct device dev;
+ int id, probe_active;
+ struct mutex reconfig_mutex;
+};
+
+struct nvdimm {
+ unsigned long flags;
+ void *provider_data;
+ unsigned long *dsm_mask;
+ struct device dev;
+ atomic_t busy;
+ int id;
+};
+
+bool is_nvdimm(struct device *dev);
+bool is_nd_pmem(struct device *dev);
+bool is_nd_blk(struct device *dev);
+struct nvdimm_bus *walk_to_nvdimm_bus(struct device *nd_dev);
+int __init nvdimm_bus_init(void);
+void nvdimm_bus_exit(void);
+void nd_region_probe_success(struct nvdimm_bus *nvdimm_bus, struct device *dev);
+struct nd_region;
+void nd_region_create_blk_seed(struct nd_region *nd_region);
+void nd_region_create_btt_seed(struct nd_region *nd_region);
+void nd_region_disable(struct nvdimm_bus *nvdimm_bus, struct device *dev);
+int nvdimm_bus_create_ndctl(struct nvdimm_bus *nvdimm_bus);
+void nvdimm_bus_destroy_ndctl(struct nvdimm_bus *nvdimm_bus);
+void nd_synchronize(void);
+int nvdimm_bus_register_dimms(struct nvdimm_bus *nvdimm_bus);
+int nvdimm_bus_register_regions(struct nvdimm_bus *nvdimm_bus);
+int nvdimm_bus_init_interleave_sets(struct nvdimm_bus *nvdimm_bus);
+void __nd_device_register(struct device *dev);
+int nd_match_dimm(struct device *dev, void *data);
+struct nd_label_id;
+char *nd_label_gen_id(struct nd_label_id *label_id, u8 *uuid, u32 flags);
+bool nd_is_uuid_unique(struct device *dev, u8 *uuid);
+struct nd_region;
+struct nvdimm_drvdata;
+struct nd_mapping;
+resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region,
+ struct nd_mapping *nd_mapping, resource_size_t *overlap);
+resource_size_t nd_blk_available_dpa(struct nd_mapping *nd_mapping);
+resource_size_t nd_region_available_dpa(struct nd_region *nd_region);
+resource_size_t nvdimm_allocated_dpa(struct nvdimm_drvdata *ndd,
+ struct nd_label_id *label_id);
+struct nd_mapping;
+struct resource *nsblk_add_resource(struct nd_region *nd_region,
+ struct nvdimm_drvdata *ndd, struct nd_namespace_blk *nsblk,
+ resource_size_t start);
+int nvdimm_num_label_slots(struct nvdimm_drvdata *ndd);
+void get_ndd(struct nvdimm_drvdata *ndd);
+resource_size_t __nvdimm_namespace_capacity(struct nd_namespace_common *ndns);
+#endif /* __ND_CORE_H__ */
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
new file mode 100644
index 000000000000..c41f53e74277
--- /dev/null
+++ b/drivers/nvdimm/nd.h
@@ -0,0 +1,220 @@
+/*
+ * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#ifndef __ND_H__
+#define __ND_H__
+#include <linux/libnvdimm.h>
+#include <linux/blkdev.h>
+#include <linux/device.h>
+#include <linux/mutex.h>
+#include <linux/ndctl.h>
+#include <linux/types.h>
+#include "label.h"
+
+enum {
+ /*
+ * Limits the maximum number of block apertures a dimm can
+ * support and is an input to the geometry/on-disk-format of a
+ * BTT instance
+ */
+ ND_MAX_LANES = 256,
+ SECTOR_SHIFT = 9,
+ INT_LBASIZE_ALIGNMENT = 64,
+};
+
+struct nvdimm_drvdata {
+ struct device *dev;
+ int nsindex_size;
+ struct nd_cmd_get_config_size nsarea;
+ void *data;
+ int ns_current, ns_next;
+ struct resource dpa;
+ struct kref kref;
+};
+
+struct nd_region_namespaces {
+ int count;
+ int active;
+};
+
+static inline struct nd_namespace_index *to_namespace_index(
+ struct nvdimm_drvdata *ndd, int i)
+{
+ if (i < 0)
+ return NULL;
+
+ return ndd->data + sizeof_namespace_index(ndd) * i;
+}
+
+static inline struct nd_namespace_index *to_current_namespace_index(
+ struct nvdimm_drvdata *ndd)
+{
+ return to_namespace_index(ndd, ndd->ns_current);
+}
+
+static inline struct nd_namespace_index *to_next_namespace_index(
+ struct nvdimm_drvdata *ndd)
+{
+ return to_namespace_index(ndd, ndd->ns_next);
+}
+
+#define nd_dbg_dpa(r, d, res, fmt, arg...) \
+ dev_dbg((r) ? &(r)->dev : (d)->dev, "%s: %.13s: %#llx @ %#llx " fmt, \
+ (r) ? dev_name((d)->dev) : "", res ? res->name : "null", \
+ (unsigned long long) (res ? resource_size(res) : 0), \
+ (unsigned long long) (res ? res->start : 0), ##arg)
+
+#define for_each_label(l, label, labels) \
+ for (l = 0; (label = labels ? labels[l] : NULL); l++)
+
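+/* iterate the dpa allocations tracked in a dimm's driver data */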
+#define for_each_dpa_resource(ndd, res) \
+ for (res = (ndd)->dpa.child; res; res = res->sibling)
+
+#define for_each_dpa_resource_safe(ndd, res, next) \
+ for (res = (ndd)->dpa.child, next = res ? res->sibling : NULL; \
+ res; res = next, next = next ? next->sibling : NULL)
+
+struct nd_percpu_lane {
+ int count;
+ spinlock_t lock;
+};
+
+struct nd_region {
+ struct device dev;
+ struct ida ns_ida;
+ struct ida btt_ida;
+ struct device *ns_seed;
+ struct device *btt_seed;
+ u16 ndr_mappings;
+ u64 ndr_size;
+ u64 ndr_start;
+ int id, num_lanes, ro, numa_node;
+ void *provider_data;
+ struct nd_interleave_set *nd_set;
+ struct nd_percpu_lane __percpu *lane;
+ struct nd_mapping mapping[0];
+};
+
+struct nd_blk_region {
+ int (*enable)(struct nvdimm_bus *nvdimm_bus, struct device *dev);
+ void (*disable)(struct nvdimm_bus *nvdimm_bus, struct device *dev);
+ int (*do_io)(struct nd_blk_region *ndbr, resource_size_t dpa,
+ void *iobuf, u64 len, int rw);
+ void *blk_provider_data;
+ struct nd_region nd_region;
+};
+
+/*
+ * Lookup next in the repeating sequence of 01, 10, and 11.
+ */
+static inline unsigned nd_inc_seq(unsigned seq)
+{
+ static const unsigned next[] = { 0, 2, 3, 1 };
+
+ return next[seq & 3];
+}
+
+struct btt;
+struct nd_btt {
+ struct device dev;
+ struct nd_namespace_common *ndns;
+ struct btt *btt;
+ unsigned long lbasize;
+ u8 *uuid;
+ int id;
+};
+
+enum nd_async_mode {
+ ND_SYNC,
+ ND_ASYNC,
+};
+
+int nd_integrity_init(struct gendisk *disk, unsigned long meta_size);
+void wait_nvdimm_bus_probe_idle(struct device *dev);
+void nd_device_register(struct device *dev);
+void nd_device_unregister(struct device *dev, enum nd_async_mode mode);
+int nd_uuid_store(struct device *dev, u8 **uuid_out, const char *buf,
+ size_t len);
+ssize_t nd_sector_size_show(unsigned long current_lbasize,
+ const unsigned long *supported, char *buf);
+ssize_t nd_sector_size_store(struct device *dev, const char *buf,
+ unsigned long *current_lbasize, const unsigned long *supported);
+int __init nvdimm_init(void);
+int __init nd_region_init(void);
+void nvdimm_exit(void);
+void nd_region_exit(void);
+struct nvdimm;
+struct nvdimm_drvdata *to_ndd(struct nd_mapping *nd_mapping);
+int nvdimm_init_nsarea(struct nvdimm_drvdata *ndd);
+int nvdimm_init_config_data(struct nvdimm_drvdata *ndd);
+int nvdimm_set_config_data(struct nvdimm_drvdata *ndd, size_t offset,
+ void *buf, size_t len);
+struct nd_btt *to_nd_btt(struct device *dev);
+struct btt_sb;
+u64 nd_btt_sb_checksum(struct btt_sb *btt_sb);
+#if IS_ENABLED(CONFIG_BTT)
+int nd_btt_probe(struct nd_namespace_common *ndns, void *drvdata);
+bool is_nd_btt(struct device *dev);
+struct device *nd_btt_create(struct nd_region *nd_region);
+#else
+static inline int nd_btt_probe(struct nd_namespace_common *ndns, void *drvdata)
+{
+ return -ENODEV;
+}
+
+static inline bool is_nd_btt(struct device *dev)
+{
+ return false;
+}
+
+static inline struct device *nd_btt_create(struct nd_region *nd_region)
+{
+ return NULL;
+}
+
+#endif
+struct nd_region *to_nd_region(struct device *dev);
+int nd_region_to_nstype(struct nd_region *nd_region);
+int nd_region_register_namespaces(struct nd_region *nd_region, int *err);
+u64 nd_region_interleave_set_cookie(struct nd_region *nd_region);
+void nvdimm_bus_lock(struct device *dev);
+void nvdimm_bus_unlock(struct device *dev);
+bool is_nvdimm_bus_locked(struct device *dev);
+int nvdimm_revalidate_disk(struct gendisk *disk);
+void nvdimm_drvdata_release(struct kref *kref);
+void put_ndd(struct nvdimm_drvdata *ndd);
+int nd_label_reserve_dpa(struct nvdimm_drvdata *ndd);
+void nvdimm_free_dpa(struct nvdimm_drvdata *ndd, struct resource *res);
+struct resource *nvdimm_allocate_dpa(struct nvdimm_drvdata *ndd,
+ struct nd_label_id *label_id, resource_size_t start,
+ resource_size_t n);
+resource_size_t nvdimm_namespace_capacity(struct nd_namespace_common *ndns);
+struct nd_namespace_common *nvdimm_namespace_common_probe(struct device *dev);
+int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns);
+int nvdimm_namespace_detach_btt(struct nd_namespace_common *ndns);
+const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns,
+ char *name);
+int nd_blk_region_init(struct nd_region *nd_region);
+void __nd_iostat_start(struct bio *bio, unsigned long *start);
+static inline bool nd_iostat_start(struct bio *bio, unsigned long *start)
+{
+ struct gendisk *disk = bio->bi_bdev->bd_disk;
+
+ if (!blk_queue_io_stat(disk->queue))
+ return false;
+
+ __nd_iostat_start(bio, start);
+ return true;
+}
+void nd_iostat_end(struct bio *bio, unsigned long start);
+resource_size_t nd_namespace_blk_validate(struct nd_namespace_blk *nsblk);
+#endif /* __ND_H__ */
diff --git a/drivers/block/pmem.c b/drivers/nvdimm/pmem.c
index 095dfaadcaa5..ade9eb917a4d 100644
--- a/drivers/block/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -1,7 +1,7 @@
/*
* Persistent Memory Driver
*
- * Copyright (c) 2014, Intel Corporation.
+ * Copyright (c) 2014-2015, Intel Corporation.
* Copyright (c) 2015, Christoph Hellwig <hch@lst.de>.
* Copyright (c) 2015, Boaz Harrosh <boaz@plexistor.com>.
*
@@ -23,8 +23,9 @@
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/slab.h>
-
-#define PMEM_MINORS 16
+#include <linux/pmem.h>
+#include <linux/nd.h>
+#include "nd.h"
struct pmem_device {
struct request_queue *pmem_queue;
@@ -32,12 +33,11 @@ struct pmem_device {
/* One contiguous memory region per device */
phys_addr_t phys_addr;
- void *virt_addr;
+ void __pmem *virt_addr;
size_t size;
};
static int pmem_major;
-static atomic_t pmem_index;
static void pmem_do_bvec(struct pmem_device *pmem, struct page *page,
unsigned int len, unsigned int off, int rw,
@@ -45,13 +45,14 @@ static void pmem_do_bvec(struct pmem_device *pmem, struct page *page,
{
void *mem = kmap_atomic(page);
size_t pmem_off = sector << 9;
+ void __pmem *pmem_addr = pmem->virt_addr + pmem_off;
if (rw == READ) {
- memcpy(mem + off, pmem->virt_addr + pmem_off, len);
+ memcpy_from_pmem(mem + off, pmem_addr, len);
flush_dcache_page(page);
} else {
flush_dcache_page(page);
- memcpy(pmem->virt_addr + pmem_off, mem + off, len);
+ memcpy_to_pmem(pmem_addr, mem + off, len);
}
kunmap_atomic(mem);
@@ -59,31 +60,24 @@ static void pmem_do_bvec(struct pmem_device *pmem, struct page *page,
static void pmem_make_request(struct request_queue *q, struct bio *bio)
{
- struct block_device *bdev = bio->bi_bdev;
- struct pmem_device *pmem = bdev->bd_disk->private_data;
- int rw;
+ bool do_acct;
+ unsigned long start;
struct bio_vec bvec;
- sector_t sector;
struct bvec_iter iter;
- int err = 0;
-
- if (bio_end_sector(bio) > get_capacity(bdev->bd_disk)) {
- err = -EIO;
- goto out;
- }
-
- BUG_ON(bio->bi_rw & REQ_DISCARD);
+ struct block_device *bdev = bio->bi_bdev;
+ struct pmem_device *pmem = bdev->bd_disk->private_data;
- rw = bio_data_dir(bio);
- sector = bio->bi_iter.bi_sector;
- bio_for_each_segment(bvec, bio, iter) {
+ do_acct = nd_iostat_start(bio, &start);
+ bio_for_each_segment(bvec, bio, iter)
pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len, bvec.bv_offset,
- rw, sector);
- sector += bvec.bv_len >> 9;
- }
+ bio_data_dir(bio), iter.bi_sector);
+ if (do_acct)
+ nd_iostat_end(bio, start);
-out:
- bio_endio(bio, err);
+ if (bio_data_dir(bio))
+ wmb_pmem();
+
+ bio_endio(bio, 0);
}
static int pmem_rw_page(struct block_device *bdev, sector_t sector,
@@ -106,7 +100,8 @@ static long pmem_direct_access(struct block_device *bdev, sector_t sector,
if (!pmem)
return -ENODEV;
- *kaddr = pmem->virt_addr + offset;
+ /* FIXME convert DAX to comprehend that this mapping has a lifetime */
+ *kaddr = (void __force *) pmem->virt_addr + offset;
*pfn = (pmem->phys_addr + offset) >> PAGE_SHIFT;
return pmem->size - offset;
@@ -116,124 +111,165 @@ static const struct block_device_operations pmem_fops = {
.owner = THIS_MODULE,
.rw_page = pmem_rw_page,
.direct_access = pmem_direct_access,
+ .revalidate_disk = nvdimm_revalidate_disk,
};
-static struct pmem_device *pmem_alloc(struct device *dev, struct resource *res)
+static struct pmem_device *pmem_alloc(struct device *dev,
+ struct resource *res, int id)
{
struct pmem_device *pmem;
- struct gendisk *disk;
- int idx, err;
- err = -ENOMEM;
pmem = kzalloc(sizeof(*pmem), GFP_KERNEL);
if (!pmem)
- goto out;
+ return ERR_PTR(-ENOMEM);
pmem->phys_addr = res->start;
pmem->size = resource_size(res);
+ if (!arch_has_pmem_api())
+ dev_warn(dev, "unable to guarantee persistence of writes\n");
+
+ if (!request_mem_region(pmem->phys_addr, pmem->size, dev_name(dev))) {
+ dev_warn(dev, "could not reserve region [0x%pa:0x%zx]\n",
+ &pmem->phys_addr, pmem->size);
+ kfree(pmem);
+ return ERR_PTR(-EBUSY);
+ }
- err = -EINVAL;
- if (!request_mem_region(pmem->phys_addr, pmem->size, "pmem")) {
- dev_warn(dev, "could not reserve region [0x%pa:0x%zx]\n", &pmem->phys_addr, pmem->size);
- goto out_free_dev;
+ pmem->virt_addr = memremap_pmem(pmem->phys_addr, pmem->size);
+ if (!pmem->virt_addr) {
+ release_mem_region(pmem->phys_addr, pmem->size);
+ kfree(pmem);
+ return ERR_PTR(-ENXIO);
}
- /*
- * Map the memory as write-through, as we can't write back the contents
- * of the CPU caches in case of a crash.
- */
- err = -ENOMEM;
- pmem->virt_addr = ioremap_wt(pmem->phys_addr, pmem->size);
- if (!pmem->virt_addr)
- goto out_release_region;
+ return pmem;
+}
+
+static void pmem_detach_disk(struct pmem_device *pmem)
+{
+ del_gendisk(pmem->pmem_disk);
+ put_disk(pmem->pmem_disk);
+ blk_cleanup_queue(pmem->pmem_queue);
+}
+
+static int pmem_attach_disk(struct nd_namespace_common *ndns,
+ struct pmem_device *pmem)
+{
+ struct gendisk *disk;
pmem->pmem_queue = blk_alloc_queue(GFP_KERNEL);
if (!pmem->pmem_queue)
- goto out_unmap;
+ return -ENOMEM;
blk_queue_make_request(pmem->pmem_queue, pmem_make_request);
- blk_queue_max_hw_sectors(pmem->pmem_queue, 1024);
+ blk_queue_max_hw_sectors(pmem->pmem_queue, UINT_MAX);
blk_queue_bounce_limit(pmem->pmem_queue, BLK_BOUNCE_ANY);
+ queue_flag_set_unlocked(QUEUE_FLAG_NONROT, pmem->pmem_queue);
- disk = alloc_disk(PMEM_MINORS);
- if (!disk)
- goto out_free_queue;
-
- idx = atomic_inc_return(&pmem_index) - 1;
+ disk = alloc_disk(0);
+ if (!disk) {
+ blk_cleanup_queue(pmem->pmem_queue);
+ return -ENOMEM;
+ }
disk->major = pmem_major;
- disk->first_minor = PMEM_MINORS * idx;
+ disk->first_minor = 0;
disk->fops = &pmem_fops;
disk->private_data = pmem;
disk->queue = pmem->pmem_queue;
disk->flags = GENHD_FL_EXT_DEVT;
- sprintf(disk->disk_name, "pmem%d", idx);
- disk->driverfs_dev = dev;
+ nvdimm_namespace_disk_name(ndns, disk->disk_name);
+ disk->driverfs_dev = &ndns->dev;
set_capacity(disk, pmem->size >> 9);
pmem->pmem_disk = disk;
add_disk(disk);
+ revalidate_disk(disk);
- return pmem;
+ return 0;
+}
-out_free_queue:
- blk_cleanup_queue(pmem->pmem_queue);
-out_unmap:
- iounmap(pmem->virt_addr);
-out_release_region:
- release_mem_region(pmem->phys_addr, pmem->size);
-out_free_dev:
- kfree(pmem);
-out:
- return ERR_PTR(err);
+static int pmem_rw_bytes(struct nd_namespace_common *ndns,
+ resource_size_t offset, void *buf, size_t size, int rw)
+{
+ struct pmem_device *pmem = dev_get_drvdata(ndns->claim);
+
+ if (unlikely(offset + size > pmem->size)) {
+ dev_WARN_ONCE(&ndns->dev, 1, "request out of range\n");
+ return -EFAULT;
+ }
+
+ if (rw == READ)
+ memcpy_from_pmem(buf, pmem->virt_addr + offset, size);
+ else {
+ memcpy_to_pmem(pmem->virt_addr + offset, buf, size);
+ wmb_pmem();
+ }
+
+ return 0;
}
static void pmem_free(struct pmem_device *pmem)
{
- del_gendisk(pmem->pmem_disk);
- put_disk(pmem->pmem_disk);
- blk_cleanup_queue(pmem->pmem_queue);
- iounmap(pmem->virt_addr);
+ memunmap_pmem(pmem->virt_addr);
release_mem_region(pmem->phys_addr, pmem->size);
kfree(pmem);
}
-static int pmem_probe(struct platform_device *pdev)
+static int nd_pmem_probe(struct device *dev)
{
+ struct nd_region *nd_region = to_nd_region(dev->parent);
+ struct nd_namespace_common *ndns;
+ struct nd_namespace_io *nsio;
struct pmem_device *pmem;
- struct resource *res;
-
- if (WARN_ON(pdev->num_resources > 1))
- return -ENXIO;
+ int rc;
- res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
- if (!res)
- return -ENXIO;
+ ndns = nvdimm_namespace_common_probe(dev);
+ if (IS_ERR(ndns))
+ return PTR_ERR(ndns);
- pmem = pmem_alloc(&pdev->dev, res);
+ nsio = to_nd_namespace_io(&ndns->dev);
+ pmem = pmem_alloc(dev, &nsio->res, nd_region->id);
if (IS_ERR(pmem))
return PTR_ERR(pmem);
- platform_set_drvdata(pdev, pmem);
-
- return 0;
+ dev_set_drvdata(dev, pmem);
+ ndns->rw_bytes = pmem_rw_bytes;
+ if (is_nd_btt(dev))
+ rc = nvdimm_namespace_attach_btt(ndns);
+ else if (nd_btt_probe(ndns, pmem) == 0) {
+ /* we'll come back as btt-pmem */
+ rc = -ENXIO;
+ } else
+ rc = pmem_attach_disk(ndns, pmem);
+ if (rc)
+ pmem_free(pmem);
+ return rc;
}
-static int pmem_remove(struct platform_device *pdev)
+static int nd_pmem_remove(struct device *dev)
{
- struct pmem_device *pmem = platform_get_drvdata(pdev);
+ struct pmem_device *pmem = dev_get_drvdata(dev);
+ if (is_nd_btt(dev))
+ nvdimm_namespace_detach_btt(to_nd_btt(dev)->ndns);
+ else
+ pmem_detach_disk(pmem);
pmem_free(pmem);
+
return 0;
}
-static struct platform_driver pmem_driver = {
- .probe = pmem_probe,
- .remove = pmem_remove,
- .driver = {
- .owner = THIS_MODULE,
- .name = "pmem",
+MODULE_ALIAS("pmem");
+MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_IO);
+MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_PMEM);
+static struct nd_device_driver nd_pmem_driver = {
+ .probe = nd_pmem_probe,
+ .remove = nd_pmem_remove,
+ .drv = {
+ .name = "nd_pmem",
},
+ .type = ND_DRIVER_NAMESPACE_IO | ND_DRIVER_NAMESPACE_PMEM,
};
static int __init pmem_init(void)
@@ -244,16 +280,19 @@ static int __init pmem_init(void)
if (pmem_major < 0)
return pmem_major;
- error = platform_driver_register(&pmem_driver);
- if (error)
+ error = nd_driver_register(&nd_pmem_driver);
+ if (error) {
unregister_blkdev(pmem_major, "pmem");
- return error;
+ return error;
+ }
+
+ return 0;
}
module_init(pmem_init);
static void pmem_exit(void)
{
- platform_driver_unregister(&pmem_driver);
+ driver_unregister(&nd_pmem_driver.drv);
unregister_blkdev(pmem_major, "pmem");
}
module_exit(pmem_exit);
diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c
new file mode 100644
index 000000000000..f28f78ccff19
--- /dev/null
+++ b/drivers/nvdimm/region.c
@@ -0,0 +1,114 @@
+/*
+ * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/cpumask.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/nd.h>
+#include "nd.h"
+
+static int nd_region_probe(struct device *dev)
+{
+ int err, rc;
+ static unsigned long once;
+ struct nd_region_namespaces *num_ns;
+ struct nd_region *nd_region = to_nd_region(dev);
+
+ if (nd_region->num_lanes > num_online_cpus()
+ && nd_region->num_lanes < num_possible_cpus()
+ && !test_and_set_bit(0, &once)) {
+ dev_info(dev, "online cpus (%d) < concurrent i/o lanes (%d) < possible cpus (%d)\n",
+ num_online_cpus(), nd_region->num_lanes,
+ num_possible_cpus());
+ dev_info(dev, "setting nr_cpus=%d may yield better libnvdimm device performance\n",
+ nd_region->num_lanes);
+ }
+
+ rc = nd_blk_region_init(nd_region);
+ if (rc)
+ return rc;
+
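+ /* rc counts successfully registered namespaces, err counts failures */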
+ rc = nd_region_register_namespaces(nd_region, &err);
+ num_ns = devm_kzalloc(dev, sizeof(*num_ns), GFP_KERNEL);
+ if (!num_ns)
+ return -ENOMEM;
+
+ if (rc < 0)
+ return rc;
+
+ num_ns->active = rc;
+ num_ns->count = rc + err;
+ dev_set_drvdata(dev, num_ns);
+
+ if (rc && err && rc == err)
+ return -ENODEV;
+
+ nd_region->btt_seed = nd_btt_create(nd_region);
+ if (err == 0)
+ return 0;
+
+ /*
+ * Given multiple namespaces per region, we do not want to
+ * disable all the successfully registered peer namespaces upon
+ * a single registration failure. If userspace is missing a
+ * namespace that it expects, it can disable/re-enable the region
+ * to retry discovery after correcting the failure.
+ * <regionX>/init_namespaces returns the current
+ * "<async-registered>/<total>" namespace count.
+ */
+ dev_err(dev, "failed to register %d namespace%s, continuing...\n",
+ err, err == 1 ? "" : "s");
+ return 0;
+}
+
+static int child_unregister(struct device *dev, void *data)
+{
+ nd_device_unregister(dev, ND_SYNC);
+ return 0;
+}
+
+static int nd_region_remove(struct device *dev)
+{
+ struct nd_region *nd_region = to_nd_region(dev);
+
+ /* flush attribute readers and disable */
+ nvdimm_bus_lock(dev);
+ nd_region->ns_seed = NULL;
+ nd_region->btt_seed = NULL;
+ dev_set_drvdata(dev, NULL);
+ nvdimm_bus_unlock(dev);
+
+ device_for_each_child(dev, NULL, child_unregister);
+ return 0;
+}
+
+static struct nd_device_driver nd_region_driver = {
+ .probe = nd_region_probe,
+ .remove = nd_region_remove,
+ .drv = {
+ .name = "nd_region",
+ },
+ .type = ND_DRIVER_REGION_BLK | ND_DRIVER_REGION_PMEM,
+};
+
+int __init nd_region_init(void)
+{
+ return nd_driver_register(&nd_region_driver);
+}
+
+void nd_region_exit(void)
+{
+ driver_unregister(&nd_region_driver.drv);
+}
+
+MODULE_ALIAS_ND_DEVICE(ND_DEVICE_REGION_PMEM);
+MODULE_ALIAS_ND_DEVICE(ND_DEVICE_REGION_BLK);
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
new file mode 100644
index 000000000000..a5233422f9dc
--- /dev/null
+++ b/drivers/nvdimm/region_devs.c
@@ -0,0 +1,787 @@
+/*
+ * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/scatterlist.h>
+#include <linux/highmem.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/sort.h>
+#include <linux/io.h>
+#include <linux/nd.h>
+#include "nd-core.h"
+#include "nd.h"
+
+static DEFINE_IDA(region_ida);
+
+static void nd_region_release(struct device *dev)
+{
+ struct nd_region *nd_region = to_nd_region(dev);
+ u16 i;
+
+ for (i = 0; i < nd_region->ndr_mappings; i++) {
+ struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+ struct nvdimm *nvdimm = nd_mapping->nvdimm;
+
+ put_device(&nvdimm->dev);
+ }
+ free_percpu(nd_region->lane);
+ ida_simple_remove(&region_ida, nd_region->id);
+ if (is_nd_blk(dev))
+ kfree(to_nd_blk_region(dev));
+ else
+ kfree(nd_region);
+}
+
+static struct device_type nd_blk_device_type = {
+ .name = "nd_blk",
+ .release = nd_region_release,
+};
+
+static struct device_type nd_pmem_device_type = {
+ .name = "nd_pmem",
+ .release = nd_region_release,
+};
+
+static struct device_type nd_volatile_device_type = {
+ .name = "nd_volatile",
+ .release = nd_region_release,
+};
+
+bool is_nd_pmem(struct device *dev)
+{
+ return dev ? dev->type == &nd_pmem_device_type : false;
+}
+
+bool is_nd_blk(struct device *dev)
+{
+ return dev ? dev->type == &nd_blk_device_type : false;
+}
+
+struct nd_region *to_nd_region(struct device *dev)
+{
+ struct nd_region *nd_region = container_of(dev, struct nd_region, dev);
+
+ WARN_ON(dev->type->release != nd_region_release);
+ return nd_region;
+}
+EXPORT_SYMBOL_GPL(to_nd_region);
+
+struct nd_blk_region *to_nd_blk_region(struct device *dev)
+{
+ struct nd_region *nd_region = to_nd_region(dev);
+
+ WARN_ON(!is_nd_blk(dev));
+ return container_of(nd_region, struct nd_blk_region, nd_region);
+}
+EXPORT_SYMBOL_GPL(to_nd_blk_region);
+
+void *nd_region_provider_data(struct nd_region *nd_region)
+{
+ return nd_region->provider_data;
+}
+EXPORT_SYMBOL_GPL(nd_region_provider_data);
+
+void *nd_blk_region_provider_data(struct nd_blk_region *ndbr)
+{
+ return ndbr->blk_provider_data;
+}
+EXPORT_SYMBOL_GPL(nd_blk_region_provider_data);
+
+void nd_blk_region_set_provider_data(struct nd_blk_region *ndbr, void *data)
+{
+ ndbr->blk_provider_data = data;
+}
+EXPORT_SYMBOL_GPL(nd_blk_region_set_provider_data);
+
+/**
+ * nd_region_to_nstype() - region to an integer namespace type
+ * @nd_region: region-device to interrogate
+ *
+ * This value is exposed as the region's 'nstype' attribute, serves as an
+ * input to the MODALIAS for namespace devices, and is the bit number an
+ * nvdimm_bus uses to match namespace devices with namespace drivers.
+ */
+int nd_region_to_nstype(struct nd_region *nd_region)
+{
+ if (is_nd_pmem(&nd_region->dev)) {
+ u16 i, alias;
+
+ for (i = 0, alias = 0; i < nd_region->ndr_mappings; i++) {
+ struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+ struct nvdimm *nvdimm = nd_mapping->nvdimm;
+
+ if (nvdimm->flags & NDD_ALIASING)
+ alias++;
+ }
+ if (alias)
+ return ND_DEVICE_NAMESPACE_PMEM;
+ else
+ return ND_DEVICE_NAMESPACE_IO;
+ } else if (is_nd_blk(&nd_region->dev)) {
+ return ND_DEVICE_NAMESPACE_BLK;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(nd_region_to_nstype);
+
+static int is_uuid_busy(struct device *dev, void *data)
+{
+ struct nd_region *nd_region = to_nd_region(dev->parent);
+ u8 *uuid = data;
+
+ switch (nd_region_to_nstype(nd_region)) {
+ case ND_DEVICE_NAMESPACE_PMEM: {
+ struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
+
+ if (!nspm->uuid)
+ break;
+ if (memcmp(uuid, nspm->uuid, NSLABEL_UUID_LEN) == 0)
+ return -EBUSY;
+ break;
+ }
+ case ND_DEVICE_NAMESPACE_BLK: {
+ struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
+
+ if (!nsblk->uuid)
+ break;
+ if (memcmp(uuid, nsblk->uuid, NSLABEL_UUID_LEN) == 0)
+ return -EBUSY;
+ break;
+ }
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+static int is_namespace_uuid_busy(struct device *dev, void *data)
+{
+ if (is_nd_pmem(dev) || is_nd_blk(dev))
+ return device_for_each_child(dev, data, is_uuid_busy);
+ return 0;
+}
+
+/**
+ * nd_is_uuid_unique - verify that no other namespace has @uuid
+ * @dev: any device on a nvdimm_bus
+ * @uuid: uuid to check
+ */
+bool nd_is_uuid_unique(struct device *dev, u8 *uuid)
+{
+ struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
+
+ if (!nvdimm_bus)
+ return false;
+ WARN_ON_ONCE(!is_nvdimm_bus_locked(&nvdimm_bus->dev));
+ if (device_for_each_child(&nvdimm_bus->dev, uuid,
+ is_namespace_uuid_busy) != 0)
+ return false;
+ return true;
+}
+
+static ssize_t size_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_region *nd_region = to_nd_region(dev);
+ unsigned long long size = 0;
+
+ if (is_nd_pmem(dev)) {
+ size = nd_region->ndr_size;
+ } else if (nd_region->ndr_mappings == 1) {
+ struct nd_mapping *nd_mapping = &nd_region->mapping[0];
+
+ size = nd_mapping->size;
+ }
+
+ return sprintf(buf, "%llu\n", size);
+}
+static DEVICE_ATTR_RO(size);
+
+static ssize_t mappings_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_region *nd_region = to_nd_region(dev);
+
+ return sprintf(buf, "%d\n", nd_region->ndr_mappings);
+}
+static DEVICE_ATTR_RO(mappings);
+
+static ssize_t nstype_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_region *nd_region = to_nd_region(dev);
+
+ return sprintf(buf, "%d\n", nd_region_to_nstype(nd_region));
+}
+static DEVICE_ATTR_RO(nstype);
+
+static ssize_t set_cookie_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_region *nd_region = to_nd_region(dev);
+ struct nd_interleave_set *nd_set = nd_region->nd_set;
+
+ if (is_nd_pmem(dev) && nd_set)
+ /* pass, should be precluded by region_visible */;
+ else
+ return -ENXIO;
+
+ return sprintf(buf, "%#llx\n", nd_set->cookie);
+}
+static DEVICE_ATTR_RO(set_cookie);
+
+resource_size_t nd_region_available_dpa(struct nd_region *nd_region)
+{
+ resource_size_t blk_max_overlap = 0, available, overlap;
+ int i;
+
+ WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev));
+
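+ /*
+ * The per-mapping calculation depends on the largest BLK overlap
+ * seen so far, so restart the scan whenever a larger overlap is
+ * discovered.
+ */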
+ retry:
+ available = 0;
+ overlap = blk_max_overlap;
+ for (i = 0; i < nd_region->ndr_mappings; i++) {
+ struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+ struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+
+ /* if a dimm is disabled the available capacity is zero */
+ if (!ndd)
+ return 0;
+
+ if (is_nd_pmem(&nd_region->dev)) {
+ available += nd_pmem_available_dpa(nd_region,
+ nd_mapping, &overlap);
+ if (overlap > blk_max_overlap) {
+ blk_max_overlap = overlap;
+ goto retry;
+ }
+ } else if (is_nd_blk(&nd_region->dev)) {
+ available += nd_blk_available_dpa(nd_mapping);
+ }
+ }
+
+ return available;
+}
+
+static ssize_t available_size_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_region *nd_region = to_nd_region(dev);
+ unsigned long long available = 0;
+
+ /*
+ * Flush in-flight updates and grab a snapshot of the available
+ * size. Of course, this value is potentially invalidated the
+ * moment nvdimm_bus_lock() is dropped, but that's userspace's
+ * problem to not race itself.
+ */
+ nvdimm_bus_lock(dev);
+ wait_nvdimm_bus_probe_idle(dev);
+ available = nd_region_available_dpa(nd_region);
+ nvdimm_bus_unlock(dev);
+
+ return sprintf(buf, "%llu\n", available);
+}
+static DEVICE_ATTR_RO(available_size);
+
+static ssize_t init_namespaces_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_region_namespaces *num_ns = dev_get_drvdata(dev);
+ ssize_t rc;
+
+ nvdimm_bus_lock(dev);
+ if (num_ns)
+ rc = sprintf(buf, "%d/%d\n", num_ns->active, num_ns->count);
+ else
+ rc = -ENXIO;
+ nvdimm_bus_unlock(dev);
+
+ return rc;
+}
+static DEVICE_ATTR_RO(init_namespaces);
+
+static ssize_t namespace_seed_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_region *nd_region = to_nd_region(dev);
+ ssize_t rc;
+
+ nvdimm_bus_lock(dev);
+ if (nd_region->ns_seed)
+ rc = sprintf(buf, "%s\n", dev_name(nd_region->ns_seed));
+ else
+ rc = sprintf(buf, "\n");
+ nvdimm_bus_unlock(dev);
+ return rc;
+}
+static DEVICE_ATTR_RO(namespace_seed);
+
+static ssize_t btt_seed_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_region *nd_region = to_nd_region(dev);
+ ssize_t rc;
+
+ nvdimm_bus_lock(dev);
+ if (nd_region->btt_seed)
+ rc = sprintf(buf, "%s\n", dev_name(nd_region->btt_seed));
+ else
+ rc = sprintf(buf, "\n");
+ nvdimm_bus_unlock(dev);
+
+ return rc;
+}
+static DEVICE_ATTR_RO(btt_seed);
+
+static ssize_t read_only_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_region *nd_region = to_nd_region(dev);
+
+ return sprintf(buf, "%d\n", nd_region->ro);
+}
+
+static ssize_t read_only_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+ bool ro;
+ int rc = strtobool(buf, &ro);
+ struct nd_region *nd_region = to_nd_region(dev);
+
+ if (rc)
+ return rc;
+
+ nd_region->ro = ro;
+ return len;
+}
+static DEVICE_ATTR_RW(read_only);
+
+static struct attribute *nd_region_attributes[] = {
+ &dev_attr_size.attr,
+ &dev_attr_nstype.attr,
+ &dev_attr_mappings.attr,
+ &dev_attr_btt_seed.attr,
+ &dev_attr_read_only.attr,
+ &dev_attr_set_cookie.attr,
+ &dev_attr_available_size.attr,
+ &dev_attr_namespace_seed.attr,
+ &dev_attr_init_namespaces.attr,
+ NULL,
+};
+
+static umode_t region_visible(struct kobject *kobj, struct attribute *a, int n)
+{
+ struct device *dev = container_of(kobj, typeof(*dev), kobj);
+ struct nd_region *nd_region = to_nd_region(dev);
+ struct nd_interleave_set *nd_set = nd_region->nd_set;
+ int type = nd_region_to_nstype(nd_region);
+
+ if (a != &dev_attr_set_cookie.attr
+ && a != &dev_attr_available_size.attr)
+ return a->mode;
+
+ if ((type == ND_DEVICE_NAMESPACE_PMEM
+ || type == ND_DEVICE_NAMESPACE_BLK)
+ && a == &dev_attr_available_size.attr)
+ return a->mode;
+ else if (is_nd_pmem(dev) && nd_set)
+ return a->mode;
+
+ return 0;
+}
+
+struct attribute_group nd_region_attribute_group = {
+ .attrs = nd_region_attributes,
+ .is_visible = region_visible,
+};
+EXPORT_SYMBOL_GPL(nd_region_attribute_group);
+
+u64 nd_region_interleave_set_cookie(struct nd_region *nd_region)
+{
+ struct nd_interleave_set *nd_set = nd_region->nd_set;
+
+ if (nd_set)
+ return nd_set->cookie;
+ return 0;
+}
+
+/*
+ * Upon successful probe/remove, take/release a reference on the
+ * associated interleave set (if present), and plant new btt + namespace
+ * seeds. Also, on the removal of a BLK region, notify the provider to
+ * disable the region.
+ */
+static void nd_region_notify_driver_action(struct nvdimm_bus *nvdimm_bus,
+ struct device *dev, bool probe)
+{
+ struct nd_region *nd_region;
+
+ if (!probe && (is_nd_pmem(dev) || is_nd_blk(dev))) {
+ int i;
+
+ nd_region = to_nd_region(dev);
+ for (i = 0; i < nd_region->ndr_mappings; i++) {
+ struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+ struct nvdimm_drvdata *ndd = nd_mapping->ndd;
+ struct nvdimm *nvdimm = nd_mapping->nvdimm;
+
+ kfree(nd_mapping->labels);
+ nd_mapping->labels = NULL;
+ put_ndd(ndd);
+ nd_mapping->ndd = NULL;
+ if (ndd)
+ atomic_dec(&nvdimm->busy);
+ }
+
+ if (is_nd_pmem(dev))
+ return;
+
+ to_nd_blk_region(dev)->disable(nvdimm_bus, dev);
+ }
+ if (dev->parent && is_nd_blk(dev->parent) && probe) {
+ nd_region = to_nd_region(dev->parent);
+ nvdimm_bus_lock(dev);
+ if (nd_region->ns_seed == dev)
+ nd_region_create_blk_seed(nd_region);
+ nvdimm_bus_unlock(dev);
+ }
+ if (is_nd_btt(dev) && probe) {
+ nd_region = to_nd_region(dev->parent);
+ nvdimm_bus_lock(dev);
+ if (nd_region->btt_seed == dev)
+ nd_region_create_btt_seed(nd_region);
+ nvdimm_bus_unlock(dev);
+ }
+}
+
+void nd_region_probe_success(struct nvdimm_bus *nvdimm_bus, struct device *dev)
+{
+ nd_region_notify_driver_action(nvdimm_bus, dev, true);
+}
+
+void nd_region_disable(struct nvdimm_bus *nvdimm_bus, struct device *dev)
+{
+ nd_region_notify_driver_action(nvdimm_bus, dev, false);
+}
+
+static ssize_t mappingN(struct device *dev, char *buf, int n)
+{
+ struct nd_region *nd_region = to_nd_region(dev);
+ struct nd_mapping *nd_mapping;
+ struct nvdimm *nvdimm;
+
+ if (n >= nd_region->ndr_mappings)
+ return -ENXIO;
+ nd_mapping = &nd_region->mapping[n];
+ nvdimm = nd_mapping->nvdimm;
+
+ return sprintf(buf, "%s,%llu,%llu\n", dev_name(&nvdimm->dev),
+ nd_mapping->start, nd_mapping->size);
+}
+
+#define REGION_MAPPING(idx) \
+static ssize_t mapping##idx##_show(struct device *dev, \
+ struct device_attribute *attr, char *buf) \
+{ \
+ return mappingN(dev, buf, idx); \
+} \
+static DEVICE_ATTR_RO(mapping##idx)
+
+/*
+ * 32 should be enough for a while, even in the presence of socket
+ * interleave, a 32-way interleave set is a degenerate case.
+ */
+REGION_MAPPING(0);
+REGION_MAPPING(1);
+REGION_MAPPING(2);
+REGION_MAPPING(3);
+REGION_MAPPING(4);
+REGION_MAPPING(5);
+REGION_MAPPING(6);
+REGION_MAPPING(7);
+REGION_MAPPING(8);
+REGION_MAPPING(9);
+REGION_MAPPING(10);
+REGION_MAPPING(11);
+REGION_MAPPING(12);
+REGION_MAPPING(13);
+REGION_MAPPING(14);
+REGION_MAPPING(15);
+REGION_MAPPING(16);
+REGION_MAPPING(17);
+REGION_MAPPING(18);
+REGION_MAPPING(19);
+REGION_MAPPING(20);
+REGION_MAPPING(21);
+REGION_MAPPING(22);
+REGION_MAPPING(23);
+REGION_MAPPING(24);
+REGION_MAPPING(25);
+REGION_MAPPING(26);
+REGION_MAPPING(27);
+REGION_MAPPING(28);
+REGION_MAPPING(29);
+REGION_MAPPING(30);
+REGION_MAPPING(31);
+
+static umode_t mapping_visible(struct kobject *kobj, struct attribute *a, int n)
+{
+ struct device *dev = container_of(kobj, struct device, kobj);
+ struct nd_region *nd_region = to_nd_region(dev);
+
+ if (n < nd_region->ndr_mappings)
+ return a->mode;
+ return 0;
+}
+
+static struct attribute *mapping_attributes[] = {
+ &dev_attr_mapping0.attr,
+ &dev_attr_mapping1.attr,
+ &dev_attr_mapping2.attr,
+ &dev_attr_mapping3.attr,
+ &dev_attr_mapping4.attr,
+ &dev_attr_mapping5.attr,
+ &dev_attr_mapping6.attr,
+ &dev_attr_mapping7.attr,
+ &dev_attr_mapping8.attr,
+ &dev_attr_mapping9.attr,
+ &dev_attr_mapping10.attr,
+ &dev_attr_mapping11.attr,
+ &dev_attr_mapping12.attr,
+ &dev_attr_mapping13.attr,
+ &dev_attr_mapping14.attr,
+ &dev_attr_mapping15.attr,
+ &dev_attr_mapping16.attr,
+ &dev_attr_mapping17.attr,
+ &dev_attr_mapping18.attr,
+ &dev_attr_mapping19.attr,
+ &dev_attr_mapping20.attr,
+ &dev_attr_mapping21.attr,
+ &dev_attr_mapping22.attr,
+ &dev_attr_mapping23.attr,
+ &dev_attr_mapping24.attr,
+ &dev_attr_mapping25.attr,
+ &dev_attr_mapping26.attr,
+ &dev_attr_mapping27.attr,
+ &dev_attr_mapping28.attr,
+ &dev_attr_mapping29.attr,
+ &dev_attr_mapping30.attr,
+ &dev_attr_mapping31.attr,
+ NULL,
+};
+
+struct attribute_group nd_mapping_attribute_group = {
+ .is_visible = mapping_visible,
+ .attrs = mapping_attributes,
+};
+EXPORT_SYMBOL_GPL(nd_mapping_attribute_group);
+
+int nd_blk_region_init(struct nd_region *nd_region)
+{
+ struct device *dev = &nd_region->dev;
+ struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
+
+ if (!is_nd_blk(dev))
+ return 0;
+
+ if (nd_region->ndr_mappings < 1) {
+ dev_err(dev, "invalid BLK region\n");
+ return -ENXIO;
+ }
+
+ return to_nd_blk_region(dev)->enable(nvdimm_bus, dev);
+}
+
+/**
+ * nd_region_acquire_lane - allocate and lock a lane
+ * @nd_region: region id and number of lanes possible
+ *
+ * A lane correlates to a BLK-data-window and/or a log slot in the BTT.
+ * We optimize for the common case where there are 256 lanes, one
+ * per-cpu. For larger systems we need to lock to share lanes. For now
+ * this implementation assumes the cost of maintaining an allocator for
+ * free lanes is on the order of the lock hold time, so it implements a
+ * static lane = cpu % num_lanes mapping.
+ *
+ * In the case of a BTT instance on top of a BLK namespace a lane may be
+ * acquired recursively. We lock on the first instance.
+ *
+ * In the case of a BTT instance on top of PMEM, we only acquire a lane
+ * for the BTT metadata updates.
+ */
+unsigned int nd_region_acquire_lane(struct nd_region *nd_region)
+{
+ unsigned int cpu, lane;
+
+ cpu = get_cpu();
+ if (nd_region->num_lanes < nr_cpu_ids) {
+ struct nd_percpu_lane *ndl_lock, *ndl_count;
+
+ lane = cpu % nd_region->num_lanes;
+ ndl_count = per_cpu_ptr(nd_region->lane, cpu);
+ ndl_lock = per_cpu_ptr(nd_region->lane, lane);
+ if (ndl_count->count++ == 0)
+ spin_lock(&ndl_lock->lock);
+ } else
+ lane = cpu;
+
+ return lane;
+}
+EXPORT_SYMBOL(nd_region_acquire_lane);
+
+void nd_region_release_lane(struct nd_region *nd_region, unsigned int lane)
+{
+ if (nd_region->num_lanes < nr_cpu_ids) {
+ unsigned int cpu = get_cpu();
+ struct nd_percpu_lane *ndl_lock, *ndl_count;
+
+ ndl_count = per_cpu_ptr(nd_region->lane, cpu);
+ ndl_lock = per_cpu_ptr(nd_region->lane, lane);
+ if (--ndl_count->count == 0)
+ spin_unlock(&ndl_lock->lock);
+ put_cpu();
+ }
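+ /* pairs with the get_cpu() in nd_region_acquire_lane() */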
+ put_cpu();
+}
+EXPORT_SYMBOL(nd_region_release_lane);
+
+static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
+ struct nd_region_desc *ndr_desc, struct device_type *dev_type,
+ const char *caller)
+{
+ struct nd_region *nd_region;
+ struct device *dev;
+ void *region_buf;
+ unsigned int i;
+ int ro = 0;
+
+ for (i = 0; i < ndr_desc->num_mappings; i++) {
+ struct nd_mapping *nd_mapping = &ndr_desc->nd_mapping[i];
+ struct nvdimm *nvdimm = nd_mapping->nvdimm;
+
+ if ((nd_mapping->start | nd_mapping->size) % SZ_4K) {
+ dev_err(&nvdimm_bus->dev, "%s: %s mapping%d is not 4K aligned\n",
+ caller, dev_name(&nvdimm->dev), i);
+
+ return NULL;
+ }
+
+ if (nvdimm->flags & NDD_UNARMED)
+ ro = 1;
+ }
+
+ if (dev_type == &nd_blk_device_type) {
+ struct nd_blk_region_desc *ndbr_desc;
+ struct nd_blk_region *ndbr;
+
+ ndbr_desc = to_blk_region_desc(ndr_desc);
+ ndbr = kzalloc(sizeof(*ndbr) + sizeof(struct nd_mapping)
+ * ndr_desc->num_mappings,
+ GFP_KERNEL);
+ if (ndbr) {
+ nd_region = &ndbr->nd_region;
+ ndbr->enable = ndbr_desc->enable;
+ ndbr->disable = ndbr_desc->disable;
+ ndbr->do_io = ndbr_desc->do_io;
+ }
+ region_buf = ndbr;
+ } else {
+ nd_region = kzalloc(sizeof(struct nd_region)
+ + sizeof(struct nd_mapping)
+ * ndr_desc->num_mappings,
+ GFP_KERNEL);
+ region_buf = nd_region;
+ }
+
+ if (!region_buf)
+ return NULL;
+ nd_region->id = ida_simple_get(&region_ida, 0, 0, GFP_KERNEL);
+ if (nd_region->id < 0)
+ goto err_id;
+
+ nd_region->lane = alloc_percpu(struct nd_percpu_lane);
+ if (!nd_region->lane)
+ goto err_percpu;
+
+ for (i = 0; i < nr_cpu_ids; i++) {
+ struct nd_percpu_lane *ndl;
+
+ ndl = per_cpu_ptr(nd_region->lane, i);
+ spin_lock_init(&ndl->lock);
+ ndl->count = 0;
+ }
+
+ memcpy(nd_region->mapping, ndr_desc->nd_mapping,
+ sizeof(struct nd_mapping) * ndr_desc->num_mappings);
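+ /* pin each backing nvdimm until nd_region_release() */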
+ for (i = 0; i < ndr_desc->num_mappings; i++) {
+ struct nd_mapping *nd_mapping = &ndr_desc->nd_mapping[i];
+ struct nvdimm *nvdimm = nd_mapping->nvdimm;
+
+ get_device(&nvdimm->dev);
+ }
+ nd_region->ndr_mappings = ndr_desc->num_mappings;
+ nd_region->provider_data = ndr_desc->provider_data;
+ nd_region->nd_set = ndr_desc->nd_set;
+ nd_region->num_lanes = ndr_desc->num_lanes;
+ nd_region->ro = ro;
+ nd_region->numa_node = ndr_desc->numa_node;
+ ida_init(&nd_region->ns_ida);
+ ida_init(&nd_region->btt_ida);
+ dev = &nd_region->dev;
+ dev_set_name(dev, "region%d", nd_region->id);
+ dev->parent = &nvdimm_bus->dev;
+ dev->type = dev_type;
+ dev->groups = ndr_desc->attr_groups;
+ nd_region->ndr_size = resource_size(ndr_desc->res);
+ nd_region->ndr_start = ndr_desc->res->start;
+ nd_device_register(dev);
+
+ return nd_region;
+
+ err_percpu:
+ ida_simple_remove(&region_ida, nd_region->id);
+ err_id:
+ kfree(region_buf);
+ return NULL;
+}
+
+struct nd_region *nvdimm_pmem_region_create(struct nvdimm_bus *nvdimm_bus,
+ struct nd_region_desc *ndr_desc)
+{
+ ndr_desc->num_lanes = ND_MAX_LANES;
+ return nd_region_create(nvdimm_bus, ndr_desc, &nd_pmem_device_type,
+ __func__);
+}
+EXPORT_SYMBOL_GPL(nvdimm_pmem_region_create);
+
+struct nd_region *nvdimm_blk_region_create(struct nvdimm_bus *nvdimm_bus,
+ struct nd_region_desc *ndr_desc)
+{
+ if (ndr_desc->num_mappings > 1)
+ return NULL;
+ ndr_desc->num_lanes = min(ndr_desc->num_lanes, ND_MAX_LANES);
+ return nd_region_create(nvdimm_bus, ndr_desc, &nd_blk_device_type,
+ __func__);
+}
+EXPORT_SYMBOL_GPL(nvdimm_blk_region_create);
+
+struct nd_region *nvdimm_volatile_region_create(struct nvdimm_bus *nvdimm_bus,
+ struct nd_region_desc *ndr_desc)
+{
+ ndr_desc->num_lanes = ND_MAX_LANES;
+ return nd_region_create(nvdimm_bus, ndr_desc, &nd_volatile_device_type,
+ __func__);
+}
+EXPORT_SYMBOL_GPL(nvdimm_volatile_region_create);
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index e9851add6f4e..964ad572aaee 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -1056,19 +1056,21 @@ struct vfio_devices {
static int vfio_pci_get_devs(struct pci_dev *pdev, void *data)
{
struct vfio_devices *devs = data;
- struct pci_driver *pci_drv = ACCESS_ONCE(pdev->driver);
-
- if (pci_drv != &vfio_pci_driver)
- return -EBUSY;
+ struct vfio_device *device;
if (devs->cur_index == devs->max_index)
return -ENOSPC;
- devs->devices[devs->cur_index] = vfio_device_get_from_dev(&pdev->dev);
- if (!devs->devices[devs->cur_index])
+ device = vfio_device_get_from_dev(&pdev->dev);
+ if (!device)
return -EINVAL;
- devs->cur_index++;
+ if (pci_dev_driver(pdev) != &vfio_pci_driver) {
+ vfio_device_put(device);
+ return -EBUSY;
+ }
+
+ devs->devices[devs->cur_index++] = device;
return 0;
}
diff --git a/drivers/vfio/platform/Kconfig b/drivers/vfio/platform/Kconfig
index 9a4403e2a36c..bb30128782aa 100644
--- a/drivers/vfio/platform/Kconfig
+++ b/drivers/vfio/platform/Kconfig
@@ -1,6 +1,6 @@
config VFIO_PLATFORM
tristate "VFIO support for platform devices"
- depends on VFIO && EVENTFD && ARM
+ depends on VFIO && EVENTFD && (ARM || ARM64)
select VFIO_VIRQFD
help
Support for platform devices with VFIO. This is required to make
@@ -18,3 +18,5 @@ config VFIO_AMBA
framework.
If you don't know what to do here, say N.
+
+source "drivers/vfio/platform/reset/Kconfig"
diff --git a/drivers/vfio/platform/Makefile b/drivers/vfio/platform/Makefile
index 81de144c0eaa..9ce8afe28450 100644
--- a/drivers/vfio/platform/Makefile
+++ b/drivers/vfio/platform/Makefile
@@ -2,7 +2,9 @@
vfio-platform-y := vfio_platform.o vfio_platform_common.o vfio_platform_irq.o
obj-$(CONFIG_VFIO_PLATFORM) += vfio-platform.o
+obj-$(CONFIG_VFIO_PLATFORM) += reset/
vfio-amba-y := vfio_amba.o
obj-$(CONFIG_VFIO_AMBA) += vfio-amba.o
+obj-$(CONFIG_VFIO_AMBA) += reset/
diff --git a/drivers/vfio/platform/reset/Kconfig b/drivers/vfio/platform/reset/Kconfig
new file mode 100644
index 000000000000..746b96b0003b
--- /dev/null
+++ b/drivers/vfio/platform/reset/Kconfig
@@ -0,0 +1,7 @@
+config VFIO_PLATFORM_CALXEDAXGMAC_RESET
+ tristate "VFIO support for calxeda xgmac reset"
+ depends on VFIO_PLATFORM
+ help
+ Enables the VFIO platform driver to handle reset for Calxeda xgmac.
+
+ If you don't know what to do here, say N.
diff --git a/drivers/vfio/platform/reset/Makefile b/drivers/vfio/platform/reset/Makefile
new file mode 100644
index 000000000000..2a486af9f8fa
--- /dev/null
+++ b/drivers/vfio/platform/reset/Makefile
@@ -0,0 +1,5 @@
+vfio-platform-calxedaxgmac-y := vfio_platform_calxedaxgmac.o
+
+ccflags-y += -Idrivers/vfio/platform
+
+obj-$(CONFIG_VFIO_PLATFORM_CALXEDAXGMAC_RESET) += vfio-platform-calxedaxgmac.o
diff --git a/drivers/vfio/platform/reset/vfio_platform_calxedaxgmac.c b/drivers/vfio/platform/reset/vfio_platform_calxedaxgmac.c
new file mode 100644
index 000000000000..619dc7d22082
--- /dev/null
+++ b/drivers/vfio/platform/reset/vfio_platform_calxedaxgmac.c
@@ -0,0 +1,86 @@
+/*
+ * VFIO platform driver specialized for Calxeda xgmac reset
+ * reset code is inherited from calxeda xgmac native driver
+ *
+ * Copyright 2010-2011 Calxeda, Inc.
+ * Copyright (c) 2015 Linaro Ltd.
+ * www.linaro.org
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/io.h>
+
+#include "vfio_platform_private.h"
+
+#define DRIVER_VERSION "0.1"
+#define DRIVER_AUTHOR "Eric Auger <eric.auger@linaro.org>"
+#define DRIVER_DESC "Reset support for Calxeda xgmac vfio platform device"
+
+#define CALXEDAXGMAC_COMPAT "calxeda,hb-xgmac"
+
+/* XGMAC Register definitions */
+#define XGMAC_CONTROL 0x00000000 /* MAC Configuration */
+
+/* DMA Control and Status Registers */
+#define XGMAC_DMA_CONTROL 0x00000f18 /* Ctrl (Operational Mode) */
+#define XGMAC_DMA_INTR_ENA 0x00000f1c /* Interrupt Enable */
+
+/* DMA Control register defines */
+#define DMA_CONTROL_ST 0x00002000 /* Start/Stop Transmission */
+#define DMA_CONTROL_SR 0x00000002 /* Start/Stop Receive */
+
+/* Common MAC defines */
+#define MAC_ENABLE_TX 0x00000008 /* Transmitter Enable */
+#define MAC_ENABLE_RX 0x00000004 /* Receiver Enable */
+
+static inline void xgmac_mac_disable(void __iomem *ioaddr)
+{
+ u32 value = readl(ioaddr + XGMAC_DMA_CONTROL);
+
+ value &= ~(DMA_CONTROL_ST | DMA_CONTROL_SR);
+ writel(value, ioaddr + XGMAC_DMA_CONTROL);
+
+ value = readl(ioaddr + XGMAC_CONTROL);
+ value &= ~(MAC_ENABLE_TX | MAC_ENABLE_RX);
+ writel(value, ioaddr + XGMAC_CONTROL);
+}
+
+int vfio_platform_calxedaxgmac_reset(struct vfio_platform_device *vdev)
+{
+ struct vfio_platform_region reg = vdev->regions[0];
+
+ if (!reg.ioaddr) {
+ reg.ioaddr =
+ ioremap_nocache(reg.addr, reg.size);
+ if (!reg.ioaddr)
+ return -ENOMEM;
+ }
+
+ /* disable IRQ */
+ writel(0, reg.ioaddr + XGMAC_DMA_INTR_ENA);
+
+ /* Disable the MAC core */
+ xgmac_mac_disable(reg.ioaddr);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(vfio_platform_calxedaxgmac_reset);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
diff --git a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c
index abcff7a1aa66..e43efb5e92bf 100644
--- a/drivers/vfio/platform/vfio_platform_common.c
+++ b/drivers/vfio/platform/vfio_platform_common.c
@@ -25,6 +25,44 @@
static DEFINE_MUTEX(driver_lock);
+static const struct vfio_platform_reset_combo reset_lookup_table[] = {
+ {
+ .compat = "calxeda,hb-xgmac",
+ .reset_function_name = "vfio_platform_calxedaxgmac_reset",
+ .module_name = "vfio-platform-calxedaxgmac",
+ },
+};
+
+static void vfio_platform_get_reset(struct vfio_platform_device *vdev,
+ struct device *dev)
+{
+ const char *compat;
+ int (*reset)(struct vfio_platform_device *);
+ int ret, i;
+
+ ret = device_property_read_string(dev, "compatible", &compat);
+ if (ret)
+ return;
+
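+ /*
+ * Match the device's compatible string against the lookup table;
+ * __symbol_get() also pins the module that provides the reset handler.
+ */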
+ for (i = 0 ; i < ARRAY_SIZE(reset_lookup_table); i++) {
+ if (!strcmp(reset_lookup_table[i].compat, compat)) {
+ request_module(reset_lookup_table[i].module_name);
+ reset = __symbol_get(
+ reset_lookup_table[i].reset_function_name);
+ if (reset) {
+ vdev->reset = reset;
+ return;
+ }
+ }
+ }
+}
+
+static void vfio_platform_put_reset(struct vfio_platform_device *vdev)
+{
+ if (vdev->reset)
+ symbol_put_addr(vdev->reset);
+}
+
static int vfio_platform_regions_init(struct vfio_platform_device *vdev)
{
int cnt = 0, i;
@@ -100,6 +138,8 @@ static void vfio_platform_release(void *device_data)
mutex_lock(&driver_lock);
if (!(--vdev->refcnt)) {
+ if (vdev->reset)
+ vdev->reset(vdev);
vfio_platform_regions_cleanup(vdev);
vfio_platform_irq_cleanup(vdev);
}
@@ -127,6 +167,9 @@ static int vfio_platform_open(void *device_data)
ret = vfio_platform_irq_init(vdev);
if (ret)
goto err_irq;
+
+ if (vdev->reset)
+ vdev->reset(vdev);
}
vdev->refcnt++;
@@ -159,6 +202,8 @@ static long vfio_platform_ioctl(void *device_data,
if (info.argsz < minsz)
return -EINVAL;
+ if (vdev->reset)
+ vdev->flags |= VFIO_DEVICE_FLAGS_RESET;
info.flags = vdev->flags;
info.num_regions = vdev->num_regions;
info.num_irqs = vdev->num_irqs;
@@ -252,8 +297,12 @@ static long vfio_platform_ioctl(void *device_data,
return ret;
- } else if (cmd == VFIO_DEVICE_RESET)
- return -EINVAL;
+ } else if (cmd == VFIO_DEVICE_RESET) {
+ if (vdev->reset)
+ return vdev->reset(vdev);
+ else
+ return -EINVAL;
+ }
return -ENOTTY;
}
@@ -502,6 +551,8 @@ int vfio_platform_probe_common(struct vfio_platform_device *vdev,
return ret;
}
+ vfio_platform_get_reset(vdev, dev);
+
mutex_init(&vdev->igate);
return 0;
@@ -513,8 +564,11 @@ struct vfio_platform_device *vfio_platform_remove_common(struct device *dev)
struct vfio_platform_device *vdev;
vdev = vfio_del_group_dev(dev);
- if (vdev)
+
+ if (vdev) {
+ vfio_platform_put_reset(vdev);
iommu_group_put(dev->iommu_group);
+ }
return vdev;
}
diff --git a/drivers/vfio/platform/vfio_platform_private.h b/drivers/vfio/platform/vfio_platform_private.h
index 5d31e0473406..1c9b3d59543c 100644
--- a/drivers/vfio/platform/vfio_platform_private.h
+++ b/drivers/vfio/platform/vfio_platform_private.h
@@ -67,6 +67,13 @@ struct vfio_platform_device {
struct resource*
(*get_resource)(struct vfio_platform_device *vdev, int i);
int (*get_irq)(struct vfio_platform_device *vdev, int i);
+ int (*reset)(struct vfio_platform_device *vdev);
+};
+
+struct vfio_platform_reset_combo {
+ const char *compat;
+ const char *reset_function_name;
+ const char *module_name;
};
extern int vfio_platform_probe_common(struct vfio_platform_device *vdev,
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index e1278fe04b1e..2fb29dfeffbd 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -661,18 +661,29 @@ int vfio_add_group_dev(struct device *dev,
EXPORT_SYMBOL_GPL(vfio_add_group_dev);
/**
- * Get a reference to the vfio_device for a device that is known to
- * be bound to a vfio driver. The driver implicitly holds a
- * vfio_device reference between vfio_add_group_dev and
- * vfio_del_group_dev. We can therefore use drvdata to increment
- * that reference from the struct device. This additional
- * reference must be released by calling vfio_device_put.
+ * Get a reference to the vfio_device for a device. Even if the
+ * caller thinks they own the device, they could be racing with a
+ * release call path, so we can't trust drvdata for the shortcut.
+ * Go the long way around, from the iommu_group to the vfio_group
+ * to the vfio_device.
*/
struct vfio_device *vfio_device_get_from_dev(struct device *dev)
{
- struct vfio_device *device = dev_get_drvdata(dev);
+ struct iommu_group *iommu_group;
+ struct vfio_group *group;
+ struct vfio_device *device;
+
+ iommu_group = iommu_group_get(dev);
+ if (!iommu_group)
+ return NULL;
- vfio_device_get(device);
+ group = vfio_group_get_from_iommu(iommu_group);
+ iommu_group_put(iommu_group);
+ if (!group)
+ return NULL;
+
+ device = vfio_group_get_device(group, dev);
+ vfio_group_put(group);
return device;
}
diff --git a/fs/Makefile b/fs/Makefile
index cb92fd4c3172..cb20e4bf2303 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -115,7 +115,6 @@ obj-$(CONFIG_AFS_FS) += afs/
obj-$(CONFIG_NILFS2_FS) += nilfs2/
obj-$(CONFIG_BEFS_FS) += befs/
obj-$(CONFIG_HOSTFS) += hostfs/
-obj-$(CONFIG_HPPFS) += hppfs/
obj-$(CONFIG_CACHEFILES) += cachefiles/
obj-$(CONFIG_DEBUG_FS) += debugfs/
obj-$(CONFIG_TRACING) += tracefs/
diff --git a/fs/block_dev.c b/fs/block_dev.c
index f04c873a7365..4fe10f93db8a 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -43,7 +43,7 @@ static inline struct bdev_inode *BDEV_I(struct inode *inode)
return container_of(inode, struct bdev_inode, vfs_inode);
}
-inline struct block_device *I_BDEV(struct inode *inode)
+struct block_device *I_BDEV(struct inode *inode)
{
return &BDEV_I(inode)->bdev;
}
@@ -377,7 +377,7 @@ int bdev_read_page(struct block_device *bdev, sector_t sector,
struct page *page)
{
const struct block_device_operations *ops = bdev->bd_disk->fops;
- if (!ops->rw_page)
+ if (!ops->rw_page || bdev_get_integrity(bdev))
return -EOPNOTSUPP;
return ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ);
}
@@ -408,7 +408,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
int result;
int rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE;
const struct block_device_operations *ops = bdev->bd_disk->fops;
- if (!ops->rw_page)
+ if (!ops->rw_page || bdev_get_integrity(bdev))
return -EOPNOTSUPP;
set_page_writeback(page);
result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, rw);
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index df9932b00d08..1ce06c849a86 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -85,6 +85,7 @@ BTRFS_WORK_HELPER(extent_refs_helper);
BTRFS_WORK_HELPER(scrub_helper);
BTRFS_WORK_HELPER(scrubwrc_helper);
BTRFS_WORK_HELPER(scrubnc_helper);
+BTRFS_WORK_HELPER(scrubparity_helper);
static struct __btrfs_workqueue *
__btrfs_alloc_workqueue(const char *name, unsigned int flags, int max_active,
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index ec2ee477f8ba..b0b093b6afec 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -64,6 +64,8 @@ BTRFS_WORK_HELPER_PROTO(extent_refs_helper);
BTRFS_WORK_HELPER_PROTO(scrub_helper);
BTRFS_WORK_HELPER_PROTO(scrubwrc_helper);
BTRFS_WORK_HELPER_PROTO(scrubnc_helper);
+BTRFS_WORK_HELPER_PROTO(scrubparity_helper);
+
struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
unsigned int flags,
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 614aaa1969bd..802fabb30e15 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -250,8 +250,12 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
* the first item to check. But sometimes, we may enter it with
* slot==nritems. In that case, go to the next leaf before we continue.
*/
- if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
- ret = btrfs_next_old_leaf(root, path, time_seq);
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ if (time_seq == (u64)-1)
+ ret = btrfs_next_leaf(root, path);
+ else
+ ret = btrfs_next_old_leaf(root, path, time_seq);
+ }
while (!ret && count < total_refs) {
eb = path->nodes[0];
@@ -291,7 +295,10 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
eie = NULL;
}
next:
- ret = btrfs_next_old_item(root, path, time_seq);
+ if (time_seq == (u64)-1)
+ ret = btrfs_next_item(root, path);
+ else
+ ret = btrfs_next_old_item(root, path, time_seq);
}
if (ret > 0)
@@ -334,6 +341,8 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
if (path->search_commit_root)
root_level = btrfs_header_level(root->commit_root);
+ else if (time_seq == (u64)-1)
+ root_level = btrfs_header_level(root->node);
else
root_level = btrfs_old_root_level(root, time_seq);
@@ -343,7 +352,12 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
}
path->lowest_level = level;
- ret = btrfs_search_old_slot(root, &ref->key_for_search, path, time_seq);
+ if (time_seq == (u64)-1)
+ ret = btrfs_search_slot(NULL, root, &ref->key_for_search, path,
+ 0, 0);
+ else
+ ret = btrfs_search_old_slot(root, &ref->key_for_search, path,
+ time_seq);
/* root node has been locked, we can release @subvol_srcu safely here */
srcu_read_unlock(&fs_info->subvol_srcu, index);
@@ -491,7 +505,9 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info,
BUG_ON(!ref->wanted_disk_byte);
eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte,
0);
- if (!eb || !extent_buffer_uptodate(eb)) {
+ if (IS_ERR(eb)) {
+ return PTR_ERR(eb);
+ } else if (!extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
return -EIO;
}
@@ -507,7 +523,7 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info,
}
/*
- * merge two lists of backrefs and adjust counts accordingly
+ * merge backrefs and adjust counts accordingly
*
* mode = 1: merge identical keys, if key is set
* FIXME: if we add more keys in __add_prelim_ref, we can merge more here.
@@ -535,9 +551,9 @@ static void __merge_refs(struct list_head *head, int mode)
ref2 = list_entry(pos2, struct __prelim_ref, list);
+ if (!ref_for_same_block(ref1, ref2))
+ continue;
if (mode == 1) {
- if (!ref_for_same_block(ref1, ref2))
- continue;
if (!ref1->parent && ref2->parent) {
xchg = ref1;
ref1 = ref2;
@@ -572,8 +588,8 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
struct list_head *prefs, u64 *total_refs,
u64 inum)
{
+ struct btrfs_delayed_ref_node *node;
struct btrfs_delayed_extent_op *extent_op = head->extent_op;
- struct rb_node *n = &head->node.rb_node;
struct btrfs_key key;
struct btrfs_key op_key = {0};
int sgn;
@@ -583,12 +599,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
btrfs_disk_key_to_cpu(&op_key, &extent_op->key);
spin_lock(&head->lock);
- n = rb_first(&head->ref_root);
- while (n) {
- struct btrfs_delayed_ref_node *node;
- node = rb_entry(n, struct btrfs_delayed_ref_node,
- rb_node);
- n = rb_next(n);
+ list_for_each_entry(node, &head->ref_list, list) {
if (node->seq > seq)
continue;
@@ -882,6 +893,11 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
*
* NOTE: This can return values > 0
*
+ * If time_seq is set to (u64)-1, it will not search delayed_refs, and will
+ * behave much like the trans == NULL case; the only difference is that it
+ * will not search the commit root.
+ * This special case is used by qgroup to search roots in commit_transaction().
+ *
* FIXME some caching might speed things up
*/
static int find_parent_nodes(struct btrfs_trans_handle *trans,
@@ -920,6 +936,9 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
path->skip_locking = 1;
}
+ if (time_seq == (u64)-1)
+ path->skip_locking = 1;
+
/*
* grab both a lock on the path and a lock on the delayed ref head.
* We need both to get a consistent picture of how the refs look
@@ -934,9 +953,10 @@ again:
BUG_ON(ret == 0);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
- if (trans && likely(trans->type != __TRANS_DUMMY)) {
+ if (trans && likely(trans->type != __TRANS_DUMMY) &&
+ time_seq != (u64)-1) {
#else
- if (trans) {
+ if (trans && time_seq != (u64)-1) {
#endif
/*
* look if there are updates for this ref queued and lock the
@@ -1034,7 +1054,10 @@ again:
eb = read_tree_block(fs_info->extent_root,
ref->parent, 0);
- if (!eb || !extent_buffer_uptodate(eb)) {
+ if (IS_ERR(eb)) {
+ ret = PTR_ERR(eb);
+ goto out;
+ } else if (!extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
ret = -EIO;
goto out;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 0f11ebc92f02..54114b4887dd 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1439,8 +1439,9 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
btrfs_tree_read_unlock(eb_root);
free_extent_buffer(eb_root);
old = read_tree_block(root, logical, 0);
- if (WARN_ON(!old || !extent_buffer_uptodate(old))) {
- free_extent_buffer(old);
+ if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
+ if (!IS_ERR(old))
+ free_extent_buffer(old);
btrfs_warn(root->fs_info,
"failed to read tree block %llu from get_old_root", logical);
} else {
@@ -1685,7 +1686,9 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
if (!cur || !uptodate) {
if (!cur) {
cur = read_tree_block(root, blocknr, gen);
- if (!cur || !extent_buffer_uptodate(cur)) {
+ if (IS_ERR(cur)) {
+ return PTR_ERR(cur);
+ } else if (!extent_buffer_uptodate(cur)) {
free_extent_buffer(cur);
return -EIO;
}
@@ -1864,8 +1867,9 @@ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
eb = read_tree_block(root, btrfs_node_blockptr(parent, slot),
btrfs_node_ptr_generation(parent, slot));
- if (eb && !extent_buffer_uptodate(eb)) {
- free_extent_buffer(eb);
+ if (IS_ERR(eb) || !extent_buffer_uptodate(eb)) {
+ if (!IS_ERR(eb))
+ free_extent_buffer(eb);
eb = NULL;
}
@@ -2494,7 +2498,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
ret = -EAGAIN;
tmp = read_tree_block(root, blocknr, 0);
- if (tmp) {
+ if (!IS_ERR(tmp)) {
/*
* If the read above didn't mark this buffer up to date,
* it will never end up being up to date. Set ret to EIO now
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 6f364e1d8d3d..80a9aefb0c46 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -174,7 +174,7 @@ struct btrfs_ordered_sum;
/* csum types */
#define BTRFS_CSUM_TYPE_CRC32 0
-static int btrfs_csum_sizes[] = { 4, 0 };
+static int btrfs_csum_sizes[] = { 4 };
/* four bytes for CRC32 */
#define BTRFS_EMPTY_DIR_SIZE 0
@@ -1619,10 +1619,7 @@ struct btrfs_fs_info {
struct task_struct *cleaner_kthread;
int thread_pool_size;
- struct kobject super_kobj;
struct kobject *space_info_kobj;
- struct kobject *device_dir_kobj;
- struct completion kobj_unregister;
int do_barriers;
int closing;
int log_root_recovering;
@@ -1698,6 +1695,7 @@ struct btrfs_fs_info {
struct btrfs_workqueue *scrub_workers;
struct btrfs_workqueue *scrub_wr_completion_workers;
struct btrfs_workqueue *scrub_nocow_workers;
+ struct btrfs_workqueue *scrub_parity_workers;
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
u32 check_integrity_print_mask;
@@ -1735,7 +1733,7 @@ struct btrfs_fs_info {
/* list of dirty qgroups to be written at next commit */
struct list_head dirty_qgroups;
- /* used by btrfs_qgroup_record_ref for an efficient tree traversal */
+ /* used by qgroup for an efficient tree traversal */
u64 qgroup_seq;
/* qgroup rescan items */
@@ -3458,6 +3456,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes)
void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
+void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
struct inode *inode);
void btrfs_orphan_release_metadata(struct inode *inode);
@@ -3515,6 +3514,9 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
int __get_raid_index(u64 flags);
int btrfs_start_write_no_snapshoting(struct btrfs_root *root);
void btrfs_end_write_no_snapshoting(struct btrfs_root *root);
+void check_system_chunk(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const u64 type);
/* ctree.c */
int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
int level, int *slot);
@@ -4050,6 +4052,7 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
#ifdef CONFIG_BTRFS_ASSERT
+__cold
static inline void assfail(char *expr, char *file, int line)
{
pr_err("BTRFS: assertion failed: %s, file: %s, line: %d",
@@ -4065,10 +4068,12 @@ static inline void assfail(char *expr, char *file, int line)
#define btrfs_assert()
__printf(5, 6)
+__cold
void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int errno, const char *fmt, ...);
+__cold
void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root, const char *function,
unsigned int line, int errno);
@@ -4111,11 +4116,17 @@ static inline int __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag)
* Call btrfs_abort_transaction as early as possible when an error condition is
* detected, that way the exact line number is reported.
*/
-
#define btrfs_abort_transaction(trans, root, errno) \
do { \
- __btrfs_abort_transaction(trans, root, __func__, \
- __LINE__, errno); \
+ /* Report first abort since mount */ \
+ if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \
+ &((root)->fs_info->fs_state))) { \
+ WARN(1, KERN_DEBUG \
+ "BTRFS: Transaction aborted (error %d)\n", \
+ (errno)); \
+ } \
+ __btrfs_abort_transaction((trans), (root), __func__, \
+ __LINE__, (errno)); \
} while (0)
#define btrfs_std_error(fs_info, errno) \
@@ -4132,6 +4143,7 @@ do { \
} while (0)
__printf(5, 6)
+__cold
void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int errno, const char *fmt, ...);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 8f8ed7d20bac..ac3e81da6d4e 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -22,6 +22,7 @@
#include "ctree.h"
#include "delayed-ref.h"
#include "transaction.h"
+#include "qgroup.h"
struct kmem_cache *btrfs_delayed_ref_head_cachep;
struct kmem_cache *btrfs_delayed_tree_ref_cachep;
@@ -84,87 +85,6 @@ static int comp_data_refs(struct btrfs_delayed_data_ref *ref2,
return 0;
}
-/*
- * entries in the rb tree are ordered by the byte number of the extent,
- * type of the delayed backrefs and content of delayed backrefs.
- */
-static int comp_entry(struct btrfs_delayed_ref_node *ref2,
- struct btrfs_delayed_ref_node *ref1,
- bool compare_seq)
-{
- if (ref1->bytenr < ref2->bytenr)
- return -1;
- if (ref1->bytenr > ref2->bytenr)
- return 1;
- if (ref1->is_head && ref2->is_head)
- return 0;
- if (ref2->is_head)
- return -1;
- if (ref1->is_head)
- return 1;
- if (ref1->type < ref2->type)
- return -1;
- if (ref1->type > ref2->type)
- return 1;
- if (ref1->no_quota > ref2->no_quota)
- return 1;
- if (ref1->no_quota < ref2->no_quota)
- return -1;
- /* merging of sequenced refs is not allowed */
- if (compare_seq) {
- if (ref1->seq < ref2->seq)
- return -1;
- if (ref1->seq > ref2->seq)
- return 1;
- }
- if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
- ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) {
- return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2),
- btrfs_delayed_node_to_tree_ref(ref1),
- ref1->type);
- } else if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY ||
- ref1->type == BTRFS_SHARED_DATA_REF_KEY) {
- return comp_data_refs(btrfs_delayed_node_to_data_ref(ref2),
- btrfs_delayed_node_to_data_ref(ref1));
- }
- BUG();
- return 0;
-}
-
-/*
- * insert a new ref into the rbtree. This returns any existing refs
- * for the same (bytenr,parent) tuple, or NULL if the new node was properly
- * inserted.
- */
-static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
- struct rb_node *node)
-{
- struct rb_node **p = &root->rb_node;
- struct rb_node *parent_node = NULL;
- struct btrfs_delayed_ref_node *entry;
- struct btrfs_delayed_ref_node *ins;
- int cmp;
-
- ins = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
- while (*p) {
- parent_node = *p;
- entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
- rb_node);
-
- cmp = comp_entry(entry, ins, 1);
- if (cmp < 0)
- p = &(*p)->rb_left;
- else if (cmp > 0)
- p = &(*p)->rb_right;
- else
- return entry;
- }
-
- rb_link_node(node, parent_node, p);
- rb_insert_color(node, root);
- return NULL;
-}
-
/* insert a new ref to head ref rbtree */
static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root,
struct rb_node *node)
@@ -268,7 +188,7 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
rb_erase(&head->href_node, &delayed_refs->href_root);
} else {
assert_spin_locked(&head->lock);
- rb_erase(&ref->rb_node, &head->ref_root);
+ list_del(&ref->list);
}
ref->in_tree = 0;
btrfs_put_delayed_ref(ref);
@@ -277,99 +197,6 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
trans->delayed_ref_updates--;
}
-static int merge_ref(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_root *delayed_refs,
- struct btrfs_delayed_ref_head *head,
- struct btrfs_delayed_ref_node *ref, u64 seq)
-{
- struct rb_node *node;
- int mod = 0;
- int done = 0;
-
- node = rb_next(&ref->rb_node);
- while (!done && node) {
- struct btrfs_delayed_ref_node *next;
-
- next = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
- node = rb_next(node);
- if (seq && next->seq >= seq)
- break;
- if (comp_entry(ref, next, 0))
- continue;
-
- if (ref->action == next->action) {
- mod = next->ref_mod;
- } else {
- if (ref->ref_mod < next->ref_mod) {
- struct btrfs_delayed_ref_node *tmp;
-
- tmp = ref;
- ref = next;
- next = tmp;
- done = 1;
- }
- mod = -next->ref_mod;
- }
-
- drop_delayed_ref(trans, delayed_refs, head, next);
- ref->ref_mod += mod;
- if (ref->ref_mod == 0) {
- drop_delayed_ref(trans, delayed_refs, head, ref);
- done = 1;
- } else {
- /*
- * You can't have multiples of the same ref on a tree
- * block.
- */
- WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY ||
- ref->type == BTRFS_SHARED_BLOCK_REF_KEY);
- }
- }
- return done;
-}
-
-void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
- struct btrfs_delayed_ref_root *delayed_refs,
- struct btrfs_delayed_ref_head *head)
-{
- struct rb_node *node;
- u64 seq = 0;
-
- assert_spin_locked(&head->lock);
- /*
- * We don't have too much refs to merge in the case of delayed data
- * refs.
- */
- if (head->is_data)
- return;
-
- spin_lock(&fs_info->tree_mod_seq_lock);
- if (!list_empty(&fs_info->tree_mod_seq_list)) {
- struct seq_list *elem;
-
- elem = list_first_entry(&fs_info->tree_mod_seq_list,
- struct seq_list, list);
- seq = elem->seq;
- }
- spin_unlock(&fs_info->tree_mod_seq_lock);
-
- node = rb_first(&head->ref_root);
- while (node) {
- struct btrfs_delayed_ref_node *ref;
-
- ref = rb_entry(node, struct btrfs_delayed_ref_node,
- rb_node);
- /* We can't merge refs that are outside of our seq count */
- if (seq && ref->seq >= seq)
- break;
- if (merge_ref(trans, delayed_refs, head, ref, seq))
- node = rb_first(&head->ref_root);
- else
- node = rb_next(&ref->rb_node);
- }
-}
-
int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
u64 seq)
@@ -443,45 +270,71 @@ again:
}
/*
- * helper function to update an extent delayed ref in the
- * rbtree. existing and update must both have the same
- * bytenr and parent
+ * Helper to insert the ref_node at the tail or merge it with the tail node.
*
- * This may free existing if the update cancels out whatever
- * operation it was doing.
+ * Return 0 for insert.
+ * Return >0 for merge.
*/
-static noinline void
-update_existing_ref(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_root *delayed_refs,
- struct btrfs_delayed_ref_head *head,
- struct btrfs_delayed_ref_node *existing,
- struct btrfs_delayed_ref_node *update)
+static int
+add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_root *root,
+ struct btrfs_delayed_ref_head *href,
+ struct btrfs_delayed_ref_node *ref)
{
- if (update->action != existing->action) {
- /*
- * this is effectively undoing either an add or a
- * drop. We decrement the ref_mod, and if it goes
- * down to zero we just delete the entry without
- * every changing the extent allocation tree.
- */
- existing->ref_mod--;
- if (existing->ref_mod == 0)
- drop_delayed_ref(trans, delayed_refs, head, existing);
- else
- WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
- existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
+ struct btrfs_delayed_ref_node *exist;
+ int mod;
+ int ret = 0;
+
+ spin_lock(&href->lock);
+ /* Check whether we can merge the tail node with ref */
+ if (list_empty(&href->ref_list))
+ goto add_tail;
+ exist = list_entry(href->ref_list.prev, struct btrfs_delayed_ref_node,
+ list);
+ /* No need to compare bytenr nor is_head */
+ if (exist->type != ref->type || exist->no_quota != ref->no_quota ||
+ exist->seq != ref->seq)
+ goto add_tail;
+
+ if ((exist->type == BTRFS_TREE_BLOCK_REF_KEY ||
+ exist->type == BTRFS_SHARED_BLOCK_REF_KEY) &&
+ comp_tree_refs(btrfs_delayed_node_to_tree_ref(exist),
+ btrfs_delayed_node_to_tree_ref(ref),
+ ref->type))
+ goto add_tail;
+ if ((exist->type == BTRFS_EXTENT_DATA_REF_KEY ||
+ exist->type == BTRFS_SHARED_DATA_REF_KEY) &&
+ comp_data_refs(btrfs_delayed_node_to_data_ref(exist),
+ btrfs_delayed_node_to_data_ref(ref)))
+ goto add_tail;
+
+ /* Now we are sure we can merge */
+ ret = 1;
+ if (exist->action == ref->action) {
+ mod = ref->ref_mod;
} else {
- WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
- existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
- /*
- * the action on the existing ref matches
- * the action on the ref we're trying to add.
- * Bump the ref_mod by one so the backref that
- * is eventually added/removed has the correct
- * reference count
- */
- existing->ref_mod += update->ref_mod;
+ /* Need to change action */
+ if (exist->ref_mod < ref->ref_mod) {
+ exist->action = ref->action;
+ mod = -exist->ref_mod;
+ exist->ref_mod = ref->ref_mod;
+ } else
+ mod = -ref->ref_mod;
}
+ exist->ref_mod += mod;
+
+ /* remove existing tail if its ref_mod is zero */
+ if (exist->ref_mod == 0)
+ drop_delayed_ref(trans, root, href, exist);
+ spin_unlock(&href->lock);
+ return ret;
+
+add_tail:
+ list_add_tail(&ref->list, &href->ref_list);
+ atomic_inc(&root->num_entries);
+ trans->delayed_ref_updates++;
+ spin_unlock(&href->lock);
+ return ret;
}
/*
@@ -568,12 +421,14 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs,
static noinline struct btrfs_delayed_ref_head *
add_delayed_ref_head(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_node *ref, u64 bytenr,
- u64 num_bytes, int action, int is_data)
+ struct btrfs_delayed_ref_node *ref,
+ struct btrfs_qgroup_extent_record *qrecord,
+ u64 bytenr, u64 num_bytes, int action, int is_data)
{
struct btrfs_delayed_ref_head *existing;
struct btrfs_delayed_ref_head *head_ref = NULL;
struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_qgroup_extent_record *qexisting;
int count_mod = 1;
int must_insert_reserved = 0;
@@ -618,10 +473,22 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
head_ref = btrfs_delayed_node_to_head(ref);
head_ref->must_insert_reserved = must_insert_reserved;
head_ref->is_data = is_data;
- head_ref->ref_root = RB_ROOT;
+ INIT_LIST_HEAD(&head_ref->ref_list);
head_ref->processing = 0;
head_ref->total_ref_mod = count_mod;
+ /* Record qgroup extent info if provided */
+ if (qrecord) {
+ qrecord->bytenr = bytenr;
+ qrecord->num_bytes = num_bytes;
+ qrecord->old_roots = NULL;
+
+ qexisting = btrfs_qgroup_insert_dirty_extent(delayed_refs,
+ qrecord);
+ if (qexisting)
+ kfree(qrecord);
+ }
+
spin_lock_init(&head_ref->lock);
mutex_init(&head_ref->mutex);
@@ -659,10 +526,10 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
u64 num_bytes, u64 parent, u64 ref_root, int level,
int action, int no_quota)
{
- struct btrfs_delayed_ref_node *existing;
struct btrfs_delayed_tree_ref *full_ref;
struct btrfs_delayed_ref_root *delayed_refs;
u64 seq = 0;
+ int ret;
if (action == BTRFS_ADD_DELAYED_EXTENT)
action = BTRFS_ADD_DELAYED_REF;
@@ -693,21 +560,14 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
trace_add_delayed_tree_ref(ref, full_ref, action);
- spin_lock(&head_ref->lock);
- existing = tree_insert(&head_ref->ref_root, &ref->rb_node);
- if (existing) {
- update_existing_ref(trans, delayed_refs, head_ref, existing,
- ref);
- /*
- * we've updated the existing ref, free the newly
- * allocated ref
- */
+ ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref);
+
+ /*
+	 * XXX: memory should be freed at the same level it was allocated,
+	 * but the existing code does not follow that practice; keep the
+	 * current behaviour for now and clean it up later.
+ */
+ if (ret > 0)
kmem_cache_free(btrfs_delayed_tree_ref_cachep, full_ref);
- } else {
- atomic_inc(&delayed_refs->num_entries);
- trans->delayed_ref_updates++;
- }
- spin_unlock(&head_ref->lock);
}
/*
@@ -721,10 +581,10 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
u64 num_bytes, u64 parent, u64 ref_root, u64 owner,
u64 offset, int action, int no_quota)
{
- struct btrfs_delayed_ref_node *existing;
struct btrfs_delayed_data_ref *full_ref;
struct btrfs_delayed_ref_root *delayed_refs;
u64 seq = 0;
+ int ret;
if (action == BTRFS_ADD_DELAYED_EXTENT)
action = BTRFS_ADD_DELAYED_REF;
@@ -758,21 +618,10 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
trace_add_delayed_data_ref(ref, full_ref, action);
- spin_lock(&head_ref->lock);
- existing = tree_insert(&head_ref->ref_root, &ref->rb_node);
- if (existing) {
- update_existing_ref(trans, delayed_refs, head_ref, existing,
- ref);
- /*
- * we've updated the existing ref, free the newly
- * allocated ref
- */
+ ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref);
+
+ if (ret > 0)
kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref);
- } else {
- atomic_inc(&delayed_refs->num_entries);
- trans->delayed_ref_updates++;
- }
- spin_unlock(&head_ref->lock);
}
/*
@@ -790,6 +639,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_tree_ref *ref;
struct btrfs_delayed_ref_head *head_ref;
struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_qgroup_extent_record *record = NULL;
if (!is_fstree(ref_root) || !fs_info->quota_enabled)
no_quota = 0;
@@ -800,9 +650,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
return -ENOMEM;
head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
- if (!head_ref) {
- kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
- return -ENOMEM;
+ if (!head_ref)
+ goto free_ref;
+
+ if (fs_info->quota_enabled && is_fstree(ref_root)) {
+ record = kmalloc(sizeof(*record), GFP_NOFS);
+ if (!record)
+ goto free_head_ref;
}
head_ref->extent_op = extent_op;
@@ -814,7 +668,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
* insert both the head node and the new ref without dropping
* the spin lock
*/
- head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node,
+ head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record,
bytenr, num_bytes, action, 0);
add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr,
@@ -823,6 +677,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
spin_unlock(&delayed_refs->lock);
return 0;
+
+free_head_ref:
+ kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
+free_ref:
+ kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
+
+ return -ENOMEM;
}
/*
@@ -839,6 +700,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_data_ref *ref;
struct btrfs_delayed_ref_head *head_ref;
struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_qgroup_extent_record *record = NULL;
if (!is_fstree(ref_root) || !fs_info->quota_enabled)
no_quota = 0;
@@ -854,6 +716,16 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
return -ENOMEM;
}
+ if (fs_info->quota_enabled && is_fstree(ref_root)) {
+ record = kmalloc(sizeof(*record), GFP_NOFS);
+ if (!record) {
+ kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
+ kmem_cache_free(btrfs_delayed_ref_head_cachep,
+ head_ref);
+ return -ENOMEM;
+ }
+ }
+
head_ref->extent_op = extent_op;
delayed_refs = &trans->transaction->delayed_refs;
@@ -863,7 +735,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
* insert both the head node and the new ref without dropping
* the spin lock
*/
- head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node,
+ head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record,
bytenr, num_bytes, action, 1);
add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr,
@@ -891,9 +763,9 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
- add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
- num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
- extent_op->is_data);
+ add_delayed_ref_head(fs_info, trans, &head_ref->node, NULL, bytenr,
+ num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
+ extent_op->is_data);
spin_unlock(&delayed_refs->lock);
return 0;
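
A minimal userspace sketch of the tail-merge technique introduced above (not the kernel code itself): delayed refs for a head are kept in a list, and a new ref is either appended or merged into the current tail by adjusting ref_mod and, when the new ref dominates, taking over its action. Names such as toy_ref/toy_head are hypothetical stand-ins, the type/seq comparison is simplified, and locking and qgroup records are left out.

#include <stdio.h>
#include <stdlib.h>

#define ACTION_ADD	1
#define ACTION_DROP	2

struct toy_ref {
	int type;			/* stands in for the full ref comparison */
	unsigned long long seq;		/* refs with different seq must not merge */
	int action;			/* ACTION_ADD or ACTION_DROP */
	int ref_mod;			/* how many times this ref is applied */
	struct toy_ref *next;
};

struct toy_head {
	struct toy_ref *first;
	struct toy_ref *tail;
};

/* Return 1 if @ref was merged into the tail (and freed), 0 if appended. */
static int add_tail_merge(struct toy_head *head, struct toy_ref *ref)
{
	struct toy_ref *exist = head->tail;
	int mod;

	/* Nothing to merge with, or the tail describes a different ref. */
	if (!exist || exist->type != ref->type || exist->seq != ref->seq) {
		ref->next = NULL;
		if (exist)
			exist->next = ref;
		else
			head->first = ref;
		head->tail = ref;
		return 0;
	}

	if (exist->action == ref->action) {
		mod = ref->ref_mod;
	} else if (exist->ref_mod < ref->ref_mod) {
		/* The new ref wins: the tail takes over its action. */
		exist->action = ref->action;
		mod = -exist->ref_mod;
		exist->ref_mod = ref->ref_mod;
	} else {
		mod = -ref->ref_mod;
	}
	exist->ref_mod += mod;

	/* An add and a drop cancelled each other out: drop the tail too. */
	if (exist->ref_mod == 0) {
		struct toy_ref *prev = head->first;

		if (prev == exist) {
			head->first = head->tail = NULL;
		} else {
			while (prev->next != exist)
				prev = prev->next;
			prev->next = NULL;
			head->tail = prev;
		}
		free(exist);
	}
	free(ref);
	return 1;
}

int main(void)
{
	struct toy_head head = { NULL, NULL };
	struct toy_ref *add = calloc(1, sizeof(*add));
	struct toy_ref *drop = calloc(1, sizeof(*drop));

	if (!add || !drop)
		return 1;
	add->type = 1;  add->action = ACTION_ADD;  add->ref_mod = 1;
	drop->type = 1; drop->action = ACTION_DROP; drop->ref_mod = 1;

	add_tail_merge(&head, add);	/* list was empty: appended */
	add_tail_merge(&head, drop);	/* merges with the tail, cancels it */
	printf("list empty after merge: %s\n", head.first ? "no" : "yes");
	return 0;
}
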
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 5eb0892396d0..13fb5e6090fe 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -24,9 +24,25 @@
#define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */
#define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */
+/*
+ * XXX: Qu: I really hate the design that ref_head and tree/data refs share
+ * the same ref_node structure.
+ * Ref_head is at a higher logical level than tree/data refs, and the
+ * duplicated bytenr/num_bytes in ref_node are really a waste of memory; they
+ * should be taken from ref_head instead.
+ * This gets even more disgusting now that a list is used to store tree/data
+ * refs in ref_head. Must clean this mess up later.
+ */
struct btrfs_delayed_ref_node {
+ /*
+	 * ref_heads use an rb tree, stored in ref_root->href_root and
+	 * indexed by bytenr.
+ */
struct rb_node rb_node;
+	/* data/tree refs use a list, stored in ref_head->ref_list. */
+ struct list_head list;
+
/* the starting bytenr of the extent */
u64 bytenr;
@@ -83,7 +99,7 @@ struct btrfs_delayed_ref_head {
struct mutex mutex;
spinlock_t lock;
- struct rb_root ref_root;
+ struct list_head ref_list;
struct rb_node href_node;
@@ -132,6 +148,9 @@ struct btrfs_delayed_ref_root {
/* head ref rbtree */
struct rb_root href_root;
+ /* dirty extent records */
+ struct rb_root dirty_extent_root;
+
/* this spin lock protects the rbtree and the entries inside */
spinlock_t lock;
@@ -156,6 +175,14 @@ struct btrfs_delayed_ref_root {
int flushing;
u64 run_delayed_start;
+
+ /*
+	 * Tells qgroup to skip the given root.
+	 * This is used for snapshot creation, as btrfs_qgroup_inherit() will
+	 * manually modify the counters for the snapshot and its source, so we
+	 * should skip the snapshot in new_root/old_roots or it will be
+	 * counted twice.
+ */
+ u64 qgroup_to_skip;
};
extern struct kmem_cache *btrfs_delayed_ref_head_cachep;
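
The new dirty_extent_root added to struct btrfs_delayed_ref_root above is filled by btrfs_qgroup_insert_dirty_extent() (see the qgroup.c hunk further below): records are keyed by bytenr, and an insert that finds an existing record returns it so the caller can free the duplicate. A minimal sketch of that insert-or-return-existing behaviour, using a plain unbalanced binary search tree instead of the kernel's rb-tree; toy_record and the sample numbers are hypothetical.

#include <stdio.h>
#include <stdlib.h>

struct toy_record {
	unsigned long long bytenr;
	unsigned long long num_bytes;
	struct toy_record *left;
	struct toy_record *right;
};

/*
 * Insert @rec keyed by bytenr. If a record with the same bytenr already
 * exists, return it (the caller then frees the duplicate), otherwise link
 * @rec into the tree and return NULL.
 */
static struct toy_record *insert_dirty_extent(struct toy_record **root,
					      struct toy_record *rec)
{
	struct toy_record **p = root;

	while (*p) {
		if (rec->bytenr < (*p)->bytenr)
			p = &(*p)->left;
		else if (rec->bytenr > (*p)->bytenr)
			p = &(*p)->right;
		else
			return *p;
	}
	rec->left = rec->right = NULL;
	*p = rec;
	return NULL;
}

int main(void)
{
	struct toy_record *root = NULL;
	struct toy_record *a = calloc(1, sizeof(*a));
	struct toy_record *b = calloc(1, sizeof(*b));

	if (!a || !b)
		return 1;
	a->bytenr = 4096;
	b->bytenr = 4096;	/* duplicate bytenr */

	insert_dirty_extent(&root, a);
	if (insert_dirty_extent(&root, b)) {
		/* Same pattern as add_delayed_ref_head(): free the duplicate. */
		free(b);
		printf("duplicate record for bytenr %llu dropped\n", a->bytenr);
	}
	return 0;
}
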
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 0573848c7333..862fbc206755 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -376,6 +376,10 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
WARN_ON(!tgt_device);
dev_replace->tgtdev = tgt_device;
+ ret = btrfs_kobj_add_device(tgt_device->fs_devices, tgt_device);
+ if (ret)
+ btrfs_error(root->fs_info, ret, "kobj add dev failed");
+
printk_in_rcu(KERN_INFO
"BTRFS: dev_replace from %s (devid %llu) to %s started\n",
src_device->missing ? "<missing disk>" :
@@ -583,8 +587,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
mutex_unlock(&uuid_mutex);
/* replace the sysfs entry */
- btrfs_kobj_rm_device(fs_info, src_device);
- btrfs_kobj_add_device(fs_info, tgt_device);
+ btrfs_kobj_rm_device(fs_info->fs_devices, src_device);
btrfs_rm_dev_replace_free_srcdev(fs_info, src_device);
/* write back the superblocks */
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0bccf18dc1dc..3f43bfea3684 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1149,12 +1149,12 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
buf = btrfs_find_create_tree_block(root, bytenr);
if (!buf)
- return NULL;
+ return ERR_PTR(-ENOMEM);
ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
if (ret) {
free_extent_buffer(buf);
- return NULL;
+ return ERR_PTR(ret);
}
return buf;
@@ -1509,20 +1509,19 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
generation = btrfs_root_generation(&root->root_item);
root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
generation);
- if (!root->node) {
- ret = -ENOMEM;
+ if (IS_ERR(root->node)) {
+ ret = PTR_ERR(root->node);
goto find_fail;
} else if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
ret = -EIO;
- goto read_fail;
+ free_extent_buffer(root->node);
+ goto find_fail;
}
root->commit_root = btrfs_root_node(root);
out:
btrfs_free_path(path);
return root;
-read_fail:
- free_extent_buffer(root->node);
find_fail:
kfree(root);
alloc_fail:
@@ -2320,8 +2319,12 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
log_tree_root->node = read_tree_block(tree_root, bytenr,
fs_info->generation + 1);
- if (!log_tree_root->node ||
- !extent_buffer_uptodate(log_tree_root->node)) {
+ if (IS_ERR(log_tree_root->node)) {
+ printk(KERN_ERR "BTRFS: failed to read log tree\n");
+ ret = PTR_ERR(log_tree_root->node);
+ kfree(log_tree_root);
+ return ret;
+ } else if (!extent_buffer_uptodate(log_tree_root->node)) {
printk(KERN_ERR "BTRFS: failed to read log tree\n");
free_extent_buffer(log_tree_root->node);
kfree(log_tree_root);
@@ -2494,7 +2497,6 @@ int open_ctree(struct super_block *sb,
seqlock_init(&fs_info->profiles_lock);
init_rwsem(&fs_info->delayed_iput_sem);
- init_completion(&fs_info->kobj_unregister);
INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
INIT_LIST_HEAD(&fs_info->space_info);
INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
@@ -2797,8 +2799,8 @@ int open_ctree(struct super_block *sb,
chunk_root->node = read_tree_block(chunk_root,
btrfs_super_chunk_root(disk_super),
generation);
- if (!chunk_root->node ||
- !test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
+ if (IS_ERR(chunk_root->node) ||
+ !extent_buffer_uptodate(chunk_root->node)) {
printk(KERN_ERR "BTRFS: failed to read chunk root on %s\n",
sb->s_id);
goto fail_tree_roots;
@@ -2834,8 +2836,8 @@ retry_root_backup:
tree_root->node = read_tree_block(tree_root,
btrfs_super_root(disk_super),
generation);
- if (!tree_root->node ||
- !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
+ if (IS_ERR(tree_root->node) ||
+ !extent_buffer_uptodate(tree_root->node)) {
printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n",
sb->s_id);
@@ -2874,10 +2876,22 @@ retry_root_backup:
btrfs_close_extra_devices(fs_devices, 1);
+ ret = btrfs_sysfs_add_fsid(fs_devices, NULL);
+ if (ret) {
+ pr_err("BTRFS: failed to init sysfs fsid interface: %d\n", ret);
+ goto fail_block_groups;
+ }
+
+ ret = btrfs_sysfs_add_device(fs_devices);
+ if (ret) {
+ pr_err("BTRFS: failed to init sysfs device interface: %d\n", ret);
+ goto fail_fsdev_sysfs;
+ }
+
ret = btrfs_sysfs_add_one(fs_info);
if (ret) {
pr_err("BTRFS: failed to init sysfs interface: %d\n", ret);
- goto fail_block_groups;
+ goto fail_fsdev_sysfs;
}
ret = btrfs_init_space_info(fs_info);
@@ -3055,6 +3069,9 @@ fail_cleaner:
fail_sysfs:
btrfs_sysfs_remove_one(fs_info);
+fail_fsdev_sysfs:
+ btrfs_sysfs_remove_fsid(fs_info->fs_devices);
+
fail_block_groups:
btrfs_put_block_group_cache(fs_info);
btrfs_free_block_groups(fs_info);
@@ -3725,6 +3742,7 @@ void close_ctree(struct btrfs_root *root)
}
btrfs_sysfs_remove_one(fs_info);
+ btrfs_sysfs_remove_fsid(fs_info->fs_devices);
btrfs_free_fs_roots(fs_info);
@@ -4053,6 +4071,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
while ((node = rb_first(&delayed_refs->href_root)) != NULL) {
struct btrfs_delayed_ref_head *head;
+ struct btrfs_delayed_ref_node *tmp;
bool pin_bytes = false;
head = rb_entry(node, struct btrfs_delayed_ref_head,
@@ -4068,11 +4087,10 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
continue;
}
spin_lock(&head->lock);
- while ((node = rb_first(&head->ref_root)) != NULL) {
- ref = rb_entry(node, struct btrfs_delayed_ref_node,
- rb_node);
+ list_for_each_entry_safe_reverse(ref, tmp, &head->ref_list,
+ list) {
ref->in_tree = 0;
- rb_erase(&ref->rb_node, &head->ref_root);
+ list_del(&ref->list);
atomic_dec(&delayed_refs->num_entries);
btrfs_put_delayed_ref(ref);
}
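
The disk-io.c hunks above change read_tree_block() to return an error pointer instead of NULL, so callers can tell -ENOMEM apart from -EIO. Below is a small userspace sketch of that ERR_PTR()/IS_ERR()/PTR_ERR() convention, assuming a toy read_block() helper; in the kernel these macros come from <linux/err.h> and are not redefined like this.

#include <errno.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

#define MAX_ERRNO	4095

static inline void *ERR_PTR(long error)
{
	return (void *)(intptr_t)error;	/* negative errno packed into a pointer */
}

static inline long PTR_ERR(const void *ptr)
{
	return (long)(intptr_t)ptr;
}

static inline int IS_ERR(const void *ptr)
{
	return (uintptr_t)ptr >= (uintptr_t)-MAX_ERRNO;
}

struct toy_buffer { char data[16]; };

/* Hypothetical reader: fails with -EIO for one block, -ENOMEM for another. */
static struct toy_buffer *read_block(unsigned long long bytenr)
{
	if (bytenr == 1)
		return ERR_PTR(-EIO);
	if (bytenr == 2)
		return ERR_PTR(-ENOMEM);
	return calloc(1, sizeof(struct toy_buffer));
}

int main(void)
{
	for (unsigned long long bytenr = 0; bytenr < 3; bytenr++) {
		struct toy_buffer *buf = read_block(bytenr);

		if (IS_ERR(buf)) {
			/* Same shape as the new read_tree_block() callers. */
			printf("block %llu: error %ld\n", bytenr, PTR_ERR(buf));
			continue;
		}
		printf("block %llu: ok\n", bytenr);
		free(buf);
	}
	return 0;
}
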
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0ec3acd14cbf..38b76cc02f48 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -79,11 +79,10 @@ static int update_block_group(struct btrfs_trans_handle *trans,
u64 num_bytes, int alloc);
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- u64 bytenr, u64 num_bytes, u64 parent,
+ struct btrfs_delayed_ref_node *node, u64 parent,
u64 root_objectid, u64 owner_objectid,
u64 owner_offset, int refs_to_drop,
- struct btrfs_delayed_extent_op *extra_op,
- int no_quota);
+ struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
struct extent_buffer *leaf,
struct btrfs_extent_item *ei);
@@ -1967,10 +1966,9 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- u64 bytenr, u64 num_bytes,
+ struct btrfs_delayed_ref_node *node,
u64 parent, u64 root_objectid,
u64 owner, u64 offset, int refs_to_add,
- int no_quota,
struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -1978,9 +1976,11 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
struct btrfs_extent_item *item;
struct btrfs_key key;
+ u64 bytenr = node->bytenr;
+ u64 num_bytes = node->num_bytes;
u64 refs;
int ret;
- enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_ADD_EXCL;
+ int no_quota = node->no_quota;
path = btrfs_alloc_path();
if (!path)
@@ -1996,26 +1996,8 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
bytenr, num_bytes, parent,
root_objectid, owner, offset,
refs_to_add, extent_op);
- if ((ret < 0 && ret != -EAGAIN) || (!ret && no_quota))
+ if ((ret < 0 && ret != -EAGAIN) || !ret)
goto out;
- /*
- * Ok we were able to insert an inline extent and it appears to be a new
- * reference, deal with the qgroup accounting.
- */
- if (!ret && !no_quota) {
- ASSERT(root->fs_info->quota_enabled);
- leaf = path->nodes[0];
- btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
- item = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_extent_item);
- if (btrfs_extent_refs(leaf, item) > (u64)refs_to_add)
- type = BTRFS_QGROUP_OPER_ADD_SHARED;
- btrfs_release_path(path);
-
- ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
- bytenr, num_bytes, type, 0);
- goto out;
- }
/*
* Ok we had -EAGAIN which means we didn't have space to insert and
@@ -2026,8 +2008,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
refs = btrfs_extent_refs(leaf, item);
- if (refs)
- type = BTRFS_QGROUP_OPER_ADD_SHARED;
btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
if (extent_op)
__run_delayed_extent_op(extent_op, leaf, item);
@@ -2035,13 +2015,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
- if (!no_quota) {
- ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
- bytenr, num_bytes, type, 0);
- if (ret)
- goto out;
- }
-
path->reada = 1;
path->leave_spinning = 1;
/* now insert the actual backref */
@@ -2087,17 +2060,15 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
ref->objectid, ref->offset,
&ins, node->ref_mod);
} else if (node->action == BTRFS_ADD_DELAYED_REF) {
- ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
- node->num_bytes, parent,
+ ret = __btrfs_inc_extent_ref(trans, root, node, parent,
ref_root, ref->objectid,
ref->offset, node->ref_mod,
- node->no_quota, extent_op);
+ extent_op);
} else if (node->action == BTRFS_DROP_DELAYED_REF) {
- ret = __btrfs_free_extent(trans, root, node->bytenr,
- node->num_bytes, parent,
+ ret = __btrfs_free_extent(trans, root, node, parent,
ref_root, ref->objectid,
ref->offset, node->ref_mod,
- extent_op, node->no_quota);
+ extent_op);
} else {
BUG();
}
@@ -2255,15 +2226,14 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
ref->level, &ins,
node->no_quota);
} else if (node->action == BTRFS_ADD_DELAYED_REF) {
- ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
- node->num_bytes, parent, ref_root,
- ref->level, 0, 1, node->no_quota,
+ ret = __btrfs_inc_extent_ref(trans, root, node,
+ parent, ref_root,
+ ref->level, 0, 1,
extent_op);
} else if (node->action == BTRFS_DROP_DELAYED_REF) {
- ret = __btrfs_free_extent(trans, root, node->bytenr,
- node->num_bytes, parent, ref_root,
- ref->level, 0, 1, extent_op,
- node->no_quota);
+ ret = __btrfs_free_extent(trans, root, node,
+ parent, ref_root,
+ ref->level, 0, 1, extent_op);
} else {
BUG();
}
@@ -2323,28 +2293,14 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
return ret;
}
-static noinline struct btrfs_delayed_ref_node *
+static inline struct btrfs_delayed_ref_node *
select_delayed_ref(struct btrfs_delayed_ref_head *head)
{
- struct rb_node *node;
- struct btrfs_delayed_ref_node *ref, *last = NULL;;
+ if (list_empty(&head->ref_list))
+ return NULL;
- /*
- * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
- * this prevents ref count from going down to zero when
- * there still are pending delayed ref.
- */
- node = rb_first(&head->ref_root);
- while (node) {
- ref = rb_entry(node, struct btrfs_delayed_ref_node,
- rb_node);
- if (ref->action == BTRFS_ADD_DELAYED_REF)
- return ref;
- else if (last == NULL)
- last = ref;
- node = rb_next(node);
- }
- return last;
+ return list_entry(head->ref_list.next, struct btrfs_delayed_ref_node,
+ list);
}
/*
@@ -2396,16 +2352,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
}
}
- /*
- * We need to try and merge add/drops of the same ref since we
- * can run into issues with relocate dropping the implicit ref
- * and then it being added back again before the drop can
- * finish. If we merged anything we need to re-loop so we can
- * get a good ref.
- */
spin_lock(&locked_ref->lock);
- btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
- locked_ref);
/*
* locked_ref is the head node, so we have to go one
@@ -2482,7 +2429,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
spin_unlock(&locked_ref->lock);
spin_lock(&delayed_refs->lock);
spin_lock(&locked_ref->lock);
- if (rb_first(&locked_ref->ref_root) ||
+ if (!list_empty(&locked_ref->ref_list) ||
locked_ref->extent_op) {
spin_unlock(&locked_ref->lock);
spin_unlock(&delayed_refs->lock);
@@ -2496,7 +2443,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
} else {
actual_count++;
ref->in_tree = 0;
- rb_erase(&ref->rb_node, &locked_ref->ref_root);
+ list_del(&ref->list);
}
atomic_dec(&delayed_refs->num_entries);
@@ -2864,9 +2811,6 @@ again:
goto again;
}
out:
- ret = btrfs_delayed_qgroup_accounting(trans, root->fs_info);
- if (ret)
- return ret;
assert_qgroups_uptodate(trans);
return 0;
}
@@ -2905,7 +2849,6 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *ref;
struct btrfs_delayed_data_ref *data_ref;
struct btrfs_delayed_ref_root *delayed_refs;
- struct rb_node *node;
int ret = 0;
delayed_refs = &trans->transaction->delayed_refs;
@@ -2934,11 +2877,7 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
spin_unlock(&delayed_refs->lock);
spin_lock(&head->lock);
- node = rb_first(&head->ref_root);
- while (node) {
- ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
- node = rb_next(node);
-
+ list_for_each_entry(ref, &head->ref_list, list) {
/* If it's a shared ref we know a cross reference exists */
if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
ret = 1;
@@ -3693,7 +3632,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
found->disk_total += total_bytes * factor;
found->bytes_used += bytes_used;
found->disk_used += bytes_used * factor;
- found->full = 0;
+ if (total_bytes > 0)
+ found->full = 0;
spin_unlock(&found->lock);
*space_info = found;
return 0;
@@ -3721,7 +3661,10 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
found->bytes_reserved = 0;
found->bytes_readonly = 0;
found->bytes_may_use = 0;
- found->full = 0;
+ if (total_bytes > 0)
+ found->full = 0;
+ else
+ found->full = 1;
found->force_alloc = CHUNK_ALLOC_NO_FORCE;
found->chunk_alloc = 0;
found->flush = 0;
@@ -3975,6 +3918,9 @@ commit_trans:
!atomic_read(&root->fs_info->open_ioctl_trans)) {
need_commit--;
+ if (need_commit > 0)
+ btrfs_wait_ordered_roots(fs_info, -1);
+
trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
return PTR_ERR(trans);
@@ -4088,7 +4034,7 @@ static int should_alloc_chunk(struct btrfs_root *root,
return 1;
}
-static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
+static u64 get_profile_num_devs(struct btrfs_root *root, u64 type)
{
u64 num_dev;
@@ -4102,24 +4048,43 @@ static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
else
num_dev = 1; /* DUP or single */
- /* metadata for updaing devices and chunk tree */
- return btrfs_calc_trans_metadata_size(root, num_dev + 1);
+ return num_dev;
}
-static void check_system_chunk(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 type)
+/*
+ * If @is_allocation is true, reserve space in the system space info necessary
+ * for allocating a chunk, otherwise if it's false, reserve space necessary for
+ * removing a chunk.
+ */
+void check_system_chunk(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 type)
{
struct btrfs_space_info *info;
u64 left;
u64 thresh;
+ int ret = 0;
+ u64 num_devs;
+
+ /*
+	 * Needed because we can end up allocating a system chunk, and we need
+	 * an atomic and race-free space reservation in the chunk block
+	 * reserve.
+ */
+ ASSERT(mutex_is_locked(&root->fs_info->chunk_mutex));
info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
spin_lock(&info->lock);
left = info->total_bytes - info->bytes_used - info->bytes_pinned -
- info->bytes_reserved - info->bytes_readonly;
+ info->bytes_reserved - info->bytes_readonly -
+ info->bytes_may_use;
spin_unlock(&info->lock);
- thresh = get_system_chunk_thresh(root, type);
+ num_devs = get_profile_num_devs(root, type);
+
+ /* num_devs device items to update and 1 chunk item to add or remove */
+ thresh = btrfs_calc_trunc_metadata_size(root, num_devs) +
+ btrfs_calc_trans_metadata_size(root, 1);
+
if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) {
btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu",
left, thresh, type);
@@ -4130,7 +4095,21 @@ static void check_system_chunk(struct btrfs_trans_handle *trans,
u64 flags;
flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0);
- btrfs_alloc_chunk(trans, root, flags);
+ /*
+ * Ignore failure to create system chunk. We might end up not
+ * needing it, as we might not need to COW all nodes/leafs from
+ * the paths we visit in the chunk tree (they were already COWed
+ * or created in the current transaction for example).
+ */
+ ret = btrfs_alloc_chunk(trans, root, flags);
+ }
+
+ if (!ret) {
+ ret = btrfs_block_rsv_add(root->fs_info->chunk_root,
+ &root->fs_info->chunk_block_rsv,
+ thresh, BTRFS_RESERVE_NO_FLUSH);
+ if (!ret)
+ trans->chunk_bytes_reserved += thresh;
}
}
@@ -5188,6 +5167,24 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
trans->bytes_reserved = 0;
}
+/*
+ * To be called after all the new block groups attached to the transaction
+ * handle have been created (btrfs_create_pending_block_groups()).
+ */
+void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->root->fs_info;
+
+ if (!trans->chunk_bytes_reserved)
+ return;
+
+ WARN_ON_ONCE(!list_empty(&trans->new_bgs));
+
+ block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL,
+ trans->chunk_bytes_reserved);
+ trans->chunk_bytes_reserved = 0;
+}
+
/* Can only return 0 or -ENOSPC */
int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
struct inode *inode)
@@ -6092,11 +6089,10 @@ static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes,
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- u64 bytenr, u64 num_bytes, u64 parent,
+ struct btrfs_delayed_ref_node *node, u64 parent,
u64 root_objectid, u64 owner_objectid,
u64 owner_offset, int refs_to_drop,
- struct btrfs_delayed_extent_op *extent_op,
- int no_quota)
+ struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_key key;
struct btrfs_path *path;
@@ -6110,10 +6106,12 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
int extent_slot = 0;
int found_extent = 0;
int num_to_del = 1;
+ int no_quota = node->no_quota;
u32 item_size;
u64 refs;
+ u64 bytenr = node->bytenr;
+ u64 num_bytes = node->num_bytes;
int last_ref = 0;
- enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_SUB_EXCL;
bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
SKINNY_METADATA);
@@ -6294,7 +6292,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
refs -= refs_to_drop;
if (refs > 0) {
- type = BTRFS_QGROUP_OPER_SUB_SHARED;
if (extent_op)
__run_delayed_extent_op(extent_op, leaf, ei);
/*
@@ -6356,18 +6353,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
- /* Deal with the quota accounting */
- if (!ret && last_ref && !no_quota) {
- int mod_seq = 0;
-
- if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID &&
- type == BTRFS_QGROUP_OPER_SUB_SHARED)
- mod_seq = 1;
-
- ret = btrfs_qgroup_record_ref(trans, info, root_objectid,
- bytenr, num_bytes, type,
- mod_seq);
- }
out:
btrfs_free_path(path);
return ret;
@@ -6393,7 +6378,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
goto out_delayed_unlock;
spin_lock(&head->lock);
- if (rb_first(&head->ref_root))
+ if (!list_empty(&head->ref_list))
goto out;
if (head->extent_op) {
@@ -7303,13 +7288,6 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_free_path(path);
- /* Always set parent to 0 here since its exclusive anyway. */
- ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
- ins->objectid, ins->offset,
- BTRFS_QGROUP_OPER_ADD_EXCL, 0);
- if (ret)
- return ret;
-
ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
if (ret) { /* -ENOENT, logic error */
btrfs_err(fs_info, "update block group failed for %llu %llu",
@@ -7391,14 +7369,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_free_path(path);
- if (!no_quota) {
- ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
- ins->objectid, num_bytes,
- BTRFS_QGROUP_OPER_ADD_EXCL, 0);
- if (ret)
- return ret;
- }
-
ret = update_block_group(trans, root, ins->objectid, root->nodesize,
1);
if (ret) { /* -ENOENT, logic error */
@@ -7755,12 +7725,18 @@ reada:
wc->reada_slot = slot;
}
+/*
+ * TODO: Modify the related functions to add the relevant node/leaf to
+ * dirty_extent_root for later qgroup accounting.
+ *
+ * Currently, this function does nothing.
+ */
static int account_leaf_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *eb)
{
int nr = btrfs_header_nritems(eb);
- int i, extent_type, ret;
+ int i, extent_type;
struct btrfs_key key;
struct btrfs_file_extent_item *fi;
u64 bytenr, num_bytes;
@@ -7783,13 +7759,6 @@ static int account_leaf_items(struct btrfs_trans_handle *trans,
continue;
num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
-
- ret = btrfs_qgroup_record_ref(trans, root->fs_info,
- root->objectid,
- bytenr, num_bytes,
- BTRFS_QGROUP_OPER_SUB_SUBTREE, 0);
- if (ret)
- return ret;
}
return 0;
}
@@ -7858,6 +7827,8 @@ static int adjust_slots_upwards(struct btrfs_root *root,
/*
* root_eb is the subtree root and is locked before this function is called.
+ * TODO: Modify this function to add all nodes (including completely shared
+ * nodes) to dirty_extent_root so that they get accounted by qgroup.
*/
static int account_shared_subtree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -7920,7 +7891,11 @@ walk_down:
child_gen = btrfs_node_ptr_generation(eb, parent_slot);
eb = read_tree_block(root, child_bytenr, child_gen);
- if (!eb || !extent_buffer_uptodate(eb)) {
+ if (IS_ERR(eb)) {
+ ret = PTR_ERR(eb);
+ goto out;
+ } else if (!extent_buffer_uptodate(eb)) {
+ free_extent_buffer(eb);
ret = -EIO;
goto out;
}
@@ -7931,16 +7906,6 @@ walk_down:
btrfs_tree_read_lock(eb);
btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
-
- ret = btrfs_qgroup_record_ref(trans, root->fs_info,
- root->objectid,
- child_bytenr,
- root->nodesize,
- BTRFS_QGROUP_OPER_SUB_SUBTREE,
- 0);
- if (ret)
- goto out;
-
}
if (level == 0) {
@@ -8151,7 +8116,9 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
if (reada && level == 1)
reada_walk_down(trans, root, wc, path);
next = read_tree_block(root, bytenr, generation);
- if (!next || !extent_buffer_uptodate(next)) {
+ if (IS_ERR(next)) {
+ return PTR_ERR(next);
+ } else if (!extent_buffer_uptodate(next)) {
free_extent_buffer(next);
return -EIO;
}
@@ -8533,24 +8500,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
goto out_end_trans;
}
- /*
- * Qgroup update accounting is run from
- * delayed ref handling. This usually works
- * out because delayed refs are normally the
- * only way qgroup updates are added. However,
- * we may have added updates during our tree
- * walk so run qgroups here to make sure we
- * don't lose any updates.
- */
- ret = btrfs_delayed_qgroup_accounting(trans,
- root->fs_info);
- if (ret)
- printk_ratelimited(KERN_ERR "BTRFS: Failure %d "
- "running qgroup updates "
- "during snapshot delete. "
- "Quota is out of sync, "
- "rescan required.\n", ret);
-
btrfs_end_transaction_throttle(trans, tree_root);
if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
pr_debug("BTRFS: drop snapshot early exit\n");
@@ -8604,14 +8553,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
}
root_dropped = true;
out_end_trans:
- ret = btrfs_delayed_qgroup_accounting(trans, tree_root->fs_info);
- if (ret)
- printk_ratelimited(KERN_ERR "BTRFS: Failure %d "
- "running qgroup updates "
- "during snapshot delete. "
- "Quota is out of sync, "
- "rescan required.\n", ret);
-
btrfs_end_transaction_throttle(trans, tree_root);
out_free:
kfree(wc);
@@ -9562,6 +9503,19 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
free_excluded_extents(root, cache);
+ /*
+ * Call to ensure the corresponding space_info object is created and
+ * assigned to our block group, but don't update its counters just yet.
+ * We want our bg to be added to the rbtree with its ->space_info set.
+ */
+ ret = update_space_info(root->fs_info, cache->flags, 0, 0,
+ &cache->space_info);
+ if (ret) {
+ btrfs_remove_free_space_cache(cache);
+ btrfs_put_block_group(cache);
+ return ret;
+ }
+
ret = btrfs_add_block_group_cache(root->fs_info, cache);
if (ret) {
btrfs_remove_free_space_cache(cache);
@@ -9569,6 +9523,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
return ret;
}
+ /*
+ * Now that our block group has its ->space_info set and is inserted in
+ * the rbtree, update the space info's counters.
+ */
ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
&cache->space_info);
if (ret) {
diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h
new file mode 100644
index 000000000000..e69de29bb2d1
--- /dev/null
+++ b/fs/btrfs/extent-tree.h
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c374e1e71e5f..02d05817cbdf 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1277,7 +1277,12 @@ int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, gfp_t mask)
{
- return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
+ int wake = 0;
+
+ if (bits & EXTENT_LOCKED)
+ wake = 1;
+
+ return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask);
}
int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
@@ -4490,6 +4495,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
}
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
flags |= FIEMAP_EXTENT_ENCODED;
+ if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ flags |= FIEMAP_EXTENT_UNWRITTEN;
free_extent_map(em);
em = NULL;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index b072e17479aa..795d754327a7 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1868,6 +1868,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
struct btrfs_log_ctx ctx;
int ret = 0;
bool full_sync = 0;
+ const u64 len = end - start + 1;
trace_btrfs_sync_file(file, datasync);
@@ -1896,7 +1897,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* all extents are persisted and the respective file extent
* items are in the fs/subvol btree.
*/
- ret = btrfs_wait_ordered_range(inode, start, end - start + 1);
+ ret = btrfs_wait_ordered_range(inode, start, len);
} else {
/*
* Start any new ordered operations before starting to log the
@@ -1968,8 +1969,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
smp_mb();
if (btrfs_inode_in_log(inode, root->fs_info->generation) ||
- (full_sync && BTRFS_I(inode)->last_trans <=
- root->fs_info->last_trans_committed)) {
+ (BTRFS_I(inode)->last_trans <=
+ root->fs_info->last_trans_committed &&
+ (full_sync ||
+ !btrfs_have_ordered_extents_in_range(inode, start, len)))) {
/*
* We'v had everything committed since the last time we were
* modified so clear this flag in case it was set for whatever
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 9dbe5b548fa6..fb5a6b1c62a6 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -231,6 +231,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
{
int ret = 0;
struct btrfs_path *path = btrfs_alloc_path();
+ bool locked = false;
if (!path) {
ret = -ENOMEM;
@@ -238,6 +239,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
}
if (block_group) {
+ locked = true;
mutex_lock(&trans->transaction->cache_write_mutex);
if (!list_empty(&block_group->io_list)) {
list_del_init(&block_group->io_list);
@@ -269,18 +271,14 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
*/
ret = btrfs_truncate_inode_items(trans, root, inode,
0, BTRFS_EXTENT_DATA_KEY);
- if (ret) {
- mutex_unlock(&trans->transaction->cache_write_mutex);
- btrfs_abort_transaction(trans, root, ret);
- return ret;
- }
+ if (ret)
+ goto fail;
ret = btrfs_update_inode(trans, root, inode);
- if (block_group)
- mutex_unlock(&trans->transaction->cache_write_mutex);
-
fail:
+ if (locked)
+ mutex_unlock(&trans->transaction->cache_write_mutex);
if (ret)
btrfs_abort_transaction(trans, root, ret);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8bb013672aee..855935f6671a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4986,24 +4986,40 @@ static void evict_inode_truncate_pages(struct inode *inode)
}
write_unlock(&map_tree->lock);
+ /*
+ * Keep looping until we have no more ranges in the io tree.
+ * We can have ongoing bios started by readpages (called from readahead)
+ * that didn't get their end io callbacks called yet or they are still
+	 * in progress (extent_io.c:end_bio_extent_readpage()). This means some
+ * ranges can still be locked and eviction started because before
+ * submitting those bios, which are executed by a separate task (work
+ * queue kthread), inode references (inode->i_count) were not taken
+ * (which would be dropped in the end io callback of each bio).
+ * Therefore here we effectively end up waiting for those bios and
+ * anyone else holding locked ranges without having bumped the inode's
+ * reference count - if we don't do it, when they access the inode's
+	 * io_tree to unlock a range it may be too late, leading to a
+	 * use-after-free issue.
+ */
spin_lock(&io_tree->lock);
while (!RB_EMPTY_ROOT(&io_tree->state)) {
struct extent_state *state;
struct extent_state *cached_state = NULL;
+ u64 start;
+ u64 end;
node = rb_first(&io_tree->state);
state = rb_entry(node, struct extent_state, rb_node);
- atomic_inc(&state->refs);
+ start = state->start;
+ end = state->end;
spin_unlock(&io_tree->lock);
- lock_extent_bits(io_tree, state->start, state->end,
- 0, &cached_state);
- clear_extent_bit(io_tree, state->start, state->end,
+ lock_extent_bits(io_tree, start, end, 0, &cached_state);
+ clear_extent_bit(io_tree, start, end,
EXTENT_LOCKED | EXTENT_DIRTY |
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
EXTENT_DEFRAG, 1, 1,
&cached_state, GFP_NOFS);
- free_extent_state(state);
cond_resched();
spin_lock(&io_tree->lock);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 1c22c6518504..c86b835da7a8 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -553,8 +553,8 @@ static noinline int create_subvol(struct inode *dir,
key.offset = (u64)-1;
new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
if (IS_ERR(new_root)) {
- btrfs_abort_transaction(trans, root, PTR_ERR(new_root));
ret = PTR_ERR(new_root);
+ btrfs_abort_transaction(trans, root, ret);
goto fail;
}
@@ -1318,7 +1318,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
i = range->start >> PAGE_CACHE_SHIFT;
}
if (!max_to_defrag)
- max_to_defrag = last_index + 1;
+ max_to_defrag = last_index - i + 1;
/*
* make writeback starts from i, so the defrag range can be
@@ -1368,7 +1368,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
ra_index = max(i, ra_index);
btrfs_force_ra(inode->i_mapping, ra, file, ra_index,
cluster);
- ra_index += max_cluster;
+ ra_index += cluster;
}
mutex_lock(&inode->i_mutex);
@@ -2271,10 +2271,7 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file,
{
struct btrfs_ioctl_ino_lookup_args *args;
struct inode *inode;
- int ret;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
+ int ret = 0;
args = memdup_user(argp, sizeof(*args));
if (IS_ERR(args))
@@ -2282,13 +2279,28 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file,
inode = file_inode(file);
+ /*
+ * Unprivileged query to obtain the containing subvolume root id. The
+ * path is reset so it's consistent with btrfs_search_path_in_tree.
+ */
if (args->treeid == 0)
args->treeid = BTRFS_I(inode)->root->root_key.objectid;
+ if (args->objectid == BTRFS_FIRST_FREE_OBJECTID) {
+ args->name[0] = 0;
+ goto out;
+ }
+
+ if (!capable(CAP_SYS_ADMIN)) {
+ ret = -EPERM;
+ goto out;
+ }
+
ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info,
args->treeid, args->objectid,
args->name);
+out:
if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
ret = -EFAULT;
@@ -2413,8 +2425,6 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
goto out_unlock_inode;
}
- d_invalidate(dentry);
-
down_write(&root->fs_info->subvol_sem);
err = may_destroy_subvol(dest);
@@ -2508,7 +2518,7 @@ out_up_write:
out_unlock_inode:
mutex_unlock(&inode->i_mutex);
if (!err) {
- shrink_dcache_sb(root->fs_info->sb);
+ d_invalidate(dentry);
btrfs_invalidate_inodes(dest);
d_delete(dentry);
ASSERT(dest->send_in_progress == 0);
@@ -2879,12 +2889,19 @@ static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst,
return ret;
}
-static int extent_same_check_offsets(struct inode *inode, u64 off, u64 len)
+static int extent_same_check_offsets(struct inode *inode, u64 off, u64 *plen,
+ u64 olen)
{
+ u64 len = *plen;
u64 bs = BTRFS_I(inode)->root->fs_info->sb->s_blocksize;
- if (off + len > inode->i_size || off + len < off)
+ if (off + olen > inode->i_size || off + olen < off)
return -EINVAL;
+
+ /* if we extend to eof, continue to block boundary */
+ if (off + len == inode->i_size)
+ *plen = len = ALIGN(inode->i_size, bs) - off;
+
/* Check that we are block aligned - btrfs_clone() requires this */
if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs))
return -EINVAL;
@@ -2892,10 +2909,11 @@ static int extent_same_check_offsets(struct inode *inode, u64 off, u64 len)
return 0;
}
-static int btrfs_extent_same(struct inode *src, u64 loff, u64 len,
+static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
struct inode *dst, u64 dst_loff)
{
int ret;
+ u64 len = olen;
/*
* btrfs_clone() can't handle extents in the same file
@@ -2910,11 +2928,11 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 len,
btrfs_double_lock(src, loff, dst, dst_loff, len);
- ret = extent_same_check_offsets(src, loff, len);
+ ret = extent_same_check_offsets(src, loff, &len, olen);
if (ret)
goto out_unlock;
- ret = extent_same_check_offsets(dst, dst_loff, len);
+ ret = extent_same_check_offsets(dst, dst_loff, &len, olen);
if (ret)
goto out_unlock;
@@ -2927,7 +2945,7 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 len,
ret = btrfs_cmp_data(src, loff, dst, dst_loff, len);
if (ret == 0)
- ret = btrfs_clone(src, dst, loff, len, len, dst_loff);
+ ret = btrfs_clone(src, dst, loff, olen, len, dst_loff);
out_unlock:
btrfs_double_unlock(src, loff, dst, dst_loff, len);
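
The extent-same (dedupe) change above lets a range that ends exactly at EOF be extended to the next block boundary before the alignment check, so the final partial block of a file can still be deduplicated. A minimal sketch of that offset/length validation; the helper name, blocksize and sample numbers are hypothetical.

#include <stdio.h>

#define ALIGN(x, a)	(((x) + (a) - 1) & ~((unsigned long long)(a) - 1))

static int check_offsets(unsigned long long off, unsigned long long *plen,
			 unsigned long long olen, unsigned long long isize,
			 unsigned long long bs)
{
	unsigned long long len = *plen;

	if (off + olen > isize || off + olen < off)
		return -1;		/* past EOF or overflow */

	/* If we extend to EOF, continue to the block boundary. */
	if (off + len == isize)
		*plen = len = ALIGN(isize, bs) - off;

	/* Both the offset and the end must be block aligned. */
	if (off % bs || (off + len) % bs)
		return -1;
	return 0;
}

int main(void)
{
	unsigned long long len = 1000;	/* dedupe range ends exactly at EOF */

	if (check_offsets(4096, &len, 1000, 5096, 4096) == 0)
		printf("accepted, len extended to %llu\n", len);
	else
		printf("rejected\n");
	return 0;
}
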
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 760c4a5e096b..89656d799ff6 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -198,9 +198,6 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
entry->file_offset = file_offset;
entry->start = start;
entry->len = len;
- if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) &&
- !(type == BTRFS_ORDERED_NOCOW))
- entry->csum_bytes_left = disk_len;
entry->disk_len = disk_len;
entry->bytes_left = len;
entry->inode = igrab(inode);
@@ -286,10 +283,6 @@ void btrfs_add_ordered_sum(struct inode *inode,
tree = &BTRFS_I(inode)->ordered_tree;
spin_lock_irq(&tree->lock);
list_add_tail(&sum->list, &entry->list);
- WARN_ON(entry->csum_bytes_left < sum->len);
- entry->csum_bytes_left -= sum->len;
- if (entry->csum_bytes_left == 0)
- wake_up(&entry->wait);
spin_unlock_irq(&tree->lock);
}
@@ -509,7 +502,21 @@ void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE,
&ordered->flags));
- list_add_tail(&ordered->trans_list, &trans->ordered);
+ /*
+ * If our ordered extent completed it means it updated the
+ * fs/subvol and csum trees already, so no need to make the
+ * current transaction's commit wait for it, as we end up
+ * holding memory unnecessarily and delaying the inode's iput
+ * until the transaction commit (we schedule an iput for the
+ * inode when the ordered extent's refcount drops to 0), which
+ * prevents it from being evictable until the transaction
+ * commits.
+ */
+ if (test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags))
+ btrfs_put_ordered_extent(ordered);
+ else
+ list_add_tail(&ordered->trans_list, &trans->ordered);
+
spin_lock_irq(&log->log_extents_lock[index]);
}
spin_unlock_irq(&log->log_extents_lock[index]);
@@ -844,6 +851,20 @@ out:
return entry;
}
+bool btrfs_have_ordered_extents_in_range(struct inode *inode,
+ u64 file_offset,
+ u64 len)
+{
+ struct btrfs_ordered_extent *oe;
+
+ oe = btrfs_lookup_ordered_range(inode, file_offset, len);
+ if (oe) {
+ btrfs_put_ordered_extent(oe);
+ return true;
+ }
+ return false;
+}
+
/*
* lookup and return any extent before 'file_offset'. NULL is returned
* if none is found
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index e96cd4ccd805..7176cc0fe43f 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -89,9 +89,6 @@ struct btrfs_ordered_extent {
/* number of bytes that still need writing */
u64 bytes_left;
- /* number of bytes that still need csumming */
- u64 csum_bytes_left;
-
/*
* the end of the ordered extent which is behind it but
* didn't update disk_i_size. Please see the comment of
@@ -191,6 +188,9 @@ btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
u64 file_offset,
u64 len);
+bool btrfs_have_ordered_extents_in_range(struct inode *inode,
+ u64 file_offset,
+ u64 len);
int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
struct btrfs_ordered_extent *ordered);
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
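
btrfs_have_ordered_extents_in_range(), declared above and defined in the ordered-data.c hunk, is an existence test built from a refcounted lookup: look the range up, drop the reference straight away, and only report whether something was found. A toy sketch of that pattern, assuming a trivial array-backed store; toy_extent and the sample ranges are hypothetical.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct toy_extent {
	unsigned long long file_offset;
	unsigned long long len;
	int refs;
};

static struct toy_extent store[] = {
	{ .file_offset = 0,    .len = 4096, .refs = 1 },
	{ .file_offset = 8192, .len = 4096, .refs = 1 },
};

/* Lookup returns the first overlapping extent with an extra reference held. */
static struct toy_extent *lookup_range(unsigned long long off,
				       unsigned long long len)
{
	for (size_t i = 0; i < sizeof(store) / sizeof(store[0]); i++) {
		struct toy_extent *e = &store[i];

		if (e->file_offset < off + len &&
		    off < e->file_offset + e->len) {
			e->refs++;
			return e;
		}
	}
	return NULL;
}

static void put_extent(struct toy_extent *e)
{
	e->refs--;	/* the real code frees the extent when refs hits 0 */
}

/* Mirrors btrfs_have_ordered_extents_in_range(): lookup, put, report. */
static bool have_extents_in_range(unsigned long long off,
				  unsigned long long len)
{
	struct toy_extent *e = lookup_range(off, len);

	if (e) {
		put_extent(e);
		return true;
	}
	return false;
}

int main(void)
{
	printf("range [0, 4096):    %d\n", have_extents_in_range(0, 4096));
	printf("range [4096, 8192): %d\n", have_extents_in_range(4096, 4096));
	return 0;
}
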
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 3d6546581bb9..d5f1f033b7a0 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -34,6 +34,7 @@
#include "extent_io.h"
#include "qgroup.h"
+
/* TODO XXX FIXME
* - subvol delete -> delete when ref goes to 0? delete limits also?
* - reorganize keys
@@ -84,11 +85,42 @@ struct btrfs_qgroup {
/*
* temp variables for accounting operations
+	 * Refer to qgroup_shared_accounting() for details.
*/
u64 old_refcnt;
u64 new_refcnt;
};
+static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq,
+ int mod)
+{
+ if (qg->old_refcnt < seq)
+ qg->old_refcnt = seq;
+ qg->old_refcnt += mod;
+}
+
+static void btrfs_qgroup_update_new_refcnt(struct btrfs_qgroup *qg, u64 seq,
+ int mod)
+{
+ if (qg->new_refcnt < seq)
+ qg->new_refcnt = seq;
+ qg->new_refcnt += mod;
+}
+
+static inline u64 btrfs_qgroup_get_old_refcnt(struct btrfs_qgroup *qg, u64 seq)
+{
+ if (qg->old_refcnt < seq)
+ return 0;
+ return qg->old_refcnt - seq;
+}
+
+static inline u64 btrfs_qgroup_get_new_refcnt(struct btrfs_qgroup *qg, u64 seq)
+{
+ if (qg->new_refcnt < seq)
+ return 0;
+ return qg->new_refcnt - seq;
+}
+
/*
* glue structure to represent the relations between qgroups.
*/
@@ -1115,14 +1147,14 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
struct ulist *tmp;
int ret = 0;
- tmp = ulist_alloc(GFP_NOFS);
- if (!tmp)
- return -ENOMEM;
-
/* Check the level of src and dst first */
if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
return -EINVAL;
+ tmp = ulist_alloc(GFP_NOFS);
+ if (!tmp)
+ return -ENOMEM;
+
mutex_lock(&fs_info->qgroup_ioctl_lock);
quota_root = fs_info->quota_root;
if (!quota_root) {
@@ -1356,239 +1388,86 @@ out:
return ret;
}
-static int comp_oper_exist(struct btrfs_qgroup_operation *oper1,
- struct btrfs_qgroup_operation *oper2)
+int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
{
- /*
- * Ignore seq and type here, we're looking for any operation
- * at all related to this extent on that root.
- */
- if (oper1->bytenr < oper2->bytenr)
- return -1;
- if (oper1->bytenr > oper2->bytenr)
- return 1;
- if (oper1->ref_root < oper2->ref_root)
- return -1;
- if (oper1->ref_root > oper2->ref_root)
- return 1;
- return 0;
-}
+ struct btrfs_qgroup_extent_record *record;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct rb_node *node;
+ u64 qgroup_to_skip;
+ int ret = 0;
-static int qgroup_oper_exists(struct btrfs_fs_info *fs_info,
- struct btrfs_qgroup_operation *oper)
-{
- struct rb_node *n;
- struct btrfs_qgroup_operation *cur;
- int cmp;
+ delayed_refs = &trans->transaction->delayed_refs;
+ qgroup_to_skip = delayed_refs->qgroup_to_skip;
- spin_lock(&fs_info->qgroup_op_lock);
- n = fs_info->qgroup_op_tree.rb_node;
- while (n) {
- cur = rb_entry(n, struct btrfs_qgroup_operation, n);
- cmp = comp_oper_exist(cur, oper);
- if (cmp < 0) {
- n = n->rb_right;
- } else if (cmp) {
- n = n->rb_left;
- } else {
- spin_unlock(&fs_info->qgroup_op_lock);
- return -EEXIST;
- }
+ /*
+	 * No need to take the lock, since this function will only be called in
+	 * btrfs_commit_transaction().
+ */
+ node = rb_first(&delayed_refs->dirty_extent_root);
+ while (node) {
+ record = rb_entry(node, struct btrfs_qgroup_extent_record,
+ node);
+ ret = btrfs_find_all_roots(NULL, fs_info, record->bytenr, 0,
+ &record->old_roots);
+ if (ret < 0)
+ break;
+ if (qgroup_to_skip)
+ ulist_del(record->old_roots, qgroup_to_skip, 0);
+ node = rb_next(node);
}
- spin_unlock(&fs_info->qgroup_op_lock);
- return 0;
-}
-
-static int comp_oper(struct btrfs_qgroup_operation *oper1,
- struct btrfs_qgroup_operation *oper2)
-{
- if (oper1->bytenr < oper2->bytenr)
- return -1;
- if (oper1->bytenr > oper2->bytenr)
- return 1;
- if (oper1->ref_root < oper2->ref_root)
- return -1;
- if (oper1->ref_root > oper2->ref_root)
- return 1;
- if (oper1->seq < oper2->seq)
- return -1;
- if (oper1->seq > oper2->seq)
- return 1;
- if (oper1->type < oper2->type)
- return -1;
- if (oper1->type > oper2->type)
- return 1;
- return 0;
+ return ret;
}
-static int insert_qgroup_oper(struct btrfs_fs_info *fs_info,
- struct btrfs_qgroup_operation *oper)
+struct btrfs_qgroup_extent_record
+*btrfs_qgroup_insert_dirty_extent(struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_qgroup_extent_record *record)
{
- struct rb_node **p;
- struct rb_node *parent = NULL;
- struct btrfs_qgroup_operation *cur;
- int cmp;
+ struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node;
+ struct rb_node *parent_node = NULL;
+ struct btrfs_qgroup_extent_record *entry;
+ u64 bytenr = record->bytenr;
- spin_lock(&fs_info->qgroup_op_lock);
- p = &fs_info->qgroup_op_tree.rb_node;
while (*p) {
- parent = *p;
- cur = rb_entry(parent, struct btrfs_qgroup_operation, n);
- cmp = comp_oper(cur, oper);
- if (cmp < 0) {
- p = &(*p)->rb_right;
- } else if (cmp) {
+ parent_node = *p;
+ entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record,
+ node);
+ if (bytenr < entry->bytenr)
p = &(*p)->rb_left;
- } else {
- spin_unlock(&fs_info->qgroup_op_lock);
- return -EEXIST;
- }
- }
- rb_link_node(&oper->n, parent, p);
- rb_insert_color(&oper->n, &fs_info->qgroup_op_tree);
- spin_unlock(&fs_info->qgroup_op_lock);
- return 0;
-}
-
-/*
- * Record a quota operation for processing later on.
- * @trans: the transaction we are adding the delayed op to.
- * @fs_info: the fs_info for this fs.
- * @ref_root: the root of the reference we are acting on,
- * @bytenr: the bytenr we are acting on.
- * @num_bytes: the number of bytes in the reference.
- * @type: the type of operation this is.
- * @mod_seq: do we need to get a sequence number for looking up roots.
- *
- * We just add it to our trans qgroup_ref_list and carry on and process these
- * operations in order at some later point. If the reference root isn't a fs
- * root then we don't bother with doing anything.
- *
- * MUST BE HOLDING THE REF LOCK.
- */
-int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 ref_root,
- u64 bytenr, u64 num_bytes,
- enum btrfs_qgroup_operation_type type, int mod_seq)
-{
- struct btrfs_qgroup_operation *oper;
- int ret;
-
- if (!is_fstree(ref_root) || !fs_info->quota_enabled)
- return 0;
-
- oper = kmalloc(sizeof(*oper), GFP_NOFS);
- if (!oper)
- return -ENOMEM;
-
- oper->ref_root = ref_root;
- oper->bytenr = bytenr;
- oper->num_bytes = num_bytes;
- oper->type = type;
- oper->seq = atomic_inc_return(&fs_info->qgroup_op_seq);
- INIT_LIST_HEAD(&oper->elem.list);
- oper->elem.seq = 0;
-
- trace_btrfs_qgroup_record_ref(oper);
-
- if (type == BTRFS_QGROUP_OPER_SUB_SUBTREE) {
- /*
- * If any operation for this bytenr/ref_root combo
- * exists, then we know it's not exclusively owned and
- * shouldn't be queued up.
- *
- * This also catches the case where we have a cloned
- * extent that gets queued up multiple times during
- * drop snapshot.
- */
- if (qgroup_oper_exists(fs_info, oper)) {
- kfree(oper);
- return 0;
- }
- }
-
- ret = insert_qgroup_oper(fs_info, oper);
- if (ret) {
- /* Shouldn't happen so have an assert for developers */
- ASSERT(0);
- kfree(oper);
- return ret;
+ else if (bytenr > entry->bytenr)
+ p = &(*p)->rb_right;
+ else
+ return entry;
}
- list_add_tail(&oper->list, &trans->qgroup_ref_list);
- if (mod_seq)
- btrfs_get_tree_mod_seq(fs_info, &oper->elem);
-
- return 0;
-}
-
-static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
- struct btrfs_qgroup_operation *oper)
-{
- struct ulist *tmp;
- int sign = 0;
- int ret = 0;
-
- tmp = ulist_alloc(GFP_NOFS);
- if (!tmp)
- return -ENOMEM;
-
- spin_lock(&fs_info->qgroup_lock);
- if (!fs_info->quota_root)
- goto out;
-
- switch (oper->type) {
- case BTRFS_QGROUP_OPER_ADD_EXCL:
- sign = 1;
- break;
- case BTRFS_QGROUP_OPER_SUB_EXCL:
- sign = -1;
- break;
- default:
- ASSERT(0);
- }
- ret = __qgroup_excl_accounting(fs_info, tmp, oper->ref_root,
- oper->num_bytes, sign);
-out:
- spin_unlock(&fs_info->qgroup_lock);
- ulist_free(tmp);
- return ret;
+ rb_link_node(&record->node, parent_node, p);
+ rb_insert_color(&record->node, &delayed_refs->dirty_extent_root);
+ return NULL;
}
+#define UPDATE_NEW 0
+#define UPDATE_OLD 1
/*
- * Walk all of the roots that pointed to our bytenr and adjust their refcnts as
- * properly.
+ * Walk all of the roots that point to the bytenr and adjust their refcnts.
*/
-static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info,
- u64 root_to_skip, struct ulist *tmp,
- struct ulist *roots, struct ulist *qgroups,
- u64 seq, int *old_roots, int rescan)
+static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
+ struct ulist *roots, struct ulist *tmp,
+ struct ulist *qgroups, u64 seq, int update_old)
{
struct ulist_node *unode;
struct ulist_iterator uiter;
struct ulist_node *tmp_unode;
struct ulist_iterator tmp_uiter;
struct btrfs_qgroup *qg;
- int ret;
+ int ret = 0;
+ if (!roots)
+ return 0;
ULIST_ITER_INIT(&uiter);
while ((unode = ulist_next(roots, &uiter))) {
- /* We don't count our current root here */
- if (unode->val == root_to_skip)
- continue;
qg = find_qgroup_rb(fs_info, unode->val);
if (!qg)
continue;
- /*
- * We could have a pending removal of this same ref so we may
- * not have actually found our ref root when doing
- * btrfs_find_all_roots, so we need to keep track of how many
- * old roots we find in case we removed ours and added a
- * different one at the same time. I don't think this could
- * happen in practice but that sort of thinking leads to pain
- * and suffering and to the dark side.
- */
- (*old_roots)++;
ulist_reinit(tmp);
ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg),
@@ -1603,29 +1482,10 @@ static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup_list *glist;
qg = u64_to_ptr(tmp_unode->aux);
- /*
- * We use this sequence number to keep from having to
- * run the whole list and 0 out the refcnt every time.
- * We basically use sequnce as the known 0 count and
- * then add 1 everytime we see a qgroup. This is how we
- * get how many of the roots actually point up to the
- * upper level qgroups in order to determine exclusive
- * counts.
- *
- * For rescan we want to set old_refcnt to seq so our
- * exclusive calculations end up correct.
- */
- if (rescan)
- qg->old_refcnt = seq;
- else if (qg->old_refcnt < seq)
- qg->old_refcnt = seq + 1;
+ if (update_old)
+ btrfs_qgroup_update_old_refcnt(qg, seq, 1);
else
- qg->old_refcnt++;
-
- if (qg->new_refcnt < seq)
- qg->new_refcnt = seq + 1;
- else
- qg->new_refcnt++;
+ btrfs_qgroup_update_new_refcnt(qg, seq, 1);
list_for_each_entry(glist, &qg->groups, next_group) {
ret = ulist_add(qgroups, glist->group->qgroupid,
ptr_to_u64(glist->group),
@@ -1644,161 +1504,46 @@ static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info,
}
/*
- * We need to walk forward in our operation tree and account for any roots that
- * were deleted after we made this operation.
- */
-static int qgroup_account_deleted_refs(struct btrfs_fs_info *fs_info,
- struct btrfs_qgroup_operation *oper,
- struct ulist *tmp,
- struct ulist *qgroups, u64 seq,
- int *old_roots)
-{
- struct ulist_node *unode;
- struct ulist_iterator uiter;
- struct btrfs_qgroup *qg;
- struct btrfs_qgroup_operation *tmp_oper;
- struct rb_node *n;
- int ret;
-
- ulist_reinit(tmp);
-
- /*
- * We only walk forward in the tree since we're only interested in
- * removals that happened _after_ our operation.
- */
- spin_lock(&fs_info->qgroup_op_lock);
- n = rb_next(&oper->n);
- spin_unlock(&fs_info->qgroup_op_lock);
- if (!n)
- return 0;
- tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n);
- while (tmp_oper->bytenr == oper->bytenr) {
- /*
- * If it's not a removal we don't care, additions work out
- * properly with our refcnt tracking.
- */
- if (tmp_oper->type != BTRFS_QGROUP_OPER_SUB_SHARED &&
- tmp_oper->type != BTRFS_QGROUP_OPER_SUB_EXCL)
- goto next;
- qg = find_qgroup_rb(fs_info, tmp_oper->ref_root);
- if (!qg)
- goto next;
- ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg),
- GFP_ATOMIC);
- if (ret) {
- if (ret < 0)
- return ret;
- /*
- * We only want to increase old_roots if this qgroup is
- * not already in the list of qgroups. If it is already
- * there then that means it must have been re-added or
- * the delete will be discarded because we had an
- * existing ref that we haven't looked up yet. In this
- * case we don't want to increase old_roots. So if ret
- * == 1 then we know that this is the first time we've
- * seen this qgroup and we can bump the old_roots.
- */
- (*old_roots)++;
- ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg),
- GFP_ATOMIC);
- if (ret < 0)
- return ret;
- }
-next:
- spin_lock(&fs_info->qgroup_op_lock);
- n = rb_next(&tmp_oper->n);
- spin_unlock(&fs_info->qgroup_op_lock);
- if (!n)
- break;
- tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n);
- }
-
- /* Ok now process the qgroups we found */
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(tmp, &uiter))) {
- struct btrfs_qgroup_list *glist;
-
- qg = u64_to_ptr(unode->aux);
- if (qg->old_refcnt < seq)
- qg->old_refcnt = seq + 1;
- else
- qg->old_refcnt++;
- if (qg->new_refcnt < seq)
- qg->new_refcnt = seq + 1;
- else
- qg->new_refcnt++;
- list_for_each_entry(glist, &qg->groups, next_group) {
- ret = ulist_add(qgroups, glist->group->qgroupid,
- ptr_to_u64(glist->group), GFP_ATOMIC);
- if (ret < 0)
- return ret;
- ret = ulist_add(tmp, glist->group->qgroupid,
- ptr_to_u64(glist->group), GFP_ATOMIC);
- if (ret < 0)
- return ret;
- }
- }
- return 0;
-}
-
-/* Add refcnt for the newly added reference. */
-static int qgroup_calc_new_refcnt(struct btrfs_fs_info *fs_info,
- struct btrfs_qgroup_operation *oper,
- struct btrfs_qgroup *qgroup,
- struct ulist *tmp, struct ulist *qgroups,
- u64 seq)
-{
- struct ulist_node *unode;
- struct ulist_iterator uiter;
- struct btrfs_qgroup *qg;
- int ret;
-
- ulist_reinit(tmp);
- ret = ulist_add(qgroups, qgroup->qgroupid, ptr_to_u64(qgroup),
- GFP_ATOMIC);
- if (ret < 0)
- return ret;
- ret = ulist_add(tmp, qgroup->qgroupid, ptr_to_u64(qgroup),
- GFP_ATOMIC);
- if (ret < 0)
- return ret;
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(tmp, &uiter))) {
- struct btrfs_qgroup_list *glist;
-
- qg = u64_to_ptr(unode->aux);
- if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) {
- if (qg->new_refcnt < seq)
- qg->new_refcnt = seq + 1;
- else
- qg->new_refcnt++;
- } else {
- if (qg->old_refcnt < seq)
- qg->old_refcnt = seq + 1;
- else
- qg->old_refcnt++;
- }
- list_for_each_entry(glist, &qg->groups, next_group) {
- ret = ulist_add(tmp, glist->group->qgroupid,
- ptr_to_u64(glist->group), GFP_ATOMIC);
- if (ret < 0)
- return ret;
- ret = ulist_add(qgroups, glist->group->qgroupid,
- ptr_to_u64(glist->group), GFP_ATOMIC);
- if (ret < 0)
- return ret;
- }
- }
- return 0;
-}
-
-/*
- * This adjusts the counters for all referenced qgroups if need be.
+ * Update qgroup rfer/excl counters.
+ * Rfer update is easy; the code can explain itself.
+ *
+ * Excl update is tricky; the update is split into 2 parts.
+ * Part 1: Possible exclusive <-> shared detection:
+ * | A | !A |
+ * -------------------------------------
+ * B | * | - |
+ * -------------------------------------
+ * !B | + | ** |
+ * -------------------------------------
+ *
+ * Conditions:
+ * A: cur_old_roots < nr_old_roots (not exclusive before)
+ * !A: cur_old_roots == nr_old_roots (possible exclusive before)
+ * B: cur_new_roots < nr_new_roots (not exclusive now)
+ * !B:	cur_new_roots == nr_new_roots	(possible exclusive now)
+ *
+ * Results:
+ * +: Possible sharing -> exclusive -: Possible exclusive -> sharing
+ * *: Definitely not changed.		**: Possibly unchanged.
+ *
+ * For the !A and !B conditions, the exception is the cur_old/new_roots == 0
+ * case.
+ *
+ * To make the logic clear, we first use conditions A and B to split the
+ * combinations into 4 results.
+ *
+ * Then, for results "+" and "-", check the old/new_roots == 0 case, as only
+ * one of the two counts may be 0 there.
+ *
+ * Lastly, check result **; since both counts may be 0 there, split it
+ * again (2x2).
+ * But this time nothing else needs to be considered, and the code and logic
+ * are easy to understand now.
*/
-static int qgroup_adjust_counters(struct btrfs_fs_info *fs_info,
- u64 root_to_skip, u64 num_bytes,
- struct ulist *qgroups, u64 seq,
- int old_roots, int new_roots, int rescan)
+static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
+ struct ulist *qgroups,
+ u64 nr_old_roots,
+ u64 nr_new_roots,
+ u64 num_bytes, u64 seq)
{
struct ulist_node *unode;
struct ulist_iterator uiter;
@@ -1810,423 +1555,191 @@ static int qgroup_adjust_counters(struct btrfs_fs_info *fs_info,
bool dirty = false;
qg = u64_to_ptr(unode->aux);
- /*
- * Wasn't referenced before but is now, add to the reference
- * counters.
- */
- if (qg->old_refcnt <= seq && qg->new_refcnt > seq) {
+ cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq);
+ cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq);
+
+ /* Rfer update part */
+ if (cur_old_count == 0 && cur_new_count > 0) {
qg->rfer += num_bytes;
qg->rfer_cmpr += num_bytes;
dirty = true;
}
-
- /*
- * Was referenced before but isn't now, subtract from the
- * reference counters.
- */
- if (qg->old_refcnt > seq && qg->new_refcnt <= seq) {
+ if (cur_old_count > 0 && cur_new_count == 0) {
qg->rfer -= num_bytes;
qg->rfer_cmpr -= num_bytes;
dirty = true;
}
- if (qg->old_refcnt < seq)
- cur_old_count = 0;
- else
- cur_old_count = qg->old_refcnt - seq;
- if (qg->new_refcnt < seq)
- cur_new_count = 0;
- else
- cur_new_count = qg->new_refcnt - seq;
-
- /*
- * If our refcount was the same as the roots previously but our
- * new count isn't the same as the number of roots now then we
- * went from having a exclusive reference on this range to not.
- */
- if (old_roots && cur_old_count == old_roots &&
- (cur_new_count != new_roots || new_roots == 0)) {
- WARN_ON(cur_new_count != new_roots && new_roots == 0);
- qg->excl -= num_bytes;
- qg->excl_cmpr -= num_bytes;
- dirty = true;
+ /* Excl update part */
+ /* Exclusive/none -> shared case */
+ if (cur_old_count == nr_old_roots &&
+ cur_new_count < nr_new_roots) {
+ /* Exclusive -> shared */
+ if (cur_old_count != 0) {
+ qg->excl -= num_bytes;
+ qg->excl_cmpr -= num_bytes;
+ dirty = true;
+ }
}
- /*
- * If we didn't reference all the roots before but now we do we
- * have an exclusive reference to this range.
- */
- if ((!old_roots || (old_roots && cur_old_count != old_roots))
- && cur_new_count == new_roots) {
- qg->excl += num_bytes;
- qg->excl_cmpr += num_bytes;
- dirty = true;
+ /* Shared -> exclusive/none case */
+ if (cur_old_count < nr_old_roots &&
+ cur_new_count == nr_new_roots) {
+ /* Shared->exclusive */
+ if (cur_new_count != 0) {
+ qg->excl += num_bytes;
+ qg->excl_cmpr += num_bytes;
+ dirty = true;
+ }
}
+ /* Exclusive/none -> exclusive/none case */
+ if (cur_old_count == nr_old_roots &&
+ cur_new_count == nr_new_roots) {
+ if (cur_old_count == 0) {
+ /* None -> exclusive/none */
+
+ if (cur_new_count != 0) {
+ /* None -> exclusive */
+ qg->excl += num_bytes;
+ qg->excl_cmpr += num_bytes;
+ dirty = true;
+ }
+ /* None -> none, nothing changed */
+ } else {
+ /* Exclusive -> exclusive/none */
+
+ if (cur_new_count == 0) {
+ /* Exclusive -> none */
+ qg->excl -= num_bytes;
+ qg->excl_cmpr -= num_bytes;
+ dirty = true;
+ }
+ /* Exclusive -> exclusive, nothing changed */
+ }
+ }
if (dirty)
qgroup_dirty(fs_info, qg);
}
return 0;
}
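
To make the table above concrete, here is a condensed, self-contained sketch of the same decision logic (illustrative only, not part of the patch; the helper name excl_delta is invented). It returns the amount by which qg->excl would change for one qgroup:

/* Hypothetical helper mirroring qgroup_update_counters()'s excl logic. */
static s64 excl_delta(u64 cur_old_count, u64 cur_new_count,
		      u64 nr_old_roots, u64 nr_new_roots, u64 num_bytes)
{
	/* "-": possible exclusive -> shared */
	if (cur_old_count == nr_old_roots && cur_new_count < nr_new_roots)
		return cur_old_count ? -(s64)num_bytes : 0;
	/* "+": possible shared -> exclusive */
	if (cur_old_count < nr_old_roots && cur_new_count == nr_new_roots)
		return cur_new_count ? (s64)num_bytes : 0;
	/* "**": possibly unchanged, only none <-> exclusive transitions matter */
	if (cur_old_count == nr_old_roots && cur_new_count == nr_new_roots) {
		if (cur_old_count == 0 && cur_new_count != 0)
			return (s64)num_bytes;		/* none -> exclusive */
		if (cur_old_count != 0 && cur_new_count == 0)
			return -(s64)num_bytes;		/* exclusive -> none */
	}
	return 0;					/* "*": not changed */
}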
-/*
- * If we removed a data extent and there were other references for that bytenr
- * then we need to lookup all referenced roots to make sure we still don't
- * reference this bytenr. If we do then we can just discard this operation.
- */
-static int check_existing_refs(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
- struct btrfs_qgroup_operation *oper)
-{
- struct ulist *roots = NULL;
- struct ulist_node *unode;
- struct ulist_iterator uiter;
- int ret = 0;
-
- ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr,
- oper->elem.seq, &roots);
- if (ret < 0)
- return ret;
- ret = 0;
-
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(roots, &uiter))) {
- if (unode->val == oper->ref_root) {
- ret = 1;
- break;
- }
- }
- ulist_free(roots);
- btrfs_put_tree_mod_seq(fs_info, &oper->elem);
-
- return ret;
-}
-
-/*
- * If we share a reference across multiple roots then we may need to adjust
- * various qgroups referenced and exclusive counters. The basic premise is this
- *
- * 1) We have seq to represent a 0 count. Instead of looping through all of the
- * qgroups and resetting their refcount to 0 we just constantly bump this
- * sequence number to act as the base reference count. This means that if
- * anybody is equal to or below this sequence they were never referenced. We
- * jack this sequence up by the number of roots we found each time in order to
- * make sure we don't have any overlap.
- *
- * 2) We first search all the roots that reference the area _except_ the root
- * we're acting on currently. This makes up the old_refcnt of all the qgroups
- * before.
- *
- * 3) We walk all of the qgroups referenced by the root we are currently acting
- * on, and will either adjust old_refcnt in the case of a removal or the
- * new_refcnt in the case of an addition.
- *
- * 4) Finally we walk all the qgroups that are referenced by this range
- * including the root we are acting on currently. We will adjust the counters
- * based on the number of roots we had and will have after this operation.
- *
- * Take this example as an illustration
- *
- * [qgroup 1/0]
- * / | \
- * [qg 0/0] [qg 0/1] [qg 0/2]
- * \ | /
- * [ extent ]
- *
- * Say we are adding a reference that is covered by qg 0/0. The first step
- * would give a refcnt of 1 to qg 0/1 and 0/2 and a refcnt of 2 to qg 1/0 with
- * old_roots being 2. Because it is adding new_roots will be 1. We then go
- * through qg 0/0 which will get the new_refcnt set to 1 and add 1 to qg 1/0's
- * new_refcnt, bringing it to 3. We then walk through all of the qgroups, we
- * notice that the old refcnt for qg 0/0 < the new refcnt, so we added a
- * reference and thus must add the size to the referenced bytes. Everything
- * else is the same so nothing else changes.
- */
-static int qgroup_shared_accounting(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
- struct btrfs_qgroup_operation *oper)
+int
+btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ u64 bytenr, u64 num_bytes,
+ struct ulist *old_roots, struct ulist *new_roots)
{
- struct ulist *roots = NULL;
- struct ulist *qgroups, *tmp;
- struct btrfs_qgroup *qgroup;
- struct seq_list elem = SEQ_LIST_INIT(elem);
+ struct ulist *qgroups = NULL;
+ struct ulist *tmp = NULL;
u64 seq;
- int old_roots = 0;
- int new_roots = 0;
+ u64 nr_new_roots = 0;
+ u64 nr_old_roots = 0;
int ret = 0;
- if (oper->elem.seq) {
- ret = check_existing_refs(trans, fs_info, oper);
- if (ret < 0)
- return ret;
- if (ret)
- return 0;
- }
+ if (new_roots)
+ nr_new_roots = new_roots->nnodes;
+ if (old_roots)
+ nr_old_roots = old_roots->nnodes;
- qgroups = ulist_alloc(GFP_NOFS);
- if (!qgroups)
- return -ENOMEM;
+ if (!fs_info->quota_enabled)
+ goto out_free;
+ BUG_ON(!fs_info->quota_root);
+ qgroups = ulist_alloc(GFP_NOFS);
+ if (!qgroups) {
+ ret = -ENOMEM;
+ goto out_free;
+ }
tmp = ulist_alloc(GFP_NOFS);
if (!tmp) {
- ulist_free(qgroups);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out_free;
}
- btrfs_get_tree_mod_seq(fs_info, &elem);
- ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, elem.seq,
- &roots);
- btrfs_put_tree_mod_seq(fs_info, &elem);
- if (ret < 0) {
- ulist_free(qgroups);
- ulist_free(tmp);
- return ret;
+ mutex_lock(&fs_info->qgroup_rescan_lock);
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
+ if (fs_info->qgroup_rescan_progress.objectid <= bytenr) {
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+ ret = 0;
+ goto out_free;
+ }
}
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+
spin_lock(&fs_info->qgroup_lock);
- qgroup = find_qgroup_rb(fs_info, oper->ref_root);
- if (!qgroup)
- goto out;
seq = fs_info->qgroup_seq;
- /*
- * So roots is the list of all the roots currently pointing at the
- * bytenr, including the ref we are adding if we are adding, or not if
- * we are removing a ref. So we pass in the ref_root to skip that root
- * in our calculations. We set old_refnct and new_refcnt cause who the
- * hell knows what everything looked like before, and it doesn't matter
- * except...
- */
- ret = qgroup_calc_old_refcnt(fs_info, oper->ref_root, tmp, roots, qgroups,
- seq, &old_roots, 0);
+ /* Update old refcnts using old_roots */
+ ret = qgroup_update_refcnt(fs_info, old_roots, tmp, qgroups, seq,
+ UPDATE_OLD);
if (ret < 0)
goto out;
- /*
- * Now adjust the refcounts of the qgroups that care about this
- * reference, either the old_count in the case of removal or new_count
- * in the case of an addition.
- */
- ret = qgroup_calc_new_refcnt(fs_info, oper, qgroup, tmp, qgroups,
- seq);
+ /* Update new refcnts using new_roots */
+ ret = qgroup_update_refcnt(fs_info, new_roots, tmp, qgroups, seq,
+ UPDATE_NEW);
if (ret < 0)
goto out;
- /*
- * ...in the case of removals. If we had a removal before we got around
- * to processing this operation then we need to find that guy and count
- * his references as if they really existed so we don't end up screwing
- * up the exclusive counts. Then whenever we go to process the delete
- * everything will be grand and we can account for whatever exclusive
- * changes need to be made there. We also have to pass in old_roots so
- * we have an accurate count of the roots as it pertains to this
- * operations view of the world.
- */
- ret = qgroup_account_deleted_refs(fs_info, oper, tmp, qgroups, seq,
- &old_roots);
- if (ret < 0)
- goto out;
+ qgroup_update_counters(fs_info, qgroups, nr_old_roots, nr_new_roots,
+ num_bytes, seq);
/*
- * We are adding our root, need to adjust up the number of roots,
- * otherwise old_roots is the number of roots we want.
+	 * Bump qgroup_seq to avoid seq overlap.
*/
- if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) {
- new_roots = old_roots + 1;
- } else {
- new_roots = old_roots;
- old_roots++;
- }
- fs_info->qgroup_seq += old_roots + 1;
-
-
- /*
- * And now the magic happens, bless Arne for having a pretty elegant
- * solution for this.
- */
- qgroup_adjust_counters(fs_info, oper->ref_root, oper->num_bytes,
- qgroups, seq, old_roots, new_roots, 0);
+ fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1;
out:
spin_unlock(&fs_info->qgroup_lock);
- ulist_free(qgroups);
- ulist_free(roots);
+out_free:
ulist_free(tmp);
+ ulist_free(qgroups);
+ ulist_free(old_roots);
+ ulist_free(new_roots);
return ret;
}
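
A minimal caller sketch for the new entry point (bytenr, num_bytes and old_seq are hypothetical caller state, not names from this patch). One point worth noting from the code above: btrfs_qgroup_account_extent() frees both old_roots and new_roots on every path, including the early quota-disabled return and the error paths, so the caller must not free them again:

struct ulist *old_roots = NULL;
struct ulist *new_roots = NULL;
int ret;

ret = btrfs_find_all_roots(trans, fs_info, bytenr, old_seq, &old_roots);
if (ret < 0)
	return ret;
ret = btrfs_find_all_roots(trans, fs_info, bytenr, (u64)-1, &new_roots);
if (ret < 0) {
	ulist_free(old_roots);
	return ret;
}
/* Both ulists are consumed here, even if this call fails. */
ret = btrfs_qgroup_account_extent(trans, fs_info, bytenr, num_bytes,
				  old_roots, new_roots);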
-/*
- * Process a reference to a shared subtree. This type of operation is
- * queued during snapshot removal when we encounter extents which are
- * shared between more than one root.
- */
-static int qgroup_subtree_accounting(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
- struct btrfs_qgroup_operation *oper)
-{
- struct ulist *roots = NULL;
- struct ulist_node *unode;
- struct ulist_iterator uiter;
- struct btrfs_qgroup_list *glist;
- struct ulist *parents;
- int ret = 0;
- int err;
- struct btrfs_qgroup *qg;
- u64 root_obj = 0;
- struct seq_list elem = SEQ_LIST_INIT(elem);
-
- parents = ulist_alloc(GFP_NOFS);
- if (!parents)
- return -ENOMEM;
-
- btrfs_get_tree_mod_seq(fs_info, &elem);
- ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr,
- elem.seq, &roots);
- btrfs_put_tree_mod_seq(fs_info, &elem);
- if (ret < 0)
- goto out;
-
- if (roots->nnodes != 1)
- goto out;
-
- ULIST_ITER_INIT(&uiter);
- unode = ulist_next(roots, &uiter); /* Only want 1 so no need to loop */
- /*
- * If we find our ref root then that means all refs
- * this extent has to the root have not yet been
- * deleted. In that case, we do nothing and let the
- * last ref for this bytenr drive our update.
- *
- * This can happen for example if an extent is
- * referenced multiple times in a snapshot (clone,
- * etc). If we are in the middle of snapshot removal,
- * queued updates for such an extent will find the
- * root if we have not yet finished removing the
- * snapshot.
- */
- if (unode->val == oper->ref_root)
- goto out;
-
- root_obj = unode->val;
- BUG_ON(!root_obj);
-
- spin_lock(&fs_info->qgroup_lock);
- qg = find_qgroup_rb(fs_info, root_obj);
- if (!qg)
- goto out_unlock;
-
- qg->excl += oper->num_bytes;
- qg->excl_cmpr += oper->num_bytes;
- qgroup_dirty(fs_info, qg);
-
- /*
- * Adjust counts for parent groups. First we find all
- * parents, then in the 2nd loop we do the adjustment
- * while adding parents of the parents to our ulist.
- */
- list_for_each_entry(glist, &qg->groups, next_group) {
- err = ulist_add(parents, glist->group->qgroupid,
- ptr_to_u64(glist->group), GFP_ATOMIC);
- if (err < 0) {
- ret = err;
- goto out_unlock;
- }
- }
-
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(parents, &uiter))) {
- qg = u64_to_ptr(unode->aux);
- qg->excl += oper->num_bytes;
- qg->excl_cmpr += oper->num_bytes;
- qgroup_dirty(fs_info, qg);
-
- /* Add any parents of the parents */
- list_for_each_entry(glist, &qg->groups, next_group) {
- err = ulist_add(parents, glist->group->qgroupid,
- ptr_to_u64(glist->group), GFP_ATOMIC);
- if (err < 0) {
- ret = err;
- goto out_unlock;
- }
- }
- }
-
-out_unlock:
- spin_unlock(&fs_info->qgroup_lock);
-
-out:
- ulist_free(roots);
- ulist_free(parents);
- return ret;
-}
-
-/*
- * btrfs_qgroup_account_ref is called for every ref that is added to or deleted
- * from the fs. First, all roots referencing the extent are searched, and
- * then the space is accounted accordingly to the different roots. The
- * accounting algorithm works in 3 steps documented inline.
- */
-static int btrfs_qgroup_account(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
- struct btrfs_qgroup_operation *oper)
+int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
{
+ struct btrfs_qgroup_extent_record *record;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct ulist *new_roots = NULL;
+ struct rb_node *node;
+ u64 qgroup_to_skip;
int ret = 0;
- if (!fs_info->quota_enabled)
- return 0;
-
- BUG_ON(!fs_info->quota_root);
+ delayed_refs = &trans->transaction->delayed_refs;
+ qgroup_to_skip = delayed_refs->qgroup_to_skip;
+ while ((node = rb_first(&delayed_refs->dirty_extent_root))) {
+ record = rb_entry(node, struct btrfs_qgroup_extent_record,
+ node);
- mutex_lock(&fs_info->qgroup_rescan_lock);
- if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
- if (fs_info->qgroup_rescan_progress.objectid <= oper->bytenr) {
- mutex_unlock(&fs_info->qgroup_rescan_lock);
- return 0;
+ if (!ret) {
+ /*
+ * Use (u64)-1 as time_seq to do special search, which
+			 * doesn't lock the tree or delayed_refs and searches the current
+ * root. It's safe inside commit_transaction().
+ */
+ ret = btrfs_find_all_roots(trans, fs_info,
+ record->bytenr, (u64)-1, &new_roots);
+ if (ret < 0)
+ goto cleanup;
+ if (qgroup_to_skip)
+ ulist_del(new_roots, qgroup_to_skip, 0);
+ ret = btrfs_qgroup_account_extent(trans, fs_info,
+ record->bytenr, record->num_bytes,
+ record->old_roots, new_roots);
+ record->old_roots = NULL;
+ new_roots = NULL;
}
- }
- mutex_unlock(&fs_info->qgroup_rescan_lock);
+cleanup:
+ ulist_free(record->old_roots);
+ ulist_free(new_roots);
+ new_roots = NULL;
+ rb_erase(node, &delayed_refs->dirty_extent_root);
+ kfree(record);
- ASSERT(is_fstree(oper->ref_root));
-
- trace_btrfs_qgroup_account(oper);
-
- switch (oper->type) {
- case BTRFS_QGROUP_OPER_ADD_EXCL:
- case BTRFS_QGROUP_OPER_SUB_EXCL:
- ret = qgroup_excl_accounting(fs_info, oper);
- break;
- case BTRFS_QGROUP_OPER_ADD_SHARED:
- case BTRFS_QGROUP_OPER_SUB_SHARED:
- ret = qgroup_shared_accounting(trans, fs_info, oper);
- break;
- case BTRFS_QGROUP_OPER_SUB_SUBTREE:
- ret = qgroup_subtree_accounting(trans, fs_info, oper);
- break;
- default:
- ASSERT(0);
- }
- return ret;
-}
-
-/*
- * Needs to be called everytime we run delayed refs, even if there is an error
- * in order to cleanup outstanding operations.
- */
-int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
-{
- struct btrfs_qgroup_operation *oper;
- int ret = 0;
-
- while (!list_empty(&trans->qgroup_ref_list)) {
- oper = list_first_entry(&trans->qgroup_ref_list,
- struct btrfs_qgroup_operation, list);
- list_del_init(&oper->list);
- if (!ret || !trans->aborted)
- ret = btrfs_qgroup_account(trans, fs_info, oper);
- spin_lock(&fs_info->qgroup_op_lock);
- rb_erase(&oper->n, &fs_info->qgroup_op_tree);
- spin_unlock(&fs_info->qgroup_op_lock);
- btrfs_put_tree_mod_seq(fs_info, &oper->elem);
- kfree(oper);
}
return ret;
}
@@ -2637,15 +2150,13 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
*/
static int
qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
- struct btrfs_trans_handle *trans, struct ulist *qgroups,
- struct ulist *tmp, struct extent_buffer *scratch_leaf)
+ struct btrfs_trans_handle *trans,
+ struct extent_buffer *scratch_leaf)
{
struct btrfs_key found;
struct ulist *roots = NULL;
struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
u64 num_bytes;
- u64 seq;
- int new_roots;
int slot;
int ret;
@@ -2695,33 +2206,15 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
else
num_bytes = found.offset;
- ulist_reinit(qgroups);
ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0,
&roots);
if (ret < 0)
goto out;
- spin_lock(&fs_info->qgroup_lock);
- seq = fs_info->qgroup_seq;
- fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */
-
- new_roots = 0;
- ret = qgroup_calc_old_refcnt(fs_info, 0, tmp, roots, qgroups,
- seq, &new_roots, 1);
- if (ret < 0) {
- spin_unlock(&fs_info->qgroup_lock);
- ulist_free(roots);
- goto out;
- }
-
- ret = qgroup_adjust_counters(fs_info, 0, num_bytes, qgroups,
- seq, 0, new_roots, 1);
- if (ret < 0) {
- spin_unlock(&fs_info->qgroup_lock);
- ulist_free(roots);
+ /* For rescan, just pass old_roots as NULL */
+ ret = btrfs_qgroup_account_extent(trans, fs_info,
+ found.objectid, num_bytes, NULL, roots);
+ if (ret < 0)
goto out;
- }
- spin_unlock(&fs_info->qgroup_lock);
- ulist_free(roots);
}
out:
btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
@@ -2735,7 +2228,6 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
qgroup_rescan_work);
struct btrfs_path *path;
struct btrfs_trans_handle *trans = NULL;
- struct ulist *tmp = NULL, *qgroups = NULL;
struct extent_buffer *scratch_leaf = NULL;
int err = -ENOMEM;
int ret = 0;
@@ -2743,12 +2235,6 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
path = btrfs_alloc_path();
if (!path)
goto out;
- qgroups = ulist_alloc(GFP_NOFS);
- if (!qgroups)
- goto out;
- tmp = ulist_alloc(GFP_NOFS);
- if (!tmp)
- goto out;
scratch_leaf = kmalloc(sizeof(*scratch_leaf), GFP_NOFS);
if (!scratch_leaf)
goto out;
@@ -2764,7 +2250,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
err = -EINTR;
} else {
err = qgroup_rescan_leaf(fs_info, path, trans,
- qgroups, tmp, scratch_leaf);
+ scratch_leaf);
}
if (err > 0)
btrfs_commit_transaction(trans, fs_info->fs_root);
@@ -2774,8 +2260,6 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
out:
kfree(scratch_leaf);
- ulist_free(qgroups);
- ulist_free(tmp);
btrfs_free_path(path);
mutex_lock(&fs_info->qgroup_rescan_lock);
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index c5242aa9a4b2..6387dcfa354c 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -19,43 +19,18 @@
#ifndef __BTRFS_QGROUP__
#define __BTRFS_QGROUP__
+#include "ulist.h"
+#include "delayed-ref.h"
+
/*
- * A description of the operations, all of these operations only happen when we
- * are adding the 1st reference for that subvolume in the case of adding space
- * or on the last reference delete in the case of subtraction. The only
- * exception is the last one, which is added for confusion.
- *
- * BTRFS_QGROUP_OPER_ADD_EXCL: adding bytes where this subvolume is the only
- * one pointing at the bytes we are adding. This is called on the first
- * allocation.
- *
- * BTRFS_QGROUP_OPER_ADD_SHARED: adding bytes where this bytenr is going to be
- * shared between subvols. This is called on the creation of a ref that already
- * has refs from a different subvolume, so basically reflink.
- *
- * BTRFS_QGROUP_OPER_SUB_EXCL: removing bytes where this subvolume is the only
- * one referencing the range.
- *
- * BTRFS_QGROUP_OPER_SUB_SHARED: removing bytes where this subvolume shares with
- * refs with other subvolumes.
+ * Record a dirty extent, and inform qgroup to update the quota on it.
+ * TODO: Use kmem cache to alloc it.
*/
-enum btrfs_qgroup_operation_type {
- BTRFS_QGROUP_OPER_ADD_EXCL,
- BTRFS_QGROUP_OPER_ADD_SHARED,
- BTRFS_QGROUP_OPER_SUB_EXCL,
- BTRFS_QGROUP_OPER_SUB_SHARED,
- BTRFS_QGROUP_OPER_SUB_SUBTREE,
-};
-
-struct btrfs_qgroup_operation {
- u64 ref_root;
+struct btrfs_qgroup_extent_record {
+ struct rb_node node;
u64 bytenr;
u64 num_bytes;
- u64 seq;
- enum btrfs_qgroup_operation_type type;
- struct seq_list elem;
- struct rb_node n;
- struct list_head list;
+ struct ulist *old_roots;
};
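
A minimal usage sketch for this structure (illustrative only; bytenr, num_bytes and delayed_refs stand for caller-provided values, and any locking of delayed_refs is assumed to follow the existing call sites):

struct btrfs_qgroup_extent_record *record, *existing;

record = kmalloc(sizeof(*record), GFP_NOFS);
if (!record)
	return -ENOMEM;
record->bytenr = bytenr;
record->num_bytes = num_bytes;
record->old_roots = NULL;	/* may be attached later, before accounting */

existing = btrfs_qgroup_insert_dirty_extent(delayed_refs, record);
if (existing)
	kfree(record);		/* this bytenr is already being tracked */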
int btrfs_quota_enable(struct btrfs_trans_handle *trans,
@@ -79,16 +54,18 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info);
void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
struct btrfs_delayed_extent_op;
-int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 ref_root,
+int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
+struct btrfs_qgroup_extent_record
+*btrfs_qgroup_insert_dirty_extent(struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_qgroup_extent_record *record);
+int
+btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
u64 bytenr, u64 num_bytes,
- enum btrfs_qgroup_operation_type type,
- int mod_seq);
-int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info);
-void btrfs_remove_qgroup_operation(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
- struct btrfs_qgroup_operation *oper);
+ struct ulist *old_roots, struct ulist *new_roots);
+int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 74b24b01d574..827951fbf7fc 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1847,8 +1847,10 @@ again:
}
eb = read_tree_block(dest, old_bytenr, old_ptr_gen);
- if (!eb || !extent_buffer_uptodate(eb)) {
- ret = (!eb) ? -ENOMEM : -EIO;
+ if (IS_ERR(eb)) {
+ ret = PTR_ERR(eb);
+ } else if (!extent_buffer_uptodate(eb)) {
+ ret = -EIO;
free_extent_buffer(eb);
break;
}
@@ -2002,7 +2004,9 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
bytenr = btrfs_node_blockptr(eb, path->slots[i]);
eb = read_tree_block(root, bytenr, ptr_gen);
- if (!eb || !extent_buffer_uptodate(eb)) {
+ if (IS_ERR(eb)) {
+ return PTR_ERR(eb);
+ } else if (!extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
return -EIO;
}
@@ -2710,7 +2714,10 @@ static int do_relocation(struct btrfs_trans_handle *trans,
blocksize = root->nodesize;
generation = btrfs_node_ptr_generation(upper->eb, slot);
eb = read_tree_block(root, bytenr, generation);
- if (!eb || !extent_buffer_uptodate(eb)) {
+ if (IS_ERR(eb)) {
+ err = PTR_ERR(eb);
+ goto next;
+ } else if (!extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
err = -EIO;
goto next;
@@ -2873,7 +2880,9 @@ static int get_tree_block_key(struct reloc_control *rc,
BUG_ON(block->key_ready);
eb = read_tree_block(rc->extent_root, block->bytenr,
block->key.offset);
- if (!eb || !extent_buffer_uptodate(eb)) {
+ if (IS_ERR(eb)) {
+ return PTR_ERR(eb);
+ } else if (!extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
return -EIO;
}
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index ab5811545a98..9f2feabe99f2 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2662,18 +2662,30 @@ static void scrub_free_parity(struct scrub_parity *sparity)
kfree(sparity);
}
+static void scrub_parity_bio_endio_worker(struct btrfs_work *work)
+{
+ struct scrub_parity *sparity = container_of(work, struct scrub_parity,
+ work);
+ struct scrub_ctx *sctx = sparity->sctx;
+
+ scrub_free_parity(sparity);
+ scrub_pending_bio_dec(sctx);
+}
+
static void scrub_parity_bio_endio(struct bio *bio, int error)
{
struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
- struct scrub_ctx *sctx = sparity->sctx;
if (error)
bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
sparity->nsectors);
- scrub_free_parity(sparity);
- scrub_pending_bio_dec(sctx);
bio_put(bio);
+
+ btrfs_init_work(&sparity->work, btrfs_scrubparity_helper,
+ scrub_parity_bio_endio_worker, NULL, NULL);
+ btrfs_queue_work(sparity->sctx->dev_root->fs_info->scrub_parity_workers,
+ &sparity->work);
}
static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
@@ -3589,6 +3601,13 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
ret = -ENOMEM;
goto out;
}
+ fs_info->scrub_parity_workers =
+ btrfs_alloc_workqueue("btrfs-scrubparity", flags,
+ max_active, 2);
+ if (!fs_info->scrub_parity_workers) {
+ ret = -ENOMEM;
+ goto out;
+ }
}
++fs_info->scrub_workers_refcnt;
out:
@@ -3601,6 +3620,7 @@ static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
btrfs_destroy_workqueue(fs_info->scrub_workers);
btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
+ btrfs_destroy_workqueue(fs_info->scrub_parity_workers);
}
WARN_ON(fs_info->scrub_workers_refcnt < 0);
}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index a1216f9b4917..aa72bfd28f7d 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -243,6 +243,7 @@ struct waiting_dir_move {
* after this directory is moved, we can try to rmdir the ino rmdir_ino.
*/
u64 rmdir_ino;
+ bool orphanized;
};
struct orphan_dir_info {
@@ -1158,6 +1159,9 @@ struct backref_ctx {
/* may be truncated in case it's the last extent in a file */
u64 extent_len;
+ /* data offset in the file extent item */
+ u64 data_offset;
+
/* Just to check for bugs in backref resolving */
int found_itself;
};
@@ -1221,7 +1225,7 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
if (ret < 0)
return ret;
- if (offset + bctx->extent_len > i_size)
+ if (offset + bctx->data_offset + bctx->extent_len > i_size)
return 0;
/*
@@ -1363,6 +1367,19 @@ static int find_extent_clone(struct send_ctx *sctx,
backref_ctx->cur_offset = data_offset;
backref_ctx->found_itself = 0;
backref_ctx->extent_len = num_bytes;
+ /*
+ * For non-compressed extents iterate_extent_inodes() gives us extent
+ * offsets that already take into account the data offset, but not for
+ * compressed extents, since the offset is logical and not relative to
+ * the physical extent locations. We must take this into account to
+ * avoid sending clone offsets that go beyond the source file's size,
+ * which would result in the clone ioctl failing with -EINVAL on the
+ * receiving end.
+ */
+ if (compressed == BTRFS_COMPRESS_NONE)
+ backref_ctx->data_offset = 0;
+ else
+ backref_ctx->data_offset = btrfs_file_extent_offset(eb, fi);
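
As a worked example of the adjusted check in __iterate_backrefs() (numbers are hypothetical): with offset = 4096, extent_len = 4096 and i_size = 12288, a compressed extent whose file extent offset is 8192 used to pass the old test (4096 + 4096 <= 12288) but fails the corrected one (4096 + 8192 + 4096 > 12288), so that backref is now skipped as a clone source candidate.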
/*
* The last extent of a file may be too large due to page alignment.
@@ -1900,8 +1917,13 @@ static int did_overwrite_ref(struct send_ctx *sctx,
goto out;
}
- /* we know that it is or will be overwritten. check this now */
- if (ow_inode < sctx->send_progress)
+ /*
+ * We know that it is or will be overwritten. Check this now.
+ * The current inode being processed might have been the one that caused
+ * inode 'ino' to be orphanized, therefore ow_inode can actually be the
+ * same as sctx->send_progress.
+ */
+ if (ow_inode <= sctx->send_progress)
ret = 1;
else
ret = 0;
@@ -2223,6 +2245,8 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
fs_path_reset(dest);
while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) {
+ struct waiting_dir_move *wdm;
+
fs_path_reset(name);
if (is_waiting_for_rm(sctx, ino)) {
@@ -2233,7 +2257,11 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
break;
}
- if (is_waiting_for_move(sctx, ino)) {
+ wdm = get_waiting_dir_move(sctx, ino);
+ if (wdm && wdm->orphanized) {
+ ret = gen_unique_name(sctx, ino, gen, name);
+ stop = 1;
+ } else if (wdm) {
ret = get_first_ref(sctx->parent_root, ino,
&parent_inode, &parent_gen, name);
} else {
@@ -2328,8 +2356,12 @@ static int send_subvol_begin(struct send_ctx *sctx)
TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID,
le64_to_cpu(sctx->send_root->root_item.ctransid));
if (parent_root) {
- TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
- sctx->parent_root->root_item.uuid);
+ if (!btrfs_is_empty_uuid(parent_root->root_item.received_uuid))
+ TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
+ parent_root->root_item.received_uuid);
+ else
+ TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
+ parent_root->root_item.uuid);
TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
le64_to_cpu(sctx->parent_root->root_item.ctransid));
}
@@ -2923,7 +2955,7 @@ static int is_waiting_for_move(struct send_ctx *sctx, u64 ino)
return entry != NULL;
}
-static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
+static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino, bool orphanized)
{
struct rb_node **p = &sctx->waiting_dir_moves.rb_node;
struct rb_node *parent = NULL;
@@ -2934,6 +2966,7 @@ static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
return -ENOMEM;
dm->ino = ino;
dm->rmdir_ino = 0;
+ dm->orphanized = orphanized;
while (*p) {
parent = *p;
@@ -3030,7 +3063,7 @@ static int add_pending_dir_move(struct send_ctx *sctx,
goto out;
}
- ret = add_waiting_dir_move(sctx, pm->ino);
+ ret = add_waiting_dir_move(sctx, pm->ino, is_orphan);
if (ret)
goto out;
@@ -3353,8 +3386,40 @@ out:
return ret;
}
+/*
+ * Check if inode ino1 is an ancestor of inode ino2 in the given root.
+ * Return 1 if true, 0 if false and < 0 on error.
+ */
+static int is_ancestor(struct btrfs_root *root,
+ const u64 ino1,
+ const u64 ino1_gen,
+ const u64 ino2,
+ struct fs_path *fs_path)
+{
+ u64 ino = ino2;
+
+ while (ino > BTRFS_FIRST_FREE_OBJECTID) {
+ int ret;
+ u64 parent;
+ u64 parent_gen;
+
+ fs_path_reset(fs_path);
+ ret = get_first_ref(root, ino, &parent, &parent_gen, fs_path);
+ if (ret < 0) {
+ if (ret == -ENOENT && ino == ino2)
+ ret = 0;
+ return ret;
+ }
+ if (parent == ino1)
+ return parent_gen == ino1_gen ? 1 : 0;
+ ino = parent;
+ }
+ return 0;
+}
+
static int wait_for_parent_move(struct send_ctx *sctx,
- struct recorded_ref *parent_ref)
+ struct recorded_ref *parent_ref,
+ const bool is_orphan)
{
int ret = 0;
u64 ino = parent_ref->dir;
@@ -3374,11 +3439,24 @@ static int wait_for_parent_move(struct send_ctx *sctx,
* Our current directory inode may not yet be renamed/moved because some
* ancestor (immediate or not) has to be renamed/moved first. So find if
* such ancestor exists and make sure our own rename/move happens after
- * that ancestor is processed.
+	 * that ancestor is processed, to avoid infinite path build loops (done
+ * at get_cur_path()).
*/
while (ino > BTRFS_FIRST_FREE_OBJECTID) {
if (is_waiting_for_move(sctx, ino)) {
- ret = 1;
+ /*
+ * If the current inode is an ancestor of ino in the
+ * parent root, we need to delay the rename of the
+			 * current inode, otherwise don't delay the rename
+ * because we can end up with a circular dependency
+ * of renames, resulting in some directories never
+ * getting the respective rename operations issued in
+ * the send stream or getting into infinite path build
+ * loops.
+ */
+ ret = is_ancestor(sctx->parent_root,
+ sctx->cur_ino, sctx->cur_inode_gen,
+ ino, path_before);
break;
}
@@ -3420,7 +3498,7 @@ out:
ino,
&sctx->new_refs,
&sctx->deleted_refs,
- false);
+ is_orphan);
if (!ret)
ret = 1;
}
@@ -3589,6 +3667,17 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
}
}
+ if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root &&
+ can_rename) {
+ ret = wait_for_parent_move(sctx, cur, is_orphan);
+ if (ret < 0)
+ goto out;
+ if (ret == 1) {
+ can_rename = false;
+ *pending_move = 1;
+ }
+ }
+
/*
* link/move the ref to the new place. If we have an orphan
* inode, move it and update valid_path. If not, link or move
@@ -3609,18 +3698,11 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
* dirs, we always have one new and one deleted
* ref. The deleted ref is ignored later.
*/
- ret = wait_for_parent_move(sctx, cur);
- if (ret < 0)
- goto out;
- if (ret) {
- *pending_move = 1;
- } else {
- ret = send_rename(sctx, valid_path,
- cur->full_path);
- if (!ret)
- ret = fs_path_copy(valid_path,
- cur->full_path);
- }
+ ret = send_rename(sctx, valid_path,
+ cur->full_path);
+ if (!ret)
+ ret = fs_path_copy(valid_path,
+ cur->full_path);
if (ret < 0)
goto out;
} else {
@@ -4508,8 +4590,21 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
if (ret < 0)
goto out;
- TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
- clone_root->root->root_item.uuid);
+ /*
+ * If the parent we're using has a received_uuid set then use that as
+ * our clone source as that is what we will look for when doing a
+ * receive.
+ *
+ * This covers the case that we create a snapshot off of a received
+ * subvolume and then use that as the parent and try to receive on a
+ * different host.
+ */
+ if (!btrfs_is_empty_uuid(clone_root->root->root_item.received_uuid))
+ TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
+ clone_root->root->root_item.received_uuid);
+ else
+ TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
+ clone_root->root->root_item.uuid);
TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
le64_to_cpu(clone_root->root->root_item.ctransid));
TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 9e66f5e724db..cd7ef34d2dce 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -135,6 +135,7 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
* __btrfs_std_error decodes expected errors from the caller and
* invokes the approciate error response.
*/
+__cold
void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int errno, const char *fmt, ...)
{
@@ -247,18 +248,11 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
* We'll complete the cleanup in btrfs_end_transaction and
* btrfs_commit_transaction.
*/
+__cold
void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root, const char *function,
unsigned int line, int errno)
{
- /*
- * Report first abort since mount
- */
- if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED,
- &root->fs_info->fs_state)) {
- WARN(1, KERN_DEBUG "BTRFS: Transaction aborted (error %d)\n",
- errno);
- }
trans->aborted = errno;
/* Nothing used. The other threads that have joined this
* transaction may be able to continue. */
@@ -281,6 +275,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
* __btrfs_panic decodes unexpected, fatal errors from the caller,
* issues an alert, and either panics or BUGs, depending on mount options.
*/
+__cold
void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int errno, const char *fmt, ...)
{
@@ -841,33 +836,153 @@ out:
return error;
}
-static struct dentry *get_default_root(struct super_block *sb,
- u64 subvol_objectid)
+static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
+ u64 subvol_objectid)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(sb);
struct btrfs_root *root = fs_info->tree_root;
- struct btrfs_root *new_root;
- struct btrfs_dir_item *di;
- struct btrfs_path *path;
- struct btrfs_key location;
- struct inode *inode;
- u64 dir_id;
- int new = 0;
+ struct btrfs_root *fs_root;
+ struct btrfs_root_ref *root_ref;
+ struct btrfs_inode_ref *inode_ref;
+ struct btrfs_key key;
+ struct btrfs_path *path = NULL;
+ char *name = NULL, *ptr;
+ u64 dirid;
+ int len;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ path->leave_spinning = 1;
+
+ name = kmalloc(PATH_MAX, GFP_NOFS);
+ if (!name) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ ptr = name + PATH_MAX - 1;
+ ptr[0] = '\0';
/*
- * We have a specific subvol we want to mount, just setup location and
- * go look up the root.
+ * Walk up the subvolume trees in the tree of tree roots by root
+ * backrefs until we hit the top-level subvolume.
*/
- if (subvol_objectid) {
- location.objectid = subvol_objectid;
- location.type = BTRFS_ROOT_ITEM_KEY;
- location.offset = (u64)-1;
- goto find_root;
+ while (subvol_objectid != BTRFS_FS_TREE_OBJECTID) {
+ key.objectid = subvol_objectid;
+ key.type = BTRFS_ROOT_BACKREF_KEY;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0) {
+ goto err;
+ } else if (ret > 0) {
+ ret = btrfs_previous_item(root, path, subvol_objectid,
+ BTRFS_ROOT_BACKREF_KEY);
+ if (ret < 0) {
+ goto err;
+ } else if (ret > 0) {
+ ret = -ENOENT;
+ goto err;
+ }
+ }
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ subvol_objectid = key.offset;
+
+ root_ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_root_ref);
+ len = btrfs_root_ref_name_len(path->nodes[0], root_ref);
+ ptr -= len + 1;
+ if (ptr < name) {
+ ret = -ENAMETOOLONG;
+ goto err;
+ }
+ read_extent_buffer(path->nodes[0], ptr + 1,
+ (unsigned long)(root_ref + 1), len);
+ ptr[0] = '/';
+ dirid = btrfs_root_ref_dirid(path->nodes[0], root_ref);
+ btrfs_release_path(path);
+
+ key.objectid = subvol_objectid;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
+ fs_root = btrfs_read_fs_root_no_name(fs_info, &key);
+ if (IS_ERR(fs_root)) {
+ ret = PTR_ERR(fs_root);
+ goto err;
+ }
+
+ /*
+ * Walk up the filesystem tree by inode refs until we hit the
+ * root directory.
+ */
+ while (dirid != BTRFS_FIRST_FREE_OBJECTID) {
+ key.objectid = dirid;
+ key.type = BTRFS_INODE_REF_KEY;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
+ if (ret < 0) {
+ goto err;
+ } else if (ret > 0) {
+ ret = btrfs_previous_item(fs_root, path, dirid,
+ BTRFS_INODE_REF_KEY);
+ if (ret < 0) {
+ goto err;
+ } else if (ret > 0) {
+ ret = -ENOENT;
+ goto err;
+ }
+ }
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ dirid = key.offset;
+
+ inode_ref = btrfs_item_ptr(path->nodes[0],
+ path->slots[0],
+ struct btrfs_inode_ref);
+ len = btrfs_inode_ref_name_len(path->nodes[0],
+ inode_ref);
+ ptr -= len + 1;
+ if (ptr < name) {
+ ret = -ENAMETOOLONG;
+ goto err;
+ }
+ read_extent_buffer(path->nodes[0], ptr + 1,
+ (unsigned long)(inode_ref + 1), len);
+ ptr[0] = '/';
+ btrfs_release_path(path);
+ }
}
+ btrfs_free_path(path);
+ if (ptr == name + PATH_MAX - 1) {
+ name[0] = '/';
+ name[1] = '\0';
+ } else {
+ memmove(name, ptr, name + PATH_MAX - ptr);
+ }
+ return name;
+
+err:
+ btrfs_free_path(path);
+ kfree(name);
+ return ERR_PTR(ret);
+}
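
For illustration with a hypothetical layout: for a subvolume created as <top-level>/snapshots/daily, the two nested walks above resolve the root backrefs and inode refs into the name "/snapshots/daily"; when the requested objectid is already BTRFS_FS_TREE_OBJECTID the loop body never runs and the function returns "/".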
+
+static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objectid)
+{
+ struct btrfs_root *root = fs_info->tree_root;
+ struct btrfs_dir_item *di;
+ struct btrfs_path *path;
+ struct btrfs_key location;
+ u64 dir_id;
+
path = btrfs_alloc_path();
if (!path)
- return ERR_PTR(-ENOMEM);
+ return -ENOMEM;
path->leave_spinning = 1;
/*
@@ -879,58 +994,23 @@ static struct dentry *get_default_root(struct super_block *sb,
di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
if (IS_ERR(di)) {
btrfs_free_path(path);
- return ERR_CAST(di);
+ return PTR_ERR(di);
}
if (!di) {
/*
* Ok the default dir item isn't there. This is weird since
* it's always been there, but don't freak out, just try and
- * mount to root most subvolume.
+ * mount the top-level subvolume.
*/
btrfs_free_path(path);
- dir_id = BTRFS_FIRST_FREE_OBJECTID;
- new_root = fs_info->fs_root;
- goto setup_root;
+ *objectid = BTRFS_FS_TREE_OBJECTID;
+ return 0;
}
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
btrfs_free_path(path);
-
-find_root:
- new_root = btrfs_read_fs_root_no_name(fs_info, &location);
- if (IS_ERR(new_root))
- return ERR_CAST(new_root);
-
- if (!(sb->s_flags & MS_RDONLY)) {
- int ret;
- down_read(&fs_info->cleanup_work_sem);
- ret = btrfs_orphan_cleanup(new_root);
- up_read(&fs_info->cleanup_work_sem);
- if (ret)
- return ERR_PTR(ret);
- }
-
- dir_id = btrfs_root_dirid(&new_root->root_item);
-setup_root:
- location.objectid = dir_id;
- location.type = BTRFS_INODE_ITEM_KEY;
- location.offset = 0;
-
- inode = btrfs_iget(sb, &location, new_root, &new);
- if (IS_ERR(inode))
- return ERR_CAST(inode);
-
- /*
- * If we're just mounting the root most subvol put the inode and return
- * a reference to the dentry. We will have already gotten a reference
- * to the inode in btrfs_fill_super so we're good to go.
- */
- if (!new && d_inode(sb->s_root) == inode) {
- iput(inode);
- return dget(sb->s_root);
- }
-
- return d_obtain_root(inode);
+ *objectid = location.objectid;
+ return 0;
}
static int btrfs_fill_super(struct super_block *sb,
@@ -1108,6 +1188,10 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_puts(seq, ",fatal_errors=panic");
if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL)
seq_printf(seq, ",commit=%d", info->commit_interval);
+ seq_printf(seq, ",subvolid=%llu",
+ BTRFS_I(d_inode(dentry))->root->root_key.objectid);
+ seq_puts(seq, ",subvol=");
+ seq_dentry(seq, dentry, " \t\n\\");
return 0;
}
@@ -1138,107 +1222,139 @@ static inline int is_subvolume_inode(struct inode *inode)
}
/*
- * This will strip out the subvol=%s argument for an argument string and add
- * subvolid=0 to make sure we get the actual tree root for path walking to the
- * subvol we want.
+ * This will add subvolid=0 to the argument string while removing any subvol=
+ * and subvolid= arguments to make sure we get the top-level root for path
+ * walking to the subvol we want.
*/
static char *setup_root_args(char *args)
{
- unsigned len = strlen(args) + 2 + 1;
- char *src, *dst, *buf;
+ char *buf, *dst, *sep;
- /*
- * We need the same args as before, but with this substitution:
- * s!subvol=[^,]+!subvolid=0!
- *
- * Since the replacement string is up to 2 bytes longer than the
- * original, allocate strlen(args) + 2 + 1 bytes.
- */
+ if (!args)
+ return kstrdup("subvolid=0", GFP_NOFS);
- src = strstr(args, "subvol=");
- /* This shouldn't happen, but just in case.. */
- if (!src)
- return NULL;
-
- buf = dst = kmalloc(len, GFP_NOFS);
+ /* The worst case is that we add ",subvolid=0" to the end. */
+ buf = dst = kmalloc(strlen(args) + strlen(",subvolid=0") + 1, GFP_NOFS);
if (!buf)
return NULL;
- /*
- * If the subvol= arg is not at the start of the string,
- * copy whatever precedes it into buf.
- */
- if (src != args) {
- *src++ = '\0';
- strcpy(buf, args);
- dst += strlen(args);
+ while (1) {
+ sep = strchrnul(args, ',');
+ if (!strstarts(args, "subvol=") &&
+ !strstarts(args, "subvolid=")) {
+ memcpy(dst, args, sep - args);
+ dst += sep - args;
+ *dst++ = ',';
+ }
+ if (*sep)
+ args = sep + 1;
+ else
+ break;
}
-
strcpy(dst, "subvolid=0");
- dst += strlen("subvolid=0");
-
- /*
- * If there is a "," after the original subvol=... string,
- * copy that suffix into our buffer. Otherwise, we're done.
- */
- src = strchr(src, ',');
- if (src)
- strcpy(dst, src);
return buf;
}
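
For illustration, with hypothetical mount option strings the rewritten setup_root_args() produces:

	"subvol=snap/a,compress=zlib,subvolid=257"  ->  "compress=zlib,subvolid=0"
	"subvol=foo"                                ->  "subvolid=0"
	NULL                                        ->  "subvolid=0"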
-static struct dentry *mount_subvol(const char *subvol_name, int flags,
- const char *device_name, char *data)
+static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid,
+ int flags, const char *device_name,
+ char *data)
{
struct dentry *root;
- struct vfsmount *mnt;
+ struct vfsmount *mnt = NULL;
char *newargs;
+ int ret;
newargs = setup_root_args(data);
- if (!newargs)
- return ERR_PTR(-ENOMEM);
- mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name,
- newargs);
+ if (!newargs) {
+ root = ERR_PTR(-ENOMEM);
+ goto out;
+ }
- if (PTR_RET(mnt) == -EBUSY) {
+ mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, newargs);
+ if (PTR_ERR_OR_ZERO(mnt) == -EBUSY) {
if (flags & MS_RDONLY) {
- mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~MS_RDONLY, device_name,
- newargs);
+ mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~MS_RDONLY,
+ device_name, newargs);
} else {
- int r;
- mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY, device_name,
- newargs);
+ mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY,
+ device_name, newargs);
if (IS_ERR(mnt)) {
- kfree(newargs);
- return ERR_CAST(mnt);
+ root = ERR_CAST(mnt);
+ mnt = NULL;
+ goto out;
}
- r = btrfs_remount(mnt->mnt_sb, &flags, NULL);
- if (r < 0) {
- /* FIXME: release vfsmount mnt ??*/
- kfree(newargs);
- return ERR_PTR(r);
+ down_write(&mnt->mnt_sb->s_umount);
+ ret = btrfs_remount(mnt->mnt_sb, &flags, NULL);
+ up_write(&mnt->mnt_sb->s_umount);
+ if (ret < 0) {
+ root = ERR_PTR(ret);
+ goto out;
}
}
}
+ if (IS_ERR(mnt)) {
+ root = ERR_CAST(mnt);
+ mnt = NULL;
+ goto out;
+ }
- kfree(newargs);
+ if (!subvol_name) {
+ if (!subvol_objectid) {
+ ret = get_default_subvol_objectid(btrfs_sb(mnt->mnt_sb),
+ &subvol_objectid);
+ if (ret) {
+ root = ERR_PTR(ret);
+ goto out;
+ }
+ }
+ subvol_name = get_subvol_name_from_objectid(btrfs_sb(mnt->mnt_sb),
+ subvol_objectid);
+ if (IS_ERR(subvol_name)) {
+ root = ERR_CAST(subvol_name);
+ subvol_name = NULL;
+ goto out;
+ }
- if (IS_ERR(mnt))
- return ERR_CAST(mnt);
+ }
root = mount_subtree(mnt, subvol_name);
+ /* mount_subtree() drops our reference on the vfsmount. */
+ mnt = NULL;
- if (!IS_ERR(root) && !is_subvolume_inode(d_inode(root))) {
+ if (!IS_ERR(root)) {
struct super_block *s = root->d_sb;
- dput(root);
- root = ERR_PTR(-EINVAL);
- deactivate_locked_super(s);
- printk(KERN_ERR "BTRFS: '%s' is not a valid subvolume\n",
- subvol_name);
+ struct inode *root_inode = d_inode(root);
+ u64 root_objectid = BTRFS_I(root_inode)->root->root_key.objectid;
+
+ ret = 0;
+ if (!is_subvolume_inode(root_inode)) {
+ pr_err("BTRFS: '%s' is not a valid subvolume\n",
+ subvol_name);
+ ret = -EINVAL;
+ }
+ if (subvol_objectid && root_objectid != subvol_objectid) {
+ /*
+ * This will also catch a race condition where a
+ * subvolume which was passed by ID is renamed and
+ * another subvolume is renamed over the old location.
+ */
+ pr_err("BTRFS: subvol '%s' does not match subvolid %llu\n",
+ subvol_name, subvol_objectid);
+ ret = -EINVAL;
+ }
+ if (ret) {
+ dput(root);
+ root = ERR_PTR(ret);
+ deactivate_locked_super(s);
+ }
}
+out:
+ mntput(mnt);
+ kfree(newargs);
+ kfree(subvol_name);
return root;
}
@@ -1303,7 +1419,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
{
struct block_device *bdev = NULL;
struct super_block *s;
- struct dentry *root;
struct btrfs_fs_devices *fs_devices = NULL;
struct btrfs_fs_info *fs_info = NULL;
struct security_mnt_opts new_sec_opts;
@@ -1323,10 +1438,10 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
return ERR_PTR(error);
}
- if (subvol_name) {
- root = mount_subvol(subvol_name, flags, device_name, data);
- kfree(subvol_name);
- return root;
+ if (subvol_name || subvol_objectid != BTRFS_FS_TREE_OBJECTID) {
+ /* mount_subvol() will free subvol_name. */
+ return mount_subvol(subvol_name, subvol_objectid, flags,
+ device_name, data);
}
security_init_mnt_opts(&new_sec_opts);
@@ -1392,23 +1507,19 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
error = btrfs_fill_super(s, fs_devices, data,
flags & MS_SILENT ? 1 : 0);
}
-
- root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error);
- if (IS_ERR(root)) {
+ if (error) {
deactivate_locked_super(s);
- error = PTR_ERR(root);
goto error_sec_opts;
}
fs_info = btrfs_sb(s);
error = setup_security_options(fs_info, s, &new_sec_opts);
if (error) {
- dput(root);
deactivate_locked_super(s);
goto error_sec_opts;
}
- return root;
+ return dget(s->s_root);
error_close_devices:
btrfs_close_devices(fs_devices);
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index e8a4c86d274d..603b0cc2b9bb 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -33,6 +33,7 @@
#include "volumes.h"
static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj);
+static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj);
static u64 get_features(struct btrfs_fs_info *fs_info,
enum btrfs_feature_set set)
@@ -428,7 +429,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
BTRFS_ATTR(clone_alignment, btrfs_clone_alignment_show);
-static struct attribute *btrfs_attrs[] = {
+static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(label),
BTRFS_ATTR_PTR(nodesize),
BTRFS_ATTR_PTR(sectorsize),
@@ -438,21 +439,29 @@ static struct attribute *btrfs_attrs[] = {
static void btrfs_release_super_kobj(struct kobject *kobj)
{
- struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- complete(&fs_info->kobj_unregister);
+ struct btrfs_fs_devices *fs_devs = to_fs_devs(kobj);
+
+ memset(&fs_devs->super_kobj, 0, sizeof(struct kobject));
+ complete(&fs_devs->kobj_unregister);
}
static struct kobj_type btrfs_ktype = {
.sysfs_ops = &kobj_sysfs_ops,
.release = btrfs_release_super_kobj,
- .default_attrs = btrfs_attrs,
};
+static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj)
+{
+ if (kobj->ktype != &btrfs_ktype)
+ return NULL;
+ return container_of(kobj, struct btrfs_fs_devices, super_kobj);
+}
+
static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj)
{
if (kobj->ktype != &btrfs_ktype)
return NULL;
- return container_of(kobj, struct btrfs_fs_info, super_kobj);
+ return to_fs_devs(kobj)->fs_info;
}
#define NUM_FEATURE_BITS 64
@@ -493,12 +502,12 @@ static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add)
attrs[0] = &fa->kobj_attr.attr;
if (add) {
int ret;
- ret = sysfs_merge_group(&fs_info->super_kobj,
+ ret = sysfs_merge_group(&fs_info->fs_devices->super_kobj,
&agroup);
if (ret)
return ret;
} else
- sysfs_unmerge_group(&fs_info->super_kobj,
+ sysfs_unmerge_group(&fs_info->fs_devices->super_kobj,
&agroup);
}
@@ -506,25 +515,49 @@ static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add)
return 0;
}
-static void __btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info)
+static void __btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs)
+{
+ if (fs_devs->device_dir_kobj) {
+ kobject_del(fs_devs->device_dir_kobj);
+ kobject_put(fs_devs->device_dir_kobj);
+ fs_devs->device_dir_kobj = NULL;
+ }
+
+ if (fs_devs->super_kobj.state_initialized) {
+ kobject_del(&fs_devs->super_kobj);
+ kobject_put(&fs_devs->super_kobj);
+ wait_for_completion(&fs_devs->kobj_unregister);
+ }
+}
+
+/* when fs_devs is NULL, it removes all fsid kobjects */
+void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs)
{
- kobject_del(&fs_info->super_kobj);
- kobject_put(&fs_info->super_kobj);
- wait_for_completion(&fs_info->kobj_unregister);
+ struct list_head *fs_uuids = btrfs_get_fs_uuids();
+
+ if (fs_devs) {
+ __btrfs_sysfs_remove_fsid(fs_devs);
+ return;
+ }
+
+ list_for_each_entry(fs_devs, fs_uuids, list) {
+ __btrfs_sysfs_remove_fsid(fs_devs);
+ }
}
void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info)
{
+ btrfs_reset_fs_info_ptr(fs_info);
+
if (fs_info->space_info_kobj) {
sysfs_remove_files(fs_info->space_info_kobj, allocation_attrs);
kobject_del(fs_info->space_info_kobj);
kobject_put(fs_info->space_info_kobj);
}
- kobject_del(fs_info->device_dir_kobj);
- kobject_put(fs_info->device_dir_kobj);
addrm_unknown_feature_attrs(fs_info, false);
- sysfs_remove_group(&fs_info->super_kobj, &btrfs_feature_attr_group);
- __btrfs_sysfs_remove_one(fs_info);
+ sysfs_remove_group(&fs_info->fs_devices->super_kobj, &btrfs_feature_attr_group);
+ sysfs_remove_files(&fs_info->fs_devices->super_kobj, btrfs_attrs);
+ btrfs_kobj_rm_device(fs_info->fs_devices, NULL);
}
const char * const btrfs_feature_set_names[3] = {
@@ -602,40 +635,60 @@ static void init_feature_attrs(void)
}
}
-int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info,
+/* when one_device is NULL, it removes all device links */
+
+int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *one_device)
{
struct hd_struct *disk;
struct kobject *disk_kobj;
- if (!fs_info->device_dir_kobj)
+ if (!fs_devices->device_dir_kobj)
return -EINVAL;
if (one_device && one_device->bdev) {
disk = one_device->bdev->bd_part;
disk_kobj = &part_to_dev(disk)->kobj;
- sysfs_remove_link(fs_info->device_dir_kobj,
+ sysfs_remove_link(fs_devices->device_dir_kobj,
+ disk_kobj->name);
+ }
+
+ if (one_device)
+ return 0;
+
+ list_for_each_entry(one_device,
+ &fs_devices->devices, dev_list) {
+ if (!one_device->bdev)
+ continue;
+ disk = one_device->bdev->bd_part;
+ disk_kobj = &part_to_dev(disk)->kobj;
+
+ sysfs_remove_link(fs_devices->device_dir_kobj,
disk_kobj->name);
}
return 0;
}
-int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info,
- struct btrfs_device *one_device)
+int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs)
{
- int error = 0;
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
- struct btrfs_device *dev;
-
- if (!fs_info->device_dir_kobj)
- fs_info->device_dir_kobj = kobject_create_and_add("devices",
- &fs_info->super_kobj);
+ if (!fs_devs->device_dir_kobj)
+ fs_devs->device_dir_kobj = kobject_create_and_add("devices",
+ &fs_devs->super_kobj);
- if (!fs_info->device_dir_kobj)
+ if (!fs_devs->device_dir_kobj)
return -ENOMEM;
+ return 0;
+}
+
+int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices,
+ struct btrfs_device *one_device)
+{
+ int error = 0;
+ struct btrfs_device *dev;
+
list_for_each_entry(dev, &fs_devices->devices, dev_list) {
struct hd_struct *disk;
struct kobject *disk_kobj;
@@ -649,7 +702,7 @@ int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info,
disk = dev->bdev->bd_part;
disk_kobj = &part_to_dev(disk)->kobj;
- error = sysfs_create_link(fs_info->device_dir_kobj,
+ error = sysfs_create_link(fs_devices->device_dir_kobj,
disk_kobj, disk_kobj->name);
if (error)
break;
@@ -667,34 +720,51 @@ static struct dentry *btrfs_debugfs_root_dentry;
/* Debugging tunables and exported data */
u64 btrfs_debugfs_test;
+/*
+ * Can be called by the device discovery thread.
+ * A parent kobject can be specified for seed devices.
+ */
+int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs,
+ struct kobject *parent)
+{
+ int error;
+
+ init_completion(&fs_devs->kobj_unregister);
+ fs_devs->super_kobj.kset = btrfs_kset;
+ error = kobject_init_and_add(&fs_devs->super_kobj,
+ &btrfs_ktype, parent, "%pU", fs_devs->fsid);
+ return error;
+}
+
int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
{
int error;
+ struct btrfs_fs_devices *fs_devs = fs_info->fs_devices;
+ struct kobject *super_kobj = &fs_devs->super_kobj;
+
+ btrfs_set_fs_info_ptr(fs_info);
- init_completion(&fs_info->kobj_unregister);
- fs_info->super_kobj.kset = btrfs_kset;
- error = kobject_init_and_add(&fs_info->super_kobj, &btrfs_ktype, NULL,
- "%pU", fs_info->fsid);
+ error = btrfs_kobj_add_device(fs_devs, NULL);
if (error)
return error;
- error = sysfs_create_group(&fs_info->super_kobj,
- &btrfs_feature_attr_group);
+ error = sysfs_create_files(super_kobj, btrfs_attrs);
if (error) {
- __btrfs_sysfs_remove_one(fs_info);
+ btrfs_kobj_rm_device(fs_devs, NULL);
return error;
}
- error = addrm_unknown_feature_attrs(fs_info, true);
+ error = sysfs_create_group(super_kobj,
+ &btrfs_feature_attr_group);
if (error)
goto failure;
- error = btrfs_kobj_add_device(fs_info, NULL);
+ error = addrm_unknown_feature_attrs(fs_info, true);
if (error)
goto failure;
fs_info->space_info_kobj = kobject_create_and_add("allocation",
- &fs_info->super_kobj);
+ super_kobj);
if (!fs_info->space_info_kobj) {
error = -ENOMEM;
goto failure;
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index 3a4bbed723fd..6392527bcc15 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -82,8 +82,12 @@ char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags);
extern const char * const btrfs_feature_set_names[3];
extern struct kobj_type space_info_ktype;
extern struct kobj_type btrfs_raid_ktype;
-int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info,
+int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *one_device);
-int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info,
+int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *one_device);
+int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs,
+ struct kobject *parent);
+int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs);
+void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs);
#endif /* _BTRFS_SYSFS_H_ */
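As a reading aid, here is a minimal sketch (not part of the patch) of how the fsid sysfs helpers declared above are expected to pair up during setup and teardown; the NULL parent, the helper name and the simplified error handling are assumptions, and the code relies on kernel-internal types from sysfs.h/volumes.h:

static int example_setup_fsid_sysfs(struct btrfs_fs_devices *fs_devs)
{
	int ret;

	/* creates /sys/fs/btrfs/<fsid>, backed by fs_devs->super_kobj */
	ret = btrfs_sysfs_add_fsid(fs_devs, NULL);
	if (ret)
		return ret;

	/* creates the "devices" directory under the fsid kobject */
	ret = btrfs_sysfs_add_device(fs_devs);
	if (ret)
		goto out_remove;

	/* with a NULL device, links every member device into "devices" */
	ret = btrfs_kobj_add_device(fs_devs, NULL);
	if (ret)
		goto out_remove;
	return 0;

out_remove:
	/* drops the "devices" directory and the fsid kobject again */
	btrfs_sysfs_remove_fsid(fs_devs);
	return ret;
}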
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index c32a7ba76bca..846d277b1901 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -21,6 +21,7 @@
#include "../transaction.h"
#include "../disk-io.h"
#include "../qgroup.h"
+#include "../backref.h"
static void init_dummy_trans(struct btrfs_trans_handle *trans)
{
@@ -227,6 +228,8 @@ static int test_no_shared_qgroup(struct btrfs_root *root)
{
struct btrfs_trans_handle trans;
struct btrfs_fs_info *fs_info = root->fs_info;
+ struct ulist *old_roots = NULL;
+ struct ulist *new_roots = NULL;
int ret;
init_dummy_trans(&trans);
@@ -238,10 +241,15 @@ static int test_no_shared_qgroup(struct btrfs_root *root)
return ret;
}
- ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096,
- BTRFS_QGROUP_OPER_ADD_EXCL, 0);
+ /*
+ * Since the test trans doesn't have the complicated delayed refs,
+ * we can only call btrfs_qgroup_account_extent() directly to test
+ * quota.
+ */
+ ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots);
if (ret) {
- test_msg("Couldn't add space to a qgroup %d\n", ret);
+ ulist_free(old_roots);
+ test_msg("Couldn't find old roots: %d\n", ret);
return ret;
}
@@ -249,9 +257,18 @@ static int test_no_shared_qgroup(struct btrfs_root *root)
if (ret)
return ret;
- ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+ ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots);
+ if (ret) {
+ ulist_free(old_roots);
+ ulist_free(new_roots);
+ test_msg("Couldn't find new roots: %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096,
+ old_roots, new_roots);
if (ret) {
- test_msg("Delayed qgroup accounting failed %d\n", ret);
+ test_msg("Couldn't account space for a qgroup %d\n", ret);
return ret;
}
@@ -259,21 +276,32 @@ static int test_no_shared_qgroup(struct btrfs_root *root)
test_msg("Qgroup counts didn't match expected values\n");
return -EINVAL;
}
+ old_roots = NULL;
+ new_roots = NULL;
+
+ ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots);
+ if (ret) {
+ ulist_free(old_roots);
+ test_msg("Couldn't find old roots: %d\n", ret);
+ return ret;
+ }
ret = remove_extent_item(root, 4096, 4096);
if (ret)
return -EINVAL;
- ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096,
- BTRFS_QGROUP_OPER_SUB_EXCL, 0);
+ ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots);
if (ret) {
- test_msg("Couldn't remove space from the qgroup %d\n", ret);
- return -EINVAL;
+ ulist_free(old_roots);
+ ulist_free(new_roots);
+ test_msg("Couldn't find new roots: %d\n", ret);
+ return ret;
}
- ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+ ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096,
+ old_roots, new_roots);
if (ret) {
- test_msg("Qgroup accounting failed %d\n", ret);
+ test_msg("Couldn't account space for a qgroup %d\n", ret);
return -EINVAL;
}
@@ -294,6 +322,8 @@ static int test_multiple_refs(struct btrfs_root *root)
{
struct btrfs_trans_handle trans;
struct btrfs_fs_info *fs_info = root->fs_info;
+ struct ulist *old_roots = NULL;
+ struct ulist *new_roots = NULL;
int ret;
init_dummy_trans(&trans);
@@ -307,20 +337,29 @@ static int test_multiple_refs(struct btrfs_root *root)
return ret;
}
+ ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots);
+ if (ret) {
+ ulist_free(old_roots);
+ test_msg("Couldn't find old roots: %d\n", ret);
+ return ret;
+ }
+
ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5);
if (ret)
return ret;
- ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096,
- BTRFS_QGROUP_OPER_ADD_EXCL, 0);
+ ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots);
if (ret) {
- test_msg("Couldn't add space to a qgroup %d\n", ret);
+ ulist_free(old_roots);
+ ulist_free(new_roots);
+ test_msg("Couldn't find new roots: %d\n", ret);
return ret;
}
- ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+ ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096,
+ old_roots, new_roots);
if (ret) {
- test_msg("Delayed qgroup accounting failed %d\n", ret);
+ test_msg("Couldn't account space for a qgroup %d\n", ret);
return ret;
}
@@ -329,20 +368,29 @@ static int test_multiple_refs(struct btrfs_root *root)
return -EINVAL;
}
+ ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots);
+ if (ret) {
+ ulist_free(old_roots);
+ test_msg("Couldn't find old roots: %d\n", ret);
+ return ret;
+ }
+
ret = add_tree_ref(root, 4096, 4096, 0, 256);
if (ret)
return ret;
- ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096,
- BTRFS_QGROUP_OPER_ADD_SHARED, 0);
+ ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots);
if (ret) {
- test_msg("Qgroup record ref failed %d\n", ret);
+ ulist_free(old_roots);
+ ulist_free(new_roots);
+ test_msg("Couldn't find new roots: %d\n", ret);
return ret;
}
- ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+ ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096,
+ old_roots, new_roots);
if (ret) {
- test_msg("Qgroup accounting failed %d\n", ret);
+ test_msg("Couldn't account space for a qgroup %d\n", ret);
return ret;
}
@@ -356,20 +404,29 @@ static int test_multiple_refs(struct btrfs_root *root)
return -EINVAL;
}
+ ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots);
+ if (ret) {
+ ulist_free(old_roots);
+ test_msg("Couldn't find old roots: %d\n", ret);
+ return ret;
+ }
+
ret = remove_extent_ref(root, 4096, 4096, 0, 256);
if (ret)
return ret;
- ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096,
- BTRFS_QGROUP_OPER_SUB_SHARED, 0);
+ ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots);
if (ret) {
- test_msg("Qgroup record ref failed %d\n", ret);
+ ulist_free(old_roots);
+ ulist_free(new_roots);
+ test_msg("Couldn't find new roots: %d\n", ret);
return ret;
}
- ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+ ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096,
+ old_roots, new_roots);
if (ret) {
- test_msg("Qgroup accounting failed %d\n", ret);
+ test_msg("Couldn't account space for a qgroup %d\n", ret);
return ret;
}
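The tests above all follow the same new accounting pattern: capture the set of roots referencing the extent before the tree change, capture it again afterwards, then hand both sets to btrfs_qgroup_account_extent(). A condensed sketch of that pattern (the function name is illustrative, and bytenr/num_bytes are placeholders; the code depends on kernel-internal btrfs headers):

static int example_account_one_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_fs_info *fs_info,
				      u64 bytenr, u64 num_bytes)
{
	struct ulist *old_roots = NULL;
	struct ulist *new_roots = NULL;
	int ret;

	/* roots referencing the extent before the modification */
	ret = btrfs_find_all_roots(trans, fs_info, bytenr, 0, &old_roots);
	if (ret)
		goto out;

	/* ... insert or remove the extent reference here ... */

	/* roots referencing the extent after the modification */
	ret = btrfs_find_all_roots(trans, fs_info, bytenr, 0, &new_roots);
	if (ret)
		goto out;

	/* accounts the delta; the tests rely on it freeing both ulists */
	return btrfs_qgroup_account_extent(trans, fs_info, bytenr, num_bytes,
					   old_roots, new_roots);
out:
	ulist_free(old_roots);
	ulist_free(new_roots);
	return ret;
}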
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 5628e25250c0..c0f18e7266b6 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -225,12 +225,14 @@ loop:
cur_trans->dirty_bg_run = 0;
cur_trans->delayed_refs.href_root = RB_ROOT;
+ cur_trans->delayed_refs.dirty_extent_root = RB_ROOT;
atomic_set(&cur_trans->delayed_refs.num_entries, 0);
cur_trans->delayed_refs.num_heads_ready = 0;
cur_trans->delayed_refs.pending_csums = 0;
cur_trans->delayed_refs.num_heads = 0;
cur_trans->delayed_refs.flushing = 0;
cur_trans->delayed_refs.run_delayed_start = 0;
+ cur_trans->delayed_refs.qgroup_to_skip = 0;
/*
* although the tree mod log is per file system and not per transaction,
@@ -509,6 +511,7 @@ again:
h->transaction = cur_trans;
h->blocks_used = 0;
h->bytes_reserved = 0;
+ h->chunk_bytes_reserved = 0;
h->root = root;
h->delayed_ref_updates = 0;
h->use_count = 1;
@@ -792,6 +795,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
if (!list_empty(&trans->new_bgs))
btrfs_create_pending_block_groups(trans, root);
+ btrfs_trans_release_chunk_metadata(trans);
+
if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
should_end_transaction(trans, root) &&
ACCESS_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) {
@@ -1290,6 +1295,12 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
if (pending->error)
goto no_free_objectid;
+ /*
+ * Make qgroup to skip current new snapshot's qgroupid, as it is
+ * accounted by later btrfs_qgroup_inherit().
+ */
+ btrfs_set_skip_qgroup(trans, objectid);
+
btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
if (to_reserve > 0) {
@@ -1298,7 +1309,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
to_reserve,
BTRFS_RESERVE_NO_FLUSH);
if (pending->error)
- goto no_free_objectid;
+ goto clear_skip_qgroup;
}
key.objectid = objectid;
@@ -1396,25 +1407,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
btrfs_abort_transaction(trans, root, ret);
goto fail;
}
-
- /*
- * We need to flush delayed refs in order to make sure all of our quota
- * operations have been done before we call btrfs_qgroup_inherit.
- */
- ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
- if (ret) {
- btrfs_abort_transaction(trans, root, ret);
- goto fail;
- }
-
- ret = btrfs_qgroup_inherit(trans, fs_info,
- root->root_key.objectid,
- objectid, pending->inherit);
- if (ret) {
- btrfs_abort_transaction(trans, root, ret);
- goto fail;
- }
-
/* see comments in should_cow_block() */
set_bit(BTRFS_ROOT_FORCE_COW, &root->state);
smp_wmb();
@@ -1497,11 +1489,37 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
goto fail;
}
}
+
+ ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
+
+ /*
+ * account qgroup counters before qgroup_inherit()
+ */
+ ret = btrfs_qgroup_prepare_account_extents(trans, fs_info);
+ if (ret)
+ goto fail;
+ ret = btrfs_qgroup_account_extents(trans, fs_info);
+ if (ret)
+ goto fail;
+ ret = btrfs_qgroup_inherit(trans, fs_info,
+ root->root_key.objectid,
+ objectid, pending->inherit);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
+
fail:
pending->error = ret;
dir_item_existed:
trans->block_rsv = rsv;
trans->bytes_reserved = 0;
+clear_skip_qgroup:
+ btrfs_clear_skip_qgroup(trans);
no_free_objectid:
kfree(new_root_item);
root_item_alloc_fail:
@@ -1963,6 +1981,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
goto scrub_continue;
}
+ /* Record old roots for later qgroup accounting */
+ ret = btrfs_qgroup_prepare_account_extents(trans, root->fs_info);
+ if (ret) {
+ mutex_unlock(&root->fs_info->reloc_mutex);
+ goto scrub_continue;
+ }
+
/*
* make sure none of the code above managed to slip in a
* delayed item
@@ -2004,6 +2029,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
*/
btrfs_free_log_root_tree(trans, root->fs_info);
+ /*
+ * Since all fs roots have been committed, we can get an accurate
+ * new_roots, so do the quota accounting now.
+ */
+ ret = btrfs_qgroup_account_extents(trans, root->fs_info);
+ if (ret < 0) {
+ mutex_unlock(&root->fs_info->tree_log_mutex);
+ mutex_unlock(&root->fs_info->reloc_mutex);
+ goto scrub_continue;
+ }
+
ret = commit_cowonly_roots(trans, root);
if (ret) {
mutex_unlock(&root->fs_info->tree_log_mutex);
@@ -2054,6 +2090,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
clear_bit(BTRFS_INODE_BTREE_LOG1_ERR, &btree_ino->runtime_flags);
clear_bit(BTRFS_INODE_BTREE_LOG2_ERR, &btree_ino->runtime_flags);
+ btrfs_trans_release_chunk_metadata(trans);
+
spin_lock(&root->fs_info->trans_lock);
cur_trans->state = TRANS_STATE_UNBLOCKED;
root->fs_info->running_transaction = NULL;
@@ -2123,6 +2161,7 @@ scrub_continue:
btrfs_scrub_continue(root);
cleanup_transaction:
btrfs_trans_release_metadata(trans, root);
+ btrfs_trans_release_chunk_metadata(trans);
trans->block_rsv = NULL;
if (trans->qgroup_reserved) {
btrfs_qgroup_free(root, trans->qgroup_reserved);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 0b24755596ba..eb09c2067fa8 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -102,6 +102,7 @@ struct btrfs_transaction {
struct btrfs_trans_handle {
u64 transid;
u64 bytes_reserved;
+ u64 chunk_bytes_reserved;
u64 qgroup_reserved;
unsigned long use_count;
unsigned long blocks_reserved;
@@ -153,6 +154,29 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
spin_unlock(&BTRFS_I(inode)->lock);
}
+/*
+ * Tell the qgroup code to skip the given qgroupid: the old/new_roots
+ * computed for qgroup accounting won't contain it.
+ */
+static inline void btrfs_set_skip_qgroup(struct btrfs_trans_handle *trans,
+ u64 qgroupid)
+{
+ struct btrfs_delayed_ref_root *delayed_refs;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ WARN_ON(delayed_refs->qgroup_to_skip);
+ delayed_refs->qgroup_to_skip = qgroupid;
+}
+
+static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_delayed_ref_root *delayed_refs;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ WARN_ON(!delayed_refs->qgroup_to_skip);
+ delayed_refs->qgroup_to_skip = 0;
+}
+
int btrfs_end_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
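The two inline helpers added above are meant to bracket snapshot creation, exactly as create_pending_snapshot() does in the transaction.c hunk earlier in this patch, so the new snapshot's own qgroup is not double-counted. A minimal pairing sketch (create_snapshot_work() and the wrapper name are placeholders, not real functions):

static int example_snapshot_with_skip(struct btrfs_trans_handle *trans,
				      u64 objectid)
{
	int ret;

	/* exclude the new snapshot's qgroupid from old/new_roots */
	btrfs_set_skip_qgroup(trans, objectid);

	ret = create_snapshot_work(trans, objectid);	/* placeholder */

	/*
	 * Always clear the skip again, even on failure; otherwise the
	 * WARN_ON() in btrfs_set_skip_qgroup() fires for the next
	 * snapshot created in this transaction.
	 */
	btrfs_clear_skip_qgroup(trans);
	return ret;
}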
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index a63719cc9578..a4b9c8b2d35a 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -52,9 +52,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
goto out;
- if (btrfs_test_opt(root, SSD))
- goto out;
-
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index d04968374e9d..1ce80c1c4eb6 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3881,12 +3881,6 @@ static int wait_ordered_extents(struct btrfs_trans_handle *trans,
&ordered->flags))
continue;
- if (ordered->csum_bytes_left) {
- btrfs_start_ordered_extent(inode, ordered, 0);
- wait_event(ordered->wait,
- ordered->csum_bytes_left == 0);
- }
-
list_for_each_entry(sum, &ordered->list, list) {
ret = btrfs_csum_file_blocks(trans, log, sum);
if (ret)
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index 840a38b2778a..91feb2bdefee 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -132,6 +132,15 @@ static struct ulist_node *ulist_rbtree_search(struct ulist *ulist, u64 val)
return NULL;
}
+static void ulist_rbtree_erase(struct ulist *ulist, struct ulist_node *node)
+{
+ rb_erase(&node->rb_node, &ulist->root);
+ list_del(&node->list);
+ kfree(node);
+ BUG_ON(ulist->nnodes == 0);
+ ulist->nnodes--;
+}
+
static int ulist_rbtree_insert(struct ulist *ulist, struct ulist_node *ins)
{
struct rb_node **p = &ulist->root.rb_node;
@@ -197,9 +206,6 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
node->val = val;
node->aux = aux;
-#ifdef CONFIG_BTRFS_DEBUG
- node->seqnum = ulist->nnodes;
-#endif
ret = ulist_rbtree_insert(ulist, node);
ASSERT(!ret);
@@ -209,6 +215,33 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
return 1;
}
+/*
+ * ulist_del - delete one node from ulist
+ * @ulist: ulist to remove node from
+ * @val: value to delete
+ * @aux: aux to delete
+ *
+ * The deletion is only done when *BOTH* val and aux match.
+ * Returns 0 on successful deletion.
+ * Returns > 0 if the node was not found.
+ */
+int ulist_del(struct ulist *ulist, u64 val, u64 aux)
+{
+ struct ulist_node *node;
+
+ node = ulist_rbtree_search(ulist, val);
+ /* Not found */
+ if (!node)
+ return 1;
+
+ if (node->aux != aux)
+ return 1;
+
+ /* Found and delete */
+ ulist_rbtree_erase(ulist, node);
+ return 0;
+}
+
/**
* ulist_next - iterate ulist
* @ulist: ulist to iterate
@@ -237,15 +270,7 @@ struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter)
uiter->cur_list = uiter->cur_list->next;
} else {
uiter->cur_list = ulist->nodes.next;
-#ifdef CONFIG_BTRFS_DEBUG
- uiter->i = 0;
-#endif
}
node = list_entry(uiter->cur_list, struct ulist_node, list);
-#ifdef CONFIG_BTRFS_DEBUG
- ASSERT(node->seqnum == uiter->i);
- ASSERT(uiter->i >= 0 && uiter->i < ulist->nnodes);
- uiter->i++;
-#endif
return node;
}
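A small usage sketch for the new ulist_del() helper; ulist_alloc() and ulist_add() come from ulist.h, the values are arbitrary, and the wrapper function is only illustrative:

static int example_ulist_del(void)
{
	struct ulist *roots;

	roots = ulist_alloc(GFP_KERNEL);
	if (!roots)
		return -ENOMEM;

	if (ulist_add(roots, 5, 0, GFP_KERNEL) < 0)	/* val = 5, aux = 0 */
		goto out;

	/* aux does not match: returns 1 and leaves the node in place */
	WARN_ON(ulist_del(roots, 5, 1) != 1);

	/* both val and aux match: returns 0 and frees the node */
	WARN_ON(ulist_del(roots, 5, 0) != 0);

out:
	ulist_free(roots);
	return 0;
}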
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
index 4c29db604bbe..a01a2c45825f 100644
--- a/fs/btrfs/ulist.h
+++ b/fs/btrfs/ulist.h
@@ -57,6 +57,7 @@ void ulist_free(struct ulist *ulist);
int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask);
int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
u64 *old_aux, gfp_t gfp_mask);
+int ulist_del(struct ulist *ulist, u64 val, u64 aux);
/* just like ulist_add_merge() but take a pointer for the aux data */
static inline int ulist_add_merge_ptr(struct ulist *ulist, u64 val, void *aux,
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 53af23f2c087..4b438b4c8c91 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -52,6 +52,10 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
+struct list_head *btrfs_get_fs_uuids(void)
+{
+ return &fs_uuids;
+}
static struct btrfs_fs_devices *__alloc_fs_devices(void)
{
@@ -441,6 +445,61 @@ static void pending_bios_fn(struct btrfs_work *work)
run_scheduled_bios(device);
}
+
+void btrfs_free_stale_device(struct btrfs_device *cur_dev)
+{
+ struct btrfs_fs_devices *fs_devs;
+ struct btrfs_device *dev;
+
+ if (!cur_dev->name)
+ return;
+
+ list_for_each_entry(fs_devs, &fs_uuids, list) {
+ int del = 1;
+
+ if (fs_devs->opened)
+ continue;
+ if (fs_devs->seeding)
+ continue;
+
+ list_for_each_entry(dev, &fs_devs->devices, dev_list) {
+
+ if (dev == cur_dev)
+ continue;
+ if (!dev->name)
+ continue;
+
+ /*
+ * TODO: This won't be enough. What if the same device
+ * comes back with a new uuid and its mapper path?
+ * For now this does help, as an admin will typically
+ * use either the mapper or the non-mapper path throughout.
+ */
+ rcu_read_lock();
+ del = strcmp(rcu_str_deref(dev->name),
+ rcu_str_deref(cur_dev->name));
+ rcu_read_unlock();
+ if (!del)
+ break;
+ }
+
+ if (!del) {
+ /* delete the stale device */
+ if (fs_devs->num_devices == 1) {
+ btrfs_sysfs_remove_fsid(fs_devs);
+ list_del(&fs_devs->list);
+ free_fs_devices(fs_devs);
+ } else {
+ fs_devs->num_devices--;
+ list_del(&dev->dev_list);
+ rcu_string_free(dev->name);
+ kfree(dev);
+ }
+ break;
+ }
+ }
+}
+
/*
* Add new device to list of registered devices
*
@@ -556,6 +615,12 @@ static noinline int device_list_add(const char *path,
if (!fs_devices->opened)
device->generation = found_transid;
+ /*
+ * if there is new btrfs on an already registered device,
+ * then remove the stale device entry.
+ */
+ btrfs_free_stale_device(device);
+
*fs_devices_ret = fs_devices;
return ret;
@@ -693,13 +758,13 @@ static void free_device(struct rcu_head *head)
static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
- struct btrfs_device *device;
+ struct btrfs_device *device, *tmp;
if (--fs_devices->opened > 0)
return 0;
mutex_lock(&fs_devices->device_list_mutex);
- list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
struct btrfs_device *new_device;
struct rcu_string *name;
@@ -1067,15 +1132,31 @@ again:
map = (struct map_lookup *)em->bdev;
for (i = 0; i < map->num_stripes; i++) {
+ u64 end;
+
if (map->stripes[i].dev != device)
continue;
if (map->stripes[i].physical >= physical_start + len ||
map->stripes[i].physical + em->orig_block_len <=
physical_start)
continue;
- *start = map->stripes[i].physical +
- em->orig_block_len;
- ret = 1;
+ /*
+ * Make sure that while processing the pinned list we do
+ * not override our *start with a lower value, because
+ * we can have pinned chunks that fall within this
+ * device hole and that have lower physical addresses
+ * than the pending chunks we processed before. If we
+ * do not take this special care we can end up getting
+ * 2 pending chunks that start at the same physical
+ * device offsets because the end offset of a pinned
+ * chunk can be equal to the start offset of some
+ * pending chunk.
+ */
+ end = map->stripes[i].physical + em->orig_block_len;
+ if (end > *start) {
+ *start = end;
+ ret = 1;
+ }
}
}
if (search_list == &trans->transaction->pending_chunks) {
@@ -1706,7 +1787,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
if (device->bdev) {
device->fs_devices->open_devices--;
/* remove sysfs entry */
- btrfs_kobj_rm_device(root->fs_info, device);
+ btrfs_kobj_rm_device(root->fs_info->fs_devices, device);
}
call_rcu(&device->rcu, free_device);
@@ -1875,6 +1956,9 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
mutex_lock(&uuid_mutex);
WARN_ON(!tgtdev);
mutex_lock(&fs_info->fs_devices->device_list_mutex);
+
+ btrfs_kobj_rm_device(fs_info->fs_devices, tgtdev);
+
if (tgtdev->bdev) {
btrfs_scratch_superblock(tgtdev);
fs_info->fs_devices->open_devices--;
@@ -2211,7 +2295,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
tmp + 1);
/* add sysfs device entry */
- btrfs_kobj_add_device(root->fs_info, device);
+ btrfs_kobj_add_device(root->fs_info->fs_devices, device);
/*
* we've got more storage, clear any full flags on the space
@@ -2252,8 +2336,9 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
*/
snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
root->fs_info->fsid);
- if (kobject_rename(&root->fs_info->super_kobj, fsid_buf))
- goto error_trans;
+ if (kobject_rename(&root->fs_info->fs_devices->super_kobj,
+ fsid_buf))
+ pr_warn("BTRFS: sysfs: failed to create fsid for sprout\n");
}
root->fs_info->num_tolerated_disk_barrier_failures =
@@ -2289,7 +2374,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
error_trans:
btrfs_end_transaction(trans, root);
rcu_string_free(device->name);
- btrfs_kobj_rm_device(root->fs_info, device);
+ btrfs_kobj_rm_device(root->fs_info->fs_devices, device);
kfree(device);
error:
blkdev_put(bdev, FMODE_EXCL);
@@ -2609,6 +2694,9 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
return -EINVAL;
}
map = (struct map_lookup *)em->bdev;
+ lock_chunks(root->fs_info->chunk_root);
+ check_system_chunk(trans, extent_root, map->type);
+ unlock_chunks(root->fs_info->chunk_root);
for (i = 0; i < map->num_stripes; i++) {
struct btrfs_device *device = map->stripes[i].dev;
@@ -3908,9 +3996,9 @@ int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
uuid_root = btrfs_create_tree(trans, fs_info,
BTRFS_UUID_TREE_OBJECTID);
if (IS_ERR(uuid_root)) {
- btrfs_abort_transaction(trans, tree_root,
- PTR_ERR(uuid_root));
- return PTR_ERR(uuid_root);
+ ret = PTR_ERR(uuid_root);
+ btrfs_abort_transaction(trans, tree_root, ret);
+ return ret;
}
fs_info->uuid_root = uuid_root;
@@ -3965,6 +4053,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
int slot;
int failed = 0;
bool retried = false;
+ bool checked_pending_chunks = false;
struct extent_buffer *l;
struct btrfs_key key;
struct btrfs_super_block *super_copy = root->fs_info->super_copy;
@@ -4045,15 +4134,6 @@ again:
goto again;
} else if (failed && retried) {
ret = -ENOSPC;
- lock_chunks(root);
-
- btrfs_device_set_total_bytes(device, old_size);
- if (device->writeable)
- device->fs_devices->total_rw_bytes += diff;
- spin_lock(&root->fs_info->free_chunk_lock);
- root->fs_info->free_chunk_space += diff;
- spin_unlock(&root->fs_info->free_chunk_lock);
- unlock_chunks(root);
goto done;
}
@@ -4065,6 +4145,35 @@ again:
}
lock_chunks(root);
+
+ /*
+ * We checked in the above loop all device extents that were already in
+ * the device tree. However before we have updated the device's
+ * total_bytes to the new size, we might have had chunk allocations that
+ * have not completed yet (new block groups attached to transaction
+ * handles), and therefore their device extents were not yet in the
+ * device tree and we missed them in the loop above. So if we have any
+ * pending chunk using a device extent that overlaps the device range
+ * that we can not use anymore, commit the current transaction and
+ * repeat the search on the device tree - this way we guarantee we will
+ * not have chunks using device extents that end beyond 'new_size'.
+ */
+ if (!checked_pending_chunks) {
+ u64 start = new_size;
+ u64 len = old_size - new_size;
+
+ if (contains_pending_extent(trans, device, &start, len)) {
+ unlock_chunks(root);
+ checked_pending_chunks = true;
+ failed = 0;
+ retried = false;
+ ret = btrfs_commit_transaction(trans, root);
+ if (ret)
+ goto done;
+ goto again;
+ }
+ }
+
btrfs_device_set_disk_total_bytes(device, new_size);
if (list_empty(&device->resized_list))
list_add_tail(&device->resized_list,
@@ -4079,6 +4188,16 @@ again:
btrfs_end_transaction(trans, root);
done:
btrfs_free_path(path);
+ if (ret) {
+ lock_chunks(root);
+ btrfs_device_set_total_bytes(device, old_size);
+ if (device->writeable)
+ device->fs_devices->total_rw_bytes += diff;
+ spin_lock(&root->fs_info->free_chunk_lock);
+ root->fs_info->free_chunk_space += diff;
+ spin_unlock(&root->fs_info->free_chunk_lock);
+ unlock_chunks(root);
+ }
return ret;
}
@@ -6072,6 +6191,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
free_extent_map(em);
return -EIO;
}
+ btrfs_warn(root->fs_info, "devid %llu uuid %pU is missing",
+ devid, uuid);
}
map->stripes[i].dev->in_fs_metadata = 1;
}
@@ -6191,10 +6312,11 @@ static int read_one_dev(struct btrfs_root *root,
if (!btrfs_test_opt(root, DEGRADED))
return -EIO;
- btrfs_warn(root->fs_info, "devid %llu missing", devid);
device = add_missing_dev(root, fs_devices, devid, dev_uuid);
if (!device)
return -ENOMEM;
+ btrfs_warn(root->fs_info, "devid %llu uuid %pU missing",
+ devid, dev_uuid);
} else {
if (!device->bdev && !btrfs_test_opt(root, DEGRADED))
return -EIO;
@@ -6722,3 +6844,21 @@ void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
}
unlock_chunks(root);
}
+
+void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ while (fs_devices) {
+ fs_devices->fs_info = fs_info;
+ fs_devices = fs_devices->seed;
+ }
+}
+
+void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ while (fs_devices) {
+ fs_devices->fs_info = NULL;
+ fs_devices = fs_devices->seed;
+ }
+}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index cedae0356558..95842a909e7f 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -253,6 +253,12 @@ struct btrfs_fs_devices {
* nonrot flag set
*/
int rotating;
+
+ struct btrfs_fs_info *fs_info;
+ /* sysfs kobjects */
+ struct kobject super_kobj;
+ struct kobject *device_dir_kobj;
+ struct completion kobj_unregister;
};
#define BTRFS_BIO_INLINE_CSUM_SIZE 64
@@ -535,5 +541,8 @@ static inline void unlock_chunks(struct btrfs_root *root)
mutex_unlock(&root->fs_info->chunk_mutex);
}
+struct list_head *btrfs_get_fs_uuids(void);
+void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info);
+void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info);
#endif
diff --git a/fs/dax.c b/fs/dax.c
index 6f65f00e58ec..99b5fbc38992 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -309,14 +309,21 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
out:
i_mmap_unlock_read(mapping);
- if (bh->b_end_io)
- bh->b_end_io(bh, 1);
-
return error;
}
-static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
- get_block_t get_block)
+/**
+ * __dax_fault - handle a page fault on a DAX file
+ * @vma: The virtual memory area where the fault occurred
+ * @vmf: The description of the fault
+ * @get_block: The filesystem method used to translate file offsets to blocks
+ *
+ * When a page fault occurs, filesystems may call this helper in their
+ * fault handler for DAX files. __dax_fault() assumes the caller has done all
+ * the necessary locking for the page fault to proceed successfully.
+ */
+int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+ get_block_t get_block, dax_iodone_t complete_unwritten)
{
struct file *file = vma->vm_file;
struct address_space *mapping = file->f_mapping;
@@ -417,7 +424,19 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
page_cache_release(page);
}
+ /*
+ * If we successfully insert the new mapping over an unwritten extent,
+ * we need to ensure we convert the unwritten extent. If there is an
+ * error inserting the mapping, the filesystem needs to leave it as
+ * unwritten to prevent exposure of the stale underlying data to
+ * userspace, but we still need to call the completion function so
+ * the private resources on the mapping buffer can be released. We
+ * indicate what the callback should do via the uptodate variable, same
+ * as for normal BH based IO completions.
+ */
error = dax_insert_mapping(inode, &bh, vma, vmf);
+ if (buffer_unwritten(&bh))
+ complete_unwritten(&bh, !error);
out:
if (error == -ENOMEM)
@@ -434,6 +453,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
}
goto out;
}
+EXPORT_SYMBOL(__dax_fault);
/**
* dax_fault - handle a page fault on a DAX file
@@ -445,7 +465,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
* fault handler for DAX files.
*/
int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
- get_block_t get_block)
+ get_block_t get_block, dax_iodone_t complete_unwritten)
{
int result;
struct super_block *sb = file_inode(vma->vm_file)->i_sb;
@@ -454,7 +474,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
sb_start_pagefault(sb);
file_update_time(vma->vm_file);
}
- result = do_dax_fault(vma, vmf, get_block);
+ result = __dax_fault(vma, vmf, get_block, complete_unwritten);
if (vmf->flags & FAULT_FLAG_WRITE)
sb_end_pagefault(sb);
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 3a0a6c6406d0..3b57c9f83c9b 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -28,12 +28,12 @@
#ifdef CONFIG_FS_DAX
static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
- return dax_fault(vma, vmf, ext2_get_block);
+ return dax_fault(vma, vmf, ext2_get_block, NULL);
}
static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
- return dax_mkwrite(vma, vmf, ext2_get_block);
+ return dax_mkwrite(vma, vmf, ext2_get_block, NULL);
}
static const struct vm_operations_struct ext2_dax_vm_ops = {
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index ac517f15741c..bc313ac5d3fa 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -192,15 +192,27 @@ out:
}
#ifdef CONFIG_FS_DAX
+static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
+{
+ struct inode *inode = bh->b_assoc_map->host;
+ /* XXX: breaks on 32-bit > 16GB. Is that even supported? */
+ loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
+ int err;
+ if (!uptodate)
+ return;
+ WARN_ON(!buffer_unwritten(bh));
+ err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
+}
+
static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
- return dax_fault(vma, vmf, ext4_get_block);
+ return dax_fault(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
/* Is this the right get_block? */
}
static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
- return dax_mkwrite(vma, vmf, ext4_get_block);
+ return dax_mkwrite(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
}
static const struct vm_operations_struct ext4_dax_vm_ops = {
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f8a8d4ee7459..41f8e55afcd1 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -656,18 +656,6 @@ has_zeroout:
return retval;
}
-static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
-{
- struct inode *inode = bh->b_assoc_map->host;
- /* XXX: breaks on 32-bit > 16GB. Is that even supported? */
- loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
- int err;
- if (!uptodate)
- return;
- WARN_ON(!buffer_unwritten(bh));
- err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
-}
-
/* Maximum number of blocks we map for direct IO at once. */
#define DIO_MAX_BLOCKS 4096
@@ -705,10 +693,15 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
map_bh(bh, inode->i_sb, map.m_pblk);
bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
- if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) {
+ if (IS_DAX(inode) && buffer_unwritten(bh)) {
+ /*
+ * dgc: I suspect unwritten conversion on ext4+DAX is
+ * fundamentally broken here when there are concurrent
+ * read/write in progress on this inode.
+ */
+ WARN_ON_ONCE(io_end);
bh->b_assoc_map = inode->i_mapping;
bh->b_private = (void *)(unsigned long)iblock;
- bh->b_end_io = ext4_end_io_unwritten;
}
if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
set_buffer_defer_completion(bh);
diff --git a/fs/hppfs/Makefile b/fs/hppfs/Makefile
deleted file mode 100644
index 3a982bd975d2..000000000000
--- a/fs/hppfs/Makefile
+++ /dev/null
@@ -1,6 +0,0 @@
-#
-# Copyright (C) 2002 - 2008 Jeff Dike (jdike@{addtoit,linux.intel}.com)
-# Licensed under the GPL
-#
-
-obj-$(CONFIG_HPPFS) += hppfs.o
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
deleted file mode 100644
index 2867837909a9..000000000000
--- a/fs/hppfs/hppfs.c
+++ /dev/null
@@ -1,765 +0,0 @@
-/*
- * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
- */
-
-#include <linux/ctype.h>
-#include <linux/dcache.h>
-#include <linux/file.h>
-#include <linux/fs.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/list.h>
-#include <linux/module.h>
-#include <linux/mount.h>
-#include <linux/slab.h>
-#include <linux/statfs.h>
-#include <linux/types.h>
-#include <linux/pid_namespace.h>
-#include <linux/namei.h>
-#include <asm/uaccess.h>
-#include <os.h>
-
-static struct inode *get_inode(struct super_block *, struct dentry *);
-
-struct hppfs_data {
- struct list_head list;
- char contents[PAGE_SIZE - sizeof(struct list_head)];
-};
-
-struct hppfs_private {
- struct file *proc_file;
- int host_fd;
- loff_t len;
- struct hppfs_data *contents;
-};
-
-struct hppfs_inode_info {
- struct dentry *proc_dentry;
- struct inode vfs_inode;
-};
-
-static inline struct hppfs_inode_info *HPPFS_I(struct inode *inode)
-{
- return container_of(inode, struct hppfs_inode_info, vfs_inode);
-}
-
-#define HPPFS_SUPER_MAGIC 0xb00000ee
-
-static const struct super_operations hppfs_sbops;
-
-static int is_pid(struct dentry *dentry)
-{
- struct super_block *sb;
- int i;
-
- sb = dentry->d_sb;
- if (dentry->d_parent != sb->s_root)
- return 0;
-
- for (i = 0; i < dentry->d_name.len; i++) {
- if (!isdigit(dentry->d_name.name[i]))
- return 0;
- }
- return 1;
-}
-
-static char *dentry_name(struct dentry *dentry, int extra)
-{
- struct dentry *parent;
- char *root, *name;
- const char *seg_name;
- int len, seg_len, root_len;
-
- len = 0;
- parent = dentry;
- while (parent->d_parent != parent) {
- if (is_pid(parent))
- len += strlen("pid") + 1;
- else len += parent->d_name.len + 1;
- parent = parent->d_parent;
- }
-
- root = "proc";
- root_len = strlen(root);
- len += root_len;
- name = kmalloc(len + extra + 1, GFP_KERNEL);
- if (name == NULL)
- return NULL;
-
- name[len] = '\0';
- parent = dentry;
- while (parent->d_parent != parent) {
- if (is_pid(parent)) {
- seg_name = "pid";
- seg_len = strlen(seg_name);
- }
- else {
- seg_name = parent->d_name.name;
- seg_len = parent->d_name.len;
- }
-
- len -= seg_len + 1;
- name[len] = '/';
- memcpy(&name[len + 1], seg_name, seg_len);
- parent = parent->d_parent;
- }
- memcpy(name, root, root_len);
- return name;
-}
-
-static int file_removed(struct dentry *dentry, const char *file)
-{
- char *host_file;
- int extra, fd;
-
- extra = 0;
- if (file != NULL)
- extra += strlen(file) + 1;
-
- host_file = dentry_name(dentry, extra + strlen("/remove"));
- if (host_file == NULL) {
- printk(KERN_ERR "file_removed : allocation failed\n");
- return -ENOMEM;
- }
-
- if (file != NULL) {
- strcat(host_file, "/");
- strcat(host_file, file);
- }
- strcat(host_file, "/remove");
-
- fd = os_open_file(host_file, of_read(OPENFLAGS()), 0);
- kfree(host_file);
- if (fd > 0) {
- os_close_file(fd);
- return 1;
- }
- return 0;
-}
-
-static struct dentry *hppfs_lookup(struct inode *ino, struct dentry *dentry,
- unsigned int flags)
-{
- struct dentry *proc_dentry, *parent;
- struct qstr *name = &dentry->d_name;
- struct inode *inode;
- int err, deleted;
-
- deleted = file_removed(dentry, NULL);
- if (deleted < 0)
- return ERR_PTR(deleted);
- else if (deleted)
- return ERR_PTR(-ENOENT);
-
- parent = HPPFS_I(ino)->proc_dentry;
- mutex_lock(&d_inode(parent)->i_mutex);
- proc_dentry = lookup_one_len(name->name, parent, name->len);
- mutex_unlock(&d_inode(parent)->i_mutex);
-
- if (IS_ERR(proc_dentry))
- return proc_dentry;
-
- err = -ENOMEM;
- inode = get_inode(ino->i_sb, proc_dentry);
- if (!inode)
- goto out;
-
- d_add(dentry, inode);
- return NULL;
-
- out:
- return ERR_PTR(err);
-}
-
-static const struct inode_operations hppfs_file_iops = {
-};
-
-static ssize_t read_proc(struct file *file, char __user *buf, ssize_t count,
- loff_t *ppos, int is_user)
-{
- ssize_t (*read)(struct file *, char __user *, size_t, loff_t *);
- ssize_t n;
-
- read = file_inode(file)->i_fop->read;
-
- if (!is_user)
- set_fs(KERNEL_DS);
-
- n = (*read)(file, buf, count, &file->f_pos);
-
- if (!is_user)
- set_fs(USER_DS);
-
- if (ppos)
- *ppos = file->f_pos;
- return n;
-}
-
-static ssize_t hppfs_read_file(int fd, char __user *buf, ssize_t count)
-{
- ssize_t n;
- int cur, err;
- char *new_buf;
-
- n = -ENOMEM;
- new_buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
- if (new_buf == NULL) {
- printk(KERN_ERR "hppfs_read_file : kmalloc failed\n");
- goto out;
- }
- n = 0;
- while (count > 0) {
- cur = min_t(ssize_t, count, PAGE_SIZE);
- err = os_read_file(fd, new_buf, cur);
- if (err < 0) {
- printk(KERN_ERR "hppfs_read : read failed, "
- "errno = %d\n", err);
- n = err;
- goto out_free;
- } else if (err == 0)
- break;
-
- if (copy_to_user(buf, new_buf, err)) {
- n = -EFAULT;
- goto out_free;
- }
- n += err;
- count -= err;
- }
- out_free:
- kfree(new_buf);
- out:
- return n;
-}
-
-static ssize_t hppfs_read(struct file *file, char __user *buf, size_t count,
- loff_t *ppos)
-{
- struct hppfs_private *hppfs = file->private_data;
- struct hppfs_data *data;
- loff_t off;
- int err;
-
- if (hppfs->contents != NULL) {
- int rem;
-
- if (*ppos >= hppfs->len)
- return 0;
-
- data = hppfs->contents;
- off = *ppos;
- while (off >= sizeof(data->contents)) {
- data = list_entry(data->list.next, struct hppfs_data,
- list);
- off -= sizeof(data->contents);
- }
-
- if (off + count > hppfs->len)
- count = hppfs->len - off;
- rem = copy_to_user(buf, &data->contents[off], count);
- *ppos += count - rem;
- if (rem > 0)
- return -EFAULT;
- } else if (hppfs->host_fd != -1) {
- err = os_seek_file(hppfs->host_fd, *ppos);
- if (err) {
- printk(KERN_ERR "hppfs_read : seek failed, "
- "errno = %d\n", err);
- return err;
- }
- err = hppfs_read_file(hppfs->host_fd, buf, count);
- if (err < 0) {
- printk(KERN_ERR "hppfs_read: read failed: %d\n", err);
- return err;
- }
- count = err;
- if (count > 0)
- *ppos += count;
- }
- else count = read_proc(hppfs->proc_file, buf, count, ppos, 1);
-
- return count;
-}
-
-static ssize_t hppfs_write(struct file *file, const char __user *buf,
- size_t len, loff_t *ppos)
-{
- struct hppfs_private *data = file->private_data;
- struct file *proc_file = data->proc_file;
- ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *);
-
- write = file_inode(proc_file)->i_fop->write;
- return (*write)(proc_file, buf, len, ppos);
-}
-
-static int open_host_sock(char *host_file, int *filter_out)
-{
- char *end;
- int fd;
-
- end = &host_file[strlen(host_file)];
- strcpy(end, "/rw");
- *filter_out = 1;
- fd = os_connect_socket(host_file);
- if (fd > 0)
- return fd;
-
- strcpy(end, "/r");
- *filter_out = 0;
- fd = os_connect_socket(host_file);
- return fd;
-}
-
-static void free_contents(struct hppfs_data *head)
-{
- struct hppfs_data *data;
- struct list_head *ele, *next;
-
- if (head == NULL)
- return;
-
- list_for_each_safe(ele, next, &head->list) {
- data = list_entry(ele, struct hppfs_data, list);
- kfree(data);
- }
- kfree(head);
-}
-
-static struct hppfs_data *hppfs_get_data(int fd, int filter,
- struct file *proc_file,
- struct file *hppfs_file,
- loff_t *size_out)
-{
- struct hppfs_data *data, *new, *head;
- int n, err;
-
- err = -ENOMEM;
- data = kmalloc(sizeof(*data), GFP_KERNEL);
- if (data == NULL) {
- printk(KERN_ERR "hppfs_get_data : head allocation failed\n");
- goto failed;
- }
-
- INIT_LIST_HEAD(&data->list);
-
- head = data;
- *size_out = 0;
-
- if (filter) {
- while ((n = read_proc(proc_file, data->contents,
- sizeof(data->contents), NULL, 0)) > 0)
- os_write_file(fd, data->contents, n);
- err = os_shutdown_socket(fd, 0, 1);
- if (err) {
- printk(KERN_ERR "hppfs_get_data : failed to shut down "
- "socket\n");
- goto failed_free;
- }
- }
- while (1) {
- n = os_read_file(fd, data->contents, sizeof(data->contents));
- if (n < 0) {
- err = n;
- printk(KERN_ERR "hppfs_get_data : read failed, "
- "errno = %d\n", err);
- goto failed_free;
- } else if (n == 0)
- break;
-
- *size_out += n;
-
- if (n < sizeof(data->contents))
- break;
-
- new = kmalloc(sizeof(*data), GFP_KERNEL);
- if (new == 0) {
- printk(KERN_ERR "hppfs_get_data : data allocation "
- "failed\n");
- err = -ENOMEM;
- goto failed_free;
- }
-
- INIT_LIST_HEAD(&new->list);
- list_add(&new->list, &data->list);
- data = new;
- }
- return head;
-
- failed_free:
- free_contents(head);
- failed:
- return ERR_PTR(err);
-}
-
-static struct hppfs_private *hppfs_data(void)
-{
- struct hppfs_private *data;
-
- data = kmalloc(sizeof(*data), GFP_KERNEL);
- if (data == NULL)
- return data;
-
- *data = ((struct hppfs_private ) { .host_fd = -1,
- .len = -1,
- .contents = NULL } );
- return data;
-}
-
-static int file_mode(int fmode)
-{
- if (fmode == (FMODE_READ | FMODE_WRITE))
- return O_RDWR;
- if (fmode == FMODE_READ)
- return O_RDONLY;
- if (fmode == FMODE_WRITE)
- return O_WRONLY;
- return 0;
-}
-
-static int hppfs_open(struct inode *inode, struct file *file)
-{
- const struct cred *cred = file->f_cred;
- struct hppfs_private *data;
- struct path path;
- char *host_file;
- int err, fd, type, filter;
-
- err = -ENOMEM;
- data = hppfs_data();
- if (data == NULL)
- goto out;
-
- host_file = dentry_name(file->f_path.dentry, strlen("/rw"));
- if (host_file == NULL)
- goto out_free2;
-
- path.mnt = inode->i_sb->s_fs_info;
- path.dentry = HPPFS_I(inode)->proc_dentry;
-
- data->proc_file = dentry_open(&path, file_mode(file->f_mode), cred);
- err = PTR_ERR(data->proc_file);
- if (IS_ERR(data->proc_file))
- goto out_free1;
-
- type = os_file_type(host_file);
- if (type == OS_TYPE_FILE) {
- fd = os_open_file(host_file, of_read(OPENFLAGS()), 0);
- if (fd >= 0)
- data->host_fd = fd;
- else
- printk(KERN_ERR "hppfs_open : failed to open '%s', "
- "errno = %d\n", host_file, -fd);
-
- data->contents = NULL;
- } else if (type == OS_TYPE_DIR) {
- fd = open_host_sock(host_file, &filter);
- if (fd > 0) {
- data->contents = hppfs_get_data(fd, filter,
- data->proc_file,
- file, &data->len);
- if (!IS_ERR(data->contents))
- data->host_fd = fd;
- } else
- printk(KERN_ERR "hppfs_open : failed to open a socket "
- "in '%s', errno = %d\n", host_file, -fd);
- }
- kfree(host_file);
-
- file->private_data = data;
- return 0;
-
- out_free1:
- kfree(host_file);
- out_free2:
- free_contents(data->contents);
- kfree(data);
- out:
- return err;
-}
-
-static int hppfs_dir_open(struct inode *inode, struct file *file)
-{
- const struct cred *cred = file->f_cred;
- struct hppfs_private *data;
- struct path path;
- int err;
-
- err = -ENOMEM;
- data = hppfs_data();
- if (data == NULL)
- goto out;
-
- path.mnt = inode->i_sb->s_fs_info;
- path.dentry = HPPFS_I(inode)->proc_dentry;
- data->proc_file = dentry_open(&path, file_mode(file->f_mode), cred);
- err = PTR_ERR(data->proc_file);
- if (IS_ERR(data->proc_file))
- goto out_free;
-
- file->private_data = data;
- return 0;
-
- out_free:
- kfree(data);
- out:
- return err;
-}
-
-static loff_t hppfs_llseek(struct file *file, loff_t off, int where)
-{
- struct hppfs_private *data = file->private_data;
- struct file *proc_file = data->proc_file;
- loff_t (*llseek)(struct file *, loff_t, int);
- loff_t ret;
-
- llseek = file_inode(proc_file)->i_fop->llseek;
- if (llseek != NULL) {
- ret = (*llseek)(proc_file, off, where);
- if (ret < 0)
- return ret;
- }
-
- return default_llseek(file, off, where);
-}
-
-static int hppfs_release(struct inode *inode, struct file *file)
-{
- struct hppfs_private *data = file->private_data;
- struct file *proc_file = data->proc_file;
- if (proc_file)
- fput(proc_file);
- kfree(data);
- return 0;
-}
-
-static const struct file_operations hppfs_file_fops = {
- .owner = NULL,
- .llseek = hppfs_llseek,
- .read = hppfs_read,
- .write = hppfs_write,
- .open = hppfs_open,
- .release = hppfs_release,
-};
-
-struct hppfs_dirent {
- struct dir_context ctx;
- struct dir_context *caller;
- struct dentry *dentry;
-};
-
-static int hppfs_filldir(struct dir_context *ctx, const char *name, int size,
- loff_t offset, u64 inode, unsigned int type)
-{
- struct hppfs_dirent *dirent =
- container_of(ctx, struct hppfs_dirent, ctx);
-
- if (file_removed(dirent->dentry, name))
- return 0;
-
- dirent->caller->pos = dirent->ctx.pos;
- return !dir_emit(dirent->caller, name, size, inode, type);
-}
-
-static int hppfs_readdir(struct file *file, struct dir_context *ctx)
-{
- struct hppfs_private *data = file->private_data;
- struct file *proc_file = data->proc_file;
- struct hppfs_dirent d = {
- .ctx.actor = hppfs_filldir,
- .caller = ctx,
- .dentry = file->f_path.dentry
- };
- int err;
- proc_file->f_pos = ctx->pos;
- err = iterate_dir(proc_file, &d.ctx);
- ctx->pos = d.ctx.pos;
- return err;
-}
-
-static const struct file_operations hppfs_dir_fops = {
- .owner = NULL,
- .iterate = hppfs_readdir,
- .open = hppfs_dir_open,
- .llseek = default_llseek,
- .release = hppfs_release,
-};
-
-static int hppfs_statfs(struct dentry *dentry, struct kstatfs *sf)
-{
- sf->f_blocks = 0;
- sf->f_bfree = 0;
- sf->f_bavail = 0;
- sf->f_files = 0;
- sf->f_ffree = 0;
- sf->f_type = HPPFS_SUPER_MAGIC;
- return 0;
-}
-
-static struct inode *hppfs_alloc_inode(struct super_block *sb)
-{
- struct hppfs_inode_info *hi;
-
- hi = kmalloc(sizeof(*hi), GFP_KERNEL);
- if (!hi)
- return NULL;
-
- hi->proc_dentry = NULL;
- inode_init_once(&hi->vfs_inode);
- return &hi->vfs_inode;
-}
-
-void hppfs_evict_inode(struct inode *ino)
-{
- clear_inode(ino);
- dput(HPPFS_I(ino)->proc_dentry);
- mntput(ino->i_sb->s_fs_info);
-}
-
-static void hppfs_i_callback(struct rcu_head *head)
-{
- struct inode *inode = container_of(head, struct inode, i_rcu);
- kfree(HPPFS_I(inode));
-}
-
-static void hppfs_destroy_inode(struct inode *inode)
-{
- call_rcu(&inode->i_rcu, hppfs_i_callback);
-}
-
-static const struct super_operations hppfs_sbops = {
- .alloc_inode = hppfs_alloc_inode,
- .destroy_inode = hppfs_destroy_inode,
- .evict_inode = hppfs_evict_inode,
- .statfs = hppfs_statfs,
-};
-
-static int hppfs_readlink(struct dentry *dentry, char __user *buffer,
- int buflen)
-{
- struct dentry *proc_dentry = HPPFS_I(d_inode(dentry))->proc_dentry;
- return d_inode(proc_dentry)->i_op->readlink(proc_dentry, buffer,
- buflen);
-}
-
-static const char *hppfs_follow_link(struct dentry *dentry, void **cookie)
-{
- struct dentry *proc_dentry = HPPFS_I(d_inode(dentry))->proc_dentry;
-
- return d_inode(proc_dentry)->i_op->follow_link(proc_dentry, cookie);
-}
-
-static void hppfs_put_link(struct inode *inode, void *cookie)
-{
- struct inode *proc_inode = d_inode(HPPFS_I(inode)->proc_dentry);
-
- if (proc_inode->i_op->put_link)
- proc_inode->i_op->put_link(proc_inode, cookie);
-}
-
-static const struct inode_operations hppfs_dir_iops = {
- .lookup = hppfs_lookup,
-};
-
-static const struct inode_operations hppfs_link_iops = {
- .readlink = hppfs_readlink,
- .follow_link = hppfs_follow_link,
- .put_link = hppfs_put_link,
-};
-
-static struct inode *get_inode(struct super_block *sb, struct dentry *dentry)
-{
- struct inode *proc_ino = d_inode(dentry);
- struct inode *inode = new_inode(sb);
-
- if (!inode) {
- dput(dentry);
- return NULL;
- }
-
- if (d_is_dir(dentry)) {
- inode->i_op = &hppfs_dir_iops;
- inode->i_fop = &hppfs_dir_fops;
- } else if (d_is_symlink(dentry)) {
- inode->i_op = &hppfs_link_iops;
- inode->i_fop = &hppfs_file_fops;
- } else {
- inode->i_op = &hppfs_file_iops;
- inode->i_fop = &hppfs_file_fops;
- }
-
- HPPFS_I(inode)->proc_dentry = dentry;
-
- inode->i_uid = proc_ino->i_uid;
- inode->i_gid = proc_ino->i_gid;
- inode->i_atime = proc_ino->i_atime;
- inode->i_mtime = proc_ino->i_mtime;
- inode->i_ctime = proc_ino->i_ctime;
- inode->i_ino = proc_ino->i_ino;
- inode->i_mode = proc_ino->i_mode;
- set_nlink(inode, proc_ino->i_nlink);
- inode->i_size = proc_ino->i_size;
- inode->i_blocks = proc_ino->i_blocks;
-
- return inode;
-}
-
-static int hppfs_fill_super(struct super_block *sb, void *d, int silent)
-{
- struct inode *root_inode;
- struct vfsmount *proc_mnt;
- int err = -ENOENT;
-
- proc_mnt = mntget(task_active_pid_ns(current)->proc_mnt);
- if (IS_ERR(proc_mnt))
- goto out;
-
- sb->s_blocksize = 1024;
- sb->s_blocksize_bits = 10;
- sb->s_magic = HPPFS_SUPER_MAGIC;
- sb->s_op = &hppfs_sbops;
- sb->s_fs_info = proc_mnt;
-
- err = -ENOMEM;
- root_inode = get_inode(sb, dget(proc_mnt->mnt_root));
- sb->s_root = d_make_root(root_inode);
- if (!sb->s_root)
- goto out_mntput;
-
- return 0;
-
- out_mntput:
- mntput(proc_mnt);
- out:
- return(err);
-}
-
-static struct dentry *hppfs_read_super(struct file_system_type *type,
- int flags, const char *dev_name,
- void *data)
-{
- return mount_nodev(type, flags, data, hppfs_fill_super);
-}
-
-static struct file_system_type hppfs_type = {
- .owner = THIS_MODULE,
- .name = "hppfs",
- .mount = hppfs_read_super,
- .kill_sb = kill_anon_super,
- .fs_flags = 0,
-};
-MODULE_ALIAS_FS("hppfs");
-
-static int __init init_hppfs(void)
-{
- return register_filesystem(&hppfs_type);
-}
-
-static void __exit exit_hppfs(void)
-{
- unregister_filesystem(&hppfs_type);
-}
-
-module_init(init_hppfs)
-module_exit(exit_hppfs)
-MODULE_LICENSE("GPL");
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 555f82155be8..52b492721603 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -538,6 +538,7 @@ int seq_dentry(struct seq_file *m, struct dentry *dentry, const char *esc)
return res;
}
+EXPORT_SYMBOL(seq_dentry);
static void *single_start(struct seq_file *p, loff_t *pos)
{
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 516162be1398..f9e9ffe6fb46 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -149,13 +149,27 @@ xfs_alloc_compute_aligned(
{
xfs_agblock_t bno;
xfs_extlen_t len;
+ xfs_extlen_t diff;
/* Trim busy sections out of found extent */
xfs_extent_busy_trim(args, foundbno, foundlen, &bno, &len);
+ /*
+ * If we have a largish extent that happens to start before min_agbno,
+ * see if we can shift it into range...
+ */
+ if (bno < args->min_agbno && bno + len > args->min_agbno) {
+ diff = args->min_agbno - bno;
+ if (len > diff) {
+ bno += diff;
+ len -= diff;
+ }
+ }
+
if (args->alignment > 1 && len >= args->minlen) {
xfs_agblock_t aligned_bno = roundup(bno, args->alignment);
- xfs_extlen_t diff = aligned_bno - bno;
+
+ diff = aligned_bno - bno;
*resbno = aligned_bno;
*reslen = diff >= len ? 0 : len - diff;
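
(Editorial illustration, not part of the patch: the hunk above adds a pre-trim step that shifts a free extent starting below min_agbno forward before the usual alignment round-up. The stand-alone sketch below mirrors that arithmetic with plain integer types and a hypothetical function name so the two adjustments can be checked independently.)

#include <stdint.h>
#include <stdio.h>

/* Illustration only: trim-then-align logic as described in the hunk above. */
static void compute_aligned(uint32_t bno, uint32_t len, uint32_t min_agbno,
                            uint32_t alignment, uint32_t minlen,
                            uint32_t *resbno, uint32_t *reslen)
{
    /* shift the extent forward so it starts at or after min_agbno */
    if (bno < min_agbno && bno + len > min_agbno) {
        uint32_t diff = min_agbno - bno;
        if (len > diff) {
            bno += diff;
            len -= diff;
        }
    }

    if (alignment > 1 && len >= minlen) {
        uint32_t aligned_bno = ((bno + alignment - 1) / alignment) * alignment;
        uint32_t diff = aligned_bno - bno;

        *resbno = aligned_bno;
        *reslen = diff >= len ? 0 : len - diff;
    } else {
        *resbno = bno;
        *reslen = len;
    }
}

int main(void)
{
    uint32_t rb, rl;

    /* extent [6, 26) with min_agbno = 8, alignment = 4: shifted to 8, already aligned */
    compute_aligned(6, 20, 8, 4, 1, &rb, &rl);
    printf("resbno=%u reslen=%u\n", rb, rl);    /* resbno=8 reslen=18 */
    return 0;
}
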
@@ -795,9 +809,13 @@ xfs_alloc_find_best_extent(
* The good extent is closer than this one.
*/
if (!dir) {
+ if (*sbnoa > args->max_agbno)
+ goto out_use_good;
if (*sbnoa >= args->agbno + gdiff)
goto out_use_good;
} else {
+ if (*sbnoa < args->min_agbno)
+ goto out_use_good;
if (*sbnoa <= args->agbno - gdiff)
goto out_use_good;
}
@@ -884,6 +902,17 @@ xfs_alloc_ag_vextent_near(
dofirst = prandom_u32() & 1;
#endif
+ /* handle uninitialized agbno range so caller doesn't have to */
+ if (!args->min_agbno && !args->max_agbno)
+ args->max_agbno = args->mp->m_sb.sb_agblocks - 1;
+ ASSERT(args->min_agbno <= args->max_agbno);
+
+ /* clamp agbno to the range if it's outside */
+ if (args->agbno < args->min_agbno)
+ args->agbno = args->min_agbno;
+ if (args->agbno > args->max_agbno)
+ args->agbno = args->max_agbno;
+
restart:
bno_cur_lt = NULL;
bno_cur_gt = NULL;
@@ -976,6 +1005,8 @@ restart:
&ltbnoa, &ltlena);
if (ltlena < args->minlen)
continue;
+ if (ltbnoa < args->min_agbno || ltbnoa > args->max_agbno)
+ continue;
args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
xfs_alloc_fix_len(args);
ASSERT(args->len >= args->minlen);
@@ -1096,11 +1127,11 @@ restart:
XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
xfs_alloc_compute_aligned(args, ltbno, ltlen,
&ltbnoa, &ltlena);
- if (ltlena >= args->minlen)
+ if (ltlena >= args->minlen && ltbnoa >= args->min_agbno)
break;
if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i)))
goto error0;
- if (!i) {
+ if (!i || ltbnoa < args->min_agbno) {
xfs_btree_del_cursor(bno_cur_lt,
XFS_BTREE_NOERROR);
bno_cur_lt = NULL;
@@ -1112,11 +1143,11 @@ restart:
XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
xfs_alloc_compute_aligned(args, gtbno, gtlen,
&gtbnoa, &gtlena);
- if (gtlena >= args->minlen)
+ if (gtlena >= args->minlen && gtbnoa <= args->max_agbno)
break;
if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
goto error0;
- if (!i) {
+ if (!i || gtbnoa > args->max_agbno) {
xfs_btree_del_cursor(bno_cur_gt,
XFS_BTREE_NOERROR);
bno_cur_gt = NULL;
@@ -1216,6 +1247,7 @@ restart:
ASSERT(ltnew >= ltbno);
ASSERT(ltnew + rlen <= ltbnoa + ltlena);
ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
+ ASSERT(ltnew >= args->min_agbno && ltnew <= args->max_agbno);
args->agbno = ltnew;
if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen,
@@ -1825,11 +1857,11 @@ xfs_alloc_compute_maxlevels(
xfs_extlen_t
xfs_alloc_longest_free_extent(
struct xfs_mount *mp,
- struct xfs_perag *pag)
+ struct xfs_perag *pag,
+ xfs_extlen_t need)
{
- xfs_extlen_t need, delta = 0;
+ xfs_extlen_t delta = 0;
- need = XFS_MIN_FREELIST_PAG(pag, mp);
if (need > pag->pagf_flcount)
delta = need - pag->pagf_flcount;
@@ -1838,131 +1870,150 @@ xfs_alloc_longest_free_extent(
return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
}
+unsigned int
+xfs_alloc_min_freelist(
+ struct xfs_mount *mp,
+ struct xfs_perag *pag)
+{
+ unsigned int min_free;
+
+ /* space needed by-bno freespace btree */
+ min_free = min_t(unsigned int, pag->pagf_levels[XFS_BTNUM_BNOi] + 1,
+ mp->m_ag_maxlevels);
+ /* space needed by-size freespace btree */
+ min_free += min_t(unsigned int, pag->pagf_levels[XFS_BTNUM_CNTi] + 1,
+ mp->m_ag_maxlevels);
+
+ return min_free;
+}
+
+/*
+ * Check if the operation we are fixing up the freelist for should go ahead or
+ * not. If we are freeing blocks, we always allow it, otherwise the allocation
+ * is dependent on whether the size and shape of free space available will
+ * permit the requested allocation to take place.
+ */
+static bool
+xfs_alloc_space_available(
+ struct xfs_alloc_arg *args,
+ xfs_extlen_t min_free,
+ int flags)
+{
+ struct xfs_perag *pag = args->pag;
+ xfs_extlen_t longest;
+ int available;
+
+ if (flags & XFS_ALLOC_FLAG_FREEING)
+ return true;
+
+ /* do we have enough contiguous free space for the allocation? */
+ longest = xfs_alloc_longest_free_extent(args->mp, pag, min_free);
+ if ((args->minlen + args->alignment + args->minalignslop - 1) > longest)
+ return false;
+
+ /* do we have enough free space remaining for the allocation? */
+ available = (int)(pag->pagf_freeblks + pag->pagf_flcount -
+ min_free - args->total);
+ if (available < (int)args->minleft)
+ return false;
+
+ return true;
+}
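
(Editorial sketch, not part of the patch: xfs_alloc_space_available() above folds the two previously open-coded checks into one helper: a contiguous-extent test against the longest free extent after reserving min_free blocks for the freelist, and a total-space test against minleft. The user-space sketch below uses made-up field names and inlines the longest-extent adjustment that the kernel performs in xfs_alloc_longest_free_extent().)

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical per-AG counters, mirroring pagf_freeblks/pagf_flcount/pagf_longest. */
struct ag_counters {
    uint32_t freeblks;  /* free blocks in the AG */
    uint32_t flcount;   /* blocks currently on the AG freelist */
    uint32_t longest;   /* longest free extent */
};

/* Illustration of the availability test; not the kernel helper itself. */
static bool space_available(const struct ag_counters *ag, uint32_t min_free,
                            uint32_t minlen, uint32_t alignment,
                            uint32_t minalignslop, uint32_t total,
                            int32_t minleft, bool freeing)
{
    if (freeing)
        return true;    /* frees are always allowed to proceed */

    /* discount the longest extent by what the freelist refill may consume */
    uint32_t longest = ag->longest;
    if (min_free > ag->flcount) {
        uint32_t delta = min_free - ag->flcount;
        longest = longest > delta ? longest - delta :
                  (ag->flcount > 0 || longest > 0);
    }
    if (minlen + alignment + minalignslop - 1 > longest)
        return false;

    /* enough blocks left over for the rest of the transaction? */
    int32_t available = (int32_t)(ag->freeblks + ag->flcount -
                                  min_free - total);
    return available >= minleft;
}

int main(void)
{
    struct ag_counters ag = { .freeblks = 500, .flcount = 4, .longest = 64 };

    /* min_free=8, minlen=16, alignment=4, slop=0, total=100, minleft=10 */
    printf("%d\n", space_available(&ag, 8, 16, 4, 0, 100, 10, false));  /* 1 */
    return 0;
}
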
+
/*
* Decide whether to use this allocation group for this allocation.
* If so, fix up the btree freelist's size.
*/
STATIC int /* error */
xfs_alloc_fix_freelist(
- xfs_alloc_arg_t *args, /* allocation argument structure */
- int flags) /* XFS_ALLOC_FLAG_... */
+ struct xfs_alloc_arg *args, /* allocation argument structure */
+ int flags) /* XFS_ALLOC_FLAG_... */
{
- xfs_buf_t *agbp; /* agf buffer pointer */
- xfs_agf_t *agf; /* a.g. freespace structure pointer */
- xfs_buf_t *agflbp;/* agfl buffer pointer */
- xfs_agblock_t bno; /* freelist block */
- xfs_extlen_t delta; /* new blocks needed in freelist */
- int error; /* error result code */
- xfs_extlen_t longest;/* longest extent in allocation group */
- xfs_mount_t *mp; /* file system mount point structure */
- xfs_extlen_t need; /* total blocks needed in freelist */
- xfs_perag_t *pag; /* per-ag information structure */
- xfs_alloc_arg_t targs; /* local allocation arguments */
- xfs_trans_t *tp; /* transaction pointer */
-
- mp = args->mp;
+ struct xfs_mount *mp = args->mp;
+ struct xfs_perag *pag = args->pag;
+ struct xfs_trans *tp = args->tp;
+ struct xfs_buf *agbp = NULL;
+ struct xfs_buf *agflbp = NULL;
+ struct xfs_alloc_arg targs; /* local allocation arguments */
+ xfs_agblock_t bno; /* freelist block */
+ xfs_extlen_t need; /* total blocks needed in freelist */
+ int error;
- pag = args->pag;
- tp = args->tp;
if (!pag->pagf_init) {
- if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags,
- &agbp)))
- return error;
+ error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp);
+ if (error)
+ goto out_no_agbp;
if (!pag->pagf_init) {
ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK);
ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
- args->agbp = NULL;
- return 0;
+ goto out_agbp_relse;
}
- } else
- agbp = NULL;
+ }
/*
- * If this is a metadata preferred pag and we are user data
- * then try somewhere else if we are not being asked to
- * try harder at this point
+ * If this is a metadata preferred pag and we are user data then try
+ * somewhere else if we are not being asked to try harder at this
+ * point
*/
if (pag->pagf_metadata && args->userdata &&
(flags & XFS_ALLOC_FLAG_TRYLOCK)) {
ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
- args->agbp = NULL;
- return 0;
+ goto out_agbp_relse;
}
- if (!(flags & XFS_ALLOC_FLAG_FREEING)) {
- /*
- * If it looks like there isn't a long enough extent, or enough
- * total blocks, reject it.
- */
- need = XFS_MIN_FREELIST_PAG(pag, mp);
- longest = xfs_alloc_longest_free_extent(mp, pag);
- if ((args->minlen + args->alignment + args->minalignslop - 1) >
- longest ||
- ((int)(pag->pagf_freeblks + pag->pagf_flcount -
- need - args->total) < (int)args->minleft)) {
- if (agbp)
- xfs_trans_brelse(tp, agbp);
- args->agbp = NULL;
- return 0;
- }
- }
+ need = xfs_alloc_min_freelist(mp, pag);
+ if (!xfs_alloc_space_available(args, need, flags))
+ goto out_agbp_relse;
/*
* Get the a.g. freespace buffer.
* Can fail if we're not blocking on locks, and it's held.
*/
- if (agbp == NULL) {
- if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags,
- &agbp)))
- return error;
- if (agbp == NULL) {
+ if (!agbp) {
+ error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp);
+ if (error)
+ goto out_no_agbp;
+ if (!agbp) {
ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK);
ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
- args->agbp = NULL;
- return 0;
- }
- }
- /*
- * Figure out how many blocks we should have in the freelist.
- */
- agf = XFS_BUF_TO_AGF(agbp);
- need = XFS_MIN_FREELIST(agf, mp);
- /*
- * If there isn't enough total or single-extent, reject it.
- */
- if (!(flags & XFS_ALLOC_FLAG_FREEING)) {
- delta = need > be32_to_cpu(agf->agf_flcount) ?
- (need - be32_to_cpu(agf->agf_flcount)) : 0;
- longest = be32_to_cpu(agf->agf_longest);
- longest = (longest > delta) ? (longest - delta) :
- (be32_to_cpu(agf->agf_flcount) > 0 || longest > 0);
- if ((args->minlen + args->alignment + args->minalignslop - 1) >
- longest ||
- ((int)(be32_to_cpu(agf->agf_freeblks) +
- be32_to_cpu(agf->agf_flcount) - need - args->total) <
- (int)args->minleft)) {
- xfs_trans_brelse(tp, agbp);
- args->agbp = NULL;
- return 0;
+ goto out_no_agbp;
}
}
+
+ /* If there isn't enough total space or single-extent, reject it. */
+ need = xfs_alloc_min_freelist(mp, pag);
+ if (!xfs_alloc_space_available(args, need, flags))
+ goto out_agbp_relse;
+
/*
* Make the freelist shorter if it's too long.
+ *
+ * Note that from this point onwards, we will always release the agf and
+ * agfl buffers on error. This handles the case where we error out and
+ * the buffers are clean or may not have been joined to the transaction
+ * and hence need to be released manually. If they have been joined to
+ * the transaction, then xfs_trans_brelse() will handle them
+ * appropriately based on the recursion count and dirty state of the
+ * buffer.
+ *
+ * XXX (dgc): When we have lots of free space, does this buy us
+ * anything other than extra overhead when we need to put more blocks
+ * back on the free list? Maybe we should only do this when space is
+ * getting low or the AGFL is more than half full?
*/
- while (be32_to_cpu(agf->agf_flcount) > need) {
- xfs_buf_t *bp;
+ while (pag->pagf_flcount > need) {
+ struct xfs_buf *bp;
error = xfs_alloc_get_freelist(tp, agbp, &bno, 0);
if (error)
- return error;
- if ((error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1)))
- return error;
+ goto out_agbp_relse;
+ error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1);
+ if (error)
+ goto out_agbp_relse;
bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0);
xfs_trans_binval(tp, bp);
}
- /*
- * Initialize the args structure.
- */
+
memset(&targs, 0, sizeof(targs));
targs.tp = tp;
targs.mp = mp;
@@ -1971,21 +2022,20 @@ xfs_alloc_fix_freelist(
targs.alignment = targs.minlen = targs.prod = targs.isfl = 1;
targs.type = XFS_ALLOCTYPE_THIS_AG;
targs.pag = pag;
- if ((error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp)))
- return error;
- /*
- * Make the freelist longer if it's too short.
- */
- while (be32_to_cpu(agf->agf_flcount) < need) {
+ error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp);
+ if (error)
+ goto out_agbp_relse;
+
+ /* Make the freelist longer if it's too short. */
+ while (pag->pagf_flcount < need) {
targs.agbno = 0;
- targs.maxlen = need - be32_to_cpu(agf->agf_flcount);
- /*
- * Allocate as many blocks as possible at once.
- */
- if ((error = xfs_alloc_ag_vextent(&targs))) {
- xfs_trans_brelse(tp, agflbp);
- return error;
- }
+ targs.maxlen = need - pag->pagf_flcount;
+
+ /* Allocate as many blocks as possible at once. */
+ error = xfs_alloc_ag_vextent(&targs);
+ if (error)
+ goto out_agflbp_relse;
+
/*
* Stop if we run out. Won't happen if callers are obeying
* the restrictions correctly. Can happen for free calls
@@ -1994,9 +2044,7 @@ xfs_alloc_fix_freelist(
if (targs.agbno == NULLAGBLOCK) {
if (flags & XFS_ALLOC_FLAG_FREEING)
break;
- xfs_trans_brelse(tp, agflbp);
- args->agbp = NULL;
- return 0;
+ goto out_agflbp_relse;
}
/*
* Put each allocated block on the list.
@@ -2005,12 +2053,21 @@ xfs_alloc_fix_freelist(
error = xfs_alloc_put_freelist(tp, agbp,
agflbp, bno, 0);
if (error)
- return error;
+ goto out_agflbp_relse;
}
}
xfs_trans_brelse(tp, agflbp);
args->agbp = agbp;
return 0;
+
+out_agflbp_relse:
+ xfs_trans_brelse(tp, agflbp);
+out_agbp_relse:
+ if (agbp)
+ xfs_trans_brelse(tp, agbp);
+out_no_agbp:
+ args->agbp = NULL;
+ return error;
}
/*
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index d1b4b6a5c894..ca1c8168373a 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -112,6 +112,8 @@ typedef struct xfs_alloc_arg {
xfs_extlen_t total; /* total blocks needed in xaction */
xfs_extlen_t alignment; /* align answer to multiple of this */
xfs_extlen_t minalignslop; /* slop for minlen+alignment calcs */
+ xfs_agblock_t min_agbno; /* set an agbno range for NEAR allocs */
+ xfs_agblock_t max_agbno; /* ... */
xfs_extlen_t len; /* output: actual size of extent */
xfs_alloctype_t type; /* allocation type XFS_ALLOCTYPE_... */
xfs_alloctype_t otype; /* original allocation type */
@@ -128,11 +130,9 @@ typedef struct xfs_alloc_arg {
#define XFS_ALLOC_USERDATA 1 /* allocation is for user data*/
#define XFS_ALLOC_INITIAL_USER_DATA 2 /* special case start of file */
-/*
- * Find the length of the longest extent in an AG.
- */
-xfs_extlen_t
-xfs_alloc_longest_free_extent(struct xfs_mount *mp,
+xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_mount *mp,
+ struct xfs_perag *pag, xfs_extlen_t need);
+unsigned int xfs_alloc_min_freelist(struct xfs_mount *mp,
struct xfs_perag *pag);
/*
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 0a472fbe06d4..3349c9a1e845 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -266,7 +266,7 @@ xfs_attr_set(
tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
error = xfs_trans_reserve(args.trans, &tres, args.total, 0);
if (error) {
- xfs_trans_cancel(args.trans, 0);
+ xfs_trans_cancel(args.trans);
return error;
}
xfs_ilock(dp, XFS_ILOCK_EXCL);
@@ -276,7 +276,7 @@ xfs_attr_set(
XFS_QMOPT_RES_REGBLKS);
if (error) {
xfs_iunlock(dp, XFS_ILOCK_EXCL);
- xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES);
+ xfs_trans_cancel(args.trans);
return error;
}
@@ -320,8 +320,7 @@ xfs_attr_set(
xfs_trans_ichgtime(args.trans, dp,
XFS_ICHGTIME_CHG);
}
- err2 = xfs_trans_commit(args.trans,
- XFS_TRANS_RELEASE_LOG_RES);
+ err2 = xfs_trans_commit(args.trans);
xfs_iunlock(dp, XFS_ILOCK_EXCL);
return error ? error : err2;
@@ -383,16 +382,14 @@ xfs_attr_set(
* Commit the last in the sequence of transactions.
*/
xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
- error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(args.trans);
xfs_iunlock(dp, XFS_ILOCK_EXCL);
return error;
out:
- if (args.trans) {
- xfs_trans_cancel(args.trans,
- XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
- }
+ if (args.trans)
+ xfs_trans_cancel(args.trans);
xfs_iunlock(dp, XFS_ILOCK_EXCL);
return error;
}
@@ -462,7 +459,7 @@ xfs_attr_remove(
error = xfs_trans_reserve(args.trans, &M_RES(mp)->tr_attrrm,
XFS_ATTRRM_SPACE_RES(mp), 0);
if (error) {
- xfs_trans_cancel(args.trans, 0);
+ xfs_trans_cancel(args.trans);
return error;
}
@@ -501,16 +498,14 @@ xfs_attr_remove(
* Commit the last in the sequence of transactions.
*/
xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
- error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(args.trans);
xfs_iunlock(dp, XFS_ILOCK_EXCL);
return error;
out:
- if (args.trans) {
- xfs_trans_cancel(args.trans,
- XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
- }
+ if (args.trans)
+ xfs_trans_cancel(args.trans);
xfs_iunlock(dp, XFS_ILOCK_EXCL);
return error;
}
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index f1026e86dabc..63e05b663380 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -1112,7 +1112,6 @@ xfs_bmap_add_attrfork(
int committed; /* xaction was committed */
int logflags; /* logging flags */
int error; /* error return value */
- int cancel_flags = 0;
ASSERT(XFS_IFORK_Q(ip) == 0);
@@ -1124,17 +1123,15 @@ xfs_bmap_add_attrfork(
tp->t_flags |= XFS_TRANS_RESERVE;
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_addafork, blks, 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return error;
}
- cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
xfs_ilock(ip, XFS_ILOCK_EXCL);
error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ?
XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
XFS_QMOPT_RES_REGBLKS);
if (error)
goto trans_cancel;
- cancel_flags |= XFS_TRANS_ABORT;
if (XFS_IFORK_Q(ip))
goto trans_cancel;
if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) {
@@ -1218,14 +1215,14 @@ xfs_bmap_add_attrfork(
error = xfs_bmap_finish(&tp, &flist, &committed);
if (error)
goto bmap_cancel;
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
bmap_cancel:
xfs_bmap_cancel(&flist);
trans_cancel:
- xfs_trans_cancel(tp, cancel_flags);
+ xfs_trans_cancel(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
@@ -3521,7 +3518,8 @@ xfs_bmap_longest_free_extent(
}
}
- longest = xfs_alloc_longest_free_extent(mp, pag);
+ longest = xfs_alloc_longest_free_extent(mp, pag,
+ xfs_alloc_min_freelist(mp, pag));
if (*blen < longest)
*blen = longest;
@@ -4424,7 +4422,15 @@ xfs_bmapi_convert_unwritten(
error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx,
&bma->cur, mval, bma->firstblock, bma->flist,
&tmp_logflags);
- bma->logflags |= tmp_logflags;
+ /*
+ * Log the inode core unconditionally in the unwritten extent conversion
+ * path because the conversion might not have done so (e.g., if the
+ * extent count hasn't changed). We need to make sure the inode is dirty
+ * in the transaction for the sake of fsync(), even if nothing has
+ * changed, because fsync() will not force the log for this transaction
+ * unless it sees the inode pinned.
+ */
+ bma->logflags |= tmp_logflags | XFS_ILOG_CORE;
if (error)
return error;
@@ -5918,7 +5924,7 @@ xfs_bmap_split_extent(
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return error;
}
@@ -5936,10 +5942,9 @@ xfs_bmap_split_extent(
if (error)
goto out;
- return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-
+ return xfs_trans_commit(tp);
out:
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+ xfs_trans_cancel(tp);
return error;
}
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 4daaa662337b..a0ae572051de 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -170,7 +170,7 @@ typedef struct xfs_sb {
__uint32_t sb_features_log_incompat;
__uint32_t sb_crc; /* superblock crc */
- __uint32_t sb_pad;
+ xfs_extlen_t sb_spino_align; /* sparse inode chunk alignment */
xfs_ino_t sb_pquotino; /* project quota inode */
xfs_lsn_t sb_lsn; /* last write sequence */
@@ -256,7 +256,7 @@ typedef struct xfs_dsb {
__be32 sb_features_log_incompat;
__le32 sb_crc; /* superblock crc */
- __be32 sb_pad;
+ __be32 sb_spino_align; /* sparse inode chunk alignment */
__be64 sb_pquotino; /* project quota inode */
__be64 sb_lsn; /* last write sequence */
@@ -457,8 +457,10 @@ xfs_sb_has_ro_compat_feature(
}
#define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */
+#define XFS_SB_FEAT_INCOMPAT_SPINODES (1 << 1) /* sparse inode chunks */
#define XFS_SB_FEAT_INCOMPAT_ALL \
- (XFS_SB_FEAT_INCOMPAT_FTYPE)
+ (XFS_SB_FEAT_INCOMPAT_FTYPE| \
+ XFS_SB_FEAT_INCOMPAT_SPINODES)
#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL
static inline bool
@@ -506,6 +508,12 @@ static inline int xfs_sb_version_hasfinobt(xfs_sb_t *sbp)
(sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT);
}
+static inline bool xfs_sb_version_hassparseinodes(struct xfs_sb *sbp)
+{
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
+ xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_SPINODES);
+}
+
/*
* end of superblock version macros
*/
@@ -758,19 +766,6 @@ typedef struct xfs_agfl {
#define XFS_AGFL_CRC_OFF offsetof(struct xfs_agfl, agfl_crc)
-
-#define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels)
-#define XFS_MIN_FREELIST_RAW(bl,cl,mp) \
- (MIN(bl + 1, XFS_AG_MAXLEVELS(mp)) + MIN(cl + 1, XFS_AG_MAXLEVELS(mp)))
-#define XFS_MIN_FREELIST(a,mp) \
- (XFS_MIN_FREELIST_RAW( \
- be32_to_cpu((a)->agf_levels[XFS_BTNUM_BNOi]), \
- be32_to_cpu((a)->agf_levels[XFS_BTNUM_CNTi]), mp))
-#define XFS_MIN_FREELIST_PAG(pag,mp) \
- (XFS_MIN_FREELIST_RAW( \
- (unsigned int)(pag)->pagf_levels[XFS_BTNUM_BNOi], \
- (unsigned int)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp))
-
#define XFS_AGB_TO_FSB(mp,agno,agbno) \
(((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno))
#define XFS_FSB_TO_AGNO(mp,fsbno) \
@@ -1216,26 +1211,54 @@ typedef __uint64_t xfs_inofree_t;
#define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1)
#define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i))
+#define XFS_INOBT_HOLEMASK_FULL 0 /* holemask for full chunk */
+#define XFS_INOBT_HOLEMASK_BITS (NBBY * sizeof(__uint16_t))
+#define XFS_INODES_PER_HOLEMASK_BIT \
+ (XFS_INODES_PER_CHUNK / (NBBY * sizeof(__uint16_t)))
+
static inline xfs_inofree_t xfs_inobt_maskn(int i, int n)
{
return ((n >= XFS_INODES_PER_CHUNK ? 0 : XFS_INOBT_MASK(n)) - 1) << i;
}
/*
- * Data record structure
+ * The on-disk inode record structure has two formats. The original "full"
+ * format uses a 4-byte freecount. The "sparse" format uses a 1-byte freecount
+ * and replaces the 3 high-order freecount bytes with the holemask and inode
+ * count.
+ *
+ * The holemask of the sparse record format allows an inode chunk to have holes
+ * that refer to blocks not owned by the inode record. This facilitates inode
+ * allocation in the event of severe free space fragmentation.
*/
typedef struct xfs_inobt_rec {
__be32 ir_startino; /* starting inode number */
- __be32 ir_freecount; /* count of free inodes (set bits) */
+ union {
+ struct {
+ __be32 ir_freecount; /* count of free inodes */
+ } f;
+ struct {
+ __be16 ir_holemask;/* hole mask for sparse chunks */
+ __u8 ir_count; /* total inode count */
+ __u8 ir_freecount; /* count of free inodes */
+ } sp;
+ } ir_u;
__be64 ir_free; /* free inode mask */
} xfs_inobt_rec_t;
typedef struct xfs_inobt_rec_incore {
xfs_agino_t ir_startino; /* starting inode number */
- __int32_t ir_freecount; /* count of free inodes (set bits) */
+ __uint16_t ir_holemask; /* hole mask for sparse chunks */
+ __uint8_t ir_count; /* total inode count */
+ __uint8_t ir_freecount; /* count of free inodes (set bits) */
xfs_inofree_t ir_free; /* free inode mask */
} xfs_inobt_rec_incore_t;
+static inline bool xfs_inobt_issparse(uint16_t holemask)
+{
+ /* non-zero holemask represents a sparse rec. */
+ return holemask;
+}
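
(Editorial sketch, not part of the patch: the comment above describes the two on-disk encodings. Both fit in the same 4 bytes between ir_startino and ir_free, and each holemask bit covers 64 / 16 = 4 inodes. The host-endian struct names below are illustrative stand-ins for the __be/__u fields.)

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define INODES_PER_CHUNK        64
#define HOLEMASK_BITS           16
#define INODES_PER_HOLEMASK_BIT (INODES_PER_CHUNK / HOLEMASK_BITS)

/* The 4 bytes that follow ir_startino, in both encodings (host-endian here). */
union inobt_counts {
    struct {
        uint32_t freecount;     /* "full" format */
    } f;
    struct {
        uint16_t holemask;      /* "sparse" format */
        uint8_t  count;
        uint8_t  freecount;
    } sp;
};

int main(void)
{
    /* both layouts occupy the same space, so the record size is unchanged */
    assert(sizeof(union inobt_counts) == 4);

    /* each holemask bit stands for 4 consecutive inodes of the 64-inode chunk */
    printf("inodes per holemask bit: %d\n", INODES_PER_HOLEMASK_BIT);
    return 0;
}
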
/*
* Key structure
@@ -1453,8 +1476,8 @@ struct xfs_acl {
sizeof(struct xfs_acl_entry) * XFS_ACL_MAX_ENTRIES((mp)))
/* On-disk XFS extended attribute names */
-#define SGI_ACL_FILE (unsigned char *)"SGI_ACL_FILE"
-#define SGI_ACL_DEFAULT (unsigned char *)"SGI_ACL_DEFAULT"
+#define SGI_ACL_FILE "SGI_ACL_FILE"
+#define SGI_ACL_DEFAULT "SGI_ACL_DEFAULT"
#define SGI_ACL_FILE_SIZE (sizeof(SGI_ACL_FILE)-1)
#define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1)
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 18dc721ca19f..89689c6a43e2 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -239,6 +239,7 @@ typedef struct xfs_fsop_resblks {
#define XFS_FSOP_GEOM_FLAGS_V5SB 0x8000 /* version 5 superblock */
#define XFS_FSOP_GEOM_FLAGS_FTYPE 0x10000 /* inode directory types */
#define XFS_FSOP_GEOM_FLAGS_FINOBT 0x20000 /* free inode btree */
+#define XFS_FSOP_GEOM_FLAGS_SPINODES 0x40000 /* sparse inode chunks */
/*
* Minimum and maximum sizes need for growth checks.
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 1c9e75521250..66efc702452a 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -65,6 +65,8 @@ xfs_inobt_lookup(
int *stat) /* success/failure */
{
cur->bc_rec.i.ir_startino = ino;
+ cur->bc_rec.i.ir_holemask = 0;
+ cur->bc_rec.i.ir_count = 0;
cur->bc_rec.i.ir_freecount = 0;
cur->bc_rec.i.ir_free = 0;
return xfs_btree_lookup(cur, dir, stat);
@@ -82,7 +84,14 @@ xfs_inobt_update(
union xfs_btree_rec rec;
rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino);
- rec.inobt.ir_freecount = cpu_to_be32(irec->ir_freecount);
+ if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) {
+ rec.inobt.ir_u.sp.ir_holemask = cpu_to_be16(irec->ir_holemask);
+ rec.inobt.ir_u.sp.ir_count = irec->ir_count;
+ rec.inobt.ir_u.sp.ir_freecount = irec->ir_freecount;
+ } else {
+ /* ir_holemask/ir_count not supported on-disk */
+ rec.inobt.ir_u.f.ir_freecount = cpu_to_be32(irec->ir_freecount);
+ }
rec.inobt.ir_free = cpu_to_be64(irec->ir_free);
return xfs_btree_update(cur, &rec);
}
@@ -100,12 +109,27 @@ xfs_inobt_get_rec(
int error;
error = xfs_btree_get_rec(cur, &rec, stat);
- if (!error && *stat == 1) {
- irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino);
- irec->ir_freecount = be32_to_cpu(rec->inobt.ir_freecount);
- irec->ir_free = be64_to_cpu(rec->inobt.ir_free);
+ if (error || *stat == 0)
+ return error;
+
+ irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino);
+ if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) {
+ irec->ir_holemask = be16_to_cpu(rec->inobt.ir_u.sp.ir_holemask);
+ irec->ir_count = rec->inobt.ir_u.sp.ir_count;
+ irec->ir_freecount = rec->inobt.ir_u.sp.ir_freecount;
+ } else {
+ /*
+ * ir_holemask/ir_count not supported on-disk. Fill in hardcoded
+ * values for full inode chunks.
+ */
+ irec->ir_holemask = XFS_INOBT_HOLEMASK_FULL;
+ irec->ir_count = XFS_INODES_PER_CHUNK;
+ irec->ir_freecount =
+ be32_to_cpu(rec->inobt.ir_u.f.ir_freecount);
}
- return error;
+ irec->ir_free = be64_to_cpu(rec->inobt.ir_free);
+
+ return 0;
}
/*
@@ -114,10 +138,14 @@ xfs_inobt_get_rec(
STATIC int
xfs_inobt_insert_rec(
struct xfs_btree_cur *cur,
+ __uint16_t holemask,
+ __uint8_t count,
__int32_t freecount,
xfs_inofree_t free,
int *stat)
{
+ cur->bc_rec.i.ir_holemask = holemask;
+ cur->bc_rec.i.ir_count = count;
cur->bc_rec.i.ir_freecount = freecount;
cur->bc_rec.i.ir_free = free;
return xfs_btree_insert(cur, stat);
@@ -154,7 +182,9 @@ xfs_inobt_insert(
}
ASSERT(i == 0);
- error = xfs_inobt_insert_rec(cur, XFS_INODES_PER_CHUNK,
+ error = xfs_inobt_insert_rec(cur, XFS_INOBT_HOLEMASK_FULL,
+ XFS_INODES_PER_CHUNK,
+ XFS_INODES_PER_CHUNK,
XFS_INOBT_ALL_FREE, &i);
if (error) {
xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
@@ -220,6 +250,7 @@ xfs_ialloc_inode_init(
struct xfs_mount *mp,
struct xfs_trans *tp,
struct list_head *buffer_list,
+ int icount,
xfs_agnumber_t agno,
xfs_agblock_t agbno,
xfs_agblock_t length,
@@ -275,7 +306,7 @@ xfs_ialloc_inode_init(
* they track in the AIL as if they were physically logged.
*/
if (tp)
- xfs_icreate_log(tp, agno, agbno, mp->m_ialloc_inos,
+ xfs_icreate_log(tp, agno, agbno, icount,
mp->m_sb.sb_inodesize, length, gen);
} else
version = 2;
@@ -347,6 +378,214 @@ xfs_ialloc_inode_init(
}
/*
+ * Align startino and allocmask for a recently allocated sparse chunk such that
+ * they are fit for insertion (or merge) into the on-disk inode btrees.
+ *
+ * Background:
+ *
+ * When enabled, sparse inode support increases the inode alignment from cluster
+ * size to inode chunk size. This means that the minimum range between two
+ * non-adjacent inode records in the inobt is large enough for a full inode
+ * record. This allows for cluster sized, cluster aligned block allocation
+ * without need to worry about whether the resulting inode record overlaps with
+ * another record in the tree. Without this basic rule, we would have to deal
+ * with the consequences of overlap by potentially undoing recent allocations in
+ * the inode allocation codepath.
+ *
+ * Because of this alignment rule (which is enforced on mount), there are two
+ * inobt possibilities for newly allocated sparse chunks. One is that the
+ * aligned inode record for the chunk covers a range of inodes not already
+ * covered in the inobt (i.e., it is safe to insert a new sparse record). The
+ * other is that a record already exists at the aligned startino that considers
+ * the newly allocated range as sparse. In the latter case, record content is
+ * merged in the hope that sparse inode chunks fill to full chunks over time.
+ */
+STATIC void
+xfs_align_sparse_ino(
+ struct xfs_mount *mp,
+ xfs_agino_t *startino,
+ uint16_t *allocmask)
+{
+ xfs_agblock_t agbno;
+ xfs_agblock_t mod;
+ int offset;
+
+ agbno = XFS_AGINO_TO_AGBNO(mp, *startino);
+ mod = agbno % mp->m_sb.sb_inoalignmt;
+ if (!mod)
+ return;
+
+ /* calculate the inode offset and align startino */
+ offset = mod << mp->m_sb.sb_inopblog;
+ *startino -= offset;
+
+ /*
+ * Since startino has been aligned down, left shift allocmask such that
+ * it continues to represent the same physical inodes relative to the
+ * new startino.
+ */
+ *allocmask <<= offset / XFS_INODES_PER_HOLEMASK_BIT;
+}
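
(Editorial sketch, not part of the patch: xfs_align_sparse_ino() rounds the start of a freshly allocated sparse chunk down to the full-chunk alignment and shifts the allocation mask by the same number of inodes. A worked example with plain integers and a hypothetical helper; the geometry assumed is inopblog = 4, i.e. 16 inodes per block, and inoalignmt = 4 blocks, i.e. one 64-inode chunk.)

#include <stdint.h>
#include <stdio.h>

#define INODES_PER_HOLEMASK_BIT 4   /* 64 inodes / 16 holemask bits */

/* Illustration of the alignment step; parameters stand in for sb fields. */
static void align_sparse_ino(uint32_t inoalignmt, uint32_t inopblog,
                             uint32_t *startino, uint16_t *allocmask)
{
    uint32_t agbno = *startino >> inopblog; /* AGINO -> AGBNO */
    uint32_t mod = agbno % inoalignmt;

    if (!mod)
        return;

    /* number of inodes between the aligned start and the allocated start */
    uint32_t offset = mod << inopblog;

    *startino -= offset;
    *allocmask <<= offset / INODES_PER_HOLEMASK_BIT;
}

int main(void)
{
    uint32_t startino = 160;    /* a 2-block allocation landed at agbno 10 */
    uint16_t allocmask = 0x00ff;    /* 2 blocks = 32 inodes allocated */

    align_sparse_ino(4, 4, &startino, &allocmask);
    printf("startino=%u allocmask=0x%04x\n", startino, (unsigned)allocmask);
    /* startino=128 allocmask=0xff00: same physical inodes, chunk-aligned record */
    return 0;
}
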
+
+/*
+ * Determine whether the source inode record can merge into the target. Both
+ * records must be sparse, the inode ranges must match and there must be no
+ * allocation overlap between the records.
+ */
+STATIC bool
+__xfs_inobt_can_merge(
+ struct xfs_inobt_rec_incore *trec, /* tgt record */
+ struct xfs_inobt_rec_incore *srec) /* src record */
+{
+ uint64_t talloc;
+ uint64_t salloc;
+
+ /* records must cover the same inode range */
+ if (trec->ir_startino != srec->ir_startino)
+ return false;
+
+ /* both records must be sparse */
+ if (!xfs_inobt_issparse(trec->ir_holemask) ||
+ !xfs_inobt_issparse(srec->ir_holemask))
+ return false;
+
+ /* both records must track some inodes */
+ if (!trec->ir_count || !srec->ir_count)
+ return false;
+
+ /* can't exceed capacity of a full record */
+ if (trec->ir_count + srec->ir_count > XFS_INODES_PER_CHUNK)
+ return false;
+
+ /* verify there is no allocation overlap */
+ talloc = xfs_inobt_irec_to_allocmask(trec);
+ salloc = xfs_inobt_irec_to_allocmask(srec);
+ if (talloc & salloc)
+ return false;
+
+ return true;
+}
+
+/*
+ * Merge the source inode record into the target. The caller must call
+ * __xfs_inobt_can_merge() to ensure the merge is valid.
+ */
+STATIC void
+__xfs_inobt_rec_merge(
+ struct xfs_inobt_rec_incore *trec, /* target */
+ struct xfs_inobt_rec_incore *srec) /* src */
+{
+ ASSERT(trec->ir_startino == srec->ir_startino);
+
+ /* combine the counts */
+ trec->ir_count += srec->ir_count;
+ trec->ir_freecount += srec->ir_freecount;
+
+ /*
+ * Merge the holemask and free mask. For both fields, 0 bits refer to
+ * allocated inodes. We combine the allocated ranges with bitwise AND.
+ */
+ trec->ir_holemask &= srec->ir_holemask;
+ trec->ir_free &= srec->ir_free;
+}
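
(Editorial sketch, not part of the patch: taken together, __xfs_inobt_can_merge() and __xfs_inobt_rec_merge() say that two sparse records over the same startino merge by adding the counts and AND-ing the holemask and free mask, since 0 bits mean "allocated" in both fields. A toy example with two freshly allocated, non-overlapping halves of a 64-inode chunk; the struct is a simplified stand-in for xfs_inobt_rec_incore.)

#include <stdint.h>
#include <stdio.h>

/* Minimal in-core record, just for the example. */
struct irec {
    uint32_t startino;
    uint16_t holemask;  /* 0 bit = 4 inodes physically allocated */
    uint8_t  count;     /* physically allocated inodes */
    uint8_t  freecount; /* free inodes among them */
    uint64_t free;      /* 1 bit = inode not in use */
};

int main(void)
{
    /* two freshly allocated sparse halves of the same chunk; all inodes free */
    struct irec tgt = { 128, 0xff00, 32, 32, ~0ULL };   /* low 32 inodes exist */
    struct irec src = { 128, 0x00ff, 32, 32, ~0ULL };   /* high 32 inodes exist */

    /* the can-merge conditions hold: same startino, both sparse, counts fit,
       and the allocated (0-bit) regions of the two holemasks do not overlap */

    /* merge: add counts, AND the masks (0 = allocated in both fields) */
    tgt.count += src.count;
    tgt.freecount += src.freecount;
    tgt.holemask &= src.holemask;
    tgt.free &= src.free;

    printf("count=%d freecount=%d holemask=0x%04x\n",
           tgt.count, tgt.freecount, (unsigned)tgt.holemask);
    /* count=64 freecount=64 holemask=0x0000: the chunk has filled to full */
    return 0;
}
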
+
+/*
+ * Insert a new sparse inode chunk into the associated inode btree. The inode
+ * record for the sparse chunk is pre-aligned to a startino that should match
+ * any pre-existing sparse inode record in the tree. This allows sparse chunks
+ * to fill over time.
+ *
+ * This function supports two modes of handling preexisting records depending on
+ * the merge flag. If merge is true, the provided record is merged with the
+ * existing record and updated in place. The merged record is returned in nrec.
+ * If merge is false, an existing record is replaced with the provided record.
+ * If no preexisting record exists, the provided record is always inserted.
+ *
+ * It is considered corruption if a merge is requested and not possible. Given
+ * the sparse inode alignment constraints, this should never happen.
+ */
+STATIC int
+xfs_inobt_insert_sprec(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ int btnum,
+ struct xfs_inobt_rec_incore *nrec, /* in/out: new/merged rec. */
+ bool merge) /* merge or replace */
+{
+ struct xfs_btree_cur *cur;
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+ xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
+ int error;
+ int i;
+ struct xfs_inobt_rec_incore rec;
+
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum);
+
+ /* the new record is pre-aligned so we know where to look */
+ error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i);
+ if (error)
+ goto error;
+ /* if nothing there, insert a new record and return */
+ if (i == 0) {
+ error = xfs_inobt_insert_rec(cur, nrec->ir_holemask,
+ nrec->ir_count, nrec->ir_freecount,
+ nrec->ir_free, &i);
+ if (error)
+ goto error;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error);
+
+ goto out;
+ }
+
+ /*
+ * A record exists at this startino. Merge or replace the record
+ * depending on what we've been asked to do.
+ */
+ if (merge) {
+ error = xfs_inobt_get_rec(cur, &rec, &i);
+ if (error)
+ goto error;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error);
+ XFS_WANT_CORRUPTED_GOTO(mp,
+ rec.ir_startino == nrec->ir_startino,
+ error);
+
+ /*
+ * This should never fail. If we have coexisting records that
+ * cannot merge, something is seriously wrong.
+ */
+ XFS_WANT_CORRUPTED_GOTO(mp, __xfs_inobt_can_merge(nrec, &rec),
+ error);
+
+ trace_xfs_irec_merge_pre(mp, agno, rec.ir_startino,
+ rec.ir_holemask, nrec->ir_startino,
+ nrec->ir_holemask);
+
+ /* merge to nrec to output the updated record */
+ __xfs_inobt_rec_merge(nrec, &rec);
+
+ trace_xfs_irec_merge_post(mp, agno, nrec->ir_startino,
+ nrec->ir_holemask);
+
+ error = xfs_inobt_rec_check_count(mp, nrec);
+ if (error)
+ goto error;
+ }
+
+ error = xfs_inobt_update(cur, nrec);
+ if (error)
+ goto error;
+
+out:
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ return 0;
+error:
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ return error;
+}
+
+/*
* Allocate new inodes in the allocation group specified by agbp.
* Return 0 for success, else error code.
*/
@@ -364,11 +603,22 @@ xfs_ialloc_ag_alloc(
xfs_agino_t newlen; /* new number of inodes */
int isaligned = 0; /* inode allocation at stripe unit */
/* boundary */
+ uint16_t allocmask = (uint16_t) -1; /* init. to full chunk */
+ struct xfs_inobt_rec_incore rec;
struct xfs_perag *pag;
+ int do_sparse = 0;
memset(&args, 0, sizeof(args));
args.tp = tp;
args.mp = tp->t_mountp;
+ args.fsbno = NULLFSBLOCK;
+
+#ifdef DEBUG
+ /* randomly do sparse inode allocations */
+ if (xfs_sb_version_hassparseinodes(&tp->t_mountp->m_sb) &&
+ args.mp->m_ialloc_min_blks < args.mp->m_ialloc_blks)
+ do_sparse = prandom_u32() & 1;
+#endif
/*
* Locking will ensure that we don't have two callers in here
@@ -390,6 +640,8 @@ xfs_ialloc_ag_alloc(
agno = be32_to_cpu(agi->agi_seqno);
args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
args.mp->m_ialloc_blks;
+ if (do_sparse)
+ goto sparse_alloc;
if (likely(newino != NULLAGINO &&
(args.agbno < be32_to_cpu(agi->agi_length)))) {
args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
@@ -428,8 +680,7 @@ xfs_ialloc_ag_alloc(
* subsequent requests.
*/
args.minalignslop = 0;
- } else
- args.fsbno = NULLFSBLOCK;
+ }
if (unlikely(args.fsbno == NULLFSBLOCK)) {
/*
@@ -480,6 +731,47 @@ xfs_ialloc_ag_alloc(
return error;
}
+ /*
+ * Finally, try a sparse allocation if the filesystem supports it and
+ * the sparse allocation length is smaller than a full chunk.
+ */
+ if (xfs_sb_version_hassparseinodes(&args.mp->m_sb) &&
+ args.mp->m_ialloc_min_blks < args.mp->m_ialloc_blks &&
+ args.fsbno == NULLFSBLOCK) {
+sparse_alloc:
+ args.type = XFS_ALLOCTYPE_NEAR_BNO;
+ args.agbno = be32_to_cpu(agi->agi_root);
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
+ args.alignment = args.mp->m_sb.sb_spino_align;
+ args.prod = 1;
+
+ args.minlen = args.mp->m_ialloc_min_blks;
+ args.maxlen = args.minlen;
+
+ /*
+ * The inode record will be aligned to full chunk size. We must
+ * prevent sparse allocation from AG boundaries that result in
+ * invalid inode records, such as records that start at agbno 0
+ * or extend beyond the AG.
+ *
+ * Set min agbno to the first aligned, non-zero agbno and max to
+ * the last aligned agbno that is at least one full chunk from
+ * the end of the AG.
+ */
+ args.min_agbno = args.mp->m_sb.sb_inoalignmt;
+ args.max_agbno = round_down(args.mp->m_sb.sb_agblocks,
+ args.mp->m_sb.sb_inoalignmt) -
+ args.mp->m_ialloc_blks;
+
+ error = xfs_alloc_vextent(&args);
+ if (error)
+ return error;
+
+ newlen = args.len << args.mp->m_sb.sb_inopblog;
+ ASSERT(newlen <= XFS_INODES_PER_CHUNK);
+ allocmask = (1 << (newlen / XFS_INODES_PER_HOLEMASK_BIT)) - 1;
+ }
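
(Editorial sketch, not part of the patch: the min_agbno/max_agbno bounds above keep a sparse allocation far enough from both AG ends that the chunk-aligned record it produces stays inside the AG. With example geometry, inoalignmt = 8 blocks, ialloc_blks = 8, agblocks = 1000, the allowed window works out as below; the values are illustrative, not taken from any superblock.)

#include <stdint.h>
#include <stdio.h>

static uint32_t round_down(uint32_t x, uint32_t align)
{
    return x - x % align;
}

int main(void)
{
    uint32_t inoalignmt = 8;    /* example full-chunk alignment, in blocks */
    uint32_t ialloc_blks = 8;   /* blocks per full inode chunk */
    uint32_t agblocks = 1000;   /* example AG size */

    uint32_t min_agbno = inoalignmt;    /* skip agbno 0 */
    uint32_t max_agbno = round_down(agblocks, inoalignmt) - ialloc_blks;

    printf("sparse allocs constrained to [%u, %u]\n", min_agbno, max_agbno);
    /* [8, 992]: an aligned record starting in this window never crosses the AG end */
    return 0;
}
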
+
if (args.fsbno == NULLFSBLOCK) {
*alloc = 0;
return 0;
@@ -495,8 +787,8 @@ xfs_ialloc_ag_alloc(
* rather than a linear progression to prevent the next generation
* number from being easily guessable.
*/
- error = xfs_ialloc_inode_init(args.mp, tp, NULL, agno, args.agbno,
- args.len, prandom_u32());
+ error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, agno,
+ args.agbno, args.len, prandom_u32());
if (error)
return error;
@@ -504,6 +796,73 @@ xfs_ialloc_ag_alloc(
* Convert the results.
*/
newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
+
+ if (xfs_inobt_issparse(~allocmask)) {
+ /*
+ * We've allocated a sparse chunk. Align the startino and mask.
+ */
+ xfs_align_sparse_ino(args.mp, &newino, &allocmask);
+
+ rec.ir_startino = newino;
+ rec.ir_holemask = ~allocmask;
+ rec.ir_count = newlen;
+ rec.ir_freecount = newlen;
+ rec.ir_free = XFS_INOBT_ALL_FREE;
+
+ /*
+ * Insert the sparse record into the inobt and allow for a merge
+ * if necessary. If a merge does occur, rec is updated to the
+ * merged record.
+ */
+ error = xfs_inobt_insert_sprec(args.mp, tp, agbp, XFS_BTNUM_INO,
+ &rec, true);
+ if (error == -EFSCORRUPTED) {
+ xfs_alert(args.mp,
+ "invalid sparse inode record: ino 0x%llx holemask 0x%x count %u",
+ XFS_AGINO_TO_INO(args.mp, agno,
+ rec.ir_startino),
+ rec.ir_holemask, rec.ir_count);
+ xfs_force_shutdown(args.mp, SHUTDOWN_CORRUPT_INCORE);
+ }
+ if (error)
+ return error;
+
+ /*
+ * We can't merge the part we've just allocated as for the inobt
+ * due to finobt semantics. The original record may or may not
+ * exist independent of whether physical inodes exist in this
+ * sparse chunk.
+ *
+ * We must update the finobt record based on the inobt record.
+ * rec contains the fully merged and up to date inobt record
+ * from the previous call. Set merge false to replace any
+ * existing record with this one.
+ */
+ if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
+ error = xfs_inobt_insert_sprec(args.mp, tp, agbp,
+ XFS_BTNUM_FINO, &rec,
+ false);
+ if (error)
+ return error;
+ }
+ } else {
+ /* full chunk - insert new records to both btrees */
+ error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
+ XFS_BTNUM_INO);
+ if (error)
+ return error;
+
+ if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
+ error = xfs_inobt_insert(args.mp, tp, agbp, newino,
+ newlen, XFS_BTNUM_FINO);
+ if (error)
+ return error;
+ }
+ }
+
+ /*
+ * Update AGI counts and newino.
+ */
be32_add_cpu(&agi->agi_count, newlen);
be32_add_cpu(&agi->agi_freecount, newlen);
pag = xfs_perag_get(args.mp, agno);
@@ -512,20 +871,6 @@ xfs_ialloc_ag_alloc(
agi->agi_newino = cpu_to_be32(newino);
/*
- * Insert records describing the new inode chunk into the btrees.
- */
- error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
- XFS_BTNUM_INO);
- if (error)
- return error;
-
- if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
- error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
- XFS_BTNUM_FINO);
- if (error)
- return error;
- }
- /*
* Log allocation group header fields
*/
xfs_ialloc_log_agi(tp, agbp,
@@ -645,7 +990,7 @@ xfs_ialloc_ag_select(
* if we fail allocation due to alignment issues then it is most
* likely a real ENOSPC condition.
*/
- ineed = mp->m_ialloc_blks;
+ ineed = mp->m_ialloc_min_blks;
if (flags && ineed > 1)
ineed += xfs_ialloc_cluster_alignment(mp);
longest = pag->pagf_longest;
@@ -732,6 +1077,27 @@ xfs_ialloc_get_rec(
}
/*
+ * Return the offset of the first free inode in the record. If the inode chunk
+ * is sparsely allocated, we convert the record holemask to inode granularity
+ * and mask off the unallocated regions from the inode free mask.
+ */
+STATIC int
+xfs_inobt_first_free_inode(
+ struct xfs_inobt_rec_incore *rec)
+{
+ xfs_inofree_t realfree;
+
+ /* if there are no holes, return the first available offset */
+ if (!xfs_inobt_issparse(rec->ir_holemask))
+ return xfs_lowbit64(rec->ir_free);
+
+ realfree = xfs_inobt_irec_to_allocmask(rec);
+ realfree &= rec->ir_free;
+
+ return xfs_lowbit64(realfree);
+}
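
(Editorial sketch, not part of the patch: xfs_inobt_first_free_inode() masks ir_free with the physical allocation bitmap before taking the lowest set bit, so a "free" bit that merely covers a hole is never handed out. The example below assumes a record whose low half is a hole; lowbit is a plain-loop stand-in for xfs_lowbit64.)

#include <stdint.h>
#include <stdio.h>

/* Lowest set bit index, -1 if none (stand-in for xfs_lowbit64). */
static int lowbit64(uint64_t v)
{
    for (int i = 0; i < 64; i++)
        if (v & (1ULL << i))
            return i;
    return -1;
}

int main(void)
{
    uint64_t free      = ~0ULL;                     /* record says all 64 free */
    uint64_t allocmask = 0xffffffff00000000ULL;     /* only the high half exists */

    printf("naive  = %d\n", lowbit64(free));                /* 0: points into a hole */
    printf("masked = %d\n", lowbit64(free & allocmask));    /* 32: first real inode */
    return 0;
}
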
+
+/*
* Allocate an inode using the inobt-only algorithm.
*/
STATIC int
@@ -961,7 +1327,7 @@ newino:
}
alloc_inode:
- offset = xfs_lowbit64(rec.ir_free);
+ offset = xfs_inobt_first_free_inode(&rec);
ASSERT(offset >= 0);
ASSERT(offset < XFS_INODES_PER_CHUNK);
ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
@@ -1210,7 +1576,7 @@ xfs_dialloc_ag(
if (error)
goto error_cur;
- offset = xfs_lowbit64(rec.ir_free);
+ offset = xfs_inobt_first_free_inode(&rec);
ASSERT(offset >= 0);
ASSERT(offset < XFS_INODES_PER_CHUNK);
ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
@@ -1439,6 +1805,83 @@ out_error:
return error;
}
+/*
+ * Free the blocks of an inode chunk. We must consider that the inode chunk
+ * might be sparse and only free the regions that are allocated as part of the
+ * chunk.
+ */
+STATIC void
+xfs_difree_inode_chunk(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ struct xfs_inobt_rec_incore *rec,
+ struct xfs_bmap_free *flist)
+{
+ xfs_agblock_t sagbno = XFS_AGINO_TO_AGBNO(mp, rec->ir_startino);
+ int startidx, endidx;
+ int nextbit;
+ xfs_agblock_t agbno;
+ int contigblk;
+ DECLARE_BITMAP(holemask, XFS_INOBT_HOLEMASK_BITS);
+
+ if (!xfs_inobt_issparse(rec->ir_holemask)) {
+ /* not sparse, calculate extent info directly */
+ xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno,
+ XFS_AGINO_TO_AGBNO(mp, rec->ir_startino)),
+ mp->m_ialloc_blks, flist, mp);
+ return;
+ }
+
+ /* holemask is only 16 bits (fits in an unsigned long) */
+ ASSERT(sizeof(rec->ir_holemask) <= sizeof(holemask[0]));
+ holemask[0] = rec->ir_holemask;
+
+ /*
+ * Find contiguous ranges of zeroes (i.e., allocated regions) in the
+ * holemask and convert the start/end index of each range to an extent.
+ * We start with the start and end index both pointing at the first 0 in
+ * the mask.
+ */
+ startidx = endidx = find_first_zero_bit(holemask,
+ XFS_INOBT_HOLEMASK_BITS);
+ nextbit = startidx + 1;
+ while (startidx < XFS_INOBT_HOLEMASK_BITS) {
+ nextbit = find_next_zero_bit(holemask, XFS_INOBT_HOLEMASK_BITS,
+ nextbit);
+ /*
+ * If the next zero bit is contiguous, update the end index of
+ * the current range and continue.
+ */
+ if (nextbit != XFS_INOBT_HOLEMASK_BITS &&
+ nextbit == endidx + 1) {
+ endidx = nextbit;
+ goto next;
+ }
+
+ /*
+ * nextbit is not contiguous with the current end index. Convert
+ * the current start/end to an extent and add it to the free
+ * list.
+ */
+ agbno = sagbno + (startidx * XFS_INODES_PER_HOLEMASK_BIT) /
+ mp->m_sb.sb_inopblock;
+ contigblk = ((endidx - startidx + 1) *
+ XFS_INODES_PER_HOLEMASK_BIT) /
+ mp->m_sb.sb_inopblock;
+
+ ASSERT(agbno % mp->m_sb.sb_spino_align == 0);
+ ASSERT(contigblk % mp->m_sb.sb_spino_align == 0);
+ xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, agbno), contigblk,
+ flist, mp);
+
+ /* reset range to current bit and carry on... */
+ startidx = endidx = nextbit;
+
+next:
+ nextbit++;
+ }
+}
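
(Editorial sketch, not part of the patch: xfs_difree_inode_chunk() walks runs of zero bits, i.e. allocated regions, in the holemask and turns each run into one extent-free call. The user-space rendering below replaces find_next_zero_bit() with a plain loop and assumes 16 inodes per block; the holemask in the example has two allocated regions.)

#include <stdint.h>
#include <stdio.h>

#define HOLEMASK_BITS           16
#define INODES_PER_HOLEMASK_BIT 4
#define INODES_PER_BLOCK        16  /* example sb_inopblock */

/* Print the block extents covered by the allocated (0-bit) holemask runs. */
static void free_chunk_extents(uint32_t sagbno, uint16_t holemask)
{
    int startidx = -1;

    for (int idx = 0; idx <= HOLEMASK_BITS; idx++) {
        int hole = (idx == HOLEMASK_BITS) || (holemask & (1U << idx));

        if (!hole && startidx < 0) {
            startidx = idx;     /* a run of allocated bits begins */
        } else if (hole && startidx >= 0) {
            /* convert [startidx, idx) back to a block extent */
            uint32_t agbno = sagbno +
                startidx * INODES_PER_HOLEMASK_BIT / INODES_PER_BLOCK;
            uint32_t blocks = (idx - startidx) *
                INODES_PER_HOLEMASK_BIT / INODES_PER_BLOCK;

            printf("free agbno=%u blocks=%u\n", agbno, blocks);
            startidx = -1;
        }
    }
}

int main(void)
{
    /* bits 4-7 and 12-15 are holes: two allocated regions remain */
    free_chunk_extents(128, 0xf0f0);
    /* free agbno=128 blocks=1
       free agbno=130 blocks=1 */
    return 0;
}
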
+
STATIC int
xfs_difree_inobt(
struct xfs_mount *mp,
@@ -1446,8 +1889,7 @@ xfs_difree_inobt(
struct xfs_buf *agbp,
xfs_agino_t agino,
struct xfs_bmap_free *flist,
- int *deleted,
- xfs_ino_t *first_ino,
+ struct xfs_icluster *xic,
struct xfs_inobt_rec_incore *orec)
{
struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
@@ -1501,20 +1943,23 @@ xfs_difree_inobt(
rec.ir_freecount++;
/*
- * When an inode cluster is free, it becomes eligible for removal
+ * When an inode chunk is free, it becomes eligible for removal. Don't
+ * remove the chunk if the block size is large enough for multiple inode
+ * chunks (that might not be free).
*/
if (!(mp->m_flags & XFS_MOUNT_IKEEP) &&
- (rec.ir_freecount == mp->m_ialloc_inos)) {
-
- *deleted = 1;
- *first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino);
+ rec.ir_free == XFS_INOBT_ALL_FREE &&
+ mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) {
+ xic->deleted = 1;
+ xic->first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino);
+ xic->alloc = xfs_inobt_irec_to_allocmask(&rec);
/*
* Remove the inode cluster from the AGI B+Tree, adjust the
* AGI and Superblock inode counts, and mark the disk space
* to be freed when the transaction is committed.
*/
- ilen = mp->m_ialloc_inos;
+ ilen = rec.ir_freecount;
be32_add_cpu(&agi->agi_count, -ilen);
be32_add_cpu(&agi->agi_freecount, -(ilen - 1));
xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
@@ -1530,11 +1975,9 @@ xfs_difree_inobt(
goto error0;
}
- xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno,
- XFS_AGINO_TO_AGBNO(mp, rec.ir_startino)),
- mp->m_ialloc_blks, flist, mp);
+ xfs_difree_inode_chunk(mp, agno, &rec, flist);
} else {
- *deleted = 0;
+ xic->deleted = 0;
error = xfs_inobt_update(cur, &rec);
if (error) {
@@ -1599,7 +2042,9 @@ xfs_difree_finobt(
*/
XFS_WANT_CORRUPTED_GOTO(mp, ibtrec->ir_freecount == 1, error);
- error = xfs_inobt_insert_rec(cur, ibtrec->ir_freecount,
+ error = xfs_inobt_insert_rec(cur, ibtrec->ir_holemask,
+ ibtrec->ir_count,
+ ibtrec->ir_freecount,
ibtrec->ir_free, &i);
if (error)
goto error;
@@ -1634,8 +2079,13 @@ xfs_difree_finobt(
* free inode. Hence, if all of the inodes are free and we aren't
* keeping inode chunks permanently on disk, remove the record.
* Otherwise, update the record with the new information.
+ *
+ * Note that we currently can't free chunks when the block size is large
+ * enough for multiple chunks. Leave the finobt record in place so it stays in sync
+ * with the inobt.
*/
- if (rec.ir_freecount == mp->m_ialloc_inos &&
+ if (rec.ir_free == XFS_INOBT_ALL_FREE &&
+ mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK &&
!(mp->m_flags & XFS_MOUNT_IKEEP)) {
error = xfs_btree_delete(cur, &i);
if (error)
@@ -1671,8 +2121,7 @@ xfs_difree(
struct xfs_trans *tp, /* transaction pointer */
xfs_ino_t inode, /* inode to be freed */
struct xfs_bmap_free *flist, /* extents to free */
- int *deleted,/* set if inode cluster was deleted */
- xfs_ino_t *first_ino)/* first inode in deleted cluster */
+ struct xfs_icluster *xic) /* cluster info if deleted */
{
/* REFERENCED */
xfs_agblock_t agbno; /* block number containing inode */
@@ -1723,8 +2172,7 @@ xfs_difree(
/*
* Fix up the inode allocation btree.
*/
- error = xfs_difree_inobt(mp, tp, agbp, agino, flist, deleted, first_ino,
- &rec);
+ error = xfs_difree_inobt(mp, tp, agbp, agino, flist, xic, &rec);
if (error)
goto error0;
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
index 100007d56449..6e450df2979b 100644
--- a/fs/xfs/libxfs/xfs_ialloc.h
+++ b/fs/xfs/libxfs/xfs_ialloc.h
@@ -28,6 +28,13 @@ struct xfs_btree_cur;
/* Move inodes in clusters of this size */
#define XFS_INODE_BIG_CLUSTER_SIZE 8192
+struct xfs_icluster {
+ bool deleted; /* record is deleted */
+ xfs_ino_t first_ino; /* first inode number */
+ uint64_t alloc; /* inode phys. allocation bitmap for
+ * sparse chunks */
+};
+
/* Calculate and return the number of filesystem blocks per inode cluster */
static inline int
xfs_icluster_size_fsb(
@@ -44,8 +51,7 @@ xfs_icluster_size_fsb(
static inline struct xfs_dinode *
xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o)
{
- return (struct xfs_dinode *)
- (xfs_buf_offset(b, o << (mp)->m_sb.sb_inodelog));
+ return xfs_buf_offset(b, o << (mp)->m_sb.sb_inodelog);
}
/*
@@ -90,8 +96,7 @@ xfs_difree(
struct xfs_trans *tp, /* transaction pointer */
xfs_ino_t inode, /* inode to be freed */
struct xfs_bmap_free *flist, /* extents to free */
- int *deleted, /* set if inode cluster was deleted */
- xfs_ino_t *first_ino); /* first inode in deleted cluster */
+ struct xfs_icluster *ifree); /* cluster info if deleted */
/*
* Return the location of the inode in imap, for mapping it into a buffer.
@@ -156,7 +161,7 @@ int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
* Inode chunk initialisation routine
*/
int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp,
- struct list_head *buffer_list,
+ struct list_head *buffer_list, int icount,
xfs_agnumber_t agno, xfs_agblock_t agbno,
xfs_agblock_t length, unsigned int gen);
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 964c465ca69c..674ad8f760be 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -167,7 +167,16 @@ xfs_inobt_init_rec_from_cur(
union xfs_btree_rec *rec)
{
rec->inobt.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino);
- rec->inobt.ir_freecount = cpu_to_be32(cur->bc_rec.i.ir_freecount);
+ if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) {
+ rec->inobt.ir_u.sp.ir_holemask =
+ cpu_to_be16(cur->bc_rec.i.ir_holemask);
+ rec->inobt.ir_u.sp.ir_count = cur->bc_rec.i.ir_count;
+ rec->inobt.ir_u.sp.ir_freecount = cur->bc_rec.i.ir_freecount;
+ } else {
+ /* ir_holemask/ir_count not supported on-disk */
+ rec->inobt.ir_u.f.ir_freecount =
+ cpu_to_be32(cur->bc_rec.i.ir_freecount);
+ }
rec->inobt.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free);
}
@@ -418,3 +427,85 @@ xfs_inobt_maxrecs(
return blocklen / sizeof(xfs_inobt_rec_t);
return blocklen / (sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t));
}
+
+/*
+ * Convert the inode record holemask to an inode allocation bitmap. The inode
+ * allocation bitmap is inode granularity and specifies whether an inode is
+ * physically allocated on disk (not whether the inode is considered allocated
+ * or free by the fs).
+ *
+ * A bit value of 1 means the inode is allocated, a value of 0 means it is free.
+ */
+uint64_t
+xfs_inobt_irec_to_allocmask(
+ struct xfs_inobt_rec_incore *rec)
+{
+ uint64_t bitmap = 0;
+ uint64_t inodespbit;
+ int nextbit;
+ uint allocbitmap;
+
+ /*
+ * The holemask has 16 bits for a 64-inode record. Therefore each
+ * holemask bit represents multiple inodes. Create a mask of bits to set
+ * in the allocmask for each holemask bit.
+ */
+ inodespbit = (1 << XFS_INODES_PER_HOLEMASK_BIT) - 1;
+
+ /*
+ * Allocated inodes are represented by 0 bits in holemask. Invert the 0
+ * bits to 1 and convert to a uint so we can use xfs_next_bit(). Mask
+ * anything beyond the 16 holemask bits since this casts to a larger
+ * type.
+ */
+ allocbitmap = ~rec->ir_holemask & ((1 << XFS_INOBT_HOLEMASK_BITS) - 1);
+
+ /*
+ * allocbitmap is the inverted holemask so every set bit represents
+ * allocated inodes. To expand from 16-bit holemask granularity to
+ * 64-bit (e.g., bit-per-inode), set inodespbit bits in the target
+ * bitmap for every holemask bit.
+ */
+ nextbit = xfs_next_bit(&allocbitmap, 1, 0);
+ while (nextbit != -1) {
+ ASSERT(nextbit < (sizeof(rec->ir_holemask) * NBBY));
+
+ bitmap |= (inodespbit <<
+ (nextbit * XFS_INODES_PER_HOLEMASK_BIT));
+
+ nextbit = xfs_next_bit(&allocbitmap, 1, nextbit + 1);
+ }
+
+ return bitmap;
+}
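
(Editorial sketch, not part of the patch: xfs_inobt_irec_to_allocmask() expands the 16-bit holemask, inverted because 0 means allocated, into a 64-bit per-inode bitmap by writing a 4-bit run for every allocated holemask bit. The same expansion in stand-alone form, with the kernel's xfs_next_bit() replaced by a plain loop:)

#include <stdint.h>
#include <stdio.h>

#define HOLEMASK_BITS           16
#define INODES_PER_HOLEMASK_BIT 4   /* 64 inodes / 16 bits */

/* Expand a holemask into a bit-per-inode allocation bitmap (illustration). */
static uint64_t irec_to_allocmask(uint16_t holemask)
{
    /* 0 bits in the holemask are allocated regions; invert to 1s */
    uint32_t allocbitmap = (uint16_t)~holemask;
    uint64_t run = (1ULL << INODES_PER_HOLEMASK_BIT) - 1;  /* 0xf */
    uint64_t bitmap = 0;

    for (int bit = 0; bit < HOLEMASK_BITS; bit++)
        if (allocbitmap & (1U << bit))
            bitmap |= run << (bit * INODES_PER_HOLEMASK_BIT);

    return bitmap;
}

int main(void)
{
    /* only the low half of the chunk is physically allocated */
    printf("0x%016llx\n",
           (unsigned long long)irec_to_allocmask(0xff00));
    /* prints 0x00000000ffffffff */
    return 0;
}
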
+
+#if defined(DEBUG) || defined(XFS_WARN)
+/*
+ * Verify that an in-core inode record has a valid inode count.
+ */
+int
+xfs_inobt_rec_check_count(
+ struct xfs_mount *mp,
+ struct xfs_inobt_rec_incore *rec)
+{
+ int inocount = 0;
+ int nextbit = 0;
+ uint64_t allocbmap;
+ int wordsz;
+
+ wordsz = sizeof(allocbmap) / sizeof(unsigned int);
+ allocbmap = xfs_inobt_irec_to_allocmask(rec);
+
+ nextbit = xfs_next_bit((uint *) &allocbmap, wordsz, nextbit);
+ while (nextbit != -1) {
+ inocount++;
+ nextbit = xfs_next_bit((uint *) &allocbmap, wordsz,
+ nextbit + 1);
+ }
+
+ if (inocount != rec->ir_count)
+ return -EFSCORRUPTED;
+
+ return 0;
+}
+#endif /* DEBUG */
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h
index d7ebea72c2d0..bd88453217ce 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.h
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.h
@@ -62,4 +62,14 @@ extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *,
xfs_btnum_t);
extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
+/* ir_holemask to inode allocation bitmap conversion */
+uint64_t xfs_inobt_irec_to_allocmask(struct xfs_inobt_rec_incore *);
+
+#if defined(DEBUG) || defined(XFS_WARN)
+int xfs_inobt_rec_check_count(struct xfs_mount *,
+ struct xfs_inobt_rec_incore *);
+#else
+#define xfs_inobt_rec_check_count(mp, rec) 0
+#endif /* DEBUG */
+
#endif /* __XFS_IALLOC_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 002b6b3a1988..6526e7696184 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -46,8 +46,7 @@ xfs_inobp_check(
j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
for (i = 0; i < j; i++) {
- dip = (xfs_dinode_t *)xfs_buf_offset(bp,
- i * mp->m_sb.sb_inodesize);
+ dip = xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize);
if (!dip->di_next_unlinked) {
xfs_alert(mp,
"Detected bogus zero next_unlinked field in inode %d buffer 0x%llx.",
@@ -86,8 +85,7 @@ xfs_inode_buf_verify(
int di_ok;
xfs_dinode_t *dip;
- dip = (struct xfs_dinode *)xfs_buf_offset(bp,
- (i << mp->m_sb.sb_inodelog));
+ dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog));
di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
XFS_DINODE_GOOD_VERSION(dip->di_version);
if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
@@ -186,7 +184,7 @@ xfs_imap_to_bp(
}
*bpp = bp;
- *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset);
+ *dipp = xfs_buf_offset(bp, imap->im_boffset);
return 0;
}
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index dc4bfc5d88fc..df9851c46b5c 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -174,6 +174,27 @@ xfs_mount_validate_sb(
return -EFSCORRUPTED;
}
+ /*
+ * Full inode chunks must be aligned to inode chunk size when
+ * sparse inodes are enabled to support the sparse chunk
+ * allocation algorithm and prevent overlapping inode records.
+ */
+ if (xfs_sb_version_hassparseinodes(sbp)) {
+ uint32_t align;
+
+ xfs_alert(mp,
+ "EXPERIMENTAL sparse inode feature enabled. Use at your own risk!");
+
+ align = XFS_INODES_PER_CHUNK * sbp->sb_inodesize
+ >> sbp->sb_blocklog;
+ if (sbp->sb_inoalignmt != align) {
+ xfs_warn(mp,
+"Inode block alignment (%u) must match chunk size (%u) for sparse inodes.",
+ sbp->sb_inoalignmt, align);
+ return -EINVAL;
+ }
+ }
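
(Editorial sketch, not part of the patch: the new mount-time check requires sb_inoalignmt to equal one full 64-inode chunk expressed in filesystem blocks. For a common 512-byte inode / 4096-byte block geometry that is 64 * 512 >> 12 = 8 blocks; the values below are examples, not read from any superblock.)

#include <stdint.h>
#include <stdio.h>

#define INODES_PER_CHUNK 64

int main(void)
{
    uint32_t inodesize = 512;   /* example sb_inodesize */
    uint32_t blocklog  = 12;    /* example sb_blocklog (4096-byte blocks) */

    uint32_t align = INODES_PER_CHUNK * inodesize >> blocklog;
    printf("required sb_inoalignmt = %u blocks\n", align);  /* 8 */
    return 0;
}
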
+
if (unlikely(
sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
xfs_warn(mp,
@@ -374,7 +395,7 @@ __xfs_sb_from_disk(
be32_to_cpu(from->sb_features_log_incompat);
/* crc is only used on disk, not in memory; just init to 0 here. */
to->sb_crc = 0;
- to->sb_pad = 0;
+ to->sb_spino_align = be32_to_cpu(from->sb_spino_align);
to->sb_pquotino = be64_to_cpu(from->sb_pquotino);
to->sb_lsn = be64_to_cpu(from->sb_lsn);
/* Convert on-disk flags to in-memory flags? */
@@ -516,7 +537,7 @@ xfs_sb_to_disk(
cpu_to_be32(from->sb_features_incompat);
to->sb_features_log_incompat =
cpu_to_be32(from->sb_features_log_incompat);
- to->sb_pad = 0;
+ to->sb_spino_align = cpu_to_be32(from->sb_spino_align);
to->sb_lsn = cpu_to_be64(from->sb_lsn);
}
}
@@ -689,6 +710,11 @@ xfs_sb_mount_common(
mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK,
sbp->sb_inopblock);
mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog;
+
+ if (sbp->sb_spino_align)
+ mp->m_ialloc_min_blks = sbp->sb_spino_align;
+ else
+ mp->m_ialloc_min_blks = mp->m_ialloc_blks;
}
/*
@@ -792,12 +818,12 @@ xfs_sync_sb(
tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_CHANGE, KM_SLEEP);
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return error;
}
xfs_log_sb(tp);
if (wait)
xfs_trans_set_sync(tp);
- return xfs_trans_commit(tp, 0);
+ return xfs_trans_commit(tp);
}
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 8dda4b321343..5be529707903 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -182,12 +182,6 @@ int xfs_log_calc_minimum_size(struct xfs_mount *);
#define XFS_TRANS_FREEZE_PROT 0x40 /* Transaction has elevated writer
count in superblock */
/*
- * Values for call flags parameter.
- */
-#define XFS_TRANS_RELEASE_LOG_RES 0x4
-#define XFS_TRANS_ABORT 0x8
-
-/*
* Field values for xfs_trans_mod_sb.
*/
#define XFS_TRANS_SB_ICOUNT 0x00000001
diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h
index 2d5bdfce6d8f..797815012c0e 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.h
+++ b/fs/xfs/libxfs/xfs_trans_resv.h
@@ -73,9 +73,9 @@ struct xfs_trans_resv {
* 2 trees * (2 blocks/level * max depth - 1) * block size
*/
#define XFS_ALLOCFREE_LOG_RES(mp,nx) \
- ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * XFS_AG_MAXLEVELS(mp) - 1)))
+ ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * (mp)->m_ag_maxlevels - 1)))
#define XFS_ALLOCFREE_LOG_COUNT(mp,nx) \
- ((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1)))
+ ((nx) * (2 * (2 * (mp)->m_ag_maxlevels - 1)))
/*
* Per-directory log reservation for any directory change.
diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h
index bf9c4579334d..41e0428d8175 100644
--- a/fs/xfs/libxfs/xfs_trans_space.h
+++ b/fs/xfs/libxfs/xfs_trans_space.h
@@ -67,7 +67,7 @@
#define XFS_DIOSTRAT_SPACE_RES(mp, v) \
(XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + (v))
#define XFS_GROWFS_SPACE_RES(mp) \
- (2 * XFS_AG_MAXLEVELS(mp))
+ (2 * (mp)->m_ag_maxlevels)
#define XFS_GROWFSRT_SPACE_RES(mp,b) \
((b) + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK))
#define XFS_LINK_SPACE_RES(mp,nl) \
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index e5099f268032..3859f5e27a4d 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -109,7 +109,7 @@ xfs_setfilesize_trans_alloc(
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return error;
}
@@ -145,7 +145,7 @@ xfs_setfilesize(
isize = xfs_new_eof(ip, offset + size);
if (!isize) {
xfs_iunlock(ip, XFS_ILOCK_EXCL);
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return 0;
}
@@ -155,7 +155,7 @@ xfs_setfilesize(
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- return xfs_trans_commit(tp, 0);
+ return xfs_trans_commit(tp);
}
STATIC int
@@ -1348,7 +1348,7 @@ __xfs_get_blocks(
sector_t iblock,
struct buffer_head *bh_result,
int create,
- int direct)
+ bool direct)
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
@@ -1413,6 +1413,7 @@ __xfs_get_blocks(
if (error)
return error;
new = 1;
+
} else {
/*
* Delalloc reservations do not require a transaction,
@@ -1507,49 +1508,29 @@ xfs_get_blocks(
struct buffer_head *bh_result,
int create)
{
- return __xfs_get_blocks(inode, iblock, bh_result, create, 0);
+ return __xfs_get_blocks(inode, iblock, bh_result, create, false);
}
-STATIC int
+int
xfs_get_blocks_direct(
struct inode *inode,
sector_t iblock,
struct buffer_head *bh_result,
int create)
{
- return __xfs_get_blocks(inode, iblock, bh_result, create, 1);
+ return __xfs_get_blocks(inode, iblock, bh_result, create, true);
}
-/*
- * Complete a direct I/O write request.
- *
- * The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
- * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
- * wholly within the EOF and so there is nothing for us to do. Note that in this
- * case the completion can be called in interrupt context, whereas if we have an
- * ioend we will always be called in task context (i.e. from a workqueue).
- */
-STATIC void
-xfs_end_io_direct_write(
- struct kiocb *iocb,
+static void
+__xfs_end_io_direct_write(
+ struct inode *inode,
+ struct xfs_ioend *ioend,
loff_t offset,
- ssize_t size,
- void *private)
+ ssize_t size)
{
- struct inode *inode = file_inode(iocb->ki_filp);
- struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_ioend *ioend = private;
-
- trace_xfs_gbmap_direct_endio(ip, offset, size,
- ioend ? ioend->io_type : 0, NULL);
+ struct xfs_mount *mp = XFS_I(inode)->i_mount;
- if (!ioend) {
- ASSERT(offset + size <= i_size_read(inode));
- return;
- }
-
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (XFS_FORCED_SHUTDOWN(mp) || ioend->io_error)
goto out_end_io;
/*
@@ -1586,10 +1567,10 @@ xfs_end_io_direct_write(
* here can result in EOF moving backwards and Bad Things Happen when
* that occurs.
*/
- spin_lock(&ip->i_flags_lock);
+ spin_lock(&XFS_I(inode)->i_flags_lock);
if (offset + size > i_size_read(inode))
i_size_write(inode, offset + size);
- spin_unlock(&ip->i_flags_lock);
+ spin_unlock(&XFS_I(inode)->i_flags_lock);
/*
* If we are doing an append IO that needs to update the EOF on disk,
@@ -1606,6 +1587,98 @@ out_end_io:
return;
}
+/*
+ * Complete a direct I/O write request.
+ *
+ * The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
+ * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
+ * wholly within the EOF and so there is nothing for us to do. Note that in this
+ * case the completion can be called in interrupt context, whereas if we have an
+ * ioend we will always be called in task context (i.e. from a workqueue).
+ */
+STATIC void
+xfs_end_io_direct_write(
+ struct kiocb *iocb,
+ loff_t offset,
+ ssize_t size,
+ void *private)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct xfs_ioend *ioend = private;
+
+ trace_xfs_gbmap_direct_endio(XFS_I(inode), offset, size,
+ ioend ? ioend->io_type : 0, NULL);
+
+ if (!ioend) {
+ ASSERT(offset + size <= i_size_read(inode));
+ return;
+ }
+
+ __xfs_end_io_direct_write(inode, ioend, offset, size);
+}
+
+/*
+ * For DAX we need a mapping buffer callback for unwritten extent conversion
+ * when page faults allocate blocks and then zero them. Note that in this
+ * case the mapping indicated by the ioend may extend beyond EOF. We most
+ * definitely do not want to extend EOF here, so we trim back the ioend size to
+ * EOF.
+ */
+#ifdef CONFIG_FS_DAX
+void
+xfs_end_io_dax_write(
+ struct buffer_head *bh,
+ int uptodate)
+{
+ struct xfs_ioend *ioend = bh->b_private;
+ struct inode *inode = ioend->io_inode;
+ ssize_t size = ioend->io_size;
+
+ ASSERT(IS_DAX(ioend->io_inode));
+
+ /* if there was an error zeroing, then don't convert it */
+ if (!uptodate)
+ ioend->io_error = -EIO;
+
+ /*
+ * Trim update to EOF, so we don't extend EOF during unwritten extent
+ * conversion of partial EOF blocks.
+ */
+ spin_lock(&XFS_I(inode)->i_flags_lock);
+ if (ioend->io_offset + size > i_size_read(inode))
+ size = i_size_read(inode) - ioend->io_offset;
+ spin_unlock(&XFS_I(inode)->i_flags_lock);
+
+ __xfs_end_io_direct_write(inode, ioend, ioend->io_offset, size);
+
+}
+#else
+void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate) { }
+#endif
+
+static inline ssize_t
+xfs_vm_do_dio(
+ struct inode *inode,
+ struct kiocb *iocb,
+ struct iov_iter *iter,
+ loff_t offset,
+ void (*endio)(struct kiocb *iocb,
+ loff_t offset,
+ ssize_t size,
+ void *private),
+ int flags)
+{
+ struct block_device *bdev;
+
+ if (IS_DAX(inode))
+ return dax_do_io(iocb, inode, iter, offset,
+ xfs_get_blocks_direct, endio, 0);
+
+ bdev = xfs_find_bdev_for_inode(inode);
+ return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
+ xfs_get_blocks_direct, endio, NULL, flags);
+}
+
STATIC ssize_t
xfs_vm_direct_IO(
struct kiocb *iocb,
@@ -1613,16 +1686,11 @@ xfs_vm_direct_IO(
loff_t offset)
{
struct inode *inode = iocb->ki_filp->f_mapping->host;
- struct block_device *bdev = xfs_find_bdev_for_inode(inode);
- if (iov_iter_rw(iter) == WRITE) {
- return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
- xfs_get_blocks_direct,
- xfs_end_io_direct_write, NULL,
- DIO_ASYNC_EXTEND);
- }
- return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
- xfs_get_blocks_direct, NULL, NULL, 0);
+ if (iov_iter_rw(iter) == WRITE)
+ return xfs_vm_do_dio(inode, iocb, iter, offset,
+ xfs_end_io_direct_write, DIO_ASYNC_EXTEND);
+ return xfs_vm_do_dio(inode, iocb, iter, offset, NULL, 0);
}
/*
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index ac644e0137a4..86afd1ac7895 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -53,7 +53,12 @@ typedef struct xfs_ioend {
} xfs_ioend_t;
extern const struct address_space_operations xfs_address_space_operations;
-extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int);
+
+int xfs_get_blocks(struct inode *inode, sector_t offset,
+ struct buffer_head *map_bh, int create);
+int xfs_get_blocks_direct(struct inode *inode, sector_t offset,
+ struct buffer_head *map_bh, int create);
+void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate);
extern void xfs_count_page_state(struct page *, int *, int *);
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index 3fbf167cfb4c..2bb959ada45b 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -394,7 +394,6 @@ xfs_attr_inactive(
{
struct xfs_trans *trans;
struct xfs_mount *mp;
- int cancel_flags = 0;
int lock_mode = XFS_ILOCK_SHARED;
int error = 0;
@@ -423,7 +422,6 @@ xfs_attr_inactive(
goto out_cancel;
lock_mode = XFS_ILOCK_EXCL;
- cancel_flags = XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT;
xfs_ilock(dp, lock_mode);
if (!XFS_IFORK_Q(dp))
@@ -435,8 +433,14 @@ xfs_attr_inactive(
*/
xfs_trans_ijoin(trans, dp, 0);
- /* invalidate and truncate the attribute fork extents */
- if (dp->i_d.di_aformat != XFS_DINODE_FMT_LOCAL) {
+ /*
+ * Invalidate and truncate the attribute fork extents. Make sure the
+ * fork actually has attributes as otherwise the invalidation has no
+ * blocks to read and returns an error. In this case, just do the fork
+ * removal below.
+ */
+ if (xfs_inode_hasattr(dp) &&
+ dp->i_d.di_aformat != XFS_DINODE_FMT_LOCAL) {
error = xfs_attr3_root_inactive(&trans, dp);
if (error)
goto out_cancel;
@@ -449,12 +453,12 @@ xfs_attr_inactive(
/* Reset the attribute fork - this also destroys the in-core fork */
xfs_attr_fork_remove(dp, trans);
- error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(trans);
xfs_iunlock(dp, lock_mode);
return error;
out_cancel:
- xfs_trans_cancel(trans, cancel_flags);
+ xfs_trans_cancel(trans);
out_destroy_fork:
/* kill the in-core attr fork before we drop the inode lock */
if (dp->i_afp)
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index a52bbd3abc7d..0f34886cf726 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -75,28 +75,20 @@ xfs_bmap_finish(
xfs_efi_log_item_t *efi; /* extent free intention */
int error; /* error return value */
xfs_bmap_free_item_t *free; /* free extent item */
- struct xfs_trans_res tres; /* new log reservation */
xfs_mount_t *mp; /* filesystem mount structure */
xfs_bmap_free_item_t *next; /* next item on free list */
- xfs_trans_t *ntp; /* new transaction pointer */
ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
if (flist->xbf_count == 0) {
*committed = 0;
return 0;
}
- ntp = *tp;
- efi = xfs_trans_get_efi(ntp, flist->xbf_count);
+ efi = xfs_trans_get_efi(*tp, flist->xbf_count);
for (free = flist->xbf_first; free; free = free->xbfi_next)
- xfs_trans_log_efi_extent(ntp, efi, free->xbfi_startblock,
+ xfs_trans_log_efi_extent(*tp, efi, free->xbfi_startblock,
free->xbfi_blockcount);
- tres.tr_logres = ntp->t_log_res;
- tres.tr_logcount = ntp->t_log_count;
- tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
- ntp = xfs_trans_dup(*tp);
- error = xfs_trans_commit(*tp, 0);
- *tp = ntp;
+ error = xfs_trans_roll(tp, NULL);
*committed = 1;
/*
* We have a new transaction, so we should return committed=1,
@@ -105,19 +97,10 @@ xfs_bmap_finish(
if (error)
return error;
- /*
- * transaction commit worked ok so we can drop the extra ticket
- * reference that we gained in xfs_trans_dup()
- */
- xfs_log_ticket_put(ntp->t_ticket);
-
- error = xfs_trans_reserve(ntp, &tres, 0, 0);
- if (error)
- return error;
- efd = xfs_trans_get_efd(ntp, efi, flist->xbf_count);
+ efd = xfs_trans_get_efd(*tp, efi, flist->xbf_count);
for (free = flist->xbf_first; free != NULL; free = next) {
next = free->xbfi_next;
- if ((error = xfs_free_extent(ntp, free->xbfi_startblock,
+ if ((error = xfs_free_extent(*tp, free->xbfi_startblock,
free->xbfi_blockcount))) {
/*
* The bmap free list will be cleaned up at a
@@ -127,7 +110,7 @@ xfs_bmap_finish(
* happens, since this transaction may not be
* dirty yet.
*/
- mp = ntp->t_mountp;
+ mp = (*tp)->t_mountp;
if (!XFS_FORCED_SHUTDOWN(mp))
xfs_force_shutdown(mp,
(error == -EFSCORRUPTED) ?
@@ -135,7 +118,7 @@ xfs_bmap_finish(
SHUTDOWN_META_IO_ERROR);
return error;
}
- xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock,
+ xfs_trans_log_efd_extent(*tp, efd, free->xbfi_startblock,
free->xbfi_blockcount);
xfs_bmap_del_free(flist, NULL, free);
}
@@ -878,7 +861,7 @@ xfs_free_eofblocks(
if (need_iolock) {
if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return -EAGAIN;
}
}
@@ -886,7 +869,7 @@ xfs_free_eofblocks(
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
if (error) {
ASSERT(XFS_FORCED_SHUTDOWN(mp));
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
if (need_iolock)
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
return error;
@@ -908,12 +891,9 @@ xfs_free_eofblocks(
* If we get an error at this point we simply don't
* bother truncating the file.
*/
- xfs_trans_cancel(tp,
- (XFS_TRANS_RELEASE_LOG_RES |
- XFS_TRANS_ABORT));
+ xfs_trans_cancel(tp);
} else {
- error = xfs_trans_commit(tp,
- XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
if (!error)
xfs_inode_clear_eofblocks_tag(ip);
}
@@ -1026,7 +1006,7 @@ xfs_alloc_file_space(
* Free the transaction structure.
*/
ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
break;
}
xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -1053,7 +1033,7 @@ xfs_alloc_file_space(
goto error0;
}
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
if (error) {
break;
@@ -1077,7 +1057,7 @@ error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);
error1: /* Just cancel transaction */
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+ xfs_trans_cancel(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
@@ -1133,14 +1113,29 @@ xfs_zero_remaining_bytes(
break;
ASSERT(imap.br_blockcount >= 1);
ASSERT(imap.br_startoff == offset_fsb);
+ ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+
+ if (imap.br_startblock == HOLESTARTBLOCK ||
+ imap.br_state == XFS_EXT_UNWRITTEN) {
+ /* skip the entire extent */
+ lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff +
+ imap.br_blockcount) - 1;
+ continue;
+ }
+
lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
if (lastoffset > endoff)
lastoffset = endoff;
- if (imap.br_startblock == HOLESTARTBLOCK)
- continue;
- ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
- if (imap.br_state == XFS_EXT_UNWRITTEN)
+
+ /* DAX can just zero the backing device directly */
+ if (IS_DAX(VFS_I(ip))) {
+ error = dax_zero_page_range(VFS_I(ip), offset,
+ lastoffset - offset + 1,
+ xfs_get_blocks_direct);
+ if (error)
+ return error;
continue;
+ }
error = xfs_buf_read_uncached(XFS_IS_REALTIME_INODE(ip) ?
mp->m_rtdev_targp : mp->m_ddev_targp,
@@ -1289,7 +1284,7 @@ xfs_free_file_space(
* Free the transaction structure.
*/
ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
break;
}
xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -1320,7 +1315,7 @@ xfs_free_file_space(
goto error0;
}
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
}
@@ -1330,7 +1325,7 @@ xfs_free_file_space(
error0:
xfs_bmap_cancel(&free_list);
error1:
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+ xfs_trans_cancel(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
goto out;
}
@@ -1462,7 +1457,7 @@ xfs_shift_file_space(
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
break;
}
@@ -1492,13 +1487,13 @@ xfs_shift_file_space(
if (error)
goto out;
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
}
return error;
out:
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+ xfs_trans_cancel(tp);
return error;
}
@@ -1718,7 +1713,7 @@ xfs_swap_extents(
tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
goto out_unlock;
}
@@ -1901,7 +1896,7 @@ xfs_swap_extents(
if (mp->m_flags & XFS_MOUNT_WSYNC)
xfs_trans_set_sync(tp);
- error = xfs_trans_commit(tp, 0);
+ error = xfs_trans_commit(tp);
trace_xfs_swap_extent_after(ip, 0);
trace_xfs_swap_extent_after(tip, 1);
@@ -1915,6 +1910,6 @@ out_unlock:
goto out;
out_trans_cancel:
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
goto out;
}
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 1790b00bea7a..a4b7d92e946c 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1419,9 +1419,9 @@ xfs_buf_submit_wait(
return error;
}
-xfs_caddr_t
+void *
xfs_buf_offset(
- xfs_buf_t *bp,
+ struct xfs_buf *bp,
size_t offset)
{
struct page *page;
@@ -1431,7 +1431,7 @@ xfs_buf_offset(
offset += bp->b_offset;
page = bp->b_pages[offset >> PAGE_SHIFT];
- return (xfs_caddr_t)page_address(page) + (offset & (PAGE_SIZE-1));
+ return page_address(page) + (offset & (PAGE_SIZE-1));
}
/*
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 75ff5d5a7d2e..331c1ccf8264 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -299,7 +299,7 @@ extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
/* Buffer Utility Routines */
-extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t);
+extern void *xfs_buf_offset(struct xfs_buf *, size_t);
/* Delayed Write Buffer Routines */
extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *);
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 02c01bbbc789..4143dc75dca4 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -568,8 +568,6 @@ xfs_qm_dqread(
struct xfs_buf *bp;
struct xfs_trans *tp = NULL;
int error;
- int cancelflags = 0;
-
dqp = kmem_zone_zalloc(xfs_qm_dqzone, KM_SLEEP);
@@ -617,7 +615,6 @@ xfs_qm_dqread(
XFS_QM_DQALLOC_SPACE_RES(mp), 0);
if (error)
goto error1;
- cancelflags = XFS_TRANS_RELEASE_LOG_RES;
}
/*
@@ -632,7 +629,6 @@ xfs_qm_dqread(
* allocate (ENOENT).
*/
trace_xfs_dqread_fail(dqp);
- cancelflags |= XFS_TRANS_ABORT;
goto error1;
}
@@ -670,7 +666,7 @@ xfs_qm_dqread(
xfs_trans_brelse(tp, bp);
if (tp) {
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
if (error)
goto error0;
}
@@ -680,7 +676,7 @@ xfs_qm_dqread(
error1:
if (tp)
- xfs_trans_cancel(tp, cancelflags);
+ xfs_trans_cancel(tp);
error0:
xfs_qm_dqdestroy(dqp);
*O_dqpp = NULL;
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 338e50bbfd1e..74d0e5966ebc 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -127,7 +127,7 @@ xfs_error_report(
struct xfs_mount *mp,
const char *filename,
int linenum,
- inst_t *ra)
+ void *ra)
{
if (level <= xfs_error_level) {
xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT,
@@ -146,7 +146,7 @@ xfs_corruption_error(
void *p,
const char *filename,
int linenum,
- inst_t *ra)
+ void *ra)
{
if (level <= xfs_error_level)
xfs_hex_dump(p, 64);
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index c0394ed126fc..4ed3042a0f16 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -21,10 +21,10 @@
struct xfs_mount;
extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp,
- const char *filename, int linenum, inst_t *ra);
+ const char *filename, int linenum, void *ra);
extern void xfs_corruption_error(const char *tag, int level,
struct xfs_mount *mp, void *p, const char *filename,
- int linenum, inst_t *ra);
+ int linenum, void *ra);
extern void xfs_verifier_error(struct xfs_buf *bp);
#define XFS_ERROR_REPORT(e, lvl, mp) \
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index cb7fe64cdbfa..adc8f8fdd145 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -239,7 +239,7 @@ xfs_efi_init(
xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
efip->efi_format.efi_nextents = nextents;
- efip->efi_format.efi_id = (__psint_t)(void*)efip;
+ efip->efi_format.efi_id = (uintptr_t)(void *)efip;
atomic_set(&efip->efi_next_extent, 0);
atomic_set(&efip->efi_refcount, 2);
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 7c62fca53e2f..874507de3485 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -80,14 +80,15 @@ xfs_rw_ilock_demote(
}
/*
- * xfs_iozero
+ * xfs_iozero clears the specified range supplied via the page cache (except in
+ * the DAX case). Writes through the page cache will allocate blocks over holes,
+ * though the callers usually map the holes first and avoid them. If a block is
+ * not completely zeroed, then it will be read from disk before being partially
+ * zeroed.
*
- * xfs_iozero clears the specified range of buffer supplied,
- * and marks all the affected blocks as valid and modified. If
- * an affected block is not allocated, it will be allocated. If
- * an affected block is not completely overwritten, and is not
- * valid before the operation, it will be read from disk before
- * being partially zeroed.
+ * In the DAX case, we can just directly write to the underlying pages. This
+ * will not allocate blocks, but will avoid holes and unwritten extents and so
+ * not do unnecessary work.
*/
int
xfs_iozero(
@@ -97,7 +98,8 @@ xfs_iozero(
{
struct page *page;
struct address_space *mapping;
- int status;
+ int status = 0;
+
mapping = VFS_I(ip)->i_mapping;
do {
@@ -109,20 +111,27 @@ xfs_iozero(
if (bytes > count)
bytes = count;
- status = pagecache_write_begin(NULL, mapping, pos, bytes,
- AOP_FLAG_UNINTERRUPTIBLE,
- &page, &fsdata);
- if (status)
- break;
+ if (IS_DAX(VFS_I(ip))) {
+ status = dax_zero_page_range(VFS_I(ip), pos, bytes,
+ xfs_get_blocks_direct);
+ if (status)
+ break;
+ } else {
+ status = pagecache_write_begin(NULL, mapping, pos, bytes,
+ AOP_FLAG_UNINTERRUPTIBLE,
+ &page, &fsdata);
+ if (status)
+ break;
- zero_user(page, offset, bytes);
+ zero_user(page, offset, bytes);
- status = pagecache_write_end(NULL, mapping, pos, bytes, bytes,
- page, fsdata);
- WARN_ON(status <= 0); /* can't return less than zero! */
+ status = pagecache_write_end(NULL, mapping, pos, bytes,
+ bytes, page, fsdata);
+ WARN_ON(status <= 0); /* can't return less than zero! */
+ status = 0;
+ }
pos += bytes;
count -= bytes;
- status = 0;
} while (count);
return status;
@@ -139,7 +148,7 @@ xfs_update_prealloc_flags(
tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID);
error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return error;
}
@@ -161,7 +170,7 @@ xfs_update_prealloc_flags(
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
if (flags & XFS_PREALLOC_SYNC)
xfs_trans_set_sync(tp);
- return xfs_trans_commit(tp, 0);
+ return xfs_trans_commit(tp);
}
/*
@@ -285,7 +294,7 @@ xfs_file_read_iter(
if (file->f_mode & FMODE_NOCMTIME)
ioflags |= XFS_IO_INVIS;
- if (unlikely(ioflags & XFS_IO_ISDIRECT)) {
+ if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) {
xfs_buftarg_t *target =
XFS_IS_REALTIME_INODE(ip) ?
mp->m_rtdev_targp : mp->m_ddev_targp;
@@ -379,7 +388,11 @@ xfs_file_splice_read(
trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
- ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
+ /* for dax, we need to avoid the page cache */
+ if (IS_DAX(VFS_I(ip)))
+ ret = default_file_splice_read(infilp, ppos, pipe, count, flags);
+ else
+ ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
if (ret > 0)
XFS_STATS_ADD(xs_read_bytes, ret);
@@ -673,7 +686,7 @@ xfs_file_dio_aio_write(
mp->m_rtdev_targp : mp->m_ddev_targp;
/* DIO must be aligned to device logical sector size */
- if ((pos | count) & target->bt_logical_sectormask)
+ if (!IS_DAX(inode) && ((pos | count) & target->bt_logical_sectormask))
return -EINVAL;
/* "unaligned" here means not aligned to a filesystem block */
@@ -759,8 +772,11 @@ xfs_file_dio_aio_write(
out:
xfs_rw_iunlock(ip, iolock);
- /* No fallback to buffered IO on errors for XFS. */
- ASSERT(ret < 0 || ret == count);
+ /*
+ * No fallback to buffered IO on errors for XFS. DAX can result in
+ * partial writes, but direct IO will either complete fully or fail.
+ */
+ ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip)));
return ret;
}
@@ -843,7 +859,7 @@ xfs_file_write_iter(
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
return -EIO;
- if (unlikely(iocb->ki_flags & IOCB_DIRECT))
+ if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode))
ret = xfs_file_dio_aio_write(iocb, from);
else
ret = xfs_file_buffered_aio_write(iocb, from);
@@ -1064,17 +1080,6 @@ xfs_file_readdir(
return xfs_readdir(ip, ctx, bufsize);
}
-STATIC int
-xfs_file_mmap(
- struct file *filp,
- struct vm_area_struct *vma)
-{
- vma->vm_ops = &xfs_file_vm_ops;
-
- file_accessed(filp);
- return 0;
-}
-
/*
* This type is designed to indicate the type of offset we would like
* to search from page cache for xfs_seek_hole_data().
@@ -1455,48 +1460,83 @@ xfs_file_llseek(
* ordering of:
*
* mmap_sem (MM)
- * i_mmap_lock (XFS - truncate serialisation)
- * page_lock (MM)
- * i_lock (XFS - extent map serialisation)
+ * sb_start_pagefault(vfs, freeze)
+ * i_mmap_lock (XFS - truncate serialisation)
+ * page_lock (MM)
+ * i_lock (XFS - extent map serialisation)
+ */
+
+/*
+ * mmap()d file has taken write protection fault and is being made writable. We
+ * can set the page state up correctly for a writable page, which means we can
+ * do correct delalloc accounting (ENOSPC checking!) and unwritten extent
+ * mapping.
*/
STATIC int
-xfs_filemap_fault(
+xfs_filemap_page_mkwrite(
struct vm_area_struct *vma,
struct vm_fault *vmf)
{
- struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host);
- int error;
+ struct inode *inode = file_inode(vma->vm_file);
+ int ret;
- trace_xfs_filemap_fault(ip);
+ trace_xfs_filemap_page_mkwrite(XFS_I(inode));
- xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
- error = filemap_fault(vma, vmf);
- xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vma->vm_file);
+ xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- return error;
+ if (IS_DAX(inode)) {
+ ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_direct,
+ xfs_end_io_dax_write);
+ } else {
+ ret = __block_page_mkwrite(vma, vmf, xfs_get_blocks);
+ ret = block_page_mkwrite_return(ret);
+ }
+
+ xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ sb_end_pagefault(inode->i_sb);
+
+ return ret;
}
-/*
- * mmap()d file has taken write protection fault and is being made writable. We
- * can set the page state up correctly for a writable page, which means we can
- * do correct delalloc accounting (ENOSPC checking!) and unwritten extent
- * mapping.
- */
STATIC int
-xfs_filemap_page_mkwrite(
+xfs_filemap_fault(
struct vm_area_struct *vma,
struct vm_fault *vmf)
{
- struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host);
- int error;
+ struct xfs_inode *ip = XFS_I(file_inode(vma->vm_file));
+ int ret;
+
+ trace_xfs_filemap_fault(ip);
- trace_xfs_filemap_page_mkwrite(ip);
+ /* DAX can shortcut the normal fault path on write faults! */
+ if ((vmf->flags & FAULT_FLAG_WRITE) && IS_DAX(VFS_I(ip)))
+ return xfs_filemap_page_mkwrite(vma, vmf);
xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
- error = block_page_mkwrite(vma, vmf, xfs_get_blocks);
+ ret = filemap_fault(vma, vmf);
xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
- return error;
+ return ret;
+}
+
+static const struct vm_operations_struct xfs_file_vm_ops = {
+ .fault = xfs_filemap_fault,
+ .map_pages = filemap_map_pages,
+ .page_mkwrite = xfs_filemap_page_mkwrite,
+};
+
+STATIC int
+xfs_file_mmap(
+ struct file *filp,
+ struct vm_area_struct *vma)
+{
+ file_accessed(filp);
+ vma->vm_ops = &xfs_file_vm_ops;
+ if (IS_DAX(file_inode(filp)))
+ vma->vm_flags |= VM_MIXEDMAP;
+ return 0;
}
const struct file_operations xfs_file_operations = {
@@ -1527,9 +1567,3 @@ const struct file_operations xfs_dir_file_operations = {
#endif
.fsync = xfs_dir_fsync,
};
-
-static const struct vm_operations_struct xfs_file_vm_ops = {
- .fault = xfs_filemap_fault,
- .map_pages = filemap_map_pages,
- .page_mkwrite = xfs_filemap_page_mkwrite,
-};
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index da82f1cb4b9b..c4c130f9bfb6 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -196,7 +196,8 @@ xfs_filestream_pick_ag(
goto next_ag;
}
- longest = xfs_alloc_longest_free_extent(mp, pag);
+ longest = xfs_alloc_longest_free_extent(mp, pag,
+ xfs_alloc_min_freelist(mp, pag));
if (((minlen && longest >= minlen) ||
(!minlen && pag->pagf_freeblks >= minfree)) &&
(!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) ||
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index cb7e8a29dfb6..9b3438a7680f 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -101,7 +101,9 @@ xfs_fs_geometry(
(xfs_sb_version_hasftype(&mp->m_sb) ?
XFS_FSOP_GEOM_FLAGS_FTYPE : 0) |
(xfs_sb_version_hasfinobt(&mp->m_sb) ?
- XFS_FSOP_GEOM_FLAGS_FINOBT : 0);
+ XFS_FSOP_GEOM_FLAGS_FINOBT : 0) |
+ (xfs_sb_version_hassparseinodes(&mp->m_sb) ?
+ XFS_FSOP_GEOM_FLAGS_SPINODES : 0);
geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ?
mp->m_sb.sb_logsectsize : BBSIZE;
geo->rtsectsize = mp->m_sb.sb_blocksize;
@@ -201,7 +203,7 @@ xfs_growfs_data_private(
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growdata,
XFS_GROWFS_SPACE_RES(mp), 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return error;
}
@@ -489,7 +491,7 @@ xfs_growfs_data_private(
if (dpct)
xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct);
xfs_trans_set_sync(tp);
- error = xfs_trans_commit(tp, 0);
+ error = xfs_trans_commit(tp);
if (error)
return error;
@@ -557,7 +559,7 @@ xfs_growfs_data_private(
return saved_error ? saved_error : error;
error0:
- xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+ xfs_trans_cancel(tp);
return error;
}
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 539a85fddbc2..3da9f4da4f3d 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -905,7 +905,6 @@ xfs_dir_ialloc(
{
xfs_trans_t *tp;
- xfs_trans_t *ntp;
xfs_inode_t *ip;
xfs_buf_t *ialloc_context = NULL;
int code;
@@ -954,8 +953,6 @@ xfs_dir_ialloc(
* to succeed the second time.
*/
if (ialloc_context) {
- struct xfs_trans_res tres;
-
/*
* Normally, xfs_trans_commit releases all the locks.
* We call bhold to hang on to the ialloc_context across
@@ -964,12 +961,6 @@ xfs_dir_ialloc(
* allocation group.
*/
xfs_trans_bhold(tp, ialloc_context);
- /*
- * Save the log reservation so we can use
- * them in the next transaction.
- */
- tres.tr_logres = xfs_trans_get_log_res(tp);
- tres.tr_logcount = xfs_trans_get_log_count(tp);
/*
* We want the quota changes to be associated with the next
@@ -985,35 +976,9 @@ xfs_dir_ialloc(
tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY);
}
- ntp = xfs_trans_dup(tp);
- code = xfs_trans_commit(tp, 0);
- tp = ntp;
- if (committed != NULL) {
+ code = xfs_trans_roll(&tp, 0);
+ if (committed != NULL)
*committed = 1;
- }
- /*
- * If we get an error during the commit processing,
- * release the buffer that is still held and return
- * to the caller.
- */
- if (code) {
- xfs_buf_relse(ialloc_context);
- if (dqinfo) {
- tp->t_dqinfo = dqinfo;
- xfs_trans_free_dqinfo(tp);
- }
- *tpp = ntp;
- *ipp = NULL;
- return code;
- }
-
- /*
- * transaction commit worked ok so we can drop the extra ticket
- * reference that we gained in xfs_trans_dup()
- */
- xfs_log_ticket_put(tp->t_ticket);
- tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
- code = xfs_trans_reserve(tp, &tres, 0, 0);
/*
* Re-attach the quota info that we detached from prev trx.
@@ -1025,7 +990,7 @@ xfs_dir_ialloc(
if (code) {
xfs_buf_relse(ialloc_context);
- *tpp = ntp;
+ *tpp = tp;
*ipp = NULL;
return code;
}
@@ -1127,7 +1092,6 @@ xfs_create(
xfs_bmap_free_t free_list;
xfs_fsblock_t first_block;
bool unlock_dp_on_error = false;
- uint cancel_flags;
int committed;
prid_t prid;
struct xfs_dquot *udqp = NULL;
@@ -1164,8 +1128,6 @@ xfs_create(
tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
}
- cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
-
/*
* Initially assume that the file does not exist and
* reserve the resources for that case. If that is not
@@ -1183,10 +1145,9 @@ xfs_create(
resblks = 0;
error = xfs_trans_reserve(tp, tres, 0, 0);
}
- if (error) {
- cancel_flags = 0;
+ if (error)
goto out_trans_cancel;
- }
+
xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
unlock_dp_on_error = true;
@@ -1217,7 +1178,7 @@ xfs_create(
if (error) {
if (error == -ENOSPC)
goto out_trans_cancel;
- goto out_trans_abort;
+ goto out_trans_cancel;
}
/*
@@ -1235,7 +1196,7 @@ xfs_create(
resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
if (error) {
ASSERT(error != -ENOSPC);
- goto out_trans_abort;
+ goto out_trans_cancel;
}
xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
@@ -1269,7 +1230,7 @@ xfs_create(
if (error)
goto out_bmap_cancel;
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
if (error)
goto out_release_inode;
@@ -1282,10 +1243,8 @@ xfs_create(
out_bmap_cancel:
xfs_bmap_cancel(&free_list);
- out_trans_abort:
- cancel_flags |= XFS_TRANS_ABORT;
out_trans_cancel:
- xfs_trans_cancel(tp, cancel_flags);
+ xfs_trans_cancel(tp);
out_release_inode:
/*
* Wait until after the current transaction is aborted to finish the
@@ -1317,7 +1276,6 @@ xfs_create_tmpfile(
struct xfs_inode *ip = NULL;
struct xfs_trans *tp = NULL;
int error;
- uint cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
prid_t prid;
struct xfs_dquot *udqp = NULL;
struct xfs_dquot *gdqp = NULL;
@@ -1350,10 +1308,8 @@ xfs_create_tmpfile(
resblks = 0;
error = xfs_trans_reserve(tp, tres, 0, 0);
}
- if (error) {
- cancel_flags = 0;
+ if (error)
goto out_trans_cancel;
- }
error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
pdqp, resblks, 1, 0);
@@ -1365,7 +1321,7 @@ xfs_create_tmpfile(
if (error) {
if (error == -ENOSPC)
goto out_trans_cancel;
- goto out_trans_abort;
+ goto out_trans_cancel;
}
if (mp->m_flags & XFS_MOUNT_WSYNC)
@@ -1381,9 +1337,9 @@ xfs_create_tmpfile(
ip->i_d.di_nlink--;
error = xfs_iunlink(tp, ip);
if (error)
- goto out_trans_abort;
+ goto out_trans_cancel;
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
if (error)
goto out_release_inode;
@@ -1394,10 +1350,8 @@ xfs_create_tmpfile(
*ipp = ip;
return 0;
- out_trans_abort:
- cancel_flags |= XFS_TRANS_ABORT;
out_trans_cancel:
- xfs_trans_cancel(tp, cancel_flags);
+ xfs_trans_cancel(tp);
out_release_inode:
/*
* Wait until after the current transaction is aborted to finish the
@@ -1427,7 +1381,6 @@ xfs_link(
int error;
xfs_bmap_free_t free_list;
xfs_fsblock_t first_block;
- int cancel_flags;
int committed;
int resblks;
@@ -1447,17 +1400,14 @@ xfs_link(
goto std_return;
tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
- cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, resblks, 0);
if (error == -ENOSPC) {
resblks = 0;
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, 0, 0);
}
- if (error) {
- cancel_flags = 0;
+ if (error)
goto error_return;
- }
xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
@@ -1486,19 +1436,19 @@ xfs_link(
if (sip->i_d.di_nlink == 0) {
error = xfs_iunlink_remove(tp, sip);
if (error)
- goto abort_return;
+ goto error_return;
}
error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
&first_block, &free_list, resblks);
if (error)
- goto abort_return;
+ goto error_return;
xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
error = xfs_bumplink(tp, sip);
if (error)
- goto abort_return;
+ goto error_return;
/*
* If this is a synchronous mount, make sure that the
@@ -1512,15 +1462,13 @@ xfs_link(
error = xfs_bmap_finish (&tp, &free_list, &committed);
if (error) {
xfs_bmap_cancel(&free_list);
- goto abort_return;
+ goto error_return;
}
- return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ return xfs_trans_commit(tp);
- abort_return:
- cancel_flags |= XFS_TRANS_ABORT;
error_return:
- xfs_trans_cancel(tp, cancel_flags);
+ xfs_trans_cancel(tp);
std_return:
return error;
}
@@ -1555,7 +1503,6 @@ xfs_itruncate_extents(
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp = *tpp;
- struct xfs_trans *ntp;
xfs_bmap_free_t free_list;
xfs_fsblock_t first_block;
xfs_fileoff_t first_unmap_block;
@@ -1613,29 +1560,7 @@ xfs_itruncate_extents(
if (error)
goto out_bmap_cancel;
- if (committed) {
- /*
- * Mark the inode dirty so it will be logged and
- * moved forward in the log as part of every commit.
- */
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- }
-
- ntp = xfs_trans_dup(tp);
- error = xfs_trans_commit(tp, 0);
- tp = ntp;
-
- xfs_trans_ijoin(tp, ip, 0);
-
- if (error)
- goto out;
-
- /*
- * Transaction commit worked ok so we can drop the extra ticket
- * reference that we gained in xfs_trans_dup()
- */
- xfs_log_ticket_put(tp->t_ticket);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+ error = xfs_trans_roll(&tp, ip);
if (error)
goto out;
}
@@ -1756,7 +1681,7 @@ xfs_inactive_truncate(
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
if (error) {
ASSERT(XFS_FORCED_SHUTDOWN(mp));
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return error;
}
@@ -1777,7 +1702,7 @@ xfs_inactive_truncate(
ASSERT(ip->i_d.di_nextents == 0);
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
if (error)
goto error_unlock;
@@ -1785,7 +1710,7 @@ xfs_inactive_truncate(
return 0;
error_trans_cancel:
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+ xfs_trans_cancel(tp);
error_unlock:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
@@ -1835,7 +1760,7 @@ xfs_inactive_ifree(
} else {
ASSERT(XFS_FORCED_SHUTDOWN(mp));
}
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES);
+ xfs_trans_cancel(tp);
return error;
}
@@ -1855,7 +1780,7 @@ xfs_inactive_ifree(
__func__, error);
xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
}
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+ xfs_trans_cancel(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
@@ -1874,7 +1799,7 @@ xfs_inactive_ifree(
if (error)
xfs_notice(mp, "%s: xfs_bmap_finish returned error %d",
__func__, error);
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
if (error)
xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
__func__, error);
@@ -2235,28 +2160,42 @@ xfs_iunlink_remove(
*/
STATIC int
xfs_ifree_cluster(
- xfs_inode_t *free_ip,
- xfs_trans_t *tp,
- xfs_ino_t inum)
+ xfs_inode_t *free_ip,
+ xfs_trans_t *tp,
+ struct xfs_icluster *xic)
{
xfs_mount_t *mp = free_ip->i_mount;
int blks_per_cluster;
int inodes_per_cluster;
int nbufs;
int i, j;
+ int ioffset;
xfs_daddr_t blkno;
xfs_buf_t *bp;
xfs_inode_t *ip;
xfs_inode_log_item_t *iip;
xfs_log_item_t *lip;
struct xfs_perag *pag;
+ xfs_ino_t inum;
+ inum = xic->first_ino;
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
blks_per_cluster = xfs_icluster_size_fsb(mp);
inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
nbufs = mp->m_ialloc_blks / blks_per_cluster;
for (j = 0; j < nbufs; j++, inum += inodes_per_cluster) {
+ /*
+ * The allocation bitmap tells us which inodes of the chunk were
+ * physically allocated. Skip the cluster if an inode falls into
+ * a sparse region.
+ */
+ ioffset = inum - xic->first_ino;
+ if ((xic->alloc & XFS_INOBT_MASK(ioffset)) == 0) {
+ ASSERT(do_mod(ioffset, inodes_per_cluster) == 0);
+ continue;
+ }
+
blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
XFS_INO_TO_AGBNO(mp, inum));
@@ -2414,8 +2353,7 @@ xfs_ifree(
xfs_bmap_free_t *flist)
{
int error;
- int delete;
- xfs_ino_t first_ino;
+ struct xfs_icluster xic = { 0 };
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
ASSERT(ip->i_d.di_nlink == 0);
@@ -2431,7 +2369,7 @@ xfs_ifree(
if (error)
return error;
- error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino);
+ error = xfs_difree(tp, ip->i_ino, flist, &xic);
if (error)
return error;
@@ -2448,8 +2386,8 @@ xfs_ifree(
ip->i_d.di_gen++;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- if (delete)
- error = xfs_ifree_cluster(ip, tp, first_ino);
+ if (xic.deleted)
+ error = xfs_ifree_cluster(ip, tp, &xic);
return error;
}
@@ -2536,7 +2474,6 @@ xfs_remove(
int error = 0;
xfs_bmap_free_t free_list;
xfs_fsblock_t first_block;
- int cancel_flags;
int committed;
uint resblks;
@@ -2557,7 +2494,6 @@ xfs_remove(
tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
else
tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
- cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
/*
* We try to get the real space reservation first,
@@ -2576,7 +2512,6 @@ xfs_remove(
}
if (error) {
ASSERT(error != -ENOSPC);
- cancel_flags = 0;
goto out_trans_cancel;
}
@@ -2588,7 +2523,6 @@ xfs_remove(
/*
* If we're removing a directory perform some additional validation.
*/
- cancel_flags |= XFS_TRANS_ABORT;
if (is_dir) {
ASSERT(ip->i_d.di_nlink >= 2);
if (ip->i_d.di_nlink != 2) {
@@ -2644,7 +2578,7 @@ xfs_remove(
if (error)
goto out_bmap_cancel;
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
if (error)
goto std_return;
@@ -2656,7 +2590,7 @@ xfs_remove(
out_bmap_cancel:
xfs_bmap_cancel(&free_list);
out_trans_cancel:
- xfs_trans_cancel(tp, cancel_flags);
+ xfs_trans_cancel(tp);
std_return:
return error;
}
@@ -2730,11 +2664,11 @@ xfs_finish_rename(
error = xfs_bmap_finish(&tp, free_list, &committed);
if (error) {
xfs_bmap_cancel(free_list);
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+ xfs_trans_cancel(tp);
return error;
}
- return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ return xfs_trans_commit(tp);
}
/*
@@ -2855,7 +2789,7 @@ xfs_cross_rename(
out_trans_abort:
xfs_bmap_cancel(free_list);
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+ xfs_trans_cancel(tp);
return error;
}
@@ -2915,7 +2849,6 @@ xfs_rename(
int num_inodes = __XFS_SORT_INODES;
bool new_parent = (src_dp != target_dp);
bool src_is_directory = S_ISDIR(src_ip->i_d.di_mode);
- int cancel_flags = 0;
int spaceres;
int error;
@@ -2951,7 +2884,6 @@ xfs_rename(
}
if (error)
goto out_trans_cancel;
- cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
/*
* Attach the dquots to the inodes
@@ -3022,10 +2954,8 @@ xfs_rename(
error = xfs_dir_createname(tp, target_dp, target_name,
src_ip->i_ino, &first_block,
&free_list, spaceres);
- if (error == -ENOSPC)
- goto out_bmap_cancel;
if (error)
- goto out_trans_abort;
+ goto out_bmap_cancel;
xfs_trans_ichgtime(tp, target_dp,
XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
@@ -3033,7 +2963,7 @@ xfs_rename(
if (new_parent && src_is_directory) {
error = xfs_bumplink(tp, target_dp);
if (error)
- goto out_trans_abort;
+ goto out_bmap_cancel;
}
} else { /* target_ip != NULL */
/*
@@ -3065,7 +2995,7 @@ xfs_rename(
src_ip->i_ino,
&first_block, &free_list, spaceres);
if (error)
- goto out_trans_abort;
+ goto out_bmap_cancel;
xfs_trans_ichgtime(tp, target_dp,
XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
@@ -3076,7 +3006,7 @@ xfs_rename(
*/
error = xfs_droplink(tp, target_ip);
if (error)
- goto out_trans_abort;
+ goto out_bmap_cancel;
if (src_is_directory) {
/*
@@ -3084,7 +3014,7 @@ xfs_rename(
*/
error = xfs_droplink(tp, target_ip);
if (error)
- goto out_trans_abort;
+ goto out_bmap_cancel;
}
} /* target_ip != NULL */
@@ -3101,7 +3031,7 @@ xfs_rename(
&first_block, &free_list, spaceres);
ASSERT(error != -EEXIST);
if (error)
- goto out_trans_abort;
+ goto out_bmap_cancel;
}
/*
@@ -3127,7 +3057,7 @@ xfs_rename(
*/
error = xfs_droplink(tp, src_dp);
if (error)
- goto out_trans_abort;
+ goto out_bmap_cancel;
}
/*
@@ -3142,7 +3072,7 @@ xfs_rename(
error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
&first_block, &free_list, spaceres);
if (error)
- goto out_trans_abort;
+ goto out_bmap_cancel;
/*
* For whiteouts, we need to bump the link count on the whiteout inode.
@@ -3156,10 +3086,10 @@ xfs_rename(
ASSERT(VFS_I(wip)->i_nlink == 0 && wip->i_d.di_nlink == 0);
error = xfs_bumplink(tp, wip);
if (error)
- goto out_trans_abort;
+ goto out_bmap_cancel;
error = xfs_iunlink_remove(tp, wip);
if (error)
- goto out_trans_abort;
+ goto out_bmap_cancel;
xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE);
/*
@@ -3180,12 +3110,10 @@ xfs_rename(
IRELE(wip);
return error;
-out_trans_abort:
- cancel_flags |= XFS_TRANS_ABORT;
out_bmap_cancel:
xfs_bmap_cancel(&free_list);
out_trans_cancel:
- xfs_trans_cancel(tp, cancel_flags);
+ xfs_trans_cancel(tp);
if (wip)
IRELE(wip);
return error;
@@ -3464,7 +3392,7 @@ xfs_iflush_int(
ASSERT(ip->i_d.di_version > 1);
/* set *dip = inode's place in the buffer */
- dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
+ dip = xfs_buf_offset(bp, ip->i_imap.im_boffset);
if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC),
mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 87f67c6b654c..ea7d85af5310 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -336,7 +336,7 @@ xfs_set_dmattrs(
tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return error;
}
xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -346,7 +346,7 @@ xfs_set_dmattrs(
ip->i_d.di_dmstate = state;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- error = xfs_trans_commit(tp, 0);
+ error = xfs_trans_commit(tp);
return error;
}
@@ -1076,7 +1076,7 @@ xfs_ioctl_setattr_get_trans(
return tp;
out_cancel:
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return ERR_PTR(error);
}
@@ -1253,7 +1253,7 @@ xfs_ioctl_setattr(
else
ip->i_d.di_extsize = 0;
- code = xfs_trans_commit(tp, 0);
+ code = xfs_trans_commit(tp);
/*
* Release any dquot(s) the inode had kept before chown.
@@ -1265,7 +1265,7 @@ xfs_ioctl_setattr(
return code;
error_trans_cancel:
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
error_free_dquots:
xfs_qm_dqrele(udqp);
xfs_qm_dqrele(pdqp);
@@ -1338,11 +1338,11 @@ xfs_ioc_setxflags(
error = xfs_ioctl_setattr_xflags(tp, ip, &fa);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
goto out_drop_write;
}
- error = xfs_trans_commit(tp, 0);
+ error = xfs_trans_commit(tp);
out_drop_write:
mnt_drop_write_file(filp);
return error;
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 38e633bad8c2..1f86033171c8 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -183,7 +183,7 @@ xfs_iomap_write_direct(
* Check for running out of space, note: need lock to return
*/
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return error;
}
@@ -213,7 +213,7 @@ xfs_iomap_write_direct(
error = xfs_bmap_finish(&tp, &free_list, &committed);
if (error)
goto out_bmap_cancel;
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
if (error)
goto out_unlock;
@@ -236,7 +236,7 @@ out_bmap_cancel:
xfs_bmap_cancel(&free_list);
xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);
out_trans_cancel:
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+ xfs_trans_cancel(tp);
goto out_unlock;
}
@@ -690,7 +690,7 @@ xfs_iomap_write_allocate(
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
nres, 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return error;
}
xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -760,7 +760,7 @@ xfs_iomap_write_allocate(
if (error)
goto trans_cancel;
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
if (error)
goto error0;
@@ -791,7 +791,7 @@ xfs_iomap_write_allocate(
trans_cancel:
xfs_bmap_cancel(&free_list);
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+ xfs_trans_cancel(tp);
error0:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
@@ -853,7 +853,7 @@ xfs_iomap_write_unwritten(
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
resblks, 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return error;
}
@@ -890,7 +890,7 @@ xfs_iomap_write_unwritten(
if (error)
goto error_on_bmapi_transaction;
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
if (error)
return error;
@@ -914,7 +914,7 @@ xfs_iomap_write_unwritten(
error_on_bmapi_transaction:
xfs_bmap_cancel(&free_list);
- xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT));
+ xfs_trans_cancel(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 7f51f39f8acc..766b23f86ce9 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -699,7 +699,7 @@ xfs_setattr_nonsize(
if (mp->m_flags & XFS_MOUNT_WSYNC)
xfs_trans_set_sync(tp);
- error = xfs_trans_commit(tp, 0);
+ error = xfs_trans_commit(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -730,7 +730,7 @@ xfs_setattr_nonsize(
return 0;
out_trans_cancel:
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
out_dqrele:
xfs_qm_dqrele(udqp);
@@ -752,7 +752,6 @@ xfs_setattr_size(
struct xfs_trans *tp;
int error;
uint lock_flags = 0;
- uint commit_flags = 0;
bool did_zeroing = false;
trace_xfs_setattr(ip);
@@ -848,7 +847,11 @@ xfs_setattr_size(
* to hope that the caller sees ENOMEM and retries the truncate
* operation.
*/
- error = block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks);
+ if (IS_DAX(inode))
+ error = dax_truncate_page(inode, newsize, xfs_get_blocks_direct);
+ else
+ error = block_truncate_page(inode->i_mapping, newsize,
+ xfs_get_blocks);
if (error)
return error;
truncate_setsize(inode, newsize);
@@ -858,7 +861,6 @@ xfs_setattr_size(
if (error)
goto out_trans_cancel;
- commit_flags = XFS_TRANS_RELEASE_LOG_RES;
lock_flags |= XFS_ILOCK_EXCL;
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
@@ -898,7 +900,7 @@ xfs_setattr_size(
if (newsize <= oldsize) {
error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, newsize);
if (error)
- goto out_trans_abort;
+ goto out_trans_cancel;
/*
* Truncated "down", so we're removing references to old data
@@ -925,16 +927,14 @@ xfs_setattr_size(
if (mp->m_flags & XFS_MOUNT_WSYNC)
xfs_trans_set_sync(tp);
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
out_unlock:
if (lock_flags)
xfs_iunlock(ip, lock_flags);
return error;
-out_trans_abort:
- commit_flags |= XFS_TRANS_ABORT;
out_trans_cancel:
- xfs_trans_cancel(tp, commit_flags);
+ xfs_trans_cancel(tp);
goto out_unlock;
}
@@ -981,7 +981,7 @@ xfs_vn_update_time(
tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return error;
}
@@ -1003,7 +1003,7 @@ xfs_vn_update_time(
}
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP);
- return xfs_trans_commit(tp, 0);
+ return xfs_trans_commit(tp);
}
#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
@@ -1188,22 +1188,22 @@ xfs_diflags_to_iflags(
struct inode *inode,
struct xfs_inode *ip)
{
- if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE)
+ uint16_t flags = ip->i_d.di_flags;
+
+ inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC |
+ S_NOATIME | S_DAX);
+
+ if (flags & XFS_DIFLAG_IMMUTABLE)
inode->i_flags |= S_IMMUTABLE;
- else
- inode->i_flags &= ~S_IMMUTABLE;
- if (ip->i_d.di_flags & XFS_DIFLAG_APPEND)
+ if (flags & XFS_DIFLAG_APPEND)
inode->i_flags |= S_APPEND;
- else
- inode->i_flags &= ~S_APPEND;
- if (ip->i_d.di_flags & XFS_DIFLAG_SYNC)
+ if (flags & XFS_DIFLAG_SYNC)
inode->i_flags |= S_SYNC;
- else
- inode->i_flags &= ~S_SYNC;
- if (ip->i_d.di_flags & XFS_DIFLAG_NOATIME)
+ if (flags & XFS_DIFLAG_NOATIME)
inode->i_flags |= S_NOATIME;
- else
- inode->i_flags &= ~S_NOATIME;
+ /* XXX: Also needs an on-disk per inode flag! */
+ if (ip->i_mount->m_flags & XFS_MOUNT_DAX)
+ inode->i_flags |= S_DAX;
}
/*
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 80429891dc9b..f41b0c3fddab 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -252,7 +252,7 @@ xfs_bulkstat_grab_ichunk(
}
irec->ir_free |= xfs_inobt_maskn(0, idx);
- *icount = XFS_INODES_PER_CHUNK - irec->ir_freecount;
+ *icount = irec->ir_count - irec->ir_freecount;
}
return 0;
@@ -415,6 +415,8 @@ xfs_bulkstat(
goto del_cursor;
if (icount) {
irbp->ir_startino = r.ir_startino;
+ irbp->ir_holemask = r.ir_holemask;
+ irbp->ir_count = r.ir_count;
irbp->ir_freecount = r.ir_freecount;
irbp->ir_free = r.ir_free;
irbp++;
@@ -447,13 +449,15 @@ xfs_bulkstat(
* If this chunk has any allocated inodes, save it.
* Also start read-ahead now for this chunk.
*/
- if (r.ir_freecount < XFS_INODES_PER_CHUNK) {
+ if (r.ir_freecount < r.ir_count) {
xfs_bulkstat_ichunk_ra(mp, agno, &r);
irbp->ir_startino = r.ir_startino;
+ irbp->ir_holemask = r.ir_holemask;
+ irbp->ir_count = r.ir_count;
irbp->ir_freecount = r.ir_freecount;
irbp->ir_free = r.ir_free;
irbp++;
- icount += XFS_INODES_PER_CHUNK - r.ir_freecount;
+ icount += r.ir_count - r.ir_freecount;
}
error = xfs_btree_increment(cur, 0, &stat);
if (error || stat == 0) {
@@ -599,8 +603,7 @@ xfs_inumbers(
agino = r.ir_startino + XFS_INODES_PER_CHUNK - 1;
buffer[bufidx].xi_startino =
XFS_AGINO_TO_INO(mp, agno, r.ir_startino);
- buffer[bufidx].xi_alloccount =
- XFS_INODES_PER_CHUNK - r.ir_freecount;
+ buffer[bufidx].xi_alloccount = r.ir_count - r.ir_freecount;
buffer[bufidx].xi_allocmask = ~r.ir_free;
if (++bufidx == bcount) {
long written;
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 7c7842c85a08..85f883dd6207 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -32,26 +32,12 @@ typedef unsigned int __uint32_t;
typedef signed long long int __int64_t;
typedef unsigned long long int __uint64_t;
-typedef __uint32_t inst_t; /* an instruction */
-
typedef __s64 xfs_off_t; /* <file offset> type */
typedef unsigned long long xfs_ino_t; /* <inode> type */
typedef __s64 xfs_daddr_t; /* <disk address> type */
-typedef char * xfs_caddr_t; /* <core address> type */
typedef __u32 xfs_dev_t;
typedef __u32 xfs_nlink_t;
-/* __psint_t is the same size as a pointer */
-#if (BITS_PER_LONG == 32)
-typedef __int32_t __psint_t;
-typedef __uint32_t __psunsigned_t;
-#elif (BITS_PER_LONG == 64)
-typedef __int64_t __psint_t;
-typedef __uint64_t __psunsigned_t;
-#else
-#error BITS_PER_LONG must be 32 or 64
-#endif
-
#include "xfs_types.h"
#include "kmem.h"
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index bcc7cfabb787..08d4fe46f0fa 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -109,7 +109,7 @@ xlog_ungrant_log_space(
STATIC void
xlog_verify_dest_ptr(
struct xlog *log,
- char *ptr);
+ void *ptr);
STATIC void
xlog_verify_grant_tail(
struct xlog *log);
@@ -513,7 +513,7 @@ xfs_log_done(
struct xfs_mount *mp,
struct xlog_ticket *ticket,
struct xlog_in_core **iclog,
- uint flags)
+ bool regrant)
{
struct xlog *log = mp->m_log;
xfs_lsn_t lsn = 0;
@@ -526,14 +526,11 @@ xfs_log_done(
(((ticket->t_flags & XLOG_TIC_INITED) == 0) &&
(xlog_commit_record(log, ticket, iclog, &lsn)))) {
lsn = (xfs_lsn_t) -1;
- if (ticket->t_flags & XLOG_TIC_PERM_RESERV) {
- flags |= XFS_LOG_REL_PERM_RESERV;
- }
+ regrant = false;
}
- if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) == 0 ||
- (flags & XFS_LOG_REL_PERM_RESERV)) {
+ if (!regrant) {
trace_xfs_log_done_nonperm(log, ticket);
/*
@@ -541,7 +538,6 @@ xfs_log_done(
* request has been made to release a permanent reservation.
*/
xlog_ungrant_log_space(log, ticket);
- xfs_log_ticket_put(ticket);
} else {
trace_xfs_log_done_perm(log, ticket);
@@ -553,6 +549,7 @@ xfs_log_done(
ticket->t_flags |= XLOG_TIC_INITED;
}
+ xfs_log_ticket_put(ticket);
return lsn;
}
@@ -1447,7 +1444,7 @@ xlog_alloc_log(
iclog->ic_bp = bp;
iclog->ic_data = bp->b_addr;
#ifdef DEBUG
- log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header);
+ log->l_iclog_bak[i] = &iclog->ic_header;
#endif
head = &iclog->ic_header;
memset(head, 0, sizeof(xlog_rec_header_t));
@@ -1602,7 +1599,7 @@ xlog_pack_data(
int i, j, k;
int size = iclog->ic_offset + roundoff;
__be32 cycle_lsn;
- xfs_caddr_t dp;
+ char *dp;
cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn);
@@ -3664,7 +3661,7 @@ xlog_ticket_alloc(
void
xlog_verify_dest_ptr(
struct xlog *log,
- char *ptr)
+ void *ptr)
{
int i;
int good_ptr = 0;
@@ -3767,9 +3764,8 @@ xlog_verify_iclog(
xlog_op_header_t *ophead;
xlog_in_core_t *icptr;
xlog_in_core_2_t *xhdr;
- xfs_caddr_t ptr;
- xfs_caddr_t base_ptr;
- __psint_t field_offset;
+ void *base_ptr, *ptr, *p;
+ ptrdiff_t field_offset;
__uint8_t clientid;
int len, i, j, k, op_len;
int idx;
@@ -3788,9 +3784,9 @@ xlog_verify_iclog(
if (iclog->ic_header.h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
xfs_emerg(log->l_mp, "%s: invalid magic num", __func__);
- ptr = (xfs_caddr_t) &iclog->ic_header;
- for (ptr += BBSIZE; ptr < ((xfs_caddr_t)&iclog->ic_header) + count;
- ptr += BBSIZE) {
+ base_ptr = ptr = &iclog->ic_header;
+ p = &iclog->ic_header;
+ for (ptr += BBSIZE; ptr < base_ptr + count; ptr += BBSIZE) {
if (*(__be32 *)ptr == cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
xfs_emerg(log->l_mp, "%s: unexpected magic num",
__func__);
@@ -3798,20 +3794,19 @@ xlog_verify_iclog(
/* check fields */
len = be32_to_cpu(iclog->ic_header.h_num_logops);
- ptr = iclog->ic_datap;
- base_ptr = ptr;
- ophead = (xlog_op_header_t *)ptr;
+ base_ptr = ptr = iclog->ic_datap;
+ ophead = ptr;
xhdr = iclog->ic_data;
for (i = 0; i < len; i++) {
- ophead = (xlog_op_header_t *)ptr;
+ ophead = ptr;
/* clientid is only 1 byte */
- field_offset = (__psint_t)
- ((xfs_caddr_t)&(ophead->oh_clientid) - base_ptr);
+ p = &ophead->oh_clientid;
+ field_offset = p - base_ptr;
if (!syncing || (field_offset & 0x1ff)) {
clientid = ophead->oh_clientid;
} else {
- idx = BTOBBT((xfs_caddr_t)&(ophead->oh_clientid) - iclog->ic_datap);
+ idx = BTOBBT((char *)&ophead->oh_clientid - iclog->ic_datap);
if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
@@ -3829,13 +3824,13 @@ xlog_verify_iclog(
(unsigned long)field_offset);
/* check length */
- field_offset = (__psint_t)
- ((xfs_caddr_t)&(ophead->oh_len) - base_ptr);
+ p = &ophead->oh_len;
+ field_offset = p - base_ptr;
if (!syncing || (field_offset & 0x1ff)) {
op_len = be32_to_cpu(ophead->oh_len);
} else {
- idx = BTOBBT((__psint_t)&ophead->oh_len -
- (__psint_t)iclog->ic_datap);
+ idx = BTOBBT((uintptr_t)&ophead->oh_len -
+ (uintptr_t)iclog->ic_datap);
if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 84e0deb95abd..fa27aaec72cb 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -111,15 +111,6 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
#define XFS_LSN_CMP(x,y) _lsn_cmp(x,y)
/*
- * Macros, structures, prototypes for interface to the log manager.
- */
-
-/*
- * Flags to xfs_log_done()
- */
-#define XFS_LOG_REL_PERM_RESERV 0x1
-
-/*
* Flags to xfs_log_force()
*
* XFS_LOG_SYNC: Synchronous force in-core log to disk
@@ -138,7 +129,7 @@ struct xfs_log_callback;
xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
struct xlog_ticket *ticket,
struct xlog_in_core **iclog,
- uint flags);
+ bool regrant);
int _xfs_log_force(struct xfs_mount *mp,
uint flags,
int *log_forced);
@@ -183,7 +174,7 @@ struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
void xfs_log_ticket_put(struct xlog_ticket *ticket);
void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_lsn_t *commit_lsn, int flags);
+ xfs_lsn_t *commit_lsn, bool regrant);
bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
void xfs_log_work_queue(struct xfs_mount *mp);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 45cc0ce18adf..abc2ccbff739 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -624,7 +624,7 @@ restart:
spin_unlock(&cil->xc_push_lock);
/* xfs_log_done always frees the ticket on error. */
- commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0);
+ commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, false);
if (commit_lsn == -1)
goto out_abort;
@@ -773,14 +773,10 @@ xfs_log_commit_cil(
struct xfs_mount *mp,
struct xfs_trans *tp,
xfs_lsn_t *commit_lsn,
- int flags)
+ bool regrant)
{
struct xlog *log = mp->m_log;
struct xfs_cil *cil = log->l_cilp;
- int log_flags = 0;
-
- if (flags & XFS_TRANS_RELEASE_LOG_RES)
- log_flags = XFS_LOG_REL_PERM_RESERV;
/* lock out background commit */
down_read(&cil->xc_ctx_lock);
@@ -795,7 +791,7 @@ xfs_log_commit_cil(
if (commit_lsn)
*commit_lsn = tp->t_commit_lsn;
- xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
+ xfs_log_done(mp, tp->t_ticket, NULL, regrant);
xfs_trans_unreserve_and_mod_sb(tp);
/*
@@ -809,7 +805,7 @@ xfs_log_commit_cil(
* the log items. This affects (at least) processing of stale buffers,
* inodes and EFIs.
*/
- xfs_trans_free_items(tp, tp->t_commit_lsn, 0);
+ xfs_trans_free_items(tp, tp->t_commit_lsn, false);
xlog_cil_push_background(log);
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index db7cbdeb2b42..1c87c8abfbed 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -409,7 +409,7 @@ struct xlog {
/* The following field are used for debugging; need to hold icloglock */
#ifdef DEBUG
- char *l_iclog_bak[XLOG_MAX_ICLOGS];
+ void *l_iclog_bak[XLOG_MAX_ICLOGS];
#endif
};
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 4f5784f85a5b..01dd228ca05e 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -147,7 +147,7 @@ xlog_put_bp(
* Return the address of the start of the given block number's data
* in a log buffer. The buffer covers a log sector-aligned region.
*/
-STATIC xfs_caddr_t
+STATIC char *
xlog_align(
struct xlog *log,
xfs_daddr_t blk_no,
@@ -203,7 +203,7 @@ xlog_bread(
xfs_daddr_t blk_no,
int nbblks,
struct xfs_buf *bp,
- xfs_caddr_t *offset)
+ char **offset)
{
int error;
@@ -225,9 +225,9 @@ xlog_bread_offset(
xfs_daddr_t blk_no, /* block to read from */
int nbblks, /* blocks to read */
struct xfs_buf *bp,
- xfs_caddr_t offset)
+ char *offset)
{
- xfs_caddr_t orig_offset = bp->b_addr;
+ char *orig_offset = bp->b_addr;
int orig_len = BBTOB(bp->b_length);
int error, error2;
@@ -396,7 +396,7 @@ xlog_find_cycle_start(
xfs_daddr_t *last_blk,
uint cycle)
{
- xfs_caddr_t offset;
+ char *offset;
xfs_daddr_t mid_blk;
xfs_daddr_t end_blk;
uint mid_cycle;
@@ -443,7 +443,7 @@ xlog_find_verify_cycle(
uint cycle;
xfs_buf_t *bp;
xfs_daddr_t bufblks;
- xfs_caddr_t buf = NULL;
+ char *buf = NULL;
int error = 0;
/*
@@ -509,7 +509,7 @@ xlog_find_verify_log_record(
{
xfs_daddr_t i;
xfs_buf_t *bp;
- xfs_caddr_t offset = NULL;
+ char *offset = NULL;
xlog_rec_header_t *head = NULL;
int error = 0;
int smallmem = 0;
@@ -616,7 +616,7 @@ xlog_find_head(
xfs_daddr_t *return_head_blk)
{
xfs_buf_t *bp;
- xfs_caddr_t offset;
+ char *offset;
xfs_daddr_t new_blk, first_blk, start_blk, last_blk, head_blk;
int num_scan_bblks;
uint first_half_cycle, last_half_cycle;
@@ -891,7 +891,7 @@ xlog_find_tail(
{
xlog_rec_header_t *rhead;
xlog_op_header_t *op_head;
- xfs_caddr_t offset = NULL;
+ char *offset = NULL;
xfs_buf_t *bp;
int error, i, found;
xfs_daddr_t umount_data_blk;
@@ -1099,7 +1099,7 @@ xlog_find_zeroed(
xfs_daddr_t *blk_no)
{
xfs_buf_t *bp;
- xfs_caddr_t offset;
+ char *offset;
uint first_cycle, last_cycle;
xfs_daddr_t new_blk, last_blk, start_blk;
xfs_daddr_t num_scan_bblks;
@@ -1199,7 +1199,7 @@ bp_err:
STATIC void
xlog_add_record(
struct xlog *log,
- xfs_caddr_t buf,
+ char *buf,
int cycle,
int block,
int tail_cycle,
@@ -1227,7 +1227,7 @@ xlog_write_log_records(
int tail_cycle,
int tail_block)
{
- xfs_caddr_t offset;
+ char *offset;
xfs_buf_t *bp;
int balign, ealign;
int sectbb = log->l_sectBBsize;
@@ -1789,8 +1789,7 @@ xlog_recover_do_inode_buffer(
return -EFSCORRUPTED;
}
- buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp,
- next_unlinked_offset);
+ buffer_nextp = xfs_buf_offset(bp, next_unlinked_offset);
*buffer_nextp = *logged_nextp;
/*
@@ -1798,7 +1797,7 @@ xlog_recover_do_inode_buffer(
* have to leave the inode in a consistent state for whoever
* reads it next....
*/
- xfs_dinode_calc_crc(mp, (struct xfs_dinode *)
+ xfs_dinode_calc_crc(mp,
xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize));
}
@@ -2503,8 +2502,8 @@ xlog_recover_inode_pass2(
xfs_buf_t *bp;
xfs_dinode_t *dip;
int len;
- xfs_caddr_t src;
- xfs_caddr_t dest;
+ char *src;
+ char *dest;
int error;
int attr_index;
uint fields;
@@ -2546,7 +2545,7 @@ xlog_recover_inode_pass2(
goto out_release;
}
ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
- dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset);
+ dip = xfs_buf_offset(bp, in_f->ilf_boffset);
/*
* Make sure the place we're flushing out to really looks
@@ -2885,7 +2884,7 @@ xlog_recover_dquot_pass2(
return error;
ASSERT(bp);
- ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset);
+ ddq = xfs_buf_offset(bp, dq_f->qlf_boffset);
/*
* If the dquot has an LSN in it, recover the dquot only if it's less
@@ -3068,12 +3067,22 @@ xlog_recover_do_icreate_pass2(
return -EINVAL;
}
- /* existing allocation is fixed value */
- ASSERT(count == mp->m_ialloc_inos);
- ASSERT(length == mp->m_ialloc_blks);
- if (count != mp->m_ialloc_inos ||
- length != mp->m_ialloc_blks) {
- xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2");
+ /*
+ * The inode chunk is either full or sparse and we only support
+ * m_ialloc_min_blks sized sparse allocations at this time.
+ */
+ if (length != mp->m_ialloc_blks &&
+ length != mp->m_ialloc_min_blks) {
+ xfs_warn(log->l_mp,
+ "%s: unsupported chunk length", __FUNCTION__);
+ return -EINVAL;
+ }
+
+ /* verify inode count is consistent with extent length */
+ if ((count >> mp->m_sb.sb_inopblog) != length) {
+ xfs_warn(log->l_mp,
+ "%s: inconsistent inode count and chunk length",
+ __FUNCTION__);
return -EINVAL;
}
@@ -3091,8 +3100,8 @@ xlog_recover_do_icreate_pass2(
XFS_AGB_TO_DADDR(mp, agno, agbno), length, 0))
return 0;
- xfs_ialloc_inode_init(mp, NULL, buffer_list, agno, agbno, length,
- be32_to_cpu(icl->icl_gen));
+ xfs_ialloc_inode_init(mp, NULL, buffer_list, count, agno, agbno, length,
+ be32_to_cpu(icl->icl_gen));
return 0;
}
@@ -3364,17 +3373,17 @@ STATIC int
xlog_recover_add_to_cont_trans(
struct xlog *log,
struct xlog_recover *trans,
- xfs_caddr_t dp,
+ char *dp,
int len)
{
xlog_recover_item_t *item;
- xfs_caddr_t ptr, old_ptr;
+ char *ptr, *old_ptr;
int old_len;
if (list_empty(&trans->r_itemq)) {
/* finish copying rest of trans header */
xlog_recover_add_item(&trans->r_itemq);
- ptr = (xfs_caddr_t) &trans->r_theader +
+ ptr = (char *)&trans->r_theader +
sizeof(xfs_trans_header_t) - len;
memcpy(ptr, dp, len);
return 0;
@@ -3410,12 +3419,12 @@ STATIC int
xlog_recover_add_to_trans(
struct xlog *log,
struct xlog_recover *trans,
- xfs_caddr_t dp,
+ char *dp,
int len)
{
xfs_inode_log_format_t *in_f; /* any will do */
xlog_recover_item_t *item;
- xfs_caddr_t ptr;
+ char *ptr;
if (!len)
return 0;
@@ -3504,7 +3513,7 @@ STATIC int
xlog_recovery_process_trans(
struct xlog *log,
struct xlog_recover *trans,
- xfs_caddr_t dp,
+ char *dp,
unsigned int len,
unsigned int flags,
int pass)
@@ -3611,8 +3620,8 @@ xlog_recover_process_ophdr(
struct hlist_head rhash[],
struct xlog_rec_header *rhead,
struct xlog_op_header *ohead,
- xfs_caddr_t dp,
- xfs_caddr_t end,
+ char *dp,
+ char *end,
int pass)
{
struct xlog_recover *trans;
@@ -3661,11 +3670,11 @@ xlog_recover_process_data(
struct xlog *log,
struct hlist_head rhash[],
struct xlog_rec_header *rhead,
- xfs_caddr_t dp,
+ char *dp,
int pass)
{
struct xlog_op_header *ohead;
- xfs_caddr_t end;
+ char *end;
int num_logops;
int error;
@@ -3751,11 +3760,11 @@ xlog_recover_process_efi(
}
set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
- error = xfs_trans_commit(tp, 0);
+ error = xfs_trans_commit(tp);
return error;
abort_error:
- xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+ xfs_trans_cancel(tp);
return error;
}
@@ -3857,13 +3866,13 @@ xlog_recover_clear_agi_bucket(
xfs_trans_log_buf(tp, agibp, offset,
(offset + sizeof(xfs_agino_t) - 1));
- error = xfs_trans_commit(tp, 0);
+ error = xfs_trans_commit(tp);
if (error)
goto out_error;
return;
out_abort:
- xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+ xfs_trans_cancel(tp);
out_error:
xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno);
return;
@@ -4010,7 +4019,7 @@ xlog_recover_process_iunlinks(
STATIC int
xlog_unpack_data_crc(
struct xlog_rec_header *rhead,
- xfs_caddr_t dp,
+ char *dp,
struct xlog *log)
{
__le32 crc;
@@ -4040,7 +4049,7 @@ xlog_unpack_data_crc(
STATIC int
xlog_unpack_data(
struct xlog_rec_header *rhead,
- xfs_caddr_t dp,
+ char *dp,
struct xlog *log)
{
int i, j, k;
@@ -4122,7 +4131,7 @@ xlog_do_recovery_pass(
{
xlog_rec_header_t *rhead;
xfs_daddr_t blk_no;
- xfs_caddr_t offset;
+ char *offset;
xfs_buf_t *hbp, *dbp;
int error = 0, h_size;
int bblks, split_bblks;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 6f23fbdfb365..461e791efad7 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -725,6 +725,22 @@ xfs_mountfs(
}
/*
+ * If enabled, sparse inode chunk alignment is expected to match the
+ * cluster size. Full inode chunk alignment must match the chunk size,
+ * but that is checked on sb read verification...
+ */
+ if (xfs_sb_version_hassparseinodes(&mp->m_sb) &&
+ mp->m_sb.sb_spino_align !=
+ XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size)) {
+ xfs_warn(mp,
+ "Sparse inode block alignment (%u) must match cluster size (%llu).",
+ mp->m_sb.sb_spino_align,
+ XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size));
+ error = -EINVAL;
+ goto out_remove_uuid;
+ }
+
+ /*
* Set inode alignment fields
*/
xfs_set_inoalignment(mp);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 8c995a2ccb6f..7999e91cd49a 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -101,6 +101,8 @@ typedef struct xfs_mount {
__uint64_t m_flags; /* global mount flags */
int m_ialloc_inos; /* inodes in inode allocation */
int m_ialloc_blks; /* blocks in inode allocation */
+ int m_ialloc_min_blks;/* min blocks in sparse inode
+ * allocation */
int m_inoalign_mask;/* mask sb_inoalignmt if used */
uint m_qflags; /* quota status flags */
struct xfs_trans_resv m_resv; /* precomputed res values */
@@ -179,6 +181,8 @@ typedef struct xfs_mount {
allocator */
#define XFS_MOUNT_NOATTR2 (1ULL << 25) /* disable use of attr2 format */
+#define XFS_MOUNT_DAX (1ULL << 62) /* TEST ONLY! */
+
/*
* Default minimum read and write sizes.
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index 981a657eca39..ab4a6066f7ca 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -306,7 +306,7 @@ xfs_fs_commit_blocks(
tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
goto out_drop_iolock;
}
@@ -321,7 +321,7 @@ xfs_fs_commit_blocks(
}
xfs_trans_set_sync(tp);
- error = xfs_trans_commit(tp, 0);
+ error = xfs_trans_commit(tp);
out_drop_iolock:
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 5538468c7f63..eac9549efd52 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -756,7 +756,7 @@ xfs_qm_qino_alloc(
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_create,
XFS_QM_QINOCREATE_SPACE_RES(mp), 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return error;
}
@@ -764,8 +764,7 @@ xfs_qm_qino_alloc(
error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip,
&committed);
if (error) {
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
- XFS_TRANS_ABORT);
+ xfs_trans_cancel(tp);
return error;
}
}
@@ -796,7 +795,7 @@ xfs_qm_qino_alloc(
spin_unlock(&mp->m_sb_lock);
xfs_log_sb(tp);
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
if (error) {
ASSERT(XFS_FORCED_SHUTDOWN(mp));
xfs_alert(mp, "%s failed (error %d)!", __func__, error);
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 9a25c9275fb3..3640c6e896af 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -239,7 +239,7 @@ xfs_qm_scall_trunc_qfile(
tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE);
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
goto out_put;
}
@@ -252,15 +252,14 @@ xfs_qm_scall_trunc_qfile(
error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
if (error) {
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
- XFS_TRANS_ABORT);
+ xfs_trans_cancel(tp);
goto out_unlock;
}
ASSERT(ip->i_d.di_nextents == 0);
xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
out_unlock:
xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
@@ -437,7 +436,7 @@ xfs_qm_scall_setqlim(
tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM);
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_setqlim, 0, 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
goto out_rele;
}
@@ -548,7 +547,7 @@ xfs_qm_scall_setqlim(
dqp->dq_flags |= XFS_DQ_DIRTY;
xfs_trans_log_dquot(tp, dqp);
- error = xfs_trans_commit(tp, 0);
+ error = xfs_trans_commit(tp);
out_rele:
xfs_qm_dqrele(dqp);
@@ -571,7 +570,7 @@ xfs_qm_log_quotaoff_end(
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_equotaoff, 0, 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return error;
}
@@ -585,8 +584,7 @@ xfs_qm_log_quotaoff_end(
* We don't care about quotaoff's performance.
*/
xfs_trans_set_sync(tp);
- error = xfs_trans_commit(tp, 0);
- return error;
+ return xfs_trans_commit(tp);
}
@@ -605,7 +603,7 @@ xfs_qm_log_quotaoff(
tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF);
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_quotaoff, 0, 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
goto out;
}
@@ -624,7 +622,7 @@ xfs_qm_log_quotaoff(
* We don't care about quotaoff's performance.
*/
xfs_trans_set_sync(tp);
- error = xfs_trans_commit(tp, 0);
+ error = xfs_trans_commit(tp);
if (error)
goto out;
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 5376dd406ba2..ce6506adab7b 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -55,7 +55,6 @@ struct xfs_trans;
typedef struct xfs_dqtrx {
struct xfs_dquot *qt_dquot; /* the dquot this refers to */
ulong qt_blk_res; /* blks reserved on a dquot */
- ulong qt_blk_res_used; /* blks used from the reservation */
ulong qt_ino_res; /* inode reserved on a dquot */
ulong qt_ino_res_used; /* inodes used from the reservation */
long qt_bcount_delta; /* dquot blk count changes */
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index f2079b6911cc..f4e8c06eee26 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -780,7 +780,6 @@ xfs_growfs_rt_alloc(
* Allocate space to the file, as necessary.
*/
while (oblocks < nblocks) {
- int cancelflags = 0;
xfs_trans_t *tp;
tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC);
@@ -792,7 +791,6 @@ xfs_growfs_rt_alloc(
resblks, 0);
if (error)
goto error_cancel;
- cancelflags = XFS_TRANS_RELEASE_LOG_RES;
/*
* Lock the inode.
*/
@@ -804,7 +802,6 @@ xfs_growfs_rt_alloc(
* Allocate blocks to the bitmap file.
*/
nmap = 1;
- cancelflags |= XFS_TRANS_ABORT;
error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks,
XFS_BMAPI_METADATA, &firstblock,
resblks, &map, &nmap, &flist);
@@ -818,14 +815,13 @@ xfs_growfs_rt_alloc(
error = xfs_bmap_finish(&tp, &flist, &committed);
if (error)
goto error_cancel;
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
if (error)
goto error;
/*
* Now we need to clear the allocated blocks.
* Do this one block per transaction, to keep it simple.
*/
- cancelflags = 0;
for (bno = map.br_startoff, fsbno = map.br_startblock;
bno < map.br_startoff + map.br_blockcount;
bno++, fsbno++) {
@@ -851,7 +847,7 @@ xfs_growfs_rt_alloc(
if (bp == NULL) {
error = -EIO;
error_cancel:
- xfs_trans_cancel(tp, cancelflags);
+ xfs_trans_cancel(tp);
goto error;
}
memset(bp->b_addr, 0, mp->m_sb.sb_blocksize);
@@ -859,7 +855,7 @@ error_cancel:
/*
* Commit the transaction.
*/
- error = xfs_trans_commit(tp, 0);
+ error = xfs_trans_commit(tp);
if (error)
goto error;
}
@@ -973,7 +969,6 @@ xfs_growfs_rt(
bmbno < nrbmblocks;
bmbno++) {
xfs_trans_t *tp;
- int cancelflags = 0;
*nmp = *mp;
nsbp = &nmp->m_sb;
@@ -1015,7 +1010,6 @@ xfs_growfs_rt(
mp->m_rbmip->i_d.di_size =
nsbp->sb_rbmblocks * nsbp->sb_blocksize;
xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
- cancelflags |= XFS_TRANS_ABORT;
/*
* Get the summary inode into the transaction.
*/
@@ -1062,7 +1056,7 @@ xfs_growfs_rt(
nsbp->sb_rextents - sbp->sb_rextents, &bp, &sumbno);
if (error) {
error_cancel:
- xfs_trans_cancel(tp, cancelflags);
+ xfs_trans_cancel(tp);
break;
}
/*
@@ -1076,7 +1070,7 @@ error_cancel:
mp->m_rsumlevels = nrsumlevels;
mp->m_rsumsize = nrsumsize;
- error = xfs_trans_commit(tp, 0);
+ error = xfs_trans_commit(tp);
if (error)
break;
}
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 858e1e62bbaa..1fb16562c159 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -112,6 +112,8 @@ static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */
#define MNTOPT_DISCARD "discard" /* Discard unused blocks */
#define MNTOPT_NODISCARD "nodiscard" /* Do not discard unused blocks */
+#define MNTOPT_DAX "dax" /* Enable direct access to bdev pages */
+
/*
* Table driven mount option parser.
*
@@ -363,6 +365,10 @@ xfs_parseargs(
mp->m_flags |= XFS_MOUNT_DISCARD;
} else if (!strcmp(this_char, MNTOPT_NODISCARD)) {
mp->m_flags &= ~XFS_MOUNT_DISCARD;
+#ifdef CONFIG_FS_DAX
+ } else if (!strcmp(this_char, MNTOPT_DAX)) {
+ mp->m_flags |= XFS_MOUNT_DAX;
+#endif
} else {
xfs_warn(mp, "unknown mount option [%s].", this_char);
return -EINVAL;
@@ -452,8 +458,8 @@ done:
}
struct proc_xfs_info {
- int flag;
- char *str;
+ uint64_t flag;
+ char *str;
};
STATIC int
@@ -474,6 +480,7 @@ xfs_showargs(
{ XFS_MOUNT_GRPID, "," MNTOPT_GRPID },
{ XFS_MOUNT_DISCARD, "," MNTOPT_DISCARD },
{ XFS_MOUNT_SMALL_INUMS, "," MNTOPT_32BITINODE },
+ { XFS_MOUNT_DAX, "," MNTOPT_DAX },
{ 0, NULL }
};
static struct proc_xfs_info xfs_info_unset[] = {
@@ -1507,6 +1514,20 @@ xfs_fs_fill_super(
if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5)
sb->s_flags |= MS_I_VERSION;
+ if (mp->m_flags & XFS_MOUNT_DAX) {
+ xfs_warn(mp,
+ "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
+ if (sb->s_blocksize != PAGE_SIZE) {
+ xfs_alert(mp,
+ "Filesystem block size invalid for DAX Turning DAX off.");
+ mp->m_flags &= ~XFS_MOUNT_DAX;
+ } else if (!sb->s_bdev->bd_disk->fops->direct_access) {
+ xfs_alert(mp,
+ "Block device does not support DAX Turning DAX off.");
+ mp->m_flags &= ~XFS_MOUNT_DAX;
+ }
+ }
+
error = xfs_mountfs(mp);
if (error)
goto out_filestream_unmount;
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 3df411eadb86..4be27b0210af 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -104,7 +104,7 @@ xfs_readlink_bmap(
cur_chunk += sizeof(struct xfs_dsymlink_hdr);
}
- memcpy(link + offset, bp->b_addr, byte_cnt);
+ memcpy(link + offset, cur_chunk, byte_cnt);
pathlen -= byte_cnt;
offset += byte_cnt;
@@ -178,7 +178,6 @@ xfs_symlink(
struct xfs_bmap_free free_list;
xfs_fsblock_t first_block;
bool unlock_dp_on_error = false;
- uint cancel_flags;
int committed;
xfs_fileoff_t first_fsb;
xfs_filblks_t fs_blocks;
@@ -224,7 +223,6 @@ xfs_symlink(
return error;
tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
- cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
/*
* The symlink will fit into the inode data fork?
* There can't be any attributes so we get the whole variable part.
@@ -239,10 +237,8 @@ xfs_symlink(
resblks = 0;
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, 0, 0);
}
- if (error) {
- cancel_flags = 0;
+ if (error)
goto out_trans_cancel;
- }
xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
unlock_dp_on_error = true;
@@ -394,7 +390,7 @@ xfs_symlink(
if (error)
goto out_bmap_cancel;
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
if (error)
goto out_release_inode;
@@ -407,9 +403,8 @@ xfs_symlink(
out_bmap_cancel:
xfs_bmap_cancel(&free_list);
- cancel_flags |= XFS_TRANS_ABORT;
out_trans_cancel:
- xfs_trans_cancel(tp, cancel_flags);
+ xfs_trans_cancel(tp);
out_release_inode:
/*
* Wait until after the current transaction is aborted to finish the
@@ -464,7 +459,7 @@ xfs_inactive_symlink_rmt(
tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return error;
}
@@ -533,7 +528,7 @@ xfs_inactive_symlink_rmt(
/*
* Commit the transaction containing extent freeing and EFDs.
*/
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
if (error) {
ASSERT(XFS_FORCED_SHUTDOWN(mp));
goto error_unlock;
@@ -552,7 +547,7 @@ xfs_inactive_symlink_rmt(
error_bmap_cancel:
xfs_bmap_cancel(&free_list);
error_trans_cancel:
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+ xfs_trans_cancel(tp);
error_unlock:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 615781bf4ee5..8d916d33d93d 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -738,6 +738,53 @@ TRACE_EVENT(xfs_iomap_prealloc_size,
__entry->blocks, __entry->shift, __entry->writeio_blocks)
)
+TRACE_EVENT(xfs_irec_merge_pre,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino,
+ uint16_t holemask, xfs_agino_t nagino, uint16_t nholemask),
+ TP_ARGS(mp, agno, agino, holemask, nagino, nholemask),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, agino)
+ __field(uint16_t, holemask)
+ __field(xfs_agino_t, nagino)
+ __field(uint16_t, nholemask)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->agino = agino;
+ __entry->holemask = holemask;
+ __entry->nagino = nagino;
+ __entry->nholemask = nholemask;
+ ),
+ TP_printk("dev %d:%d agno %d inobt (%u:0x%x) new (%u:0x%x)",
+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno,
+ __entry->agino, __entry->holemask, __entry->nagino,
+ __entry->nholemask)
+)
+
+TRACE_EVENT(xfs_irec_merge_post,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino,
+ uint16_t holemask),
+ TP_ARGS(mp, agno, agino, holemask),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, agino)
+ __field(uint16_t, holemask)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->agino = agino;
+ __entry->holemask = holemask;
+ ),
+ TP_printk("dev %d:%d agno %d inobt (%u:0x%x)", MAJOR(__entry->dev),
+ MINOR(__entry->dev), __entry->agno, __entry->agino,
+ __entry->holemask)
+)
+
#define DEFINE_IREF_EVENT(name) \
DEFINE_EVENT(xfs_iref_class, name, \
TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 220ef2c906b2..0582a27107d4 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -113,7 +113,7 @@ xfs_trans_free(
* blocks. Locks and log items, however, are no inherited. They must
* be added to the new transaction explicitly.
*/
-xfs_trans_t *
+STATIC xfs_trans_t *
xfs_trans_dup(
xfs_trans_t *tp)
{
@@ -251,14 +251,7 @@ xfs_trans_reserve(
*/
undo_log:
if (resp->tr_logres > 0) {
- int log_flags;
-
- if (resp->tr_logflags & XFS_TRANS_PERM_LOG_RES) {
- log_flags = XFS_LOG_REL_PERM_RESERV;
- } else {
- log_flags = 0;
- }
- xfs_log_done(tp->t_mountp, tp->t_ticket, NULL, log_flags);
+ xfs_log_done(tp->t_mountp, tp->t_ticket, NULL, false);
tp->t_ticket = NULL;
tp->t_log_res = 0;
tp->t_flags &= ~XFS_TRANS_PERM_LOG_RES;
@@ -744,7 +737,7 @@ void
xfs_trans_free_items(
struct xfs_trans *tp,
xfs_lsn_t commit_lsn,
- int flags)
+ bool abort)
{
struct xfs_log_item_desc *lidp, *next;
@@ -755,7 +748,7 @@ xfs_trans_free_items(
if (commit_lsn != NULLCOMMITLSN)
lip->li_ops->iop_committing(lip, commit_lsn);
- if (flags & XFS_TRANS_ABORT)
+ if (abort)
lip->li_flags |= XFS_LI_ABORTED;
lip->li_ops->iop_unlock(lip);
@@ -892,27 +885,17 @@ xfs_trans_committed_bulk(
* have already been unlocked as if the commit had succeeded.
* Do not reference the transaction structure after this call.
*/
-int
-xfs_trans_commit(
+static int
+__xfs_trans_commit(
struct xfs_trans *tp,
- uint flags)
+ bool regrant)
{
struct xfs_mount *mp = tp->t_mountp;
xfs_lsn_t commit_lsn = -1;
int error = 0;
- int log_flags = 0;
int sync = tp->t_flags & XFS_TRANS_SYNC;
/*
- * Determine whether this commit is releasing a permanent
- * log reservation or not.
- */
- if (flags & XFS_TRANS_RELEASE_LOG_RES) {
- ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
- log_flags = XFS_LOG_REL_PERM_RESERV;
- }
-
- /*
* If there is nothing to be logged by the transaction,
* then unlock all of the items associated with the
* transaction and free the transaction structure.
@@ -936,7 +919,7 @@ xfs_trans_commit(
xfs_trans_apply_sb_deltas(tp);
xfs_trans_apply_dquot_deltas(tp);
- xfs_log_commit_cil(mp, tp, &commit_lsn, flags);
+ xfs_log_commit_cil(mp, tp, &commit_lsn, regrant);
current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
xfs_trans_free(tp);
@@ -964,18 +947,25 @@ out_unreserve:
*/
xfs_trans_unreserve_and_mod_dquots(tp);
if (tp->t_ticket) {
- commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
+ commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, regrant);
if (commit_lsn == -1 && !error)
error = -EIO;
}
current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
- xfs_trans_free_items(tp, NULLCOMMITLSN, error ? XFS_TRANS_ABORT : 0);
+ xfs_trans_free_items(tp, NULLCOMMITLSN, !!error);
xfs_trans_free(tp);
XFS_STATS_INC(xs_trans_empty);
return error;
}
+int
+xfs_trans_commit(
+ struct xfs_trans *tp)
+{
+ return __xfs_trans_commit(tp, false);
+}
+
/*
* Unlock all of the transaction's items and free the transaction.
* The transaction must not have modified any of its items, because
@@ -986,29 +976,22 @@ out_unreserve:
*/
void
xfs_trans_cancel(
- xfs_trans_t *tp,
- int flags)
+ struct xfs_trans *tp)
{
- int log_flags;
- xfs_mount_t *mp = tp->t_mountp;
+ struct xfs_mount *mp = tp->t_mountp;
+ bool dirty = (tp->t_flags & XFS_TRANS_DIRTY);
/*
- * See if the caller is being too lazy to figure out if
- * the transaction really needs an abort.
- */
- if ((flags & XFS_TRANS_ABORT) && !(tp->t_flags & XFS_TRANS_DIRTY))
- flags &= ~XFS_TRANS_ABORT;
- /*
* See if the caller is relying on us to shut down the
* filesystem. This happens in paths where we detect
* corruption and decide to give up.
*/
- if ((tp->t_flags & XFS_TRANS_DIRTY) && !XFS_FORCED_SHUTDOWN(mp)) {
+ if (dirty && !XFS_FORCED_SHUTDOWN(mp)) {
XFS_ERROR_REPORT("xfs_trans_cancel", XFS_ERRLEVEL_LOW, mp);
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
}
#ifdef DEBUG
- if (!(flags & XFS_TRANS_ABORT) && !XFS_FORCED_SHUTDOWN(mp)) {
+ if (!dirty && !XFS_FORCED_SHUTDOWN(mp)) {
struct xfs_log_item_desc *lidp;
list_for_each_entry(lidp, &tp->t_items, lid_trans)
@@ -1018,27 +1001,20 @@ xfs_trans_cancel(
xfs_trans_unreserve_and_mod_sb(tp);
xfs_trans_unreserve_and_mod_dquots(tp);
- if (tp->t_ticket) {
- if (flags & XFS_TRANS_RELEASE_LOG_RES) {
- ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
- log_flags = XFS_LOG_REL_PERM_RESERV;
- } else {
- log_flags = 0;
- }
- xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
- }
+ if (tp->t_ticket)
+ xfs_log_done(mp, tp->t_ticket, NULL, false);
/* mark this thread as no longer being in a transaction */
current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
- xfs_trans_free_items(tp, NULLCOMMITLSN, flags);
+ xfs_trans_free_items(tp, NULLCOMMITLSN, dirty);
xfs_trans_free(tp);
}
/*
* Roll from one trans in the sequence of PERMANENT transactions to
* the next: permanent transactions are only flushed out when
- * committed with XFS_TRANS_RELEASE_LOG_RES, but we still want as soon
+ * committed with xfs_trans_commit(), but we still want as soon
* as possible to let chunks of it go to the log. So we commit the
* chunk we've been working on and get a new transaction to continue.
*/
@@ -1055,7 +1031,8 @@ xfs_trans_roll(
* Ensure that the inode is always logged.
*/
trans = *tpp;
- xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
+ if (dp)
+ xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
/*
* Copy the critical parameters from one trans to the next.
@@ -1071,20 +1048,13 @@ xfs_trans_roll(
* is in progress. The caller takes the responsibility to cancel
* the duplicate transaction that gets returned.
*/
- error = xfs_trans_commit(trans, 0);
+ error = __xfs_trans_commit(trans, true);
if (error)
return error;
trans = *tpp;
/*
- * transaction commit worked ok so we can drop the extra ticket
- * reference that we gained in xfs_trans_dup()
- */
- xfs_log_ticket_put(trans->t_ticket);
-
-
- /*
* Reserve space in the log for the next transaction.
* This also pushes items in the "AIL", the list of logged items,
* out to disk if they are taking up space at the tail of the log
@@ -1100,6 +1070,7 @@ xfs_trans_roll(
if (error)
return error;
- xfs_trans_ijoin(trans, dp, 0);
+ if (dp)
+ xfs_trans_ijoin(trans, dp, 0);
return 0;
}
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index b5bc1ab3c4da..3b21b4e5e467 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -133,8 +133,6 @@ typedef struct xfs_trans {
* XFS transaction mechanism exported interfaces that are
* actually macros.
*/
-#define xfs_trans_get_log_res(tp) ((tp)->t_log_res)
-#define xfs_trans_get_log_count(tp) ((tp)->t_log_count)
#define xfs_trans_get_block_res(tp) ((tp)->t_blk_res)
#define xfs_trans_set_sync(tp) ((tp)->t_flags |= XFS_TRANS_SYNC)
@@ -153,7 +151,6 @@ typedef struct xfs_trans {
*/
xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint);
xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint, xfs_km_flags_t);
-xfs_trans_t *xfs_trans_dup(xfs_trans_t *);
int xfs_trans_reserve(struct xfs_trans *, struct xfs_trans_res *,
uint, uint);
void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t);
@@ -228,9 +225,9 @@ void xfs_trans_log_efd_extent(xfs_trans_t *,
struct xfs_efd_log_item *,
xfs_fsblock_t,
xfs_extlen_t);
-int xfs_trans_commit(xfs_trans_t *, uint flags);
+int xfs_trans_commit(struct xfs_trans *);
int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
-void xfs_trans_cancel(xfs_trans_t *, int);
+void xfs_trans_cancel(xfs_trans_t *);
int xfs_trans_ail_init(struct xfs_mount *);
void xfs_trans_ail_destroy(struct xfs_mount *);
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 573aefb5a573..1098cf490189 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -159,7 +159,7 @@ xfs_trans_ail_cursor_next(
{
struct xfs_log_item *lip = cur->item;
- if ((__psint_t)lip & 1)
+ if ((uintptr_t)lip & 1)
lip = xfs_ail_min(ailp);
if (lip)
cur->item = xfs_ail_next(ailp, lip);
@@ -196,7 +196,7 @@ xfs_trans_ail_cursor_clear(
list_for_each_entry(cur, &ailp->xa_cursors, list) {
if (cur->item == lip)
cur->item = (struct xfs_log_item *)
- ((__psint_t)cur->item | 1);
+ ((uintptr_t)cur->item | 1);
}
}
@@ -287,7 +287,7 @@ xfs_ail_splice(
* find the place in the AIL where the items belong.
*/
lip = cur ? cur->item : NULL;
- if (!lip || (__psint_t) lip & 1)
+ if (!lip || (uintptr_t)lip & 1)
lip = __xfs_trans_ail_cursor_last(ailp, lsn);
/*
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 76a16df55ef7..ce78534a047e 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -90,8 +90,9 @@ xfs_trans_dup_dqinfo(
xfs_trans_t *ntp)
{
xfs_dqtrx_t *oq, *nq;
- int i,j;
+ int i, j;
xfs_dqtrx_t *oqa, *nqa;
+ ulong blk_res_used;
if (!otp->t_dqinfo)
return;
@@ -102,18 +103,23 @@ xfs_trans_dup_dqinfo(
* Because the quota blk reservation is carried forward,
* it is also necessary to carry forward the DQ_DIRTY flag.
*/
- if(otp->t_flags & XFS_TRANS_DQ_DIRTY)
+ if (otp->t_flags & XFS_TRANS_DQ_DIRTY)
ntp->t_flags |= XFS_TRANS_DQ_DIRTY;
for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) {
oqa = otp->t_dqinfo->dqs[j];
nqa = ntp->t_dqinfo->dqs[j];
for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
+ blk_res_used = 0;
+
if (oqa[i].qt_dquot == NULL)
break;
oq = &oqa[i];
nq = &nqa[i];
+ if (oq->qt_blk_res && oq->qt_bcount_delta > 0)
+ blk_res_used = oq->qt_bcount_delta;
+
nq->qt_dquot = oq->qt_dquot;
nq->qt_bcount_delta = nq->qt_icount_delta = 0;
nq->qt_rtbcount_delta = 0;
@@ -121,8 +127,8 @@ xfs_trans_dup_dqinfo(
/*
* Transfer whatever is left of the reservations.
*/
- nq->qt_blk_res = oq->qt_blk_res - oq->qt_blk_res_used;
- oq->qt_blk_res = oq->qt_blk_res_used;
+ nq->qt_blk_res = oq->qt_blk_res - blk_res_used;
+ oq->qt_blk_res = blk_res_used;
nq->qt_rtblk_res = oq->qt_rtblk_res -
oq->qt_rtblk_res_used;
@@ -239,10 +245,6 @@ xfs_trans_mod_dquot(
* disk blocks used.
*/
case XFS_TRANS_DQ_BCOUNT:
- if (qtrx->qt_blk_res && delta > 0) {
- qtrx->qt_blk_res_used += (ulong)delta;
- ASSERT(qtrx->qt_blk_res >= qtrx->qt_blk_res_used);
- }
qtrx->qt_bcount_delta += delta;
break;
@@ -423,15 +425,19 @@ xfs_trans_apply_dquot_deltas(
* reservation that a transaction structure knows of.
*/
if (qtrx->qt_blk_res != 0) {
- if (qtrx->qt_blk_res != qtrx->qt_blk_res_used) {
- if (qtrx->qt_blk_res >
- qtrx->qt_blk_res_used)
+ ulong blk_res_used = 0;
+
+ if (qtrx->qt_bcount_delta > 0)
+ blk_res_used = qtrx->qt_bcount_delta;
+
+ if (qtrx->qt_blk_res != blk_res_used) {
+ if (qtrx->qt_blk_res > blk_res_used)
dqp->q_res_bcount -= (xfs_qcnt_t)
(qtrx->qt_blk_res -
- qtrx->qt_blk_res_used);
+ blk_res_used);
else
dqp->q_res_bcount -= (xfs_qcnt_t)
- (qtrx->qt_blk_res_used -
+ (blk_res_used -
qtrx->qt_blk_res);
}
} else {
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index bd1281862ad7..1b736294558a 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -30,7 +30,7 @@ void xfs_trans_init(struct xfs_mount *);
void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
void xfs_trans_del_item(struct xfs_log_item *);
void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
- int flags);
+ bool abort);
void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
void xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv,
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index c187817471fb..1618cdfb38c7 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -261,8 +261,13 @@ extern void acpi_osi_setup(char *str);
extern bool acpi_osi_is_win8(void);
#ifdef CONFIG_ACPI_NUMA
+int acpi_map_pxm_to_online_node(int pxm);
int acpi_get_node(acpi_handle handle);
#else
+static inline int acpi_map_pxm_to_online_node(int pxm)
+{
+ return 0;
+}
static inline int acpi_get_node(acpi_handle handle)
{
return 0;
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 05be2352fef8..26fc8bc77f85 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -21,6 +21,7 @@
# define __rcu __attribute__((noderef, address_space(4)))
#else
# define __rcu
+# define __pmem __attribute__((noderef, address_space(5)))
#endif
extern void __chk_user_ptr(const volatile void __user *);
extern void __chk_io_ptr(const volatile void __iomem *);
@@ -42,6 +43,7 @@ extern void __chk_io_ptr(const volatile void __iomem *);
# define __cond_lock(x,c) (c)
# define __percpu
# define __rcu
+# define __pmem
#endif
/* Indirect macros required for expanded argument pasting, eg. __LINE__. */
diff --git a/include/linux/dma/pxa-dma.h b/include/linux/dma/pxa-dma.h
new file mode 100644
index 000000000000..3edc99294bf6
--- /dev/null
+++ b/include/linux/dma/pxa-dma.h
@@ -0,0 +1,27 @@
+#ifndef _PXA_DMA_H_
+#define _PXA_DMA_H_
+
+enum pxad_chan_prio {
+ PXAD_PRIO_HIGHEST = 0,
+ PXAD_PRIO_NORMAL,
+ PXAD_PRIO_LOW,
+ PXAD_PRIO_LOWEST,
+};
+
+struct pxad_param {
+ unsigned int drcmr;
+ enum pxad_chan_prio prio;
+};
+
+struct dma_chan;
+
+#ifdef CONFIG_PXA_DMA
+bool pxad_filter_fn(struct dma_chan *chan, void *param);
+#else
+static inline bool pxad_filter_fn(struct dma_chan *chan, void *param)
+{
+ return false;
+}
+#endif
+
+#endif /* _PXA_DMA_H_ */
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index ad419757241f..e2f5eb419976 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -65,6 +65,7 @@ enum dma_transaction_type {
DMA_PQ,
DMA_XOR_VAL,
DMA_PQ_VAL,
+ DMA_MEMSET,
DMA_INTERRUPT,
DMA_SG,
DMA_PRIVATE,
@@ -122,10 +123,18 @@ enum dma_transfer_direction {
* chunk and before first src/dst address for next chunk.
* Ignored for dst(assumed 0), if dst_inc is true and dst_sgl is false.
* Ignored for src(assumed 0), if src_inc is true and src_sgl is false.
+ * @dst_icg: Number of bytes to jump after last dst address of this
+ * chunk and before the first dst address for next chunk.
+ * Ignored if dst_inc is true and dst_sgl is false.
+ * @src_icg: Number of bytes to jump after last src address of this
+ * chunk and before the first src address for next chunk.
+ * Ignored if src_inc is true and src_sgl is false.
*/
struct data_chunk {
size_t size;
size_t icg;
+ size_t dst_icg;
+ size_t src_icg;
};
/**
@@ -222,6 +231,16 @@ struct dma_chan_percpu {
};
/**
+ * struct dma_router - DMA router structure
+ * @dev: pointer to the DMA router device
+ * @route_free: function to be called when the route can be disconnected
+ */
+struct dma_router {
+ struct device *dev;
+ void (*route_free)(struct device *dev, void *route_data);
+};
+
+/**
* struct dma_chan - devices supply DMA channels, clients use them
* @device: ptr to the dma device who supplies this channel, always !%NULL
* @cookie: last cookie value returned to client
@@ -232,6 +251,8 @@ struct dma_chan_percpu {
* @local: per-cpu pointer to a struct dma_chan_percpu
* @client_count: how many clients are using this channel
* @table_count: number of appearances in the mem-to-mem allocation table
+ * @router: pointer to the DMA router structure
+ * @route_data: channel specific data for the router
* @private: private data for certain client-channel associations
*/
struct dma_chan {
@@ -247,6 +268,11 @@ struct dma_chan {
struct dma_chan_percpu __percpu *local;
int client_count;
int table_count;
+
+ /* DMA router */
+ struct dma_router *router;
+ void *route_data;
+
void *private;
};
@@ -570,6 +596,7 @@ struct dma_tx_state {
* @copy_align: alignment shift for memcpy operations
* @xor_align: alignment shift for xor operations
* @pq_align: alignment shift for pq operations
+ * @fill_align: alignment shift for memset operations
* @dev_id: unique device ID
* @dev: struct device reference for dma mapping api
* @src_addr_widths: bit mask of src addr widths the device supports
@@ -588,6 +615,7 @@ struct dma_tx_state {
* @device_prep_dma_xor_val: prepares a xor validation operation
* @device_prep_dma_pq: prepares a pq operation
* @device_prep_dma_pq_val: prepares a pqzero_sum operation
+ * @device_prep_dma_memset: prepares a memset operation
* @device_prep_dma_interrupt: prepares an end of chain interrupt operation
* @device_prep_slave_sg: prepares a slave dma operation
* @device_prep_dma_cyclic: prepare a cyclic dma operation suitable for audio.
@@ -620,6 +648,7 @@ struct dma_device {
u8 copy_align;
u8 xor_align;
u8 pq_align;
+ u8 fill_align;
#define DMA_HAS_PQ_CONTINUE (1 << 15)
int dev_id;
@@ -650,6 +679,9 @@ struct dma_device {
struct dma_chan *chan, dma_addr_t *pq, dma_addr_t *src,
unsigned int src_cnt, const unsigned char *scf, size_t len,
enum sum_check_flags *pqres, unsigned long flags);
+ struct dma_async_tx_descriptor *(*device_prep_dma_memset)(
+ struct dma_chan *chan, dma_addr_t dest, int value, size_t len,
+ unsigned long flags);
struct dma_async_tx_descriptor *(*device_prep_dma_interrupt)(
struct dma_chan *chan, unsigned long flags);
struct dma_async_tx_descriptor *(*device_prep_dma_sg)(
@@ -745,6 +777,17 @@ static inline struct dma_async_tx_descriptor *dmaengine_prep_interleaved_dma(
return chan->device->device_prep_interleaved_dma(chan, xt, flags);
}
+static inline struct dma_async_tx_descriptor *dmaengine_prep_dma_memset(
+ struct dma_chan *chan, dma_addr_t dest, int value, size_t len,
+ unsigned long flags)
+{
+ if (!chan || !chan->device)
+ return NULL;
+
+ return chan->device->device_prep_dma_memset(chan, dest, value,
+ len, flags);
+}
+
static inline struct dma_async_tx_descriptor *dmaengine_prep_dma_sg(
struct dma_chan *chan,
struct scatterlist *dst_sg, unsigned int dst_nents,
@@ -820,6 +863,12 @@ static inline bool is_dma_pq_aligned(struct dma_device *dev, size_t off1,
return dmaengine_check_align(dev->pq_align, off1, off2, len);
}
+static inline bool is_dma_fill_aligned(struct dma_device *dev, size_t off1,
+ size_t off2, size_t len)
+{
+ return dmaengine_check_align(dev->fill_align, off1, off2, len);
+}
+
static inline void
dma_set_maxpq(struct dma_device *dma, int maxpq, int has_pq_continue)
{
@@ -874,6 +923,33 @@ static inline int dma_maxpq(struct dma_device *dma, enum dma_ctrl_flags flags)
BUG();
}
+static inline size_t dmaengine_get_icg(bool inc, bool sgl, size_t icg,
+ size_t dir_icg)
+{
+ if (inc) {
+ if (dir_icg)
+ return dir_icg;
+ else if (sgl)
+ return icg;
+ }
+
+ return 0;
+}
+
+static inline size_t dmaengine_get_dst_icg(struct dma_interleaved_template *xt,
+ struct data_chunk *chunk)
+{
+ return dmaengine_get_icg(xt->dst_inc, xt->dst_sgl,
+ chunk->icg, chunk->dst_icg);
+}
+
+static inline size_t dmaengine_get_src_icg(struct dma_interleaved_template *xt,
+ struct data_chunk *chunk)
+{
+ return dmaengine_get_icg(xt->src_inc, xt->src_sgl,
+ chunk->icg, chunk->src_icg);
+}
+
/* --- public DMA engine API --- */
#ifdef CONFIG_DMA_ENGINE
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 5f19efe4eb3f..85ef051ac6fb 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -85,7 +85,8 @@ typedef struct {
#define EFI_MEMORY_MAPPED_IO 11
#define EFI_MEMORY_MAPPED_IO_PORT_SPACE 12
#define EFI_PAL_CODE 13
-#define EFI_MAX_MEMORY_TYPE 14
+#define EFI_PERSISTENT_MEMORY 14
+#define EFI_MAX_MEMORY_TYPE 15
/* Attribute values: */
#define EFI_MEMORY_UC ((u64)0x0000000000000001ULL) /* uncached */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e351da4a934f..3f1a84635da8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -70,6 +70,7 @@ typedef int (get_block_t)(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
ssize_t bytes, void *private);
+typedef void (dax_iodone_t)(struct buffer_head *bh_map, int uptodate);
#define MAY_EXEC 0x00000001
#define MAY_WRITE 0x00000002
@@ -2655,9 +2656,13 @@ ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, loff_t,
int dax_clear_blocks(struct inode *, sector_t block, long size);
int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
int dax_truncate_page(struct inode *, loff_t from, get_block_t);
-int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
+int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
+ dax_iodone_t);
+int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
+ dax_iodone_t);
int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
-#define dax_mkwrite(vma, vmf, gb) dax_fault(vma, vmf, gb)
+#define dax_mkwrite(vma, vmf, gb, iod) dax_fault(vma, vmf, gb, iod)
+#define __dax_mkwrite(vma, vmf, gb, iod) __dax_fault(vma, vmf, gb, iod)
#ifdef CONFIG_BLOCK
typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode,
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
new file mode 100644
index 000000000000..75e3af01ee32
--- /dev/null
+++ b/include/linux/libnvdimm.h
@@ -0,0 +1,151 @@
+/*
+ * libnvdimm - Non-volatile-memory Devices Subsystem
+ *
+ * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#ifndef __LIBNVDIMM_H__
+#define __LIBNVDIMM_H__
+#include <linux/kernel.h>
+#include <linux/sizes.h>
+#include <linux/types.h>
+
+enum {
+ /* when a dimm supports both PMEM and BLK access a label is required */
+ NDD_ALIASING = 1 << 0,
+ /* unarmed memory devices may not persist writes */
+ NDD_UNARMED = 1 << 1,
+
+ /* need to set a limit somewhere, but yes, this is likely overkill */
+ ND_IOCTL_MAX_BUFLEN = SZ_4M,
+ ND_CMD_MAX_ELEM = 4,
+ ND_CMD_MAX_ENVELOPE = 16,
+ ND_CMD_ARS_STATUS_MAX = SZ_4K,
+ ND_MAX_MAPPINGS = 32,
+
+ /* mark newly adjusted resources as requiring a label update */
+ DPA_RESOURCE_ADJUSTED = 1 << 0,
+};
+
+extern struct attribute_group nvdimm_bus_attribute_group;
+extern struct attribute_group nvdimm_attribute_group;
+extern struct attribute_group nd_device_attribute_group;
+extern struct attribute_group nd_numa_attribute_group;
+extern struct attribute_group nd_region_attribute_group;
+extern struct attribute_group nd_mapping_attribute_group;
+
+struct nvdimm;
+struct nvdimm_bus_descriptor;
+typedef int (*ndctl_fn)(struct nvdimm_bus_descriptor *nd_desc,
+ struct nvdimm *nvdimm, unsigned int cmd, void *buf,
+ unsigned int buf_len);
+
+struct nd_namespace_label;
+struct nvdimm_drvdata;
+struct nd_mapping {
+ struct nvdimm *nvdimm;
+ struct nd_namespace_label **labels;
+ u64 start;
+ u64 size;
+ /*
+ * @ndd is for private use at region enable / disable time for
+ * get_ndd() + put_ndd(), all other nd_mapping to ndd
+ * conversions use to_ndd() which respects enabled state of the
+ * nvdimm.
+ */
+ struct nvdimm_drvdata *ndd;
+};
+
+struct nvdimm_bus_descriptor {
+ const struct attribute_group **attr_groups;
+ unsigned long dsm_mask;
+ char *provider_name;
+ ndctl_fn ndctl;
+};
+
+struct nd_cmd_desc {
+ int in_num;
+ int out_num;
+ u32 in_sizes[ND_CMD_MAX_ELEM];
+ int out_sizes[ND_CMD_MAX_ELEM];
+};
+
+struct nd_interleave_set {
+ u64 cookie;
+};
+
+struct nd_region_desc {
+ struct resource *res;
+ struct nd_mapping *nd_mapping;
+ u16 num_mappings;
+ const struct attribute_group **attr_groups;
+ struct nd_interleave_set *nd_set;
+ void *provider_data;
+ int num_lanes;
+ int numa_node;
+};
+
+struct nvdimm_bus;
+struct module;
+struct device;
+struct nd_blk_region;
+struct nd_blk_region_desc {
+ int (*enable)(struct nvdimm_bus *nvdimm_bus, struct device *dev);
+ void (*disable)(struct nvdimm_bus *nvdimm_bus, struct device *dev);
+ int (*do_io)(struct nd_blk_region *ndbr, resource_size_t dpa,
+ void *iobuf, u64 len, int rw);
+ struct nd_region_desc ndr_desc;
+};
+
+static inline struct nd_blk_region_desc *to_blk_region_desc(
+ struct nd_region_desc *ndr_desc)
+{
+ return container_of(ndr_desc, struct nd_blk_region_desc, ndr_desc);
+
+}
+
+struct nvdimm_bus *__nvdimm_bus_register(struct device *parent,
+ struct nvdimm_bus_descriptor *nfit_desc, struct module *module);
+#define nvdimm_bus_register(parent, desc) \
+ __nvdimm_bus_register(parent, desc, THIS_MODULE)
+void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus);
+struct nvdimm_bus *to_nvdimm_bus(struct device *dev);
+struct nvdimm *to_nvdimm(struct device *dev);
+struct nd_region *to_nd_region(struct device *dev);
+struct nd_blk_region *to_nd_blk_region(struct device *dev);
+struct nvdimm_bus_descriptor *to_nd_desc(struct nvdimm_bus *nvdimm_bus);
+const char *nvdimm_name(struct nvdimm *nvdimm);
+void *nvdimm_provider_data(struct nvdimm *nvdimm);
+struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data,
+ const struct attribute_group **groups, unsigned long flags,
+ unsigned long *dsm_mask);
+const struct nd_cmd_desc *nd_cmd_dimm_desc(int cmd);
+const struct nd_cmd_desc *nd_cmd_bus_desc(int cmd);
+u32 nd_cmd_in_size(struct nvdimm *nvdimm, int cmd,
+ const struct nd_cmd_desc *desc, int idx, void *buf);
+u32 nd_cmd_out_size(struct nvdimm *nvdimm, int cmd,
+ const struct nd_cmd_desc *desc, int idx, const u32 *in_field,
+ const u32 *out_field);
+int nvdimm_bus_check_dimm_count(struct nvdimm_bus *nvdimm_bus, int dimm_count);
+struct nd_region *nvdimm_pmem_region_create(struct nvdimm_bus *nvdimm_bus,
+ struct nd_region_desc *ndr_desc);
+struct nd_region *nvdimm_blk_region_create(struct nvdimm_bus *nvdimm_bus,
+ struct nd_region_desc *ndr_desc);
+struct nd_region *nvdimm_volatile_region_create(struct nvdimm_bus *nvdimm_bus,
+ struct nd_region_desc *ndr_desc);
+void *nd_region_provider_data(struct nd_region *nd_region);
+void *nd_blk_region_provider_data(struct nd_blk_region *ndbr);
+void nd_blk_region_set_provider_data(struct nd_blk_region *ndbr, void *data);
+struct nvdimm *nd_blk_region_to_dimm(struct nd_blk_region *ndbr);
+unsigned int nd_region_acquire_lane(struct nd_region *nd_region);
+void nd_region_release_lane(struct nd_region *nd_region, unsigned int lane);
+u64 nd_fletcher64(void *addr, size_t len, bool le);
+#endif /* __LIBNVDIMM_H__ */
diff --git a/include/linux/nd.h b/include/linux/nd.h
new file mode 100644
index 000000000000..507e47c86737
--- /dev/null
+++ b/include/linux/nd.h
@@ -0,0 +1,151 @@
+/*
+ * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#ifndef __LINUX_ND_H__
+#define __LINUX_ND_H__
+#include <linux/fs.h>
+#include <linux/ndctl.h>
+#include <linux/device.h>
+
+struct nd_device_driver {
+ struct device_driver drv;
+ unsigned long type;
+ int (*probe)(struct device *dev);
+ int (*remove)(struct device *dev);
+};
+
+static inline struct nd_device_driver *to_nd_device_driver(
+ struct device_driver *drv)
+{
+ return container_of(drv, struct nd_device_driver, drv);
+};
+
+/**
+ * struct nd_namespace_common - core infrastructure of a namespace
+ * @force_raw: ignore other personalities for the namespace (e.g. btt)
+ * @dev: device model node
+ * @claim: when set, another personality has taken ownership of the namespace
+ * @rw_bytes: access the raw namespace capacity with byte-aligned transfers
+ */
+struct nd_namespace_common {
+ int force_raw;
+ struct device dev;
+ struct device *claim;
+ int (*rw_bytes)(struct nd_namespace_common *, resource_size_t offset,
+ void *buf, size_t size, int rw);
+};
+
+static inline struct nd_namespace_common *to_ndns(struct device *dev)
+{
+ return container_of(dev, struct nd_namespace_common, dev);
+}
+
+/**
+ * struct nd_namespace_io - infrastructure for loading an nd_pmem instance
+ * @dev: namespace device created by the nd region driver
+ * @res: struct resource conversion of a NFIT SPA table
+ */
+struct nd_namespace_io {
+ struct nd_namespace_common common;
+ struct resource res;
+};
+
+/**
+ * struct nd_namespace_pmem - namespace device for dimm-backed interleaved memory
+ * @nsio: device and system physical address range to drive
+ * @alt_name: namespace name supplied in the dimm label
+ * @uuid: namespace name supplied in the dimm label
+ */
+struct nd_namespace_pmem {
+ struct nd_namespace_io nsio;
+ char *alt_name;
+ u8 *uuid;
+};
+
+/**
+ * struct nd_namespace_blk - namespace for dimm-bounded persistent memory
+ * @alt_name: namespace name supplied in the dimm label
+ * @uuid: namespace name supplied in the dimm label
+ * @id: ida allocated id
+ * @lbasize: blk namespaces have a native sector size when btt not present
+ * @num_resources: number of dpa extents to claim
+ * @res: discontiguous dpa extents for given dimm
+ */
+struct nd_namespace_blk {
+ struct nd_namespace_common common;
+ char *alt_name;
+ u8 *uuid;
+ int id;
+ unsigned long lbasize;
+ int num_resources;
+ struct resource **res;
+};
+
+static inline struct nd_namespace_io *to_nd_namespace_io(struct device *dev)
+{
+ return container_of(dev, struct nd_namespace_io, common.dev);
+}
+
+static inline struct nd_namespace_pmem *to_nd_namespace_pmem(struct device *dev)
+{
+ struct nd_namespace_io *nsio = to_nd_namespace_io(dev);
+
+ return container_of(nsio, struct nd_namespace_pmem, nsio);
+}
+
+static inline struct nd_namespace_blk *to_nd_namespace_blk(struct device *dev)
+{
+ return container_of(dev, struct nd_namespace_blk, common.dev);
+}
+
+/**
+ * nvdimm_read_bytes() - synchronously read bytes from an nvdimm namespace
+ * @ndns: device to read
+ * @offset: namespace-relative starting offset
+ * @buf: buffer to fill
+ * @size: transfer length
+ *
+ * @buf is up-to-date upon return from this routine.
+ */
+static inline int nvdimm_read_bytes(struct nd_namespace_common *ndns,
+ resource_size_t offset, void *buf, size_t size)
+{
+ return ndns->rw_bytes(ndns, offset, buf, size, READ);
+}
+
+/**
+ * nvdimm_write_bytes() - synchronously write bytes to an nvdimm namespace
+ * @ndns: device to write
+ * @offset: namespace-relative starting offset
+ * @buf: buffer to drain
+ * @size: transfer length
+ *
+ * NVDIMM Namespace disks do not implement sectors internally. Depending on
+ * the @ndns, the contents of @buf may be in cpu cache, platform buffers,
+ * or on backing memory media upon return from this routine. Flushing
+ * to media is handled internal to the @ndns driver, if at all.
+ */
+static inline int nvdimm_write_bytes(struct nd_namespace_common *ndns,
+ resource_size_t offset, void *buf, size_t size)
+{
+ return ndns->rw_bytes(ndns, offset, buf, size, WRITE);
+}
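+
+/*
+ * Illustrative sketch only (the buffer name and size are assumptions, not
+ * part of this interface): a personality driver that has claimed @ndns
+ * might read an info block from the start of the namespace, update it,
+ * and write it back:
+ *
+ *	u8 *info = kzalloc(SZ_4K, GFP_KERNEL);
+ *	int rc = nvdimm_read_bytes(ndns, 0, info, SZ_4K);
+ *
+ *	if (rc == 0) {
+ *		... modify info ...
+ *		rc = nvdimm_write_bytes(ndns, 0, info, SZ_4K);
+ *	}
+ */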
+
+#define MODULE_ALIAS_ND_DEVICE(type) \
+ MODULE_ALIAS("nd:t" __stringify(type) "*")
+#define ND_DEVICE_MODALIAS_FMT "nd:t%d"
+
+int __must_check __nd_driver_register(struct nd_device_driver *nd_drv,
+ struct module *module, const char *mod_name);
+#define nd_driver_register(driver) \
+ __nd_driver_register(driver, THIS_MODULE, KBUILD_MODNAME)
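+
+/*
+ * Illustrative sketch only (all "example" names are hypothetical): a
+ * namespace personality driver is typically wired up as follows:
+ *
+ *	static struct nd_device_driver example_driver = {
+ *		.probe = example_probe,
+ *		.remove = example_remove,
+ *		.drv = {
+ *			.name = "example",
+ *		},
+ *		.type = ND_DRIVER_NAMESPACE_IO,
+ *	};
+ *
+ *	static int __init example_init(void)
+ *	{
+ *		return nd_driver_register(&example_driver);
+ *	}
+ *	MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_IO);
+ */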
+#endif /* __LINUX_ND_H__ */
diff --git a/include/linux/of_dma.h b/include/linux/of_dma.h
index 56bc026c143f..98ba7525929e 100644
--- a/include/linux/of_dma.h
+++ b/include/linux/of_dma.h
@@ -23,6 +23,9 @@ struct of_dma {
struct device_node *of_node;
struct dma_chan *(*of_dma_xlate)
(struct of_phandle_args *, struct of_dma *);
+ void *(*of_dma_route_allocate)
+ (struct of_phandle_args *, struct of_dma *);
+ struct dma_router *dma_router;
void *of_dma_data;
};
@@ -37,12 +40,20 @@ extern int of_dma_controller_register(struct device_node *np,
(struct of_phandle_args *, struct of_dma *),
void *data);
extern void of_dma_controller_free(struct device_node *np);
+
+extern int of_dma_router_register(struct device_node *np,
+ void *(*of_dma_route_allocate)
+ (struct of_phandle_args *, struct of_dma *),
+ struct dma_router *dma_router);
+#define of_dma_router_free of_dma_controller_free
+
extern struct dma_chan *of_dma_request_slave_channel(struct device_node *np,
const char *name);
extern struct dma_chan *of_dma_simple_xlate(struct of_phandle_args *dma_spec,
struct of_dma *ofdma);
extern struct dma_chan *of_dma_xlate_by_chan_id(struct of_phandle_args *dma_spec,
struct of_dma *ofdma);
+
#else
static inline int of_dma_controller_register(struct device_node *np,
struct dma_chan *(*of_dma_xlate)
@@ -56,6 +67,16 @@ static inline void of_dma_controller_free(struct device_node *np)
{
}
+static inline int of_dma_router_register(struct device_node *np,
+ void *(*of_dma_route_allocate)
+ (struct of_phandle_args *, struct of_dma *),
+ struct dma_router *dma_router)
+{
+ return -ENODEV;
+}
+
+#define of_dma_router_free of_dma_controller_free
+
static inline struct dma_chan *of_dma_request_slave_channel(struct device_node *np,
const char *name)
{
diff --git a/include/linux/platform_data/dma-rcar-audmapp.h b/include/linux/platform_data/dma-rcar-audmapp.h
deleted file mode 100644
index 471fffebbeb4..000000000000
--- a/include/linux/platform_data/dma-rcar-audmapp.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * This is for Renesas R-Car Audio-DMAC-peri-peri.
- *
- * Copyright (C) 2014 Renesas Electronics Corporation
- * Copyright (C) 2014 Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
- *
- * This file is based on the include/linux/sh_dma.h
- *
- * Header for the new SH dmaengine driver
- *
- * Copyright (C) 2010 Guennadi Liakhovetski <g.liakhovetski@gmx.de>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef SH_AUDMAPP_H
-#define SH_AUDMAPP_H
-
-#include <linux/dmaengine.h>
-
-struct audmapp_slave_config {
- int slave_id;
- dma_addr_t src;
- dma_addr_t dst;
- u32 chcr;
-};
-
-struct audmapp_pdata {
- struct audmapp_slave_config *slave;
- int slave_num;
-};
-
-#endif /* SH_AUDMAPP_H */
diff --git a/include/linux/pmem.h b/include/linux/pmem.h
new file mode 100644
index 000000000000..d2114045a6c4
--- /dev/null
+++ b/include/linux/pmem.h
@@ -0,0 +1,152 @@
+/*
+ * Copyright(c) 2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#ifndef __PMEM_H__
+#define __PMEM_H__
+
+#include <linux/io.h>
+
+#ifdef CONFIG_ARCH_HAS_PMEM_API
+#include <asm/cacheflush.h>
+#else
+static inline void arch_wmb_pmem(void)
+{
+ BUG();
+}
+
+static inline bool __arch_has_wmb_pmem(void)
+{
+ return false;
+}
+
+static inline void __pmem *arch_memremap_pmem(resource_size_t offset,
+ unsigned long size)
+{
+ return NULL;
+}
+
+static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src,
+ size_t n)
+{
+ BUG();
+}
+#endif
+
+/*
+ * Architectures that define ARCH_HAS_PMEM_API must provide
+ * implementations for arch_memremap_pmem(), arch_memcpy_to_pmem(),
+ * arch_wmb_pmem(), and __arch_has_wmb_pmem().
+ */
+
+static inline void memcpy_from_pmem(void *dst, void __pmem const *src, size_t size)
+{
+ memcpy(dst, (void __force const *) src, size);
+}
+
+static inline void memunmap_pmem(void __pmem *addr)
+{
+ iounmap((void __force __iomem *) addr);
+}
+
+/**
+ * arch_has_wmb_pmem - true if wmb_pmem() ensures durability
+ *
+ * For a given cpu implementation within an architecture it is possible
+ * that wmb_pmem() resolves to a nop. If this routine returns false,
+ * pmem api users are unable to ensure durability and may want to
+ * fall back to a different data consistency model, or otherwise notify
+ * the user.
+ */
+static inline bool arch_has_wmb_pmem(void)
+{
+ if (IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API))
+ return __arch_has_wmb_pmem();
+ return false;
+}
+
+static inline bool arch_has_pmem_api(void)
+{
+ return IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) && arch_has_wmb_pmem();
+}
+
+/*
+ * These defaults seek to offer decent performance and minimize the
+ * window between i/o completion and writes being durable on media.
+ * However, it is undefined / architecture specific whether
+ * default_memremap_pmem + default_memcpy_to_pmem is sufficient for
+ * making data durable relative to i/o completion.
+ */
+static void default_memcpy_to_pmem(void __pmem *dst, const void *src,
+ size_t size)
+{
+ memcpy((void __force *) dst, src, size);
+}
+
+static void __pmem *default_memremap_pmem(resource_size_t offset,
+ unsigned long size)
+{
+ return (void __pmem __force *)ioremap_wt(offset, size);
+}
+
+/**
+ * memremap_pmem - map physical persistent memory for pmem api
+ * @offset: physical address of persistent memory
+ * @size: size of the mapping
+ *
+ * Establish a mapping of the architecture specific memory type expected
+ * by memcpy_to_pmem() and wmb_pmem(). For example, it may be
+ * the case that an uncacheable or writethrough mapping is sufficient,
+ * or a writeback mapping provided memcpy_to_pmem() and
+ * wmb_pmem() arrange for the data to be written through the
+ * cache to persistent media.
+ */
+static inline void __pmem *memremap_pmem(resource_size_t offset,
+ unsigned long size)
+{
+ if (arch_has_pmem_api())
+ return arch_memremap_pmem(offset, size);
+ return default_memremap_pmem(offset, size);
+}
+
+/**
+ * memcpy_to_pmem - copy data to persistent memory
+ * @dst: destination buffer for the copy
+ * @src: source buffer for the copy
+ * @n: length of the copy in bytes
+ *
+ * Perform a memory copy that results in the destination of the copy
+ * being effectively evicted from, or never written to, the processor
+ * cache hierarchy after the copy completes. After memcpy_to_pmem()
+ * data may still reside in cpu or platform buffers, so this operation
+ * must be followed by a wmb_pmem().
+ */
+static inline void memcpy_to_pmem(void __pmem *dst, const void *src, size_t n)
+{
+ if (arch_has_pmem_api())
+ arch_memcpy_to_pmem(dst, src, n);
+ else
+ default_memcpy_to_pmem(dst, src, n);
+}
+
+/**
+ * wmb_pmem - synchronize writes to persistent memory
+ *
+ * After a series of memcpy_to_pmem() operations this drains data from
+ * cpu write buffers and any platform (memory controller) buffers to
+ * ensure that written data is durable on persistent memory media.
+ */
+static inline void wmb_pmem(void)
+{
+ if (arch_has_pmem_api())
+ arch_wmb_pmem();
+}
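+
+/*
+ * Illustrative sketch only (dev, res, off, src, and len are assumptions): a
+ * pmem consumer maps the range once, copies with memcpy_to_pmem(), and
+ * fences with wmb_pmem() before declaring the data durable:
+ *
+ *	void __pmem *virt = memremap_pmem(res->start, resource_size(res));
+ *
+ *	if (!arch_has_wmb_pmem())
+ *		dev_warn(dev, "unable to guarantee persistence of writes\n");
+ *	memcpy_to_pmem(virt + off, src, len);
+ *	wmb_pmem();
+ *	...
+ *	memunmap_pmem(virt);
+ */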
+#endif /* __PMEM_H__ */
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 9de2fdc8b5e4..a99f0e5243e1 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -153,30 +153,8 @@ size_t ksize(const void *);
#define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN
#define KMALLOC_MIN_SIZE ARCH_DMA_MINALIGN
#define KMALLOC_SHIFT_LOW ilog2(ARCH_DMA_MINALIGN)
-/*
- * The KMALLOC_LOOP_LOW is the definition for the for loop index start number
- * to create the kmalloc_caches object in create_kmalloc_caches(). The first
- * and the second are 96 and 192. You can see that in the kmalloc_index(), if
- * the KMALLOC_MIN_SIZE <= 32, then return 1 (96). If KMALLOC_MIN_SIZE <= 64,
- * then return 2 (192). If the KMALLOC_MIN_SIZE is bigger than 64, we don't
- * need to initialize 96 and 192. Go directly to start the KMALLOC_SHIFT_LOW.
- */
-#if KMALLOC_MIN_SIZE <= 32
-#define KMALLOC_LOOP_LOW 1
-#elif KMALLOC_MIN_SIZE <= 64
-#define KMALLOC_LOOP_LOW 2
-#else
-#define KMALLOC_LOOP_LOW KMALLOC_SHIFT_LOW
-#endif
-
#else
#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
-/*
- * The KMALLOC_MIN_SIZE of slub/slab/slob is 2^3/2^5/2^3. So, even slab is used.
- * The KMALLOC_MIN_SIZE <= 32. The kmalloc-96 and kmalloc-192 should also be
- * initialized.
- */
-#define KMALLOC_LOOP_LOW 1
#endif
/*
diff --git a/include/linux/wait.h b/include/linux/wait.h
index d69ac4ecc88b..1e1bf9f963a9 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -358,6 +358,19 @@ do { \
__ret; \
})
+#define __wait_event_exclusive_cmd(wq, condition, cmd1, cmd2) \
+ (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 1, 0, \
+ cmd1; schedule(); cmd2)
+/*
+ * Just like wait_event_cmd(), except it sets the exclusive flag
+ */
+#define wait_event_exclusive_cmd(wq, condition, cmd1, cmd2) \
+do { \
+ if (condition) \
+ break; \
+ __wait_event_exclusive_cmd(wq, condition, cmd1, cmd2); \
+} while (0)
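+/*
+ * Illustrative sketch only (the conf lock and barrier condition are
+ * hypothetical): drop a spinlock around the sleep and queue as an
+ * exclusive waiter so that a wake_up() rouses only one task:
+ *
+ *	wait_event_exclusive_cmd(conf->wait_barrier, !conf->barrier,
+ *				 spin_unlock_irq(&conf->lock),
+ *				 spin_lock_irq(&conf->lock));
+ */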
+
#define __wait_event_cmd(wq, condition, cmd1, cmd2) \
(void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0, \
cmd1; schedule(); cmd2)
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 7f79cf459591..0b73af9be12f 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -1117,61 +1117,6 @@ DEFINE_EVENT(btrfs__workqueue_done, btrfs_workqueue_destroy,
TP_ARGS(wq)
);
-#define show_oper_type(type) \
- __print_symbolic(type, \
- { BTRFS_QGROUP_OPER_ADD_EXCL, "OPER_ADD_EXCL" }, \
- { BTRFS_QGROUP_OPER_ADD_SHARED, "OPER_ADD_SHARED" }, \
- { BTRFS_QGROUP_OPER_SUB_EXCL, "OPER_SUB_EXCL" }, \
- { BTRFS_QGROUP_OPER_SUB_SHARED, "OPER_SUB_SHARED" })
-
-DECLARE_EVENT_CLASS(btrfs_qgroup_oper,
-
- TP_PROTO(struct btrfs_qgroup_operation *oper),
-
- TP_ARGS(oper),
-
- TP_STRUCT__entry(
- __field( u64, ref_root )
- __field( u64, bytenr )
- __field( u64, num_bytes )
- __field( u64, seq )
- __field( int, type )
- __field( u64, elem_seq )
- ),
-
- TP_fast_assign(
- __entry->ref_root = oper->ref_root;
- __entry->bytenr = oper->bytenr,
- __entry->num_bytes = oper->num_bytes;
- __entry->seq = oper->seq;
- __entry->type = oper->type;
- __entry->elem_seq = oper->elem.seq;
- ),
-
- TP_printk("ref_root = %llu, bytenr = %llu, num_bytes = %llu, "
- "seq = %llu, elem.seq = %llu, type = %s",
- (unsigned long long)__entry->ref_root,
- (unsigned long long)__entry->bytenr,
- (unsigned long long)__entry->num_bytes,
- (unsigned long long)__entry->seq,
- (unsigned long long)__entry->elem_seq,
- show_oper_type(__entry->type))
-);
-
-DEFINE_EVENT(btrfs_qgroup_oper, btrfs_qgroup_account,
-
- TP_PROTO(struct btrfs_qgroup_operation *oper),
-
- TP_ARGS(oper)
-);
-
-DEFINE_EVENT(btrfs_qgroup_oper, btrfs_qgroup_record_ref,
-
- TP_PROTO(struct btrfs_qgroup_operation *oper),
-
- TP_ARGS(oper)
-);
-
#endif /* _TRACE_BTRFS_H */
/* This part must be outside protection */
diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index c1c23f19d4a2..1ff9942718fe 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -272,6 +272,7 @@ header-y += ncp_fs.h
header-y += ncp.h
header-y += ncp_mount.h
header-y += ncp_no.h
+header-y += ndctl.h
header-y += neighbour.h
header-y += netconf.h
header-y += netdevice.h
diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h
new file mode 100644
index 000000000000..2b94ea2287bb
--- /dev/null
+++ b/include/uapi/linux/ndctl.h
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) 2014-2015, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU Lesser General Public License,
+ * version 2.1, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for
+ * more details.
+ */
+#ifndef __NDCTL_H__
+#define __NDCTL_H__
+
+#include <linux/types.h>
+
+struct nd_cmd_smart {
+ __u32 status;
+ __u8 data[128];
+} __packed;
+
+struct nd_cmd_smart_threshold {
+ __u32 status;
+ __u8 data[8];
+} __packed;
+
+struct nd_cmd_dimm_flags {
+ __u32 status;
+ __u32 flags;
+} __packed;
+
+struct nd_cmd_get_config_size {
+ __u32 status;
+ __u32 config_size;
+ __u32 max_xfer;
+} __packed;
+
+struct nd_cmd_get_config_data_hdr {
+ __u32 in_offset;
+ __u32 in_length;
+ __u32 status;
+ __u8 out_buf[0];
+} __packed;
+
+struct nd_cmd_set_config_hdr {
+ __u32 in_offset;
+ __u32 in_length;
+ __u8 in_buf[0];
+} __packed;
+
+struct nd_cmd_vendor_hdr {
+ __u32 opcode;
+ __u32 in_length;
+ __u8 in_buf[0];
+} __packed;
+
+struct nd_cmd_vendor_tail {
+ __u32 status;
+ __u32 out_length;
+ __u8 out_buf[0];
+} __packed;
+
+struct nd_cmd_ars_cap {
+ __u64 address;
+ __u64 length;
+ __u32 status;
+ __u32 max_ars_out;
+} __packed;
+
+struct nd_cmd_ars_start {
+ __u64 address;
+ __u64 length;
+ __u16 type;
+ __u8 reserved[6];
+ __u32 status;
+} __packed;
+
+struct nd_cmd_ars_status {
+ __u32 status;
+ __u32 out_length;
+ __u64 address;
+ __u64 length;
+ __u16 type;
+ __u32 num_records;
+ struct nd_ars_record {
+ __u32 handle;
+ __u32 flags;
+ __u64 err_address;
+ __u64 mask;
+ } __packed records[0];
+} __packed;
+
+enum {
+ ND_CMD_IMPLEMENTED = 0,
+
+ /* bus commands */
+ ND_CMD_ARS_CAP = 1,
+ ND_CMD_ARS_START = 2,
+ ND_CMD_ARS_STATUS = 3,
+
+ /* per-dimm commands */
+ ND_CMD_SMART = 1,
+ ND_CMD_SMART_THRESHOLD = 2,
+ ND_CMD_DIMM_FLAGS = 3,
+ ND_CMD_GET_CONFIG_SIZE = 4,
+ ND_CMD_GET_CONFIG_DATA = 5,
+ ND_CMD_SET_CONFIG_DATA = 6,
+ ND_CMD_VENDOR_EFFECT_LOG_SIZE = 7,
+ ND_CMD_VENDOR_EFFECT_LOG = 8,
+ ND_CMD_VENDOR = 9,
+};
+
+static inline const char *nvdimm_bus_cmd_name(unsigned cmd)
+{
+ static const char * const names[] = {
+ [ND_CMD_ARS_CAP] = "ars_cap",
+ [ND_CMD_ARS_START] = "ars_start",
+ [ND_CMD_ARS_STATUS] = "ars_status",
+ };
+
+ if (cmd < ARRAY_SIZE(names) && names[cmd])
+ return names[cmd];
+ return "unknown";
+}
+
+static inline const char *nvdimm_cmd_name(unsigned cmd)
+{
+ static const char * const names[] = {
+ [ND_CMD_SMART] = "smart",
+ [ND_CMD_SMART_THRESHOLD] = "smart_thresh",
+ [ND_CMD_DIMM_FLAGS] = "flags",
+ [ND_CMD_GET_CONFIG_SIZE] = "get_size",
+ [ND_CMD_GET_CONFIG_DATA] = "get_data",
+ [ND_CMD_SET_CONFIG_DATA] = "set_data",
+ [ND_CMD_VENDOR_EFFECT_LOG_SIZE] = "effect_size",
+ [ND_CMD_VENDOR_EFFECT_LOG] = "effect_log",
+ [ND_CMD_VENDOR] = "vendor",
+ };
+
+ if (cmd < ARRAY_SIZE(names) && names[cmd])
+ return names[cmd];
+ return "unknown";
+}
+
+#define ND_IOCTL 'N'
+
+#define ND_IOCTL_SMART _IOWR(ND_IOCTL, ND_CMD_SMART,\
+ struct nd_cmd_smart)
+
+#define ND_IOCTL_SMART_THRESHOLD _IOWR(ND_IOCTL, ND_CMD_SMART_THRESHOLD,\
+ struct nd_cmd_smart_threshold)
+
+#define ND_IOCTL_DIMM_FLAGS _IOWR(ND_IOCTL, ND_CMD_DIMM_FLAGS,\
+ struct nd_cmd_dimm_flags)
+
+#define ND_IOCTL_GET_CONFIG_SIZE _IOWR(ND_IOCTL, ND_CMD_GET_CONFIG_SIZE,\
+ struct nd_cmd_get_config_size)
+
+#define ND_IOCTL_GET_CONFIG_DATA _IOWR(ND_IOCTL, ND_CMD_GET_CONFIG_DATA,\
+ struct nd_cmd_get_config_data_hdr)
+
+#define ND_IOCTL_SET_CONFIG_DATA _IOWR(ND_IOCTL, ND_CMD_SET_CONFIG_DATA,\
+ struct nd_cmd_set_config_hdr)
+
+#define ND_IOCTL_VENDOR _IOWR(ND_IOCTL, ND_CMD_VENDOR,\
+ struct nd_cmd_vendor_hdr)
+
+#define ND_IOCTL_ARS_CAP _IOWR(ND_IOCTL, ND_CMD_ARS_CAP,\
+ struct nd_cmd_ars_cap)
+
+#define ND_IOCTL_ARS_START _IOWR(ND_IOCTL, ND_CMD_ARS_START,\
+ struct nd_cmd_ars_start)
+
+#define ND_IOCTL_ARS_STATUS _IOWR(ND_IOCTL, ND_CMD_ARS_STATUS,\
+ struct nd_cmd_ars_status)
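+
+/*
+ * Illustrative sketch only (the device node path is an assumption): from
+ * userspace the per-dimm commands are issued with ioctl(2) against the
+ * dimm character device, e.g. to query the label area size:
+ *
+ *	struct nd_cmd_get_config_size cmd = { 0 };
+ *	int fd = open("/dev/nmem0", O_RDWR);
+ *
+ *	if (fd >= 0 && ioctl(fd, ND_IOCTL_GET_CONFIG_SIZE, &cmd) == 0)
+ *		printf("config_size: %u max_xfer: %u\n",
+ *				cmd.config_size, cmd.max_xfer);
+ */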
+
+#define ND_DEVICE_DIMM 1 /* nd_dimm: container for "config data" */
+#define ND_DEVICE_REGION_PMEM 2 /* nd_region: (parent of PMEM namespaces) */
+#define ND_DEVICE_REGION_BLK 3 /* nd_region: (parent of BLK namespaces) */
+#define ND_DEVICE_NAMESPACE_IO 4 /* legacy persistent memory */
+#define ND_DEVICE_NAMESPACE_PMEM 5 /* PMEM namespace (may alias with BLK) */
+#define ND_DEVICE_NAMESPACE_BLK 6 /* BLK namespace (may alias with PMEM) */
+
+enum nd_driver_flags {
+ ND_DRIVER_DIMM = 1 << ND_DEVICE_DIMM,
+ ND_DRIVER_REGION_PMEM = 1 << ND_DEVICE_REGION_PMEM,
+ ND_DRIVER_REGION_BLK = 1 << ND_DEVICE_REGION_BLK,
+ ND_DRIVER_NAMESPACE_IO = 1 << ND_DEVICE_NAMESPACE_IO,
+ ND_DRIVER_NAMESPACE_PMEM = 1 << ND_DEVICE_NAMESPACE_PMEM,
+ ND_DRIVER_NAMESPACE_BLK = 1 << ND_DEVICE_NAMESPACE_BLK,
+};
+
+enum {
+ ND_MIN_NAMESPACE_SIZE = 0x00400000,
+};
+#endif /* __NDCTL_H__ */
diff --git a/lib/Kconfig b/lib/Kconfig
index 34e332b8d326..3a2ef67db6c7 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -528,4 +528,7 @@ source "lib/fonts/Kconfig"
config ARCH_HAS_SG_CHAIN
def_bool n
+config ARCH_HAS_PMEM_API
+ bool
+
endmenu
diff --git a/lib/kobject.c b/lib/kobject.c
index 75ee63834fd1..2e3bd01964a9 100644
--- a/lib/kobject.c
+++ b/lib/kobject.c
@@ -545,6 +545,7 @@ out:
kfree(devpath);
return error;
}
+EXPORT_SYMBOL_GPL(kobject_move);
/**
* kobject_del - unlink kobject from hierarchy.
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 9f8d71f78404..983b78694c46 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -855,6 +855,12 @@ void __init setup_kmalloc_cache_index_table(void)
}
}
+static void new_kmalloc_cache(int idx, unsigned long flags)
+{
+ kmalloc_caches[idx] = create_kmalloc_cache(kmalloc_info[idx].name,
+ kmalloc_info[idx].size, flags);
+}
+
/*
* Create the kmalloc array. Some of the regular kmalloc arrays
* may already have been created because they were needed to
@@ -864,25 +870,19 @@ void __init create_kmalloc_caches(unsigned long flags)
{
int i;
- for (i = KMALLOC_LOOP_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
- if (!kmalloc_caches[i]) {
- kmalloc_caches[i] = create_kmalloc_cache(
- kmalloc_info[i].name,
- kmalloc_info[i].size,
- flags);
- }
+ for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
+ if (!kmalloc_caches[i])
+ new_kmalloc_cache(i, flags);
/*
- * "i == 2" is the "kmalloc-192" case which is the last special
- * case for initialization and it's the point to jump to
- * allocate the minimize size of the object. In slab allocator,
- * the KMALLOC_SHIFT_LOW = 5. So, it needs to skip 2^3 and 2^4
- * and go straight to allocate 2^5. If the ARCH_DMA_MINALIGN is
- * defined, it may be larger than 2^5 and here is also the
- * trick to skip the empty gap.
+ * Caches that are not a power of two in size (kmalloc-96
+ * and kmalloc-192) have to be created immediately after
+ * the earlier power-of-two caches.
*/
- if (i == 2)
- i = (KMALLOC_SHIFT_LOW - 1);
+ if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[1] && i == 6)
+ new_kmalloc_cache(1, flags);
+ if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[2] && i == 7)
+ new_kmalloc_cache(2, flags);
}
/* Kmalloc array is now usable */
diff --git a/tools/testing/nvdimm/Kbuild b/tools/testing/nvdimm/Kbuild
new file mode 100644
index 000000000000..8e9b64520ec1
--- /dev/null
+++ b/tools/testing/nvdimm/Kbuild
@@ -0,0 +1,40 @@
+ldflags-y += --wrap=ioremap_cache
+ldflags-y += --wrap=ioremap_nocache
+ldflags-y += --wrap=iounmap
+ldflags-y += --wrap=__request_region
+ldflags-y += --wrap=__release_region
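+
+# The --wrap options above divert calls to ioremap_cache(), ioremap_nocache(),
+# iounmap(), __request_region() and __release_region() made by the objects
+# built here to the __wrap_* implementations in test/iomap.c, so nfit_test
+# resources are emulated rather than touching real iomem ranges.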
+
+DRIVERS := ../../../drivers
+NVDIMM_SRC := $(DRIVERS)/nvdimm
+ACPI_SRC := $(DRIVERS)/acpi
+
+obj-$(CONFIG_LIBNVDIMM) += libnvdimm.o
+obj-$(CONFIG_BLK_DEV_PMEM) += nd_pmem.o
+obj-$(CONFIG_ND_BTT) += nd_btt.o
+obj-$(CONFIG_ND_BLK) += nd_blk.o
+obj-$(CONFIG_ACPI_NFIT) += nfit.o
+
+nfit-y := $(ACPI_SRC)/nfit.o
+nfit-y += config_check.o
+
+nd_pmem-y := $(NVDIMM_SRC)/pmem.o
+nd_pmem-y += config_check.o
+
+nd_btt-y := $(NVDIMM_SRC)/btt.o
+nd_btt-y += config_check.o
+
+nd_blk-y := $(NVDIMM_SRC)/blk.o
+nd_blk-y += config_check.o
+
+libnvdimm-y := $(NVDIMM_SRC)/core.o
+libnvdimm-y += $(NVDIMM_SRC)/bus.o
+libnvdimm-y += $(NVDIMM_SRC)/dimm_devs.o
+libnvdimm-y += $(NVDIMM_SRC)/dimm.o
+libnvdimm-y += $(NVDIMM_SRC)/region_devs.o
+libnvdimm-y += $(NVDIMM_SRC)/region.o
+libnvdimm-y += $(NVDIMM_SRC)/namespace_devs.o
+libnvdimm-y += $(NVDIMM_SRC)/label.o
+libnvdimm-$(CONFIG_BTT) += $(NVDIMM_SRC)/btt_devs.o
+libnvdimm-y += config_check.o
+
+obj-m += test/
diff --git a/tools/testing/nvdimm/Makefile b/tools/testing/nvdimm/Makefile
new file mode 100644
index 000000000000..3dfe024b4e7e
--- /dev/null
+++ b/tools/testing/nvdimm/Makefile
@@ -0,0 +1,7 @@
+KDIR ?= ../../../
+
+default:
+ $(MAKE) -C $(KDIR) M=$$PWD
+
+install: default
+ $(MAKE) -C $(KDIR) M=$$PWD modules_install
diff --git a/tools/testing/nvdimm/config_check.c b/tools/testing/nvdimm/config_check.c
new file mode 100644
index 000000000000..f2c7615554eb
--- /dev/null
+++ b/tools/testing/nvdimm/config_check.c
@@ -0,0 +1,15 @@
+#include <linux/kconfig.h>
+#include <linux/bug.h>
+
+void check(void)
+{
+ /*
+ * These kconfig symbols must be set to "m" for nfit_test to
+ * load and operate.
+ */
+ BUILD_BUG_ON(!IS_MODULE(CONFIG_LIBNVDIMM));
+ BUILD_BUG_ON(!IS_MODULE(CONFIG_BLK_DEV_PMEM));
+ BUILD_BUG_ON(!IS_MODULE(CONFIG_ND_BTT));
+ BUILD_BUG_ON(!IS_MODULE(CONFIG_ND_BLK));
+ BUILD_BUG_ON(!IS_MODULE(CONFIG_ACPI_NFIT));
+}
diff --git a/tools/testing/nvdimm/test/Kbuild b/tools/testing/nvdimm/test/Kbuild
new file mode 100644
index 000000000000..9241064970fe
--- /dev/null
+++ b/tools/testing/nvdimm/test/Kbuild
@@ -0,0 +1,8 @@
+ccflags-y := -I$(src)/../../../../drivers/nvdimm/
+ccflags-y += -I$(src)/../../../../drivers/acpi/
+
+obj-m += nfit_test.o
+obj-m += nfit_test_iomap.o
+
+nfit_test-y := nfit.o
+nfit_test_iomap-y := iomap.o
diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c
new file mode 100644
index 000000000000..c85a6f6ba559
--- /dev/null
+++ b/tools/testing/nvdimm/test/iomap.c
@@ -0,0 +1,151 @@
+/*
+ * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/rculist.h>
+#include <linux/export.h>
+#include <linux/ioport.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/io.h>
+#include "nfit_test.h"
+
+static LIST_HEAD(iomap_head);
+
+static struct iomap_ops {
+ nfit_test_lookup_fn nfit_test_lookup;
+ struct list_head list;
+} iomap_ops = {
+ .list = LIST_HEAD_INIT(iomap_ops.list),
+};
+
+void nfit_test_setup(nfit_test_lookup_fn lookup)
+{
+ iomap_ops.nfit_test_lookup = lookup;
+ list_add_rcu(&iomap_ops.list, &iomap_head);
+}
+EXPORT_SYMBOL(nfit_test_setup);
+
+void nfit_test_teardown(void)
+{
+ list_del_rcu(&iomap_ops.list);
+ synchronize_rcu();
+}
+EXPORT_SYMBOL(nfit_test_teardown);
+
+static struct nfit_test_resource *get_nfit_res(resource_size_t resource)
+{
+ struct iomap_ops *ops;
+
+ ops = list_first_or_null_rcu(&iomap_head, typeof(*ops), list);
+ if (ops)
+ return ops->nfit_test_lookup(resource);
+ return NULL;
+}
+
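+/*
+ * Translate an access to an emulated resource into an access of the test
+ * buffer backing it; addresses outside the registered test ranges fall
+ * through to the real ioremap variant.
+ */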
+void __iomem *__nfit_test_ioremap(resource_size_t offset, unsigned long size,
+ void __iomem *(*fallback_fn)(resource_size_t, unsigned long))
+{
+ struct nfit_test_resource *nfit_res;
+
+ rcu_read_lock();
+ nfit_res = get_nfit_res(offset);
+ rcu_read_unlock();
+ if (nfit_res)
+ return (void __iomem *) nfit_res->buf + offset
+ - nfit_res->res->start;
+ return fallback_fn(offset, size);
+}
+
+void __iomem *__wrap_ioremap_cache(resource_size_t offset, unsigned long size)
+{
+ return __nfit_test_ioremap(offset, size, ioremap_cache);
+}
+EXPORT_SYMBOL(__wrap_ioremap_cache);
+
+void __iomem *__wrap_ioremap_nocache(resource_size_t offset, unsigned long size)
+{
+ return __nfit_test_ioremap(offset, size, ioremap_nocache);
+}
+EXPORT_SYMBOL(__wrap_ioremap_nocache);
+
+void __wrap_iounmap(volatile void __iomem *addr)
+{
+ struct nfit_test_resource *nfit_res;
+
+ rcu_read_lock();
+ nfit_res = get_nfit_res((unsigned long) addr);
+ rcu_read_unlock();
+ if (nfit_res)
+ return;
+ return iounmap(addr);
+}
+EXPORT_SYMBOL(__wrap_iounmap);
+
+struct resource *__wrap___request_region(struct resource *parent,
+ resource_size_t start, resource_size_t n, const char *name,
+ int flags)
+{
+ struct nfit_test_resource *nfit_res;
+
+ if (parent == &iomem_resource) {
+ rcu_read_lock();
+ nfit_res = get_nfit_res(start);
+ rcu_read_unlock();
+ if (nfit_res) {
+ struct resource *res = nfit_res->res + 1;
+
+ if (start + n > nfit_res->res->start
+ + resource_size(nfit_res->res)) {
+ pr_debug("%s: start: %llx n: %llx overflow: %pr\n",
+ __func__, start, n,
+ nfit_res->res);
+ return NULL;
+ }
+
+ res->start = start;
+ res->end = start + n - 1;
+ res->name = name;
+ res->flags = resource_type(parent);
+ res->flags |= IORESOURCE_BUSY | flags;
+ pr_debug("%s: %pr\n", __func__, res);
+ return res;
+ }
+ }
+ return __request_region(parent, start, n, name, flags);
+}
+EXPORT_SYMBOL(__wrap___request_region);
+
+void __wrap___release_region(struct resource *parent, resource_size_t start,
+ resource_size_t n)
+{
+ struct nfit_test_resource *nfit_res;
+
+ if (parent == &iomem_resource) {
+ rcu_read_lock();
+ nfit_res = get_nfit_res(start);
+ rcu_read_unlock();
+ if (nfit_res) {
+ struct resource *res = nfit_res->res + 1;
+
+ if (start != res->start || resource_size(res) != n)
+ pr_info("%s: start: %llx n: %llx mismatch: %pr\n",
+ __func__, start, n, res);
+ else
+ memset(res, 0, sizeof(*res));
+ return;
+ }
+ }
+ __release_region(parent, start, n);
+}
+EXPORT_SYMBOL(__wrap___release_region);
+
+MODULE_LICENSE("GPL v2");
diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
new file mode 100644
index 000000000000..4b69b8368de0
--- /dev/null
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -0,0 +1,1116 @@
+/*
+ * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/platform_device.h>
+#include <linux/dma-mapping.h>
+#include <linux/libnvdimm.h>
+#include <linux/vmalloc.h>
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/ndctl.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <nfit.h>
+#include <nd.h>
+#include "nfit_test.h"
+
+/*
+ * Generate an NFIT table to describe the following topology:
+ *
+ * BUS0: Interleaved PMEM regions that alias with BLK regions
+ *
+ * (a) (b) DIMM BLK-REGION
+ * +----------+--------------+----------+---------+
+ * +------+ | blk2.0 | pm0.0 | blk2.1 | pm1.0 | 0 region2
+ * | imc0 +--+- - - - - region0 - - - -+----------+ +
+ * +--+---+ | blk3.0 | pm0.0 | blk3.1 | pm1.0 | 1 region3
+ * | +----------+--------------v----------v v
+ * +--+---+ | |
+ * | cpu0 | region1
+ * +--+---+ | |
+ * | +-------------------------^----------^ ^
+ * +--+---+ | blk4.0 | pm1.0 | 2 region4
+ * | imc1 +--+-------------------------+----------+ +
+ * +------+ | blk5.0 | pm1.0 | 3 region5
+ * +-------------------------+----------+-+-------+
+ *
+ * *) In this layout we have four dimms and two memory controllers in one
+ * socket. Each unique interface (BLK or PMEM) to DPA space
+ * is identified by a region device with a dynamically assigned id.
+ *
+ * *) The first portion of dimm0 and dimm1 are interleaved as REGION0.
+ * A single PMEM namespace "pm0.0" is created using half of the
+ * REGION0 SPA-range. REGION0 spans dimm0 and dimm1. PMEM namespaces
+ * allocate from the bottom of a region. The unallocated
+ * portion of REGION0 aliases with REGION2 and REGION3. That
+ * unallocated capacity is reclaimed as BLK namespaces ("blk2.0" and
+ * "blk3.0") starting at the base of each DIMM to offset (a) in those
+ * DIMMs. "pm0.0", "blk2.0" and "blk3.0" are free-form readable
+ * names that can be assigned to a namespace.
+ *
+ * *) In the last portion of dimm0 and dimm1 we have an interleaved
+ * SPA range, REGION1, that spans those two dimms as well as dimm2
+ * and dimm3. Some of REGION1 is allocated to a PMEM namespace named
+ * "pm1.0"; the rest is reclaimed as 4 BLK namespaces (one for each
+ * dimm in the interleave set): "blk2.1", "blk3.1", "blk4.0", and
+ * "blk5.0".
+ *
+ * *) The portions of dimm2 and dimm3 that do not participate in the
+ * REGION1 interleaved SPA range (i.e. the DPA addresses below offset
+ * (b)) are also included in the "blk4.0" and "blk5.0" namespaces.
+ * Note that BLK namespaces need not be contiguous in DPA-space, and
+ * can consume aliased capacity from multiple interleave sets.
+ *
+ * BUS1: Legacy NVDIMM (single contiguous range)
+ *
+ * region2
+ * +---------------------+
+ * |---------------------|
+ * || pm2.0 ||
+ * |---------------------|
+ * +---------------------+
+ *
+ * *) An NFIT table may describe a simple system-physical-address range
+ * with no BLK aliasing. This type of region may optionally
+ * reference an NVDIMM.
+ */
+enum {
+ NUM_PM = 2,
+ NUM_DCR = 4,
+ NUM_BDW = NUM_DCR,
+ NUM_SPA = NUM_PM + NUM_DCR + NUM_BDW,
+ NUM_MEM = NUM_DCR + NUM_BDW + 2 /* spa0 iset */ + 4 /* spa1 iset */,
+ DIMM_SIZE = SZ_32M,
+ LABEL_SIZE = SZ_128K,
+ SPA0_SIZE = DIMM_SIZE,
+ SPA1_SIZE = DIMM_SIZE*2,
+ SPA2_SIZE = DIMM_SIZE,
+ BDW_SIZE = 64 << 8,
+ DCR_SIZE = 12,
+ NUM_NFITS = 2, /* permit testing multiple NFITs per system */
+};
+
+struct nfit_test_dcr {
+ __le64 bdw_addr;
+ __le32 bdw_status;
+ __u8 aperature[BDW_SIZE];
+};
+
+#define NFIT_DIMM_HANDLE(node, socket, imc, chan, dimm) \
+ (((node & 0xfff) << 16) | ((socket & 0xf) << 12) \
+ | ((imc & 0xf) << 8) | ((chan & 0xf) << 4) | (dimm & 0xf))
+
+static u32 handle[NUM_DCR] = {
+ [0] = NFIT_DIMM_HANDLE(0, 0, 0, 0, 0),
+ [1] = NFIT_DIMM_HANDLE(0, 0, 0, 0, 1),
+ [2] = NFIT_DIMM_HANDLE(0, 0, 1, 0, 0),
+ [3] = NFIT_DIMM_HANDLE(0, 0, 1, 0, 1),
+};
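+
+/*
+ * For example, handle[2] above encodes node 0, socket 0, imc 1, channel 0,
+ * dimm 0, i.e. NFIT_DIMM_HANDLE(0, 0, 1, 0, 0) == 0x100.
+ */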
+
+struct nfit_test {
+ struct acpi_nfit_desc acpi_desc;
+ struct platform_device pdev;
+ struct list_head resources;
+ void *nfit_buf;
+ dma_addr_t nfit_dma;
+ size_t nfit_size;
+ int num_dcr;
+ int num_pm;
+ void **dimm;
+ dma_addr_t *dimm_dma;
+ void **label;
+ dma_addr_t *label_dma;
+ void **spa_set;
+ dma_addr_t *spa_set_dma;
+ struct nfit_test_dcr **dcr;
+ dma_addr_t *dcr_dma;
+ int (*alloc)(struct nfit_test *t);
+ void (*setup)(struct nfit_test *t);
+};
+
+static struct nfit_test *to_nfit_test(struct device *dev)
+{
+ struct platform_device *pdev = to_platform_device(dev);
+
+ return container_of(pdev, struct nfit_test, pdev);
+}
+
+static int nfit_test_ctl(struct nvdimm_bus_descriptor *nd_desc,
+ struct nvdimm *nvdimm, unsigned int cmd, void *buf,
+ unsigned int buf_len)
+{
+ struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc);
+ struct nfit_test *t = container_of(acpi_desc, typeof(*t), acpi_desc);
+ struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm);
+ int i, rc;
+
+ if (!nfit_mem || !test_bit(cmd, &nfit_mem->dsm_mask))
+ return -ENXIO;
+
+ /* lookup label space for the given dimm */
+ for (i = 0; i < ARRAY_SIZE(handle); i++)
+ if (__to_nfit_memdev(nfit_mem)->device_handle == handle[i])
+ break;
+ if (i >= ARRAY_SIZE(handle))
+ return -ENXIO;
+
+ switch (cmd) {
+ case ND_CMD_GET_CONFIG_SIZE: {
+ struct nd_cmd_get_config_size *nd_cmd = buf;
+
+ if (buf_len < sizeof(*nd_cmd))
+ return -EINVAL;
+ nd_cmd->status = 0;
+ nd_cmd->config_size = LABEL_SIZE;
+ nd_cmd->max_xfer = SZ_4K;
+ rc = 0;
+ break;
+ }
+ case ND_CMD_GET_CONFIG_DATA: {
+ struct nd_cmd_get_config_data_hdr *nd_cmd = buf;
+ unsigned int len, offset = nd_cmd->in_offset;
+
+ if (buf_len < sizeof(*nd_cmd))
+ return -EINVAL;
+ if (offset >= LABEL_SIZE)
+ return -EINVAL;
+ if (nd_cmd->in_length + sizeof(*nd_cmd) > buf_len)
+ return -EINVAL;
+
+ nd_cmd->status = 0;
+ len = min(nd_cmd->in_length, LABEL_SIZE - offset);
+ memcpy(nd_cmd->out_buf, t->label[i] + offset, len);
+ rc = buf_len - sizeof(*nd_cmd) - len;
+ break;
+ }
+ case ND_CMD_SET_CONFIG_DATA: {
+ struct nd_cmd_set_config_hdr *nd_cmd = buf;
+ unsigned int len, offset = nd_cmd->in_offset;
+ u32 *status;
+
+ if (buf_len < sizeof(*nd_cmd))
+ return -EINVAL;
+ if (offset >= LABEL_SIZE)
+ return -EINVAL;
+ if (nd_cmd->in_length + sizeof(*nd_cmd) + 4 > buf_len)
+ return -EINVAL;
+
+ status = buf + nd_cmd->in_length + sizeof(*nd_cmd);
+ *status = 0;
+ len = min(nd_cmd->in_length, LABEL_SIZE - offset);
+ memcpy(t->label[i] + offset, nd_cmd->in_buf, len);
+ rc = buf_len - sizeof(*nd_cmd) - (len + 4);
+ break;
+ }
+ default:
+ return -ENOTTY;
+ }
+
+ return rc;
+}
+
+static DEFINE_SPINLOCK(nfit_test_lock);
+static struct nfit_test *instances[NUM_NFITS];
+
+static void release_nfit_res(void *data)
+{
+ struct nfit_test_resource *nfit_res = data;
+ struct resource *res = nfit_res->res;
+
+ spin_lock(&nfit_test_lock);
+ list_del(&nfit_res->list);
+ spin_unlock(&nfit_test_lock);
+
+ if (is_vmalloc_addr(nfit_res->buf))
+ vfree(nfit_res->buf);
+ else
+ dma_free_coherent(nfit_res->dev, resource_size(res),
+ nfit_res->buf, res->start);
+ kfree(res);
+ kfree(nfit_res);
+}
+
+static void *__test_alloc(struct nfit_test *t, size_t size, dma_addr_t *dma,
+ void *buf)
+{
+ struct device *dev = &t->pdev.dev;
+ struct resource *res = kzalloc(sizeof(*res) * 2, GFP_KERNEL);
+ struct nfit_test_resource *nfit_res = kzalloc(sizeof(*nfit_res),
+ GFP_KERNEL);
+ int rc;
+
+ if (!res || !buf || !nfit_res)
+ goto err;
+ rc = devm_add_action(dev, release_nfit_res, nfit_res);
+ if (rc)
+ goto err;
+ INIT_LIST_HEAD(&nfit_res->list);
+ memset(buf, 0, size);
+ nfit_res->dev = dev;
+ nfit_res->buf = buf;
+ nfit_res->res = res;
+ res->start = *dma;
+ res->end = *dma + size - 1;
+ res->name = "NFIT";
+ spin_lock(&nfit_test_lock);
+ list_add(&nfit_res->list, &t->resources);
+ spin_unlock(&nfit_test_lock);
+
+ return nfit_res->buf;
+ err:
+ if (buf && !is_vmalloc_addr(buf))
+ dma_free_coherent(dev, size, buf, *dma);
+ else if (buf)
+ vfree(buf);
+ kfree(res);
+ kfree(nfit_res);
+ return NULL;
+}
+
+static void *test_alloc(struct nfit_test *t, size_t size, dma_addr_t *dma)
+{
+ void *buf = vmalloc(size);
+
+ *dma = (unsigned long) buf;
+ return __test_alloc(t, size, dma, buf);
+}
+
+static void *test_alloc_coherent(struct nfit_test *t, size_t size,
+ dma_addr_t *dma)
+{
+ struct device *dev = &t->pdev.dev;
+ void *buf = dma_alloc_coherent(dev, size, dma, GFP_KERNEL);
+
+ return __test_alloc(t, size, dma, buf);
+}
+
+static struct nfit_test_resource *nfit_test_lookup(resource_size_t addr)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(instances); i++) {
+ struct nfit_test_resource *n, *nfit_res = NULL;
+ struct nfit_test *t = instances[i];
+
+ if (!t)
+ continue;
+ spin_lock(&nfit_test_lock);
+ list_for_each_entry(n, &t->resources, list) {
+ if (addr >= n->res->start && (addr < n->res->start
+ + resource_size(n->res))) {
+ nfit_res = n;
+ break;
+ } else if (addr >= (unsigned long) n->buf
+ && (addr < (unsigned long) n->buf
+ + resource_size(n->res))) {
+ nfit_res = n;
+ break;
+ }
+ }
+ spin_unlock(&nfit_test_lock);
+ if (nfit_res)
+ return nfit_res;
+ }
+
+ return NULL;
+}
+
+static int nfit_test0_alloc(struct nfit_test *t)
+{
+ size_t nfit_size = sizeof(struct acpi_table_nfit)
+ + sizeof(struct acpi_nfit_system_address) * NUM_SPA
+ + sizeof(struct acpi_nfit_memory_map) * NUM_MEM
+ + sizeof(struct acpi_nfit_control_region) * NUM_DCR
+ + sizeof(struct acpi_nfit_data_region) * NUM_BDW;
+ int i;
+
+ t->nfit_buf = test_alloc(t, nfit_size, &t->nfit_dma);
+ if (!t->nfit_buf)
+ return -ENOMEM;
+ t->nfit_size = nfit_size;
+
+ t->spa_set[0] = test_alloc_coherent(t, SPA0_SIZE, &t->spa_set_dma[0]);
+ if (!t->spa_set[0])
+ return -ENOMEM;
+
+ t->spa_set[1] = test_alloc_coherent(t, SPA1_SIZE, &t->spa_set_dma[1]);
+ if (!t->spa_set[1])
+ return -ENOMEM;
+
+ for (i = 0; i < NUM_DCR; i++) {
+ t->dimm[i] = test_alloc(t, DIMM_SIZE, &t->dimm_dma[i]);
+ if (!t->dimm[i])
+ return -ENOMEM;
+
+ t->label[i] = test_alloc(t, LABEL_SIZE, &t->label_dma[i]);
+ if (!t->label[i])
+ return -ENOMEM;
+ sprintf(t->label[i], "label%d", i);
+ }
+
+ for (i = 0; i < NUM_DCR; i++) {
+ t->dcr[i] = test_alloc(t, LABEL_SIZE, &t->dcr_dma[i]);
+ if (!t->dcr[i])
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static int nfit_test1_alloc(struct nfit_test *t)
+{
+ size_t nfit_size = sizeof(struct acpi_table_nfit)
+ + sizeof(struct acpi_nfit_system_address)
+ + sizeof(struct acpi_nfit_memory_map)
+ + sizeof(struct acpi_nfit_control_region);
+
+ t->nfit_buf = test_alloc(t, nfit_size, &t->nfit_dma);
+ if (!t->nfit_buf)
+ return -ENOMEM;
+ t->nfit_size = nfit_size;
+
+ t->spa_set[0] = test_alloc_coherent(t, SPA2_SIZE, &t->spa_set_dma[0]);
+ if (!t->spa_set[0])
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void nfit_test_init_header(struct acpi_table_nfit *nfit, size_t size)
+{
+ memcpy(nfit->header.signature, ACPI_SIG_NFIT, 4);
+ nfit->header.length = size;
+ nfit->header.revision = 1;
+ memcpy(nfit->header.oem_id, "LIBND", 6);
+ memcpy(nfit->header.oem_table_id, "TEST", 5);
+ nfit->header.oem_revision = 1;
+ memcpy(nfit->header.asl_compiler_id, "TST", 4);
+ nfit->header.asl_compiler_revision = 1;
+}
+
+static void nfit_test0_setup(struct nfit_test *t)
+{
+ struct nvdimm_bus_descriptor *nd_desc;
+ struct acpi_nfit_desc *acpi_desc;
+ struct acpi_nfit_memory_map *memdev;
+ void *nfit_buf = t->nfit_buf;
+ size_t size = t->nfit_size;
+ struct acpi_nfit_system_address *spa;
+ struct acpi_nfit_control_region *dcr;
+ struct acpi_nfit_data_region *bdw;
+ unsigned int offset;
+
+ nfit_test_init_header(nfit_buf, size);
+
+ /*
+ * spa0 (interleave first half of dimm0 and dimm1, note storage
+ * does not actually alias the related block-data-window
+ * regions)
+ */
+ spa = nfit_buf + sizeof(struct acpi_table_nfit);
+ spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS;
+ spa->header.length = sizeof(*spa);
+ memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_PM), 16);
+ spa->range_index = 0+1;
+ spa->address = t->spa_set_dma[0];
+ spa->length = SPA0_SIZE;
+
+ /*
+ * spa1 (interleave last half of the 4 DIMMS, note storage
+ * does not actually alias the related block-data-window
+ * regions)
+ */
+ spa = nfit_buf + sizeof(struct acpi_table_nfit) + sizeof(*spa);
+ spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS;
+ spa->header.length = sizeof(*spa);
+ memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_PM), 16);
+ spa->range_index = 1+1;
+ spa->address = t->spa_set_dma[1];
+ spa->length = SPA1_SIZE;
+
+ /* spa2 (dcr0) dimm0 */
+ spa = nfit_buf + sizeof(struct acpi_table_nfit) + sizeof(*spa) * 2;
+ spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS;
+ spa->header.length = sizeof(*spa);
+ memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_DCR), 16);
+ spa->range_index = 2+1;
+ spa->address = t->dcr_dma[0];
+ spa->length = DCR_SIZE;
+
+ /* spa3 (dcr1) dimm1 */
+ spa = nfit_buf + sizeof(struct acpi_table_nfit) + sizeof(*spa) * 3;
+ spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS;
+ spa->header.length = sizeof(*spa);
+ memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_DCR), 16);
+ spa->range_index = 3+1;
+ spa->address = t->dcr_dma[1];
+ spa->length = DCR_SIZE;
+
+ /* spa4 (dcr2) dimm2 */
+ spa = nfit_buf + sizeof(struct acpi_table_nfit) + sizeof(*spa) * 4;
+ spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS;
+ spa->header.length = sizeof(*spa);
+ memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_DCR), 16);
+ spa->range_index = 4+1;
+ spa->address = t->dcr_dma[2];
+ spa->length = DCR_SIZE;
+
+ /* spa5 (dcr3) dimm3 */
+ spa = nfit_buf + sizeof(struct acpi_table_nfit) + sizeof(*spa) * 5;
+ spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS;
+ spa->header.length = sizeof(*spa);
+ memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_DCR), 16);
+ spa->range_index = 5+1;
+ spa->address = t->dcr_dma[3];
+ spa->length = DCR_SIZE;
+
+ /* spa6 (bdw for dcr0) dimm0 */
+ spa = nfit_buf + sizeof(struct acpi_table_nfit) + sizeof(*spa) * 6;
+ spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS;
+ spa->header.length = sizeof(*spa);
+ memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_BDW), 16);
+ spa->range_index = 6+1;
+ spa->address = t->dimm_dma[0];
+ spa->length = DIMM_SIZE;
+
+ /* spa7 (bdw for dcr1) dimm1 */
+ spa = nfit_buf + sizeof(struct acpi_table_nfit) + sizeof(*spa) * 7;
+ spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS;
+ spa->header.length = sizeof(*spa);
+ memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_BDW), 16);
+ spa->range_index = 7+1;
+ spa->address = t->dimm_dma[1];
+ spa->length = DIMM_SIZE;
+
+ /* spa8 (bdw for dcr2) dimm2 */
+ spa = nfit_buf + sizeof(struct acpi_table_nfit) + sizeof(*spa) * 8;
+ spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS;
+ spa->header.length = sizeof(*spa);
+ memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_BDW), 16);
+ spa->range_index = 8+1;
+ spa->address = t->dimm_dma[2];
+ spa->length = DIMM_SIZE;
+
+ /* spa9 (bdw for dcr3) dimm3 */
+ spa = nfit_buf + sizeof(struct acpi_table_nfit) + sizeof(*spa) * 9;
+ spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS;
+ spa->header.length = sizeof(*spa);
+ memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_BDW), 16);
+ spa->range_index = 9+1;
+ spa->address = t->dimm_dma[3];
+ spa->length = DIMM_SIZE;
+
+ offset = sizeof(struct acpi_table_nfit) + sizeof(*spa) * 10;
+ /* mem-region0 (spa0, dimm0) */
+ memdev = nfit_buf + offset;
+ memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP;
+ memdev->header.length = sizeof(*memdev);
+ memdev->device_handle = handle[0];
+ memdev->physical_id = 0;
+ memdev->region_id = 0;
+ memdev->range_index = 0+1;
+ memdev->region_index = 0+1;
+ memdev->region_size = SPA0_SIZE/2;
+ memdev->region_offset = t->spa_set_dma[0];
+ memdev->address = 0;
+ memdev->interleave_index = 0;
+ memdev->interleave_ways = 2;
+
+ /* mem-region1 (spa0, dimm1) */
+ memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map);
+ memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP;
+ memdev->header.length = sizeof(*memdev);
+ memdev->device_handle = handle[1];
+ memdev->physical_id = 1;
+ memdev->region_id = 0;
+ memdev->range_index = 0+1;
+ memdev->region_index = 1+1;
+ memdev->region_size = SPA0_SIZE/2;
+ memdev->region_offset = t->spa_set_dma[0] + SPA0_SIZE/2;
+ memdev->address = 0;
+ memdev->interleave_index = 0;
+ memdev->interleave_ways = 2;
+
+ /* mem-region2 (spa1, dimm0) */
+ memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 2;
+ memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP;
+ memdev->header.length = sizeof(*memdev);
+ memdev->device_handle = handle[0];
+ memdev->physical_id = 0;
+ memdev->region_id = 1;
+ memdev->range_index = 1+1;
+ memdev->region_index = 0+1;
+ memdev->region_size = SPA1_SIZE/4;
+ memdev->region_offset = t->spa_set_dma[1];
+ memdev->address = SPA0_SIZE/2;
+ memdev->interleave_index = 0;
+ memdev->interleave_ways = 4;
+
+ /* mem-region3 (spa1, dimm1) */
+ memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 3;
+ memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP;
+ memdev->header.length = sizeof(*memdev);
+ memdev->device_handle = handle[1];
+ memdev->physical_id = 1;
+ memdev->region_id = 1;
+ memdev->range_index = 1+1;
+ memdev->region_index = 1+1;
+ memdev->region_size = SPA1_SIZE/4;
+ memdev->region_offset = t->spa_set_dma[1] + SPA1_SIZE/4;
+ memdev->address = SPA0_SIZE/2;
+ memdev->interleave_index = 0;
+ memdev->interleave_ways = 4;
+
+ /* mem-region4 (spa1, dimm2) */
+ memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 4;
+ memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP;
+ memdev->header.length = sizeof(*memdev);
+ memdev->device_handle = handle[2];
+ memdev->physical_id = 2;
+ memdev->region_id = 0;
+ memdev->range_index = 1+1;
+ memdev->region_index = 2+1;
+ memdev->region_size = SPA1_SIZE/4;
+ memdev->region_offset = t->spa_set_dma[1] + 2*SPA1_SIZE/4;
+ memdev->address = SPA0_SIZE/2;
+ memdev->interleave_index = 0;
+ memdev->interleave_ways = 4;
+
+ /* mem-region5 (spa1, dimm3) */
+ memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 5;
+ memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP;
+ memdev->header.length = sizeof(*memdev);
+ memdev->device_handle = handle[3];
+ memdev->physical_id = 3;
+ memdev->region_id = 0;
+ memdev->range_index = 1+1;
+ memdev->region_index = 3+1;
+ memdev->region_size = SPA1_SIZE/4;
+ memdev->region_offset = t->spa_set_dma[1] + 3*SPA1_SIZE/4;
+ memdev->address = SPA0_SIZE/2;
+ memdev->interleave_index = 0;
+ memdev->interleave_ways = 4;
+
+ /* mem-region6 (spa/dcr0, dimm0) */
+ memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 6;
+ memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP;
+ memdev->header.length = sizeof(*memdev);
+ memdev->device_handle = handle[0];
+ memdev->physical_id = 0;
+ memdev->region_id = 0;
+ memdev->range_index = 2+1;
+ memdev->region_index = 0+1;
+ memdev->region_size = 0;
+ memdev->region_offset = 0;
+ memdev->address = 0;
+ memdev->interleave_index = 0;
+ memdev->interleave_ways = 1;
+
+ /* mem-region7 (spa/dcr1, dimm1) */
+ memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 7;
+ memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP;
+ memdev->header.length = sizeof(*memdev);
+ memdev->device_handle = handle[1];
+ memdev->physical_id = 1;
+ memdev->region_id = 0;
+ memdev->range_index = 3+1;
+ memdev->region_index = 1+1;
+ memdev->region_size = 0;
+ memdev->region_offset = 0;
+ memdev->address = 0;
+ memdev->interleave_index = 0;
+ memdev->interleave_ways = 1;
+
+ /* mem-region8 (spa/dcr2, dimm2) */
+ memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 8;
+ memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP;
+ memdev->header.length = sizeof(*memdev);
+ memdev->device_handle = handle[2];
+ memdev->physical_id = 2;
+ memdev->region_id = 0;
+ memdev->range_index = 4+1;
+ memdev->region_index = 2+1;
+ memdev->region_size = 0;
+ memdev->region_offset = 0;
+ memdev->address = 0;
+ memdev->interleave_index = 0;
+ memdev->interleave_ways = 1;
+
+ /* mem-region9 (spa/dcr3, dimm3) */
+ memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 9;
+ memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP;
+ memdev->header.length = sizeof(*memdev);
+ memdev->device_handle = handle[3];
+ memdev->physical_id = 3;
+ memdev->region_id = 0;
+ memdev->range_index = 5+1;
+ memdev->region_index = 3+1;
+ memdev->region_size = 0;
+ memdev->region_offset = 0;
+ memdev->address = 0;
+ memdev->interleave_index = 0;
+ memdev->interleave_ways = 1;
+
+ /* mem-region10 (spa/bdw0, dimm0) */
+ memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 10;
+ memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP;
+ memdev->header.length = sizeof(*memdev);
+ memdev->device_handle = handle[0];
+ memdev->physical_id = 0;
+ memdev->region_id = 0;
+ memdev->range_index = 6+1;
+ memdev->region_index = 0+1;
+ memdev->region_size = 0;
+ memdev->region_offset = 0;
+ memdev->address = 0;
+ memdev->interleave_index = 0;
+ memdev->interleave_ways = 1;
+
+ /* mem-region11 (spa/bdw1, dimm1) */
+ memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 11;
+ memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP;
+ memdev->header.length = sizeof(*memdev);
+ memdev->device_handle = handle[1];
+ memdev->physical_id = 1;
+ memdev->region_id = 0;
+ memdev->range_index = 7+1;
+ memdev->region_index = 1+1;
+ memdev->region_size = 0;
+ memdev->region_offset = 0;
+ memdev->address = 0;
+ memdev->interleave_index = 0;
+ memdev->interleave_ways = 1;
+
+ /* mem-region12 (spa/bdw2, dimm2) */
+ memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 12;
+ memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP;
+ memdev->header.length = sizeof(*memdev);
+ memdev->device_handle = handle[2];
+ memdev->physical_id = 2;
+ memdev->region_id = 0;
+ memdev->range_index = 8+1;
+ memdev->region_index = 2+1;
+ memdev->region_size = 0;
+ memdev->region_offset = 0;
+ memdev->address = 0;
+ memdev->interleave_index = 0;
+ memdev->interleave_ways = 1;
+
+ /* mem-region13 (spa/dcr3, dimm3) */
+ memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 13;
+ memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP;
+ memdev->header.length = sizeof(*memdev);
+ memdev->device_handle = handle[3];
+ memdev->physical_id = 3;
+ memdev->region_id = 0;
+ memdev->range_index = 9+1;
+ memdev->region_index = 3+1;
+ memdev->region_size = 0;
+ memdev->region_offset = 0;
+ memdev->address = 0;
+ memdev->interleave_index = 0;
+ memdev->interleave_ways = 1;
+
+ offset = offset + sizeof(struct acpi_nfit_memory_map) * 14;
+ /* dcr-descriptor0 */
+ dcr = nfit_buf + offset;
+ dcr->header.type = ACPI_NFIT_TYPE_CONTROL_REGION;
+ dcr->header.length = sizeof(struct acpi_nfit_control_region);
+ dcr->region_index = 0+1;
+ dcr->vendor_id = 0xabcd;
+ dcr->device_id = 0;
+ dcr->revision_id = 1;
+ dcr->serial_number = ~handle[0];
+ dcr->windows = 1;
+ dcr->window_size = DCR_SIZE;
+ dcr->command_offset = 0;
+ dcr->command_size = 8;
+ dcr->status_offset = 8;
+ dcr->status_size = 4;
+
+ /* dcr-descriptor1 */
+ dcr = nfit_buf + offset + sizeof(struct acpi_nfit_control_region);
+ dcr->header.type = ACPI_NFIT_TYPE_CONTROL_REGION;
+ dcr->header.length = sizeof(struct acpi_nfit_control_region);
+ dcr->region_index = 1+1;
+ dcr->vendor_id = 0xabcd;
+ dcr->device_id = 0;
+ dcr->revision_id = 1;
+ dcr->serial_number = ~handle[1];
+ dcr->windows = 1;
+ dcr->window_size = DCR_SIZE;
+ dcr->command_offset = 0;
+ dcr->command_size = 8;
+ dcr->status_offset = 8;
+ dcr->status_size = 4;
+
+ /* dcr-descriptor2 */
+ dcr = nfit_buf + offset + sizeof(struct acpi_nfit_control_region) * 2;
+ dcr->header.type = ACPI_NFIT_TYPE_CONTROL_REGION;
+ dcr->header.length = sizeof(struct acpi_nfit_control_region);
+ dcr->region_index = 2+1;
+ dcr->vendor_id = 0xabcd;
+ dcr->device_id = 0;
+ dcr->revision_id = 1;
+ dcr->serial_number = ~handle[2];
+ dcr->windows = 1;
+ dcr->window_size = DCR_SIZE;
+ dcr->command_offset = 0;
+ dcr->command_size = 8;
+ dcr->status_offset = 8;
+ dcr->status_size = 4;
+
+ /* dcr-descriptor3 */
+ dcr = nfit_buf + offset + sizeof(struct acpi_nfit_control_region) * 3;
+ dcr->header.type = ACPI_NFIT_TYPE_CONTROL_REGION;
+ dcr->header.length = sizeof(struct acpi_nfit_control_region);
+ dcr->region_index = 3+1;
+ dcr->vendor_id = 0xabcd;
+ dcr->device_id = 0;
+ dcr->revision_id = 1;
+ dcr->serial_number = ~handle[3];
+ dcr->windows = 1;
+ dcr->window_size = DCR_SIZE;
+ dcr->command_offset = 0;
+ dcr->command_size = 8;
+ dcr->status_offset = 8;
+ dcr->status_size = 4;
+
+ offset = offset + sizeof(struct acpi_nfit_control_region) * 4;
+ /* bdw0 (spa/dcr0, dimm0) */
+ bdw = nfit_buf + offset;
+ bdw->header.type = ACPI_NFIT_TYPE_DATA_REGION;
+ bdw->header.length = sizeof(struct acpi_nfit_data_region);
+ bdw->region_index = 0+1;
+ bdw->windows = 1;
+ bdw->offset = 0;
+ bdw->size = BDW_SIZE;
+ bdw->capacity = DIMM_SIZE;
+ bdw->start_address = 0;
+
+ /* bdw1 (spa/dcr1, dimm1) */
+ bdw = nfit_buf + offset + sizeof(struct acpi_nfit_data_region);
+ bdw->header.type = ACPI_NFIT_TYPE_DATA_REGION;
+ bdw->header.length = sizeof(struct acpi_nfit_data_region);
+ bdw->region_index = 1+1;
+ bdw->windows = 1;
+ bdw->offset = 0;
+ bdw->size = BDW_SIZE;
+ bdw->capacity = DIMM_SIZE;
+ bdw->start_address = 0;
+
+ /* bdw2 (spa/dcr2, dimm2) */
+ bdw = nfit_buf + offset + sizeof(struct acpi_nfit_data_region) * 2;
+ bdw->header.type = ACPI_NFIT_TYPE_DATA_REGION;
+ bdw->header.length = sizeof(struct acpi_nfit_data_region);
+ bdw->region_index = 2+1;
+ bdw->windows = 1;
+ bdw->offset = 0;
+ bdw->size = BDW_SIZE;
+ bdw->capacity = DIMM_SIZE;
+ bdw->start_address = 0;
+
+ /* bdw3 (spa/dcr3, dimm3) */
+ bdw = nfit_buf + offset + sizeof(struct acpi_nfit_data_region) * 3;
+ bdw->header.type = ACPI_NFIT_TYPE_DATA_REGION;
+ bdw->header.length = sizeof(struct acpi_nfit_data_region);
+ bdw->region_index = 3+1;
+ bdw->windows = 1;
+ bdw->offset = 0;
+ bdw->size = BDW_SIZE;
+ bdw->capacity = DIMM_SIZE;
+ bdw->start_address = 0;
+
+ acpi_desc = &t->acpi_desc;
+ set_bit(ND_CMD_GET_CONFIG_SIZE, &acpi_desc->dimm_dsm_force_en);
+ set_bit(ND_CMD_GET_CONFIG_DATA, &acpi_desc->dimm_dsm_force_en);
+ set_bit(ND_CMD_SET_CONFIG_DATA, &acpi_desc->dimm_dsm_force_en);
+ nd_desc = &acpi_desc->nd_desc;
+ nd_desc->ndctl = nfit_test_ctl;
+}
+
+static void nfit_test1_setup(struct nfit_test *t)
+{
+ size_t size = t->nfit_size, offset;
+ void *nfit_buf = t->nfit_buf;
+ struct acpi_nfit_memory_map *memdev;
+ struct acpi_nfit_control_region *dcr;
+ struct acpi_nfit_system_address *spa;
+
+ nfit_test_init_header(nfit_buf, size);
+
+ offset = sizeof(struct acpi_table_nfit);
+ /* spa0 (flat range with no bdw aliasing) */
+ spa = nfit_buf + offset;
+ spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS;
+ spa->header.length = sizeof(*spa);
+ memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_PM), 16);
+ spa->range_index = 0+1;
+ spa->address = t->spa_set_dma[0];
+ spa->length = SPA2_SIZE;
+
+ offset += sizeof(*spa);
+ /* mem-region0 (spa0, dimm0) */
+ memdev = nfit_buf + offset;
+ memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP;
+ memdev->header.length = sizeof(*memdev);
+ memdev->device_handle = 0;
+ memdev->physical_id = 0;
+ memdev->region_id = 0;
+ memdev->range_index = 0+1;
+ memdev->region_index = 0+1;
+ memdev->region_size = SPA2_SIZE;
+ memdev->region_offset = 0;
+ memdev->address = 0;
+ memdev->interleave_index = 0;
+ memdev->interleave_ways = 1;
+ memdev->flags = ACPI_NFIT_MEM_SAVE_FAILED | ACPI_NFIT_MEM_RESTORE_FAILED
+ | ACPI_NFIT_MEM_FLUSH_FAILED | ACPI_NFIT_MEM_HEALTH_OBSERVED
+ | ACPI_NFIT_MEM_ARMED;
+
+ offset += sizeof(*memdev);
+ /* dcr-descriptor0 */
+ dcr = nfit_buf + offset;
+ dcr->header.type = ACPI_NFIT_TYPE_CONTROL_REGION;
+ dcr->header.length = sizeof(struct acpi_nfit_control_region);
+ dcr->region_index = 0+1;
+ dcr->vendor_id = 0xabcd;
+ dcr->device_id = 0;
+ dcr->revision_id = 1;
+ dcr->serial_number = ~0;
+ dcr->code = 0x201;
+ dcr->windows = 0;
+ dcr->window_size = 0;
+ dcr->command_offset = 0;
+ dcr->command_size = 0;
+ dcr->status_offset = 0;
+ dcr->status_size = 0;
+}
+
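+/*
+ * Stand-in for the ACPI BLK i/o path: rather than programming command and
+ * status registers and moving data through an aperture, the test simply
+ * memcpy()s against the emulated DIMM backing store while holding a
+ * region lane.
+ */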
+static int nfit_test_blk_do_io(struct nd_blk_region *ndbr, resource_size_t dpa,
+ void *iobuf, u64 len, int rw)
+{
+ struct nfit_blk *nfit_blk = ndbr->blk_provider_data;
+ struct nfit_blk_mmio *mmio = &nfit_blk->mmio[BDW];
+ struct nd_region *nd_region = &ndbr->nd_region;
+ unsigned int lane;
+
+ lane = nd_region_acquire_lane(nd_region);
+ if (rw)
+ memcpy(mmio->base + dpa, iobuf, len);
+ else
+ memcpy(iobuf, mmio->base + dpa, len);
+ nd_region_release_lane(nd_region, lane);
+
+ return 0;
+}
+
+static int nfit_test_probe(struct platform_device *pdev)
+{
+ struct nvdimm_bus_descriptor *nd_desc;
+ struct acpi_nfit_desc *acpi_desc;
+ struct device *dev = &pdev->dev;
+ struct nfit_test *nfit_test;
+ int rc;
+
+ nfit_test = to_nfit_test(&pdev->dev);
+
+ /* common alloc */
+ if (nfit_test->num_dcr) {
+ int num = nfit_test->num_dcr;
+
+ nfit_test->dimm = devm_kcalloc(dev, num, sizeof(void *),
+ GFP_KERNEL);
+ nfit_test->dimm_dma = devm_kcalloc(dev, num, sizeof(dma_addr_t),
+ GFP_KERNEL);
+ nfit_test->label = devm_kcalloc(dev, num, sizeof(void *),
+ GFP_KERNEL);
+ nfit_test->label_dma = devm_kcalloc(dev, num,
+ sizeof(dma_addr_t), GFP_KERNEL);
+ nfit_test->dcr = devm_kcalloc(dev, num,
+ sizeof(struct nfit_test_dcr *), GFP_KERNEL);
+ nfit_test->dcr_dma = devm_kcalloc(dev, num,
+ sizeof(dma_addr_t), GFP_KERNEL);
+ if (nfit_test->dimm && nfit_test->dimm_dma && nfit_test->label
+ && nfit_test->label_dma && nfit_test->dcr
+ && nfit_test->dcr_dma)
+ /* pass */;
+ else
+ return -ENOMEM;
+ }
+
+ if (nfit_test->num_pm) {
+ int num = nfit_test->num_pm;
+
+ nfit_test->spa_set = devm_kcalloc(dev, num, sizeof(void *),
+ GFP_KERNEL);
+ nfit_test->spa_set_dma = devm_kcalloc(dev, num,
+ sizeof(dma_addr_t), GFP_KERNEL);
+ if (nfit_test->spa_set && nfit_test->spa_set_dma)
+ /* pass */;
+ else
+ return -ENOMEM;
+ }
+
+ /* per-nfit specific alloc */
+ if (nfit_test->alloc(nfit_test))
+ return -ENOMEM;
+
+ nfit_test->setup(nfit_test);
+ acpi_desc = &nfit_test->acpi_desc;
+ acpi_desc->dev = &pdev->dev;
+ acpi_desc->nfit = nfit_test->nfit_buf;
+ acpi_desc->blk_do_io = nfit_test_blk_do_io;
+ nd_desc = &acpi_desc->nd_desc;
+ nd_desc->attr_groups = acpi_nfit_attribute_groups;
+ acpi_desc->nvdimm_bus = nvdimm_bus_register(&pdev->dev, nd_desc);
+ if (!acpi_desc->nvdimm_bus)
+ return -ENXIO;
+
+ rc = acpi_nfit_init(acpi_desc, nfit_test->nfit_size);
+ if (rc) {
+ nvdimm_bus_unregister(acpi_desc->nvdimm_bus);
+ return rc;
+ }
+
+ return 0;
+}
+
+static int nfit_test_remove(struct platform_device *pdev)
+{
+ struct nfit_test *nfit_test = to_nfit_test(&pdev->dev);
+ struct acpi_nfit_desc *acpi_desc = &nfit_test->acpi_desc;
+
+ nvdimm_bus_unregister(acpi_desc->nvdimm_bus);
+
+ return 0;
+}
+
+static void nfit_test_release(struct device *dev)
+{
+ struct nfit_test *nfit_test = to_nfit_test(dev);
+
+ kfree(nfit_test);
+}
+
+static const struct platform_device_id nfit_test_id[] = {
+ { KBUILD_MODNAME },
+ { },
+};
+
+static struct platform_driver nfit_test_driver = {
+ .probe = nfit_test_probe,
+ .remove = nfit_test_remove,
+ .driver = {
+ .name = KBUILD_MODNAME,
+ },
+ .id_table = nfit_test_id,
+};
+
+#ifdef CONFIG_CMA_SIZE_MBYTES
+#define CMA_SIZE_MBYTES CONFIG_CMA_SIZE_MBYTES
+#else
+#define CMA_SIZE_MBYTES 0
+#endif
+
+static __init int nfit_test_init(void)
+{
+ int rc, i;
+
+ nfit_test_setup(nfit_test_lookup);
+
+ for (i = 0; i < NUM_NFITS; i++) {
+ struct nfit_test *nfit_test;
+ struct platform_device *pdev;
+ static int once;
+
+ nfit_test = kzalloc(sizeof(*nfit_test), GFP_KERNEL);
+ if (!nfit_test) {
+ rc = -ENOMEM;
+ goto err_register;
+ }
+ INIT_LIST_HEAD(&nfit_test->resources);
+ switch (i) {
+ case 0:
+ nfit_test->num_pm = NUM_PM;
+ nfit_test->num_dcr = NUM_DCR;
+ nfit_test->alloc = nfit_test0_alloc;
+ nfit_test->setup = nfit_test0_setup;
+ break;
+ case 1:
+ nfit_test->num_pm = 1;
+ nfit_test->alloc = nfit_test1_alloc;
+ nfit_test->setup = nfit_test1_setup;
+ break;
+ default:
+ rc = -EINVAL;
+ goto err_register;
+ }
+ pdev = &nfit_test->pdev;
+ pdev->name = KBUILD_MODNAME;
+ pdev->id = i;
+ pdev->dev.release = nfit_test_release;
+ rc = platform_device_register(pdev);
+ if (rc) {
+ put_device(&pdev->dev);
+ goto err_register;
+ }
+
+ rc = dma_coerce_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
+ if (rc)
+ goto err_register;
+
+ instances[i] = nfit_test;
+
+ if (!once++) {
+ dma_addr_t dma;
+ void *buf;
+
+ buf = dma_alloc_coherent(&pdev->dev, SZ_128M, &dma,
+ GFP_KERNEL);
+ if (!buf) {
+ rc = -ENOMEM;
+ dev_warn(&pdev->dev, "need 128M of free cma\n");
+ goto err_register;
+ }
+ dma_free_coherent(&pdev->dev, SZ_128M, buf, dma);
+ }
+ }
+
+ rc = platform_driver_register(&nfit_test_driver);
+ if (rc)
+ goto err_register;
+ return 0;
+
+ err_register:
+ for (i = 0; i < NUM_NFITS; i++)
+ if (instances[i])
+ platform_device_unregister(&instances[i]->pdev);
+ nfit_test_teardown();
+ return rc;
+}
+
+static __exit void nfit_test_exit(void)
+{
+ int i;
+
+ platform_driver_unregister(&nfit_test_driver);
+ for (i = 0; i < NUM_NFITS; i++)
+ platform_device_unregister(&instances[i]->pdev);
+ nfit_test_teardown();
+}
+
+module_init(nfit_test_init);
+module_exit(nfit_test_exit);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Intel Corporation");
diff --git a/tools/testing/nvdimm/test/nfit_test.h b/tools/testing/nvdimm/test/nfit_test.h
new file mode 100644
index 000000000000..96c5e16d7db9
--- /dev/null
+++ b/tools/testing/nvdimm/test/nfit_test.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#ifndef __NFIT_TEST_H__
+#define __NFIT_TEST_H__
+
+struct nfit_test_resource {
+ struct list_head list;
+ struct resource *res;
+ struct device *dev;
+ void *buf;
+};
+
+typedef struct nfit_test_resource *(*nfit_test_lookup_fn)(resource_size_t);
+void __iomem *__wrap_ioremap_nocache(resource_size_t offset,
+ unsigned long size);
+void __wrap_iounmap(volatile void __iomem *addr);
+void nfit_test_setup(nfit_test_lookup_fn lookup);
+void nfit_test_teardown(void);
+#endif
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 95abddcd7839..24ae9e829e9a 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -4,6 +4,7 @@ TARGETS += efivarfs
TARGETS += exec
TARGETS += firmware
TARGETS += ftrace
+TARGETS += futex
TARGETS += kcmp
TARGETS += memfd
TARGETS += memory-hotplug
@@ -12,13 +13,18 @@ TARGETS += mqueue
TARGETS += net
TARGETS += powerpc
TARGETS += ptrace
+TARGETS += seccomp
TARGETS += size
TARGETS += sysctl
+ifneq (1, $(quicktest))
TARGETS += timers
+endif
TARGETS += user
TARGETS += vm
TARGETS += x86
#Please keep the TARGETS list alphabetically sorted
+# Run "make quicktest=1 run_tests" or
+# "make quicktest=1 kselftest from top level Makefile
TARGETS_HOTPLUG = cpu-hotplug
TARGETS_HOTPLUG += memory-hotplug
@@ -27,7 +33,7 @@ TARGETS_HOTPLUG += memory-hotplug
# Makefile to avoid test build failures when test
# Makefile doesn't have explicit build rules.
ifeq (1,$(MAKELEVEL))
-undefine LDFLAGS
+override LDFLAGS =
override MAKEFLAGS =
endif
diff --git a/tools/testing/selftests/exec/Makefile b/tools/testing/selftests/exec/Makefile
index 4edb7d0da29b..6b76bfdc847e 100644
--- a/tools/testing/selftests/exec/Makefile
+++ b/tools/testing/selftests/exec/Makefile
@@ -1,6 +1,6 @@
CFLAGS = -Wall
BINARIES = execveat
-DEPS = execveat.symlink execveat.denatured script subdir
+DEPS = execveat.symlink execveat.denatured script
all: $(BINARIES) $(DEPS)
subdir:
diff --git a/tools/testing/selftests/ftrace/Makefile b/tools/testing/selftests/ftrace/Makefile
index 346720639d1d..0acbeca47225 100644
--- a/tools/testing/selftests/ftrace/Makefile
+++ b/tools/testing/selftests/ftrace/Makefile
@@ -1,6 +1,7 @@
all:
TEST_PROGS := ftracetest
+TEST_DIRS := test.d/
include ../lib.mk
diff --git a/tools/testing/selftests/futex/Makefile b/tools/testing/selftests/futex/Makefile
new file mode 100644
index 000000000000..6a1752956283
--- /dev/null
+++ b/tools/testing/selftests/futex/Makefile
@@ -0,0 +1,29 @@
+SUBDIRS := functional
+
+TEST_PROGS := run.sh
+
+.PHONY: all clean
+all:
+ for DIR in $(SUBDIRS); do $(MAKE) -C $$DIR $@ ; done
+
+include ../lib.mk
+
+override define RUN_TESTS
+ ./run.sh
+endef
+
+override define INSTALL_RULE
+ mkdir -p $(INSTALL_PATH)
+ install -t $(INSTALL_PATH) $(TEST_PROGS) $(TEST_PROGS_EXTENDED) $(TEST_FILES)
+
+ @for SUBDIR in $(SUBDIRS); do \
+ $(MAKE) -C $$SUBDIR INSTALL_PATH=$(INSTALL_PATH)/$$SUBDIR install; \
+ done;
+endef
+
+override define EMIT_TESTS
+ echo "./run.sh"
+endef
+
+clean:
+ for DIR in $(SUBDIRS); do $(MAKE) -C $$DIR $@ ; done
diff --git a/tools/testing/selftests/futex/README b/tools/testing/selftests/futex/README
new file mode 100644
index 000000000000..3224a049b196
--- /dev/null
+++ b/tools/testing/selftests/futex/README
@@ -0,0 +1,62 @@
+Futex Test
+==========
+Futex Test is intended to thoroughly test the Linux kernel futex system call
+API.
+
+Functional tests shall test the documented behavior of the futex operation
+code under test. This includes checking for proper behavior under normal use,
+odd corner cases, regression tests, and abject abuse and misuse.
+
+Futextest will also provide example implementations of mutual exclusion
+primitives. These can be used as-is in user applications or can serve as
+examples for system libraries. They will likely be added either to a new lib/
+directory or purely as header files under include/; I'm leaning toward the
+latter.
+
+Quick Start
+-----------
+# make
+# ./run.sh
+
+Design and Implementation Goals
+-------------------------------
+o Tests should be as self-contained as is practical so as to facilitate sharing
+  the individual tests in mailing list discussions and bug reports.
+o The build system shall remain as simple as possible, avoiding any archive or
+ shared object building and linking.
+o Where possible, any helper functions or other package-wide code shall be
+ implemented in header files, avoiding the need to compile intermediate object
+ files.
+o External dependencies shall remain as minimal as possible. Currently gcc
+ and glibc are the only dependencies.
+o Tests return 0 for success and < 0 for failure.
+
+Output Formatting
+-----------------
+Test output shall be easily parsable by both human and machine. Title and
+results are printed to stdout, while intermediate ERROR or FAIL messages are
+sent to stderr. Tests shall support the -c option to print PASS, FAIL, and
+ERROR strings in color for easy visual parsing. Output shall conform to the
+following format:
+
+test_name: Description of the test
+ Arguments: arg1=val1 #units specified for clarity where appropriate
+ ERROR: Description of unexpected error
+ FAIL: Reason for test failure
+ # FIXME: Perhaps an " INFO: informational message" option would be
+	# useful here. Using -v to toggle them on and off, as with -c.
+ # there may be multiple ERROR or FAIL messages
+Result: (PASS|FAIL|ERROR)
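+
+For example, a passing run of futex_wait_timeout would look roughly like:
+
+futex_wait_timeout: Block on a futex and wait for timeout
+	Arguments: timeout=100000ns
+Result: PASS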
+
+Naming
+------
+o FIXME: decide on a sane test naming scheme. Currently the tests are named
+ based on the primary futex operation they test. Eventually this will become a
+ problem as we intend to write multiple tests which collide in this namespace.
+ Perhaps something like "wait-wake-1" "wait-wake-2" is adequate, leaving the
+ detailed description in the test source and the output.
+
+Coding Style
+------------
+o The Futex Test project adheres to the coding standards set forth by the
+  Linux kernel as defined in the Linux source Documentation/CodingStyle.
diff --git a/tools/testing/selftests/futex/functional/.gitignore b/tools/testing/selftests/futex/functional/.gitignore
new file mode 100644
index 000000000000..a09f57061902
--- /dev/null
+++ b/tools/testing/selftests/futex/functional/.gitignore
@@ -0,0 +1,7 @@
+futex_requeue_pi
+futex_requeue_pi_mismatched_ops
+futex_requeue_pi_signal_restart
+futex_wait_private_mapped_file
+futex_wait_timeout
+futex_wait_uninitialized_heap
+futex_wait_wouldblock
diff --git a/tools/testing/selftests/futex/functional/Makefile b/tools/testing/selftests/futex/functional/Makefile
new file mode 100644
index 000000000000..9d6b75ef7b5d
--- /dev/null
+++ b/tools/testing/selftests/futex/functional/Makefile
@@ -0,0 +1,25 @@
+INCLUDES := -I../include -I../../
+CFLAGS := $(CFLAGS) -g -O2 -Wall -D_GNU_SOURCE -pthread $(INCLUDES)
+LDFLAGS := $(LDFLAGS) -pthread -lrt
+
+HEADERS := ../include/futextest.h
+TARGETS := \
+ futex_wait_timeout \
+ futex_wait_wouldblock \
+ futex_requeue_pi \
+ futex_requeue_pi_signal_restart \
+ futex_requeue_pi_mismatched_ops \
+ futex_wait_uninitialized_heap \
+ futex_wait_private_mapped_file
+
+TEST_PROGS := $(TARGETS) run.sh
+
+.PHONY: all clean
+all: $(TARGETS)
+
+$(TARGETS): $(HEADERS)
+
+include ../../lib.mk
+
+clean:
+ rm -f $(TARGETS)
diff --git a/tools/testing/selftests/futex/functional/futex_requeue_pi.c b/tools/testing/selftests/futex/functional/futex_requeue_pi.c
new file mode 100644
index 000000000000..3da06ad23996
--- /dev/null
+++ b/tools/testing/selftests/futex/functional/futex_requeue_pi.c
@@ -0,0 +1,409 @@
+/******************************************************************************
+ *
+ * Copyright © International Business Machines Corp., 2006-2008
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * DESCRIPTION
+ * This test exercises the futex syscall op codes needed for requeuing
+ * priority inheritance aware POSIX condition variables and mutexes.
+ *
+ * AUTHORS
+ * Sripathi Kodi <sripathik@in.ibm.com>
+ * Darren Hart <dvhart@linux.intel.com>
+ *
+ * HISTORY
+ * 2008-Jan-13: Initial version by Sripathi Kodi <sripathik@in.ibm.com>
+ * 2009-Nov-6: futex test adaptation by Darren Hart <dvhart@linux.intel.com>
+ *
+ *****************************************************************************/
+
+#include <errno.h>
+#include <limits.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <string.h>
+#include "atomic.h"
+#include "futextest.h"
+#include "logging.h"
+
+#define MAX_WAKE_ITERS 1000
+#define THREAD_MAX 10
+#define SIGNAL_PERIOD_US 100
+
+atomic_t waiters_blocked = ATOMIC_INITIALIZER;
+atomic_t waiters_woken = ATOMIC_INITIALIZER;
+
+futex_t f1 = FUTEX_INITIALIZER;
+futex_t f2 = FUTEX_INITIALIZER;
+futex_t wake_complete = FUTEX_INITIALIZER;
+
+/* Test option defaults */
+static long timeout_ns;
+static int broadcast;
+static int owner;
+static int locked;
+
+struct thread_arg {
+ long id;
+ struct timespec *timeout;
+ int lock;
+ int ret;
+};
+#define THREAD_ARG_INITIALIZER { 0, NULL, 0, 0 }
+
+void usage(char *prog)
+{
+ printf("Usage: %s\n", prog);
+ printf(" -b Broadcast wakeup (all waiters)\n");
+ printf(" -c Use color\n");
+ printf(" -h Display this help message\n");
+ printf(" -l Lock the pi futex across requeue\n");
+ printf(" -o Use a third party pi futex owner during requeue (cancels -l)\n");
+ printf(" -t N Timeout in nanoseconds (default: 0)\n");
+ printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n",
+ VQUIET, VCRITICAL, VINFO);
+}
+
+int create_rt_thread(pthread_t *pth, void*(*func)(void *), void *arg,
+ int policy, int prio)
+{
+ int ret;
+ struct sched_param schedp;
+ pthread_attr_t attr;
+
+ pthread_attr_init(&attr);
+ memset(&schedp, 0, sizeof(schedp));
+
+ ret = pthread_attr_setinheritsched(&attr, PTHREAD_EXPLICIT_SCHED);
+ if (ret) {
+ error("pthread_attr_setinheritsched\n", ret);
+ return -1;
+ }
+
+ ret = pthread_attr_setschedpolicy(&attr, policy);
+ if (ret) {
+ error("pthread_attr_setschedpolicy\n", ret);
+ return -1;
+ }
+
+ schedp.sched_priority = prio;
+ ret = pthread_attr_setschedparam(&attr, &schedp);
+ if (ret) {
+ error("pthread_attr_setschedparam\n", ret);
+ return -1;
+ }
+
+ ret = pthread_create(pth, &attr, func, arg);
+ if (ret) {
+ error("pthread_create\n", ret);
+ return -1;
+ }
+ return 0;
+}
+
+
+void *waiterfn(void *arg)
+{
+ struct thread_arg *args = (struct thread_arg *)arg;
+ futex_t old_val;
+
+ info("Waiter %ld: running\n", args->id);
+ /* Each thread sleeps for a different amount of time
+ * This is to avoid races, because we don't lock the
+ * external mutex here */
+ usleep(1000 * (long)args->id);
+
+ old_val = f1;
+ atomic_inc(&waiters_blocked);
+ info("Calling futex_wait_requeue_pi: %p (%u) -> %p\n",
+ &f1, f1, &f2);
+ args->ret = futex_wait_requeue_pi(&f1, old_val, &f2, args->timeout,
+ FUTEX_PRIVATE_FLAG);
+
+ info("waiter %ld woke with %d %s\n", args->id, args->ret,
+ args->ret < 0 ? strerror(errno) : "");
+ atomic_inc(&waiters_woken);
+ if (args->ret < 0) {
+ if (args->timeout && errno == ETIMEDOUT)
+ args->ret = 0;
+ else {
+ args->ret = RET_ERROR;
+ error("futex_wait_requeue_pi\n", errno);
+ }
+ futex_lock_pi(&f2, NULL, 0, FUTEX_PRIVATE_FLAG);
+ }
+ futex_unlock_pi(&f2, FUTEX_PRIVATE_FLAG);
+
+ info("Waiter %ld: exiting with %d\n", args->id, args->ret);
+ pthread_exit((void *)&args->ret);
+}
+
+void *broadcast_wakerfn(void *arg)
+{
+ struct thread_arg *args = (struct thread_arg *)arg;
+ int nr_requeue = INT_MAX;
+ int task_count = 0;
+ futex_t old_val;
+ int nr_wake = 1;
+ int i = 0;
+
+ info("Waker: waiting for waiters to block\n");
+ while (waiters_blocked.val < THREAD_MAX)
+ usleep(1000);
+ usleep(1000);
+
+ info("Waker: Calling broadcast\n");
+ if (args->lock) {
+ info("Calling FUTEX_LOCK_PI on mutex=%x @ %p\n", f2, &f2);
+ futex_lock_pi(&f2, NULL, 0, FUTEX_PRIVATE_FLAG);
+ }
+ continue_requeue:
+ old_val = f1;
+ args->ret = futex_cmp_requeue_pi(&f1, old_val, &f2, nr_wake, nr_requeue,
+ FUTEX_PRIVATE_FLAG);
+ if (args->ret < 0) {
+ args->ret = RET_ERROR;
+ error("FUTEX_CMP_REQUEUE_PI failed\n", errno);
+ } else if (++i < MAX_WAKE_ITERS) {
+ task_count += args->ret;
+ if (task_count < THREAD_MAX - waiters_woken.val)
+ goto continue_requeue;
+ } else {
+ error("max broadcast iterations (%d) reached with %d/%d tasks woken or requeued\n",
+ 0, MAX_WAKE_ITERS, task_count, THREAD_MAX);
+ args->ret = RET_ERROR;
+ }
+
+ futex_wake(&wake_complete, 1, FUTEX_PRIVATE_FLAG);
+
+ if (args->lock)
+ futex_unlock_pi(&f2, FUTEX_PRIVATE_FLAG);
+
+ if (args->ret > 0)
+ args->ret = task_count;
+
+ info("Waker: exiting with %d\n", args->ret);
+ pthread_exit((void *)&args->ret);
+}
+
+void *signal_wakerfn(void *arg)
+{
+ struct thread_arg *args = (struct thread_arg *)arg;
+ unsigned int old_val;
+ int nr_requeue = 0;
+ int task_count = 0;
+ int nr_wake = 1;
+ int i = 0;
+
+ info("Waker: waiting for waiters to block\n");
+ while (waiters_blocked.val < THREAD_MAX)
+ usleep(1000);
+ usleep(1000);
+
+ while (task_count < THREAD_MAX && waiters_woken.val < THREAD_MAX) {
+ info("task_count: %d, waiters_woken: %d\n",
+ task_count, waiters_woken.val);
+ if (args->lock) {
+ info("Calling FUTEX_LOCK_PI on mutex=%x @ %p\n",
+ f2, &f2);
+ futex_lock_pi(&f2, NULL, 0, FUTEX_PRIVATE_FLAG);
+ }
+ info("Waker: Calling signal\n");
+ /* cond_signal */
+ old_val = f1;
+ args->ret = futex_cmp_requeue_pi(&f1, old_val, &f2,
+ nr_wake, nr_requeue,
+ FUTEX_PRIVATE_FLAG);
+ if (args->ret < 0)
+ args->ret = -errno;
+ info("futex: %x\n", f2);
+ if (args->lock) {
+ info("Calling FUTEX_UNLOCK_PI on mutex=%x @ %p\n",
+ f2, &f2);
+ futex_unlock_pi(&f2, FUTEX_PRIVATE_FLAG);
+ }
+ info("futex: %x\n", f2);
+ if (args->ret < 0) {
+ error("FUTEX_CMP_REQUEUE_PI failed\n", errno);
+ args->ret = RET_ERROR;
+ break;
+ }
+
+ task_count += args->ret;
+ usleep(SIGNAL_PERIOD_US);
+ i++;
+ /* we have to loop at least THREAD_MAX times */
+ if (i > MAX_WAKE_ITERS + THREAD_MAX) {
+ error("max signaling iterations (%d) reached, giving up on pending waiters.\n",
+ 0, MAX_WAKE_ITERS + THREAD_MAX);
+ args->ret = RET_ERROR;
+ break;
+ }
+ }
+
+ futex_wake(&wake_complete, 1, FUTEX_PRIVATE_FLAG);
+
+ if (args->ret >= 0)
+ args->ret = task_count;
+
+ info("Waker: exiting with %d\n", args->ret);
+ info("Waker: waiters_woken: %d\n", waiters_woken.val);
+ pthread_exit((void *)&args->ret);
+}
+
+void *third_party_blocker(void *arg)
+{
+ struct thread_arg *args = (struct thread_arg *)arg;
+ int ret2 = 0;
+
+ args->ret = futex_lock_pi(&f2, NULL, 0, FUTEX_PRIVATE_FLAG);
+ if (args->ret)
+ goto out;
+ args->ret = futex_wait(&wake_complete, wake_complete, NULL,
+ FUTEX_PRIVATE_FLAG);
+ ret2 = futex_unlock_pi(&f2, FUTEX_PRIVATE_FLAG);
+
+ out:
+ if (args->ret || ret2) {
+ error("third_party_blocker() futex error", 0);
+ args->ret = RET_ERROR;
+ }
+
+ pthread_exit((void *)&args->ret);
+}
+
+int unit_test(int broadcast, long lock, int third_party_owner, long timeout_ns)
+{
+ void *(*wakerfn)(void *) = signal_wakerfn;
+ struct thread_arg blocker_arg = THREAD_ARG_INITIALIZER;
+ struct thread_arg waker_arg = THREAD_ARG_INITIALIZER;
+ pthread_t waiter[THREAD_MAX], waker, blocker;
+ struct timespec ts, *tsp = NULL;
+ struct thread_arg args[THREAD_MAX];
+ int *waiter_ret;
+ int i, ret = RET_PASS;
+
+ if (timeout_ns) {
+ time_t secs;
+
+ info("timeout_ns = %ld\n", timeout_ns);
+ ret = clock_gettime(CLOCK_MONOTONIC, &ts);
+ secs = (ts.tv_nsec + timeout_ns) / 1000000000;
+ ts.tv_nsec = ((int64_t)ts.tv_nsec + timeout_ns) % 1000000000;
+ ts.tv_sec += secs;
+ info("ts.tv_sec = %ld\n", ts.tv_sec);
+ info("ts.tv_nsec = %ld\n", ts.tv_nsec);
+ tsp = &ts;
+ }
+
+ if (broadcast)
+ wakerfn = broadcast_wakerfn;
+
+ if (third_party_owner) {
+ if (create_rt_thread(&blocker, third_party_blocker,
+ (void *)&blocker_arg, SCHED_FIFO, 1)) {
+ error("Creating third party blocker thread failed\n",
+ errno);
+ ret = RET_ERROR;
+ goto out;
+ }
+ }
+
+ atomic_set(&waiters_woken, 0);
+ for (i = 0; i < THREAD_MAX; i++) {
+ args[i].id = i;
+ args[i].timeout = tsp;
+ info("Starting thread %d\n", i);
+ if (create_rt_thread(&waiter[i], waiterfn, (void *)&args[i],
+ SCHED_FIFO, 1)) {
+ error("Creating waiting thread failed\n", errno);
+ ret = RET_ERROR;
+ goto out;
+ }
+ }
+ waker_arg.lock = lock;
+ if (create_rt_thread(&waker, wakerfn, (void *)&waker_arg,
+ SCHED_FIFO, 1)) {
+ error("Creating waker thread failed\n", errno);
+ ret = RET_ERROR;
+ goto out;
+ }
+
+ /* Wait for threads to finish */
+ /* Store the first error or failure encountered in waiter_ret */
+ waiter_ret = &args[0].ret;
+ for (i = 0; i < THREAD_MAX; i++)
+ pthread_join(waiter[i],
+ *waiter_ret ? NULL : (void **)&waiter_ret);
+
+ if (third_party_owner)
+ pthread_join(blocker, NULL);
+ pthread_join(waker, NULL);
+
+out:
+ if (!ret) {
+ if (*waiter_ret)
+ ret = *waiter_ret;
+ else if (waker_arg.ret < 0)
+ ret = waker_arg.ret;
+ else if (blocker_arg.ret)
+ ret = blocker_arg.ret;
+ }
+
+ return ret;
+}
+
+int main(int argc, char *argv[])
+{
+ int c, ret;
+
+ while ((c = getopt(argc, argv, "bchlot:v:")) != -1) {
+ switch (c) {
+ case 'b':
+ broadcast = 1;
+ break;
+ case 'c':
+ log_color(1);
+ break;
+ case 'h':
+ usage(basename(argv[0]));
+ exit(0);
+ case 'l':
+ locked = 1;
+ break;
+ case 'o':
+ owner = 1;
+ locked = 0;
+ break;
+ case 't':
+ timeout_ns = atoi(optarg);
+ break;
+ case 'v':
+ log_verbosity(atoi(optarg));
+ break;
+ default:
+ usage(basename(argv[0]));
+ exit(1);
+ }
+ }
+
+ printf("%s: Test requeue functionality\n", basename(argv[0]));
+ printf("\tArguments: broadcast=%d locked=%d owner=%d timeout=%ldns\n",
+ broadcast, locked, owner, timeout_ns);
+
+ /*
+ * FIXME: unit_test is obsolete now that we parse options and the
+ * various style of runs are done by run.sh - simplify the code and move
+ * unit_test into main()
+ */
+ ret = unit_test(broadcast, locked, owner, timeout_ns);
+
+ print_result(ret);
+ return ret;
+}
diff --git a/tools/testing/selftests/futex/functional/futex_requeue_pi_mismatched_ops.c b/tools/testing/selftests/futex/functional/futex_requeue_pi_mismatched_ops.c
new file mode 100644
index 000000000000..d5e4f2c4da2a
--- /dev/null
+++ b/tools/testing/selftests/futex/functional/futex_requeue_pi_mismatched_ops.c
@@ -0,0 +1,135 @@
+/******************************************************************************
+ *
+ * Copyright © International Business Machines Corp., 2009
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * DESCRIPTION
+ * 1. Block a thread using FUTEX_WAIT
+ * 2. Attempt to use FUTEX_CMP_REQUEUE_PI on the futex from 1.
+ * 3. The kernel must detect the mismatch and return -EINVAL.
+ *
+ * AUTHOR
+ * Darren Hart <dvhart@linux.intel.com>
+ *
+ * HISTORY
+ * 2009-Nov-9: Initial version by Darren Hart <dvhart@linux.intel.com>
+ *
+ *****************************************************************************/
+
+#include <errno.h>
+#include <getopt.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include "futextest.h"
+#include "logging.h"
+
+futex_t f1 = FUTEX_INITIALIZER;
+futex_t f2 = FUTEX_INITIALIZER;
+int child_ret = 0;
+
+void usage(char *prog)
+{
+ printf("Usage: %s\n", prog);
+ printf(" -c Use color\n");
+ printf(" -h Display this help message\n");
+ printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n",
+ VQUIET, VCRITICAL, VINFO);
+}
+
+void *blocking_child(void *arg)
+{
+ child_ret = futex_wait(&f1, f1, NULL, FUTEX_PRIVATE_FLAG);
+ if (child_ret < 0) {
+ child_ret = -errno;
+ error("futex_wait\n", errno);
+ }
+ return (void *)&child_ret;
+}
+
+int main(int argc, char *argv[])
+{
+ int ret = RET_PASS;
+ pthread_t child;
+ int c;
+
+ while ((c = getopt(argc, argv, "chv:")) != -1) {
+ switch (c) {
+ case 'c':
+ log_color(1);
+ break;
+ case 'h':
+ usage(basename(argv[0]));
+ exit(0);
+ case 'v':
+ log_verbosity(atoi(optarg));
+ break;
+ default:
+ usage(basename(argv[0]));
+ exit(1);
+ }
+ }
+
+ printf("%s: Detect mismatched requeue_pi operations\n",
+ basename(argv[0]));
+
+ if (pthread_create(&child, NULL, blocking_child, NULL)) {
+ error("pthread_create\n", errno);
+ ret = RET_ERROR;
+ goto out;
+ }
+ /* Allow the child to block in the kernel. */
+ sleep(1);
+
+ /*
+ * The kernel should detect the waiter did not setup the
+ * q->requeue_pi_key and return -EINVAL. If it does not,
+ * it likely gave the lock to the child, which is now hung
+ * in the kernel.
+ */
+ ret = futex_cmp_requeue_pi(&f1, f1, &f2, 1, 0, FUTEX_PRIVATE_FLAG);
+ if (ret < 0) {
+ if (errno == EINVAL) {
+ /*
+ * The kernel correctly detected the mismatched
+ * requeue_pi target and aborted. Wake the child with
+ * FUTEX_WAKE.
+ */
+ ret = futex_wake(&f1, 1, FUTEX_PRIVATE_FLAG);
+ if (ret == 1) {
+ ret = RET_PASS;
+ } else if (ret < 0) {
+ error("futex_wake\n", errno);
+ ret = RET_ERROR;
+ } else {
+ error("futex_wake did not wake the child\n", 0);
+ ret = RET_ERROR;
+ }
+ } else {
+ error("futex_cmp_requeue_pi\n", errno);
+ ret = RET_ERROR;
+ }
+ } else if (ret > 0) {
+ fail("futex_cmp_requeue_pi failed to detect the mismatch\n");
+ ret = RET_FAIL;
+ } else {
+ error("futex_cmp_requeue_pi found no waiters\n", 0);
+ ret = RET_ERROR;
+ }
+
+ pthread_join(child, NULL);
+
+ if (!ret)
+ ret = child_ret;
+
+ out:
+ /* If the kernel crashes, we shouldn't return at all. */
+ print_result(ret);
+ return ret;
+}
diff --git a/tools/testing/selftests/futex/functional/futex_requeue_pi_signal_restart.c b/tools/testing/selftests/futex/functional/futex_requeue_pi_signal_restart.c
new file mode 100644
index 000000000000..7f0c756993af
--- /dev/null
+++ b/tools/testing/selftests/futex/functional/futex_requeue_pi_signal_restart.c
@@ -0,0 +1,223 @@
+/******************************************************************************
+ *
+ * Copyright © International Business Machines Corp., 2006-2008
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * DESCRIPTION
+ * This test exercises the futex_wait_requeue_pi() signal handling both
+ * before and after the requeue. The first should be restarted by the
+ * kernel. The latter should return EWOULDBLOCK to the waiter.
+ *
+ * AUTHORS
+ * Darren Hart <dvhart@linux.intel.com>
+ *
+ * HISTORY
+ * 2008-May-5: Initial version by Darren Hart <dvhart@linux.intel.com>
+ *
+ *****************************************************************************/
+
+#include <errno.h>
+#include <getopt.h>
+#include <limits.h>
+#include <pthread.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "atomic.h"
+#include "futextest.h"
+#include "logging.h"
+
+#define DELAY_US 100
+
+futex_t f1 = FUTEX_INITIALIZER;
+futex_t f2 = FUTEX_INITIALIZER;
+atomic_t requeued = ATOMIC_INITIALIZER;
+
+int waiter_ret = 0;
+
+void usage(char *prog)
+{
+ printf("Usage: %s\n", prog);
+ printf(" -c Use color\n");
+ printf(" -h Display this help message\n");
+ printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n",
+ VQUIET, VCRITICAL, VINFO);
+}
+
+int create_rt_thread(pthread_t *pth, void*(*func)(void *), void *arg,
+ int policy, int prio)
+{
+ struct sched_param schedp;
+ pthread_attr_t attr;
+ int ret;
+
+ pthread_attr_init(&attr);
+ memset(&schedp, 0, sizeof(schedp));
+
+ ret = pthread_attr_setinheritsched(&attr, PTHREAD_EXPLICIT_SCHED);
+ if (ret) {
+ error("pthread_attr_setinheritsched\n", ret);
+ return -1;
+ }
+
+ ret = pthread_attr_setschedpolicy(&attr, policy);
+ if (ret) {
+ error("pthread_attr_setschedpolicy\n", ret);
+ return -1;
+ }
+
+ schedp.sched_priority = prio;
+ ret = pthread_attr_setschedparam(&attr, &schedp);
+ if (ret) {
+ error("pthread_attr_setschedparam\n", ret);
+ return -1;
+ }
+
+ ret = pthread_create(pth, &attr, func, arg);
+ if (ret) {
+ error("pthread_create\n", ret);
+ return -1;
+ }
+ return 0;
+}
+
+void handle_signal(int signo)
+{
+ info("signal received %s requeue\n",
+ requeued.val ? "after" : "prior to");
+}
+
+void *waiterfn(void *arg)
+{
+ unsigned int old_val;
+ int res;
+
+ waiter_ret = RET_PASS;
+
+ info("Waiter running\n");
+	info("Calling futex_wait_requeue_pi: %p (%u) -> %p\n", &f1, f1, &f2);
+ old_val = f1;
+ res = futex_wait_requeue_pi(&f1, old_val, &(f2), NULL,
+ FUTEX_PRIVATE_FLAG);
+ if (!requeued.val || errno != EWOULDBLOCK) {
+ fail("unexpected return from futex_wait_requeue_pi: %d (%s)\n",
+ res, strerror(errno));
+ info("w2:futex: %x\n", f2);
+ if (!res)
+ futex_unlock_pi(&f2, FUTEX_PRIVATE_FLAG);
+ waiter_ret = RET_FAIL;
+ }
+
+ info("Waiter exiting with %d\n", waiter_ret);
+ pthread_exit(NULL);
+}
+
+
+int main(int argc, char *argv[])
+{
+ unsigned int old_val;
+ struct sigaction sa;
+ pthread_t waiter;
+ int c, res, ret = RET_PASS;
+
+ while ((c = getopt(argc, argv, "chv:")) != -1) {
+ switch (c) {
+ case 'c':
+ log_color(1);
+ break;
+ case 'h':
+ usage(basename(argv[0]));
+ exit(0);
+ case 'v':
+ log_verbosity(atoi(optarg));
+ break;
+ default:
+ usage(basename(argv[0]));
+ exit(1);
+ }
+ }
+
+ printf("%s: Test signal handling during requeue_pi\n",
+ basename(argv[0]));
+ printf("\tArguments: <none>\n");
+
+ sa.sa_handler = handle_signal;
+ sigemptyset(&sa.sa_mask);
+ sa.sa_flags = 0;
+ if (sigaction(SIGUSR1, &sa, NULL)) {
+ error("sigaction\n", errno);
+ exit(1);
+ }
+
+ info("m1:f2: %x\n", f2);
+ info("Creating waiter\n");
+ res = create_rt_thread(&waiter, waiterfn, NULL, SCHED_FIFO, 1);
+ if (res) {
+ error("Creating waiting thread failed", res);
+ ret = RET_ERROR;
+ goto out;
+ }
+
+ info("Calling FUTEX_LOCK_PI on f2=%x @ %p\n", f2, &f2);
+ info("m2:f2: %x\n", f2);
+	futex_lock_pi(&f2, NULL, 0, FUTEX_PRIVATE_FLAG);
+ info("m3:f2: %x\n", f2);
+
+ while (1) {
+ /*
+ * signal the waiter before requeue, waiter should automatically
+ * restart futex_wait_requeue_pi() in the kernel. Wait for the
+ * waiter to block on f1 again.
+ */
+ info("Issuing SIGUSR1 to waiter\n");
+ pthread_kill(waiter, SIGUSR1);
+ usleep(DELAY_US);
+
+ info("Requeueing waiter via FUTEX_CMP_REQUEUE_PI\n");
+ old_val = f1;
+ res = futex_cmp_requeue_pi(&f1, old_val, &(f2), 1, 0,
+ FUTEX_PRIVATE_FLAG);
+ /*
+ * If res is non-zero, we either requeued the waiter or hit an
+ * error, break out and handle it. If it is zero, then the
+		 * signal may have hit before the waiter was blocked on f1.
+ * Try again.
+ */
+ if (res > 0) {
+ atomic_set(&requeued, 1);
+ break;
+		} else if (res < 0) {
+ error("FUTEX_CMP_REQUEUE_PI failed\n", errno);
+ ret = RET_ERROR;
+ break;
+ }
+ }
+ info("m4:f2: %x\n", f2);
+
+ /*
+ * Signal the waiter after requeue, waiter should return from
+ * futex_wait_requeue_pi() with EWOULDBLOCK. Join the thread here so the
+ * futex_unlock_pi() can't happen before the signal wakeup is detected
+ * in the kernel.
+ */
+ info("Issuing SIGUSR1 to waiter\n");
+ pthread_kill(waiter, SIGUSR1);
+ info("Waiting for waiter to return\n");
+ pthread_join(waiter, NULL);
+
+ info("Calling FUTEX_UNLOCK_PI on mutex=%x @ %p\n", f2, &f2);
+ futex_unlock_pi(&f2, FUTEX_PRIVATE_FLAG);
+ info("m5:f2: %x\n", f2);
+
+ out:
+ if (ret == RET_PASS && waiter_ret)
+ ret = waiter_ret;
+
+ print_result(ret);
+ return ret;
+}
diff --git a/tools/testing/selftests/futex/functional/futex_wait_private_mapped_file.c b/tools/testing/selftests/futex/functional/futex_wait_private_mapped_file.c
new file mode 100644
index 000000000000..5f687f247454
--- /dev/null
+++ b/tools/testing/selftests/futex/functional/futex_wait_private_mapped_file.c
@@ -0,0 +1,125 @@
+/******************************************************************************
+ *
+ * Copyright FUJITSU LIMITED 2010
+ * Copyright KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * DESCRIPTION
+ * Internally, futex has two handling modes, anon and file. The private file
+ * mapping is special: at first it behaves as file, but once anything is
+ * written to it, it behaves as anon. This test is intended to cover that case.
+ *
+ * AUTHOR
+ * KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
+ *
+ * HISTORY
+ * 2010-Jan-6: Initial version by KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
+ *
+ *****************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <syscall.h>
+#include <unistd.h>
+#include <errno.h>
+#include <linux/futex.h>
+#include <pthread.h>
+#include <libgen.h>
+#include <signal.h>
+
+#include "logging.h"
+#include "futextest.h"
+
+#define PAGE_SZ 4096
+
+char pad[PAGE_SZ] = {1};
+futex_t val = 1;
+char pad2[PAGE_SZ] = {1};
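+
+/*
+ * pad, val and pad2 are initialized globals, so they should land in the
+ * binary's .data segment, i.e. a MAP_PRIVATE file mapping of the
+ * executable.  Writing to val later in main() forces a copy-on-write,
+ * after which the page is handled as anon: the file-then-anon transition
+ * described above.
+ */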
+
+#define WAKE_WAIT_US 3000000
+struct timespec wait_timeout = { .tv_sec = 5, .tv_nsec = 0};
+
+void usage(char *prog)
+{
+ printf("Usage: %s\n", prog);
+ printf(" -c Use color\n");
+ printf(" -h Display this help message\n");
+ printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n",
+ VQUIET, VCRITICAL, VINFO);
+}
+
+void *thr_futex_wait(void *arg)
+{
+ int ret;
+
+ info("futex wait\n");
+ ret = futex_wait(&val, 1, &wait_timeout, 0);
+ if (ret && errno != EWOULDBLOCK && errno != ETIMEDOUT) {
+ error("futex error.\n", errno);
+ print_result(RET_ERROR);
+ exit(RET_ERROR);
+ }
+
+ if (ret && errno == ETIMEDOUT)
+		fail("waiter timed out\n");
+
+ info("futex_wait: ret = %d, errno = %d\n", ret, errno);
+
+ return NULL;
+}
+
+int main(int argc, char **argv)
+{
+ pthread_t thr;
+ int ret = RET_PASS;
+ int res;
+ int c;
+
+ while ((c = getopt(argc, argv, "chv:")) != -1) {
+ switch (c) {
+ case 'c':
+ log_color(1);
+ break;
+ case 'h':
+ usage(basename(argv[0]));
+ exit(0);
+ case 'v':
+ log_verbosity(atoi(optarg));
+ break;
+ default:
+ usage(basename(argv[0]));
+ exit(1);
+ }
+ }
+
+ printf("%s: Test the futex value of private file mappings in FUTEX_WAIT\n",
+ basename(argv[0]));
+
+ ret = pthread_create(&thr, NULL, thr_futex_wait, NULL);
+	if (ret) {
+ fprintf(stderr, "pthread_create error\n");
+ ret = RET_ERROR;
+ goto out;
+ }
+
+ info("wait a while\n");
+ usleep(WAKE_WAIT_US);
+ val = 2;
+ res = futex_wake(&val, 1, 0);
+ info("futex_wake %d\n", res);
+ if (res != 1) {
+ fail("FUTEX_WAKE didn't find the waiting thread.\n");
+ ret = RET_FAIL;
+ }
+
+ info("join\n");
+ pthread_join(thr, NULL);
+
+ out:
+ print_result(ret);
+ return ret;
+}
diff --git a/tools/testing/selftests/futex/functional/futex_wait_timeout.c b/tools/testing/selftests/futex/functional/futex_wait_timeout.c
new file mode 100644
index 000000000000..ab428ca894de
--- /dev/null
+++ b/tools/testing/selftests/futex/functional/futex_wait_timeout.c
@@ -0,0 +1,86 @@
+/******************************************************************************
+ *
+ * Copyright © International Business Machines Corp., 2009
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * DESCRIPTION
+ * Block on a futex and wait for timeout.
+ *
+ * AUTHOR
+ * Darren Hart <dvhart@linux.intel.com>
+ *
+ * HISTORY
+ * 2009-Nov-6: Initial version by Darren Hart <dvhart@linux.intel.com>
+ *
+ *****************************************************************************/
+
+#include <errno.h>
+#include <getopt.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include "futextest.h"
+#include "logging.h"
+
+static long timeout_ns = 100000; /* 100us default timeout */
+
+void usage(char *prog)
+{
+ printf("Usage: %s\n", prog);
+ printf(" -c Use color\n");
+ printf(" -h Display this help message\n");
+ printf(" -t N Timeout in nanoseconds (default: 100,000)\n");
+ printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n",
+ VQUIET, VCRITICAL, VINFO);
+}
+
+int main(int argc, char *argv[])
+{
+ futex_t f1 = FUTEX_INITIALIZER;
+ struct timespec to;
+ int res, ret = RET_PASS;
+ int c;
+
+ while ((c = getopt(argc, argv, "cht:v:")) != -1) {
+ switch (c) {
+ case 'c':
+ log_color(1);
+ break;
+ case 'h':
+ usage(basename(argv[0]));
+ exit(0);
+ case 't':
+ timeout_ns = atoi(optarg);
+ break;
+ case 'v':
+ log_verbosity(atoi(optarg));
+ break;
+ default:
+ usage(basename(argv[0]));
+ exit(1);
+ }
+ }
+
+ printf("%s: Block on a futex and wait for timeout\n",
+ basename(argv[0]));
+ printf("\tArguments: timeout=%ldns\n", timeout_ns);
+
+ /* initialize timeout */
+ to.tv_sec = 0;
+ to.tv_nsec = timeout_ns;
+
+ info("Calling futex_wait on f1: %u @ %p\n", f1, &f1);
+ res = futex_wait(&f1, f1, &to, FUTEX_PRIVATE_FLAG);
+ if (!res || errno != ETIMEDOUT) {
+		fail("futex_wait returned %d\n", res < 0 ? errno : res);
+ ret = RET_FAIL;
+ }
+
+ print_result(ret);
+ return ret;
+}
diff --git a/tools/testing/selftests/futex/functional/futex_wait_uninitialized_heap.c b/tools/testing/selftests/futex/functional/futex_wait_uninitialized_heap.c
new file mode 100644
index 000000000000..fe7aee96844b
--- /dev/null
+++ b/tools/testing/selftests/futex/functional/futex_wait_uninitialized_heap.c
@@ -0,0 +1,124 @@
+/******************************************************************************
+ *
+ * Copyright FUJITSU LIMITED 2010
+ * Copyright KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * DESCRIPTION
+ * Wait on uninitialized heap. It should be zero and FUTEX_WAIT should
+ * return immediately. This test is intended to test zero page handling in
+ * futex.
+ *
+ * AUTHOR
+ * KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
+ *
+ * HISTORY
+ * 2010-Jan-6: Initial version by KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
+ *
+ *****************************************************************************/
+
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <syscall.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <errno.h>
+#include <linux/futex.h>
+#include <libgen.h>
+
+#include "logging.h"
+#include "futextest.h"
+
+#define WAIT_US 5000000
+
+static int child_blocked = 1;
+static int child_ret;
+void *buf;
+
+void usage(char *prog)
+{
+ printf("Usage: %s\n", prog);
+ printf(" -c Use color\n");
+ printf(" -h Display this help message\n");
+ printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n",
+ VQUIET, VCRITICAL, VINFO);
+}
+
+void *wait_thread(void *arg)
+{
+ int res;
+
+ child_ret = RET_PASS;
+ res = futex_wait(buf, 1, NULL, 0);
+ child_blocked = 0;
+
+ if (res != 0 && errno != EWOULDBLOCK) {
+ error("futex failure\n", errno);
+ child_ret = RET_ERROR;
+ }
+ pthread_exit(NULL);
+}
+
+int main(int argc, char **argv)
+{
+ int c, ret = RET_PASS;
+ long page_size;
+ pthread_t thr;
+
+ while ((c = getopt(argc, argv, "chv:")) != -1) {
+ switch (c) {
+ case 'c':
+ log_color(1);
+ break;
+ case 'h':
+ usage(basename(argv[0]));
+ exit(0);
+ case 'v':
+ log_verbosity(atoi(optarg));
+ break;
+ default:
+ usage(basename(argv[0]));
+ exit(1);
+ }
+ }
+
+ page_size = sysconf(_SC_PAGESIZE);
+
+ buf = mmap(NULL, page_size, PROT_READ|PROT_WRITE,
+		   MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+	if (buf == MAP_FAILED) {
+ error("mmap\n", errno);
+ exit(1);
+ }
+
+ printf("%s: Test the uninitialized futex value in FUTEX_WAIT\n",
+ basename(argv[0]));
+
+
+ ret = pthread_create(&thr, NULL, wait_thread, NULL);
+ if (ret) {
+ error("pthread_create\n", errno);
+ ret = RET_ERROR;
+ goto out;
+ }
+
+ info("waiting %dus for child to return\n", WAIT_US);
+ usleep(WAIT_US);
+
+ ret = child_ret;
+ if (child_blocked) {
+ fail("child blocked in kernel\n");
+ ret = RET_FAIL;
+ }
+
+ out:
+ print_result(ret);
+ return ret;
+}
diff --git a/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c b/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c
new file mode 100644
index 000000000000..b6b027448825
--- /dev/null
+++ b/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c
@@ -0,0 +1,79 @@
+/******************************************************************************
+ *
+ * Copyright © International Business Machines Corp., 2009
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * DESCRIPTION
+ * Test if FUTEX_WAIT op returns -EWOULDBLOCK if the futex value differs
+ * from the expected one.
+ *
+ * AUTHOR
+ * Gowrishankar <gowrishankar.m@in.ibm.com>
+ *
+ * HISTORY
+ * 2009-Nov-14: Initial version by Gowrishankar <gowrishankar.m@in.ibm.com>
+ *
+ *****************************************************************************/
+
+#include <errno.h>
+#include <getopt.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include "futextest.h"
+#include "logging.h"
+
+#define timeout_ns 100000
+
+void usage(char *prog)
+{
+ printf("Usage: %s\n", prog);
+ printf(" -c Use color\n");
+ printf(" -h Display this help message\n");
+ printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n",
+ VQUIET, VCRITICAL, VINFO);
+}
+
+int main(int argc, char *argv[])
+{
+ struct timespec to = {.tv_sec = 0, .tv_nsec = timeout_ns};
+ futex_t f1 = FUTEX_INITIALIZER;
+ int res, ret = RET_PASS;
+ int c;
+
+ while ((c = getopt(argc, argv, "cht:v:")) != -1) {
+ switch (c) {
+ case 'c':
+ log_color(1);
+ break;
+ case 'h':
+ usage(basename(argv[0]));
+ exit(0);
+ case 'v':
+ log_verbosity(atoi(optarg));
+ break;
+ default:
+ usage(basename(argv[0]));
+ exit(1);
+ }
+ }
+
+ printf("%s: Test the unexpected futex value in FUTEX_WAIT\n",
+ basename(argv[0]));
+
+ info("Calling futex_wait on f1: %u @ %p with val=%u\n", f1, &f1, f1+1);
+ res = futex_wait(&f1, f1+1, &to, FUTEX_PRIVATE_FLAG);
+ if (!res || errno != EWOULDBLOCK) {
+ fail("futex_wait returned: %d %s\n",
+ res ? errno : res, res ? strerror(errno) : "");
+ ret = RET_FAIL;
+ }
+
+ print_result(ret);
+ return ret;
+}
diff --git a/tools/testing/selftests/futex/functional/run.sh b/tools/testing/selftests/futex/functional/run.sh
new file mode 100755
index 000000000000..e87dbe2a0b0d
--- /dev/null
+++ b/tools/testing/selftests/futex/functional/run.sh
@@ -0,0 +1,79 @@
+#!/bin/sh
+
+###############################################################################
+#
+# Copyright © International Business Machines Corp., 2009
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# DESCRIPTION
+# Run tests in the current directory.
+#
+# AUTHOR
+# Darren Hart <dvhart@linux.intel.com>
+#
+# HISTORY
+# 2009-Nov-9: Initial version by Darren Hart <dvhart@linux.intel.com>
+# 2010-Jan-6: Add futex_wait_uninitialized_heap and futex_wait_private_mapped_file
+# by KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
+#
+###############################################################################
+
+# Test for a color capable console
+if [ -z "$USE_COLOR" ]; then
+ tput setf 7
+ if [ $? -eq 0 ]; then
+ USE_COLOR=1
+ tput sgr0
+ fi
+fi
+if [ "$USE_COLOR" -eq 1 ]; then
+ COLOR="-c"
+fi
+
+
+echo
+# requeue pi testing
+# without timeouts
+./futex_requeue_pi $COLOR
+./futex_requeue_pi $COLOR -b
+./futex_requeue_pi $COLOR -b -l
+./futex_requeue_pi $COLOR -b -o
+./futex_requeue_pi $COLOR -l
+./futex_requeue_pi $COLOR -o
+# with timeouts
+./futex_requeue_pi $COLOR -b -l -t 5000
+./futex_requeue_pi $COLOR -l -t 5000
+./futex_requeue_pi $COLOR -b -l -t 500000
+./futex_requeue_pi $COLOR -l -t 500000
+./futex_requeue_pi $COLOR -b -t 5000
+./futex_requeue_pi $COLOR -t 5000
+./futex_requeue_pi $COLOR -b -t 500000
+./futex_requeue_pi $COLOR -t 500000
+./futex_requeue_pi $COLOR -b -o -t 5000
+./futex_requeue_pi $COLOR -o -t 5000
+./futex_requeue_pi $COLOR -b -o -t 500000
+./futex_requeue_pi $COLOR -o -t 500000
+# with long timeout
+./futex_requeue_pi $COLOR -b -l -t 2000000000
+./futex_requeue_pi $COLOR -l -t 2000000000
+
+
+echo
+./futex_requeue_pi_mismatched_ops $COLOR
+
+echo
+./futex_requeue_pi_signal_restart $COLOR
+
+echo
+./futex_wait_timeout $COLOR
+
+echo
+./futex_wait_wouldblock $COLOR
+
+echo
+./futex_wait_uninitialized_heap $COLOR
+./futex_wait_private_mapped_file $COLOR
diff --git a/tools/testing/selftests/futex/include/atomic.h b/tools/testing/selftests/futex/include/atomic.h
new file mode 100644
index 000000000000..f861da3e31ab
--- /dev/null
+++ b/tools/testing/selftests/futex/include/atomic.h
@@ -0,0 +1,83 @@
+/******************************************************************************
+ *
+ * Copyright © International Business Machines Corp., 2009
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * DESCRIPTION
+ * GCC atomic builtin wrappers
+ * http://gcc.gnu.org/onlinedocs/gcc-4.1.0/gcc/Atomic-Builtins.html
+ *
+ * AUTHOR
+ * Darren Hart <dvhart@linux.intel.com>
+ *
+ * HISTORY
+ * 2009-Nov-17: Initial version by Darren Hart <dvhart@linux.intel.com>
+ *
+ *****************************************************************************/
+
+#ifndef _ATOMIC_H
+#define _ATOMIC_H
+
+typedef struct {
+ volatile int val;
+} atomic_t;
+
+#define ATOMIC_INITIALIZER { 0 }
+
+/**
+ * atomic_cmpxchg() - Atomic compare and exchange
+ * @addr: The address of the atomic_t to be modified
+ * @oldval: The expected value of the atomic_t
+ * @newval: The new value to try and assign the atomic_t
+ *
+ * Return the old value of addr->val.
+ */
+static inline int
+atomic_cmpxchg(atomic_t *addr, int oldval, int newval)
+{
+ return __sync_val_compare_and_swap(&addr->val, oldval, newval);
+}
+
+/**
+ * atomic_inc() - Atomic increment
+ * @addr: Address of the variable to increment
+ *
+ * Return the new value of addr->val.
+ */
+static inline int
+atomic_inc(atomic_t *addr)
+{
+ return __sync_add_and_fetch(&addr->val, 1);
+}
+
+/**
+ * atomic_dec() - Atomic decrement
+ * @addr: Address of the variable to decrement
+ *
+ * Return the new value of addr->val.
+ */
+static inline int
+atomic_dec(atomic_t *addr)
+{
+ return __sync_sub_and_fetch(&addr->val, 1);
+}
+
+/**
+ * atomic_set() - Atomic set
+ * @addr: Address of the variable to set
+ * @newval: New value for the atomic_t
+ *
+ * Return the new value of addr->val.
+ */
+static inline int
+atomic_set(atomic_t *addr, int newval)
+{
+ addr->val = newval;
+ return newval;
+}
+
+#endif
diff --git a/tools/testing/selftests/futex/include/futextest.h b/tools/testing/selftests/futex/include/futextest.h
new file mode 100644
index 000000000000..b98c3aba7102
--- /dev/null
+++ b/tools/testing/selftests/futex/include/futextest.h
@@ -0,0 +1,266 @@
+/******************************************************************************
+ *
+ * Copyright © International Business Machines Corp., 2009
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * DESCRIPTION
+ * Glibc independent futex library for testing kernel functionality.
+ *
+ * AUTHOR
+ * Darren Hart <dvhart@linux.intel.com>
+ *
+ * HISTORY
+ * 2009-Nov-6: Initial version by Darren Hart <dvhart@linux.intel.com>
+ *
+ *****************************************************************************/
+
+#ifndef _FUTEXTEST_H
+#define _FUTEXTEST_H
+
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <linux/futex.h>
+
+typedef volatile u_int32_t futex_t;
+#define FUTEX_INITIALIZER 0
+
+/* Define the newer op codes if the system header file is not up to date. */
+#ifndef FUTEX_WAIT_BITSET
+#define FUTEX_WAIT_BITSET 9
+#endif
+#ifndef FUTEX_WAKE_BITSET
+#define FUTEX_WAKE_BITSET 10
+#endif
+#ifndef FUTEX_WAIT_REQUEUE_PI
+#define FUTEX_WAIT_REQUEUE_PI 11
+#endif
+#ifndef FUTEX_CMP_REQUEUE_PI
+#define FUTEX_CMP_REQUEUE_PI 12
+#endif
+#ifndef FUTEX_WAIT_REQUEUE_PI_PRIVATE
+#define FUTEX_WAIT_REQUEUE_PI_PRIVATE (FUTEX_WAIT_REQUEUE_PI | \
+ FUTEX_PRIVATE_FLAG)
+#endif
+#ifndef FUTEX_CMP_REQUEUE_PI_PRIVATE
+#define FUTEX_CMP_REQUEUE_PI_PRIVATE (FUTEX_CMP_REQUEUE_PI | \
+ FUTEX_PRIVATE_FLAG)
+#endif
+
+/**
+ * futex() - SYS_futex syscall wrapper
+ * @uaddr: address of first futex
+ * @op: futex op code
+ * @val: typically expected value of uaddr, but varies by op
+ * @timeout: typically an absolute struct timespec (except where noted
+ * otherwise). Overloaded by some ops
+ * @uaddr2: address of second futex for some ops
+ * @val3: varies by op
+ * @opflags: flags to be bitwise OR'd with op, such as FUTEX_PRIVATE_FLAG
+ *
+ * futex() is used by all the following futex op wrappers. It can also be
+ * used for misuse and abuse testing. Generally, the specific op wrappers
+ * should be used instead. It is a macro instead of a static inline function as
+ * some of the types are overloaded (timeout is used for nr_requeue, for
+ * example).
+ *
+ * These argument descriptions are the defaults for all
+ * like-named arguments in the following wrappers except where noted below.
+ */
+#define futex(uaddr, op, val, timeout, uaddr2, val3, opflags) \
+ syscall(SYS_futex, uaddr, op | opflags, val, timeout, uaddr2, val3)
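+
+/*
+ * Illustrative only: because the argument slots are overloaded, a raw
+ * FUTEX_CMP_REQUEUE call passes nr_requeue in the timeout slot, e.g.:
+ *
+ *   futex(uaddr, FUTEX_CMP_REQUEUE, nr_wake, nr_requeue, uaddr2, val,
+ *         FUTEX_PRIVATE_FLAG);
+ *
+ * The typed wrappers below hide this overloading.
+ */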
+
+/**
+ * futex_wait() - block on uaddr with optional timeout
+ * @timeout: relative timeout
+ */
+static inline int
+futex_wait(futex_t *uaddr, futex_t val, struct timespec *timeout, int opflags)
+{
+ return futex(uaddr, FUTEX_WAIT, val, timeout, NULL, 0, opflags);
+}
+
+/**
+ * futex_wake() - wake one or more tasks blocked on uaddr
+ * @nr_wake: wake up to this many tasks
+ */
+static inline int
+futex_wake(futex_t *uaddr, int nr_wake, int opflags)
+{
+ return futex(uaddr, FUTEX_WAKE, nr_wake, NULL, NULL, 0, opflags);
+}
+
+/**
+ * futex_wait_bitset() - block on uaddr with bitset
+ * @bitset: bitset to be used with futex_wake_bitset
+ */
+static inline int
+futex_wait_bitset(futex_t *uaddr, futex_t val, struct timespec *timeout,
+ u_int32_t bitset, int opflags)
+{
+ return futex(uaddr, FUTEX_WAIT_BITSET, val, timeout, NULL, bitset,
+ opflags);
+}
+
+/**
+ * futex_wake_bitset() - wake one or more tasks blocked on uaddr with bitset
+ * @bitset: bitset to compare with that used in futex_wait_bitset
+ */
+static inline int
+futex_wake_bitset(futex_t *uaddr, int nr_wake, u_int32_t bitset, int opflags)
+{
+ return futex(uaddr, FUTEX_WAKE_BITSET, nr_wake, NULL, NULL, bitset,
+ opflags);
+}
+
+/**
+ * futex_lock_pi() - block on uaddr as a PI mutex
+ * @detect: whether (1) or not (0) to perform deadlock detection
+ */
+static inline int
+futex_lock_pi(futex_t *uaddr, struct timespec *timeout, int detect,
+ int opflags)
+{
+ return futex(uaddr, FUTEX_LOCK_PI, detect, timeout, NULL, 0, opflags);
+}
+
+/**
+ * futex_unlock_pi() - release uaddr as a PI mutex, waking the top waiter
+ */
+static inline int
+futex_unlock_pi(futex_t *uaddr, int opflags)
+{
+ return futex(uaddr, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0, opflags);
+}
+
+/**
+ * futex_wake_op() - FIXME: COME UP WITH A GOOD ONE LINE DESCRIPTION
+ */
+static inline int
+futex_wake_op(futex_t *uaddr, futex_t *uaddr2, int nr_wake, int nr_wake2,
+ int wake_op, int opflags)
+{
+ return futex(uaddr, FUTEX_WAKE_OP, nr_wake, nr_wake2, uaddr2, wake_op,
+ opflags);
+}
+
+/**
+ * futex_requeue() - requeue without expected value comparison, deprecated
+ * @nr_wake: wake up to this many tasks
+ * @nr_requeue: requeue up to this many tasks
+ *
+ * Due to its inherently racy implementation, futex_requeue() is deprecated in
+ * favor of futex_cmp_requeue().
+ */
+static inline int
+futex_requeue(futex_t *uaddr, futex_t *uaddr2, int nr_wake, int nr_requeue,
+ int opflags)
+{
+ return futex(uaddr, FUTEX_REQUEUE, nr_wake, nr_requeue, uaddr2, 0,
+ opflags);
+}
+
+/**
+ * futex_cmp_requeue() - requeue tasks from uaddr to uaddr2
+ * @nr_wake: wake up to this many tasks
+ * @nr_requeue: requeue up to this many tasks
+ */
+static inline int
+futex_cmp_requeue(futex_t *uaddr, futex_t val, futex_t *uaddr2, int nr_wake,
+ int nr_requeue, int opflags)
+{
+ return futex(uaddr, FUTEX_CMP_REQUEUE, nr_wake, nr_requeue, uaddr2,
+ val, opflags);
+}
+
+/**
+ * futex_wait_requeue_pi() - block on uaddr and prepare to requeue to uaddr2
+ * @uaddr: non-PI futex source
+ * @uaddr2: PI futex target
+ *
+ * This is the first half of the requeue_pi mechanism. It shall always be
+ * paired with futex_cmp_requeue_pi().
+ */
+static inline int
+futex_wait_requeue_pi(futex_t *uaddr, futex_t val, futex_t *uaddr2,
+ struct timespec *timeout, int opflags)
+{
+ return futex(uaddr, FUTEX_WAIT_REQUEUE_PI, val, timeout, uaddr2, 0,
+ opflags);
+}
+
+/**
+ * futex_cmp_requeue_pi() - requeue tasks from uaddr to uaddr2 (PI aware)
+ * @uaddr: non-PI futex source
+ * @uaddr2: PI futex target
+ * @nr_wake: wake up to this many tasks
+ * @nr_requeue: requeue up to this many tasks
+ */
+static inline int
+futex_cmp_requeue_pi(futex_t *uaddr, futex_t val, futex_t *uaddr2, int nr_wake,
+ int nr_requeue, int opflags)
+{
+ return futex(uaddr, FUTEX_CMP_REQUEUE_PI, nr_wake, nr_requeue, uaddr2,
+ val, opflags);
+}
+
+/**
+ * futex_cmpxchg() - atomic compare and exchange
+ * @uaddr: The address of the futex to be modified
+ * @oldval: The expected value of the futex
+ * @newval: The new value to try and assign the futex
+ *
+ * Implement cmpxchg using gcc atomic builtins.
+ * http://gcc.gnu.org/onlinedocs/gcc-4.1.0/gcc/Atomic-Builtins.html
+ *
+ * Return the old futex value.
+ */
+static inline u_int32_t
+futex_cmpxchg(futex_t *uaddr, u_int32_t oldval, u_int32_t newval)
+{
+ return __sync_val_compare_and_swap(uaddr, oldval, newval);
+}
+
+/**
+ * futex_dec() - atomic decrement of the futex value
+ * @uaddr: The address of the futex to be modified
+ *
+ * Return the new futex value.
+ */
+static inline u_int32_t
+futex_dec(futex_t *uaddr)
+{
+ return __sync_sub_and_fetch(uaddr, 1);
+}
+
+/**
+ * futex_inc() - atomic increment of the futex value
+ * @uaddr: the address of the futex to be modified
+ *
+ * Return the new futex value.
+ */
+static inline u_int32_t
+futex_inc(futex_t *uaddr)
+{
+ return __sync_add_and_fetch(uaddr, 1);
+}
+
+/**
+ * futex_set() - atomic set of the futex value
+ * @uaddr: the address of the futex to be modified
+ * @newval: the new value for the futex
+ *
+ * Return the new futex value.
+ */
+static inline u_int32_t
+futex_set(futex_t *uaddr, u_int32_t newval)
+{
+ *uaddr = newval;
+ return newval;
+}
+
+#endif
diff --git a/tools/testing/selftests/futex/include/logging.h b/tools/testing/selftests/futex/include/logging.h
new file mode 100644
index 000000000000..014aa01197af
--- /dev/null
+++ b/tools/testing/selftests/futex/include/logging.h
@@ -0,0 +1,153 @@
+/******************************************************************************
+ *
+ * Copyright © International Business Machines Corp., 2009
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * DESCRIPTION
+ * Glibc independent futex library for testing kernel functionality.
+ *
+ * AUTHOR
+ * Darren Hart <dvhart@linux.intel.com>
+ *
+ * HISTORY
+ * 2009-Nov-6: Initial version by Darren Hart <dvhart@linux.intel.com>
+ *
+ *****************************************************************************/
+
+#ifndef _LOGGING_H
+#define _LOGGING_H
+
+#include <string.h>
+#include <unistd.h>
+#include <linux/futex.h>
+#include "kselftest.h"
+
+/*
+ * Define PASS, ERROR, and FAIL strings with and without color escape
+ * sequences, default to no color.
+ */
+#define ESC 0x1B, '['
+#define BRIGHT '1'
+#define GREEN '3', '2'
+#define YELLOW '3', '3'
+#define RED '3', '1'
+#define ESCEND 'm'
+#define BRIGHT_GREEN ESC, BRIGHT, ';', GREEN, ESCEND
+#define BRIGHT_YELLOW ESC, BRIGHT, ';', YELLOW, ESCEND
+#define BRIGHT_RED ESC, BRIGHT, ';', RED, ESCEND
+#define RESET_COLOR ESC, '0', 'm'
+static const char PASS_COLOR[] = {BRIGHT_GREEN, ' ', 'P', 'A', 'S', 'S',
+ RESET_COLOR, 0};
+static const char ERROR_COLOR[] = {BRIGHT_YELLOW, 'E', 'R', 'R', 'O', 'R',
+ RESET_COLOR, 0};
+static const char FAIL_COLOR[] = {BRIGHT_RED, ' ', 'F', 'A', 'I', 'L',
+ RESET_COLOR, 0};
+static const char INFO_NORMAL[] = " INFO";
+static const char PASS_NORMAL[] = " PASS";
+static const char ERROR_NORMAL[] = "ERROR";
+static const char FAIL_NORMAL[] = " FAIL";
+const char *INFO = INFO_NORMAL;
+const char *PASS = PASS_NORMAL;
+const char *ERROR = ERROR_NORMAL;
+const char *FAIL = FAIL_NORMAL;
+
+/* Verbosity setting for INFO messages */
+#define VQUIET 0
+#define VCRITICAL 1
+#define VINFO 2
+#define VMAX VINFO
+int _verbose = VCRITICAL;
+
+/* Functional test return codes */
+#define RET_PASS 0
+#define RET_ERROR -1
+#define RET_FAIL -2
+
+/**
+ * log_color() - Use colored output for PASS, ERROR, and FAIL strings
+ * @use_color: use color (1) or not (0)
+ */
+void log_color(int use_color)
+{
+ if (use_color) {
+ PASS = PASS_COLOR;
+ ERROR = ERROR_COLOR;
+ FAIL = FAIL_COLOR;
+ } else {
+ PASS = PASS_NORMAL;
+ ERROR = ERROR_NORMAL;
+ FAIL = FAIL_NORMAL;
+ }
+}
+
+/**
+ * log_verbosity() - Set verbosity of test output
+ * @level: Verbosity level: VQUIET (0), VCRITICAL (1), or VINFO (2)
+ *
+ * INFO messages are only printed at VINFO. ERROR and FAIL messages are
+ * printed at VCRITICAL and above. Out-of-range levels are clamped to
+ * [0, VMAX].
+ */
+void log_verbosity(int level)
+{
+ if (level > VMAX)
+ level = VMAX;
+ else if (level < 0)
+ level = 0;
+ _verbose = level;
+}
+
+/**
+ * print_result() - Print standard PASS | ERROR | FAIL results
+ * @ret: the return value to be considered: RET_PASS | RET_ERROR | RET_FAIL
+ *
+ * print_result() is primarily intended for functional tests.
+ */
+void print_result(int ret)
+{
+ const char *result = "Unknown return code";
+
+ switch (ret) {
+ case RET_PASS:
+ ksft_inc_pass_cnt();
+ result = PASS;
+ break;
+ case RET_ERROR:
+ result = ERROR;
+ break;
+ case RET_FAIL:
+ ksft_inc_fail_cnt();
+ result = FAIL;
+ break;
+ }
+ printf("Result: %s\n", result);
+}
+
+/* log level macros */
+#define info(message, vargs...) \
+do { \
+ if (_verbose >= VINFO) \
+ fprintf(stderr, "\t%s: "message, INFO, ##vargs); \
+} while (0)
+
+#define error(message, err, args...) \
+do { \
+ if (_verbose >= VCRITICAL) {\
+ if (err) \
+ fprintf(stderr, "\t%s: %s: "message, \
+ ERROR, strerror(err), ##args); \
+ else \
+ fprintf(stderr, "\t%s: "message, ERROR, ##args); \
+ } \
+} while (0)
+
+#define fail(message, args...) \
+do { \
+ if (_verbose >= VCRITICAL) \
+ fprintf(stderr, "\t%s: "message, FAIL, ##args); \
+} while (0)
+
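+/*
+ * Typical usage from a test (a minimal sketch; nthreads, res and ret are
+ * placeholders and argument parsing is not shown):
+ *
+ *	log_color(1);
+ *	log_verbosity(VINFO);
+ *	info("starting run with %d threads\n", nthreads);
+ *	if (res < 0)
+ *		error("futex operation failed\n", errno);
+ *	print_result(ret);
+ */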
+#endif
diff --git a/tools/testing/selftests/futex/run.sh b/tools/testing/selftests/futex/run.sh
new file mode 100755
index 000000000000..4126312ad64e
--- /dev/null
+++ b/tools/testing/selftests/futex/run.sh
@@ -0,0 +1,33 @@
+#!/bin/sh
+
+###############################################################################
+#
+# Copyright © International Business Machines Corp., 2009
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# DESCRIPTION
+# Run all tests under the functional, performance, and stress directories.
+# Format and summarize the results.
+#
+# AUTHOR
+# Darren Hart <dvhart@linux.intel.com>
+#
+# HISTORY
+# 2009-Nov-9: Initial version by Darren Hart <dvhart@linux.intel.com>
+#
+###############################################################################
+
+# Test for a color capable shell and pass the result to the subdir scripts
+USE_COLOR=0
+tput setf 7
+if [ $? -eq 0 ]; then
+ USE_COLOR=1
+ tput sgr0
+fi
+export USE_COLOR
+
+(cd functional; ./run.sh)
diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h
index 572c8888167a..ef1c80d67ac7 100644
--- a/tools/testing/selftests/kselftest.h
+++ b/tools/testing/selftests/kselftest.h
@@ -13,6 +13,13 @@
#include <stdlib.h>
#include <unistd.h>
+/* define kselftest exit codes */
+#define KSFT_PASS 0
+#define KSFT_FAIL 1
+#define KSFT_XFAIL 2
+#define KSFT_XPASS 3
+#define KSFT_SKIP 4
+
/* counters */
struct ksft_count {
unsigned int ksft_pass;
@@ -40,23 +47,23 @@ static inline void ksft_print_cnts(void)
static inline int ksft_exit_pass(void)
{
- exit(0);
+ exit(KSFT_PASS);
}
static inline int ksft_exit_fail(void)
{
- exit(1);
+ exit(KSFT_FAIL);
}
static inline int ksft_exit_xfail(void)
{
- exit(2);
+ exit(KSFT_XFAIL);
}
static inline int ksft_exit_xpass(void)
{
- exit(3);
+ exit(KSFT_XPASS);
}
static inline int ksft_exit_skip(void)
{
- exit(4);
+ exit(KSFT_SKIP);
}
#endif /* __KSELFTEST_H */
diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk
index 2194155ae62a..ee412bab7ed4 100644
--- a/tools/testing/selftests/lib.mk
+++ b/tools/testing/selftests/lib.mk
@@ -13,6 +13,9 @@ run_tests: all
define INSTALL_RULE
mkdir -p $(INSTALL_PATH)
+ @for TEST_DIR in $(TEST_DIRS); do\
+ cp -r $$TEST_DIR $(INSTALL_PATH); \
+ done;
install -t $(INSTALL_PATH) $(TEST_PROGS) $(TEST_PROGS_EXTENDED) $(TEST_FILES)
endef
diff --git a/tools/testing/selftests/mount/Makefile b/tools/testing/selftests/mount/Makefile
index 95580a97326e..5e35c9c50b72 100644
--- a/tools/testing/selftests/mount/Makefile
+++ b/tools/testing/selftests/mount/Makefile
@@ -9,7 +9,12 @@ unprivileged-remount-test: unprivileged-remount-test.c
include ../lib.mk
TEST_PROGS := unprivileged-remount-test
-override RUN_TESTS := if [ -f /proc/self/uid_map ] ; then ./unprivileged-remount-test ; fi
+override RUN_TESTS := if [ -f /proc/self/uid_map ] ; \
+ then \
+ ./unprivileged-remount-test ; \
+ else \
+ echo "WARN: No /proc/self/uid_map exists, test skipped." ; \
+ fi
override EMIT_TESTS := echo "$(RUN_TESTS)"
clean:
diff --git a/tools/testing/selftests/seccomp/.gitignore b/tools/testing/selftests/seccomp/.gitignore
new file mode 100644
index 000000000000..346d83ca8069
--- /dev/null
+++ b/tools/testing/selftests/seccomp/.gitignore
@@ -0,0 +1 @@
+seccomp_bpf
diff --git a/tools/testing/selftests/seccomp/Makefile b/tools/testing/selftests/seccomp/Makefile
new file mode 100644
index 000000000000..8401e87e34e1
--- /dev/null
+++ b/tools/testing/selftests/seccomp/Makefile
@@ -0,0 +1,10 @@
+TEST_PROGS := seccomp_bpf
+CFLAGS += -Wl,-no-as-needed -Wall
+LDFLAGS += -lpthread
+
+all: $(TEST_PROGS)
+
+include ../lib.mk
+
+clean:
+ $(RM) $(TEST_PROGS)
diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
new file mode 100644
index 000000000000..c5abe7fd7590
--- /dev/null
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -0,0 +1,2109 @@
+/*
+ * Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
+ * Use of this source code is governed by the GPLv2 license.
+ *
+ * Test code for seccomp bpf.
+ */
+
+#include <asm/siginfo.h>
+#define __have_siginfo_t 1
+#define __have_sigval_t 1
+#define __have_sigevent_t 1
+
+#include <errno.h>
+#include <linux/filter.h>
+#include <sys/prctl.h>
+#include <sys/ptrace.h>
+#include <sys/user.h>
+#include <linux/prctl.h>
+#include <linux/ptrace.h>
+#include <linux/seccomp.h>
+#include <poll.h>
+#include <pthread.h>
+#include <semaphore.h>
+#include <signal.h>
+#include <stddef.h>
+#include <stdbool.h>
+#include <string.h>
+#include <linux/elf.h>
+#include <sys/uio.h>
+
+#define _GNU_SOURCE
+#include <unistd.h>
+#include <sys/syscall.h>
+
+#include "test_harness.h"
+
+#ifndef PR_SET_PTRACER
+# define PR_SET_PTRACER 0x59616d61
+#endif
+
+#ifndef PR_SET_NO_NEW_PRIVS
+#define PR_SET_NO_NEW_PRIVS 38
+#define PR_GET_NO_NEW_PRIVS 39
+#endif
+
+#ifndef PR_SECCOMP_EXT
+#define PR_SECCOMP_EXT 43
+#endif
+
+#ifndef SECCOMP_EXT_ACT
+#define SECCOMP_EXT_ACT 1
+#endif
+
+#ifndef SECCOMP_EXT_ACT_TSYNC
+#define SECCOMP_EXT_ACT_TSYNC 1
+#endif
+
+#ifndef SECCOMP_MODE_STRICT
+#define SECCOMP_MODE_STRICT 1
+#endif
+
+#ifndef SECCOMP_MODE_FILTER
+#define SECCOMP_MODE_FILTER 2
+#endif
+
+#ifndef SECCOMP_RET_KILL
+#define SECCOMP_RET_KILL 0x00000000U /* kill the task immediately */
+#define SECCOMP_RET_TRAP 0x00030000U /* disallow and force a SIGSYS */
+#define SECCOMP_RET_ERRNO 0x00050000U /* returns an errno */
+#define SECCOMP_RET_TRACE 0x7ff00000U /* pass to a tracer or disallow */
+#define SECCOMP_RET_ALLOW 0x7fff0000U /* allow */
+
+/* Masks for the return value sections. */
+#define SECCOMP_RET_ACTION 0x7fff0000U
+#define SECCOMP_RET_DATA 0x0000ffffU
+
+struct seccomp_data {
+ int nr;
+ __u32 arch;
+ __u64 instruction_pointer;
+ __u64 args[6];
+};
+#endif
+
+#define syscall_arg(_n) (offsetof(struct seccomp_data, args[_n]))
+
+#define SIBLING_EXIT_UNKILLED 0xbadbeef
+#define SIBLING_EXIT_FAILURE 0xbadface
+#define SIBLING_EXIT_NEWPRIVS 0xbadfeed
+
+TEST(mode_strict_support)
+{
+ long ret;
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, NULL, NULL, NULL);
+ ASSERT_EQ(0, ret) {
+ TH_LOG("Kernel does not support CONFIG_SECCOMP");
+ }
+ syscall(__NR_exit, 1);
+}
+
+TEST_SIGNAL(mode_strict_cannot_call_prctl, SIGKILL)
+{
+ long ret;
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, NULL, NULL, NULL);
+ ASSERT_EQ(0, ret) {
+ TH_LOG("Kernel does not support CONFIG_SECCOMP");
+ }
+ syscall(__NR_prctl, PR_SET_SECCOMP, SECCOMP_MODE_FILTER,
+ NULL, NULL, NULL);
+ EXPECT_FALSE(true) {
+ TH_LOG("Unreachable!");
+ }
+}
+
+/* Note! This doesn't test no new privs behavior */
+TEST(no_new_privs_support)
+{
+ long ret;
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ EXPECT_EQ(0, ret) {
+ TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+ }
+}
+
+/* Tests kernel support by checking for a copy_from_user() fault on NULL. */
+TEST(mode_filter_support)
+{
+ long ret;
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, NULL, 0, 0);
+ ASSERT_EQ(0, ret) {
+ TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+ }
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, NULL, NULL);
+ EXPECT_EQ(-1, ret);
+ EXPECT_EQ(EFAULT, errno) {
+ TH_LOG("Kernel does not support CONFIG_SECCOMP_FILTER!");
+ }
+}
+
+TEST(mode_filter_without_nnp)
+{
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+ struct sock_fprog prog = {
+ .len = (unsigned short)ARRAY_SIZE(filter),
+ .filter = filter,
+ };
+ long ret;
+
+ ret = prctl(PR_GET_NO_NEW_PRIVS, 0, NULL, 0, 0);
+ ASSERT_LE(0, ret) {
+ TH_LOG("Expected 0 or unsupported for NO_NEW_PRIVS");
+ }
+ errno = 0;
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
+ /* Succeeds with CAP_SYS_ADMIN, fails without */
+ /* TODO(wad) check caps not euid */
+ if (geteuid()) {
+ EXPECT_EQ(-1, ret);
+ EXPECT_EQ(EACCES, errno);
+ } else {
+ EXPECT_EQ(0, ret);
+ }
+}
+
+#define MAX_INSNS_PER_PATH 32768
+
+TEST(filter_size_limits)
+{
+ int i;
+ int count = BPF_MAXINSNS + 1;
+ struct sock_filter allow[] = {
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+ struct sock_filter *filter;
+ struct sock_fprog prog = { };
+ long ret;
+
+ filter = calloc(count, sizeof(*filter));
+ ASSERT_NE(NULL, filter);
+
+ for (i = 0; i < count; i++)
+ filter[i] = allow[0];
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ prog.filter = filter;
+ prog.len = count;
+
+ /* Too many filter instructions in a single filter. */
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
+ ASSERT_NE(0, ret) {
+ TH_LOG("Installing %d insn filter was allowed", prog.len);
+ }
+
+ /* One less is okay, though. */
+ prog.len -= 1;
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
+ ASSERT_EQ(0, ret) {
+ TH_LOG("Installing %d insn filter wasn't allowed", prog.len);
+ }
+}
+
+TEST(filter_chain_limits)
+{
+ int i;
+ int count = BPF_MAXINSNS;
+ struct sock_filter allow[] = {
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+ struct sock_filter *filter;
+ struct sock_fprog prog = { };
+ long ret;
+
+ filter = calloc(count, sizeof(*filter));
+ ASSERT_NE(NULL, filter);
+
+ for (i = 0; i < count; i++)
+ filter[i] = allow[0];
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ prog.filter = filter;
+ prog.len = 1;
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ prog.len = count;
+
+ /* Too many total filter instructions. */
+ for (i = 0; i < MAX_INSNS_PER_PATH; i++) {
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
+ if (ret != 0)
+ break;
+ }
+ ASSERT_NE(0, ret) {
+ TH_LOG("Allowed %d %d-insn filters (total with penalties:%d)",
+ i, count, i * (count + 4));
+ }
+}
+
+TEST(mode_filter_cannot_move_to_strict)
+{
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+ struct sock_fprog prog = {
+ .len = (unsigned short)ARRAY_SIZE(filter),
+ .filter = filter,
+ };
+ long ret;
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, NULL, 0, 0);
+ EXPECT_EQ(-1, ret);
+ EXPECT_EQ(EINVAL, errno);
+}
+
+
+TEST(mode_filter_get_seccomp)
+{
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+ struct sock_fprog prog = {
+ .len = (unsigned short)ARRAY_SIZE(filter),
+ .filter = filter,
+ };
+ long ret;
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
+ EXPECT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
+ EXPECT_EQ(2, ret);
+}
+
+
+TEST(ALLOW_all)
+{
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+ struct sock_fprog prog = {
+ .len = (unsigned short)ARRAY_SIZE(filter),
+ .filter = filter,
+ };
+ long ret;
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
+ ASSERT_EQ(0, ret);
+}
+
+TEST(empty_prog)
+{
+ struct sock_filter filter[] = {
+ };
+ struct sock_fprog prog = {
+ .len = (unsigned short)ARRAY_SIZE(filter),
+ .filter = filter,
+ };
+ long ret;
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
+ EXPECT_EQ(-1, ret);
+ EXPECT_EQ(EINVAL, errno);
+}
+
+TEST_SIGNAL(unknown_ret_is_kill_inside, SIGSYS)
+{
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_RET|BPF_K, 0x10000000U),
+ };
+ struct sock_fprog prog = {
+ .len = (unsigned short)ARRAY_SIZE(filter),
+ .filter = filter,
+ };
+ long ret;
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
+ ASSERT_EQ(0, ret);
+ EXPECT_EQ(0, syscall(__NR_getpid)) {
+ TH_LOG("getpid() shouldn't ever return");
+ }
+}
+
+/* return code >= 0x80000000 is unused. */
+TEST_SIGNAL(unknown_ret_is_kill_above_allow, SIGSYS)
+{
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_RET|BPF_K, 0x90000000U),
+ };
+ struct sock_fprog prog = {
+ .len = (unsigned short)ARRAY_SIZE(filter),
+ .filter = filter,
+ };
+ long ret;
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
+ ASSERT_EQ(0, ret);
+ EXPECT_EQ(0, syscall(__NR_getpid)) {
+ TH_LOG("getpid() shouldn't ever return");
+ }
+}
+
+TEST_SIGNAL(KILL_all, SIGSYS)
+{
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
+ };
+ struct sock_fprog prog = {
+ .len = (unsigned short)ARRAY_SIZE(filter),
+ .filter = filter,
+ };
+ long ret;
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
+ ASSERT_EQ(0, ret);
+}
+
+TEST_SIGNAL(KILL_one, SIGSYS)
+{
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+ offsetof(struct seccomp_data, nr)),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+ struct sock_fprog prog = {
+ .len = (unsigned short)ARRAY_SIZE(filter),
+ .filter = filter,
+ };
+ long ret;
+ pid_t parent = getppid();
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
+ ASSERT_EQ(0, ret);
+
+ EXPECT_EQ(parent, syscall(__NR_getppid));
+ /* getpid() should never return. */
+ EXPECT_EQ(0, syscall(__NR_getpid));
+}
+
+TEST_SIGNAL(KILL_one_arg_one, SIGSYS)
+{
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+ offsetof(struct seccomp_data, nr)),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ /* Only bother with the lower 32 bits for now. */
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS, syscall_arg(0)),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, 0x0C0FFEE, 0, 1),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+ struct sock_fprog prog = {
+ .len = (unsigned short)ARRAY_SIZE(filter),
+ .filter = filter,
+ };
+ long ret;
+ pid_t parent = getppid();
+ pid_t pid = getpid();
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
+ ASSERT_EQ(0, ret);
+
+ EXPECT_EQ(parent, syscall(__NR_getppid));
+ EXPECT_EQ(pid, syscall(__NR_getpid));
+ /* getpid() should never return. */
+ EXPECT_EQ(0, syscall(__NR_getpid, 0x0C0FFEE));
+}
+
+TEST_SIGNAL(KILL_one_arg_six, SIGSYS)
+{
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+ offsetof(struct seccomp_data, nr)),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ /* Only bother with the lower 32 bits for now. */
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS, syscall_arg(5)),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, 0x0C0FFEE, 0, 1),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+ struct sock_fprog prog = {
+ .len = (unsigned short)ARRAY_SIZE(filter),
+ .filter = filter,
+ };
+ long ret;
+ pid_t parent = getppid();
+ pid_t pid = getpid();
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
+ ASSERT_EQ(0, ret);
+
+ EXPECT_EQ(parent, syscall(__NR_getppid));
+ EXPECT_EQ(pid, syscall(__NR_getpid));
+ /* getpid() should never return. */
+ EXPECT_EQ(0, syscall(__NR_getpid, 1, 2, 3, 4, 5, 0x0C0FFEE));
+}
+
+/* TODO(wad) add 64-bit versus 32-bit arg tests. */
+TEST(arg_out_of_range)
+{
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS, syscall_arg(6)),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+ struct sock_fprog prog = {
+ .len = (unsigned short)ARRAY_SIZE(filter),
+ .filter = filter,
+ };
+ long ret;
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
+ EXPECT_EQ(-1, ret);
+ EXPECT_EQ(EINVAL, errno);
+}
+
+TEST(ERRNO_valid)
+{
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+ offsetof(struct seccomp_data, nr)),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | E2BIG),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+ struct sock_fprog prog = {
+ .len = (unsigned short)ARRAY_SIZE(filter),
+ .filter = filter,
+ };
+ long ret;
+ pid_t parent = getppid();
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
+ ASSERT_EQ(0, ret);
+
+ EXPECT_EQ(parent, syscall(__NR_getppid));
+ EXPECT_EQ(-1, read(0, NULL, 0));
+ EXPECT_EQ(E2BIG, errno);
+}
+
+TEST(ERRNO_zero)
+{
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+ offsetof(struct seccomp_data, nr)),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | 0),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+ struct sock_fprog prog = {
+ .len = (unsigned short)ARRAY_SIZE(filter),
+ .filter = filter,
+ };
+ long ret;
+ pid_t parent = getppid();
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
+ ASSERT_EQ(0, ret);
+
+ EXPECT_EQ(parent, syscall(__NR_getppid));
+ /* "errno" of 0 is ok. */
+ EXPECT_EQ(0, read(0, NULL, 0));
+}
+
+TEST(ERRNO_capped)
+{
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+ offsetof(struct seccomp_data, nr)),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | 4096),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+ struct sock_fprog prog = {
+ .len = (unsigned short)ARRAY_SIZE(filter),
+ .filter = filter,
+ };
+ long ret;
+ pid_t parent = getppid();
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
+ ASSERT_EQ(0, ret);
+
+ EXPECT_EQ(parent, syscall(__NR_getppid));
+ EXPECT_EQ(-1, read(0, NULL, 0));
+ EXPECT_EQ(4095, errno);
+}
+
+FIXTURE_DATA(TRAP) {
+ struct sock_fprog prog;
+};
+
+FIXTURE_SETUP(TRAP)
+{
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+ offsetof(struct seccomp_data, nr)),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRAP),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+
+ memset(&self->prog, 0, sizeof(self->prog));
+ self->prog.filter = malloc(sizeof(filter));
+ ASSERT_NE(NULL, self->prog.filter);
+ memcpy(self->prog.filter, filter, sizeof(filter));
+ self->prog.len = (unsigned short)ARRAY_SIZE(filter);
+}
+
+FIXTURE_TEARDOWN(TRAP)
+{
+ if (self->prog.filter)
+ free(self->prog.filter);
+}
+
+TEST_F_SIGNAL(TRAP, dfl, SIGSYS)
+{
+ long ret;
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog);
+ ASSERT_EQ(0, ret);
+ syscall(__NR_getpid);
+}
+
+/* Ensure that SIGSYS overrides SIG_IGN */
+TEST_F_SIGNAL(TRAP, ign, SIGSYS)
+{
+ long ret;
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ signal(SIGSYS, SIG_IGN);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog);
+ ASSERT_EQ(0, ret);
+ syscall(__NR_getpid);
+}
+
+static struct siginfo TRAP_info;
+static volatile int TRAP_nr;
+static void TRAP_action(int nr, siginfo_t *info, void *void_context)
+{
+ memcpy(&TRAP_info, info, sizeof(TRAP_info));
+ TRAP_nr = nr;
+}
+
+TEST_F(TRAP, handler)
+{
+ int ret, test;
+ struct sigaction act;
+ sigset_t mask;
+
+ memset(&act, 0, sizeof(act));
+ sigemptyset(&mask);
+ sigaddset(&mask, SIGSYS);
+
+ act.sa_sigaction = &TRAP_action;
+ act.sa_flags = SA_SIGINFO;
+ ret = sigaction(SIGSYS, &act, NULL);
+ ASSERT_EQ(0, ret) {
+ TH_LOG("sigaction failed");
+ }
+ ret = sigprocmask(SIG_UNBLOCK, &mask, NULL);
+ ASSERT_EQ(0, ret) {
+ TH_LOG("sigprocmask failed");
+ }
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog);
+ ASSERT_EQ(0, ret);
+ TRAP_nr = 0;
+ memset(&TRAP_info, 0, sizeof(TRAP_info));
+ /* Expect the registers to be rolled back. (nr = error) may vary
+ * based on arch. */
+ ret = syscall(__NR_getpid);
+ /* Silence gcc warning about volatile. */
+ test = TRAP_nr;
+ EXPECT_EQ(SIGSYS, test);
+ struct local_sigsys {
+ void *_call_addr; /* calling user insn */
+ int _syscall; /* triggering system call number */
+ unsigned int _arch; /* AUDIT_ARCH_* of syscall */
+ } *sigsys = (struct local_sigsys *)
+#ifdef si_syscall
+ &(TRAP_info.si_call_addr);
+#else
+ &TRAP_info.si_pid;
+#endif
+ EXPECT_EQ(__NR_getpid, sigsys->_syscall);
+ /* Make sure arch is non-zero. */
+ EXPECT_NE(0, sigsys->_arch);
+ EXPECT_NE(0, (unsigned long)sigsys->_call_addr);
+}
+
+FIXTURE_DATA(precedence) {
+ struct sock_fprog allow;
+ struct sock_fprog trace;
+ struct sock_fprog error;
+ struct sock_fprog trap;
+ struct sock_fprog kill;
+};
+
+FIXTURE_SETUP(precedence)
+{
+ struct sock_filter allow_insns[] = {
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+ struct sock_filter trace_insns[] = {
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+ offsetof(struct seccomp_data, nr)),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE),
+ };
+ struct sock_filter error_insns[] = {
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+ offsetof(struct seccomp_data, nr)),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO),
+ };
+ struct sock_filter trap_insns[] = {
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+ offsetof(struct seccomp_data, nr)),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRAP),
+ };
+ struct sock_filter kill_insns[] = {
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+ offsetof(struct seccomp_data, nr)),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
+ };
+
+ memset(self, 0, sizeof(*self));
+#define FILTER_ALLOC(_x) \
+ self->_x.filter = malloc(sizeof(_x##_insns)); \
+ ASSERT_NE(NULL, self->_x.filter); \
+ memcpy(self->_x.filter, &_x##_insns, sizeof(_x##_insns)); \
+ self->_x.len = (unsigned short)ARRAY_SIZE(_x##_insns)
+ FILTER_ALLOC(allow);
+ FILTER_ALLOC(trace);
+ FILTER_ALLOC(error);
+ FILTER_ALLOC(trap);
+ FILTER_ALLOC(kill);
+}
+
+FIXTURE_TEARDOWN(precedence)
+{
+#define FILTER_FREE(_x) if (self->_x.filter) free(self->_x.filter)
+ FILTER_FREE(allow);
+ FILTER_FREE(trace);
+ FILTER_FREE(error);
+ FILTER_FREE(trap);
+ FILTER_FREE(kill);
+}
+
+TEST_F(precedence, allow_ok)
+{
+ pid_t parent, res = 0;
+ long ret;
+
+ parent = getppid();
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->kill);
+ ASSERT_EQ(0, ret);
+ /* Should work just fine. */
+ res = syscall(__NR_getppid);
+ EXPECT_EQ(parent, res);
+}
+
+TEST_F_SIGNAL(precedence, kill_is_highest, SIGSYS)
+{
+ pid_t parent, res = 0;
+ long ret;
+
+ parent = getppid();
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->kill);
+ ASSERT_EQ(0, ret);
+ /* Should work just fine. */
+ res = syscall(__NR_getppid);
+ EXPECT_EQ(parent, res);
+ /* getpid() should never return. */
+ res = syscall(__NR_getpid);
+ EXPECT_EQ(0, res);
+}
+
+TEST_F_SIGNAL(precedence, kill_is_highest_in_any_order, SIGSYS)
+{
+ pid_t parent;
+ long ret;
+
+ parent = getppid();
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->kill);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
+ ASSERT_EQ(0, ret);
+ /* Should work just fine. */
+ EXPECT_EQ(parent, syscall(__NR_getppid));
+ /* getpid() should never return. */
+ EXPECT_EQ(0, syscall(__NR_getpid));
+}
+
+TEST_F_SIGNAL(precedence, trap_is_second, SIGSYS)
+{
+ pid_t parent;
+ long ret;
+
+ parent = getppid();
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
+ ASSERT_EQ(0, ret);
+ /* Should work just fine. */
+ EXPECT_EQ(parent, syscall(__NR_getppid));
+ /* getpid() should never return. */
+ EXPECT_EQ(0, syscall(__NR_getpid));
+}
+
+TEST_F_SIGNAL(precedence, trap_is_second_in_any_order, SIGSYS)
+{
+ pid_t parent;
+ long ret;
+
+ parent = getppid();
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
+ ASSERT_EQ(0, ret);
+ /* Should work just fine. */
+ EXPECT_EQ(parent, syscall(__NR_getppid));
+ /* getpid() should never return. */
+ EXPECT_EQ(0, syscall(__NR_getpid));
+}
+
+TEST_F(precedence, errno_is_third)
+{
+ pid_t parent;
+ long ret;
+
+ parent = getppid();
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
+ ASSERT_EQ(0, ret);
+ /* Should work just fine. */
+ EXPECT_EQ(parent, syscall(__NR_getppid));
+ EXPECT_EQ(0, syscall(__NR_getpid));
+}
+
+TEST_F(precedence, errno_is_third_in_any_order)
+{
+ pid_t parent;
+ long ret;
+
+ parent = getppid();
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
+ ASSERT_EQ(0, ret);
+ /* Should work just fine. */
+ EXPECT_EQ(parent, syscall(__NR_getppid));
+ EXPECT_EQ(0, syscall(__NR_getpid));
+}
+
+TEST_F(precedence, trace_is_fourth)
+{
+ pid_t parent;
+ long ret;
+
+ parent = getppid();
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
+ ASSERT_EQ(0, ret);
+ /* Should work just fine. */
+ EXPECT_EQ(parent, syscall(__NR_getppid));
+ /* No ptracer */
+ EXPECT_EQ(-1, syscall(__NR_getpid));
+}
+
+TEST_F(precedence, trace_is_fourth_in_any_order)
+{
+ pid_t parent;
+ long ret;
+
+ parent = getppid();
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
+ ASSERT_EQ(0, ret);
+ /* Should work just fine. */
+ EXPECT_EQ(parent, syscall(__NR_getppid));
+ /* No ptracer */
+ EXPECT_EQ(-1, syscall(__NR_getpid));
+}
+
+#ifndef PTRACE_O_TRACESECCOMP
+#define PTRACE_O_TRACESECCOMP 0x00000080
+#endif
+
+/* Catch the Ubuntu 12.04 value error. */
+#if PTRACE_EVENT_SECCOMP != 7
+#undef PTRACE_EVENT_SECCOMP
+#endif
+
+#ifndef PTRACE_EVENT_SECCOMP
+#define PTRACE_EVENT_SECCOMP 7
+#endif
+
+#define IS_SECCOMP_EVENT(status) ((status >> 16) == PTRACE_EVENT_SECCOMP)
+bool tracer_running;
+void tracer_stop(int sig)
+{
+ tracer_running = false;
+}
+
+typedef void tracer_func_t(struct __test_metadata *_metadata,
+ pid_t tracee, int status, void *args);
+
+void tracer(struct __test_metadata *_metadata, int fd, pid_t tracee,
+ tracer_func_t tracer_func, void *args)
+{
+ int ret = -1;
+ struct sigaction action = {
+ .sa_handler = tracer_stop,
+ };
+
+ /* Allow external shutdown. */
+ tracer_running = true;
+ ASSERT_EQ(0, sigaction(SIGUSR1, &action, NULL));
+
+ errno = 0;
+ while (ret == -1 && errno != EINVAL)
+ ret = ptrace(PTRACE_ATTACH, tracee, NULL, 0);
+ ASSERT_EQ(0, ret) {
+ kill(tracee, SIGKILL);
+ }
+ /* Wait for attach stop */
+ wait(NULL);
+
+ ret = ptrace(PTRACE_SETOPTIONS, tracee, NULL, PTRACE_O_TRACESECCOMP);
+ ASSERT_EQ(0, ret) {
+ TH_LOG("Failed to set PTRACE_O_TRACESECCOMP");
+ kill(tracee, SIGKILL);
+ }
+ ptrace(PTRACE_CONT, tracee, NULL, 0);
+
+ /* Unblock the tracee */
+ ASSERT_EQ(1, write(fd, "A", 1));
+ ASSERT_EQ(0, close(fd));
+
+ /* Run until we're shut down. Must assert to stop execution. */
+ while (tracer_running) {
+ int status;
+
+ if (wait(&status) != tracee)
+ continue;
+ if (WIFSIGNALED(status) || WIFEXITED(status))
+ /* Child is dead. Time to go. */
+ return;
+
+ /* Make sure this is a seccomp event. */
+ ASSERT_EQ(true, IS_SECCOMP_EVENT(status));
+
+ tracer_func(_metadata, tracee, status, args);
+
+ ret = ptrace(PTRACE_CONT, tracee, NULL, NULL);
+ ASSERT_EQ(0, ret);
+ }
+ /* Directly report the status of our test harness results. */
+ syscall(__NR_exit, _metadata->passed ? EXIT_SUCCESS : EXIT_FAILURE);
+}
+
+/* Common tracer setup/teardown functions. */
+void cont_handler(int num)
+{ }
+pid_t setup_trace_fixture(struct __test_metadata *_metadata,
+ tracer_func_t func, void *args)
+{
+ char sync;
+ int pipefd[2];
+ pid_t tracer_pid;
+ pid_t tracee = getpid();
+
+ /* Setup a pipe for clean synchronization. */
+ ASSERT_EQ(0, pipe(pipefd));
+
+ /* Fork a child which we'll promote to tracer */
+ tracer_pid = fork();
+ ASSERT_LE(0, tracer_pid);
+ signal(SIGALRM, cont_handler);
+ if (tracer_pid == 0) {
+ close(pipefd[0]);
+ tracer(_metadata, pipefd[1], tracee, func, args);
+ syscall(__NR_exit, 0);
+ }
+ close(pipefd[1]);
+ prctl(PR_SET_PTRACER, tracer_pid, 0, 0, 0);
+ read(pipefd[0], &sync, 1);
+ close(pipefd[0]);
+
+ return tracer_pid;
+}
+void teardown_trace_fixture(struct __test_metadata *_metadata,
+ pid_t tracer)
+{
+ if (tracer) {
+ int status;
+ /*
+ * Extract the exit code from the other process and
+ * adopt it for ourselves in case its asserts failed.
+ */
+ ASSERT_EQ(0, kill(tracer, SIGUSR1));
+ ASSERT_EQ(tracer, waitpid(tracer, &status, 0));
+ if (WEXITSTATUS(status))
+ _metadata->passed = 0;
+ }
+}
+
+/* "poke" tracer arguments and function. */
+struct tracer_args_poke_t {
+ unsigned long poke_addr;
+};
+
+void tracer_poke(struct __test_metadata *_metadata, pid_t tracee, int status,
+ void *args)
+{
+ int ret;
+ unsigned long msg;
+ struct tracer_args_poke_t *info = (struct tracer_args_poke_t *)args;
+
+ ret = ptrace(PTRACE_GETEVENTMSG, tracee, NULL, &msg);
+ EXPECT_EQ(0, ret);
+ /* If this fails, don't try to recover. */
+ ASSERT_EQ(0x1001, msg) {
+ kill(tracee, SIGKILL);
+ }
+ /*
+ * Poke in the message.
+ * Registers are not touched to try to keep this relatively arch
+ * agnostic.
+ */
+ ret = ptrace(PTRACE_POKEDATA, tracee, info->poke_addr, 0x1001);
+ EXPECT_EQ(0, ret);
+}
+
+FIXTURE_DATA(TRACE_poke) {
+ struct sock_fprog prog;
+ pid_t tracer;
+ long poked;
+ struct tracer_args_poke_t tracer_args;
+};
+
+FIXTURE_SETUP(TRACE_poke)
+{
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+ offsetof(struct seccomp_data, nr)),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1001),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+
+ self->poked = 0;
+ memset(&self->prog, 0, sizeof(self->prog));
+ self->prog.filter = malloc(sizeof(filter));
+ ASSERT_NE(NULL, self->prog.filter);
+ memcpy(self->prog.filter, filter, sizeof(filter));
+ self->prog.len = (unsigned short)ARRAY_SIZE(filter);
+
+ /* Set up tracer args. */
+ self->tracer_args.poke_addr = (unsigned long)&self->poked;
+
+ /* Launch tracer. */
+ self->tracer = setup_trace_fixture(_metadata, tracer_poke,
+ &self->tracer_args);
+}
+
+FIXTURE_TEARDOWN(TRACE_poke)
+{
+ teardown_trace_fixture(_metadata, self->tracer);
+ if (self->prog.filter)
+ free(self->prog.filter);
+}
+
+TEST_F(TRACE_poke, read_has_side_effects)
+{
+ ssize_t ret;
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ EXPECT_EQ(0, self->poked);
+ ret = read(-1, NULL, 0);
+ EXPECT_EQ(-1, ret);
+ EXPECT_EQ(0x1001, self->poked);
+}
+
+TEST_F(TRACE_poke, getpid_runs_normally)
+{
+ long ret;
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ EXPECT_EQ(0, self->poked);
+ EXPECT_NE(0, syscall(__NR_getpid));
+ EXPECT_EQ(0, self->poked);
+}
+
+#if defined(__x86_64__)
+# define ARCH_REGS struct user_regs_struct
+# define SYSCALL_NUM orig_rax
+# define SYSCALL_RET rax
+#elif defined(__i386__)
+# define ARCH_REGS struct user_regs_struct
+# define SYSCALL_NUM orig_eax
+# define SYSCALL_RET eax
+#elif defined(__arm__)
+# define ARCH_REGS struct pt_regs
+# define SYSCALL_NUM ARM_r7
+# define SYSCALL_RET ARM_r0
+#elif defined(__aarch64__)
+# define ARCH_REGS struct user_pt_regs
+# define SYSCALL_NUM regs[8]
+# define SYSCALL_RET regs[0]
+#else
+# error "Do not know how to find your architecture's registers and syscalls"
+#endif
+
+/* Architecture-specific syscall fetching routine. */
+int get_syscall(struct __test_metadata *_metadata, pid_t tracee)
+{
+ struct iovec iov;
+ ARCH_REGS regs;
+
+ iov.iov_base = &regs;
+ iov.iov_len = sizeof(regs);
+ EXPECT_EQ(0, ptrace(PTRACE_GETREGSET, tracee, NT_PRSTATUS, &iov)) {
+ TH_LOG("PTRACE_GETREGSET failed");
+ return -1;
+ }
+
+ return regs.SYSCALL_NUM;
+}
+
+/* Architecture-specific syscall changing routine. */
+void change_syscall(struct __test_metadata *_metadata,
+ pid_t tracee, int syscall)
+{
+ struct iovec iov;
+ int ret;
+ ARCH_REGS regs;
+
+ iov.iov_base = &regs;
+ iov.iov_len = sizeof(regs);
+ ret = ptrace(PTRACE_GETREGSET, tracee, NT_PRSTATUS, &iov);
+ EXPECT_EQ(0, ret);
+
+#if defined(__x86_64__) || defined(__i386__) || defined(__aarch64__)
+ {
+ regs.SYSCALL_NUM = syscall;
+ }
+
+#elif defined(__arm__)
+# ifndef PTRACE_SET_SYSCALL
+# define PTRACE_SET_SYSCALL 23
+# endif
+ {
+ ret = ptrace(PTRACE_SET_SYSCALL, tracee, NULL, syscall);
+ EXPECT_EQ(0, ret);
+ }
+
+#else
+ ASSERT_EQ(1, 0) {
+ TH_LOG("How is the syscall changed on this architecture?");
+ }
+#endif
+
+ /* If syscall is skipped, change return value. */
+ if (syscall == -1)
+ regs.SYSCALL_RET = 1;
+
+ ret = ptrace(PTRACE_SETREGSET, tracee, NT_PRSTATUS, &iov);
+ EXPECT_EQ(0, ret);
+}
+
+void tracer_syscall(struct __test_metadata *_metadata, pid_t tracee,
+ int status, void *args)
+{
+ int ret;
+ unsigned long msg;
+
+ /* Make sure we got the right message. */
+ ret = ptrace(PTRACE_GETEVENTMSG, tracee, NULL, &msg);
+ EXPECT_EQ(0, ret);
+
+ switch (msg) {
+ case 0x1002:
+ /* change getpid to getppid. */
+ change_syscall(_metadata, tracee, __NR_getppid);
+ break;
+ case 0x1003:
+ /* skip gettid. */
+ change_syscall(_metadata, tracee, -1);
+ break;
+ case 0x1004:
+ /* do nothing (allow getppid) */
+ break;
+ default:
+ EXPECT_EQ(0, msg) {
+ TH_LOG("Unknown PTRACE_GETEVENTMSG: 0x%lx", msg);
+ kill(tracee, SIGKILL);
+ }
+ }
+
+}
+
+FIXTURE_DATA(TRACE_syscall) {
+ struct sock_fprog prog;
+ pid_t tracer, mytid, mypid, parent;
+};
+
+FIXTURE_SETUP(TRACE_syscall)
+{
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+ offsetof(struct seccomp_data, nr)),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1002),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_gettid, 0, 1),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1003),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1004),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+
+ memset(&self->prog, 0, sizeof(self->prog));
+ self->prog.filter = malloc(sizeof(filter));
+ ASSERT_NE(NULL, self->prog.filter);
+ memcpy(self->prog.filter, filter, sizeof(filter));
+ self->prog.len = (unsigned short)ARRAY_SIZE(filter);
+
+ /* Prepare some testable syscall results. */
+ self->mytid = syscall(__NR_gettid);
+ ASSERT_GT(self->mytid, 0);
+ ASSERT_NE(self->mytid, 1) {
+ TH_LOG("Running this test as init is not supported. :)");
+ }
+
+ self->mypid = getpid();
+ ASSERT_GT(self->mypid, 0);
+ ASSERT_EQ(self->mytid, self->mypid);
+
+ self->parent = getppid();
+ ASSERT_GT(self->parent, 0);
+ ASSERT_NE(self->parent, self->mypid);
+
+ /* Launch tracer. */
+ self->tracer = setup_trace_fixture(_metadata, tracer_syscall, NULL);
+}
+
+FIXTURE_TEARDOWN(TRACE_syscall)
+{
+ teardown_trace_fixture(_metadata, self->tracer);
+ if (self->prog.filter)
+ free(self->prog.filter);
+}
+
+TEST_F(TRACE_syscall, syscall_allowed)
+{
+ long ret;
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ /* getppid works as expected (no changes). */
+ EXPECT_EQ(self->parent, syscall(__NR_getppid));
+ EXPECT_NE(self->mypid, syscall(__NR_getppid));
+}
+
+TEST_F(TRACE_syscall, syscall_redirected)
+{
+ long ret;
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ /* getpid has been redirected to getppid as expected. */
+ EXPECT_EQ(self->parent, syscall(__NR_getpid));
+ EXPECT_NE(self->mypid, syscall(__NR_getpid));
+}
+
+TEST_F(TRACE_syscall, syscall_dropped)
+{
+ long ret;
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ /* gettid has been skipped and an altered return value stored. */
+ EXPECT_EQ(1, syscall(__NR_gettid));
+ EXPECT_NE(self->mytid, syscall(__NR_gettid));
+}
+
+#ifndef __NR_seccomp
+# if defined(__i386__)
+# define __NR_seccomp 354
+# elif defined(__x86_64__)
+# define __NR_seccomp 317
+# elif defined(__arm__)
+# define __NR_seccomp 383
+# elif defined(__aarch64__)
+# define __NR_seccomp 277
+# else
+# warning "seccomp syscall number unknown for this architecture"
+# define __NR_seccomp 0xffff
+# endif
+#endif
+
+#ifndef SECCOMP_SET_MODE_STRICT
+#define SECCOMP_SET_MODE_STRICT 0
+#endif
+
+#ifndef SECCOMP_SET_MODE_FILTER
+#define SECCOMP_SET_MODE_FILTER 1
+#endif
+
+#ifndef SECCOMP_FLAG_FILTER_TSYNC
+#define SECCOMP_FLAG_FILTER_TSYNC 1
+#endif
+
+#ifndef seccomp
+int seccomp(unsigned int op, unsigned int flags, struct sock_fprog *filter)
+{
+ errno = 0;
+ return syscall(__NR_seccomp, op, flags, filter);
+}
+#endif
+
+TEST(seccomp_syscall)
+{
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+ struct sock_fprog prog = {
+ .len = (unsigned short)ARRAY_SIZE(filter),
+ .filter = filter,
+ };
+ long ret;
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret) {
+ TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+ }
+
+ /* Reject insane operation. */
+ ret = seccomp(-1, 0, &prog);
+ EXPECT_EQ(EINVAL, errno) {
+ TH_LOG("Did not reject crazy op value!");
+ }
+
+ /* Reject strict with flags or pointer. */
+ ret = seccomp(SECCOMP_SET_MODE_STRICT, -1, NULL);
+ EXPECT_EQ(EINVAL, errno) {
+ TH_LOG("Did not reject mode strict with flags!");
+ }
+ ret = seccomp(SECCOMP_SET_MODE_STRICT, 0, &prog);
+ EXPECT_EQ(EINVAL, errno) {
+ TH_LOG("Did not reject mode strict with uargs!");
+ }
+
+ /* Reject insane args for filter. */
+ ret = seccomp(SECCOMP_SET_MODE_FILTER, -1, &prog);
+ EXPECT_EQ(EINVAL, errno) {
+ TH_LOG("Did not reject crazy filter flags!");
+ }
+ ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, NULL);
+ EXPECT_EQ(EFAULT, errno) {
+ TH_LOG("Did not reject NULL filter!");
+ }
+
+ ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog);
+ EXPECT_EQ(0, errno) {
+ TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER: %s",
+ strerror(errno));
+ }
+}
+
+TEST(seccomp_syscall_mode_lock)
+{
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+ struct sock_fprog prog = {
+ .len = (unsigned short)ARRAY_SIZE(filter),
+ .filter = filter,
+ };
+ long ret;
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, NULL, 0, 0);
+ ASSERT_EQ(0, ret) {
+ TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+ }
+
+ ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog);
+ EXPECT_EQ(0, ret) {
+ TH_LOG("Could not install filter!");
+ }
+
+ /* Make sure neither entry point will switch to strict. */
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, 0, 0, 0);
+ EXPECT_EQ(EINVAL, errno) {
+ TH_LOG("Switched to mode strict!");
+ }
+
+ ret = seccomp(SECCOMP_SET_MODE_STRICT, 0, NULL);
+ EXPECT_EQ(EINVAL, errno) {
+ TH_LOG("Switched to mode strict!");
+ }
+}
+
+TEST(TSYNC_first)
+{
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+ struct sock_fprog prog = {
+ .len = (unsigned short)ARRAY_SIZE(filter),
+ .filter = filter,
+ };
+ long ret;
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, NULL, 0, 0);
+ ASSERT_EQ(0, ret) {
+ TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+ }
+
+ ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FLAG_FILTER_TSYNC,
+ &prog);
+ EXPECT_EQ(0, ret) {
+ TH_LOG("Could not install initial filter with TSYNC!");
+ }
+}
+
+#define TSYNC_SIBLINGS 2
+struct tsync_sibling {
+ pthread_t tid;
+ pid_t system_tid;
+ sem_t *started;
+ pthread_cond_t *cond;
+ pthread_mutex_t *mutex;
+ int diverge;
+ int num_waits;
+ struct sock_fprog *prog;
+ struct __test_metadata *metadata;
+};
+
+FIXTURE_DATA(TSYNC) {
+ struct sock_fprog root_prog, apply_prog;
+ struct tsync_sibling sibling[TSYNC_SIBLINGS];
+ sem_t started;
+ pthread_cond_t cond;
+ pthread_mutex_t mutex;
+ int sibling_count;
+};
+
+FIXTURE_SETUP(TSYNC)
+{
+ struct sock_filter root_filter[] = {
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+ struct sock_filter apply_filter[] = {
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+ offsetof(struct seccomp_data, nr)),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+
+ memset(&self->root_prog, 0, sizeof(self->root_prog));
+ memset(&self->apply_prog, 0, sizeof(self->apply_prog));
+ memset(&self->sibling, 0, sizeof(self->sibling));
+ self->root_prog.filter = malloc(sizeof(root_filter));
+ ASSERT_NE(NULL, self->root_prog.filter);
+ memcpy(self->root_prog.filter, &root_filter, sizeof(root_filter));
+ self->root_prog.len = (unsigned short)ARRAY_SIZE(root_filter);
+
+ self->apply_prog.filter = malloc(sizeof(apply_filter));
+ ASSERT_NE(NULL, self->apply_prog.filter);
+ memcpy(self->apply_prog.filter, &apply_filter, sizeof(apply_filter));
+ self->apply_prog.len = (unsigned short)ARRAY_SIZE(apply_filter);
+
+ self->sibling_count = 0;
+ pthread_mutex_init(&self->mutex, NULL);
+ pthread_cond_init(&self->cond, NULL);
+ sem_init(&self->started, 0, 0);
+ self->sibling[0].tid = 0;
+ self->sibling[0].cond = &self->cond;
+ self->sibling[0].started = &self->started;
+ self->sibling[0].mutex = &self->mutex;
+ self->sibling[0].diverge = 0;
+ self->sibling[0].num_waits = 1;
+ self->sibling[0].prog = &self->root_prog;
+ self->sibling[0].metadata = _metadata;
+ self->sibling[1].tid = 0;
+ self->sibling[1].cond = &self->cond;
+ self->sibling[1].started = &self->started;
+ self->sibling[1].mutex = &self->mutex;
+ self->sibling[1].diverge = 0;
+ self->sibling[1].prog = &self->root_prog;
+ self->sibling[1].num_waits = 1;
+ self->sibling[1].metadata = _metadata;
+}
+
+FIXTURE_TEARDOWN(TSYNC)
+{
+ int sib = 0;
+
+ if (self->root_prog.filter)
+ free(self->root_prog.filter);
+ if (self->apply_prog.filter)
+ free(self->apply_prog.filter);
+
+ for ( ; sib < self->sibling_count; ++sib) {
+ struct tsync_sibling *s = &self->sibling[sib];
+ void *status;
+
+ if (!s->tid)
+ continue;
+ if (pthread_kill(s->tid, 0)) {
+ pthread_cancel(s->tid);
+ pthread_join(s->tid, &status);
+ }
+ }
+ pthread_mutex_destroy(&self->mutex);
+ pthread_cond_destroy(&self->cond);
+ sem_destroy(&self->started);
+}
+
+void *tsync_sibling(void *data)
+{
+ long ret = 0;
+ struct tsync_sibling *me = data;
+
+ me->system_tid = syscall(__NR_gettid);
+
+ pthread_mutex_lock(me->mutex);
+ if (me->diverge) {
+ /* Just re-apply the root prog to fork the tree */
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER,
+ me->prog, 0, 0);
+ }
+ sem_post(me->started);
+ /* Return outside of started so parent notices failures. */
+ if (ret) {
+ pthread_mutex_unlock(me->mutex);
+ return (void *)SIBLING_EXIT_FAILURE;
+ }
+ do {
+ pthread_cond_wait(me->cond, me->mutex);
+ me->num_waits = me->num_waits - 1;
+ } while (me->num_waits);
+ pthread_mutex_unlock(me->mutex);
+
+ ret = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0);
+ if (!ret)
+ return (void *)SIBLING_EXIT_NEWPRIVS;
+ read(0, NULL, 0);
+ return (void *)SIBLING_EXIT_UNKILLED;
+}
+
+void tsync_start_sibling(struct tsync_sibling *sibling)
+{
+ pthread_create(&sibling->tid, NULL, tsync_sibling, (void *)sibling);
+}
+
+TEST_F(TSYNC, siblings_fail_prctl)
+{
+ long ret;
+ void *status;
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+ offsetof(struct seccomp_data, nr)),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_prctl, 0, 1),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | EINVAL),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+ struct sock_fprog prog = {
+ .len = (unsigned short)ARRAY_SIZE(filter),
+ .filter = filter,
+ };
+
+ ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
+ TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+ }
+
+ /* Check prctl failure detection by requesting sib 0 diverge. */
+ ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog);
+ ASSERT_EQ(0, ret) {
+ TH_LOG("setting filter failed");
+ }
+
+ self->sibling[0].diverge = 1;
+ tsync_start_sibling(&self->sibling[0]);
+ tsync_start_sibling(&self->sibling[1]);
+
+ while (self->sibling_count < TSYNC_SIBLINGS) {
+ sem_wait(&self->started);
+ self->sibling_count++;
+ }
+
+ /* Signal the threads to clean up */
+ pthread_mutex_lock(&self->mutex);
+ ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
+ TH_LOG("cond broadcast non-zero");
+ }
+ pthread_mutex_unlock(&self->mutex);
+
+ /* Ensure diverging sibling failed to call prctl. */
+ pthread_join(self->sibling[0].tid, &status);
+ EXPECT_EQ(SIBLING_EXIT_FAILURE, (long)status);
+ pthread_join(self->sibling[1].tid, &status);
+ EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
+}
+
+TEST_F(TSYNC, two_siblings_with_ancestor)
+{
+ long ret;
+ void *status;
+
+ ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
+ TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+ }
+
+ ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog);
+ ASSERT_EQ(0, ret) {
+ TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!");
+ }
+ tsync_start_sibling(&self->sibling[0]);
+ tsync_start_sibling(&self->sibling[1]);
+
+ while (self->sibling_count < TSYNC_SIBLINGS) {
+ sem_wait(&self->started);
+ self->sibling_count++;
+ }
+
+ ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FLAG_FILTER_TSYNC,
+ &self->apply_prog);
+ ASSERT_EQ(0, ret) {
+ TH_LOG("Could install filter on all threads!");
+ }
+ /* Tell the siblings to test the policy */
+ pthread_mutex_lock(&self->mutex);
+ ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
+ TH_LOG("cond broadcast non-zero");
+ }
+ pthread_mutex_unlock(&self->mutex);
+ /* Ensure they are both killed and don't exit cleanly. */
+ pthread_join(self->sibling[0].tid, &status);
+ EXPECT_EQ(0x0, (long)status);
+ pthread_join(self->sibling[1].tid, &status);
+ EXPECT_EQ(0x0, (long)status);
+}
+
+TEST_F(TSYNC, two_sibling_want_nnp)
+{
+ void *status;
+
+ /* start siblings before any prctl() operations */
+ tsync_start_sibling(&self->sibling[0]);
+ tsync_start_sibling(&self->sibling[1]);
+ while (self->sibling_count < TSYNC_SIBLINGS) {
+ sem_wait(&self->started);
+ self->sibling_count++;
+ }
+
+ /* Tell the siblings to test no policy */
+ pthread_mutex_lock(&self->mutex);
+ ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
+ TH_LOG("cond broadcast non-zero");
+ }
+ pthread_mutex_unlock(&self->mutex);
+
+ /* Ensure they are both upset about lacking nnp. */
+ pthread_join(self->sibling[0].tid, &status);
+ EXPECT_EQ(SIBLING_EXIT_NEWPRIVS, (long)status);
+ pthread_join(self->sibling[1].tid, &status);
+ EXPECT_EQ(SIBLING_EXIT_NEWPRIVS, (long)status);
+}
+
+TEST_F(TSYNC, two_siblings_with_no_filter)
+{
+ long ret;
+ void *status;
+
+ /* start siblings before any prctl() operations */
+ tsync_start_sibling(&self->sibling[0]);
+ tsync_start_sibling(&self->sibling[1]);
+ while (self->sibling_count < TSYNC_SIBLINGS) {
+ sem_wait(&self->started);
+ self->sibling_count++;
+ }
+
+ ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
+ TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+ }
+
+ ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FLAG_FILTER_TSYNC,
+ &self->apply_prog);
+ ASSERT_EQ(0, ret) {
+ TH_LOG("Could install filter on all threads!");
+ }
+
+ /* Tell the siblings to test the policy */
+ pthread_mutex_lock(&self->mutex);
+ ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
+ TH_LOG("cond broadcast non-zero");
+ }
+ pthread_mutex_unlock(&self->mutex);
+
+ /* Ensure they are both killed and don't exit cleanly. */
+ pthread_join(self->sibling[0].tid, &status);
+ EXPECT_EQ(0x0, (long)status);
+ pthread_join(self->sibling[1].tid, &status);
+ EXPECT_EQ(0x0, (long)status);
+}
+
+TEST_F(TSYNC, two_siblings_with_one_divergence)
+{
+ long ret;
+ void *status;
+
+ ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
+ TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+ }
+
+ ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog);
+ ASSERT_EQ(0, ret) {
+ TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!");
+ }
+ self->sibling[0].diverge = 1;
+ tsync_start_sibling(&self->sibling[0]);
+ tsync_start_sibling(&self->sibling[1]);
+
+ while (self->sibling_count < TSYNC_SIBLINGS) {
+ sem_wait(&self->started);
+ self->sibling_count++;
+ }
+
+ ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FLAG_FILTER_TSYNC,
+ &self->apply_prog);
+ ASSERT_EQ(self->sibling[0].system_tid, ret) {
+ TH_LOG("Did not fail on diverged sibling.");
+ }
+
+ /* Wake the threads */
+ pthread_mutex_lock(&self->mutex);
+ ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
+ TH_LOG("cond broadcast non-zero");
+ }
+ pthread_mutex_unlock(&self->mutex);
+
+ /* Ensure they are both unkilled. */
+ pthread_join(self->sibling[0].tid, &status);
+ EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
+ pthread_join(self->sibling[1].tid, &status);
+ EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
+}
+
+TEST_F(TSYNC, two_siblings_not_under_filter)
+{
+ long ret, sib;
+ void *status;
+
+ ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
+ TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+ }
+
+ /*
+ * Sibling 0 will have its own (diverged) seccomp policy and
+ * sibling 1 will not be under seccomp at all. TSYNC can bring
+ * sibling 1 under seccomp, but the diverged sibling 0 will make
+ * the synchronization fail.
+ */
+ self->sibling[0].diverge = 1;
+ tsync_start_sibling(&self->sibling[0]);
+ tsync_start_sibling(&self->sibling[1]);
+
+ while (self->sibling_count < TSYNC_SIBLINGS) {
+ sem_wait(&self->started);
+ self->sibling_count++;
+ }
+
+ ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog);
+ ASSERT_EQ(0, ret) {
+ TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!");
+ }
+
+ ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FLAG_FILTER_TSYNC,
+ &self->apply_prog);
+ ASSERT_EQ(ret, self->sibling[0].system_tid) {
+ TH_LOG("Did not fail on diverged sibling.");
+ }
+ sib = 1;
+ if (ret == self->sibling[0].system_tid)
+ sib = 0;
+
+ pthread_mutex_lock(&self->mutex);
+
+ /* Increment the other sibling's num_waits so we can clean up
+ * the one we just saw.
+ */
+ self->sibling[!sib].num_waits += 1;
+
+ /* Signal the thread to clean up */
+ ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
+ TH_LOG("cond broadcast non-zero");
+ }
+ pthread_mutex_unlock(&self->mutex);
+ pthread_join(self->sibling[sib].tid, &status);
+ EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
+ /* Poll for actual task death. pthread_join doesn't guarantee it. */
+ while (!kill(self->sibling[sib].system_tid, 0))
+ usleep(100000); /* 100ms */
+ /* Switch to the remaining sibling */
+ sib = !sib;
+
+ ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FLAG_FILTER_TSYNC,
+ &self->apply_prog);
+ ASSERT_EQ(0, ret) {
+ TH_LOG("Expected the remaining sibling to sync");
+ };
+
+ pthread_mutex_lock(&self->mutex);
+
+ /* If the remaining sibling didn't have a chance to wake up during
+ * the first broadcast, manually reduce the num_waits now.
+ */
+ if (self->sibling[sib].num_waits > 1)
+ self->sibling[sib].num_waits = 1;
+ ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
+ TH_LOG("cond broadcast non-zero");
+ }
+ pthread_mutex_unlock(&self->mutex);
+ pthread_join(self->sibling[sib].tid, &status);
+ EXPECT_EQ(0, (long)status);
+ /* Poll for actual task death. pthread_join doesn't guarantee it. */
+ while (!kill(self->sibling[sib].system_tid, 0))
+ usleep(100000); /* sleep() takes whole seconds; 0.1 truncates to 0 */
+
+ ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FLAG_FILTER_TSYNC,
+ &self->apply_prog);
+ ASSERT_EQ(0, ret); /* just us chickens */
+}
+
+/* Make sure restarted syscalls are seen directly as "restart_syscall". */
+TEST(syscall_restart)
+{
+ long ret;
+ unsigned long msg;
+ pid_t child_pid;
+ int pipefd[2];
+ int status;
+ siginfo_t info = { };
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+ offsetof(struct seccomp_data, nr)),
+
+#ifdef __NR_sigreturn
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_sigreturn, 6, 0),
+#endif
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 5, 0),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_exit, 4, 0),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_rt_sigreturn, 3, 0),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_poll, 4, 0),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_restart_syscall, 4, 0),
+
+ /* Allow __NR_write for easy logging. */
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_write, 0, 1),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE|0x100), /* poll */
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE|0x200), /* restart */
+ };
+ struct sock_fprog prog = {
+ .len = (unsigned short)ARRAY_SIZE(filter),
+ .filter = filter,
+ };
+
+ ASSERT_EQ(0, pipe(pipefd));
+
+ child_pid = fork();
+ ASSERT_LE(0, child_pid);
+ if (child_pid == 0) {
+ /* Child uses EXPECT not ASSERT to deliver status correctly. */
+ char buf = ' ';
+ struct pollfd fds = {
+ .fd = pipefd[0],
+ .events = POLLIN,
+ };
+
+ /* Attach parent as tracer and stop. */
+ EXPECT_EQ(0, ptrace(PTRACE_TRACEME));
+ EXPECT_EQ(0, raise(SIGSTOP));
+
+ EXPECT_EQ(0, close(pipefd[1]));
+
+ EXPECT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
+ TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+ }
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
+ EXPECT_EQ(0, ret) {
+ TH_LOG("Failed to install filter!");
+ }
+
+ EXPECT_EQ(1, read(pipefd[0], &buf, 1)) {
+ TH_LOG("Failed to read() sync from parent");
+ }
+ EXPECT_EQ('.', buf) {
+ TH_LOG("Failed to get sync data from read()");
+ }
+
+ /* Start poll to be interrupted. */
+ errno = 0;
+ EXPECT_EQ(1, poll(&fds, 1, -1)) {
+ TH_LOG("Call to poll() failed (errno %d)", errno);
+ }
+
+ /* Read final sync from parent. */
+ EXPECT_EQ(1, read(pipefd[0], &buf, 1)) {
+ TH_LOG("Failed final read() from parent");
+ }
+ EXPECT_EQ('!', buf) {
+ TH_LOG("Failed to get final data from read()");
+ }
+
+ /* Directly report the status of our test harness results. */
+ syscall(__NR_exit, _metadata->passed ? EXIT_SUCCESS
+ : EXIT_FAILURE);
+ }
+ EXPECT_EQ(0, close(pipefd[0]));
+
+ /* Attach to child, setup options, and release. */
+ ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
+ ASSERT_EQ(true, WIFSTOPPED(status));
+ ASSERT_EQ(0, ptrace(PTRACE_SETOPTIONS, child_pid, NULL,
+ PTRACE_O_TRACESECCOMP));
+ ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
+ ASSERT_EQ(1, write(pipefd[1], ".", 1));
+
+ /* Wait for poll() to start. */
+ ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
+ ASSERT_EQ(true, WIFSTOPPED(status));
+ ASSERT_EQ(SIGTRAP, WSTOPSIG(status));
+ ASSERT_EQ(PTRACE_EVENT_SECCOMP, (status >> 16));
+ ASSERT_EQ(0, ptrace(PTRACE_GETEVENTMSG, child_pid, NULL, &msg));
+ ASSERT_EQ(0x100, msg);
+ EXPECT_EQ(__NR_poll, get_syscall(_metadata, child_pid));
+
+ /* Might as well check siginfo for sanity while we're here. */
+ ASSERT_EQ(0, ptrace(PTRACE_GETSIGINFO, child_pid, NULL, &info));
+ ASSERT_EQ(SIGTRAP, info.si_signo);
+ ASSERT_EQ(SIGTRAP | (PTRACE_EVENT_SECCOMP << 8), info.si_code);
+ EXPECT_EQ(0, info.si_errno);
+ EXPECT_EQ(getuid(), info.si_uid);
+ /* Verify signal delivery came from child (seccomp-triggered). */
+ EXPECT_EQ(child_pid, info.si_pid);
+
+ /* Interrupt poll with SIGSTOP (which we'll need to handle). */
+ ASSERT_EQ(0, kill(child_pid, SIGSTOP));
+ ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
+ ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
+ ASSERT_EQ(true, WIFSTOPPED(status));
+ ASSERT_EQ(SIGSTOP, WSTOPSIG(status));
+ /* Verify signal delivery came from parent now. */
+ ASSERT_EQ(0, ptrace(PTRACE_GETSIGINFO, child_pid, NULL, &info));
+ EXPECT_EQ(getpid(), info.si_pid);
+
+ /* Restart poll with SIGCONT, which triggers restart_syscall. */
+ ASSERT_EQ(0, kill(child_pid, SIGCONT));
+ ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
+ ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
+ ASSERT_EQ(true, WIFSTOPPED(status));
+ ASSERT_EQ(SIGCONT, WSTOPSIG(status));
+ ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
+
+ /* Wait for restart_syscall() to start. */
+ ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
+ ASSERT_EQ(true, WIFSTOPPED(status));
+ ASSERT_EQ(SIGTRAP, WSTOPSIG(status));
+ ASSERT_EQ(PTRACE_EVENT_SECCOMP, (status >> 16));
+ ASSERT_EQ(0, ptrace(PTRACE_GETEVENTMSG, child_pid, NULL, &msg));
+ ASSERT_EQ(0x200, msg);
+ ret = get_syscall(_metadata, child_pid);
+#if defined(__arm__)
+ /* FIXME: ARM does not expose true syscall in registers. */
+ EXPECT_EQ(__NR_poll, ret);
+#else
+ EXPECT_EQ(__NR_restart_syscall, ret);
+#endif
+
+ /* Write again to end poll. */
+ ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
+ ASSERT_EQ(1, write(pipefd[1], "!", 1));
+ EXPECT_EQ(0, close(pipefd[1]));
+
+ ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
+ if (WIFSIGNALED(status) || WEXITSTATUS(status))
+ _metadata->passed = 0;
+}
+
+/*
+ * TODO:
+ * - add microbenchmarks
+ * - expand NNP testing
+ * - better arch-specific TRACE and TRAP handlers.
+ * - endianness checking when appropriate
+ * - 64-bit arg prodding
+ * - arch value testing (x86 modes especially)
+ * - ...
+ */
+
+TEST_HARNESS_MAIN
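For readers who want to poke at the BPF filter layout these tests rely on
outside of the harness, here is a minimal standalone sketch. It is not part
of the patch; the choice of getppid() and the exact uapi headers are
illustrative assumptions. It installs an allow-everything filter that kills
the task on getppid(), the same SECCOMP_RET_KILL path the filters above use:

/* Hypothetical standalone demo, not part of this patch. */
#include <stddef.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <linux/filter.h>
#include <linux/seccomp.h>

int main(void)
{
	struct sock_filter filter[] = {
		/* Load the syscall number from seccomp_data. */
		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
			 offsetof(struct seccomp_data, nr)),
		/* Fall through to KILL on getppid(), skip to ALLOW otherwise. */
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_getppid, 0, 1),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len = (unsigned short)(sizeof(filter) / sizeof(filter[0])),
		.filter = filter,
	};

	/* Required so an unprivileged task may install a filter. */
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
		return 1;
	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0))
		return 1;

	printf("filter installed; getppid() will now kill this task\n");
	syscall(__NR_getppid);	/* delivers SIGSYS and never returns */
	return 0;
}

Run on its own, the program prints its message and is then killed with SIGSYS
on the getppid() call.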
diff --git a/tools/testing/selftests/seccomp/test_harness.h b/tools/testing/selftests/seccomp/test_harness.h
new file mode 100644
index 000000000000..977a6afc4489
--- /dev/null
+++ b/tools/testing/selftests/seccomp/test_harness.h
@@ -0,0 +1,537 @@
+/*
+ * Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
+ * Use of this source code is governed by the GPLv2 license.
+ *
+ * test_harness.h: simple C unit test helper.
+ *
+ * Usage:
+ * #include "test_harness.h"
+ * TEST(standalone_test) {
+ * do_some_stuff;
+ * EXPECT_GT(10, stuff) {
+ * stuff_state_t state;
+ * enumerate_stuff_state(&state);
+ * TH_LOG("expectation failed with state: %s", state.msg);
+ * }
+ * more_stuff;
+ * ASSERT_NE(some_stuff, NULL) TH_LOG("how did it happen?!");
+ * last_stuff;
+ * EXPECT_EQ(0, last_stuff);
+ * }
+ *
+ * FIXTURE(my_fixture) {
+ * mytype_t *data;
+ * int awesomeness_level;
+ * };
+ * FIXTURE_SETUP(my_fixture) {
+ * self->data = mytype_new();
+ * ASSERT_NE(NULL, self->data);
+ * }
+ * FIXTURE_TEARDOWN(my_fixture) {
+ * mytype_free(self->data);
+ * }
+ * TEST_F(my_fixture, data_is_good) {
+ * EXPECT_EQ(1, is_my_data_good(self->data));
+ * }
+ *
+ * TEST_HARNESS_MAIN
+ *
+ * API inspired by code.google.com/p/googletest
+ */
+#ifndef TEST_HARNESS_H_
+#define TEST_HARNESS_H_
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+/* All exported functionality should be declared through this macro. */
+#define TEST_API(x) _##x
+
+/*
+ * Exported APIs
+ */
+
+/* TEST(name) { implementation }
+ * Defines a test by name.
+ * Names must be unique and tests must not be run in parallel. The
+ * implementation-containing block is a function and scoping should be treated
+ * as such. Returning early may be performed with a bare "return;" statement.
+ *
+ * EXPECT_* and ASSERT_* are valid in a TEST() { } context.
+ */
+#define TEST TEST_API(TEST)
+
+/* TEST_SIGNAL(name, signal) { implementation }
+ * Defines a test by name and the expected term signal.
+ * Names must be unique and tests must not be run in parallel. The
+ * implementation-containing block is a function and scoping should be treated
+ * as such. Returning early may be performed with a bare "return;" statement.
+ *
+ * EXPECT_* and ASSERT_* are valid in a TEST() { } context.
+ */
+#define TEST_SIGNAL TEST_API(TEST_SIGNAL)
+
+/* FIXTURE(datatype name) {
+ * type property1;
+ * ...
+ * };
+ * Defines the data provided to TEST_F()-defined tests as |self|. It should be
+ * populated and cleaned up using FIXTURE_SETUP and FIXTURE_TEARDOWN.
+ */
+#define FIXTURE TEST_API(FIXTURE)
+
+/* FIXTURE_DATA(datatype name)
+ * This call may be used when the type of the fixture data
+ * is needed. In general, this should not be needed unless
+ * the |self| is being passed to a helper directly.
+ */
+#define FIXTURE_DATA TEST_API(FIXTURE_DATA)
+
+/* FIXTURE_SETUP(fixture name) { implementation }
+ * Populates the required "setup" function for a fixture. An instance of the
+ * datatype defined with _FIXTURE_DATA will be exposed as |self| for the
+ * implementation.
+ *
+ * ASSERT_* are valid for use in this context and will preempt the execution
+ * of any dependent fixture tests.
+ *
+ * A bare "return;" statement may be used to return early.
+ */
+#define FIXTURE_SETUP TEST_API(FIXTURE_SETUP)
+
+/* FIXTURE_TEARDOWN(fixture name) { implementation }
+ * Populates the required "teardown" function for a fixture. An instance of the
+ * datatype defined with _FIXTURE_DATA will be exposed as |self| for the
+ * implementation to clean up.
+ *
+ * A bare "return;" statement may be used to return early.
+ */
+#define FIXTURE_TEARDOWN TEST_API(FIXTURE_TEARDOWN)
+
+/* TEST_F(fixture, name) { implementation }
+ * Defines a test that depends on a fixture (e.g., is part of a test case).
+ * Very similar to TEST() except that |self| is the setup instance of fixture's
+ * datatype exposed for use by the implementation.
+ */
+#define TEST_F TEST_API(TEST_F)
+
+#define TEST_F_SIGNAL TEST_API(TEST_F_SIGNAL)
+
+/* Use once to append a main() to the test file. E.g.,
+ * TEST_HARNESS_MAIN
+ */
+#define TEST_HARNESS_MAIN TEST_API(TEST_HARNESS_MAIN)
+
+/*
+ * Operators for use in TEST and TEST_F.
+ * ASSERT_* calls will stop test execution immediately.
+ * EXPECT_* calls will emit a failure warning, note it, and continue.
+ */
+
+/* ASSERT_EQ(expected, measured): expected == measured */
+#define ASSERT_EQ TEST_API(ASSERT_EQ)
+/* ASSERT_NE(expected, measured): expected != measured */
+#define ASSERT_NE TEST_API(ASSERT_NE)
+/* ASSERT_LT(expected, measured): expected < measured */
+#define ASSERT_LT TEST_API(ASSERT_LT)
+/* ASSERT_LE(expected, measured): expected <= measured */
+#define ASSERT_LE TEST_API(ASSERT_LE)
+/* ASSERT_GT(expected, measured): expected > measured */
+#define ASSERT_GT TEST_API(ASSERT_GT)
+/* ASSERT_GE(expected, measured): expected >= measured */
+#define ASSERT_GE TEST_API(ASSERT_GE)
+/* ASSERT_NULL(measured): NULL == measured */
+#define ASSERT_NULL TEST_API(ASSERT_NULL)
+/* ASSERT_TRUE(measured): measured != 0 */
+#define ASSERT_TRUE TEST_API(ASSERT_TRUE)
+/* ASSERT_FALSE(measured): measured == 0 */
+#define ASSERT_FALSE TEST_API(ASSERT_FALSE)
+/* ASSERT_STREQ(expected, measured): !strcmp(expected, measured) */
+#define ASSERT_STREQ TEST_API(ASSERT_STREQ)
+/* ASSERT_STRNE(expected, measured): strcmp(expected, measured) */
+#define ASSERT_STRNE TEST_API(ASSERT_STRNE)
+/* EXPECT_EQ(expected, measured): expected == measured */
+#define EXPECT_EQ TEST_API(EXPECT_EQ)
+/* EXPECT_NE(expected, measured): expected != measured */
+#define EXPECT_NE TEST_API(EXPECT_NE)
+/* EXPECT_LT(expected, measured): expected < measured */
+#define EXPECT_LT TEST_API(EXPECT_LT)
+/* EXPECT_LE(expected, measured): expected <= measured */
+#define EXPECT_LE TEST_API(EXPECT_LE)
+/* EXPECT_GT(expected, measured): expected > measured */
+#define EXPECT_GT TEST_API(EXPECT_GT)
+/* EXPECT_GE(expected, measured): expected >= measured */
+#define EXPECT_GE TEST_API(EXPECT_GE)
+/* EXPECT_NULL(measured): NULL == measured */
+#define EXPECT_NULL TEST_API(EXPECT_NULL)
+/* EXPECT_TRUE(measured): 0 != measured */
+#define EXPECT_TRUE TEST_API(EXPECT_TRUE)
+/* EXPECT_FALSE(measured): 0 == measured */
+#define EXPECT_FALSE TEST_API(EXPECT_FALSE)
+/* EXPECT_STREQ(expected, measured): !strcmp(expected, measured) */
+#define EXPECT_STREQ TEST_API(EXPECT_STREQ)
+/* EXPECT_STRNE(expected, measured): strcmp(expected, measured) */
+#define EXPECT_STRNE TEST_API(EXPECT_STRNE)
+
+/* TH_LOG(format, ...)
+ * Optional debug logging function available for use in tests.
+ * Logging may be enabled or disabled by defining TH_LOG_ENABLED.
+ * E.g., #define TH_LOG_ENABLED 1
+ * If no definition is provided, logging is enabled by default.
+ */
+#define TH_LOG TEST_API(TH_LOG)
+
+/*
+ * Internal implementation.
+ *
+ */
+
+/* Utilities exposed to the test definitions */
+#ifndef TH_LOG_STREAM
+# define TH_LOG_STREAM stderr
+#endif
+
+#ifndef TH_LOG_ENABLED
+# define TH_LOG_ENABLED 1
+#endif
+
+#define _TH_LOG(fmt, ...) do { \
+ if (TH_LOG_ENABLED) \
+ __TH_LOG(fmt, ##__VA_ARGS__); \
+} while (0)
+
+/* Unconditional logger for internal use. */
+#define __TH_LOG(fmt, ...) \
+ fprintf(TH_LOG_STREAM, "%s:%d:%s:" fmt "\n", \
+ __FILE__, __LINE__, _metadata->name, ##__VA_ARGS__)
+
+/* Defines the test function and creates the registration stub. */
+#define _TEST(test_name) __TEST_IMPL(test_name, -1)
+
+#define _TEST_SIGNAL(test_name, signal) __TEST_IMPL(test_name, signal)
+
+#define __TEST_IMPL(test_name, _signal) \
+ static void test_name(struct __test_metadata *_metadata); \
+ static struct __test_metadata _##test_name##_object = \
+ { name: "global." #test_name, \
+ fn: &test_name, termsig: _signal }; \
+ static void __attribute__((constructor)) _register_##test_name(void) \
+ { \
+ __register_test(&_##test_name##_object); \
+ } \
+ static void test_name( \
+ struct __test_metadata __attribute__((unused)) *_metadata)
+
+/* Wraps the struct name so we have one less argument to pass around. */
+#define _FIXTURE_DATA(fixture_name) struct _test_data_##fixture_name
+
+/* Called once per fixture to setup the data and register. */
+#define _FIXTURE(fixture_name) \
+ static void __attribute__((constructor)) \
+ _register_##fixture_name##_data(void) \
+ { \
+ __fixture_count++; \
+ } \
+ _FIXTURE_DATA(fixture_name)
+
+/* Prepares the setup function for the fixture. |_metadata| is included
+ * so that ASSERT_* work as a convenience.
+ */
+#define _FIXTURE_SETUP(fixture_name) \
+ void fixture_name##_setup( \
+ struct __test_metadata __attribute__((unused)) *_metadata, \
+ _FIXTURE_DATA(fixture_name) __attribute__((unused)) *self)
+#define _FIXTURE_TEARDOWN(fixture_name) \
+ void fixture_name##_teardown( \
+ struct __test_metadata __attribute__((unused)) *_metadata, \
+ _FIXTURE_DATA(fixture_name) __attribute__((unused)) *self)
+
+/* Emits test registration and helpers for fixture-based test
+ * cases.
+ * TODO(wad) register fixtures on dedicated test lists.
+ */
+#define _TEST_F(fixture_name, test_name) \
+ __TEST_F_IMPL(fixture_name, test_name, -1)
+
+#define _TEST_F_SIGNAL(fixture_name, test_name, signal) \
+ __TEST_F_IMPL(fixture_name, test_name, signal)
+
+#define __TEST_F_IMPL(fixture_name, test_name, signal) \
+ static void fixture_name##_##test_name( \
+ struct __test_metadata *_metadata, \
+ _FIXTURE_DATA(fixture_name) *self); \
+ static inline void wrapper_##fixture_name##_##test_name( \
+ struct __test_metadata *_metadata) \
+ { \
+ /* fixture data is alloced, setup, and torn down per call. */ \
+ _FIXTURE_DATA(fixture_name) self; \
+ memset(&self, 0, sizeof(_FIXTURE_DATA(fixture_name))); \
+ fixture_name##_setup(_metadata, &self); \
+ /* Let setup failure terminate early. */ \
+ if (!_metadata->passed) \
+ return; \
+ fixture_name##_##test_name(_metadata, &self); \
+ fixture_name##_teardown(_metadata, &self); \
+ } \
+ static struct __test_metadata \
+ _##fixture_name##_##test_name##_object = { \
+ name: #fixture_name "." #test_name, \
+ fn: &wrapper_##fixture_name##_##test_name, \
+ termsig: signal, \
+ }; \
+ static void __attribute__((constructor)) \
+ _register_##fixture_name##_##test_name(void) \
+ { \
+ __register_test(&_##fixture_name##_##test_name##_object); \
+ } \
+ static void fixture_name##_##test_name( \
+ struct __test_metadata __attribute__((unused)) *_metadata, \
+ _FIXTURE_DATA(fixture_name) __attribute__((unused)) *self)
+
+/* Exports a simple wrapper to run the test harness. */
+#define _TEST_HARNESS_MAIN \
+ static void __attribute__((constructor)) \
+ __constructor_order_last(void) \
+ { \
+ if (!__constructor_order) \
+ __constructor_order = _CONSTRUCTOR_ORDER_BACKWARD; \
+ } \
+ int main(int argc, char **argv) { \
+ return test_harness_run(argc, argv); \
+ }
+
+#define _ASSERT_EQ(_expected, _seen) \
+ __EXPECT(_expected, _seen, ==, 1)
+#define _ASSERT_NE(_expected, _seen) \
+ __EXPECT(_expected, _seen, !=, 1)
+#define _ASSERT_LT(_expected, _seen) \
+ __EXPECT(_expected, _seen, <, 1)
+#define _ASSERT_LE(_expected, _seen) \
+ __EXPECT(_expected, _seen, <=, 1)
+#define _ASSERT_GT(_expected, _seen) \
+ __EXPECT(_expected, _seen, >, 1)
+#define _ASSERT_GE(_expected, _seen) \
+ __EXPECT(_expected, _seen, >=, 1)
+#define _ASSERT_NULL(_seen) \
+ __EXPECT(NULL, _seen, ==, 1)
+
+#define _ASSERT_TRUE(_seen) \
+ _ASSERT_NE(0, _seen)
+#define _ASSERT_FALSE(_seen) \
+ _ASSERT_EQ(0, _seen)
+#define _ASSERT_STREQ(_expected, _seen) \
+ __EXPECT_STR(_expected, _seen, ==, 1)
+#define _ASSERT_STRNE(_expected, _seen) \
+ __EXPECT_STR(_expected, _seen, !=, 1)
+
+#define _EXPECT_EQ(_expected, _seen) \
+ __EXPECT(_expected, _seen, ==, 0)
+#define _EXPECT_NE(_expected, _seen) \
+ __EXPECT(_expected, _seen, !=, 0)
+#define _EXPECT_LT(_expected, _seen) \
+ __EXPECT(_expected, _seen, <, 0)
+#define _EXPECT_LE(_expected, _seen) \
+ __EXPECT(_expected, _seen, <=, 0)
+#define _EXPECT_GT(_expected, _seen) \
+ __EXPECT(_expected, _seen, >, 0)
+#define _EXPECT_GE(_expected, _seen) \
+ __EXPECT(_expected, _seen, >=, 0)
+
+#define _EXPECT_NULL(_seen) \
+ __EXPECT(NULL, _seen, ==, 0)
+#define _EXPECT_TRUE(_seen) \
+ _EXPECT_NE(0, _seen)
+#define _EXPECT_FALSE(_seen) \
+ _EXPECT_EQ(0, _seen)
+
+#define _EXPECT_STREQ(_expected, _seen) \
+ __EXPECT_STR(_expected, _seen, ==, 0)
+#define _EXPECT_STRNE(_expected, _seen) \
+ __EXPECT_STR(_expected, _seen, !=, 0)
+
+#define ARRAY_SIZE(a) (sizeof(a) / sizeof(a[0]))
+
+/* Support an optional handler after an ASSERT_* or EXPECT_*. The approach is
+ * not thread-safe, but it should be fine in most sane test scenarios.
+ *
+ * Using __bail(), which optionally abort()s, is the easiest way to early
+ * return while still providing an optional block to the API consumer.
+ */
+#define OPTIONAL_HANDLER(_assert) \
+ for (; _metadata->trigger; _metadata->trigger = __bail(_assert))
+
+#define __EXPECT(_expected, _seen, _t, _assert) do { \
+ /* Avoid multiple evaluation of the cases */ \
+ __typeof__(_expected) __exp = (_expected); \
+ __typeof__(_seen) __seen = (_seen); \
+ if (!(__exp _t __seen)) { \
+ unsigned long long __exp_print = 0; \
+ unsigned long long __seen_print = 0; \
+ /* Avoid casting complaints the scariest way we can. */ \
+ memcpy(&__exp_print, &__exp, sizeof(__exp)); \
+ memcpy(&__seen_print, &__seen, sizeof(__seen)); \
+ __TH_LOG("Expected %s (%llu) %s %s (%llu)", \
+ #_expected, __exp_print, #_t, \
+ #_seen, __seen_print); \
+ _metadata->passed = 0; \
+ /* Ensure the optional handler is triggered */ \
+ _metadata->trigger = 1; \
+ } \
+} while (0); OPTIONAL_HANDLER(_assert)
+
+#define __EXPECT_STR(_expected, _seen, _t, _assert) do { \
+ const char *__exp = (_expected); \
+ const char *__seen = (_seen); \
+ if (!(strcmp(__exp, __seen) _t 0)) { \
+ __TH_LOG("Expected '%s' %s '%s'.", __exp, #_t, __seen); \
+ _metadata->passed = 0; \
+ _metadata->trigger = 1; \
+ } \
+} while (0); OPTIONAL_HANDLER(_assert)
+
+/* Contains all the information for test execution and status checking. */
+struct __test_metadata {
+ const char *name;
+ void (*fn)(struct __test_metadata *);
+ int termsig;
+ int passed;
+ int trigger; /* extra handler after the evaluation */
+ struct __test_metadata *prev, *next;
+};
+
+/* Storage for the (global) tests to be run. */
+static struct __test_metadata *__test_list;
+static unsigned int __test_count;
+static unsigned int __fixture_count;
+static int __constructor_order;
+
+#define _CONSTRUCTOR_ORDER_FORWARD 1
+#define _CONSTRUCTOR_ORDER_BACKWARD -1
+
+/*
+ * Since constructors are called in reverse order, reverse the test
+ * list so tests are run in source declaration order.
+ * https://gcc.gnu.org/onlinedocs/gccint/Initialization.html
+ * However, it seems not all toolchains do this correctly, so use
+ * __constructor_order to detect which direction is called first
+ * and adjust list building logic to get things running in the right
+ * direction.
+ */
+static inline void __register_test(struct __test_metadata *t)
+{
+ __test_count++;
+ /* Circular linked list where only prev is circular. */
+ if (__test_list == NULL) {
+ __test_list = t;
+ t->next = NULL;
+ t->prev = t;
+ return;
+ }
+ if (__constructor_order == _CONSTRUCTOR_ORDER_FORWARD) {
+ t->next = NULL;
+ t->prev = __test_list->prev;
+ t->prev->next = t;
+ __test_list->prev = t;
+ } else {
+ t->next = __test_list;
+ t->next->prev = t;
+ t->prev = t;
+ __test_list = t;
+ }
+}
+
+static inline int __bail(int for_realz)
+{
+ if (for_realz)
+ abort();
+ return 0;
+}
+
+void __run_test(struct __test_metadata *t)
+{
+ pid_t child_pid;
+ int status;
+
+ t->passed = 1;
+ t->trigger = 0;
+ printf("[ RUN ] %s\n", t->name);
+ child_pid = fork();
+ if (child_pid < 0) {
+ printf("ERROR SPAWNING TEST CHILD\n");
+ t->passed = 0;
+ } else if (child_pid == 0) {
+ t->fn(t);
+ _exit(t->passed);
+ } else {
+ /* TODO(wad) add timeout support. */
+ waitpid(child_pid, &status, 0);
+ if (WIFEXITED(status)) {
+ t->passed = t->termsig == -1 ? WEXITSTATUS(status) : 0;
+ if (t->termsig != -1) {
+ fprintf(TH_LOG_STREAM,
+ "%s: Test exited normally "
+ "instead of by signal (code: %d)\n",
+ t->name,
+ WEXITSTATUS(status));
+ }
+ } else if (WIFSIGNALED(status)) {
+ t->passed = 0;
+ if (WTERMSIG(status) == SIGABRT) {
+ fprintf(TH_LOG_STREAM,
+ "%s: Test terminated by assertion\n",
+ t->name);
+ } else if (WTERMSIG(status) == t->termsig) {
+ t->passed = 1;
+ } else {
+ fprintf(TH_LOG_STREAM,
+ "%s: Test terminated unexpectedly "
+ "by signal %d\n",
+ t->name,
+ WTERMSIG(status));
+ }
+ } else {
+ fprintf(TH_LOG_STREAM,
+ "%s: Test ended in some other way [%u]\n",
+ t->name,
+ status);
+ }
+ }
+ printf("[ %4s ] %s\n", (t->passed ? "OK" : "FAIL"), t->name);
+}
+
+static int test_harness_run(int __attribute__((unused)) argc,
+ char __attribute__((unused)) **argv)
+{
+ struct __test_metadata *t;
+ int ret = 0;
+ unsigned int count = 0;
+ unsigned int pass_count = 0;
+
+ /* TODO(wad) add optional arguments similar to gtest. */
+ printf("[==========] Running %u tests from %u test cases.\n",
+ __test_count, __fixture_count + 1);
+ for (t = __test_list; t; t = t->next) {
+ count++;
+ __run_test(t);
+ if (t->passed)
+ pass_count++;
+ else
+ ret = 1;
+ }
+ printf("[==========] %u / %u tests passed.\n", pass_count, count);
+ printf("[ %s ]\n", (ret ? "FAILED" : "PASSED"));
+ return ret;
+}
+
+static void __attribute__((constructor)) __constructor_order_first(void)
+{
+ if (!__constructor_order)
+ __constructor_order = _CONSTRUCTOR_ORDER_FORWARD;
+}
+
+#endif /* TEST_HARNESS_H_ */
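A usage note on the harness above: the OPTIONAL_HANDLER() for-loop that
__EXPECT() appends is what lets a brace block follow an EXPECT_*/ASSERT_*
call and run only when the comparison fails, with __bail() turning ASSERT_*
failures into an abort(). A minimal sketch, assuming test_harness.h is on the
include path; the second expectation fails on purpose so the handler block
runs:

#include "test_harness.h"

TEST(optional_handler_demo)
{
	int ret = 2 + 2;

	/* Passes: the optional handler block never executes. */
	EXPECT_EQ(4, ret);

	/* Fails intentionally: _metadata->trigger is set, so the block
	 * runs exactly once and the test is marked as failed.
	 */
	EXPECT_EQ(5, ret) {
		TH_LOG("ret was %d, not 5", ret);
	}

	/* Passes; had it failed, __bail(1) would abort() after the block. */
	ASSERT_EQ(4, ret) {
		TH_LOG("not reached");
	}
}

TEST_HARNESS_MAIN

Built as an ordinary C file, this registers one test through the constructor
attribute, forks it in __run_test(), logs the failed expectation via TH_LOG(),
and prints FAIL for the demo test.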
diff --git a/tools/testing/selftests/timers/.gitignore b/tools/testing/selftests/timers/.gitignore
new file mode 100644
index 000000000000..ced998151bc4
--- /dev/null
+++ b/tools/testing/selftests/timers/.gitignore
@@ -0,0 +1,18 @@
+alarmtimer-suspend
+change_skew
+clocksource-switch
+inconsistency-check
+leap-a-day
+leapcrash
+mqueue-lat
+nanosleep
+nsleep-lat
+posix_timers
+raw_skew
+rtctest
+set-2038
+set-tai
+set-timer-lat
+skew_consistency
+threadtest
+valid-adjtimex
diff --git a/tools/testing/selftests/timers/alarmtimer-suspend.c b/tools/testing/selftests/timers/alarmtimer-suspend.c
index aaffbde1d5ee..72cacf5383dd 100644
--- a/tools/testing/selftests/timers/alarmtimer-suspend.c
+++ b/tools/testing/selftests/timers/alarmtimer-suspend.c
@@ -57,7 +57,7 @@ static inline int ksft_exit_fail(void)
#define NSEC_PER_SEC 1000000000ULL
-#define UNREASONABLE_LAT (NSEC_PER_SEC * 4) /* hopefully we resume in 4secs */
+#define UNREASONABLE_LAT (NSEC_PER_SEC * 5) /* hopefully we resume in 5 secs */
#define SUSPEND_SECS 15
int alarmcount;
@@ -152,7 +152,11 @@ int main(void)
alarm_clock_id++) {
alarmcount = 0;
- timer_create(alarm_clock_id, &se, &tm1);
+ if (timer_create(alarm_clock_id, &se, &tm1) == -1) {
+ printf("timer_create failled, %s unspported?\n",
+ clockstring(alarm_clock_id));
+ break;
+ }
clock_gettime(alarm_clock_id, &start_time);
printf("Start time (%s): %ld:%ld\n", clockstring(alarm_clock_id),
@@ -172,7 +176,7 @@ int main(void)
while (alarmcount < 10) {
int ret;
- sleep(1);
+ sleep(3);
ret = system("echo mem > /sys/power/state");
if (ret)
break;
diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
index a5ce9534eb15..231b9a031f6a 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -1,7 +1,12 @@
# Makefile for vm selftests
CFLAGS = -Wall
-BINARIES = hugepage-mmap hugepage-shm map_hugetlb thuge-gen hugetlbfstest
+BINARIES = compaction_test
+BINARIES += hugepage-mmap
+BINARIES += hugepage-shm
+BINARIES += hugetlbfstest
+BINARIES += map_hugetlb
+BINARIES += thuge-gen
BINARIES += transhuge-stress
all: $(BINARIES)
diff --git a/tools/testing/selftests/vm/compaction_test.c b/tools/testing/selftests/vm/compaction_test.c
new file mode 100644
index 000000000000..932ff577ffc0
--- /dev/null
+++ b/tools/testing/selftests/vm/compaction_test.c
@@ -0,0 +1,225 @@
+/*
+ *
+ * A test for the patch "Allow compaction of unevictable pages".
+ * With this patch we should be able to allocate at least 1/4
+ * of RAM in huge pages. Without the patch much less is
+ * allocated.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/resource.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <unistd.h>
+#include <string.h>
+
+#define MAP_SIZE 1048576
+
+struct map_list {
+ void *map;
+ struct map_list *next;
+};
+
+int read_memory_info(unsigned long *memfree, unsigned long *hugepagesize)
+{
+ char buffer[256] = {0};
+ char *cmd = "cat /proc/meminfo | grep -i memfree | grep -o '[0-9]*'";
+ FILE *cmdfile = popen(cmd, "r");
+
+ if (!(fgets(buffer, sizeof(buffer), cmdfile))) {
+ perror("Failed to read meminfo\n");
+ return -1;
+ }
+
+ pclose(cmdfile);
+
+ *memfree = atoll(buffer);
+ cmd = "cat /proc/meminfo | grep -i hugepagesize | grep -o '[0-9]*'";
+ cmdfile = popen(cmd, "r");
+
+ if (!(fgets(buffer, sizeof(buffer), cmdfile))) {
+ perror("Failed to read meminfo\n");
+ return -1;
+ }
+
+ pclose(cmdfile);
+ *hugepagesize = atoll(buffer);
+
+ return 0;
+}
+
+int prereq(void)
+{
+ char allowed;
+ int fd;
+
+ fd = open("/proc/sys/vm/compact_unevictable_allowed",
+ O_RDONLY | O_NONBLOCK);
+ if (fd < 0) {
+ perror("Failed to open\n"
+ "/proc/sys/vm/compact_unevictable_allowed\n");
+ return -1;
+ }
+
+ if (read(fd, &allowed, sizeof(char)) != sizeof(char)) {
+ perror("Failed to read from\n"
+ "/proc/sys/vm/compact_unevictable_allowed\n");
+ close(fd);
+ return -1;
+ }
+
+ close(fd);
+ if (allowed == '1')
+ return 0;
+
+ return -1;
+}
+
+int check_compaction(unsigned long mem_free, unsigned int hugepage_size)
+{
+ int fd;
+ int compaction_index = 0;
+ char initial_nr_hugepages[10] = {0};
+ char nr_hugepages[10] = {0};
+
+ /* We want to test with 80% of available memory. Otherwise, the OOM
+ killer comes into play */
+ mem_free = mem_free * 0.8;
+
+ fd = open("/proc/sys/vm/nr_hugepages", O_RDWR | O_NONBLOCK);
+ if (fd < 0) {
+ perror("Failed to open /proc/sys/vm/nr_hugepages");
+ return -1;
+ }
+
+ if (read(fd, initial_nr_hugepages, sizeof(initial_nr_hugepages)) <= 0) {
+ perror("Failed to read from /proc/sys/vm/nr_hugepages");
+ goto close_fd;
+ }
+
+ /* Start with the initial condition of 0 huge pages */
+ if (write(fd, "0", sizeof(char)) != sizeof(char)) {
+ perror("Failed to write to /proc/sys/vm/nr_hugepages\n");
+ goto close_fd;
+ }
+
+ lseek(fd, 0, SEEK_SET);
+
+ /* Request a large number of huge pages. The kernel will allocate
+ as many as it can */
+ if (write(fd, "100000", (6*sizeof(char))) != (6*sizeof(char))) {
+ perror("Failed to write to /proc/sys/vm/nr_hugepages\n");
+ goto close_fd;
+ }
+
+ lseek(fd, 0, SEEK_SET);
+
+ if (read(fd, nr_hugepages, sizeof(nr_hugepages)) <= 0) {
+ perror("Failed to read from /proc/sys/vm/nr_hugepages\n");
+ goto close_fd;
+ }
+
+ /* We should have been able to request at least 1/3rd of the memory in
+ huge pages */
+ compaction_index = mem_free/(atoi(nr_hugepages) * hugepage_size);
+
+ if (compaction_index > 3) {
+ printf("No of huge pages allocated = %d\n",
+ (atoi(nr_hugepages)));
+ fprintf(stderr, "ERROR: Less that 1/%d of memory is available\n"
+ "as huge pages\n", compaction_index);
+ goto close_fd;
+ }
+
+ printf("No of huge pages allocated = %d\n",
+ (atoi(nr_hugepages)));
+
+ if (write(fd, initial_nr_hugepages, strlen(initial_nr_hugepages))
+ != strlen(initial_nr_hugepages)) {
+ perror("Failed to write to /proc/sys/vm/nr_hugepages\n");
+ goto close_fd;
+ }
+
+ close(fd);
+ return 0;
+
+ close_fd:
+ close(fd);
+ printf("Not OK. Compaction test failed.");
+ return -1;
+}
+
+
+int main(int argc, char **argv)
+{
+ struct rlimit lim;
+ struct map_list *list, *entry;
+ size_t page_size, i;
+ void *map = NULL;
+ unsigned long mem_free = 0;
+ unsigned long hugepage_size = 0;
+ unsigned long mem_fragmentable = 0;
+
+ if (prereq() != 0) {
+ printf("Either the sysctl compact_unevictable_allowed is not\n"
+ "set to 1 or couldn't read the proc file.\n"
+ "Skipping the test\n");
+ return 0;
+ }
+
+ lim.rlim_cur = RLIM_INFINITY;
+ lim.rlim_max = RLIM_INFINITY;
+ if (setrlimit(RLIMIT_MEMLOCK, &lim)) {
+ perror("Failed to set rlimit:\n");
+ return -1;
+ }
+
+ page_size = getpagesize();
+
+ list = NULL;
+
+ if (read_memory_info(&mem_free, &hugepage_size) != 0) {
+ printf("ERROR: Cannot read meminfo\n");
+ return -1;
+ }
+
+ mem_fragmentable = mem_free * 0.8 / 1024;
+
+ while (mem_fragmentable > 0) {
+ map = mmap(NULL, MAP_SIZE, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE | MAP_LOCKED, -1, 0);
+ if (map == MAP_FAILED)
+ break;
+
+ entry = malloc(sizeof(struct map_list));
+ if (!entry) {
+ munmap(map, MAP_SIZE);
+ break;
+ }
+ entry->map = map;
+ entry->next = list;
+ list = entry;
+
+ /* Write something (in this case the address of the map) to
+ * ensure that KSM can't merge the mapped pages
+ */
+ for (i = 0; i < MAP_SIZE; i += page_size)
+ *(unsigned long *)(map + i) = (unsigned long)map + i;
+
+ mem_fragmentable--;
+ }
+
+ /* Unmap everything; the list nodes themselves are reclaimed at exit. */
+ for (entry = list; entry != NULL; entry = entry->next)
+ munmap(entry->map, MAP_SIZE);
+
+ if (check_compaction(mem_free, hugepage_size) == 0)
+ return 0;
+
+ return -1;
+}
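To make the pass criterion above concrete: check_compaction() scales the free
memory by 0.8 and then fails whenever compaction_index = mem_free /
(nr_hugepages * hugepage_size) exceeds 3. With 2048 kB huge pages and, say,
6,000,000 kB of scaled free memory (both figures in kB, as read from
/proc/meminfo), the integer check passes once roughly 733 or more huge pages
have been allocated (about 1.5 GB, a little over a quarter of the scaled free
memory).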
diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests
index c87b6812300d..49ece11ff7fd 100755
--- a/tools/testing/selftests/vm/run_vmtests
+++ b/tools/testing/selftests/vm/run_vmtests
@@ -90,4 +90,16 @@ fi
umount $mnt
rm -rf $mnt
echo $nr_hugepgs > /proc/sys/vm/nr_hugepages
+
+echo "-----------------------"
+echo "running compaction_test"
+echo "-----------------------"
+./compaction_test
+if [ $? -ne 0 ]; then
+ echo "[FAIL]"
+ exitcode=1
+else
+ echo "[PASS]"
+fi
+
exit $exitcode
diff --git a/tools/testing/selftests/x86/trivial_64bit_program.c b/tools/testing/selftests/x86/trivial_64bit_program.c
index b994946c40fb..05c6a41b3671 100644
--- a/tools/testing/selftests/x86/trivial_64bit_program.c
+++ b/tools/testing/selftests/x86/trivial_64bit_program.c
@@ -1,5 +1,5 @@
/*
- * Trivial program to check that we have a valid 32-bit build environment.
+ * Trivial program to check that we have a valid 64-bit build environment.
* Copyright (c) 2015 Andy Lutomirski
* GPL v2
*/