618 files changed, 32292 insertions, 14682 deletions
diff --git a/Documentation/ABI/testing/configfs-rdma_cm b/Documentation/ABI/testing/configfs-rdma_cm
new file mode 100644
index 000000000000..5c389aaf5291
--- /dev/null
+++ b/Documentation/ABI/testing/configfs-rdma_cm
@@ -0,0 +1,22 @@
+What:		/config/rdma_cm
+Date:		November 29, 2015
+KernelVersion:	4.4.0
+Description:	Interface is used to configure RDMA-capable HCAs with respect
+		to RDMA-CM attributes.
+
+		Attributes are visible only when configfs is mounted. To mount
+		configfs in the /config directory use:
+		# mount -t configfs none /config/
+
+		In order to set parameters related to a specific HCA, a directory
+		for this HCA has to be created:
+		mkdir -p /config/rdma_cm/<hca>
+
+
+What:		/config/rdma_cm/<hca>/ports/<port-num>/default_roce_mode
+Date:		November 29, 2015
+KernelVersion:	4.4.0
+Description:	RDMA-CM based connections from HCA <hca> at port <port-num>
+		will be initiated with this RoCE type as default.
+		The possible RoCE types are either "IB/RoCE v1" or "RoCE v2".
+		This parameter has RW access.
diff --git a/Documentation/ABI/testing/sysfs-class-infiniband b/Documentation/ABI/testing/sysfs-class-infiniband
new file mode 100644
index 000000000000..a86abe66a316
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-class-infiniband
@@ -0,0 +1,16 @@
+What:		/sys/class/infiniband/<hca>/ports/<port-number>/gid_attrs/ndevs/<gid-index>
+Date:		November 29, 2015
+KernelVersion:	4.4.0
+Contact:	linux-rdma@vger.kernel.org
+Description:	The name of the net-device associated with the GID
+		at index <gid-index>.
+
+What:		/sys/class/infiniband/<hca>/ports/<port-number>/gid_attrs/types/<gid-index>
+Date:		November 29, 2015
+KernelVersion:	4.4.0
+Contact:	linux-rdma@vger.kernel.org
+Description:	The RoCE type of the GID at index <gid-index>. This is
+		either "IB/RoCE v1" for IB and RoCE v1 based GIDs
+		or "RoCE v2" for RoCE v2 based GIDs.
+
+
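A minimal C sketch of the flow these two new ABI files describe, assuming a hypothetical HCA named "mlx5_0", port 1 and GID index 0, with configfs already mounted on /config and the <hca> directory created as shown above: it writes the default RoCE mode through configfs, then reads one GID's RoCE type back through sysfs.

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    #define HCA  "mlx5_0"   /* hypothetical HCA name; substitute your device */
    #define PORT 1

    int main(void)
    {
        char path[128], buf[32];
        ssize_t n;
        int fd;

        /* Select "RoCE v2" as the default for new RDMA-CM connections. */
        snprintf(path, sizeof(path),
                 "/config/rdma_cm/%s/ports/%d/default_roce_mode", HCA, PORT);
        fd = open(path, O_WRONLY);
        if (fd < 0 || write(fd, "RoCE v2", strlen("RoCE v2")) < 0)
            perror(path);
        if (fd >= 0)
            close(fd);

        /* Read back the RoCE type of the GID at index 0. */
        snprintf(path, sizeof(path),
                 "/sys/class/infiniband/%s/ports/%d/gid_attrs/types/0", HCA, PORT);
        fd = open(path, O_RDONLY);
        if (fd >= 0 && (n = read(fd, buf, sizeof(buf) - 1)) > 0) {
            buf[n] = '\0';
            printf("GID 0 type: %s", buf);
        }
        if (fd >= 0)
            close(fd);
        return 0;
    }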
diff --git a/Documentation/devicetree/bindings/input/gpio-keys.txt b/Documentation/devicetree/bindings/input/gpio-keys.txt
index cf1333d1dd52..21641236c095 100644
--- a/Documentation/devicetree/bindings/input/gpio-keys.txt
+++ b/Documentation/devicetree/bindings/input/gpio-keys.txt
@@ -6,6 +6,7 @@ Required properties:
 Optional properties:
 - autorepeat: Boolean, Enable auto repeat feature of Linux input subsystem.
+ - label: String, name of the input device.
 
 Each button (key) is represented as a sub-node of "gpio-keys":
 Subnode properties:
diff --git a/Documentation/devicetree/bindings/thermal/rockchip-thermal.txt b/Documentation/devicetree/bindings/thermal/rockchip-thermal.txt
index 0dfa60d88dd3..08efe6bc2193 100644
--- a/Documentation/devicetree/bindings/thermal/rockchip-thermal.txt
+++ b/Documentation/devicetree/bindings/thermal/rockchip-thermal.txt
@@ -2,8 +2,10 @@
 Required properties:
 - compatible : should be "rockchip,<name>-tsadc"
+   "rockchip,rk3228-tsadc": found on RK3228 SoCs
    "rockchip,rk3288-tsadc": found on RK3288 SoCs
    "rockchip,rk3368-tsadc": found on RK3368 SoCs
+   "rockchip,rk3399-tsadc": found on RK3399 SoCs
 - reg : physical base address of the controller and length of memory mapped region.
 - interrupts : The interrupt number to the cpu. The interrupt specifier format
   depends on the interrupt controller.
diff --git a/Documentation/infiniband/core_locking.txt b/Documentation/infiniband/core_locking.txt
index e1678542279a..4b1f36b6ada0 100644
--- a/Documentation/infiniband/core_locking.txt
+++ b/Documentation/infiniband/core_locking.txt
@@ -15,7 +15,6 @@ Sleeping and interrupt context
     modify_ah
     query_ah
     destroy_ah
-    bind_mw
     post_send
     post_recv
     poll_cq
@@ -31,7 +30,6 @@ Sleeping and interrupt context
     ib_modify_ah
     ib_query_ah
     ib_destroy_ah
-    ib_bind_mw
     ib_post_send
     ib_post_recv
     ib_req_notify_cq
diff --git a/Documentation/kernel-per-CPU-kthreads.txt b/Documentation/kernel-per-CPU-kthreads.txt
index f4cbfe0ba108..edec3a3e648d 100644
--- a/Documentation/kernel-per-CPU-kthreads.txt
+++ b/Documentation/kernel-per-CPU-kthreads.txt
@@ -90,7 +90,7 @@ BLOCK_SOFTIRQ: Do all of the following:
 	from being initiated from tasks that might run on the CPU to
 	be de-jittered. (It is OK to force this CPU offline and then
 	bring it back online before you start your application.)
-BLOCK_IOPOLL_SOFTIRQ: Do all of the following:
+IRQ_POLL_SOFTIRQ: Do all of the following:
 1.	Force block-device interrupts onto some other CPU.
 2.	Initiate any block I/O and block-I/O polling on other CPUs.
 3.	Once your application has started, prevent CPU-hotplug operations
diff --git a/Documentation/sysctl/fs.txt b/Documentation/sysctl/fs.txt
index 88152f214f48..302b5ed616a6 100644
--- a/Documentation/sysctl/fs.txt
+++ b/Documentation/sysctl/fs.txt
@@ -32,6 +32,8 @@ Currently, these files are in /proc/sys/fs:
 - nr_open
 - overflowuid
 - overflowgid
+- pipe-user-pages-hard
+- pipe-user-pages-soft
 - protected_hardlinks
 - protected_symlinks
 - suid_dumpable
@@ -159,6 +161,27 @@ The default is 65534.
 
 ==============================================================
 
+pipe-user-pages-hard:
+
+Maximum total number of pages a non-privileged user may allocate for pipes.
+Once this limit is reached, no new pipes may be allocated until usage goes
+below the limit again. When set to 0, no limit is applied, which is the default
+setting.
+
+==============================================================
+
+pipe-user-pages-soft:
+
+Maximum total number of pages a non-privileged user may allocate for pipes
+before the pipe size gets limited to a single page. Once this limit is reached,
+new pipes will be limited to a single page in size for this user in order to
+limit total memory usage, and trying to increase them using fcntl() will be
+denied until usage goes below the limit again. The default value allows
+allocating up to 1024 pipes at their default size. When set to 0, no limit is
+applied.
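A minimal C sketch of what the soft limit means for user space, assuming Linux's fcntl() pipe-resizing interface (F_GETPIPE_SZ/F_SETPIPE_SZ, glibc with _GNU_SOURCE): it grows a pipe's buffer and, once the calling user's pipe pages exceed pipe-user-pages-soft, the unprivileged resize above one page is denied as described above.

    #define _GNU_SOURCE         /* for F_GETPIPE_SZ / F_SETPIPE_SZ */
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        int fds[2];

        if (pipe(fds)) {
            perror("pipe");
            return 1;
        }
        /* Default capacity, typically 16 pages (65536 bytes). */
        printf("default pipe size: %d bytes\n", fcntl(fds[0], F_GETPIPE_SZ));

        /* Ask for 1 MiB; once this user is over pipe-user-pages-soft,
         * unprivileged growth beyond a single page fails with EPERM. */
        if (fcntl(fds[0], F_SETPIPE_SZ, 1 << 20) < 0)
            perror("F_SETPIPE_SZ");
        else
            printf("grown pipe size: %d bytes\n", fcntl(fds[0], F_GETPIPE_SZ));

        close(fds[0]);
        close(fds[1]);
        return 0;
    }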
+ +============================================================== + protected_hardlinks: A long-standing class of security issues is the hardlink-based diff --git a/MAINTAINERS b/MAINTAINERS index 4ad9d7c46cac..78f71f471bd6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3665,13 +3665,12 @@ F: drivers/scsi/dpt* F: drivers/scsi/dpt/ DRBD DRIVER -P: Philipp Reisner -P: Lars Ellenberg -M: drbd-dev@lists.linbit.com -L: drbd-user@lists.linbit.com +M: Philipp Reisner <philipp.reisner@linbit.com> +M: Lars Ellenberg <lars.ellenberg@linbit.com> +L: drbd-dev@lists.linbit.com W: http://www.drbd.org -T: git git://git.drbd.org/linux-2.6-drbd.git drbd -T: git git://git.drbd.org/drbd-8.3.git +T: git git://git.linbit.com/linux-drbd.git +T: git git://git.linbit.com/drbd-8.4.git S: Supported F: drivers/block/drbd/ F: lib/lru_cache.c @@ -7150,27 +7149,45 @@ W: https://linuxtv.org S: Odd Fixes F: drivers/media/radio/radio-miropcm20* -Mellanox MLX5 core VPI driver -M: Eli Cohen <eli@mellanox.com> +MELLANOX MLX4 core VPI driver +M: Yishai Hadas <yishaih@mellanox.com> L: netdev@vger.kernel.org L: linux-rdma@vger.kernel.org W: http://www.mellanox.com Q: http://patchwork.ozlabs.org/project/netdev/list/ +S: Supported +F: drivers/net/ethernet/mellanox/mlx4/ +F: include/linux/mlx4/ + +MELLANOX MLX4 IB driver +M: Yishai Hadas <yishaih@mellanox.com> +L: linux-rdma@vger.kernel.org +W: http://www.mellanox.com Q: http://patchwork.kernel.org/project/linux-rdma/list/ -T: git git://openfabrics.org/~eli/connect-ib.git +S: Supported +F: drivers/infiniband/hw/mlx4/ +F: include/linux/mlx4/ + +MELLANOX MLX5 core VPI driver +M: Matan Barak <matanb@mellanox.com> +M: Leon Romanovsky <leonro@mellanox.com> +L: netdev@vger.kernel.org +L: linux-rdma@vger.kernel.org +W: http://www.mellanox.com +Q: http://patchwork.ozlabs.org/project/netdev/list/ S: Supported F: drivers/net/ethernet/mellanox/mlx5/core/ F: include/linux/mlx5/ -Mellanox MLX5 IB driver -M: Eli Cohen <eli@mellanox.com> +MELLANOX MLX5 IB driver +M: Matan Barak <matanb@mellanox.com> +M: Leon Romanovsky <leonro@mellanox.com> L: linux-rdma@vger.kernel.org W: http://www.mellanox.com Q: http://patchwork.kernel.org/project/linux-rdma/list/ -T: git git://openfabrics.org/~eli/connect-ib.git S: Supported -F: include/linux/mlx5/ F: drivers/infiniband/hw/mlx5/ +F: include/linux/mlx5/ MELEXIS MLX90614 DRIVER M: Crt Mori <cmo@melexis.com> @@ -7701,6 +7718,12 @@ W: https://github.com/jonmason/ntb/wiki T: git git://github.com/jonmason/ntb.git F: drivers/ntb/hw/intel/ +NTB AMD DRIVER +M: Xiangliang Yu <Xiangliang.Yu@amd.com> +L: linux-ntb@googlegroups.com +S: Supported +F: drivers/ntb/hw/amd/ + NTFS FILESYSTEM M: Anton Altaparmakov <anton@tuxera.com> L: linux-ntfs-dev@lists.sourceforge.net @@ -10452,9 +10475,11 @@ S: Maintained F: drivers/net/ethernet/dlink/sundance.c SUPERH +M: Yoshinori Sato <ysato@users.sourceforge.jp> +M: Rich Felker <dalias@libc.org> L: linux-sh@vger.kernel.org Q: http://patchwork.kernel.org/project/linux-sh/list/ -S: Orphan +S: Maintained F: Documentation/sh/ F: arch/sh/ F: drivers/sh/ diff --git a/arch/arm/Kconfig.debug b/arch/arm/Kconfig.debug index 5c0e5cc8ed10..c6b6175d0203 100644 --- a/arch/arm/Kconfig.debug +++ b/arch/arm/Kconfig.debug @@ -153,10 +153,9 @@ choice mobile SoCs in the Kona family of chips (e.g. bcm28155, bcm11351, etc...) 
- config DEBUG_BCM63XX + config DEBUG_BCM63XX_UART bool "Kernel low-level debugging on BCM63XX UART" depends on ARCH_BCM_63XX - select DEBUG_UART_BCM63XX config DEBUG_BERLIN_UART bool "Marvell Berlin SoC Debug UART" @@ -1414,7 +1413,7 @@ config DEBUG_LL_INCLUDE default "debug/vf.S" if DEBUG_VF_UART default "debug/vt8500.S" if DEBUG_VT8500_UART0 default "debug/zynq.S" if DEBUG_ZYNQ_UART0 || DEBUG_ZYNQ_UART1 - default "debug/bcm63xx.S" if DEBUG_UART_BCM63XX + default "debug/bcm63xx.S" if DEBUG_BCM63XX_UART default "debug/digicolor.S" if DEBUG_DIGICOLOR_UA0 default "mach/debug-macro.S" @@ -1428,10 +1427,6 @@ config DEBUG_UART_8250 ARCH_IOP13XX || ARCH_IOP32X || ARCH_IOP33X || ARCH_IXP4XX || \ ARCH_RPC -# Compatibility options for BCM63xx -config DEBUG_UART_BCM63XX - def_bool ARCH_BCM_63XX - config DEBUG_UART_PHYS hex "Physical base address of debug UART" default 0x00100a00 if DEBUG_NETX_UART @@ -1529,7 +1524,7 @@ config DEBUG_UART_PHYS default 0xfffb0000 if DEBUG_OMAP1UART1 || DEBUG_OMAP7XXUART1 default 0xfffb0800 if DEBUG_OMAP1UART2 || DEBUG_OMAP7XXUART2 default 0xfffb9800 if DEBUG_OMAP1UART3 || DEBUG_OMAP7XXUART3 - default 0xfffe8600 if DEBUG_UART_BCM63XX + default 0xfffe8600 if DEBUG_BCM63XX_UART default 0xfffff700 if ARCH_IOP33X depends on ARCH_EP93XX || \ DEBUG_LL_UART_8250 || DEBUG_LL_UART_PL01X || \ @@ -1542,7 +1537,7 @@ config DEBUG_UART_PHYS DEBUG_RMOBILE_SCIFA0 || DEBUG_RMOBILE_SCIFA1 || \ DEBUG_RMOBILE_SCIFA4 || DEBUG_S3C24XX_UART || \ DEBUG_S3C64XX_UART || \ - DEBUG_UART_BCM63XX || DEBUG_ASM9260_UART || \ + DEBUG_BCM63XX_UART || DEBUG_ASM9260_UART || \ DEBUG_SIRFSOC_UART || DEBUG_DIGICOLOR_UA0 || \ DEBUG_AT91_UART @@ -1588,7 +1583,7 @@ config DEBUG_UART_VIRT default 0xfb10c000 if DEBUG_REALVIEW_PB1176_PORT default 0xfc40ab00 if DEBUG_BRCMSTB_UART default 0xfc705000 if DEBUG_ZTE_ZX - default 0xfcfe8600 if DEBUG_UART_BCM63XX + default 0xfcfe8600 if DEBUG_BCM63XX_UART default 0xfd000000 if DEBUG_SPEAR3XX || DEBUG_SPEAR13XX default 0xfd012000 if DEBUG_MVEBU_UART0_ALTERNATE && ARCH_MV78XX0 default 0xfd883000 if DEBUG_ALPINE_UART0 @@ -1638,7 +1633,7 @@ config DEBUG_UART_VIRT DEBUG_NETX_UART || \ DEBUG_QCOM_UARTDM || DEBUG_S3C24XX_UART || \ DEBUG_S3C64XX_UART || \ - DEBUG_UART_BCM63XX || DEBUG_ASM9260_UART || \ + DEBUG_BCM63XX_UART || DEBUG_ASM9260_UART || \ DEBUG_SIRFSOC_UART || DEBUG_DIGICOLOR_UA0 config DEBUG_UART_8250_SHIFT diff --git a/arch/arm/boot/dts/r8a7740-armadillo800eva.dts b/arch/arm/boot/dts/r8a7740-armadillo800eva.dts index 78a21f2828df..c548cabb102f 100644 --- a/arch/arm/boot/dts/r8a7740-armadillo800eva.dts +++ b/arch/arm/boot/dts/r8a7740-armadillo800eva.dts @@ -180,7 +180,7 @@ }; &extal1_clk { - clock-frequency = <25000000>; + clock-frequency = <24000000>; }; &extal2_clk { clock-frequency = <48000000>; diff --git a/arch/arm/mach-realview/Makefile b/arch/arm/mach-realview/Makefile index 8be6632407d8..dae8d86ef4cc 100644 --- a/arch/arm/mach-realview/Makefile +++ b/arch/arm/mach-realview/Makefile @@ -4,10 +4,9 @@ ccflags-$(CONFIG_ARCH_MULTIPLATFORM) := -I$(srctree)/$(src)/include \ -I$(srctree)/arch/arm/plat-versatile/include - +obj-y := core.o obj-$(CONFIG_REALVIEW_DT) += realview-dt.o obj-$(CONFIG_SMP) += platsmp-dt.o -obj-y := core.o ifdef CONFIG_ATAGS obj-$(CONFIG_MACH_REALVIEW_EB) += realview_eb.o diff --git a/arch/arm/mach-tegra/Kconfig b/arch/arm/mach-tegra/Kconfig index a90f3556017f..0fa8b84ed657 100644 --- a/arch/arm/mach-tegra/Kconfig +++ b/arch/arm/mach-tegra/Kconfig @@ -13,57 +13,5 @@ menuconfig ARCH_TEGRA select ARCH_HAS_RESET_CONTROLLER select RESET_CONTROLLER 
select SOC_BUS - select USB_ULPI if USB_PHY - select USB_ULPI_VIEWPORT if USB_PHY help This enables support for NVIDIA Tegra based systems. - -if ARCH_TEGRA - -config ARCH_TEGRA_2x_SOC - bool "Enable support for Tegra20 family" - select ARCH_NEEDS_CPU_IDLE_COUPLED if SMP - select ARM_ERRATA_720789 - select ARM_ERRATA_754327 if SMP - select ARM_ERRATA_764369 if SMP - select PINCTRL_TEGRA20 - select PL310_ERRATA_727915 if CACHE_L2X0 - select PL310_ERRATA_769419 if CACHE_L2X0 - select TEGRA_TIMER - help - Support for NVIDIA Tegra AP20 and T20 processors, based on the - ARM CortexA9MP CPU and the ARM PL310 L2 cache controller - -config ARCH_TEGRA_3x_SOC - bool "Enable support for Tegra30 family" - select ARM_ERRATA_754322 - select ARM_ERRATA_764369 if SMP - select PINCTRL_TEGRA30 - select PL310_ERRATA_769419 if CACHE_L2X0 - select TEGRA_TIMER - help - Support for NVIDIA Tegra T30 processor family, based on the - ARM CortexA9MP CPU and the ARM PL310 L2 cache controller - -config ARCH_TEGRA_114_SOC - bool "Enable support for Tegra114 family" - select ARM_ERRATA_798181 if SMP - select ARM_L1_CACHE_SHIFT_6 - select HAVE_ARM_ARCH_TIMER - select PINCTRL_TEGRA114 - select TEGRA_TIMER - help - Support for NVIDIA Tegra T114 processor family, based on the - ARM CortexA15MP CPU - -config ARCH_TEGRA_124_SOC - bool "Enable support for Tegra124 family" - select ARM_L1_CACHE_SHIFT_6 - select HAVE_ARM_ARCH_TIMER - select PINCTRL_TEGRA124 - select TEGRA_TIMER - help - Support for NVIDIA Tegra T124 processor family, based on the - ARM CortexA15MP CPU - -endif diff --git a/arch/arm/mach-tegra/sleep-tegra20.S b/arch/arm/mach-tegra/sleep-tegra20.S index e6b684e14322..f5d19667484e 100644 --- a/arch/arm/mach-tegra/sleep-tegra20.S +++ b/arch/arm/mach-tegra/sleep-tegra20.S @@ -231,8 +231,11 @@ ENDPROC(tegra20_cpu_is_resettable_soon) * tegra20_tear_down_core in IRAM */ ENTRY(tegra20_sleep_core_finish) + mov r4, r0 /* Flush, disable the L1 data cache and exit SMP */ + mov r0, #TEGRA_FLUSH_CACHE_ALL bl tegra_disable_clean_inv_dcache + mov r0, r4 mov32 r3, tegra_shut_off_mmu add r3, r3, r0 diff --git a/arch/arm/mach-tegra/sleep-tegra30.S b/arch/arm/mach-tegra/sleep-tegra30.S index 9a2f0b051e10..16e5ff03383c 100644 --- a/arch/arm/mach-tegra/sleep-tegra30.S +++ b/arch/arm/mach-tegra/sleep-tegra30.S @@ -242,8 +242,11 @@ ENDPROC(tegra30_cpu_shutdown) * tegra30_tear_down_core in IRAM */ ENTRY(tegra30_sleep_core_finish) + mov r4, r0 /* Flush, disable the L1 data cache and exit SMP */ + mov r0, #TEGRA_FLUSH_CACHE_ALL bl tegra_disable_clean_inv_dcache + mov r0, r4 /* * Preload all the address literals that are needed for the diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index 534a60ae282e..0eca3812527e 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -1200,10 +1200,7 @@ error: while (i--) if (pages[i]) __free_pages(pages[i], 0); - if (array_size <= PAGE_SIZE) - kfree(pages); - else - vfree(pages); + kvfree(pages); return NULL; } @@ -1211,7 +1208,6 @@ static int __iommu_free_buffer(struct device *dev, struct page **pages, size_t size, struct dma_attrs *attrs) { int count = size >> PAGE_SHIFT; - int array_size = count * sizeof(struct page *); int i; if (dma_get_attr(DMA_ATTR_FORCE_CONTIGUOUS, attrs)) { @@ -1222,10 +1218,7 @@ static int __iommu_free_buffer(struct device *dev, struct page **pages, __free_pages(pages[i], 0); } - if (array_size <= PAGE_SIZE) - kfree(pages); - else - vfree(pages); + kvfree(pages); return 0; } diff --git a/arch/arm64/Kconfig.platforms 
b/arch/arm64/Kconfig.platforms index 2c400412b3bc..21074f674bde 100644 --- a/arch/arm64/Kconfig.platforms +++ b/arch/arm64/Kconfig.platforms @@ -105,18 +105,6 @@ config ARCH_TEGRA help This enables support for the NVIDIA Tegra SoC family. -config ARCH_TEGRA_132_SOC - bool "NVIDIA Tegra132 SoC" - depends on ARCH_TEGRA - select PINCTRL_TEGRA124 - select USB_ULPI if USB_PHY - select USB_ULPI_VIEWPORT if USB_PHY - help - Enable support for NVIDIA Tegra132 SoC, based on the Denver - ARMv8 CPU. The Tegra132 SoC is similar to the Tegra124 SoC, - but contains an NVIDIA Denver CPU complex in place of - Tegra124's "4+1" Cortex-A15 CPU complex. - config ARCH_SPRD bool "Spreadtrum SoC platform" help diff --git a/arch/arm64/boot/dts/Makefile b/arch/arm64/boot/dts/Makefile index 76e7510835b2..f832b8a7453a 100644 --- a/arch/arm64/boot/dts/Makefile +++ b/arch/arm64/boot/dts/Makefile @@ -9,6 +9,7 @@ dts-dirs += freescale dts-dirs += hisilicon dts-dirs += marvell dts-dirs += mediatek +dts-dirs += nvidia dts-dirs += qcom dts-dirs += renesas dts-dirs += rockchip diff --git a/arch/arm64/boot/dts/nvidia/Makefile b/arch/arm64/boot/dts/nvidia/Makefile new file mode 100644 index 000000000000..a7e865da1005 --- /dev/null +++ b/arch/arm64/boot/dts/nvidia/Makefile @@ -0,0 +1,7 @@ +dtb-$(CONFIG_ARCH_TEGRA_132_SOC) += tegra132-norrin.dtb +dtb-$(CONFIG_ARCH_TEGRA_210_SOC) += tegra210-p2371-0000.dtb +dtb-$(CONFIG_ARCH_TEGRA_210_SOC) += tegra210-p2371-2180.dtb +dtb-$(CONFIG_ARCH_TEGRA_210_SOC) += tegra210-p2571.dtb + +always := $(dtb-y) +clean-files := *.dtb diff --git a/arch/arm64/boot/dts/nvidia/tegra132-norrin.dts b/arch/arm64/boot/dts/nvidia/tegra132-norrin.dts new file mode 100644 index 000000000000..7dfe1c085966 --- /dev/null +++ b/arch/arm64/boot/dts/nvidia/tegra132-norrin.dts @@ -0,0 +1,1130 @@ +/dts-v1/; + +#include <dt-bindings/input/input.h> +#include "tegra132.dtsi" + +/ { + model = "NVIDIA Tegra132 Norrin"; + compatible = "nvidia,norrin", "nvidia,tegra132", "nvidia,tegra124"; + + aliases { + rtc0 = "/i2c@0,7000d000/as3722@40"; + rtc1 = "/rtc@0,7000e000"; + }; + + memory { + device_type = "memory"; + reg = <0x0 0x80000000 0x0 0x80000000>; + }; + + host1x@0,50000000 { + hdmi@0,54280000 { + status = "disabled"; + + vdd-supply = <&vdd_3v3_hdmi>; + pll-supply = <&vdd_hdmi_pll>; + hdmi-supply = <&vdd_5v0_hdmi>; + + nvidia,ddc-i2c-bus = <&hdmi_ddc>; + nvidia,hpd-gpio = + <&gpio TEGRA_GPIO(N, 7) GPIO_ACTIVE_HIGH>; + }; + + sor@0,54540000 { + status = "okay"; + + nvidia,dpaux = <&dpaux>; + nvidia,panel = <&panel>; + }; + + dpaux: dpaux@0,545c0000 { + vdd-supply = <&vdd_3v3_panel>; + status = "okay"; + }; + }; + + gpu@0,57000000 { + status = "okay"; + + vdd-supply = <&vdd_gpu>; + }; + + pinmux@0,70000868 { + pinctrl-names = "default"; + pinctrl-0 = <&pinmux_default>; + + pinmux_default: pinmux@0 { + dap_mclk1_pw4 { + nvidia,pins = "dap_mclk1_pw4"; + nvidia,function = "extperiph1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + }; + dap2_din_pa4 { + nvidia,pins = "dap2_din_pa4"; + nvidia,function = "i2s1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + }; + dap2_dout_pa5 { + nvidia,pins = "dap2_dout_pa5", + "dap2_fs_pa2", + "dap2_sclk_pa3"; + nvidia,function = "i2s1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + }; + dap3_dout_pp2 { + nvidia,pins = "dap3_dout_pp2"; + nvidia,function = 
"i2s2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + }; + dvfs_pwm_px0 { + nvidia,pins = "dvfs_pwm_px0", + "dvfs_clk_px2"; + nvidia,function = "cldvfs"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + }; + ulpi_clk_py0 { + nvidia,pins = "ulpi_clk_py0", + "ulpi_nxt_py2", + "ulpi_stp_py3"; + nvidia,function = "spi1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + }; + ulpi_dir_py1 { + nvidia,pins = "ulpi_dir_py1"; + nvidia,function = "spi1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + }; + cam_i2c_scl_pbb1 { + nvidia,pins = "cam_i2c_scl_pbb1", + "cam_i2c_sda_pbb2"; + nvidia,function = "i2c3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,lock = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_ENABLE>; + }; + gen2_i2c_scl_pt5 { + nvidia,pins = "gen2_i2c_scl_pt5", + "gen2_i2c_sda_pt6"; + nvidia,function = "i2c2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,lock = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_ENABLE>; + }; + pj7 { + nvidia,pins = "pj7"; + nvidia,function = "uartd"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + }; + spdif_in_pk6 { + nvidia,pins = "spdif_in_pk6"; + nvidia,function = "spdif"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + }; + pk7 { + nvidia,pins = "pk7"; + nvidia,function = "uartd"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + }; + pg4 { + nvidia,pins = "pg4", + "pg5", + "pg6", + "pi3"; + nvidia,function = "spi4"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + }; + pg7 { + nvidia,pins = "pg7"; + nvidia,function = "spi4"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + }; + ph1 { + nvidia,pins = "ph1"; + nvidia,function = "pwm1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + }; + pk0 { + nvidia,pins = "pk0", + "kb_row15_ps7", + "clk_32k_out_pa0"; + nvidia,function = "soc"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + }; + sdmmc1_clk_pz0 { + nvidia,pins = "sdmmc1_clk_pz0"; + nvidia,function = "sdmmc1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + }; + sdmmc1_cmd_pz1 { + nvidia,pins = "sdmmc1_cmd_pz1", + "sdmmc1_dat0_py7", + "sdmmc1_dat1_py6", + "sdmmc1_dat2_py5", + "sdmmc1_dat3_py4"; + nvidia,function = "sdmmc1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + }; + sdmmc3_clk_pa6 { + nvidia,pins = "sdmmc3_clk_pa6"; + nvidia,function = "sdmmc3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = 
<TEGRA_PIN_ENABLE>; + }; + sdmmc3_cmd_pa7 { + nvidia,pins = "sdmmc3_cmd_pa7", + "sdmmc3_dat0_pb7", + "sdmmc3_dat1_pb6", + "sdmmc3_dat2_pb5", + "sdmmc3_dat3_pb4", + "kb_col4_pq4", + "sdmmc3_clk_lb_out_pee4", + "sdmmc3_clk_lb_in_pee5", + "sdmmc3_cd_n_pv2"; + nvidia,function = "sdmmc3"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + }; + sdmmc4_clk_pcc4 { + nvidia,pins = "sdmmc4_clk_pcc4"; + nvidia,function = "sdmmc4"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + }; + sdmmc4_cmd_pt7 { + nvidia,pins = "sdmmc4_cmd_pt7", + "sdmmc4_dat0_paa0", + "sdmmc4_dat1_paa1", + "sdmmc4_dat2_paa2", + "sdmmc4_dat3_paa3", + "sdmmc4_dat4_paa4", + "sdmmc4_dat5_paa5", + "sdmmc4_dat6_paa6", + "sdmmc4_dat7_paa7"; + nvidia,function = "sdmmc4"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + }; + mic_det_l { + nvidia,pins = "kb_row7_pr7"; + nvidia,function = "rsvd2"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + }; + kb_row10_ps2 { + nvidia,pins = "kb_row10_ps2"; + nvidia,function = "uarta"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + }; + kb_row9_ps1 { + nvidia,pins = "kb_row9_ps1"; + nvidia,function = "uarta"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + }; + pwr_i2c_scl_pz6 { + nvidia,pins = "pwr_i2c_scl_pz6", + "pwr_i2c_sda_pz7"; + nvidia,function = "i2cpwr"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,lock = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_ENABLE>; + }; + jtag_rtck { + nvidia,pins = "jtag_rtck"; + nvidia,function = "rtck"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + }; + clk_32k_in { + nvidia,pins = "clk_32k_in"; + nvidia,function = "clk"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + }; + core_pwr_req { + nvidia,pins = "core_pwr_req"; + nvidia,function = "pwron"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + }; + cpu_pwr_req { + nvidia,pins = "cpu_pwr_req"; + nvidia,function = "cpu"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + }; + kb_col0_ap { + nvidia,pins = "kb_col0_pq0"; + nvidia,function = "rsvd4"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + }; + en_vdd_sd { + nvidia,pins = "kb_row0_pr0"; + nvidia,function = "rsvd4"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + }; + lid_open { + nvidia,pins = "kb_row4_pr4"; + nvidia,function = "rsvd3"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + }; + pwr_int_n { + nvidia,pins = "pwr_int_n"; + nvidia,function = "pmi"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + }; + 
reset_out_n { + nvidia,pins = "reset_out_n"; + nvidia,function = "reset_out_n"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + }; + clk3_out_pee0 { + nvidia,pins = "clk3_out_pee0"; + nvidia,function = "extperiph3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + }; + gen1_i2c_scl_pc4 { + nvidia,pins = "gen1_i2c_scl_pc4", + "gen1_i2c_sda_pc5"; + nvidia,function = "i2c1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,lock = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_ENABLE>; + }; + hdmi_cec_pee3 { + nvidia,pins = "hdmi_cec_pee3"; + nvidia,function = "cec"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,lock = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + hdmi_int_pn7 { + nvidia,pins = "hdmi_int_pn7"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + }; + ddc_scl_pv4 { + nvidia,pins = "ddc_scl_pv4", + "ddc_sda_pv5"; + nvidia,function = "i2c4"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,lock = <TEGRA_PIN_DISABLE>; + nvidia,rcv-sel = <TEGRA_PIN_ENABLE>; + }; + usb_vbus_en0_pn4 { + nvidia,pins = "usb_vbus_en0_pn4", + "usb_vbus_en1_pn5", + "usb_vbus_en2_pff1"; + nvidia,function = "usb"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,lock = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + drive_sdio1 { + nvidia,pins = "drive_sdio1"; + nvidia,high-speed-mode = <TEGRA_PIN_ENABLE>; + nvidia,schmitt = <TEGRA_PIN_DISABLE>; + nvidia,pull-down-strength = <36>; + nvidia,pull-up-strength = <20>; + nvidia,slew-rate-rising = <TEGRA_PIN_SLEW_RATE_SLOW>; + nvidia,slew-rate-falling = <TEGRA_PIN_SLEW_RATE_SLOW>; + }; + drive_sdio3 { + nvidia,pins = "drive_sdio3"; + nvidia,high-speed-mode = <TEGRA_PIN_ENABLE>; + nvidia,schmitt = <TEGRA_PIN_DISABLE>; + nvidia,pull-down-strength = <22>; + nvidia,pull-up-strength = <36>; + nvidia,slew-rate-rising = <TEGRA_PIN_SLEW_RATE_FASTEST>; + nvidia,slew-rate-falling = <TEGRA_PIN_SLEW_RATE_FASTEST>; + }; + drive_gma { + nvidia,pins = "drive_gma"; + nvidia,high-speed-mode = <TEGRA_PIN_ENABLE>; + nvidia,schmitt = <TEGRA_PIN_DISABLE>; + nvidia,pull-down-strength = <2>; + nvidia,pull-up-strength = <1>; + nvidia,slew-rate-rising = <TEGRA_PIN_SLEW_RATE_FASTEST>; + nvidia,slew-rate-falling = <TEGRA_PIN_SLEW_RATE_FASTEST>; + nvidia,drive-type = <1>; + }; + ac_ok { + nvidia,pins = "pj0"; + nvidia,function = "gmi"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + }; + codec_irq_l { + nvidia,pins = "ph4"; + nvidia,function = "gmi"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + }; + lcd_bl_en { + nvidia,pins = "ph2"; + nvidia,function = "gmi"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + }; + touch_irq_l { + nvidia,pins = "gpio_w3_aud_pw3"; + nvidia,function = "spi6"; + nvidia,pull = 
<TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + }; + tpm_davint_l { + nvidia,pins = "ph6"; + nvidia,function = "gmi"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + }; + ts_irq_l { + nvidia,pins = "pk2"; + nvidia,function = "gmi"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + }; + ts_reset_l { + nvidia,pins = "pk4"; + nvidia,function = "gmi"; + nvidia,pull = <1>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + }; + ts_shdn_l { + nvidia,pins = "pk1"; + nvidia,function = "gmi"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + }; + ph7 { + nvidia,pins = "ph7"; + nvidia,function = "gmi"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + }; + sensor_irq_l { + nvidia,pins = "pi6"; + nvidia,function = "gmi"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + }; + wifi_en { + nvidia,pins = "gpio_x7_aud_px7"; + nvidia,function = "rsvd4"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + }; + chromeos_write_protect { + nvidia,pins = "kb_row1_pr1"; + nvidia,function = "rsvd4"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + }; + hp_det_l { + nvidia,pins = "pi7"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + }; + soc_warm_reset_l { + nvidia,pins = "pi5"; + nvidia,function = "gmi"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + }; + }; + }; + + serial@0,70006000 { + status = "okay"; + }; + + pwm: pwm@0,7000a000 { + status = "okay"; + }; + + /* HDMI DDC */ + hdmi_ddc: i2c@0,7000c700 { + status = "okay"; + clock-frequency = <100000>; + }; + + i2c@0,7000d000 { + status = "okay"; + clock-frequency = <400000>; + + as3722: pmic@40 { + compatible = "ams,as3722"; + reg = <0x40>; + interrupts = <GIC_SPI 86 IRQ_TYPE_LEVEL_HIGH>; + + ams,system-power-controller; + + #interrupt-cells = <2>; + interrupt-controller; + + #gpio-cells = <2>; + gpio-controller; + + pinctrl-names = "default"; + pinctrl-0 = <&as3722_default>; + + as3722_default: pinmux@0 { + gpio0 { + pins = "gpio0"; + function = "gpio"; + bias-pull-down; + }; + + gpio1 { + pins = "gpio1"; + function = "gpio"; + bias-pull-up; + }; + + gpio2_4_7 { + pins = "gpio2", "gpio4", "gpio7"; + function = "gpio"; + bias-pull-up; + }; + + gpio3 { + pins = "gpio3"; + function = "gpio"; + bias-high-impedance; + }; + + gpio5 { + pins = "gpio5"; + function = "clk32k-out"; + bias-pull-down; + }; + + gpio6 { + pins = "gpio6"; + function = "clk32k-out"; + bias-pull-down; + }; + }; + + regulators { + vsup-sd2-supply = <&vdd_5v0_sys>; + vsup-sd3-supply = <&vdd_5v0_sys>; + vsup-sd4-supply = <&vdd_5v0_sys>; + vsup-sd5-supply = <&vdd_5v0_sys>; + vin-ldo0-supply = <&vdd_1v35_lp0>; + vin-ldo1-6-supply = <&vdd_3v3_sys>; + vin-ldo2-5-7-supply = <&vddio_1v8>; + vin-ldo3-4-supply = <&vdd_3v3_sys>; + vin-ldo9-10-supply = <&vdd_5v0_sys>; + vin-ldo11-supply = 
<&vdd_3v3_run>; + + sd0 { + regulator-name = "+VDD_CPU_AP"; + regulator-min-microvolt = <700000>; + regulator-max-microvolt = <1350000>; + regulator-max-microamp = <3500000>; + regulator-always-on; + regulator-boot-on; + ams,ext-control = <2>; + }; + + sd1 { + regulator-name = "+VDD_CORE"; + regulator-min-microvolt = <700000>; + regulator-max-microvolt = <1350000>; + regulator-max-microamp = <4000000>; + regulator-always-on; + regulator-boot-on; + ams,ext-control = <1>; + }; + + vdd_1v35_lp0: sd2 { + regulator-name = "+1.35V_LP0(sd2)"; + regulator-min-microvolt = <1350000>; + regulator-max-microvolt = <1350000>; + regulator-always-on; + regulator-boot-on; + }; + + sd3 { + regulator-name = "+1.35V_LP0(sd3)"; + regulator-min-microvolt = <1350000>; + regulator-max-microvolt = <1350000>; + regulator-always-on; + regulator-boot-on; + }; + + vdd_1v05_run: sd4 { + regulator-name = "+1.05V_RUN"; + regulator-min-microvolt = <1050000>; + regulator-max-microvolt = <1050000>; + }; + + vddio_1v8: sd5 { + regulator-name = "+1.8V_VDDIO"; + regulator-min-microvolt = <1800000>; + regulator-max-microvolt = <1800000>; + regulator-always-on; + regulator-boot-on; + }; + + vdd_gpu: sd6 { + regulator-name = "+VDD_GPU_AP"; + regulator-min-microvolt = <800000>; + regulator-max-microvolt = <1200000>; + regulator-min-microamp = <3500000>; + regulator-max-microamp = <3500000>; + regulator-always-on; + regulator-boot-on; + }; + + ldo0 { + regulator-name = "+1.05_RUN_AVDD"; + regulator-min-microvolt = <1050000>; + regulator-max-microvolt = <1050000>; + regulator-always-on; + regulator-boot-on; + ams,ext-control = <1>; + }; + + ldo1 { + regulator-name = "+1.8V_RUN_CAM"; + regulator-min-microvolt = <1800000>; + regulator-max-microvolt = <1800000>; + }; + + ldo2 { + regulator-name = "+1.2V_GEN_AVDD"; + regulator-min-microvolt = <1200000>; + regulator-max-microvolt = <1200000>; + regulator-always-on; + regulator-boot-on; + }; + + ldo3 { + regulator-name = "+1.00V_LP0_VDD_RTC"; + regulator-min-microvolt = <1000000>; + regulator-max-microvolt = <1000000>; + regulator-always-on; + regulator-boot-on; + ams,enable-tracking; + }; + + vdd_run_cam: ldo4 { + regulator-name = "+2.8V_RUN_CAM"; + regulator-min-microvolt = <2800000>; + regulator-max-microvolt = <2800000>; + }; + + ldo5 { + regulator-name = "+1.2V_RUN_CAM_FRONT"; + regulator-min-microvolt = <1200000>; + regulator-max-microvolt = <1200000>; + }; + + vddio_sdmmc3: ldo6 { + regulator-name = "+VDDIO_SDMMC3"; + regulator-min-microvolt = <1800000>; + regulator-max-microvolt = <3300000>; + }; + + ldo7 { + regulator-name = "+1.05V_RUN_CAM_REAR"; + regulator-min-microvolt = <1050000>; + regulator-max-microvolt = <1050000>; + }; + + ldo9 { + regulator-name = "+2.8V_RUN_TOUCH"; + regulator-min-microvolt = <2800000>; + regulator-max-microvolt = <2800000>; + }; + + ldo10 { + regulator-name = "+2.8V_RUN_CAM_AF"; + regulator-min-microvolt = <2800000>; + regulator-max-microvolt = <2800000>; + }; + + ldo11 { + regulator-name = "+1.8V_RUN_VPP_FUSE"; + regulator-min-microvolt = <1800000>; + regulator-max-microvolt = <1800000>; + }; + }; + }; + }; + + spi@0,7000d400 { + status = "okay"; + + ec: cros-ec@0 { + compatible = "google,cros-ec-spi"; + spi-max-frequency = <3000000>; + interrupt-parent = <&gpio>; + interrupts = <TEGRA_GPIO(C, 7) IRQ_TYPE_LEVEL_LOW>; + reg = <0>; + + google,cros-ec-spi-msg-delay = <2000>; + + i2c_20: i2c-tunnel { + compatible = "google,cros-ec-i2c-tunnel"; + #address-cells = <1>; + #size-cells = <0>; + + google,remote-bus = <0>; + + charger: bq24735 { + compatible = 
"ti,bq24735"; + reg = <0x9>; + interrupt-parent = <&gpio>; + interrupts = <TEGRA_GPIO(J, 0) + GPIO_ACTIVE_HIGH>; + ti,ac-detect-gpios = <&gpio + TEGRA_GPIO(J, 0) + GPIO_ACTIVE_HIGH>; + }; + + battery: smart-battery { + compatible = "sbs,sbs-battery"; + reg = <0xb>; + battery-name = "battery"; + sbs,i2c-retry-count = <2>; + sbs,poll-retry-count = <10>; + /* power-supplies = <&charger>; */ + }; + }; + + keyboard-controller { + compatible = "google,cros-ec-keyb"; + keypad,num-rows = <8>; + keypad,num-columns = <13>; + google,needs-ghost-filter; + linux,keymap = + <MATRIX_KEY(0x00, 0x01, KEY_LEFTMETA) + MATRIX_KEY(0x00, 0x02, KEY_F1) + MATRIX_KEY(0x00, 0x03, KEY_B) + MATRIX_KEY(0x00, 0x04, KEY_F10) + MATRIX_KEY(0x00, 0x06, KEY_N) + MATRIX_KEY(0x00, 0x08, KEY_EQUAL) + MATRIX_KEY(0x00, 0x0a, KEY_RIGHTALT) + + MATRIX_KEY(0x01, 0x01, KEY_ESC) + MATRIX_KEY(0x01, 0x02, KEY_F4) + MATRIX_KEY(0x01, 0x03, KEY_G) + MATRIX_KEY(0x01, 0x04, KEY_F7) + MATRIX_KEY(0x01, 0x06, KEY_H) + MATRIX_KEY(0x01, 0x08, KEY_APOSTROPHE) + MATRIX_KEY(0x01, 0x09, KEY_F9) + MATRIX_KEY(0x01, 0x0b, KEY_BACKSPACE) + + MATRIX_KEY(0x02, 0x00, KEY_LEFTCTRL) + MATRIX_KEY(0x02, 0x01, KEY_TAB) + MATRIX_KEY(0x02, 0x02, KEY_F3) + MATRIX_KEY(0x02, 0x03, KEY_T) + MATRIX_KEY(0x02, 0x04, KEY_F6) + MATRIX_KEY(0x02, 0x05, KEY_RIGHTBRACE) + MATRIX_KEY(0x02, 0x06, KEY_Y) + MATRIX_KEY(0x02, 0x07, KEY_102ND) + MATRIX_KEY(0x02, 0x08, KEY_LEFTBRACE) + MATRIX_KEY(0x02, 0x09, KEY_F8) + + MATRIX_KEY(0x03, 0x01, KEY_GRAVE) + MATRIX_KEY(0x03, 0x02, KEY_F2) + MATRIX_KEY(0x03, 0x03, KEY_5) + MATRIX_KEY(0x03, 0x04, KEY_F5) + MATRIX_KEY(0x03, 0x06, KEY_6) + MATRIX_KEY(0x03, 0x08, KEY_MINUS) + MATRIX_KEY(0x03, 0x0b, KEY_BACKSLASH) + + MATRIX_KEY(0x04, 0x00, KEY_RIGHTCTRL) + MATRIX_KEY(0x04, 0x01, KEY_A) + MATRIX_KEY(0x04, 0x02, KEY_D) + MATRIX_KEY(0x04, 0x03, KEY_F) + MATRIX_KEY(0x04, 0x04, KEY_S) + MATRIX_KEY(0x04, 0x05, KEY_K) + MATRIX_KEY(0x04, 0x06, KEY_J) + MATRIX_KEY(0x04, 0x08, KEY_SEMICOLON) + MATRIX_KEY(0x04, 0x09, KEY_L) + MATRIX_KEY(0x04, 0x0a, KEY_BACKSLASH) + MATRIX_KEY(0x04, 0x0b, KEY_ENTER) + + MATRIX_KEY(0x05, 0x01, KEY_Z) + MATRIX_KEY(0x05, 0x02, KEY_C) + MATRIX_KEY(0x05, 0x03, KEY_V) + MATRIX_KEY(0x05, 0x04, KEY_X) + MATRIX_KEY(0x05, 0x05, KEY_COMMA) + MATRIX_KEY(0x05, 0x06, KEY_M) + MATRIX_KEY(0x05, 0x07, KEY_LEFTSHIFT) + MATRIX_KEY(0x05, 0x08, KEY_SLASH) + MATRIX_KEY(0x05, 0x09, KEY_DOT) + MATRIX_KEY(0x05, 0x0b, KEY_SPACE) + + MATRIX_KEY(0x06, 0x01, KEY_1) + MATRIX_KEY(0x06, 0x02, KEY_3) + MATRIX_KEY(0x06, 0x03, KEY_4) + MATRIX_KEY(0x06, 0x04, KEY_2) + MATRIX_KEY(0x06, 0x05, KEY_8) + MATRIX_KEY(0x06, 0x06, KEY_7) + MATRIX_KEY(0x06, 0x08, KEY_0) + MATRIX_KEY(0x06, 0x09, KEY_9) + MATRIX_KEY(0x06, 0x0a, KEY_LEFTALT) + MATRIX_KEY(0x06, 0x0b, KEY_DOWN) + MATRIX_KEY(0x06, 0x0c, KEY_RIGHT) + + MATRIX_KEY(0x07, 0x01, KEY_Q) + MATRIX_KEY(0x07, 0x02, KEY_E) + MATRIX_KEY(0x07, 0x03, KEY_R) + MATRIX_KEY(0x07, 0x04, KEY_W) + MATRIX_KEY(0x07, 0x05, KEY_I) + MATRIX_KEY(0x07, 0x06, KEY_U) + MATRIX_KEY(0x07, 0x07, KEY_RIGHTSHIFT) + MATRIX_KEY(0x07, 0x08, KEY_P) + MATRIX_KEY(0x07, 0x09, KEY_O) + MATRIX_KEY(0x07, 0x0b, KEY_UP) + MATRIX_KEY(0x07, 0x0c, KEY_LEFT)>; + }; + }; + }; + + pmc@0,7000e400 { + nvidia,invert-interrupt; + nvidia,suspend-mode = <0>; + #wake-cells = <3>; + nvidia,cpu-pwr-good-time = <500>; + nvidia,cpu-pwr-off-time = <300>; + nvidia,core-pwr-good-time = <641 3845>; + nvidia,core-pwr-off-time = <61036>; + nvidia,core-power-req-active-high; + nvidia,sys-clock-req-active-high; + nvidia,reset-gpio = <&gpio TEGRA_GPIO(I, 5) GPIO_ACTIVE_LOW>; + 
}; + + /* WIFI/BT module */ + sdhci@0,700b0000 { + status = "disabled"; + }; + + /* external SD/MMC */ + sdhci@0,700b0400 { + cd-gpios = <&gpio TEGRA_GPIO(V, 2) GPIO_ACTIVE_LOW>; + power-gpios = <&gpio TEGRA_GPIO(R, 0) GPIO_ACTIVE_HIGH>; + wp-gpios = <&gpio TEGRA_GPIO(Q, 4) GPIO_ACTIVE_HIGH>; + status = "okay"; + bus-width = <4>; + vqmmc-supply = <&vddio_sdmmc3>; + }; + + /* EMMC 4.51 */ + sdhci@0,700b0600 { + status = "okay"; + bus-width = <8>; + non-removable; + }; + + usb@0,7d000000 { + status = "okay"; + }; + + usb-phy@0,7d000000 { + status = "okay"; + vbus-supply = <&vdd_usb1_vbus>; + }; + + usb@0,7d004000 { + status = "okay"; + }; + + usb-phy@0,7d004000 { + status = "okay"; + vbus-supply = <&vdd_run_cam>; + }; + + usb@0,7d008000 { + status = "okay"; + }; + + usb-phy@0,7d008000 { + status = "okay"; + vbus-supply = <&vdd_usb3_vbus>; + }; + + backlight: backlight { + compatible = "pwm-backlight"; + + enable-gpios = <&gpio TEGRA_GPIO(H, 2) GPIO_ACTIVE_HIGH>; + power-supply = <&vdd_led>; + pwms = <&pwm 1 1000000>; + + brightness-levels = <0 4 8 16 32 64 128 255>; + default-brightness-level = <6>; + + backlight-boot-off; + }; + + clocks { + compatible = "simple-bus"; + #address-cells = <1>; + #size-cells = <0>; + + clk32k_in: clock@0 { + compatible = "fixed-clock"; + reg=<0>; + #clock-cells = <0>; + clock-frequency = <32768>; + }; + }; + + gpio-keys { + compatible = "gpio-keys"; + + lid { + label = "Lid"; + gpios = <&gpio TEGRA_GPIO(R, 4) GPIO_ACTIVE_LOW>; + linux,input-type = <5>; + linux,code = <0>; + debounce-interval = <1>; + gpio-key,wakeup; + }; + + power { + label = "Power"; + gpios = <&gpio TEGRA_GPIO(Q, 0) GPIO_ACTIVE_LOW>; + linux,code = <KEY_POWER>; + debounce-interval = <10>; + gpio-key,wakeup; + }; + }; + + panel: panel { + compatible = "innolux,n116bge", "simple-panel"; + backlight = <&backlight>; + ddc-i2c-bus = <&dpaux>; + }; + + regulators { + compatible = "simple-bus"; + #address-cells = <1>; + #size-cells = <0>; + + vdd_mux: regulator@0 { + compatible = "regulator-fixed"; + reg = <0>; + regulator-name = "+VDD_MUX"; + regulator-min-microvolt = <19000000>; + regulator-max-microvolt = <19000000>; + regulator-always-on; + regulator-boot-on; + }; + + vdd_5v0_sys: regulator@1 { + compatible = "regulator-fixed"; + reg = <1>; + regulator-name = "+5V_SYS"; + regulator-min-microvolt = <5000000>; + regulator-max-microvolt = <5000000>; + regulator-always-on; + regulator-boot-on; + vin-supply = <&vdd_mux>; + }; + + vdd_3v3_sys: regulator@2 { + compatible = "regulator-fixed"; + reg = <2>; + regulator-name = "+3.3V_SYS"; + regulator-min-microvolt = <3300000>; + regulator-max-microvolt = <3300000>; + regulator-always-on; + regulator-boot-on; + vin-supply = <&vdd_mux>; + }; + + vdd_3v3_run: regulator@3 { + compatible = "regulator-fixed"; + reg = <3>; + regulator-name = "+3.3V_RUN"; + regulator-min-microvolt = <3300000>; + regulator-max-microvolt = <3300000>; + regulator-always-on; + regulator-boot-on; + gpio = <&as3722 1 GPIO_ACTIVE_HIGH>; + enable-active-high; + vin-supply = <&vdd_3v3_sys>; + }; + + vdd_3v3_hdmi: regulator@4 { + compatible = "regulator-fixed"; + reg = <4>; + regulator-name = "+3.3V_AVDD_HDMI_AP_GATED"; + regulator-min-microvolt = <3300000>; + regulator-max-microvolt = <3300000>; + vin-supply = <&vdd_3v3_run>; + }; + + vdd_led: regulator@5 { + compatible = "regulator-fixed"; + reg = <5>; + regulator-name = "+VDD_LED"; + regulator-min-microvolt = <3300000>; + regulator-max-microvolt = <3300000>; + gpio = <&gpio TEGRA_GPIO(P, 2) GPIO_ACTIVE_HIGH>; + enable-active-high; + 
vin-supply = <&vdd_mux>; + }; + + vdd_usb1_vbus: regulator@6 { + compatible = "regulator-fixed"; + reg = <6>; + regulator-name = "+5V_USB_HS"; + regulator-min-microvolt = <5000000>; + regulator-max-microvolt = <5000000>; + gpio = <&gpio TEGRA_GPIO(N, 4) GPIO_ACTIVE_HIGH>; + enable-active-high; + gpio-open-drain; + vin-supply = <&vdd_5v0_sys>; + }; + + vdd_usb3_vbus: regulator@7 { + compatible = "regulator-fixed"; + reg = <7>; + regulator-name = "+5V_USB_SS"; + regulator-min-microvolt = <5000000>; + regulator-max-microvolt = <5000000>; + gpio = <&gpio TEGRA_GPIO(N, 5) GPIO_ACTIVE_HIGH>; + enable-active-high; + gpio-open-drain; + vin-supply = <&vdd_5v0_sys>; + }; + + vdd_3v3_panel: regulator@8 { + compatible = "regulator-fixed"; + reg = <8>; + regulator-name = "+3.3V_PANEL"; + regulator-min-microvolt = <3300000>; + regulator-max-microvolt = <3300000>; + gpio = <&as3722 4 GPIO_ACTIVE_HIGH>; + enable-active-high; + vin-supply = <&vdd_3v3_sys>; + }; + + vdd_hdmi_pll: regulator@9 { + compatible = "regulator-fixed"; + reg = <9>; + regulator-name = "+1.05V_RUN_AVDD_HDMI_PLL_AP_GATE"; + regulator-min-microvolt = <1050000>; + regulator-max-microvolt = <1050000>; + gpio = <&gpio TEGRA_GPIO(H, 7) GPIO_ACTIVE_LOW>; + vin-supply = <&vdd_1v05_run>; + }; + + vdd_5v0_hdmi: regulator@10 { + compatible = "regulator-fixed"; + reg = <10>; + regulator-name = "+5V_HDMI_CON"; + regulator-min-microvolt = <5000000>; + regulator-max-microvolt = <5000000>; + gpio = <&gpio TEGRA_GPIO(K, 6) GPIO_ACTIVE_HIGH>; + enable-active-high; + vin-supply = <&vdd_5v0_sys>; + }; + + vdd_5v0_ts: regulator@11 { + compatible = "regulator-fixed"; + reg = <11>; + regulator-name = "+5V_VDD_TS"; + regulator-min-microvolt = <5000000>; + regulator-max-microvolt = <5000000>; + regulator-always-on; + regulator-boot-on; + gpio = <&gpio TEGRA_GPIO(K, 1) GPIO_ACTIVE_HIGH>; + enable-active-high; + }; + }; +}; diff --git a/arch/arm64/boot/dts/nvidia/tegra132.dtsi b/arch/arm64/boot/dts/nvidia/tegra132.dtsi new file mode 100644 index 000000000000..e8bb46027bed --- /dev/null +++ b/arch/arm64/boot/dts/nvidia/tegra132.dtsi @@ -0,0 +1,990 @@ +#include <dt-bindings/clock/tegra124-car.h> +#include <dt-bindings/gpio/tegra-gpio.h> +#include <dt-bindings/memory/tegra124-mc.h> +#include <dt-bindings/pinctrl/pinctrl-tegra.h> +#include <dt-bindings/pinctrl/pinctrl-tegra-xusb.h> +#include <dt-bindings/interrupt-controller/arm-gic.h> + +/ { + compatible = "nvidia,tegra132", "nvidia,tegra124"; + interrupt-parent = <&lic>; + #address-cells = <2>; + #size-cells = <2>; + + pcie-controller@0,01003000 { + compatible = "nvidia,tegra124-pcie"; + device_type = "pci"; + reg = <0x0 0x01003000 0x0 0x00000800 /* PADS registers */ + 0x0 0x01003800 0x0 0x00000800 /* AFI registers */ + 0x0 0x02000000 0x0 0x10000000>; /* configuration space */ + reg-names = "pads", "afi", "cs"; + interrupts = <GIC_SPI 98 IRQ_TYPE_LEVEL_HIGH>, /* controller interrupt */ + <GIC_SPI 99 IRQ_TYPE_LEVEL_HIGH>; /* MSI interrupt */ + interrupt-names = "intr", "msi"; + + #interrupt-cells = <1>; + interrupt-map-mask = <0 0 0 0>; + interrupt-map = <0 0 0 0 &gic GIC_SPI 98 IRQ_TYPE_LEVEL_HIGH>; + + bus-range = <0x00 0xff>; + #address-cells = <3>; + #size-cells = <2>; + + ranges = <0x82000000 0 0x01000000 0x0 0x01000000 0 0x00001000 /* port 0 configuration space */ + 0x82000000 0 0x01001000 0x0 0x01001000 0 0x00001000 /* port 1 configuration space */ + 0x81000000 0 0x0 0x0 0x12000000 0 0x00010000 /* downstream I/O (64 KiB) */ + 0x82000000 0 0x13000000 0x0 0x13000000 0 0x0d000000 /* non-prefetchable memory 
(208 MiB) */ + 0xc2000000 0 0x20000000 0x0 0x20000000 0 0x20000000>; /* prefetchable memory (512 MiB) */ + + clocks = <&tegra_car TEGRA124_CLK_PCIE>, + <&tegra_car TEGRA124_CLK_AFI>, + <&tegra_car TEGRA124_CLK_PLL_E>, + <&tegra_car TEGRA124_CLK_CML0>; + clock-names = "pex", "afi", "pll_e", "cml"; + resets = <&tegra_car 70>, + <&tegra_car 72>, + <&tegra_car 74>; + reset-names = "pex", "afi", "pcie_x"; + status = "disabled"; + + phys = <&padctl TEGRA_XUSB_PADCTL_PCIE>; + phy-names = "pcie"; + + pci@1,0 { + device_type = "pci"; + assigned-addresses = <0x82000800 0 0x01000000 0 0x1000>; + reg = <0x000800 0 0 0 0>; + status = "disabled"; + + #address-cells = <3>; + #size-cells = <2>; + ranges; + + nvidia,num-lanes = <2>; + }; + + pci@2,0 { + device_type = "pci"; + assigned-addresses = <0x82001000 0 0x01001000 0 0x1000>; + reg = <0x001000 0 0 0 0>; + status = "disabled"; + + #address-cells = <3>; + #size-cells = <2>; + ranges; + + nvidia,num-lanes = <1>; + }; + }; + + host1x@0,50000000 { + compatible = "nvidia,tegra124-host1x", "simple-bus"; + reg = <0x0 0x50000000 0x0 0x00034000>; + interrupts = <GIC_SPI 65 IRQ_TYPE_LEVEL_HIGH>, /* syncpt */ + <GIC_SPI 67 IRQ_TYPE_LEVEL_HIGH>; /* general */ + clocks = <&tegra_car TEGRA124_CLK_HOST1X>; + clock-names = "host1x"; + resets = <&tegra_car 28>; + reset-names = "host1x"; + + #address-cells = <2>; + #size-cells = <2>; + + ranges = <0 0x54000000 0 0x54000000 0 0x01000000>; + + dc@0,54200000 { + compatible = "nvidia,tegra124-dc"; + reg = <0x0 0x54200000 0x0 0x00040000>; + interrupts = <GIC_SPI 73 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA124_CLK_DISP1>, + <&tegra_car TEGRA124_CLK_PLL_P>; + clock-names = "dc", "parent"; + resets = <&tegra_car 27>; + reset-names = "dc"; + + iommus = <&mc TEGRA_SWGROUP_DC>; + + nvidia,head = <0>; + }; + + dc@0,54240000 { + compatible = "nvidia,tegra124-dc"; + reg = <0x0 0x54240000 0x0 0x00040000>; + interrupts = <GIC_SPI 74 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA124_CLK_DISP2>, + <&tegra_car TEGRA124_CLK_PLL_P>; + clock-names = "dc", "parent"; + resets = <&tegra_car 26>; + reset-names = "dc"; + + iommus = <&mc TEGRA_SWGROUP_DCB>; + + nvidia,head = <1>; + }; + + hdmi@0,54280000 { + compatible = "nvidia,tegra124-hdmi"; + reg = <0x0 0x54280000 0x0 0x00040000>; + interrupts = <GIC_SPI 75 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA124_CLK_HDMI>, + <&tegra_car TEGRA124_CLK_PLL_D2_OUT0>; + clock-names = "hdmi", "parent"; + resets = <&tegra_car 51>; + reset-names = "hdmi"; + status = "disabled"; + }; + + sor@0,54540000 { + compatible = "nvidia,tegra124-sor"; + reg = <0x0 0x54540000 0x0 0x00040000>; + interrupts = <GIC_SPI 76 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA124_CLK_SOR0>, + <&tegra_car TEGRA124_CLK_PLL_D_OUT0>, + <&tegra_car TEGRA124_CLK_PLL_DP>, + <&tegra_car TEGRA124_CLK_CLK_M>; + clock-names = "sor", "parent", "dp", "safe"; + resets = <&tegra_car 182>; + reset-names = "sor"; + status = "disabled"; + }; + + dpaux: dpaux@0,545c0000 { + compatible = "nvidia,tegra124-dpaux"; + reg = <0x0 0x545c0000 0x0 0x00040000>; + interrupts = <GIC_SPI 159 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA124_CLK_DPAUX>, + <&tegra_car TEGRA124_CLK_PLL_DP>; + clock-names = "dpaux", "parent"; + resets = <&tegra_car 181>; + reset-names = "dpaux"; + status = "disabled"; + }; + }; + + gic: interrupt-controller@0,50041000 { + compatible = "arm,cortex-a15-gic"; + #interrupt-cells = <3>; + interrupt-controller; + reg = <0x0 0x50041000 0x0 0x1000>, + <0x0 0x50042000 0x0 0x2000>, + <0x0 0x50044000 0x0 0x2000>, + 
<0x0 0x50046000 0x0 0x2000>; + interrupts = <GIC_PPI 9 + (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_LEVEL_HIGH)>; + interrupt-parent = <&gic>; + }; + + gpu@0,57000000 { + compatible = "nvidia,gk20a"; + reg = <0x0 0x57000000 0x0 0x01000000>, + <0x0 0x58000000 0x0 0x01000000>; + interrupts = <GIC_SPI 157 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 158 IRQ_TYPE_LEVEL_HIGH>; + interrupt-names = "stall", "nonstall"; + clocks = <&tegra_car TEGRA124_CLK_GPU>, + <&tegra_car TEGRA124_CLK_PLL_P_OUT5>; + clock-names = "gpu", "pwr"; + resets = <&tegra_car 184>; + reset-names = "gpu"; + status = "disabled"; + }; + + lic: interrupt-controller@60004000 { + compatible = "nvidia,tegra124-ictlr", "nvidia,tegra30-ictlr"; + reg = <0x0 0x60004000 0x0 0x100>, + <0x0 0x60004100 0x0 0x100>, + <0x0 0x60004200 0x0 0x100>, + <0x0 0x60004300 0x0 0x100>, + <0x0 0x60004400 0x0 0x100>; + interrupt-controller; + #interrupt-cells = <3>; + interrupt-parent = <&gic>; + }; + + timer@0,60005000 { + compatible = "nvidia,tegra124-timer", "nvidia,tegra20-timer"; + reg = <0x0 0x60005000 0x0 0x400>; + interrupts = <GIC_SPI 0 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 1 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 41 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 42 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 121 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 122 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA124_CLK_TIMER>; + clock-names = "timer"; + }; + + tegra_car: clock@0,60006000 { + compatible = "nvidia,tegra132-car"; + reg = <0x0 0x60006000 0x0 0x1000>; + #clock-cells = <1>; + #reset-cells = <1>; + nvidia,external-memory-controller = <&emc>; + }; + + flow-controller@0,60007000 { + compatible = "nvidia,tegra124-flowctrl"; + reg = <0x0 0x60007000 0x0 0x1000>; + }; + + actmon@0,6000c800 { + compatible = "nvidia,tegra124-actmon"; + reg = <0x0 0x6000c800 0x0 0x400>; + interrupts = <GIC_SPI 45 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA124_CLK_ACTMON>, + <&tegra_car TEGRA124_CLK_EMC>; + clock-names = "actmon", "emc"; + resets = <&tegra_car 119>; + reset-names = "actmon"; + }; + + gpio: gpio@0,6000d000 { + compatible = "nvidia,tegra124-gpio", "nvidia,tegra30-gpio"; + reg = <0x0 0x6000d000 0x0 0x1000>; + interrupts = <GIC_SPI 32 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 33 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 34 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 35 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 55 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 87 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 89 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 125 IRQ_TYPE_LEVEL_HIGH>; + #gpio-cells = <2>; + gpio-controller; + #interrupt-cells = <2>; + interrupt-controller; + }; + + apbdma: dma@0,60020000 { + compatible = "nvidia,tegra124-apbdma", "nvidia,tegra148-apbdma"; + reg = <0x0 0x60020000 0x0 0x1400>; + interrupts = <GIC_SPI 104 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 105 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 106 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 107 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 108 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 109 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 110 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 111 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 112 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 113 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 114 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 115 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 116 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 117 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 118 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 119 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 128 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 129 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 130 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 131 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 132 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 133 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 134 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 135 
IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 136 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 137 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 138 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 139 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 140 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 141 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 142 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 143 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA124_CLK_APBDMA>; + clock-names = "dma"; + resets = <&tegra_car 34>; + reset-names = "dma"; + #dma-cells = <1>; + }; + + apbmisc@0,70000800 { + compatible = "nvidia,tegra124-apbmisc", "nvidia,tegra20-apbmisc"; + reg = <0x0 0x70000800 0x0 0x64>, /* Chip revision */ + <0x0 0x7000e864 0x0 0x04>; /* Strapping options */ + }; + + pinmux: pinmux@0,70000868 { + compatible = "nvidia,tegra124-pinmux"; + reg = <0x0 0x70000868 0x0 0x164>, /* Pad control registers */ + <0x0 0x70003000 0x0 0x434>, /* Mux registers */ + <0x0 0x70000820 0x0 0x008>; /* MIPI pad control */ + }; + + /* + * There are two serial drivers, i.e. the 8250 based simple serial + * driver and the APB DMA based serial driver for higher baudrate + * and performance. To enable the 8250 based driver, the compatible + * is "nvidia,tegra124-uart", "nvidia,tegra20-uart" and to enable + * the APB DMA based serial driver, the compatible is + * "nvidia,tegra124-hsuart", "nvidia,tegra30-hsuart". + */ + uarta: serial@0,70006000 { + compatible = "nvidia,tegra124-uart", "nvidia,tegra20-uart"; + reg = <0x0 0x70006000 0x0 0x40>; + reg-shift = <2>; + interrupts = <GIC_SPI 36 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA124_CLK_UARTA>; + clock-names = "serial"; + resets = <&tegra_car 6>; + reset-names = "serial"; + dmas = <&apbdma 8>, <&apbdma 8>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + uartb: serial@0,70006040 { + compatible = "nvidia,tegra124-uart", "nvidia,tegra20-uart"; + reg = <0x0 0x70006040 0x0 0x40>; + reg-shift = <2>; + interrupts = <GIC_SPI 37 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA124_CLK_UARTB>; + clock-names = "serial"; + resets = <&tegra_car 7>; + reset-names = "serial"; + dmas = <&apbdma 9>, <&apbdma 9>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + uartc: serial@0,70006200 { + compatible = "nvidia,tegra124-uart", "nvidia,tegra20-uart"; + reg = <0x0 0x70006200 0x0 0x40>; + reg-shift = <2>; + interrupts = <GIC_SPI 46 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA124_CLK_UARTC>; + clock-names = "serial"; + resets = <&tegra_car 55>; + reset-names = "serial"; + dmas = <&apbdma 10>, <&apbdma 10>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + uartd: serial@0,70006300 { + compatible = "nvidia,tegra124-uart", "nvidia,tegra20-uart"; + reg = <0x0 0x70006300 0x0 0x40>; + reg-shift = <2>; + interrupts = <GIC_SPI 90 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA124_CLK_UARTD>; + clock-names = "serial"; + resets = <&tegra_car 65>; + reset-names = "serial"; + dmas = <&apbdma 19>, <&apbdma 19>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + pwm: pwm@0,7000a000 { + compatible = "nvidia,tegra124-pwm", "nvidia,tegra20-pwm"; + reg = <0x0 0x7000a000 0x0 0x100>; + #pwm-cells = <2>; + clocks = <&tegra_car TEGRA124_CLK_PWM>; + clock-names = "pwm"; + resets = <&tegra_car 17>; + reset-names = "pwm"; + status = "disabled"; + }; + + i2c@0,7000c000 { + compatible = "nvidia,tegra124-i2c", "nvidia,tegra114-i2c"; + reg = <0x0 0x7000c000 0x0 0x100>; + interrupts = <GIC_SPI 38 IRQ_TYPE_LEVEL_HIGH>; + #address-cells = <1>; + #size-cells = <0>; + clocks = <&tegra_car TEGRA124_CLK_I2C1>; + clock-names = "div-clk"; + resets = <&tegra_car 12>; + reset-names = "i2c"; +
dmas = <&apbdma 21>, <&apbdma 21>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + i2c@0,7000c400 { + compatible = "nvidia,tegra124-i2c", "nvidia,tegra114-i2c"; + reg = <0x0 0x7000c400 0x0 0x100>; + interrupts = <GIC_SPI 84 IRQ_TYPE_LEVEL_HIGH>; + #address-cells = <1>; + #size-cells = <0>; + clocks = <&tegra_car TEGRA124_CLK_I2C2>; + clock-names = "div-clk"; + resets = <&tegra_car 54>; + reset-names = "i2c"; + dmas = <&apbdma 22>, <&apbdma 22>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + i2c@0,7000c500 { + compatible = "nvidia,tegra124-i2c", "nvidia,tegra114-i2c"; + reg = <0x0 0x7000c500 0x0 0x100>; + interrupts = <GIC_SPI 92 IRQ_TYPE_LEVEL_HIGH>; + #address-cells = <1>; + #size-cells = <0>; + clocks = <&tegra_car TEGRA124_CLK_I2C3>; + clock-names = "div-clk"; + resets = <&tegra_car 67>; + reset-names = "i2c"; + dmas = <&apbdma 23>, <&apbdma 23>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + i2c@0,7000c700 { + compatible = "nvidia,tegra124-i2c", "nvidia,tegra114-i2c"; + reg = <0x0 0x7000c700 0x0 0x100>; + interrupts = <GIC_SPI 120 IRQ_TYPE_LEVEL_HIGH>; + #address-cells = <1>; + #size-cells = <0>; + clocks = <&tegra_car TEGRA124_CLK_I2C4>; + clock-names = "div-clk"; + resets = <&tegra_car 103>; + reset-names = "i2c"; + dmas = <&apbdma 26>, <&apbdma 26>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + i2c@0,7000d000 { + compatible = "nvidia,tegra124-i2c", "nvidia,tegra114-i2c"; + reg = <0x0 0x7000d000 0x0 0x100>; + interrupts = <GIC_SPI 53 IRQ_TYPE_LEVEL_HIGH>; + #address-cells = <1>; + #size-cells = <0>; + clocks = <&tegra_car TEGRA124_CLK_I2C5>; + clock-names = "div-clk"; + resets = <&tegra_car 47>; + reset-names = "i2c"; + dmas = <&apbdma 24>, <&apbdma 24>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + i2c@0,7000d100 { + compatible = "nvidia,tegra124-i2c", "nvidia,tegra114-i2c"; + reg = <0x0 0x7000d100 0x0 0x100>; + interrupts = <GIC_SPI 63 IRQ_TYPE_LEVEL_HIGH>; + #address-cells = <1>; + #size-cells = <0>; + clocks = <&tegra_car TEGRA124_CLK_I2C6>; + clock-names = "div-clk"; + resets = <&tegra_car 166>; + reset-names = "i2c"; + dmas = <&apbdma 30>, <&apbdma 30>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + spi@0,7000d400 { + compatible = "nvidia,tegra124-spi", "nvidia,tegra114-spi"; + reg = <0x0 0x7000d400 0x0 0x200>; + interrupts = <GIC_SPI 59 IRQ_TYPE_LEVEL_HIGH>; + #address-cells = <1>; + #size-cells = <0>; + clocks = <&tegra_car TEGRA124_CLK_SBC1>; + clock-names = "spi"; + resets = <&tegra_car 41>; + reset-names = "spi"; + dmas = <&apbdma 15>, <&apbdma 15>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + spi@0,7000d600 { + compatible = "nvidia,tegra124-spi", "nvidia,tegra114-spi"; + reg = <0x0 0x7000d600 0x0 0x200>; + interrupts = <GIC_SPI 82 IRQ_TYPE_LEVEL_HIGH>; + #address-cells = <1>; + #size-cells = <0>; + clocks = <&tegra_car TEGRA124_CLK_SBC2>; + clock-names = "spi"; + resets = <&tegra_car 44>; + reset-names = "spi"; + dmas = <&apbdma 16>, <&apbdma 16>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + spi@0,7000d800 { + compatible = "nvidia,tegra124-spi", "nvidia,tegra114-spi"; + reg = <0x0 0x7000d800 0x0 0x200>; + interrupts = <GIC_SPI 83 IRQ_TYPE_LEVEL_HIGH>; + #address-cells = <1>; + #size-cells = <0>; + clocks = <&tegra_car TEGRA124_CLK_SBC3>; + clock-names = "spi"; + resets = <&tegra_car 46>; + reset-names = "spi"; + dmas = <&apbdma 17>, <&apbdma 17>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + spi@0,7000da00 { + compatible = "nvidia,tegra124-spi", "nvidia,tegra114-spi"; 
+ reg = <0x0 0x7000da00 0x0 0x200>; + interrupts = <GIC_SPI 93 IRQ_TYPE_LEVEL_HIGH>; + #address-cells = <1>; + #size-cells = <0>; + clocks = <&tegra_car TEGRA124_CLK_SBC4>; + clock-names = "spi"; + resets = <&tegra_car 68>; + reset-names = "spi"; + dmas = <&apbdma 18>, <&apbdma 18>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + spi@0,7000dc00 { + compatible = "nvidia,tegra124-spi", "nvidia,tegra114-spi"; + reg = <0x0 0x7000dc00 0x0 0x200>; + interrupts = <GIC_SPI 94 IRQ_TYPE_LEVEL_HIGH>; + #address-cells = <1>; + #size-cells = <0>; + clocks = <&tegra_car TEGRA124_CLK_SBC5>; + clock-names = "spi"; + resets = <&tegra_car 104>; + reset-names = "spi"; + dmas = <&apbdma 27>, <&apbdma 27>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + spi@0,7000de00 { + compatible = "nvidia,tegra124-spi", "nvidia,tegra114-spi"; + reg = <0x0 0x7000de00 0x0 0x200>; + interrupts = <GIC_SPI 79 IRQ_TYPE_LEVEL_HIGH>; + #address-cells = <1>; + #size-cells = <0>; + clocks = <&tegra_car TEGRA124_CLK_SBC6>; + clock-names = "spi"; + resets = <&tegra_car 105>; + reset-names = "spi"; + dmas = <&apbdma 28>, <&apbdma 28>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + rtc@0,7000e000 { + compatible = "nvidia,tegra124-rtc", "nvidia,tegra20-rtc"; + reg = <0x0 0x7000e000 0x0 0x100>; + interrupts = <GIC_SPI 2 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA124_CLK_RTC>; + clock-names = "rtc"; + }; + + pmc@0,7000e400 { + compatible = "nvidia,tegra124-pmc"; + reg = <0x0 0x7000e400 0x0 0x400>; + clocks = <&tegra_car TEGRA124_CLK_PCLK>, <&clk32k_in>; + clock-names = "pclk", "clk32k_in"; + }; + + fuse@0,7000f800 { + compatible = "nvidia,tegra124-efuse"; + reg = <0x0 0x7000f800 0x0 0x400>; + clocks = <&tegra_car TEGRA124_CLK_FUSE>; + clock-names = "fuse"; + resets = <&tegra_car 39>; + reset-names = "fuse"; + }; + + mc: memory-controller@0,70019000 { + compatible = "nvidia,tegra132-mc"; + reg = <0x0 0x70019000 0x0 0x1000>; + clocks = <&tegra_car TEGRA124_CLK_MC>; + clock-names = "mc"; + + interrupts = <GIC_SPI 77 IRQ_TYPE_LEVEL_HIGH>; + + #iommu-cells = <1>; + }; + + emc: emc@0,7001b000 { + compatible = "nvidia,tegra132-emc", "nvidia,tegra124-emc"; + reg = <0x0 0x7001b000 0x0 0x1000>; + + nvidia,memory-controller = <&mc>; + }; + + sata@0,70020000 { + compatible = "nvidia,tegra124-ahci"; + reg = <0x0 0x70027000 0x0 0x2000>, /* AHCI */ + <0x0 0x70020000 0x0 0x7000>; /* SATA */ + interrupts = <GIC_SPI 23 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA124_CLK_SATA>, + <&tegra_car TEGRA124_CLK_SATA_OOB>, + <&tegra_car TEGRA124_CLK_CML1>, + <&tegra_car TEGRA124_CLK_PLL_E>; + clock-names = "sata", "sata-oob", "cml1", "pll_e"; + resets = <&tegra_car 124>, + <&tegra_car 123>, + <&tegra_car 129>; + reset-names = "sata", "sata-oob", "sata-cold"; + phys = <&padctl TEGRA_XUSB_PADCTL_SATA>; + phy-names = "sata-phy"; + status = "disabled"; + }; + + hda@0,70030000 { + compatible = "nvidia,tegra132-hda", "nvidia,tegra124-hda", + "nvidia,tegra30-hda"; + reg = <0x0 0x70030000 0x0 0x10000>; + interrupts = <GIC_SPI 81 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA124_CLK_HDA>, + <&tegra_car TEGRA124_CLK_HDA2HDMI>, + <&tegra_car TEGRA124_CLK_HDA2CODEC_2X>; + clock-names = "hda", "hda2hdmi", "hda2codec_2x"; + resets = <&tegra_car 125>, /* hda */ + <&tegra_car 128>, /* hda2hdmi */ + <&tegra_car 111>; /* hda2codec_2x */ + reset-names = "hda", "hda2hdmi", "hda2codec_2x"; + status = "disabled"; + }; + + padctl: padctl@0,7009f000 { + compatible = "nvidia,tegra132-xusb-padctl", + "nvidia,tegra124-xusb-padctl"; + reg = <0x0 
0x7009f000 0x0 0x1000>; + resets = <&tegra_car 142>; + reset-names = "padctl"; + + #phy-cells = <1>; + + phys { + pcie-0 { + status = "disabled"; + }; + + sata-0 { + status = "disabled"; + }; + + usb3-0 { + status = "disabled"; + }; + + usb3-1 { + status = "disabled"; + }; + + utmi-0 { + status = "disabled"; + }; + + utmi-1 { + status = "disabled"; + }; + + utmi-2 { + status = "disabled"; + }; + }; + }; + + sdhci@0,700b0000 { + compatible = "nvidia,tegra124-sdhci"; + reg = <0x0 0x700b0000 0x0 0x200>; + interrupts = <GIC_SPI 14 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA124_CLK_SDMMC1>; + clock-names = "sdhci"; + resets = <&tegra_car 14>; + reset-names = "sdhci"; + status = "disabled"; + }; + + sdhci@0,700b0200 { + compatible = "nvidia,tegra124-sdhci"; + reg = <0x0 0x700b0200 0x0 0x200>; + interrupts = <GIC_SPI 15 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA124_CLK_SDMMC2>; + clock-names = "sdhci"; + resets = <&tegra_car 9>; + reset-names = "sdhci"; + status = "disabled"; + }; + + sdhci@0,700b0400 { + compatible = "nvidia,tegra124-sdhci"; + reg = <0x0 0x700b0400 0x0 0x200>; + interrupts = <GIC_SPI 19 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA124_CLK_SDMMC3>; + clock-names = "sdhci"; + resets = <&tegra_car 69>; + reset-names = "sdhci"; + status = "disabled"; + }; + + sdhci@0,700b0600 { + compatible = "nvidia,tegra124-sdhci"; + reg = <0x0 0x700b0600 0x0 0x200>; + interrupts = <GIC_SPI 31 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA124_CLK_SDMMC4>; + clock-names = "sdhci"; + resets = <&tegra_car 15>; + reset-names = "sdhci"; + status = "disabled"; + }; + + soctherm: thermal-sensor@0,700e2000 { + compatible = "nvidia,tegra124-soctherm"; + reg = <0x0 0x700e2000 0x0 0x1000>; + interrupts = <GIC_SPI 48 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA124_CLK_TSENSOR>, + <&tegra_car TEGRA124_CLK_SOC_THERM>; + clock-names = "tsensor", "soctherm"; + resets = <&tegra_car 78>; + reset-names = "soctherm"; + #thermal-sensor-cells = <1>; + }; + + ahub@0,70300000 { + compatible = "nvidia,tegra124-ahub"; + reg = <0x0 0x70300000 0x0 0x200>, + <0x0 0x70300800 0x0 0x800>, + <0x0 0x70300200 0x0 0x600>; + interrupts = <GIC_SPI 103 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA124_CLK_D_AUDIO>, + <&tegra_car TEGRA124_CLK_APBIF>; + clock-names = "d_audio", "apbif"; + resets = <&tegra_car 106>, /* d_audio */ + <&tegra_car 107>, /* apbif */ + <&tegra_car 30>, /* i2s0 */ + <&tegra_car 11>, /* i2s1 */ + <&tegra_car 18>, /* i2s2 */ + <&tegra_car 101>, /* i2s3 */ + <&tegra_car 102>, /* i2s4 */ + <&tegra_car 108>, /* dam0 */ + <&tegra_car 109>, /* dam1 */ + <&tegra_car 110>, /* dam2 */ + <&tegra_car 10>, /* spdif */ + <&tegra_car 153>, /* amx */ + <&tegra_car 185>, /* amx1 */ + <&tegra_car 154>, /* adx */ + <&tegra_car 180>, /* adx1 */ + <&tegra_car 186>, /* afc0 */ + <&tegra_car 187>, /* afc1 */ + <&tegra_car 188>, /* afc2 */ + <&tegra_car 189>, /* afc3 */ + <&tegra_car 190>, /* afc4 */ + <&tegra_car 191>; /* afc5 */ + reset-names = "d_audio", "apbif", "i2s0", "i2s1", "i2s2", + "i2s3", "i2s4", "dam0", "dam1", "dam2", + "spdif", "amx", "amx1", "adx", "adx1", + "afc0", "afc1", "afc2", "afc3", "afc4", "afc5"; + dmas = <&apbdma 1>, <&apbdma 1>, + <&apbdma 2>, <&apbdma 2>, + <&apbdma 3>, <&apbdma 3>, + <&apbdma 4>, <&apbdma 4>, + <&apbdma 6>, <&apbdma 6>, + <&apbdma 7>, <&apbdma 7>, + <&apbdma 12>, <&apbdma 12>, + <&apbdma 13>, <&apbdma 13>, + <&apbdma 14>, <&apbdma 14>, + <&apbdma 29>, <&apbdma 29>; + dma-names = "rx0", "tx0", "rx1", "tx1", "rx2", "tx2", + "rx3", "tx3", "rx4", "tx4", "rx5", 
"tx5", + "rx6", "tx6", "rx7", "tx7", "rx8", "tx8", + "rx9", "tx9"; + ranges; + #address-cells = <2>; + #size-cells = <2>; + + tegra_i2s0: i2s@0,70301000 { + compatible = "nvidia,tegra124-i2s"; + reg = <0x0 0x70301000 0x0 0x100>; + nvidia,ahub-cif-ids = <4 4>; + clocks = <&tegra_car TEGRA124_CLK_I2S0>; + clock-names = "i2s"; + resets = <&tegra_car 30>; + reset-names = "i2s"; + status = "disabled"; + }; + + tegra_i2s1: i2s@0,70301100 { + compatible = "nvidia,tegra124-i2s"; + reg = <0x0 0x70301100 0x0 0x100>; + nvidia,ahub-cif-ids = <5 5>; + clocks = <&tegra_car TEGRA124_CLK_I2S1>; + clock-names = "i2s"; + resets = <&tegra_car 11>; + reset-names = "i2s"; + status = "disabled"; + }; + + tegra_i2s2: i2s@0,70301200 { + compatible = "nvidia,tegra124-i2s"; + reg = <0x0 0x70301200 0x0 0x100>; + nvidia,ahub-cif-ids = <6 6>; + clocks = <&tegra_car TEGRA124_CLK_I2S2>; + clock-names = "i2s"; + resets = <&tegra_car 18>; + reset-names = "i2s"; + status = "disabled"; + }; + + tegra_i2s3: i2s@0,70301300 { + compatible = "nvidia,tegra124-i2s"; + reg = <0x0 0x70301300 0x0 0x100>; + nvidia,ahub-cif-ids = <7 7>; + clocks = <&tegra_car TEGRA124_CLK_I2S3>; + clock-names = "i2s"; + resets = <&tegra_car 101>; + reset-names = "i2s"; + status = "disabled"; + }; + + tegra_i2s4: i2s@0,70301400 { + compatible = "nvidia,tegra124-i2s"; + reg = <0x0 0x70301400 0x0 0x100>; + nvidia,ahub-cif-ids = <8 8>; + clocks = <&tegra_car TEGRA124_CLK_I2S4>; + clock-names = "i2s"; + resets = <&tegra_car 102>; + reset-names = "i2s"; + status = "disabled"; + }; + }; + + usb@0,7d000000 { + compatible = "nvidia,tegra124-ehci", "nvidia,tegra30-ehci", "usb-ehci"; + reg = <0x0 0x7d000000 0x0 0x4000>; + interrupts = <GIC_SPI 20 IRQ_TYPE_LEVEL_HIGH>; + phy_type = "utmi"; + clocks = <&tegra_car TEGRA124_CLK_USBD>; + clock-names = "usb"; + resets = <&tegra_car 22>; + reset-names = "usb"; + nvidia,phy = <&phy1>; + status = "disabled"; + }; + + phy1: usb-phy@0,7d000000 { + compatible = "nvidia,tegra124-usb-phy", "nvidia,tegra30-usb-phy"; + reg = <0x0 0x7d000000 0x0 0x4000>, + <0x0 0x7d000000 0x0 0x4000>; + phy_type = "utmi"; + clocks = <&tegra_car TEGRA124_CLK_USBD>, + <&tegra_car TEGRA124_CLK_PLL_U>, + <&tegra_car TEGRA124_CLK_USBD>; + clock-names = "reg", "pll_u", "utmi-pads"; + resets = <&tegra_car 22>, <&tegra_car 22>; + reset-names = "usb", "utmi-pads"; + nvidia,hssync-start-delay = <0>; + nvidia,idle-wait-delay = <17>; + nvidia,elastic-limit = <16>; + nvidia,term-range-adj = <6>; + nvidia,xcvr-setup = <9>; + nvidia,xcvr-lsfslew = <0>; + nvidia,xcvr-lsrslew = <3>; + nvidia,hssquelch-level = <2>; + nvidia,hsdiscon-level = <5>; + nvidia,xcvr-hsslew = <12>; + nvidia,has-utmi-pad-registers; + status = "disabled"; + }; + + usb@0,7d004000 { + compatible = "nvidia,tegra124-ehci", "nvidia,tegra30-ehci", "usb-ehci"; + reg = <0x0 0x7d004000 0x0 0x4000>; + interrupts = <GIC_SPI 21 IRQ_TYPE_LEVEL_HIGH>; + phy_type = "utmi"; + clocks = <&tegra_car TEGRA124_CLK_USB2>; + clock-names = "usb"; + resets = <&tegra_car 58>; + reset-names = "usb"; + nvidia,phy = <&phy2>; + status = "disabled"; + }; + + phy2: usb-phy@0,7d004000 { + compatible = "nvidia,tegra124-usb-phy", "nvidia,tegra30-usb-phy"; + reg = <0x0 0x7d004000 0x0 0x4000>, + <0x0 0x7d000000 0x0 0x4000>; + phy_type = "utmi"; + clocks = <&tegra_car TEGRA124_CLK_USB2>, + <&tegra_car TEGRA124_CLK_PLL_U>, + <&tegra_car TEGRA124_CLK_USBD>; + clock-names = "reg", "pll_u", "utmi-pads"; + resets = <&tegra_car 58>, <&tegra_car 22>; + reset-names = "usb", "utmi-pads"; + nvidia,hssync-start-delay = <0>; + 
nvidia,idle-wait-delay = <17>; + nvidia,elastic-limit = <16>; + nvidia,term-range-adj = <6>; + nvidia,xcvr-setup = <9>; + nvidia,xcvr-lsfslew = <0>; + nvidia,xcvr-lsrslew = <3>; + nvidia,hssquelch-level = <2>; + nvidia,hsdiscon-level = <5>; + nvidia,xcvr-hsslew = <12>; + status = "disabled"; + }; + + usb@0,7d008000 { + compatible = "nvidia,tegra124-ehci", "nvidia,tegra30-ehci", "usb-ehci"; + reg = <0x0 0x7d008000 0x0 0x4000>; + interrupts = <GIC_SPI 97 IRQ_TYPE_LEVEL_HIGH>; + phy_type = "utmi"; + clocks = <&tegra_car TEGRA124_CLK_USB3>; + clock-names = "usb"; + resets = <&tegra_car 59>; + reset-names = "usb"; + nvidia,phy = <&phy3>; + status = "disabled"; + }; + + phy3: usb-phy@0,7d008000 { + compatible = "nvidia,tegra124-usb-phy", "nvidia,tegra30-usb-phy"; + reg = <0x0 0x7d008000 0x0 0x4000>, + <0x0 0x7d000000 0x0 0x4000>; + phy_type = "utmi"; + clocks = <&tegra_car TEGRA124_CLK_USB3>, + <&tegra_car TEGRA124_CLK_PLL_U>, + <&tegra_car TEGRA124_CLK_USBD>; + clock-names = "reg", "pll_u", "utmi-pads"; + resets = <&tegra_car 59>, <&tegra_car 22>; + reset-names = "usb", "utmi-pads"; + nvidia,hssync-start-delay = <0>; + nvidia,idle-wait-delay = <17>; + nvidia,elastic-limit = <16>; + nvidia,term-range-adj = <6>; + nvidia,xcvr-setup = <9>; + nvidia,xcvr-lsfslew = <0>; + nvidia,xcvr-lsrslew = <3>; + nvidia,hssquelch-level = <2>; + nvidia,hsdiscon-level = <5>; + nvidia,xcvr-hsslew = <12>; + status = "disabled"; + }; + + cpus { + #address-cells = <1>; + #size-cells = <0>; + + cpu@0 { + device_type = "cpu"; + compatible = "nvidia,denver", "arm,armv8"; + reg = <0>; + }; + + cpu@1 { + device_type = "cpu"; + compatible = "nvidia,denver", "arm,armv8"; + reg = <1>; + }; + }; + + timer { + compatible = "arm,armv7-timer"; + interrupts = <GIC_PPI 13 + (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_LEVEL_LOW)>, + <GIC_PPI 14 + (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_LEVEL_LOW)>, + <GIC_PPI 11 + (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_LEVEL_LOW)>, + <GIC_PPI 10 + (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_LEVEL_LOW)>; + interrupt-parent = <&gic>; + }; +}; diff --git a/arch/arm64/boot/dts/nvidia/tegra210-p2180.dtsi b/arch/arm64/boot/dts/nvidia/tegra210-p2180.dtsi new file mode 100644 index 000000000000..2b7f88950d1e --- /dev/null +++ b/arch/arm64/boot/dts/nvidia/tegra210-p2180.dtsi @@ -0,0 +1,45 @@ +#include "tegra210.dtsi" + +/ { + model = "NVIDIA Jetson TX1"; + compatible = "nvidia,p2180", "nvidia,tegra210"; + + aliases { + rtc1 = "/rtc@0,7000e000"; + serial0 = &uarta; + }; + + memory { + device_type = "memory"; + reg = <0x0 0x80000000 0x1 0x0>; + }; + + /* debug port */ + serial@0,70006000 { + status = "okay"; + }; + + pmc@0,7000e400 { + nvidia,invert-interrupt; + }; + + /* eMMC */ + sdhci@0,700b0600 { + status = "okay"; + bus-width = <8>; + non-removable; + }; + + clocks { + compatible = "simple-bus"; + #address-cells = <1>; + #size-cells = <0>; + + clk32k_in: clock@0 { + compatible = "fixed-clock"; + reg = <0>; + #clock-cells = <0>; + clock-frequency = <32768>; + }; + }; +}; diff --git a/arch/arm64/boot/dts/nvidia/tegra210-p2371-0000.dts b/arch/arm64/boot/dts/nvidia/tegra210-p2371-0000.dts new file mode 100644 index 000000000000..1ddd8512e100 --- /dev/null +++ b/arch/arm64/boot/dts/nvidia/tegra210-p2371-0000.dts @@ -0,0 +1,9 @@ +/dts-v1/; + +#include "tegra210-p2530.dtsi" +#include "tegra210-p2595.dtsi" + +/ { + model = "NVIDIA Tegra210 P2371 (P2530/P2595) reference design"; + compatible = "nvidia,p2371-0000", "nvidia,tegra210"; +}; diff --git a/arch/arm64/boot/dts/nvidia/tegra210-p2371-2180.dts 
b/arch/arm64/boot/dts/nvidia/tegra210-p2371-2180.dts new file mode 100644 index 000000000000..683b339a980c --- /dev/null +++ b/arch/arm64/boot/dts/nvidia/tegra210-p2371-2180.dts @@ -0,0 +1,9 @@ +/dts-v1/; + +#include "tegra210-p2180.dtsi" +#include "tegra210-p2597.dtsi" + +/ { + model = "NVIDIA Jetson TX1 Developer Kit"; + compatible = "nvidia,p2371-2180", "nvidia,tegra210"; +}; diff --git a/arch/arm64/boot/dts/nvidia/tegra210-p2530.dtsi b/arch/arm64/boot/dts/nvidia/tegra210-p2530.dtsi new file mode 100644 index 000000000000..ece0dec61fae --- /dev/null +++ b/arch/arm64/boot/dts/nvidia/tegra210-p2530.dtsi @@ -0,0 +1,50 @@ +#include "tegra210.dtsi" + +/ { + model = "NVIDIA Tegra210 P2530 main board"; + compatible = "nvidia,p2530", "nvidia,tegra210"; + + aliases { + rtc1 = "/rtc@0,7000e000"; + serial0 = &uarta; + }; + + memory { + device_type = "memory"; + reg = <0x0 0x80000000 0x0 0xc0000000>; + }; + + /* debug port */ + serial@0,70006000 { + status = "okay"; + }; + + i2c@0,7000d000 { + status = "okay"; + clock-frequency = <400000>; + }; + + pmc@0,7000e400 { + nvidia,invert-interrupt; + }; + + /* eMMC */ + sdhci@0,700b0600 { + status = "okay"; + bus-width = <8>; + non-removable; + }; + + clocks { + compatible = "simple-bus"; + #address-cells = <1>; + #size-cells = <0>; + + clk32k_in: clock@0 { + compatible = "fixed-clock"; + reg = <0>; + #clock-cells = <0>; + clock-frequency = <32768>; + }; + }; +}; diff --git a/arch/arm64/boot/dts/nvidia/tegra210-p2571.dts b/arch/arm64/boot/dts/nvidia/tegra210-p2571.dts new file mode 100644 index 000000000000..58d27ddd57ff --- /dev/null +++ b/arch/arm64/boot/dts/nvidia/tegra210-p2571.dts @@ -0,0 +1,1302 @@ +/dts-v1/; + +#include <dt-bindings/input/input.h> +#include "tegra210-p2530.dtsi" + +/ { + model = "NVIDIA Tegra210 P2571 reference design"; + compatible = "nvidia,p2571", "nvidia,tegra210"; + + pinmux: pinmux@0,700008d4 { + pinctrl-names = "boot"; + pinctrl-0 = <&state_boot>; + + state_boot: pinmux { + pex_l0_rst_n_pa0 { + nvidia,pins = "pex_l0_rst_n_pa0"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + pex_l0_clkreq_n_pa1 { + nvidia,pins = "pex_l0_clkreq_n_pa1"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + pex_wake_n_pa2 { + nvidia,pins = "pex_wake_n_pa2"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + pex_l1_rst_n_pa3 { + nvidia,pins = "pex_l1_rst_n_pa3"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + pex_l1_clkreq_n_pa4 { + nvidia,pins = "pex_l1_clkreq_n_pa4"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + sata_led_active_pa5 { + nvidia,pins = "sata_led_active_pa5"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = 
<TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pa6 { + nvidia,pins = "pa6"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap1_fs_pb0 { + nvidia,pins = "dap1_fs_pb0"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap1_din_pb1 { + nvidia,pins = "dap1_din_pb1"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap1_dout_pb2 { + nvidia,pins = "dap1_dout_pb2"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap1_sclk_pb3 { + nvidia,pins = "dap1_sclk_pb3"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi2_mosi_pb4 { + nvidia,pins = "spi2_mosi_pb4"; + nvidia,function = "rsvd2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi2_miso_pb5 { + nvidia,pins = "spi2_miso_pb5"; + nvidia,function = "rsvd2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi2_sck_pb6 { + nvidia,pins = "spi2_sck_pb6"; + nvidia,function = "rsvd2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi2_cs0_pb7 { + nvidia,pins = "spi2_cs0_pb7"; + nvidia,function = "rsvd2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi1_mosi_pc0 { + nvidia,pins = "spi1_mosi_pc0"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi1_miso_pc1 { + nvidia,pins = "spi1_miso_pc1"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi1_sck_pc2 { + nvidia,pins = "spi1_sck_pc2"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi1_cs0_pc3 { + nvidia,pins = "spi1_cs0_pc3"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi1_cs1_pc4 { + nvidia,pins = "spi1_cs1_pc4"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = 
<TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi4_sck_pc5 { + nvidia,pins = "spi4_sck_pc5"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi4_cs0_pc6 { + nvidia,pins = "spi4_cs0_pc6"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi4_mosi_pc7 { + nvidia,pins = "spi4_mosi_pc7"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi4_miso_pd0 { + nvidia,pins = "spi4_miso_pd0"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart3_tx_pd1 { + nvidia,pins = "uart3_tx_pd1"; + nvidia,function = "rsvd2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart3_rx_pd2 { + nvidia,pins = "uart3_rx_pd2"; + nvidia,function = "rsvd2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart3_rts_pd3 { + nvidia,pins = "uart3_rts_pd3"; + nvidia,function = "rsvd2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart3_cts_pd4 { + nvidia,pins = "uart3_cts_pd4"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dmic1_clk_pe0 { + nvidia,pins = "dmic1_clk_pe0"; + nvidia,function = "i2s3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dmic1_dat_pe1 { + nvidia,pins = "dmic1_dat_pe1"; + nvidia,function = "i2s3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dmic2_clk_pe2 { + nvidia,pins = "dmic2_clk_pe2"; + nvidia,function = "i2s3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dmic2_dat_pe3 { + nvidia,pins = "dmic2_dat_pe3"; + nvidia,function = "i2s3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dmic3_clk_pe4 { + nvidia,pins = "dmic3_clk_pe4"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dmic3_dat_pe5 { + nvidia,pins = "dmic3_dat_pe5"; + nvidia,function = "rsvd2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pe6 { + nvidia,pins = 
"pe6"; + nvidia,function = "rsvd0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pe7 { + nvidia,pins = "pe7"; + nvidia,function = "pwm3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + gen3_i2c_scl_pf0 { + nvidia,pins = "gen3_i2c_scl_pf0"; + nvidia,function = "i2c3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + gen3_i2c_sda_pf1 { + nvidia,pins = "gen3_i2c_sda_pf1"; + nvidia,function = "i2c3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + uart2_tx_pg0 { + nvidia,pins = "uart2_tx_pg0"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart2_rx_pg1 { + nvidia,pins = "uart2_rx_pg1"; + nvidia,function = "uartb"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart2_rts_pg2 { + nvidia,pins = "uart2_rts_pg2"; + nvidia,function = "rsvd2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart2_cts_pg3 { + nvidia,pins = "uart2_cts_pg3"; + nvidia,function = "rsvd2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + wifi_en_ph0 { + nvidia,pins = "wifi_en_ph0"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + wifi_rst_ph1 { + nvidia,pins = "wifi_rst_ph1"; + nvidia,function = "rsvd0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + wifi_wake_ap_ph2 { + nvidia,pins = "wifi_wake_ap_ph2"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + ap_wake_bt_ph3 { + nvidia,pins = "ap_wake_bt_ph3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + bt_rst_ph4 { + nvidia,pins = "bt_rst_ph4"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + bt_wake_ap_ph5 { + nvidia,pins = "bt_wake_ap_ph5"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + ph6 { + nvidia,pins = "ph6"; + nvidia,function = "rsvd0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + 
nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + ap_wake_nfc_ph7 { + nvidia,pins = "ap_wake_nfc_ph7"; + nvidia,function = "rsvd0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + nfc_en_pi0 { + nvidia,pins = "nfc_en_pi0"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + nfc_int_pi1 { + nvidia,pins = "nfc_int_pi1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + gps_en_pi2 { + nvidia,pins = "gps_en_pi2"; + nvidia,function = "rsvd0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + gps_rst_pi3 { + nvidia,pins = "gps_rst_pi3"; + nvidia,function = "rsvd0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart4_tx_pi4 { + nvidia,pins = "uart4_tx_pi4"; + nvidia,function = "uartd"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart4_rx_pi5 { + nvidia,pins = "uart4_rx_pi5"; + nvidia,function = "uartd"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart4_rts_pi6 { + nvidia,pins = "uart4_rts_pi6"; + nvidia,function = "uartd"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart4_cts_pi7 { + nvidia,pins = "uart4_cts_pi7"; + nvidia,function = "uartd"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + gen1_i2c_sda_pj0 { + nvidia,pins = "gen1_i2c_sda_pj0"; + nvidia,function = "i2c1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + gen1_i2c_scl_pj1 { + nvidia,pins = "gen1_i2c_scl_pj1"; + nvidia,function = "i2c1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + gen2_i2c_scl_pj2 { + nvidia,pins = "gen2_i2c_scl_pj2"; + nvidia,function = "i2c2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_ENABLE>; + }; + gen2_i2c_sda_pj3 { + nvidia,pins = "gen2_i2c_sda_pj3"; + nvidia,function = "i2c2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_ENABLE>; + }; + dap4_fs_pj4 { + nvidia,pins = "dap4_fs_pj4"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = 
<TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap4_din_pj5 { + nvidia,pins = "dap4_din_pj5"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap4_dout_pj6 { + nvidia,pins = "dap4_dout_pj6"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap4_sclk_pj7 { + nvidia,pins = "dap4_sclk_pj7"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pk0 { + nvidia,pins = "pk0"; + nvidia,function = "rsvd2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pk1 { + nvidia,pins = "pk1"; + nvidia,function = "rsvd2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pk2 { + nvidia,pins = "pk2"; + nvidia,function = "rsvd2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pk3 { + nvidia,pins = "pk3"; + nvidia,function = "rsvd2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pk4 { + nvidia,pins = "pk4"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pk5 { + nvidia,pins = "pk5"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pk6 { + nvidia,pins = "pk6"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pk7 { + nvidia,pins = "pk7"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pl0 { + nvidia,pins = "pl0"; + nvidia,function = "rsvd0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pl1 { + nvidia,pins = "pl1"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc1_clk_pm0 { + nvidia,pins = "sdmmc1_clk_pm0"; + nvidia,function = "sdmmc1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc1_cmd_pm1 { + nvidia,pins = "sdmmc1_cmd_pm1"; + nvidia,function = "sdmmc1"; + nvidia,pull = 
<TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc1_dat3_pm2 { + nvidia,pins = "sdmmc1_dat3_pm2"; + nvidia,function = "sdmmc1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc1_dat2_pm3 { + nvidia,pins = "sdmmc1_dat2_pm3"; + nvidia,function = "sdmmc1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc1_dat1_pm4 { + nvidia,pins = "sdmmc1_dat1_pm4"; + nvidia,function = "sdmmc1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc1_dat0_pm5 { + nvidia,pins = "sdmmc1_dat0_pm5"; + nvidia,function = "sdmmc1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc3_clk_pp0 { + nvidia,pins = "sdmmc3_clk_pp0"; + nvidia,function = "sdmmc3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc3_cmd_pp1 { + nvidia,pins = "sdmmc3_cmd_pp1"; + nvidia,function = "sdmmc3"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc3_dat3_pp2 { + nvidia,pins = "sdmmc3_dat3_pp2"; + nvidia,function = "sdmmc3"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc3_dat2_pp3 { + nvidia,pins = "sdmmc3_dat2_pp3"; + nvidia,function = "sdmmc3"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc3_dat1_pp4 { + nvidia,pins = "sdmmc3_dat1_pp4"; + nvidia,function = "sdmmc3"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc3_dat0_pp5 { + nvidia,pins = "sdmmc3_dat0_pp5"; + nvidia,function = "sdmmc3"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cam1_mclk_ps0 { + nvidia,pins = "cam1_mclk_ps0"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cam2_mclk_ps1 { + nvidia,pins = "cam2_mclk_ps1"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cam_i2c_scl_ps2 { + nvidia,pins = "cam_i2c_scl_ps2"; + nvidia,function = "i2cvi"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + cam_i2c_sda_ps3 { + nvidia,pins = "cam_i2c_sda_ps3"; 
+ nvidia,function = "i2cvi"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + cam_rst_ps4 { + nvidia,pins = "cam_rst_ps4"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cam_af_en_ps5 { + nvidia,pins = "cam_af_en_ps5"; + nvidia,function = "rsvd2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cam_flash_en_ps6 { + nvidia,pins = "cam_flash_en_ps6"; + nvidia,function = "rsvd2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cam1_pwdn_ps7 { + nvidia,pins = "cam1_pwdn_ps7"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cam2_pwdn_pt0 { + nvidia,pins = "cam2_pwdn_pt0"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cam1_strobe_pt1 { + nvidia,pins = "cam1_strobe_pt1"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart1_tx_pu0 { + nvidia,pins = "uart1_tx_pu0"; + nvidia,function = "uarta"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart1_rx_pu1 { + nvidia,pins = "uart1_rx_pu1"; + nvidia,function = "uarta"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart1_rts_pu2 { + nvidia,pins = "uart1_rts_pu2"; + nvidia,function = "uarta"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart1_cts_pu3 { + nvidia,pins = "uart1_cts_pu3"; + nvidia,function = "uarta"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + lcd_bl_pwm_pv0 { + nvidia,pins = "lcd_bl_pwm_pv0"; + nvidia,function = "pwm0"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + lcd_bl_en_pv1 { + nvidia,pins = "lcd_bl_en_pv1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + lcd_rst_pv2 { + nvidia,pins = "lcd_rst_pv2"; + nvidia,function = "rsvd0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + lcd_gpio1_pv3 { + nvidia,pins = "lcd_gpio1_pv3"; + nvidia,function 
= "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + lcd_gpio2_pv4 { + nvidia,pins = "lcd_gpio2_pv4"; + nvidia,function = "pwm1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + ap_ready_pv5 { + nvidia,pins = "ap_ready_pv5"; + nvidia,function = "rsvd0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + touch_rst_pv6 { + nvidia,pins = "touch_rst_pv6"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + touch_clk_pv7 { + nvidia,pins = "touch_clk_pv7"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + modem_wake_ap_px0 { + nvidia,pins = "modem_wake_ap_px0"; + nvidia,function = "rsvd0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + touch_int_px1 { + nvidia,pins = "touch_int_px1"; + nvidia,function = "rsvd0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + motion_int_px2 { + nvidia,pins = "motion_int_px2"; + nvidia,function = "rsvd0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + als_prox_int_px3 { + nvidia,pins = "als_prox_int_px3"; + nvidia,function = "rsvd0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + temp_alert_px4 { + nvidia,pins = "temp_alert_px4"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + button_power_on_px5 { + nvidia,pins = "button_power_on_px5"; + nvidia,function = "rsvd0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + button_vol_up_px6 { + nvidia,pins = "button_vol_up_px6"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + button_vol_down_px7 { + nvidia,pins = "button_vol_down_px7"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + button_slide_sw_py0 { + nvidia,pins = "button_slide_sw_py0"; + nvidia,function = "rsvd0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + button_home_py1 { + nvidia,pins = "button_home_py1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = 
<TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + lcd_te_py2 { + nvidia,pins = "lcd_te_py2"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pwr_i2c_scl_py3 { + nvidia,pins = "pwr_i2c_scl_py3"; + nvidia,function = "i2cpmu"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + pwr_i2c_sda_py4 { + nvidia,pins = "pwr_i2c_sda_py4"; + nvidia,function = "i2cpmu"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + clk_32k_out_py5 { + nvidia,pins = "clk_32k_out_py5"; + nvidia,function = "soc"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pz0 { + nvidia,pins = "pz0"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pz1 { + nvidia,pins = "pz1"; + nvidia,function = "sdmmc1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pz2 { + nvidia,pins = "pz2"; + nvidia,function = "rsvd2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pz3 { + nvidia,pins = "pz3"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pz4 { + nvidia,pins = "pz4"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pz5 { + nvidia,pins = "pz5"; + nvidia,function = "soc"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap2_fs_paa0 { + nvidia,pins = "dap2_fs_paa0"; + nvidia,function = "i2s2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap2_sclk_paa1 { + nvidia,pins = "dap2_sclk_paa1"; + nvidia,function = "i2s2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap2_din_paa2 { + nvidia,pins = "dap2_din_paa2"; + nvidia,function = "i2s2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap2_dout_paa3 { + nvidia,pins = "dap2_dout_paa3"; + nvidia,function = "i2s2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + aud_mclk_pbb0 { + nvidia,pins = "aud_mclk_pbb0"; + 
nvidia,function = "aud"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dvfs_pwm_pbb1 { + nvidia,pins = "dvfs_pwm_pbb1"; + nvidia,function = "cldvfs"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dvfs_clk_pbb2 { + nvidia,pins = "dvfs_clk_pbb2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + gpio_x1_aud_pbb3 { + nvidia,pins = "gpio_x1_aud_pbb3"; + nvidia,function = "rsvd0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + gpio_x3_aud_pbb4 { + nvidia,pins = "gpio_x3_aud_pbb4"; + nvidia,function = "rsvd0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + hdmi_cec_pcc0 { + nvidia,pins = "hdmi_cec_pcc0"; + nvidia,function = "cec"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_ENABLE>; + }; + hdmi_int_dp_hpd_pcc1 { + nvidia,pins = "hdmi_int_dp_hpd_pcc1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + spdif_out_pcc2 { + nvidia,pins = "spdif_out_pcc2"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spdif_in_pcc3 { + nvidia,pins = "spdif_in_pcc3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + usb_vbus_en0_pcc4 { + nvidia,pins = "usb_vbus_en0_pcc4"; + nvidia,function = "usb"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_ENABLE>; + }; + usb_vbus_en1_pcc5 { + nvidia,pins = "usb_vbus_en1_pcc5"; + nvidia,function = "usb"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_ENABLE>; + }; + dp_hpd0_pcc6 { + nvidia,pins = "dp_hpd0_pcc6"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pcc7 { + nvidia,pins = "pcc7"; + nvidia,function = "rsvd0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + spi2_cs1_pdd0 { + nvidia,pins = "spi2_cs1_pdd0"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + 
nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + qspi_sck_pee0 { + nvidia,pins = "qspi_sck_pee0"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + qspi_cs_n_pee1 { + nvidia,pins = "qspi_cs_n_pee1"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + qspi_io0_pee2 { + nvidia,pins = "qspi_io0_pee2"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + qspi_io1_pee3 { + nvidia,pins = "qspi_io1_pee3"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + qspi_io2_pee4 { + nvidia,pins = "qspi_io2_pee4"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + qspi_io3_pee5 { + nvidia,pins = "qspi_io3_pee5"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + core_pwr_req { + nvidia,pins = "core_pwr_req"; + nvidia,function = "core"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cpu_pwr_req { + nvidia,pins = "cpu_pwr_req"; + nvidia,function = "cpu"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pwr_int_n { + nvidia,pins = "pwr_int_n"; + nvidia,function = "pmi"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + clk_32k_in { + nvidia,pins = "clk_32k_in"; + nvidia,function = "clk"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + jtag_rtck { + nvidia,pins = "jtag_rtck"; + nvidia,function = "jtag"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + clk_req { + nvidia,pins = "clk_req"; + nvidia,function = "sys"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + shutdown { + nvidia,pins = "shutdown"; + nvidia,function = "shutdown"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + }; + }; +}; diff --git a/arch/arm64/boot/dts/nvidia/tegra210-p2595.dtsi b/arch/arm64/boot/dts/nvidia/tegra210-p2595.dtsi new file mode 100644 index 000000000000..f3f91392214e --- /dev/null +++ b/arch/arm64/boot/dts/nvidia/tegra210-p2595.dtsi @@ -0,0 +1,1272 @@ +/ { + model = 
"NVIDIA Tegra210 P2595 I/O board"; + compatible = "nvidia,p2595", "nvidia,tegra210"; + + pinmux: pinmux@0,700008d4 { + pinctrl-names = "boot"; + pinctrl-0 = <&state_boot>; + + state_boot: pinmux { + pex_l0_rst_n_pa0 { + nvidia,pins = "pex_l0_rst_n_pa0"; + nvidia,function = "pe0"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_ENABLE>; + }; + pex_l0_clkreq_n_pa1 { + nvidia,pins = "pex_l0_clkreq_n_pa1"; + nvidia,function = "pe0"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_ENABLE>; + }; + pex_wake_n_pa2 { + nvidia,pins = "pex_wake_n_pa2"; + nvidia,function = "pe"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_ENABLE>; + }; + pex_l1_rst_n_pa3 { + nvidia,pins = "pex_l1_rst_n_pa3"; + nvidia,function = "pe1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_ENABLE>; + }; + pex_l1_clkreq_n_pa4 { + nvidia,pins = "pex_l1_clkreq_n_pa4"; + nvidia,function = "pe1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_ENABLE>; + }; + sata_led_active_pa5 { + nvidia,pins = "sata_led_active_pa5"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pa6 { + nvidia,pins = "pa6"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap1_fs_pb0 { + nvidia,pins = "dap1_fs_pb0"; + nvidia,function = "i2s1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap1_din_pb1 { + nvidia,pins = "dap1_din_pb1"; + nvidia,function = "i2s1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap1_dout_pb2 { + nvidia,pins = "dap1_dout_pb2"; + nvidia,function = "i2s1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap1_sclk_pb3 { + nvidia,pins = "dap1_sclk_pb3"; + nvidia,function = "i2s1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi2_mosi_pb4 { + nvidia,pins = "spi2_mosi_pb4"; + nvidia,function = "rsvd2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi2_miso_pb5 { + nvidia,pins = "spi2_miso_pb5"; + nvidia,function = "rsvd2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + 
nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi2_sck_pb6 { + nvidia,pins = "spi2_sck_pb6"; + nvidia,function = "rsvd2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi2_cs0_pb7 { + nvidia,pins = "spi2_cs0_pb7"; + nvidia,function = "rsvd2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi1_mosi_pc0 { + nvidia,pins = "spi1_mosi_pc0"; + nvidia,function = "spi1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi1_miso_pc1 { + nvidia,pins = "spi1_miso_pc1"; + nvidia,function = "spi1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi1_sck_pc2 { + nvidia,pins = "spi1_sck_pc2"; + nvidia,function = "spi1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi1_cs0_pc3 { + nvidia,pins = "spi1_cs0_pc3"; + nvidia,function = "spi1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi1_cs1_pc4 { + nvidia,pins = "spi1_cs1_pc4"; + nvidia,function = "spi1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi4_sck_pc5 { + nvidia,pins = "spi4_sck_pc5"; + nvidia,function = "spi4"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi4_cs0_pc6 { + nvidia,pins = "spi4_cs0_pc6"; + nvidia,function = "spi4"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi4_mosi_pc7 { + nvidia,pins = "spi4_mosi_pc7"; + nvidia,function = "spi4"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi4_miso_pd0 { + nvidia,pins = "spi4_miso_pd0"; + nvidia,function = "spi4"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart3_tx_pd1 { + nvidia,pins = "uart3_tx_pd1"; + nvidia,function = "uartc"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart3_rx_pd2 { + nvidia,pins = "uart3_rx_pd2"; + nvidia,function = "uartc"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart3_rts_pd3 { + nvidia,pins = "uart3_rts_pd3"; + nvidia,function = "uartc"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + 
nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart3_cts_pd4 { + nvidia,pins = "uart3_cts_pd4"; + nvidia,function = "uartc"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dmic1_clk_pe0 { + nvidia,pins = "dmic1_clk_pe0"; + nvidia,function = "dmic1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dmic1_dat_pe1 { + nvidia,pins = "dmic1_dat_pe1"; + nvidia,function = "dmic1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dmic2_clk_pe2 { + nvidia,pins = "dmic2_clk_pe2"; + nvidia,function = "dmic2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dmic2_dat_pe3 { + nvidia,pins = "dmic2_dat_pe3"; + nvidia,function = "dmic2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dmic3_clk_pe4 { + nvidia,pins = "dmic3_clk_pe4"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dmic3_dat_pe5 { + nvidia,pins = "dmic3_dat_pe5"; + nvidia,function = "rsvd2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pe6 { + nvidia,pins = "pe6"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pe7 { + nvidia,pins = "pe7"; + nvidia,function = "pwm3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + gen3_i2c_scl_pf0 { + nvidia,pins = "gen3_i2c_scl_pf0"; + nvidia,function = "i2c3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + gen3_i2c_sda_pf1 { + nvidia,pins = "gen3_i2c_sda_pf1"; + nvidia,function = "i2c3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + uart2_tx_pg0 { + nvidia,pins = "uart2_tx_pg0"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart2_rx_pg1 { + nvidia,pins = "uart2_rx_pg1"; + nvidia,function = "uartb"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart2_rts_pg2 { + nvidia,pins = "uart2_rts_pg2"; + nvidia,function = "rsvd2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart2_cts_pg3 { + 
nvidia,pins = "uart2_cts_pg3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + wifi_en_ph0 { + nvidia,pins = "wifi_en_ph0"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + wifi_rst_ph1 { + nvidia,pins = "wifi_rst_ph1"; + nvidia,function = "rsvd0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + wifi_wake_ap_ph2 { + nvidia,pins = "wifi_wake_ap_ph2"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + ap_wake_bt_ph3 { + nvidia,pins = "ap_wake_bt_ph3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + bt_rst_ph4 { + nvidia,pins = "bt_rst_ph4"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + bt_wake_ap_ph5 { + nvidia,pins = "bt_wake_ap_ph5"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + ph6 { + nvidia,pins = "ph6"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + ap_wake_nfc_ph7 { + nvidia,pins = "ap_wake_nfc_ph7"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + nfc_en_pi0 { + nvidia,pins = "nfc_en_pi0"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + nfc_int_pi1 { + nvidia,pins = "nfc_int_pi1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + gps_en_pi2 { + nvidia,pins = "gps_en_pi2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + gps_rst_pi3 { + nvidia,pins = "gps_rst_pi3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart4_tx_pi4 { + nvidia,pins = "uart4_tx_pi4"; + nvidia,function = "uartd"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart4_rx_pi5 { + nvidia,pins = "uart4_rx_pi5"; + nvidia,function = "uartd"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart4_rts_pi6 { + nvidia,pins = "uart4_rts_pi6"; + nvidia,function = "uartd"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + 
nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart4_cts_pi7 { + nvidia,pins = "uart4_cts_pi7"; + nvidia,function = "uartd"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + gen1_i2c_sda_pj0 { + nvidia,pins = "gen1_i2c_sda_pj0"; + nvidia,function = "i2c1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + gen1_i2c_scl_pj1 { + nvidia,pins = "gen1_i2c_scl_pj1"; + nvidia,function = "i2c1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + gen2_i2c_scl_pj2 { + nvidia,pins = "gen2_i2c_scl_pj2"; + nvidia,function = "i2c2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_ENABLE>; + }; + gen2_i2c_sda_pj3 { + nvidia,pins = "gen2_i2c_sda_pj3"; + nvidia,function = "i2c2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_ENABLE>; + }; + dap4_fs_pj4 { + nvidia,pins = "dap4_fs_pj4"; + nvidia,function = "i2s4b"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap4_din_pj5 { + nvidia,pins = "dap4_din_pj5"; + nvidia,function = "i2s4b"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap4_dout_pj6 { + nvidia,pins = "dap4_dout_pj6"; + nvidia,function = "i2s4b"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap4_sclk_pj7 { + nvidia,pins = "dap4_sclk_pj7"; + nvidia,function = "i2s4b"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pk0 { + nvidia,pins = "pk0"; + nvidia,function = "i2s5b"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pk1 { + nvidia,pins = "pk1"; + nvidia,function = "i2s5b"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pk2 { + nvidia,pins = "pk2"; + nvidia,function = "i2s5b"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pk3 { + nvidia,pins = "pk3"; + nvidia,function = "i2s5b"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pk4 { + nvidia,pins = "pk4"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; 
+ nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pk5 { + nvidia,pins = "pk5"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pk6 { + nvidia,pins = "pk6"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pk7 { + nvidia,pins = "pk7"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pl0 { + nvidia,pins = "pl0"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pl1 { + nvidia,pins = "pl1"; + nvidia,function = "soc"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc1_clk_pm0 { + nvidia,pins = "sdmmc1_clk_pm0"; + nvidia,function = "sdmmc1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc1_cmd_pm1 { + nvidia,pins = "sdmmc1_cmd_pm1"; + nvidia,function = "sdmmc1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc1_dat3_pm2 { + nvidia,pins = "sdmmc1_dat3_pm2"; + nvidia,function = "sdmmc1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc1_dat2_pm3 { + nvidia,pins = "sdmmc1_dat2_pm3"; + nvidia,function = "sdmmc1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc1_dat1_pm4 { + nvidia,pins = "sdmmc1_dat1_pm4"; + nvidia,function = "sdmmc1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc1_dat0_pm5 { + nvidia,pins = "sdmmc1_dat0_pm5"; + nvidia,function = "sdmmc1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc3_clk_pp0 { + nvidia,pins = "sdmmc3_clk_pp0"; + nvidia,function = "sdmmc3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc3_cmd_pp1 { + nvidia,pins = "sdmmc3_cmd_pp1"; + nvidia,function = "sdmmc3"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc3_dat3_pp2 { + nvidia,pins = "sdmmc3_dat3_pp2"; + nvidia,function = "sdmmc3"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc3_dat2_pp3 { + nvidia,pins = "sdmmc3_dat2_pp3"; + nvidia,function = "sdmmc3"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + 
nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc3_dat1_pp4 { + nvidia,pins = "sdmmc3_dat1_pp4"; + nvidia,function = "sdmmc3"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc3_dat0_pp5 { + nvidia,pins = "sdmmc3_dat0_pp5"; + nvidia,function = "sdmmc3"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cam1_mclk_ps0 { + nvidia,pins = "cam1_mclk_ps0"; + nvidia,function = "extperiph3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cam2_mclk_ps1 { + nvidia,pins = "cam2_mclk_ps1"; + nvidia,function = "extperiph3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cam_i2c_scl_ps2 { + nvidia,pins = "cam_i2c_scl_ps2"; + nvidia,function = "i2cvi"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + cam_i2c_sda_ps3 { + nvidia,pins = "cam_i2c_sda_ps3"; + nvidia,function = "i2cvi"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + cam_rst_ps4 { + nvidia,pins = "cam_rst_ps4"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cam_af_en_ps5 { + nvidia,pins = "cam_af_en_ps5"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cam_flash_en_ps6 { + nvidia,pins = "cam_flash_en_ps6"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cam1_pwdn_ps7 { + nvidia,pins = "cam1_pwdn_ps7"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cam2_pwdn_pt0 { + nvidia,pins = "cam2_pwdn_pt0"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cam1_strobe_pt1 { + nvidia,pins = "cam1_strobe_pt1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart1_tx_pu0 { + nvidia,pins = "uart1_tx_pu0"; + nvidia,function = "uarta"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart1_rx_pu1 { + nvidia,pins = "uart1_rx_pu1"; + nvidia,function = "uarta"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + 
nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart1_rts_pu2 { + nvidia,pins = "uart1_rts_pu2"; + nvidia,function = "uarta"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart1_cts_pu3 { + nvidia,pins = "uart1_cts_pu3"; + nvidia,function = "uarta"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + lcd_bl_pwm_pv0 { + nvidia,pins = "lcd_bl_pwm_pv0"; + nvidia,function = "pwm0"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + lcd_bl_en_pv1 { + nvidia,pins = "lcd_bl_en_pv1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + lcd_rst_pv2 { + nvidia,pins = "lcd_rst_pv2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + lcd_gpio1_pv3 { + nvidia,pins = "lcd_gpio1_pv3"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + lcd_gpio2_pv4 { + nvidia,pins = "lcd_gpio2_pv4"; + nvidia,function = "pwm1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + ap_ready_pv5 { + nvidia,pins = "ap_ready_pv5"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + touch_rst_pv6 { + nvidia,pins = "touch_rst_pv6"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + touch_clk_pv7 { + nvidia,pins = "touch_clk_pv7"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + modem_wake_ap_px0 { + nvidia,pins = "modem_wake_ap_px0"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + touch_int_px1 { + nvidia,pins = "touch_int_px1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + motion_int_px2 { + nvidia,pins = "motion_int_px2"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + als_prox_int_px3 { + nvidia,pins = "als_prox_int_px3"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + temp_alert_px4 { + nvidia,pins = "temp_alert_px4"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + 
button_power_on_px5 { + nvidia,pins = "button_power_on_px5"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + button_vol_up_px6 { + nvidia,pins = "button_vol_up_px6"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + button_vol_down_px7 { + nvidia,pins = "button_vol_down_px7"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + button_slide_sw_py0 { + nvidia,pins = "button_slide_sw_py0"; + nvidia,function = "rsvd0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + button_home_py1 { + nvidia,pins = "button_home_py1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + lcd_te_py2 { + nvidia,pins = "lcd_te_py2"; + nvidia,function = "displaya"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pwr_i2c_scl_py3 { + nvidia,pins = "pwr_i2c_scl_py3"; + nvidia,function = "i2cpmu"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + pwr_i2c_sda_py4 { + nvidia,pins = "pwr_i2c_sda_py4"; + nvidia,function = "i2cpmu"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + clk_32k_out_py5 { + nvidia,pins = "clk_32k_out_py5"; + nvidia,function = "soc"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pz0 { + nvidia,pins = "pz0"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pz1 { + nvidia,pins = "pz1"; + nvidia,function = "sdmmc1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pz2 { + nvidia,pins = "pz2"; + nvidia,function = "rsvd2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pz3 { + nvidia,pins = "pz3"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pz4 { + nvidia,pins = "pz4"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pz5 { + nvidia,pins = "pz5"; + nvidia,function = "soc"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + 
nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap2_fs_paa0 { + nvidia,pins = "dap2_fs_paa0"; + nvidia,function = "i2s2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap2_sclk_paa1 { + nvidia,pins = "dap2_sclk_paa1"; + nvidia,function = "i2s2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap2_din_paa2 { + nvidia,pins = "dap2_din_paa2"; + nvidia,function = "i2s2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap2_dout_paa3 { + nvidia,pins = "dap2_dout_paa3"; + nvidia,function = "i2s2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + aud_mclk_pbb0 { + nvidia,pins = "aud_mclk_pbb0"; + nvidia,function = "aud"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dvfs_pwm_pbb1 { + nvidia,pins = "dvfs_pwm_pbb1"; + nvidia,function = "cldvfs"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dvfs_clk_pbb2 { + nvidia,pins = "dvfs_clk_pbb2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + gpio_x1_aud_pbb3 { + nvidia,pins = "gpio_x1_aud_pbb3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + gpio_x3_aud_pbb4 { + nvidia,pins = "gpio_x3_aud_pbb4"; + nvidia,function = "rsvd0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + hdmi_cec_pcc0 { + nvidia,pins = "hdmi_cec_pcc0"; + nvidia,function = "cec"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_ENABLE>; + }; + hdmi_int_dp_hpd_pcc1 { + nvidia,pins = "hdmi_int_dp_hpd_pcc1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + spdif_out_pcc2 { + nvidia,pins = "spdif_out_pcc2"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spdif_in_pcc3 { + nvidia,pins = "spdif_in_pcc3"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + usb_vbus_en0_pcc4 { + nvidia,pins = "usb_vbus_en0_pcc4"; + nvidia,function = "usb"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + 
nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_ENABLE>; + }; + usb_vbus_en1_pcc5 { + nvidia,pins = "usb_vbus_en1_pcc5"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + dp_hpd0_pcc6 { + nvidia,pins = "dp_hpd0_pcc6"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pcc7 { + nvidia,pins = "pcc7"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + spi2_cs1_pdd0 { + nvidia,pins = "spi2_cs1_pdd0"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + qspi_sck_pee0 { + nvidia,pins = "qspi_sck_pee0"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + qspi_cs_n_pee1 { + nvidia,pins = "qspi_cs_n_pee1"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + qspi_io0_pee2 { + nvidia,pins = "qspi_io0_pee2"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + qspi_io1_pee3 { + nvidia,pins = "qspi_io1_pee3"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + qspi_io2_pee4 { + nvidia,pins = "qspi_io2_pee4"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + qspi_io3_pee5 { + nvidia,pins = "qspi_io3_pee5"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + core_pwr_req { + nvidia,pins = "core_pwr_req"; + nvidia,function = "core"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cpu_pwr_req { + nvidia,pins = "cpu_pwr_req"; + nvidia,function = "cpu"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pwr_int_n { + nvidia,pins = "pwr_int_n"; + nvidia,function = "pmi"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + clk_32k_in { + nvidia,pins = "clk_32k_in"; + nvidia,function = "clk"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + 
nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + jtag_rtck { + nvidia,pins = "jtag_rtck"; + nvidia,function = "jtag"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + clk_req { + nvidia,pins = "clk_req"; + nvidia,function = "sys"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + shutdown { + nvidia,pins = "shutdown"; + nvidia,function = "shutdown"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + }; + }; +}; diff --git a/arch/arm64/boot/dts/nvidia/tegra210-p2597.dtsi b/arch/arm64/boot/dts/nvidia/tegra210-p2597.dtsi new file mode 100644 index 000000000000..be3eccbe8013 --- /dev/null +++ b/arch/arm64/boot/dts/nvidia/tegra210-p2597.dtsi @@ -0,0 +1,1270 @@ +/ { + model = "NVIDIA Tegra210 P2597 I/O board"; + compatible = "nvidia,p2597", "nvidia,tegra210"; + + pinmux: pinmux@0,700008d4 { + pinctrl-names = "boot"; + pinctrl-0 = <&state_boot>; + + state_boot: pinmux { + pex_l0_rst_n_pa0 { + nvidia,pins = "pex_l0_rst_n_pa0"; + nvidia,function = "pe0"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_ENABLE>; + }; + pex_l0_clkreq_n_pa1 { + nvidia,pins = "pex_l0_clkreq_n_pa1"; + nvidia,function = "pe0"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_ENABLE>; + }; + pex_wake_n_pa2 { + nvidia,pins = "pex_wake_n_pa2"; + nvidia,function = "pe"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_ENABLE>; + }; + pex_l1_rst_n_pa3 { + nvidia,pins = "pex_l1_rst_n_pa3"; + nvidia,function = "pe1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_ENABLE>; + }; + pex_l1_clkreq_n_pa4 { + nvidia,pins = "pex_l1_clkreq_n_pa4"; + nvidia,function = "pe1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_ENABLE>; + }; + sata_led_active_pa5 { + nvidia,pins = "sata_led_active_pa5"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pa6 { + nvidia,pins = "pa6"; + nvidia,function = "sata"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap1_fs_pb0 { + nvidia,pins = "dap1_fs_pb0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap1_din_pb1 { + nvidia,pins = "dap1_din_pb1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + 
nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap1_dout_pb2 { + nvidia,pins = "dap1_dout_pb2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap1_sclk_pb3 { + nvidia,pins = "dap1_sclk_pb3"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi2_mosi_pb4 { + nvidia,pins = "spi2_mosi_pb4"; + nvidia,function = "spi2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi2_miso_pb5 { + nvidia,pins = "spi2_miso_pb5"; + nvidia,function = "spi2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi2_sck_pb6 { + nvidia,pins = "spi2_sck_pb6"; + nvidia,function = "spi2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi2_cs0_pb7 { + nvidia,pins = "spi2_cs0_pb7"; + nvidia,function = "spi2"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi1_mosi_pc0 { + nvidia,pins = "spi1_mosi_pc0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi1_miso_pc1 { + nvidia,pins = "spi1_miso_pc1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi1_sck_pc2 { + nvidia,pins = "spi1_sck_pc2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi1_cs0_pc3 { + nvidia,pins = "spi1_cs0_pc3"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi1_cs1_pc4 { + nvidia,pins = "spi1_cs1_pc4"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi4_sck_pc5 { + nvidia,pins = "spi4_sck_pc5"; + nvidia,function = "spi4"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi4_cs0_pc6 { + nvidia,pins = "spi4_cs0_pc6"; + nvidia,function = "spi4"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi4_mosi_pc7 { + nvidia,pins = "spi4_mosi_pc7"; + nvidia,function = "spi4"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi4_miso_pd0 { + nvidia,pins = "spi4_miso_pd0"; + nvidia,function = "spi4"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + 
nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart3_tx_pd1 { + nvidia,pins = "uart3_tx_pd1"; + nvidia,function = "uartc"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart3_rx_pd2 { + nvidia,pins = "uart3_rx_pd2"; + nvidia,function = "uartc"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart3_rts_pd3 { + nvidia,pins = "uart3_rts_pd3"; + nvidia,function = "uartc"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart3_cts_pd4 { + nvidia,pins = "uart3_cts_pd4"; + nvidia,function = "uartc"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dmic1_clk_pe0 { + nvidia,pins = "dmic1_clk_pe0"; + nvidia,function = "i2s3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dmic1_dat_pe1 { + nvidia,pins = "dmic1_dat_pe1"; + nvidia,function = "i2s3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dmic2_clk_pe2 { + nvidia,pins = "dmic2_clk_pe2"; + nvidia,function = "i2s3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dmic2_dat_pe3 { + nvidia,pins = "dmic2_dat_pe3"; + nvidia,function = "i2s3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dmic3_clk_pe4 { + nvidia,pins = "dmic3_clk_pe4"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dmic3_dat_pe5 { + nvidia,pins = "dmic3_dat_pe5"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pe6 { + nvidia,pins = "pe6"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pe7 { + nvidia,pins = "pe7"; + nvidia,function = "pwm3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + gen3_i2c_scl_pf0 { + nvidia,pins = "gen3_i2c_scl_pf0"; + nvidia,function = "i2c3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + gen3_i2c_sda_pf1 { + nvidia,pins = "gen3_i2c_sda_pf1"; + nvidia,function = "i2c3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = 
<TEGRA_PIN_DISABLE>; + }; + uart2_tx_pg0 { + nvidia,pins = "uart2_tx_pg0"; + nvidia,function = "uartb"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart2_rx_pg1 { + nvidia,pins = "uart2_rx_pg1"; + nvidia,function = "uartb"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart2_rts_pg2 { + nvidia,pins = "uart2_rts_pg2"; + nvidia,function = "uartb"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart2_cts_pg3 { + nvidia,pins = "uart2_cts_pg3"; + nvidia,function = "uartb"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + wifi_en_ph0 { + nvidia,pins = "wifi_en_ph0"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + wifi_rst_ph1 { + nvidia,pins = "wifi_rst_ph1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + wifi_wake_ap_ph2 { + nvidia,pins = "wifi_wake_ap_ph2"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + ap_wake_bt_ph3 { + nvidia,pins = "ap_wake_bt_ph3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + bt_rst_ph4 { + nvidia,pins = "bt_rst_ph4"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + bt_wake_ap_ph5 { + nvidia,pins = "bt_wake_ap_ph5"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + ph6 { + nvidia,pins = "ph6"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + ap_wake_nfc_ph7 { + nvidia,pins = "ap_wake_nfc_ph7"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + nfc_en_pi0 { + nvidia,pins = "nfc_en_pi0"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + nfc_int_pi1 { + nvidia,pins = "nfc_int_pi1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + gps_en_pi2 { + nvidia,pins = "gps_en_pi2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + gps_rst_pi3 { + nvidia,pins = "gps_rst_pi3"; + nvidia,function = "rsvd0"; + nvidia,pull = 
<TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart4_tx_pi4 { + nvidia,pins = "uart4_tx_pi4"; + nvidia,function = "uartd"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart4_rx_pi5 { + nvidia,pins = "uart4_rx_pi5"; + nvidia,function = "uartd"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart4_rts_pi6 { + nvidia,pins = "uart4_rts_pi6"; + nvidia,function = "uartd"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart4_cts_pi7 { + nvidia,pins = "uart4_cts_pi7"; + nvidia,function = "uartd"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + gen1_i2c_sda_pj0 { + nvidia,pins = "gen1_i2c_sda_pj0"; + nvidia,function = "i2c1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + gen1_i2c_scl_pj1 { + nvidia,pins = "gen1_i2c_scl_pj1"; + nvidia,function = "i2c1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + gen2_i2c_scl_pj2 { + nvidia,pins = "gen2_i2c_scl_pj2"; + nvidia,function = "i2c2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_ENABLE>; + }; + gen2_i2c_sda_pj3 { + nvidia,pins = "gen2_i2c_sda_pj3"; + nvidia,function = "i2c2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_ENABLE>; + }; + dap4_fs_pj4 { + nvidia,pins = "dap4_fs_pj4"; + nvidia,function = "i2s4b"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap4_din_pj5 { + nvidia,pins = "dap4_din_pj5"; + nvidia,function = "i2s4b"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap4_dout_pj6 { + nvidia,pins = "dap4_dout_pj6"; + nvidia,function = "i2s4b"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap4_sclk_pj7 { + nvidia,pins = "dap4_sclk_pj7"; + nvidia,function = "i2s4b"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pk0 { + nvidia,pins = "pk0"; + nvidia,function = "i2s5b"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = 
<TEGRA_PIN_DISABLE>; + }; + pk1 { + nvidia,pins = "pk1"; + nvidia,function = "i2s5b"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pk2 { + nvidia,pins = "pk2"; + nvidia,function = "i2s5b"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pk3 { + nvidia,pins = "pk3"; + nvidia,function = "i2s5b"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pk4 { + nvidia,pins = "pk4"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pk5 { + nvidia,pins = "pk5"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pk6 { + nvidia,pins = "pk6"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pk7 { + nvidia,pins = "pk7"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pl0 { + nvidia,pins = "pl0"; + nvidia,function = "rsvd0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pl1 { + nvidia,pins = "pl1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc1_clk_pm0 { + nvidia,pins = "sdmmc1_clk_pm0"; + nvidia,function = "sdmmc1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc1_cmd_pm1 { + nvidia,pins = "sdmmc1_cmd_pm1"; + nvidia,function = "sdmmc1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc1_dat3_pm2 { + nvidia,pins = "sdmmc1_dat3_pm2"; + nvidia,function = "sdmmc1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc1_dat2_pm3 { + nvidia,pins = "sdmmc1_dat2_pm3"; + nvidia,function = "sdmmc1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc1_dat1_pm4 { + nvidia,pins = "sdmmc1_dat1_pm4"; + nvidia,function = "sdmmc1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc1_dat0_pm5 { + nvidia,pins = "sdmmc1_dat0_pm5"; + nvidia,function = "sdmmc1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc3_clk_pp0 { + nvidia,pins = "sdmmc3_clk_pp0"; + 
nvidia,function = "sdmmc3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc3_cmd_pp1 { + nvidia,pins = "sdmmc3_cmd_pp1"; + nvidia,function = "sdmmc3"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc3_dat3_pp2 { + nvidia,pins = "sdmmc3_dat3_pp2"; + nvidia,function = "sdmmc3"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc3_dat2_pp3 { + nvidia,pins = "sdmmc3_dat2_pp3"; + nvidia,function = "sdmmc3"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc3_dat1_pp4 { + nvidia,pins = "sdmmc3_dat1_pp4"; + nvidia,function = "sdmmc3"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc3_dat0_pp5 { + nvidia,pins = "sdmmc3_dat0_pp5"; + nvidia,function = "sdmmc3"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cam1_mclk_ps0 { + nvidia,pins = "cam1_mclk_ps0"; + nvidia,function = "extperiph3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cam2_mclk_ps1 { + nvidia,pins = "cam2_mclk_ps1"; + nvidia,function = "extperiph3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cam_i2c_scl_ps2 { + nvidia,pins = "cam_i2c_scl_ps2"; + nvidia,function = "i2cvi"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + cam_i2c_sda_ps3 { + nvidia,pins = "cam_i2c_sda_ps3"; + nvidia,function = "i2cvi"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + cam_rst_ps4 { + nvidia,pins = "cam_rst_ps4"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cam_af_en_ps5 { + nvidia,pins = "cam_af_en_ps5"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cam_flash_en_ps6 { + nvidia,pins = "cam_flash_en_ps6"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cam1_pwdn_ps7 { + nvidia,pins = "cam1_pwdn_ps7"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cam2_pwdn_pt0 { + nvidia,pins = "cam2_pwdn_pt0"; + nvidia,pull = 
<TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cam1_strobe_pt1 { + nvidia,pins = "cam1_strobe_pt1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart1_tx_pu0 { + nvidia,pins = "uart1_tx_pu0"; + nvidia,function = "uarta"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart1_rx_pu1 { + nvidia,pins = "uart1_rx_pu1"; + nvidia,function = "uarta"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart1_rts_pu2 { + nvidia,pins = "uart1_rts_pu2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart1_cts_pu3 { + nvidia,pins = "uart1_cts_pu3"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + lcd_bl_pwm_pv0 { + nvidia,pins = "lcd_bl_pwm_pv0"; + nvidia,function = "pwm0"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + lcd_bl_en_pv1 { + nvidia,pins = "lcd_bl_en_pv1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + lcd_rst_pv2 { + nvidia,pins = "lcd_rst_pv2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + lcd_gpio1_pv3 { + nvidia,pins = "lcd_gpio1_pv3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + lcd_gpio2_pv4 { + nvidia,pins = "lcd_gpio2_pv4"; + nvidia,function = "pwm1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + ap_ready_pv5 { + nvidia,pins = "ap_ready_pv5"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + touch_rst_pv6 { + nvidia,pins = "touch_rst_pv6"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + touch_clk_pv7 { + nvidia,pins = "touch_clk_pv7"; + nvidia,function = "touch"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + modem_wake_ap_px0 { + nvidia,pins = "modem_wake_ap_px0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + touch_int_px1 { + nvidia,pins = "touch_int_px1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = 
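/*
 * Board files like this one typically expose the whole block above as the
 * device's "default" pinctrl state. A minimal sketch of the consumer side,
 * using the standard pinctrl consumer API (the probe function and device
 * here are hypothetical):
 *
 *   #include <linux/pinctrl/consumer.h>
 *
 *   static int example_probe(struct platform_device *pdev)
 *   {
 *           struct pinctrl *p;
 *
 *           p = devm_pinctrl_get_select_default(&pdev->dev);
 *           if (IS_ERR(p))
 *                   return PTR_ERR(p);
 *           return 0;
 *   }
 *
 * In practice the driver core selects the "default" state before probe()
 * runs, so most drivers never call this API themselves.
 */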
<TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + motion_int_px2 { + nvidia,pins = "motion_int_px2"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + als_prox_int_px3 { + nvidia,pins = "als_prox_int_px3"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + temp_alert_px4 { + nvidia,pins = "temp_alert_px4"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + button_power_on_px5 { + nvidia,pins = "button_power_on_px5"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + button_vol_up_px6 { + nvidia,pins = "button_vol_up_px6"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + button_vol_down_px7 { + nvidia,pins = "button_vol_down_px7"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + button_slide_sw_py0 { + nvidia,pins = "button_slide_sw_py0"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + button_home_py1 { + nvidia,pins = "button_home_py1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + lcd_te_py2 { + nvidia,pins = "lcd_te_py2"; + nvidia,function = "displaya"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pwr_i2c_scl_py3 { + nvidia,pins = "pwr_i2c_scl_py3"; + nvidia,function = "i2cpmu"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + pwr_i2c_sda_py4 { + nvidia,pins = "pwr_i2c_sda_py4"; + nvidia,function = "i2cpmu"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + clk_32k_out_py5 { + nvidia,pins = "clk_32k_out_py5"; + nvidia,function = "soc"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pz0 { + nvidia,pins = "pz0"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pz1 { + nvidia,pins = "pz1"; + nvidia,function = "sdmmc1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pz2 { + nvidia,pins = "pz2"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = 
<TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pz3 { + nvidia,pins = "pz3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pz4 { + nvidia,pins = "pz4"; + nvidia,function = "sdmmc1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pz5 { + nvidia,pins = "pz5"; + nvidia,function = "soc"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap2_fs_paa0 { + nvidia,pins = "dap2_fs_paa0"; + nvidia,function = "i2s2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap2_sclk_paa1 { + nvidia,pins = "dap2_sclk_paa1"; + nvidia,function = "i2s2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap2_din_paa2 { + nvidia,pins = "dap2_din_paa2"; + nvidia,function = "i2s2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap2_dout_paa3 { + nvidia,pins = "dap2_dout_paa3"; + nvidia,function = "i2s2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + aud_mclk_pbb0 { + nvidia,pins = "aud_mclk_pbb0"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dvfs_pwm_pbb1 { + nvidia,pins = "dvfs_pwm_pbb1"; + nvidia,function = "cldvfs"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dvfs_clk_pbb2 { + nvidia,pins = "dvfs_clk_pbb2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + gpio_x1_aud_pbb3 { + nvidia,pins = "gpio_x1_aud_pbb3"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + gpio_x3_aud_pbb4 { + nvidia,pins = "gpio_x3_aud_pbb4"; + nvidia,function = "rsvd0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + hdmi_cec_pcc0 { + nvidia,pins = "hdmi_cec_pcc0"; + nvidia,function = "cec"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_ENABLE>; + }; + hdmi_int_dp_hpd_pcc1 { + nvidia,pins = "hdmi_int_dp_hpd_pcc1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + spdif_out_pcc2 { + 
nvidia,pins = "spdif_out_pcc2"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spdif_in_pcc3 { + nvidia,pins = "spdif_in_pcc3"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + usb_vbus_en0_pcc4 { + nvidia,pins = "usb_vbus_en0_pcc4"; + nvidia,function = "usb"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_ENABLE>; + }; + usb_vbus_en1_pcc5 { + nvidia,pins = "usb_vbus_en1_pcc5"; + nvidia,function = "usb"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_ENABLE>; + }; + dp_hpd0_pcc6 { + nvidia,pins = "dp_hpd0_pcc6"; + nvidia,function = "dp"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pcc7 { + nvidia,pins = "pcc7"; + nvidia,function = "rsvd0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + spi2_cs1_pdd0 { + nvidia,pins = "spi2_cs1_pdd0"; + nvidia,function = "spi2"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + qspi_sck_pee0 { + nvidia,pins = "qspi_sck_pee0"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + qspi_cs_n_pee1 { + nvidia,pins = "qspi_cs_n_pee1"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + qspi_io0_pee2 { + nvidia,pins = "qspi_io0_pee2"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + qspi_io1_pee3 { + nvidia,pins = "qspi_io1_pee3"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + qspi_io2_pee4 { + nvidia,pins = "qspi_io2_pee4"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + qspi_io3_pee5 { + nvidia,pins = "qspi_io3_pee5"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + core_pwr_req { + nvidia,pins = "core_pwr_req"; + nvidia,function = "core"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = 
<TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cpu_pwr_req { + nvidia,pins = "cpu_pwr_req"; + nvidia,function = "cpu"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pwr_int_n { + nvidia,pins = "pwr_int_n"; + nvidia,function = "pmi"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + clk_32k_in { + nvidia,pins = "clk_32k_in"; + nvidia,function = "clk"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + jtag_rtck { + nvidia,pins = "jtag_rtck"; + nvidia,function = "jtag"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + clk_req { + nvidia,pins = "clk_req"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + shutdown { + nvidia,pins = "shutdown"; + nvidia,function = "shutdown"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + }; + }; + + /* MMC/SD */ + sdhci@0,700b0000 { + status = "okay"; + bus-width = <4>; + no-1-8-v; + + cd-gpios = <&gpio TEGRA_GPIO(Z, 1) GPIO_ACTIVE_LOW>; + }; +}; diff --git a/arch/arm64/boot/dts/nvidia/tegra210.dtsi b/arch/arm64/boot/dts/nvidia/tegra210.dtsi new file mode 100644 index 000000000000..bc23f4dea002 --- /dev/null +++ b/arch/arm64/boot/dts/nvidia/tegra210.dtsi @@ -0,0 +1,805 @@ +#include <dt-bindings/clock/tegra210-car.h> +#include <dt-bindings/gpio/tegra-gpio.h> +#include <dt-bindings/memory/tegra210-mc.h> +#include <dt-bindings/pinctrl/pinctrl-tegra.h> +#include <dt-bindings/interrupt-controller/arm-gic.h> + +/ { + compatible = "nvidia,tegra210"; + interrupt-parent = <&lic>; + #address-cells = <2>; + #size-cells = <2>; + + host1x@0,50000000 { + compatible = "nvidia,tegra210-host1x", "simple-bus"; + reg = <0x0 0x50000000 0x0 0x00034000>; + interrupts = <GIC_SPI 65 IRQ_TYPE_LEVEL_HIGH>, /* syncpt */ + <GIC_SPI 67 IRQ_TYPE_LEVEL_HIGH>; /* general */ + clocks = <&tegra_car TEGRA210_CLK_HOST1X>; + clock-names = "host1x"; + resets = <&tegra_car 28>; + reset-names = "host1x"; + + #address-cells = <2>; + #size-cells = <2>; + + ranges = <0x0 0x54000000 0x0 0x54000000 0x0 0x01000000>; + + dpaux1: dpaux@0,54040000 { + compatible = "nvidia,tegra210-dpaux"; + reg = <0x0 0x54040000 0x0 0x00040000>; + interrupts = <GIC_SPI 11 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA210_CLK_DPAUX1>, + <&tegra_car TEGRA210_CLK_PLL_DP>; + clock-names = "dpaux", "parent"; + resets = <&tegra_car 207>; + reset-names = "dpaux"; + status = "disabled"; + }; + + vi@0,54080000 { + compatible = "nvidia,tegra210-vi"; + reg = <0x0 0x54080000 0x0 0x00040000>; + interrupts = <GIC_SPI 69 IRQ_TYPE_LEVEL_HIGH>; + status = "disabled"; + }; + + tsec@0,54100000 { + compatible = "nvidia,tegra210-tsec"; + reg = <0x0 0x54100000 0x0 0x00040000>; + }; + + dc@0,54200000 { + compatible = "nvidia,tegra210-dc"; + reg = <0x0 0x54200000 0x0 0x00040000>; + interrupts = <GIC_SPI 73 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car 
TEGRA210_CLK_DISP1>, + <&tegra_car TEGRA210_CLK_PLL_P>; + clock-names = "dc", "parent"; + resets = <&tegra_car 27>; + reset-names = "dc"; + + iommus = <&mc TEGRA_SWGROUP_DC>; + + nvidia,head = <0>; + }; + + dc@0,54240000 { + compatible = "nvidia,tegra210-dc"; + reg = <0x0 0x54240000 0x0 0x00040000>; + interrupts = <GIC_SPI 74 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA210_CLK_DISP2>, + <&tegra_car TEGRA210_CLK_PLL_P>; + clock-names = "dc", "parent"; + resets = <&tegra_car 26>; + reset-names = "dc"; + + iommus = <&mc TEGRA_SWGROUP_DCB>; + + nvidia,head = <1>; + }; + + dsi@0,54300000 { + compatible = "nvidia,tegra210-dsi"; + reg = <0x0 0x54300000 0x0 0x00040000>; + clocks = <&tegra_car TEGRA210_CLK_DSIA>, + <&tegra_car TEGRA210_CLK_DSIALP>, + <&tegra_car TEGRA210_CLK_PLL_D_OUT0>; + clock-names = "dsi", "lp", "parent"; + resets = <&tegra_car 48>; + reset-names = "dsi"; + nvidia,mipi-calibrate = <&mipi 0x0c0>; /* DSIA & DSIB pads */ + + status = "disabled"; + + #address-cells = <1>; + #size-cells = <0>; + }; + + vic@0,54340000 { + compatible = "nvidia,tegra210-vic"; + reg = <0x0 0x54340000 0x0 0x00040000>; + status = "disabled"; + }; + + nvjpg@0,54380000 { + compatible = "nvidia,tegra210-nvjpg"; + reg = <0x0 0x54380000 0x0 0x00040000>; + status = "disabled"; + }; + + dsi@0,54400000 { + compatible = "nvidia,tegra210-dsi"; + reg = <0x0 0x54400000 0x0 0x00040000>; + clocks = <&tegra_car TEGRA210_CLK_DSIB>, + <&tegra_car TEGRA210_CLK_DSIBLP>, + <&tegra_car TEGRA210_CLK_PLL_D_OUT0>; + clock-names = "dsi", "lp", "parent"; + resets = <&tegra_car 82>; + reset-names = "dsi"; + nvidia,mipi-calibrate = <&mipi 0x300>; /* DSIC & DSID pads */ + + status = "disabled"; + + #address-cells = <1>; + #size-cells = <0>; + }; + + nvdec@0,54480000 { + compatible = "nvidia,tegra210-nvdec"; + reg = <0x0 0x54480000 0x0 0x00040000>; + status = "disabled"; + }; + + nvenc@0,544c0000 { + compatible = "nvidia,tegra210-nvenc"; + reg = <0x0 0x544c0000 0x0 0x00040000>; + status = "disabled"; + }; + + tsec@0,54500000 { + compatible = "nvidia,tegra210-tsec"; + reg = <0x0 0x54500000 0x0 0x00040000>; + status = "disabled"; + }; + + sor@0,54540000 { + compatible = "nvidia,tegra210-sor"; + reg = <0x0 0x54540000 0x0 0x00040000>; + interrupts = <GIC_SPI 76 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA210_CLK_SOR0>, + <&tegra_car TEGRA210_CLK_PLL_D_OUT0>, + <&tegra_car TEGRA210_CLK_PLL_DP>, + <&tegra_car TEGRA210_CLK_SOR_SAFE>; + clock-names = "sor", "parent", "dp", "safe"; + resets = <&tegra_car 182>; + reset-names = "sor"; + status = "disabled"; + }; + + sor@0,54580000 { + compatible = "nvidia,tegra210-sor1"; + reg = <0x0 0x54580000 0x0 0x00040000>; + interrupts = <GIC_SPI 75 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA210_CLK_SOR1>, + <&tegra_car TEGRA210_CLK_PLL_D2_OUT0>, + <&tegra_car TEGRA210_CLK_PLL_DP>, + <&tegra_car TEGRA210_CLK_SOR_SAFE>; + clock-names = "sor", "parent", "dp", "safe"; + resets = <&tegra_car 183>; + reset-names = "sor"; + status = "disabled"; + }; + + dpaux: dpaux@0,545c0000 { + compatible = "nvidia,tegra124-dpaux"; + reg = <0x0 0x545c0000 0x0 0x00040000>; + interrupts = <GIC_SPI 159 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA210_CLK_DPAUX>, + <&tegra_car TEGRA210_CLK_PLL_DP>; + clock-names = "dpaux", "parent"; + resets = <&tegra_car 181>; + reset-names = "dpaux"; + status = "disabled"; + }; + + isp@0,54600000 { + compatible = "nvidia,tegra210-isp"; + reg = <0x0 0x54600000 0x0 0x00040000>; + interrupts = <GIC_SPI 71 IRQ_TYPE_LEVEL_HIGH>; + status = "disabled"; + }; + + isp@0,54680000 { 
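/*
 * The display nodes above all follow the same clocks/resets consumer
 * pattern: entries are looked up by the strings in clock-names and
 * reset-names. A hedged sketch of the driver side ("dc" matches the names
 * above; error handling and the exact bring-up order are elided):
 *
 *   struct clk *clk = devm_clk_get(&pdev->dev, "dc");
 *   struct reset_control *rst = devm_reset_control_get(&pdev->dev, "dc");
 *
 *   reset_control_assert(rst);
 *   clk_prepare_enable(clk);
 *   reset_control_deassert(rst);
 *
 * The bare numbers in <&tegra_car 27> and friends index the CAR module's
 * reset lines directly, which is why no named constants appear there.
 */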
+ compatible = "nvidia,tegra210-isp"; + reg = <0x0 0x54680000 0x0 0x00040000>; + interrupts = <GIC_SPI 70 IRQ_TYPE_LEVEL_HIGH>; + status = "disabled"; + }; + + i2c@0,546c0000 { + compatible = "nvidia,tegra210-i2c-vi"; + reg = <0x0 0x546c0000 0x0 0x00040000>; + interrupts = <GIC_SPI 17 IRQ_TYPE_LEVEL_HIGH>; + status = "disabled"; + }; + }; + + gic: interrupt-controller@0,50041000 { + compatible = "arm,gic-400"; + #interrupt-cells = <3>; + interrupt-controller; + reg = <0x0 0x50041000 0x0 0x1000>, + <0x0 0x50042000 0x0 0x2000>, + <0x0 0x50044000 0x0 0x2000>, + <0x0 0x50046000 0x0 0x2000>; + interrupts = <GIC_PPI 9 + (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_LEVEL_HIGH)>; + interrupt-parent = <&gic>; + }; + + gpu@0,57000000 { + compatible = "nvidia,gm20b"; + reg = <0x0 0x57000000 0x0 0x01000000>, + <0x0 0x58000000 0x0 0x01000000>; + interrupts = <GIC_SPI 157 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 158 IRQ_TYPE_LEVEL_HIGH>; + interrupt-names = "stall", "nonstall"; + clocks = <&tegra_car TEGRA210_CLK_GPU>, + <&tegra_car TEGRA210_CLK_PLL_P_OUT5>; + clock-names = "gpu", "pwr"; + resets = <&tegra_car 184>; + reset-names = "gpu"; + status = "disabled"; + }; + + lic: interrupt-controller@0,60004000 { + compatible = "nvidia,tegra210-ictlr"; + reg = <0x0 0x60004000 0x0 0x40>, /* primary controller */ + <0x0 0x60004100 0x0 0x40>, /* secondary controller */ + <0x0 0x60004200 0x0 0x40>, /* tertiary controller */ + <0x0 0x60004300 0x0 0x40>, /* quaternary controller */ + <0x0 0x60004400 0x0 0x40>, /* quinary controller */ + <0x0 0x60004500 0x0 0x40>; /* senary controller */ + interrupt-controller; + #interrupt-cells = <3>; + interrupt-parent = <&gic>; + }; + + timer@0,60005000 { + compatible = "nvidia,tegra210-timer", "nvidia,tegra20-timer"; + reg = <0x0 0x60005000 0x0 0x400>; + interrupts = <GIC_SPI 0 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 1 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 41 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 42 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 121 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 122 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA210_CLK_TIMER>; + clock-names = "timer"; + }; + + tegra_car: clock@0,60006000 { + compatible = "nvidia,tegra210-car"; + reg = <0x0 0x60006000 0x0 0x1000>; + #clock-cells = <1>; + #reset-cells = <1>; + }; + + flow-controller@0,60007000 { + compatible = "nvidia,tegra210-flowctrl"; + reg = <0x0 0x60007000 0x0 0x1000>; + }; + + gpio: gpio@0,6000d000 { + compatible = "nvidia,tegra210-gpio", "nvidia,tegra124-gpio", "nvidia,tegra30-gpio"; + reg = <0x0 0x6000d000 0x0 0x1000>; + interrupts = <GIC_SPI 32 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 33 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 34 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 35 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 55 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 87 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 89 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 125 IRQ_TYPE_LEVEL_HIGH>; + #gpio-cells = <2>; + gpio-controller; + #interrupt-cells = <2>; + interrupt-controller; + }; + + apbdma: dma@0,60020000 { + compatible = "nvidia,tegra210-apbdma", "nvidia,tegra148-apbdma"; + reg = <0x0 0x60020000 0x0 0x1400>; + interrupts = <GIC_SPI 104 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 105 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 106 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 107 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 108 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 109 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 110 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 111 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 112 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 113 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 114 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 115 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 116 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 117 
IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 118 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 119 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 128 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 129 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 130 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 131 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 132 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 133 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 134 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 135 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 136 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 137 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 138 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 139 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 140 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 141 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 142 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 143 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA210_CLK_APBDMA>; + clock-names = "dma"; + resets = <&tegra_car 34>; + reset-names = "dma"; + #dma-cells = <1>; + }; + + apbmisc@0,70000800 { + compatible = "nvidia,tegra210-apbmisc", "nvidia,tegra20-apbmisc"; + reg = <0x0 0x70000800 0x0 0x64>, /* Chip revision */ + <0x0 0x7000e864 0x0 0x04>; /* Strapping options */ + }; + + pinmux: pinmux@0,700008d4 { + compatible = "nvidia,tegra210-pinmux"; + reg = <0x0 0x700008d4 0x0 0x29c>, /* Pad control registers */ + <0x0 0x70003000 0x0 0x294>; /* Mux registers */ + }; + + /* + * There are two serial drivers, i.e. the 8250 based simple serial + * driver and the APB DMA based serial driver for higher baudrate + * and performance. To enable the 8250 based driver, the compatible + * is "nvidia,tegra124-uart", "nvidia,tegra20-uart" and to enable + * the APB DMA based serial driver, the compatible is + * "nvidia,tegra124-hsuart", "nvidia,tegra30-hsuart". + */ + uarta: serial@0,70006000 { + compatible = "nvidia,tegra210-uart", "nvidia,tegra20-uart"; + reg = <0x0 0x70006000 0x0 0x40>; + reg-shift = <2>; + interrupts = <GIC_SPI 36 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA210_CLK_UARTA>; + clock-names = "serial"; + resets = <&tegra_car 6>; + reset-names = "serial"; + dmas = <&apbdma 8>, <&apbdma 8>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + uartb: serial@0,70006040 { + compatible = "nvidia,tegra210-uart", "nvidia,tegra20-uart"; + reg = <0x0 0x70006040 0x0 0x40>; + reg-shift = <2>; + interrupts = <GIC_SPI 37 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA210_CLK_UARTB>; + clock-names = "serial"; + resets = <&tegra_car 7>; + reset-names = "serial"; + dmas = <&apbdma 9>, <&apbdma 9>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + uartc: serial@0,70006200 { + compatible = "nvidia,tegra210-uart", "nvidia,tegra20-uart"; + reg = <0x0 0x70006200 0x0 0x40>; + reg-shift = <2>; + interrupts = <GIC_SPI 46 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA210_CLK_UARTC>; + clock-names = "serial"; + resets = <&tegra_car 55>; + reset-names = "serial"; + dmas = <&apbdma 10>, <&apbdma 10>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + uartd: serial@0,70006300 { + compatible = "nvidia,tegra210-uart", "nvidia,tegra20-uart"; + reg = <0x0 0x70006300 0x0 0x40>; + reg-shift = <2>; + interrupts = <GIC_SPI 90 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA210_CLK_UARTD>; + clock-names = "serial"; + resets = <&tegra_car 65>; + reset-names = "serial"; + dmas = <&apbdma 19>, <&apbdma 19>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + pwm: pwm@0,7000a000 { + compatible = "nvidia,tegra210-pwm", "nvidia,tegra20-pwm"; + reg = <0x0 0x7000a000 0x0 0x100>; + #pwm-cells = <2>; + clocks = <&tegra_car TEGRA210_CLK_PWM>; + clock-names = "pwm"; + resets = <&tegra_car 17>; + reset-names = "pwm"; + status = "disabled"; + }; + + i2c@0,7000c000 { 
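/*
 * The dmas = <&apbdma N>, <&apbdma N>; plus dma-names = "rx", "tx"; pairs
 * in the serial, i2c and spi nodes are resolved through the generic
 * DMA-engine binding. A minimal sketch of the consumer side (standard API;
 * the surrounding driver code is assumed):
 *
 *   struct dma_chan *rx = dma_request_slave_channel(&pdev->dev, "rx");
 *   struct dma_chan *tx = dma_request_slave_channel(&pdev->dev, "tx");
 *
 * #dma-cells = <1> in the apbdma node means each phandle carries one extra
 * cell, the APB DMA request selector (8 for UARTA, 21 for I2C1, and so on).
 */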
+ compatible = "nvidia,tegra210-i2c", "nvidia,tegra114-i2c"; + reg = <0x0 0x7000c000 0x0 0x100>; + interrupts = <GIC_SPI 38 IRQ_TYPE_LEVEL_HIGH>; + #address-cells = <1>; + #size-cells = <0>; + clocks = <&tegra_car TEGRA210_CLK_I2C1>; + clock-names = "div-clk"; + resets = <&tegra_car 12>; + reset-names = "i2c"; + dmas = <&apbdma 21>, <&apbdma 21>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + i2c@0,7000c400 { + compatible = "nvidia,tegra210-i2c", "nvidia,tegra114-i2c"; + reg = <0x0 0x7000c400 0x0 0x100>; + interrupts = <GIC_SPI 84 IRQ_TYPE_LEVEL_HIGH>; + #address-cells = <1>; + #size-cells = <0>; + clocks = <&tegra_car TEGRA210_CLK_I2C2>; + clock-names = "div-clk"; + resets = <&tegra_car 54>; + reset-names = "i2c"; + dmas = <&apbdma 22>, <&apbdma 22>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + i2c@0,7000c500 { + compatible = "nvidia,tegra210-i2c", "nvidia,tegra114-i2c"; + reg = <0x0 0x7000c500 0x0 0x100>; + interrupts = <GIC_SPI 92 IRQ_TYPE_LEVEL_HIGH>; + #address-cells = <1>; + #size-cells = <0>; + clocks = <&tegra_car TEGRA210_CLK_I2C3>; + clock-names = "div-clk"; + resets = <&tegra_car 67>; + reset-names = "i2c"; + dmas = <&apbdma 23>, <&apbdma 23>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + i2c@0,7000c700 { + compatible = "nvidia,tegra210-i2c", "nvidia,tegra114-i2c"; + reg = <0x0 0x7000c700 0x0 0x100>; + interrupts = <GIC_SPI 120 IRQ_TYPE_LEVEL_HIGH>; + #address-cells = <1>; + #size-cells = <0>; + clocks = <&tegra_car TEGRA210_CLK_I2C4>; + clock-names = "div-clk"; + resets = <&tegra_car 103>; + reset-names = "i2c"; + dmas = <&apbdma 26>, <&apbdma 26>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + i2c@0,7000d000 { + compatible = "nvidia,tegra210-i2c", "nvidia,tegra114-i2c"; + reg = <0x0 0x7000d000 0x0 0x100>; + interrupts = <GIC_SPI 53 IRQ_TYPE_LEVEL_HIGH>; + #address-cells = <1>; + #size-cells = <0>; + clocks = <&tegra_car TEGRA210_CLK_I2C5>; + clock-names = "div-clk"; + resets = <&tegra_car 47>; + reset-names = "i2c"; + dmas = <&apbdma 24>, <&apbdma 24>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + i2c@0,7000d100 { + compatible = "nvidia,tegra210-i2c", "nvidia,tegra114-i2c"; + reg = <0x0 0x7000d100 0x0 0x100>; + interrupts = <GIC_SPI 63 IRQ_TYPE_LEVEL_HIGH>; + #address-cells = <1>; + #size-cells = <0>; + clocks = <&tegra_car TEGRA210_CLK_I2C6>; + clock-names = "div-clk"; + resets = <&tegra_car 166>; + reset-names = "i2c"; + dmas = <&apbdma 30>, <&apbdma 30>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + spi@0,7000d400 { + compatible = "nvidia,tegra210-spi", "nvidia,tegra114-spi"; + reg = <0x0 0x7000d400 0x0 0x200>; + interrupts = <GIC_SPI 59 IRQ_TYPE_LEVEL_HIGH>; + #address-cells = <1>; + #size-cells = <0>; + clocks = <&tegra_car TEGRA210_CLK_SBC1>; + clock-names = "spi"; + resets = <&tegra_car 41>; + reset-names = "spi"; + dmas = <&apbdma 15>, <&apbdma 15>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + spi@0,7000d600 { + compatible = "nvidia,tegra210-spi", "nvidia,tegra114-spi"; + reg = <0x0 0x7000d600 0x0 0x200>; + interrupts = <GIC_SPI 82 IRQ_TYPE_LEVEL_HIGH>; + #address-cells = <1>; + #size-cells = <0>; + clocks = <&tegra_car TEGRA210_CLK_SBC2>; + clock-names = "spi"; + resets = <&tegra_car 44>; + reset-names = "spi"; + dmas = <&apbdma 16>, <&apbdma 16>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + spi@0,7000d800 { + compatible = "nvidia,tegra210-spi", "nvidia,tegra114-spi"; + reg = <0x0 0x7000d800 0x0 0x200>; + interrupts = <GIC_SPI 83 IRQ_TYPE_LEVEL_HIGH>; + #address-cells = 
<1>; + #size-cells = <0>; + clocks = <&tegra_car TEGRA210_CLK_SBC3>; + clock-names = "spi"; + resets = <&tegra_car 46>; + reset-names = "spi"; + dmas = <&apbdma 17>, <&apbdma 17>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + spi@0,7000da00 { + compatible = "nvidia,tegra210-spi", "nvidia,tegra114-spi"; + reg = <0x0 0x7000da00 0x0 0x200>; + interrupts = <GIC_SPI 93 IRQ_TYPE_LEVEL_HIGH>; + #address-cells = <1>; + #size-cells = <0>; + clocks = <&tegra_car TEGRA210_CLK_SBC4>; + clock-names = "spi"; + resets = <&tegra_car 68>; + reset-names = "spi"; + dmas = <&apbdma 18>, <&apbdma 18>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + rtc@0,7000e000 { + compatible = "nvidia,tegra210-rtc", "nvidia,tegra20-rtc"; + reg = <0x0 0x7000e000 0x0 0x100>; + interrupts = <GIC_SPI 2 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA210_CLK_RTC>; + clock-names = "rtc"; + }; + + pmc: pmc@0,7000e400 { + compatible = "nvidia,tegra210-pmc"; + reg = <0x0 0x7000e400 0x0 0x400>; + clocks = <&tegra_car TEGRA210_CLK_PCLK>, <&clk32k_in>; + clock-names = "pclk", "clk32k_in"; + + #power-domain-cells = <1>; + }; + + fuse@0,7000f800 { + compatible = "nvidia,tegra210-efuse"; + reg = <0x0 0x7000f800 0x0 0x400>; + clocks = <&tegra_car TEGRA210_CLK_FUSE>; + clock-names = "fuse"; + resets = <&tegra_car 39>; + reset-names = "fuse"; + }; + + mc: memory-controller@0,70019000 { + compatible = "nvidia,tegra210-mc"; + reg = <0x0 0x70019000 0x0 0x1000>; + clocks = <&tegra_car TEGRA210_CLK_MC>; + clock-names = "mc"; + + interrupts = <GIC_SPI 77 IRQ_TYPE_LEVEL_HIGH>; + + #iommu-cells = <1>; + }; + + hda@0,70030000 { + compatible = "nvidia,tegra210-hda", "nvidia,tegra30-hda"; + reg = <0x0 0x70030000 0x0 0x10000>; + interrupts = <GIC_SPI 81 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA210_CLK_HDA>, + <&tegra_car TEGRA210_CLK_HDA2HDMI>, + <&tegra_car TEGRA210_CLK_HDA2CODEC_2X>; + clock-names = "hda", "hda2hdmi", "hda2codec_2x"; + resets = <&tegra_car 125>, /* hda */ + <&tegra_car 128>, /* hda2hdmi */ + <&tegra_car 111>; /* hda2codec_2x */ + reset-names = "hda", "hda2hdmi", "hda2codec_2x"; + status = "disabled"; + }; + + sdhci@0,700b0000 { + compatible = "nvidia,tegra210-sdhci", "nvidia,tegra124-sdhci"; + reg = <0x0 0x700b0000 0x0 0x200>; + interrupts = <GIC_SPI 14 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA210_CLK_SDMMC1>; + clock-names = "sdhci"; + resets = <&tegra_car 14>; + reset-names = "sdhci"; + status = "disabled"; + }; + + sdhci@0,700b0200 { + compatible = "nvidia,tegra210-sdhci", "nvidia,tegra124-sdhci"; + reg = <0x0 0x700b0200 0x0 0x200>; + interrupts = <GIC_SPI 15 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA210_CLK_SDMMC2>; + clock-names = "sdhci"; + resets = <&tegra_car 9>; + reset-names = "sdhci"; + status = "disabled"; + }; + + sdhci@0,700b0400 { + compatible = "nvidia,tegra210-sdhci", "nvidia,tegra124-sdhci"; + reg = <0x0 0x700b0400 0x0 0x200>; + interrupts = <GIC_SPI 19 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA210_CLK_SDMMC3>; + clock-names = "sdhci"; + resets = <&tegra_car 69>; + reset-names = "sdhci"; + status = "disabled"; + }; + + sdhci@0,700b0600 { + compatible = "nvidia,tegra210-sdhci", "nvidia,tegra124-sdhci"; + reg = <0x0 0x700b0600 0x0 0x200>; + interrupts = <GIC_SPI 31 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA210_CLK_SDMMC4>; + clock-names = "sdhci"; + resets = <&tegra_car 15>; + reset-names = "sdhci"; + status = "disabled"; + }; + + mipi: mipi@0,700e3000 { + compatible = "nvidia,tegra210-mipi"; + reg = <0x0 0x700e3000 0x0 0x100>; + clocks = <&tegra_car 
TEGRA210_CLK_MIPI_CAL>; + clock-names = "mipi-cal"; + #nvidia,mipi-calibrate-cells = <1>; + }; + + spi@0,70410000 { + compatible = "nvidia,tegra210-qspi"; + reg = <0x0 0x70410000 0x0 0x1000>; + interrupts = <GIC_SPI 10 IRQ_TYPE_LEVEL_HIGH>; + #address-cells = <1>; + #size-cells = <0>; + clocks = <&tegra_car TEGRA210_CLK_QSPI>; + clock-names = "qspi"; + resets = <&tegra_car 211>; + reset-names = "qspi"; + dmas = <&apbdma 5>, <&apbdma 5>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + usb@0,7d000000 { + compatible = "nvidia,tegra210-ehci", "nvidia,tegra30-ehci", "usb-ehci"; + reg = <0x0 0x7d000000 0x0 0x4000>; + interrupts = <GIC_SPI 20 IRQ_TYPE_LEVEL_HIGH>; + phy_type = "utmi"; + clocks = <&tegra_car TEGRA210_CLK_USBD>; + clock-names = "usb"; + resets = <&tegra_car 22>; + reset-names = "usb"; + nvidia,phy = <&phy1>; + status = "disabled"; + }; + + phy1: usb-phy@0,7d000000 { + compatible = "nvidia,tegra210-usb-phy", "nvidia,tegra30-usb-phy"; + reg = <0x0 0x7d000000 0x0 0x4000>, + <0x0 0x7d000000 0x0 0x4000>; + phy_type = "utmi"; + clocks = <&tegra_car TEGRA210_CLK_USBD>, + <&tegra_car TEGRA210_CLK_PLL_U>, + <&tegra_car TEGRA210_CLK_USBD>; + clock-names = "reg", "pll_u", "utmi-pads"; + resets = <&tegra_car 22>, <&tegra_car 22>; + reset-names = "usb", "utmi-pads"; + nvidia,hssync-start-delay = <0>; + nvidia,idle-wait-delay = <17>; + nvidia,elastic-limit = <16>; + nvidia,term-range-adj = <6>; + nvidia,xcvr-setup = <9>; + nvidia,xcvr-lsfslew = <0>; + nvidia,xcvr-lsrslew = <3>; + nvidia,hssquelch-level = <2>; + nvidia,hsdiscon-level = <5>; + nvidia,xcvr-hsslew = <12>; + nvidia,has-utmi-pad-registers; + status = "disabled"; + }; + + usb@0,7d004000 { + compatible = "nvidia,tegra210-ehci", "nvidia,tegra30-ehci", "usb-ehci"; + reg = <0x0 0x7d004000 0x0 0x4000>; + interrupts = <GIC_SPI 21 IRQ_TYPE_LEVEL_HIGH>; + phy_type = "utmi"; + clocks = <&tegra_car TEGRA210_CLK_USB2>; + clock-names = "usb"; + resets = <&tegra_car 58>; + reset-names = "usb"; + nvidia,phy = <&phy2>; + status = "disabled"; + }; + + phy2: usb-phy@0,7d004000 { + compatible = "nvidia,tegra210-usb-phy", "nvidia,tegra30-usb-phy"; + reg = <0x0 0x7d004000 0x0 0x4000>, + <0x0 0x7d000000 0x0 0x4000>; + phy_type = "utmi"; + clocks = <&tegra_car TEGRA210_CLK_USB2>, + <&tegra_car TEGRA210_CLK_PLL_U>, + <&tegra_car TEGRA210_CLK_USBD>; + clock-names = "reg", "pll_u", "utmi-pads"; + resets = <&tegra_car 58>, <&tegra_car 22>; + reset-names = "usb", "utmi-pads"; + nvidia,hssync-start-delay = <0>; + nvidia,idle-wait-delay = <17>; + nvidia,elastic-limit = <16>; + nvidia,term-range-adj = <6>; + nvidia,xcvr-setup = <9>; + nvidia,xcvr-lsfslew = <0>; + nvidia,xcvr-lsrslew = <3>; + nvidia,hssquelch-level = <2>; + nvidia,hsdiscon-level = <5>; + nvidia,xcvr-hsslew = <12>; + status = "disabled"; + }; + + cpus { + #address-cells = <1>; + #size-cells = <0>; + + cpu@0 { + device_type = "cpu"; + compatible = "arm,cortex-a57"; + reg = <0>; + }; + + cpu@1 { + device_type = "cpu"; + compatible = "arm,cortex-a57"; + reg = <1>; + }; + + cpu@2 { + device_type = "cpu"; + compatible = "arm,cortex-a57"; + reg = <2>; + }; + + cpu@3 { + device_type = "cpu"; + compatible = "arm,cortex-a57"; + reg = <3>; + }; + }; + + timer { + compatible = "arm,armv8-timer"; + interrupts = <GIC_PPI 13 + (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_LEVEL_LOW)>, + <GIC_PPI 14 + (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_LEVEL_LOW)>, + <GIC_PPI 11 + (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_LEVEL_LOW)>, + <GIC_PPI 10 + (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_LEVEL_LOW)>; + interrupt-parent = <&gic>; + }; +}; diff 
--git a/arch/ia64/include/asm/unistd.h b/arch/ia64/include/asm/unistd.h index 74c132d901bd..6a8685051b67 100644 --- a/arch/ia64/include/asm/unistd.h +++ b/arch/ia64/include/asm/unistd.h @@ -11,7 +11,7 @@ -#define NR_syscalls 323 /* length of syscall table */ +#define NR_syscalls 324 /* length of syscall table */ /* * The following defines stop scripts/checksyscalls.sh from complaining about diff --git a/arch/ia64/include/uapi/asm/unistd.h b/arch/ia64/include/uapi/asm/unistd.h index 762edce7572e..41369a103838 100644 --- a/arch/ia64/include/uapi/asm/unistd.h +++ b/arch/ia64/include/uapi/asm/unistd.h @@ -336,5 +336,6 @@ #define __NR_membarrier 1344 #define __NR_kcmp 1345 #define __NR_mlock2 1346 +#define __NR_copy_file_range 1347 #endif /* _UAPI_ASM_IA64_UNISTD_H */ diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S index 534a74acb849..477c55e244ec 100644 --- a/arch/ia64/kernel/entry.S +++ b/arch/ia64/kernel/entry.S @@ -1772,5 +1772,6 @@ sys_call_table: data8 sys_membarrier data8 sys_kcmp // 1345 data8 sys_mlock2 + data8 sys_copy_file_range .org sys_call_table + 8*NR_syscalls // guard against failures to increase NR_syscalls diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c index 5038fd578e65..2936a0044c04 100644 --- a/arch/powerpc/platforms/cell/spufs/file.c +++ b/arch/powerpc/platforms/cell/spufs/file.c @@ -1799,9 +1799,9 @@ static int spufs_mfc_fsync(struct file *file, loff_t start, loff_t end, int data struct inode *inode = file_inode(file); int err = filemap_write_and_wait_range(inode->i_mapping, start, end); if (!err) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); err = spufs_mfc_flush(file, NULL); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } return err; } diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c index ad4840f86be1..dfa863876778 100644 --- a/arch/powerpc/platforms/cell/spufs/inode.c +++ b/arch/powerpc/platforms/cell/spufs/inode.c @@ -163,7 +163,7 @@ static void spufs_prune_dir(struct dentry *dir) { struct dentry *dentry, *tmp; - mutex_lock(&d_inode(dir)->i_mutex); + inode_lock(d_inode(dir)); list_for_each_entry_safe(dentry, tmp, &dir->d_subdirs, d_child) { spin_lock(&dentry->d_lock); if (simple_positive(dentry)) { @@ -180,7 +180,7 @@ static void spufs_prune_dir(struct dentry *dir) } } shrink_dcache_parent(dir); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); } /* Caller must hold parent->i_mutex */ @@ -225,9 +225,9 @@ static int spufs_dir_close(struct inode *inode, struct file *file) parent = d_inode(dir->d_parent); ctx = SPUFS_I(d_inode(dir))->i_ctx; - mutex_lock_nested(&parent->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(parent, I_MUTEX_PARENT); ret = spufs_rmdir(parent, dir); - mutex_unlock(&parent->i_mutex); + inode_unlock(parent); WARN_ON(ret); return dcache_dir_close(inode, file); @@ -270,7 +270,7 @@ spufs_mkdir(struct inode *dir, struct dentry *dentry, unsigned int flags, inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; - mutex_lock(&inode->i_mutex); + inode_lock(inode); dget(dentry); inc_nlink(dir); @@ -291,7 +291,7 @@ spufs_mkdir(struct inode *dir, struct dentry *dentry, unsigned int flags, if (ret) spufs_rmdir(dir, dentry); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c index b2e5902bd8f4..0f3da2cb2bd6 100644 --- a/arch/s390/hypfs/inode.c +++ b/arch/s390/hypfs/inode.c @@ -67,7 +67,7 @@ static void 
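/*
 * The mutex_lock(&inode->i_mutex) -> inode_lock(inode) conversions running
 * through the spufs and hypfs hunks here depend on wrappers added to
 * include/linux/fs.h in the same series, presumably along these lines:
 *
 *   static inline void inode_lock(struct inode *inode)
 *   {
 *           mutex_lock(&inode->i_mutex);
 *   }
 *
 *   static inline void inode_unlock(struct inode *inode)
 *   {
 *           mutex_unlock(&inode->i_mutex);
 *   }
 *
 *   static inline void inode_lock_nested(struct inode *inode, unsigned subclass)
 *   {
 *           mutex_lock_nested(&inode->i_mutex, subclass);
 *   }
 *
 * Callers stop touching i_mutex directly, so the underlying lock can be
 * swapped out later without another tree-wide sweep.
 */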
hypfs_remove(struct dentry *dentry) struct dentry *parent; parent = dentry->d_parent; - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); if (simple_positive(dentry)) { if (d_is_dir(dentry)) simple_rmdir(d_inode(parent), dentry); @@ -76,7 +76,7 @@ static void hypfs_remove(struct dentry *dentry) } d_delete(dentry); dput(dentry); - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); } static void hypfs_delete_tree(struct dentry *root) @@ -331,7 +331,7 @@ static struct dentry *hypfs_create_file(struct dentry *parent, const char *name, struct dentry *dentry; struct inode *inode; - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); dentry = lookup_one_len(name, parent, strlen(name)); if (IS_ERR(dentry)) { dentry = ERR_PTR(-ENOMEM); @@ -359,7 +359,7 @@ static struct dentry *hypfs_create_file(struct dentry *parent, const char *name, d_instantiate(dentry, inode); dget(dentry); fail: - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); return dentry; } diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h index 1544fabcd7f9..c57fd1ea9689 100644 --- a/arch/x86/include/asm/pmem.h +++ b/arch/x86/include/asm/pmem.h @@ -67,18 +67,19 @@ static inline void arch_wmb_pmem(void) } /** - * __arch_wb_cache_pmem - write back a cache range with CLWB + * arch_wb_cache_pmem - write back a cache range with CLWB * @vaddr: virtual start address * @size: number of bytes to write back * * Write back a cache range using the CLWB (cache line write back) * instruction. This function requires explicit ordering with an - * arch_wmb_pmem() call. This API is internal to the x86 PMEM implementation. + * arch_wmb_pmem() call. */ -static inline void __arch_wb_cache_pmem(void *vaddr, size_t size) +static inline void arch_wb_cache_pmem(void __pmem *addr, size_t size) { u16 x86_clflush_size = boot_cpu_data.x86_clflush_size; unsigned long clflush_mask = x86_clflush_size - 1; + void *vaddr = (void __force *)addr; void *vend = vaddr + size; void *p; @@ -115,7 +116,7 @@ static inline size_t arch_copy_from_iter_pmem(void __pmem *addr, size_t bytes, len = copy_from_iter_nocache(vaddr, bytes, i); if (__iter_needs_pmem_wb(i)) - __arch_wb_cache_pmem(vaddr, bytes); + arch_wb_cache_pmem(addr, bytes); return len; } @@ -133,7 +134,7 @@ static inline void arch_clear_pmem(void __pmem *addr, size_t size) void *vaddr = (void __force *)addr; memset(vaddr, 0, size); - __arch_wb_cache_pmem(vaddr, size); + arch_wb_cache_pmem(addr, size); } static inline bool __arch_has_wmb_pmem(void) diff --git a/block/Makefile b/block/Makefile index db5f622c9d67..9eda2322b2d4 100644 --- a/block/Makefile +++ b/block/Makefile @@ -5,7 +5,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ - blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \ + blk-lib.o blk-mq.o blk-mq-tag.o \ blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \ genhd.o scsi_ioctl.o partition-generic.o ioprio.o \ badblocks.o partitions/ diff --git a/block/bio-integrity.c b/block/bio-integrity.c index f6325d573c10..711e4d8de6fa 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -66,7 +66,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, } if (unlikely(!bip)) - return NULL; + return ERR_PTR(-ENOMEM); memset(bip, 0, sizeof(*bip)); @@ -89,7 +89,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, return bip; err: mempool_free(bip, 
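/*
 * The bio_integrity_alloc() change in the surrounding hunks switches from
 * returning NULL to the kernel's ERR_PTR convention so callers can see why
 * the allocation failed. The helpers in include/linux/err.h are
 * essentially:
 *
 *   static inline void *ERR_PTR(long error)     { return (void *)error; }
 *   static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
 *   static inline bool IS_ERR(const void *ptr)
 *   {
 *           return IS_ERR_VALUE((unsigned long)ptr);
 *   }
 *
 * Errno values occupy the top page of the address space, so a single
 * pointer can carry either a real object or a -ENOMEM style error code.
 */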
bs->bio_integrity_pool); - return NULL; + return ERR_PTR(-ENOMEM); } EXPORT_SYMBOL(bio_integrity_alloc); @@ -298,10 +298,10 @@ int bio_integrity_prep(struct bio *bio) /* Allocate bio integrity payload and integrity vectors */ bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages); - if (unlikely(bip == NULL)) { + if (IS_ERR(bip)) { printk(KERN_ERR "could not allocate data integrity bioset\n"); kfree(buf); - return -EIO; + return PTR_ERR(bip); } bip->bip_flags |= BIP_BLOCK_INTEGRITY; @@ -465,9 +465,8 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src, BUG_ON(bip_src == NULL); bip = bio_integrity_alloc(bio, gfp_mask, bip_src->bip_vcnt); - - if (bip == NULL) - return -EIO; + if (IS_ERR(bip)) + return PTR_ERR(bip); memcpy(bip->bip_vec, bip_src->bip_vec, bip_src->bip_vcnt * sizeof(struct bio_vec)); diff --git a/block/blk-core.c b/block/blk-core.c index 476244d59309..ab51685988c2 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -680,6 +680,13 @@ static void blk_queue_usage_counter_release(struct percpu_ref *ref) wake_up_all(&q->mq_freeze_wq); } +static void blk_rq_timed_out_timer(unsigned long data) +{ + struct request_queue *q = (struct request_queue *)data; + + kblockd_schedule_work(&q->timeout_work); +} + struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) { struct request_queue *q; @@ -841,6 +848,7 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, if (blk_init_rl(&q->root_rl, q, GFP_KERNEL)) goto fail; + INIT_WORK(&q->timeout_work, blk_timeout_work); q->request_fn = rfn; q->prep_rq_fn = NULL; q->unprep_rq_fn = NULL; diff --git a/block/blk-mq.c b/block/blk-mq.c index 6889d7183a2a..4c0622fae413 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -603,8 +603,6 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, blk_mq_complete_request(rq, -EIO); return; } - if (rq->cmd_flags & REQ_NO_TIMEOUT) - return; if (time_after_eq(jiffies, rq->deadline)) { if (!blk_mark_rq_complete(rq)) @@ -615,15 +613,19 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, } } -static void blk_mq_rq_timer(unsigned long priv) +static void blk_mq_timeout_work(struct work_struct *work) { - struct request_queue *q = (struct request_queue *)priv; + struct request_queue *q = + container_of(work, struct request_queue, timeout_work); struct blk_mq_timeout_data data = { .next = 0, .next_set = 0, }; int i; + if (blk_queue_enter(q, true)) + return; + blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data); if (data.next_set) { @@ -638,6 +640,7 @@ static void blk_mq_rq_timer(unsigned long priv) blk_mq_tag_idle(hctx); } } + blk_queue_exit(q); } /* @@ -2008,7 +2011,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, hctxs[i]->queue_num = i; } - setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q); + INIT_WORK(&q->timeout_work, blk_mq_timeout_work); blk_queue_rq_timeout(q, set->timeout ? 
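/*
 * The blk-mq and blk-timeout hunks around this point convert request
 * timeout handling from a raw timer callback into a work item. The timer
 * itself now only kicks the work:
 *
 *   static void blk_rq_timed_out_timer(unsigned long data)
 *   {
 *           struct request_queue *q = (struct request_queue *)data;
 *
 *           kblockd_schedule_work(&q->timeout_work);
 *   }
 *
 * (as added in blk-core.c above), and the actual scan runs in process
 * context, where it can pair blk_queue_enter(q, true) with blk_queue_exit(q)
 * and bail out cleanly while the queue is frozen or being torn down.
 */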
set->timeout : 30 * HZ); q->nr_queues = nr_cpu_ids; diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 3610af561748..a30441a200c0 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -127,13 +127,16 @@ static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout } } -void blk_rq_timed_out_timer(unsigned long data) +void blk_timeout_work(struct work_struct *work) { - struct request_queue *q = (struct request_queue *) data; + struct request_queue *q = + container_of(work, struct request_queue, timeout_work); unsigned long flags, next = 0; struct request *rq, *tmp; int next_set = 0; + if (blk_queue_enter(q, true)) + return; spin_lock_irqsave(q->queue_lock, flags); list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list) @@ -143,6 +146,7 @@ void blk_rq_timed_out_timer(unsigned long data) mod_timer(&q->timeout, round_jiffies_up(next)); spin_unlock_irqrestore(q->queue_lock, flags); + blk_queue_exit(q); } /** @@ -193,9 +197,6 @@ void blk_add_timer(struct request *req) struct request_queue *q = req->q; unsigned long expiry; - if (req->cmd_flags & REQ_NO_TIMEOUT) - return; - /* blk-mq has its own handler, so we don't need ->rq_timed_out_fn */ if (!q->mq_ops && !q->rq_timed_out_fn) return; diff --git a/block/blk.h b/block/blk.h index c43926d3d74d..70e4aee9cdcb 100644 --- a/block/blk.h +++ b/block/blk.h @@ -93,7 +93,7 @@ static inline void blk_flush_integrity(void) } #endif -void blk_rq_timed_out_timer(unsigned long data); +void blk_timeout_work(struct work_struct *work); unsigned long blk_rq_timeout(unsigned long timeout); void blk_add_timer(struct request *req); void blk_delete_timer(struct request *); diff --git a/block/ioctl.c b/block/ioctl.c index 2c84683aada5..77f5d17779d6 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -455,12 +455,12 @@ static int blkdev_daxset(struct block_device *bdev, unsigned long argp) if (arg && !blkdev_dax_capable(bdev)) return -ENOTTY; - mutex_lock(&bdev->bd_inode->i_mutex); + inode_lock(bdev->bd_inode); if (bdev->bd_map_count == 0) inode_set_flags(bdev->bd_inode, arg, S_DAX); else rc = -EBUSY; - mutex_unlock(&bdev->bd_inode->i_mutex); + inode_unlock(bdev->bd_inode); return rc; } #else diff --git a/crypto/af_alg.c b/crypto/af_alg.c index a8e7aa3e257b..f5e18c2a4852 100644 --- a/crypto/af_alg.c +++ b/crypto/af_alg.c @@ -76,6 +76,8 @@ int af_alg_register_type(const struct af_alg_type *type) goto unlock; type->ops->owner = THIS_MODULE; + if (type->ops_nokey) + type->ops_nokey->owner = THIS_MODULE; node->type = type; list_add(&node->list, &alg_types); err = 0; @@ -125,6 +127,26 @@ int af_alg_release(struct socket *sock) } EXPORT_SYMBOL_GPL(af_alg_release); +void af_alg_release_parent(struct sock *sk) +{ + struct alg_sock *ask = alg_sk(sk); + unsigned int nokey = ask->nokey_refcnt; + bool last = nokey && !ask->refcnt; + + sk = ask->parent; + ask = alg_sk(sk); + + lock_sock(sk); + ask->nokey_refcnt -= nokey; + if (!last) + last = !--ask->refcnt; + release_sock(sk); + + if (last) + sock_put(sk); +} +EXPORT_SYMBOL_GPL(af_alg_release_parent); + static int alg_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { const u32 forbidden = CRYPTO_ALG_INTERNAL; @@ -133,6 +155,7 @@ static int alg_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) struct sockaddr_alg *sa = (void *)uaddr; const struct af_alg_type *type; void *private; + int err; if (sock->state == SS_CONNECTED) return -EINVAL; @@ -160,16 +183,22 @@ static int alg_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) return 
PTR_ERR(private); } + err = -EBUSY; lock_sock(sk); + if (ask->refcnt | ask->nokey_refcnt) + goto unlock; swap(ask->type, type); swap(ask->private, private); + err = 0; + +unlock: release_sock(sk); alg_do_release(type, private); - return 0; + return err; } static int alg_setkey(struct sock *sk, char __user *ukey, @@ -202,11 +231,15 @@ static int alg_setsockopt(struct socket *sock, int level, int optname, struct sock *sk = sock->sk; struct alg_sock *ask = alg_sk(sk); const struct af_alg_type *type; - int err = -ENOPROTOOPT; + int err = -EBUSY; lock_sock(sk); + if (ask->refcnt) + goto unlock; + type = ask->type; + err = -ENOPROTOOPT; if (level != SOL_ALG || !type) goto unlock; @@ -238,6 +271,7 @@ int af_alg_accept(struct sock *sk, struct socket *newsock) struct alg_sock *ask = alg_sk(sk); const struct af_alg_type *type; struct sock *sk2; + unsigned int nokey; int err; lock_sock(sk); @@ -257,20 +291,29 @@ int af_alg_accept(struct sock *sk, struct socket *newsock) security_sk_clone(sk, sk2); err = type->accept(ask->private, sk2); - if (err) { - sk_free(sk2); + + nokey = err == -ENOKEY; + if (nokey && type->accept_nokey) + err = type->accept_nokey(ask->private, sk2); + + if (err) goto unlock; - } sk2->sk_family = PF_ALG; - sock_hold(sk); + if (nokey || !ask->refcnt++) + sock_hold(sk); + ask->nokey_refcnt += nokey; alg_sk(sk2)->parent = sk; alg_sk(sk2)->type = type; + alg_sk(sk2)->nokey_refcnt = nokey; newsock->ops = type->ops; newsock->state = SS_CONNECTED; + if (nokey) + newsock->ops = type->ops_nokey; + err = 0; unlock: diff --git a/crypto/ahash.c b/crypto/ahash.c index 9c1dc8d6106a..d19b52324cf5 100644 --- a/crypto/ahash.c +++ b/crypto/ahash.c @@ -451,6 +451,7 @@ static int crypto_ahash_init_tfm(struct crypto_tfm *tfm) struct ahash_alg *alg = crypto_ahash_alg(hash); hash->setkey = ahash_nosetkey; + hash->has_setkey = false; hash->export = ahash_no_export; hash->import = ahash_no_import; @@ -463,8 +464,10 @@ static int crypto_ahash_init_tfm(struct crypto_tfm *tfm) hash->finup = alg->finup ?: ahash_def_finup; hash->digest = alg->digest; - if (alg->setkey) + if (alg->setkey) { hash->setkey = alg->setkey; + hash->has_setkey = true; + } if (alg->export) hash->export = alg->export; if (alg->import) diff --git a/crypto/algif_hash.c b/crypto/algif_hash.c index b4c24fe3dcfb..608a7562839d 100644 --- a/crypto/algif_hash.c +++ b/crypto/algif_hash.c @@ -34,6 +34,11 @@ struct hash_ctx { struct ahash_request req; }; +struct algif_hash_tfm { + struct crypto_ahash *hash; + bool has_key; +}; + static int hash_sendmsg(struct socket *sock, struct msghdr *msg, size_t ignored) { @@ -235,19 +240,151 @@ static struct proto_ops algif_hash_ops = { .accept = hash_accept, }; +static int hash_check_key(struct socket *sock) +{ + int err = 0; + struct sock *psk; + struct alg_sock *pask; + struct algif_hash_tfm *tfm; + struct sock *sk = sock->sk; + struct alg_sock *ask = alg_sk(sk); + + lock_sock(sk); + if (ask->refcnt) + goto unlock_child; + + psk = ask->parent; + pask = alg_sk(ask->parent); + tfm = pask->private; + + err = -ENOKEY; + lock_sock_nested(psk, SINGLE_DEPTH_NESTING); + if (!tfm->has_key) + goto unlock; + + if (!pask->refcnt++) + sock_hold(psk); + + ask->refcnt = 1; + sock_put(psk); + + err = 0; + +unlock: + release_sock(psk); +unlock_child: + release_sock(sk); + + return err; +} + +static int hash_sendmsg_nokey(struct socket *sock, struct msghdr *msg, + size_t size) +{ + int err; + + err = hash_check_key(sock); + if (err) + return err; + + return hash_sendmsg(sock, msg, size); +} + +static ssize_t 
hash_sendpage_nokey(struct socket *sock, struct page *page, + int offset, size_t size, int flags) +{ + int err; + + err = hash_check_key(sock); + if (err) + return err; + + return hash_sendpage(sock, page, offset, size, flags); +} + +static int hash_recvmsg_nokey(struct socket *sock, struct msghdr *msg, + size_t ignored, int flags) +{ + int err; + + err = hash_check_key(sock); + if (err) + return err; + + return hash_recvmsg(sock, msg, ignored, flags); +} + +static int hash_accept_nokey(struct socket *sock, struct socket *newsock, + int flags) +{ + int err; + + err = hash_check_key(sock); + if (err) + return err; + + return hash_accept(sock, newsock, flags); +} + +static struct proto_ops algif_hash_ops_nokey = { + .family = PF_ALG, + + .connect = sock_no_connect, + .socketpair = sock_no_socketpair, + .getname = sock_no_getname, + .ioctl = sock_no_ioctl, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .getsockopt = sock_no_getsockopt, + .mmap = sock_no_mmap, + .bind = sock_no_bind, + .setsockopt = sock_no_setsockopt, + .poll = sock_no_poll, + + .release = af_alg_release, + .sendmsg = hash_sendmsg_nokey, + .sendpage = hash_sendpage_nokey, + .recvmsg = hash_recvmsg_nokey, + .accept = hash_accept_nokey, +}; + static void *hash_bind(const char *name, u32 type, u32 mask) { - return crypto_alloc_ahash(name, type, mask); + struct algif_hash_tfm *tfm; + struct crypto_ahash *hash; + + tfm = kzalloc(sizeof(*tfm), GFP_KERNEL); + if (!tfm) + return ERR_PTR(-ENOMEM); + + hash = crypto_alloc_ahash(name, type, mask); + if (IS_ERR(hash)) { + kfree(tfm); + return ERR_CAST(hash); + } + + tfm->hash = hash; + + return tfm; } static void hash_release(void *private) { - crypto_free_ahash(private); + struct algif_hash_tfm *tfm = private; + + crypto_free_ahash(tfm->hash); + kfree(tfm); } static int hash_setkey(void *private, const u8 *key, unsigned int keylen) { - return crypto_ahash_setkey(private, key, keylen); + struct algif_hash_tfm *tfm = private; + int err; + + err = crypto_ahash_setkey(tfm->hash, key, keylen); + tfm->has_key = !err; + + return err; } static void hash_sock_destruct(struct sock *sk) @@ -261,12 +398,14 @@ static void hash_sock_destruct(struct sock *sk) af_alg_release_parent(sk); } -static int hash_accept_parent(void *private, struct sock *sk) +static int hash_accept_parent_nokey(void *private, struct sock *sk) { struct hash_ctx *ctx; struct alg_sock *ask = alg_sk(sk); - unsigned len = sizeof(*ctx) + crypto_ahash_reqsize(private); - unsigned ds = crypto_ahash_digestsize(private); + struct algif_hash_tfm *tfm = private; + struct crypto_ahash *hash = tfm->hash; + unsigned len = sizeof(*ctx) + crypto_ahash_reqsize(hash); + unsigned ds = crypto_ahash_digestsize(hash); ctx = sock_kmalloc(sk, len, GFP_KERNEL); if (!ctx) @@ -286,7 +425,7 @@ static int hash_accept_parent(void *private, struct sock *sk) ask->private = ctx; - ahash_request_set_tfm(&ctx->req, private); + ahash_request_set_tfm(&ctx->req, hash); ahash_request_set_callback(&ctx->req, CRYPTO_TFM_REQ_MAY_BACKLOG, af_alg_complete, &ctx->completion); @@ -295,12 +434,24 @@ static int hash_accept_parent(void *private, struct sock *sk) return 0; } +static int hash_accept_parent(void *private, struct sock *sk) +{ + struct algif_hash_tfm *tfm = private; + + if (!tfm->has_key && crypto_ahash_has_setkey(tfm->hash)) + return -ENOKEY; + + return hash_accept_parent_nokey(private, sk); +} + static const struct af_alg_type algif_type_hash = { .bind = hash_bind, .release = hash_release, .setkey = hash_setkey, .accept = hash_accept_parent, + 
.accept_nokey = hash_accept_parent_nokey, .ops = &algif_hash_ops, + .ops_nokey = &algif_hash_ops_nokey, .name = "hash", .owner = THIS_MODULE }; diff --git a/crypto/algif_skcipher.c b/crypto/algif_skcipher.c index eaa9f9be5b87..38c1aa89d3a0 100644 --- a/crypto/algif_skcipher.c +++ b/crypto/algif_skcipher.c @@ -31,6 +31,11 @@ struct skcipher_sg_list { struct scatterlist sg[0]; }; +struct skcipher_tfm { + struct crypto_skcipher *skcipher; + bool has_key; +}; + struct skcipher_ctx { struct list_head tsgl; struct af_alg_sgl rsgl; @@ -387,7 +392,8 @@ static int skcipher_sendmsg(struct socket *sock, struct msghdr *msg, sgl = list_entry(ctx->tsgl.prev, struct skcipher_sg_list, list); sg = sgl->sg; - sg_unmark_end(sg + sgl->cur); + if (sgl->cur) + sg_unmark_end(sg + sgl->cur - 1); do { i = sgl->cur; plen = min_t(size_t, len, PAGE_SIZE); @@ -642,13 +648,6 @@ static int skcipher_recvmsg_sync(struct socket *sock, struct msghdr *msg, lock_sock(sk); while (msg_data_left(msg)) { - sgl = list_first_entry(&ctx->tsgl, - struct skcipher_sg_list, list); - sg = sgl->sg; - - while (!sg->length) - sg++; - if (!ctx->used) { err = skcipher_wait_for_data(sk, flags); if (err) @@ -669,6 +668,13 @@ static int skcipher_recvmsg_sync(struct socket *sock, struct msghdr *msg, if (!used) goto free; + sgl = list_first_entry(&ctx->tsgl, + struct skcipher_sg_list, list); + sg = sgl->sg; + + while (!sg->length) + sg++; + skcipher_request_set_crypt(&ctx->req, sg, ctx->rsgl.sg, used, ctx->iv); @@ -748,19 +754,139 @@ static struct proto_ops algif_skcipher_ops = { .poll = skcipher_poll, }; +static int skcipher_check_key(struct socket *sock) +{ + int err = 0; + struct sock *psk; + struct alg_sock *pask; + struct skcipher_tfm *tfm; + struct sock *sk = sock->sk; + struct alg_sock *ask = alg_sk(sk); + + lock_sock(sk); + if (ask->refcnt) + goto unlock_child; + + psk = ask->parent; + pask = alg_sk(ask->parent); + tfm = pask->private; + + err = -ENOKEY; + lock_sock_nested(psk, SINGLE_DEPTH_NESTING); + if (!tfm->has_key) + goto unlock; + + if (!pask->refcnt++) + sock_hold(psk); + + ask->refcnt = 1; + sock_put(psk); + + err = 0; + +unlock: + release_sock(psk); +unlock_child: + release_sock(sk); + + return err; +} + +static int skcipher_sendmsg_nokey(struct socket *sock, struct msghdr *msg, + size_t size) +{ + int err; + + err = skcipher_check_key(sock); + if (err) + return err; + + return skcipher_sendmsg(sock, msg, size); +} + +static ssize_t skcipher_sendpage_nokey(struct socket *sock, struct page *page, + int offset, size_t size, int flags) +{ + int err; + + err = skcipher_check_key(sock); + if (err) + return err; + + return skcipher_sendpage(sock, page, offset, size, flags); +} + +static int skcipher_recvmsg_nokey(struct socket *sock, struct msghdr *msg, + size_t ignored, int flags) +{ + int err; + + err = skcipher_check_key(sock); + if (err) + return err; + + return skcipher_recvmsg(sock, msg, ignored, flags); +} + +static struct proto_ops algif_skcipher_ops_nokey = { + .family = PF_ALG, + + .connect = sock_no_connect, + .socketpair = sock_no_socketpair, + .getname = sock_no_getname, + .ioctl = sock_no_ioctl, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .getsockopt = sock_no_getsockopt, + .mmap = sock_no_mmap, + .bind = sock_no_bind, + .accept = sock_no_accept, + .setsockopt = sock_no_setsockopt, + + .release = af_alg_release, + .sendmsg = skcipher_sendmsg_nokey, + .sendpage = skcipher_sendpage_nokey, + .recvmsg = skcipher_recvmsg_nokey, + .poll = skcipher_poll, +}; + static void *skcipher_bind(const char *name, u32 
type, u32 mask) { - return crypto_alloc_skcipher(name, type, mask); + struct skcipher_tfm *tfm; + struct crypto_skcipher *skcipher; + + tfm = kzalloc(sizeof(*tfm), GFP_KERNEL); + if (!tfm) + return ERR_PTR(-ENOMEM); + + skcipher = crypto_alloc_skcipher(name, type, mask); + if (IS_ERR(skcipher)) { + kfree(tfm); + return ERR_CAST(skcipher); + } + + tfm->skcipher = skcipher; + + return tfm; } static void skcipher_release(void *private) { - crypto_free_skcipher(private); + struct skcipher_tfm *tfm = private; + + crypto_free_skcipher(tfm->skcipher); + kfree(tfm); } static int skcipher_setkey(void *private, const u8 *key, unsigned int keylen) { - return crypto_skcipher_setkey(private, key, keylen); + struct skcipher_tfm *tfm = private; + int err; + + err = crypto_skcipher_setkey(tfm->skcipher, key, keylen); + tfm->has_key = !err; + + return err; } static void skcipher_wait(struct sock *sk) @@ -788,24 +914,26 @@ static void skcipher_sock_destruct(struct sock *sk) af_alg_release_parent(sk); } -static int skcipher_accept_parent(void *private, struct sock *sk) +static int skcipher_accept_parent_nokey(void *private, struct sock *sk) { struct skcipher_ctx *ctx; struct alg_sock *ask = alg_sk(sk); - unsigned int len = sizeof(*ctx) + crypto_skcipher_reqsize(private); + struct skcipher_tfm *tfm = private; + struct crypto_skcipher *skcipher = tfm->skcipher; + unsigned int len = sizeof(*ctx) + crypto_skcipher_reqsize(skcipher); ctx = sock_kmalloc(sk, len, GFP_KERNEL); if (!ctx) return -ENOMEM; - ctx->iv = sock_kmalloc(sk, crypto_skcipher_ivsize(private), + ctx->iv = sock_kmalloc(sk, crypto_skcipher_ivsize(skcipher), GFP_KERNEL); if (!ctx->iv) { sock_kfree_s(sk, ctx, len); return -ENOMEM; } - memset(ctx->iv, 0, crypto_skcipher_ivsize(private)); + memset(ctx->iv, 0, crypto_skcipher_ivsize(skcipher)); INIT_LIST_HEAD(&ctx->tsgl); ctx->len = len; @@ -818,7 +946,7 @@ static int skcipher_accept_parent(void *private, struct sock *sk) ask->private = ctx; - skcipher_request_set_tfm(&ctx->req, private); + skcipher_request_set_tfm(&ctx->req, skcipher); skcipher_request_set_callback(&ctx->req, CRYPTO_TFM_REQ_MAY_BACKLOG, af_alg_complete, &ctx->completion); @@ -827,12 +955,24 @@ static int skcipher_accept_parent(void *private, struct sock *sk) return 0; } +static int skcipher_accept_parent(void *private, struct sock *sk) +{ + struct skcipher_tfm *tfm = private; + + if (!tfm->has_key && crypto_skcipher_has_setkey(tfm->skcipher)) + return -ENOKEY; + + return skcipher_accept_parent_nokey(private, sk); +} + static const struct af_alg_type algif_type_skcipher = { .bind = skcipher_bind, .release = skcipher_release, .setkey = skcipher_setkey, .accept = skcipher_accept_parent, + .accept_nokey = skcipher_accept_parent_nokey, .ops = &algif_skcipher_ops, + .ops_nokey = &algif_skcipher_ops_nokey, .name = "skcipher", .owner = THIS_MODULE }; diff --git a/crypto/crc32c_generic.c b/crypto/crc32c_generic.c index 06f1b60f02b2..4c0a0e271876 100644 --- a/crypto/crc32c_generic.c +++ b/crypto/crc32c_generic.c @@ -172,4 +172,3 @@ MODULE_DESCRIPTION("CRC32c (Castagnoli) calculations wrapper for lib/crc32c"); MODULE_LICENSE("GPL"); MODULE_ALIAS_CRYPTO("crc32c"); MODULE_ALIAS_CRYPTO("crc32c-generic"); -MODULE_SOFTDEP("pre: crc32c"); diff --git a/crypto/shash.c b/crypto/shash.c index ecb1e3d39bf0..88a27de79848 100644 --- a/crypto/shash.c +++ b/crypto/shash.c @@ -355,8 +355,10 @@ int crypto_init_shash_ops_async(struct crypto_tfm *tfm) crt->finup = shash_async_finup; crt->digest = shash_async_digest; - if (alg->setkey) + if (alg->setkey) { 
crt->setkey = shash_async_setkey; + crt->has_setkey = true; + } if (alg->export) crt->export = shash_async_export; if (alg->import) diff --git a/crypto/skcipher.c b/crypto/skcipher.c index 7591928be7ca..d199c0b1751c 100644 --- a/crypto/skcipher.c +++ b/crypto/skcipher.c @@ -118,6 +118,7 @@ static int crypto_init_skcipher_ops_blkcipher(struct crypto_tfm *tfm) skcipher->decrypt = skcipher_decrypt_blkcipher; skcipher->ivsize = crypto_blkcipher_ivsize(blkcipher); + skcipher->has_setkey = calg->cra_blkcipher.max_keysize; return 0; } @@ -210,6 +211,7 @@ static int crypto_init_skcipher_ops_ablkcipher(struct crypto_tfm *tfm) skcipher->ivsize = crypto_ablkcipher_ivsize(ablkcipher); skcipher->reqsize = crypto_ablkcipher_reqsize(ablkcipher) + sizeof(struct ablkcipher_request); + skcipher->has_setkey = calg->cra_ablkcipher.max_keysize; return 0; } diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c index 6682c5daf742..6e6bc1059301 100644 --- a/drivers/acpi/apei/erst.c +++ b/drivers/acpi/apei/erst.c @@ -32,6 +32,7 @@ #include <linux/hardirq.h> #include <linux/pstore.h> #include <linux/vmalloc.h> +#include <linux/mm.h> /* kvfree() */ #include <acpi/apei.h> #include "apei-internal.h" @@ -532,10 +533,7 @@ retry: return -ENOMEM; memcpy(new_entries, entries, erst_record_id_cache.len * sizeof(entries[0])); - if (erst_record_id_cache.size < PAGE_SIZE) - kfree(entries); - else - vfree(entries); + kvfree(entries); erst_record_id_cache.entries = entries = new_entries; erst_record_id_cache.size = new_size; } diff --git a/drivers/amba/Kconfig b/drivers/amba/Kconfig index 4a5c9d279059..294ba6f36396 100644 --- a/drivers/amba/Kconfig +++ b/drivers/amba/Kconfig @@ -4,7 +4,7 @@ config ARM_AMBA if ARM_AMBA config TEGRA_AHB - bool "Enable AHB driver for NVIDIA Tegra SoCs" + bool default y if ARCH_TEGRA help Adds AHB configuration functionality for NVIDIA Tegra SoCs, diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c index 68f03141e432..44a74cf1372c 100644 --- a/drivers/base/devtmpfs.c +++ b/drivers/base/devtmpfs.c @@ -215,9 +215,9 @@ static int handle_create(const char *nodename, umode_t mode, kuid_t uid, newattrs.ia_uid = uid; newattrs.ia_gid = gid; newattrs.ia_valid = ATTR_MODE|ATTR_UID|ATTR_GID; - mutex_lock(&d_inode(dentry)->i_mutex); + inode_lock(d_inode(dentry)); notify_change(dentry, &newattrs, NULL); - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); /* mark as kernel-created inode */ d_inode(dentry)->i_private = &thread; @@ -244,7 +244,7 @@ static int dev_rmdir(const char *name) err = -ENOENT; } dput(dentry); - mutex_unlock(&d_inode(parent.dentry)->i_mutex); + inode_unlock(d_inode(parent.dentry)); path_put(&parent); return err; } @@ -321,9 +321,9 @@ static int handle_remove(const char *nodename, struct device *dev) newattrs.ia_mode = stat.mode & ~0777; newattrs.ia_valid = ATTR_UID|ATTR_GID|ATTR_MODE; - mutex_lock(&d_inode(dentry)->i_mutex); + inode_lock(d_inode(dentry)); notify_change(dentry, &newattrs, NULL); - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); err = vfs_unlink(d_inode(parent.dentry), dentry, NULL); if (!err || err == -ENOENT) deleted = 1; @@ -332,7 +332,7 @@ static int handle_remove(const char *nodename, struct device *dev) err = -ENOENT; } dput(dentry); - mutex_unlock(&d_inode(parent.dentry)->i_mutex); + inode_unlock(d_inode(parent.dentry)); path_put(&parent); if (deleted && strchr(nodename, '/')) diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index ad80c85e0857..d048d2009e89 100644 --- 
a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -964,9 +964,9 @@ aoecmd_sleepwork(struct work_struct *work) ssize = get_capacity(d->gd); bd = bdget_disk(d->gd, 0); if (bd) { - mutex_lock(&bd->bd_inode->i_mutex); + inode_lock(bd->bd_inode); i_size_write(bd->bd_inode, (loff_t)ssize<<9); - mutex_unlock(&bd->bd_inode->i_mutex); + inode_unlock(bd->bd_inode); bdput(bd); } spin_lock_irq(&d->lock); diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index b3868e7a1ffd..10459a145062 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -288,7 +288,162 @@ bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval * return need_transaction; } -static int al_write_transaction(struct drbd_device *device); +#if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT) +/* Currently BM_BLOCK_SHIFT, BM_EXT_SHIFT and AL_EXTENT_SHIFT + * are still coupled, or assume too much about their relation. + * Code below will not work if this is violated. + * Will be cleaned up with some followup patch. + */ +# error FIXME +#endif + +static unsigned int al_extent_to_bm_page(unsigned int al_enr) +{ + return al_enr >> + /* bit to page */ + ((PAGE_SHIFT + 3) - + /* al extent number to bit */ + (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)); +} + +static sector_t al_tr_number_to_on_disk_sector(struct drbd_device *device) +{ + const unsigned int stripes = device->ldev->md.al_stripes; + const unsigned int stripe_size_4kB = device->ldev->md.al_stripe_size_4k; + + /* transaction number, modulo on-disk ring buffer wrap around */ + unsigned int t = device->al_tr_number % (device->ldev->md.al_size_4k); + + /* ... to aligned 4k on disk block */ + t = ((t % stripes) * stripe_size_4kB) + t/stripes; + + /* ... to 512 byte sector in activity log */ + t *= 8; + + /* ... plus offset to the on disk position */ + return device->ldev->md.md_offset + device->ldev->md.al_offset + t; +} + +static int __al_write_transaction(struct drbd_device *device, struct al_transaction_on_disk *buffer) +{ + struct lc_element *e; + sector_t sector; + int i, mx; + unsigned extent_nr; + unsigned crc = 0; + int err = 0; + + memset(buffer, 0, sizeof(*buffer)); + buffer->magic = cpu_to_be32(DRBD_AL_MAGIC); + buffer->tr_number = cpu_to_be32(device->al_tr_number); + + i = 0; + + /* Even though no one can start to change this list + * once we set the LC_LOCKED -- from drbd_al_begin_io(), + * lc_try_lock_for_transaction() --, someone may still + * be in the process of changing it. 
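+ * That is also why the list walk below runs under al_lock: it takes a
+ * self-consistent snapshot of the update slots before the buffer is
+ * CRC'd and written out.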
*/ + spin_lock_irq(&device->al_lock); + list_for_each_entry(e, &device->act_log->to_be_changed, list) { + if (i == AL_UPDATES_PER_TRANSACTION) { + i++; + break; + } + buffer->update_slot_nr[i] = cpu_to_be16(e->lc_index); + buffer->update_extent_nr[i] = cpu_to_be32(e->lc_new_number); + if (e->lc_number != LC_FREE) + drbd_bm_mark_for_writeout(device, + al_extent_to_bm_page(e->lc_number)); + i++; + } + spin_unlock_irq(&device->al_lock); + BUG_ON(i > AL_UPDATES_PER_TRANSACTION); + + buffer->n_updates = cpu_to_be16(i); + for ( ; i < AL_UPDATES_PER_TRANSACTION; i++) { + buffer->update_slot_nr[i] = cpu_to_be16(-1); + buffer->update_extent_nr[i] = cpu_to_be32(LC_FREE); + } + + buffer->context_size = cpu_to_be16(device->act_log->nr_elements); + buffer->context_start_slot_nr = cpu_to_be16(device->al_tr_cycle); + + mx = min_t(int, AL_CONTEXT_PER_TRANSACTION, + device->act_log->nr_elements - device->al_tr_cycle); + for (i = 0; i < mx; i++) { + unsigned idx = device->al_tr_cycle + i; + extent_nr = lc_element_by_index(device->act_log, idx)->lc_number; + buffer->context[i] = cpu_to_be32(extent_nr); + } + for (; i < AL_CONTEXT_PER_TRANSACTION; i++) + buffer->context[i] = cpu_to_be32(LC_FREE); + + device->al_tr_cycle += AL_CONTEXT_PER_TRANSACTION; + if (device->al_tr_cycle >= device->act_log->nr_elements) + device->al_tr_cycle = 0; + + sector = al_tr_number_to_on_disk_sector(device); + + crc = crc32c(0, buffer, 4096); + buffer->crc32c = cpu_to_be32(crc); + + if (drbd_bm_write_hinted(device)) + err = -EIO; + else { + bool write_al_updates; + rcu_read_lock(); + write_al_updates = rcu_dereference(device->ldev->disk_conf)->al_updates; + rcu_read_unlock(); + if (write_al_updates) { + if (drbd_md_sync_page_io(device, device->ldev, sector, WRITE)) { + err = -EIO; + drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR); + } else { + device->al_tr_number++; + device->al_writ_cnt++; + } + } + } + + return err; +} + +static int al_write_transaction(struct drbd_device *device) +{ + struct al_transaction_on_disk *buffer; + int err; + + if (!get_ldev(device)) { + drbd_err(device, "disk is %s, cannot start al transaction\n", + drbd_disk_str(device->state.disk)); + return -EIO; + } + + /* The bitmap write may have failed, causing a state change. */ + if (device->state.disk < D_INCONSISTENT) { + drbd_err(device, + "disk is %s, cannot write al transaction\n", + drbd_disk_str(device->state.disk)); + put_ldev(device); + return -EIO; + } + + /* protects md_io_buffer, al_tr_cycle, ... */ + buffer = drbd_md_get_buffer(device, __func__); + if (!buffer) { + drbd_err(device, "disk failed while waiting for md_io buffer\n"); + put_ldev(device); + return -ENODEV; + } + + err = __al_write_transaction(device, buffer); + + drbd_md_put_buffer(device); + put_ldev(device); + + return err; +} + void drbd_al_begin_io_commit(struct drbd_device *device) { @@ -420,153 +575,6 @@ void drbd_al_complete_io(struct drbd_device *device, struct drbd_interval *i) wake_up(&device->al_wait); } -#if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT) -/* Currently BM_BLOCK_SHIFT, BM_EXT_SHIFT and AL_EXTENT_SHIFT - * are still coupled, or assume too much about their relation. - * Code below will not work if this is violated. - * Will be cleaned up with some followup patch. 
- */ -# error FIXME -#endif - -static unsigned int al_extent_to_bm_page(unsigned int al_enr) -{ - return al_enr >> - /* bit to page */ - ((PAGE_SHIFT + 3) - - /* al extent number to bit */ - (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)); -} - -static sector_t al_tr_number_to_on_disk_sector(struct drbd_device *device) -{ - const unsigned int stripes = device->ldev->md.al_stripes; - const unsigned int stripe_size_4kB = device->ldev->md.al_stripe_size_4k; - - /* transaction number, modulo on-disk ring buffer wrap around */ - unsigned int t = device->al_tr_number % (device->ldev->md.al_size_4k); - - /* ... to aligned 4k on disk block */ - t = ((t % stripes) * stripe_size_4kB) + t/stripes; - - /* ... to 512 byte sector in activity log */ - t *= 8; - - /* ... plus offset to the on disk position */ - return device->ldev->md.md_offset + device->ldev->md.al_offset + t; -} - -int al_write_transaction(struct drbd_device *device) -{ - struct al_transaction_on_disk *buffer; - struct lc_element *e; - sector_t sector; - int i, mx; - unsigned extent_nr; - unsigned crc = 0; - int err = 0; - - if (!get_ldev(device)) { - drbd_err(device, "disk is %s, cannot start al transaction\n", - drbd_disk_str(device->state.disk)); - return -EIO; - } - - /* The bitmap write may have failed, causing a state change. */ - if (device->state.disk < D_INCONSISTENT) { - drbd_err(device, - "disk is %s, cannot write al transaction\n", - drbd_disk_str(device->state.disk)); - put_ldev(device); - return -EIO; - } - - /* protects md_io_buffer, al_tr_cycle, ... */ - buffer = drbd_md_get_buffer(device, __func__); - if (!buffer) { - drbd_err(device, "disk failed while waiting for md_io buffer\n"); - put_ldev(device); - return -ENODEV; - } - - memset(buffer, 0, sizeof(*buffer)); - buffer->magic = cpu_to_be32(DRBD_AL_MAGIC); - buffer->tr_number = cpu_to_be32(device->al_tr_number); - - i = 0; - - /* Even though no one can start to change this list - * once we set the LC_LOCKED -- from drbd_al_begin_io(), - * lc_try_lock_for_transaction() --, someone may still - * be in the process of changing it. 
*/ - spin_lock_irq(&device->al_lock); - list_for_each_entry(e, &device->act_log->to_be_changed, list) { - if (i == AL_UPDATES_PER_TRANSACTION) { - i++; - break; - } - buffer->update_slot_nr[i] = cpu_to_be16(e->lc_index); - buffer->update_extent_nr[i] = cpu_to_be32(e->lc_new_number); - if (e->lc_number != LC_FREE) - drbd_bm_mark_for_writeout(device, - al_extent_to_bm_page(e->lc_number)); - i++; - } - spin_unlock_irq(&device->al_lock); - BUG_ON(i > AL_UPDATES_PER_TRANSACTION); - - buffer->n_updates = cpu_to_be16(i); - for ( ; i < AL_UPDATES_PER_TRANSACTION; i++) { - buffer->update_slot_nr[i] = cpu_to_be16(-1); - buffer->update_extent_nr[i] = cpu_to_be32(LC_FREE); - } - - buffer->context_size = cpu_to_be16(device->act_log->nr_elements); - buffer->context_start_slot_nr = cpu_to_be16(device->al_tr_cycle); - - mx = min_t(int, AL_CONTEXT_PER_TRANSACTION, - device->act_log->nr_elements - device->al_tr_cycle); - for (i = 0; i < mx; i++) { - unsigned idx = device->al_tr_cycle + i; - extent_nr = lc_element_by_index(device->act_log, idx)->lc_number; - buffer->context[i] = cpu_to_be32(extent_nr); - } - for (; i < AL_CONTEXT_PER_TRANSACTION; i++) - buffer->context[i] = cpu_to_be32(LC_FREE); - - device->al_tr_cycle += AL_CONTEXT_PER_TRANSACTION; - if (device->al_tr_cycle >= device->act_log->nr_elements) - device->al_tr_cycle = 0; - - sector = al_tr_number_to_on_disk_sector(device); - - crc = crc32c(0, buffer, 4096); - buffer->crc32c = cpu_to_be32(crc); - - if (drbd_bm_write_hinted(device)) - err = -EIO; - else { - bool write_al_updates; - rcu_read_lock(); - write_al_updates = rcu_dereference(device->ldev->disk_conf)->al_updates; - rcu_read_unlock(); - if (write_al_updates) { - if (drbd_md_sync_page_io(device, device->ldev, sector, WRITE)) { - err = -EIO; - drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR); - } else { - device->al_tr_number++; - device->al_writ_cnt++; - } - } - } - - drbd_md_put_buffer(device); - put_ldev(device); - - return err; -} - static int _try_lc_del(struct drbd_device *device, struct lc_element *al_ext) { int rv; @@ -606,21 +614,24 @@ void drbd_al_shrink(struct drbd_device *device) wake_up(&device->al_wait); } -int drbd_initialize_al(struct drbd_device *device, void *buffer) +int drbd_al_initialize(struct drbd_device *device, void *buffer) { struct al_transaction_on_disk *al = buffer; struct drbd_md *md = &device->ldev->md; - sector_t al_base = md->md_offset + md->al_offset; int al_size_4k = md->al_stripes * md->al_stripe_size_4k; int i; - memset(al, 0, 4096); - al->magic = cpu_to_be32(DRBD_AL_MAGIC); - al->transaction_type = cpu_to_be16(AL_TR_INITIALIZED); - al->crc32c = cpu_to_be32(crc32c(0, al, 4096)); + __al_write_transaction(device, al); + /* There may or may not have been a pending transaction. */ + spin_lock_irq(&device->al_lock); + lc_committed(device->act_log); + spin_unlock_irq(&device->al_lock); - for (i = 0; i < al_size_4k; i++) { - int err = drbd_md_sync_page_io(device, device->ldev, al_base + i * 8, WRITE); + /* The rest of the transactions will have an empty "updates" list, and + * are written out only to provide the context, and to initialize the + * on-disk ring buffer. 
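+ * (Transaction 0, written via __al_write_transaction() above, already
+ * carries the current context and whatever updates were still pending,
+ * hence the loop below starting at i = 1.)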
*/ + for (i = 1; i < al_size_4k; i++) { + int err = __al_write_transaction(device, al); if (err) return err; } diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index 9462d2752850..92d6fc020a65 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -24,7 +24,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/bitops.h> +#include <linux/bitmap.h> #include <linux/vmalloc.h> #include <linux/string.h> #include <linux/drbd.h> @@ -364,12 +364,9 @@ static void bm_free_pages(struct page **pages, unsigned long number) } } -static void bm_vk_free(void *ptr, int v) +static inline void bm_vk_free(void *ptr) { - if (v) - vfree(ptr); - else - kfree(ptr); + kvfree(ptr); } /* @@ -379,7 +376,7 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want) { struct page **old_pages = b->bm_pages; struct page **new_pages, *page; - unsigned int i, bytes, vmalloced = 0; + unsigned int i, bytes; unsigned long have = b->bm_number_of_pages; BUG_ON(have == 0 && old_pages != NULL); @@ -401,7 +398,6 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want) PAGE_KERNEL); if (!new_pages) return NULL; - vmalloced = 1; } if (want >= have) { @@ -411,7 +407,7 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want) page = alloc_page(GFP_NOIO | __GFP_HIGHMEM); if (!page) { bm_free_pages(new_pages + have, i - have); - bm_vk_free(new_pages, vmalloced); + bm_vk_free(new_pages); return NULL; } /* we want to know which page it is @@ -427,11 +423,6 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want) */ } - if (vmalloced) - b->bm_flags |= BM_P_VMALLOCED; - else - b->bm_flags &= ~BM_P_VMALLOCED; - return new_pages; } @@ -469,7 +460,7 @@ void drbd_bm_cleanup(struct drbd_device *device) if (!expect(device->bitmap)) return; bm_free_pages(device->bitmap->bm_pages, device->bitmap->bm_number_of_pages); - bm_vk_free(device->bitmap->bm_pages, (BM_P_VMALLOCED & device->bitmap->bm_flags)); + bm_vk_free(device->bitmap->bm_pages); kfree(device->bitmap); device->bitmap = NULL; } @@ -479,8 +470,14 @@ void drbd_bm_cleanup(struct drbd_device *device) * this masks out the remaining bits. * Returns the number of bits cleared. 
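 * For example, with the common 4 KiB pages (PAGE_SHIFT == 12), the
 * definition guarded below comes out to BITS_PER_PAGE == 1UL << 15
 * == 32768 bits per bitmap page, and BITS_PER_PAGE_MASK == 0x7fff.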
*/ +#ifndef BITS_PER_PAGE #define BITS_PER_PAGE (1UL << (PAGE_SHIFT + 3)) #define BITS_PER_PAGE_MASK (BITS_PER_PAGE - 1) +#else +# if BITS_PER_PAGE != (1UL << (PAGE_SHIFT + 3)) +# error "ambiguous BITS_PER_PAGE" +# endif +#endif #define BITS_PER_LONG_MASK (BITS_PER_LONG - 1) static int bm_clear_surplus(struct drbd_bitmap *b) { @@ -559,21 +556,19 @@ static unsigned long bm_count_bits(struct drbd_bitmap *b) unsigned long *p_addr; unsigned long bits = 0; unsigned long mask = (1UL << (b->bm_bits & BITS_PER_LONG_MASK)) -1; - int idx, i, last_word; + int idx, last_word; /* all but last page */ for (idx = 0; idx < b->bm_number_of_pages - 1; idx++) { p_addr = __bm_map_pidx(b, idx); - for (i = 0; i < LWPP; i++) - bits += hweight_long(p_addr[i]); + bits += bitmap_weight(p_addr, BITS_PER_PAGE); __bm_unmap(p_addr); cond_resched(); } /* last (or only) page */ last_word = ((b->bm_bits - 1) & BITS_PER_PAGE_MASK) >> LN2_BPL; p_addr = __bm_map_pidx(b, idx); - for (i = 0; i < last_word; i++) - bits += hweight_long(p_addr[i]); + bits += bitmap_weight(p_addr, last_word * BITS_PER_LONG); p_addr[last_word] &= cpu_to_lel(mask); bits += hweight_long(p_addr[last_word]); /* 32bit arch, may have an unused padding long */ @@ -639,7 +634,6 @@ int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bi unsigned long want, have, onpages; /* number of pages */ struct page **npages, **opages = NULL; int err = 0, growing; - int opages_vmalloced; if (!expect(b)) return -ENOMEM; @@ -652,8 +646,6 @@ int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bi if (capacity == b->bm_dev_capacity) goto out; - opages_vmalloced = (BM_P_VMALLOCED & b->bm_flags); - if (capacity == 0) { spin_lock_irq(&b->bm_lock); opages = b->bm_pages; @@ -667,7 +659,7 @@ int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bi b->bm_dev_capacity = 0; spin_unlock_irq(&b->bm_lock); bm_free_pages(opages, onpages); - bm_vk_free(opages, opages_vmalloced); + bm_vk_free(opages); goto out; } bits = BM_SECT_TO_BIT(ALIGN(capacity, BM_SECT_PER_BIT)); @@ -740,7 +732,7 @@ int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bi spin_unlock_irq(&b->bm_lock); if (opages != npages) - bm_vk_free(opages, opages_vmalloced); + bm_vk_free(opages); if (!growing) b->bm_set = bm_count_bits(b); drbd_info(device, "resync bitmap: bits=%lu words=%lu pages=%lu\n", bits, words, want); @@ -1419,6 +1411,9 @@ static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b, int bits; int changed = 0; unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr]); + + /* I think it is more cache line friendly to hweight_long then set to ~0UL, + * than to first bitmap_weight() all words, then bitmap_fill() all words */ for (i = first_word; i < last_word; i++) { bits = hweight_long(paddr[i]); paddr[i] = ~0UL; @@ -1628,8 +1623,7 @@ int drbd_bm_e_weight(struct drbd_device *device, unsigned long enr) int n = e-s; p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s)); bm = p_addr + MLPP(s); - while (n--) - count += hweight_long(*bm++); + count += bitmap_weight(bm, n * BITS_PER_LONG); bm_unmap(p_addr); } else { drbd_err(device, "start offset (%d) too large in drbd_bm_e_weight\n", s); diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c index 6b88a35fb048..4de95bbff486 100644 --- a/drivers/block/drbd/drbd_debugfs.c +++ b/drivers/block/drbd/drbd_debugfs.c @@ -434,12 +434,12 @@ static int drbd_single_open(struct file *file, int (*show)(struct seq_file *, vo if (!parent || 
d_really_is_negative(parent)) goto out; /* serialize with d_delete() */ - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); /* Make sure the object is still alive */ if (simple_positive(file->f_path.dentry) && kref_get_unless_zero(kref)) ret = 0; - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); if (!ret) { ret = single_open(file, show, data); if (ret) @@ -771,6 +771,13 @@ static int device_data_gen_id_show(struct seq_file *m, void *ignored) return 0; } +static int device_ed_gen_id_show(struct seq_file *m, void *ignored) +{ + struct drbd_device *device = m->private; + seq_printf(m, "0x%016llX\n", (unsigned long long)device->ed_uuid); + return 0; +} + #define drbd_debugfs_device_attr(name) \ static int device_ ## name ## _open(struct inode *inode, struct file *file) \ { \ @@ -796,6 +803,7 @@ drbd_debugfs_device_attr(oldest_requests) drbd_debugfs_device_attr(act_log_extents) drbd_debugfs_device_attr(resync_extents) drbd_debugfs_device_attr(data_gen_id) +drbd_debugfs_device_attr(ed_gen_id) void drbd_debugfs_device_add(struct drbd_device *device) { @@ -839,6 +847,7 @@ void drbd_debugfs_device_add(struct drbd_device *device) DCF(act_log_extents); DCF(resync_extents); DCF(data_gen_id); + DCF(ed_gen_id); #undef DCF return; @@ -854,6 +863,7 @@ void drbd_debugfs_device_cleanup(struct drbd_device *device) drbd_debugfs_remove(&device->debugfs_vol_act_log_extents); drbd_debugfs_remove(&device->debugfs_vol_resync_extents); drbd_debugfs_remove(&device->debugfs_vol_data_gen_id); + drbd_debugfs_remove(&device->debugfs_vol_ed_gen_id); drbd_debugfs_remove(&device->debugfs_vol); } diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index e66d453a5f2b..34bc84efc29e 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -77,13 +77,6 @@ extern int fault_devs; extern char usermode_helper[]; -/* I don't remember why XCPU ... - * This is used to wake the asender, - * and to interrupt sending the sending task - * on disconnect. - */ -#define DRBD_SIG SIGXCPU - /* This is used to stop/restart our threads. * Cannot use SIGTERM nor SIGKILL, since these * are sent out by init on runlevel changes @@ -292,6 +285,9 @@ struct drbd_device_work { extern int drbd_wait_misc(struct drbd_device *, struct drbd_interval *); +extern void lock_all_resources(void); +extern void unlock_all_resources(void); + struct drbd_request { struct drbd_work w; struct drbd_device *device; @@ -504,7 +500,6 @@ enum { MD_NO_FUA, /* Users wants us to not use FUA/FLUSH on meta data dev */ - SUSPEND_IO, /* suspend application io */ BITMAP_IO, /* suspend application io; once no more io in flight, start bitmap io */ BITMAP_IO_QUEUED, /* Started bitmap IO */ @@ -541,9 +536,6 @@ struct drbd_bitmap; /* opaque for drbd_device */ /* definition of bits in bm_flags to be used in drbd_bm_lock * and drbd_bitmap_io and friends. */ enum bm_flag { - /* do we need to kfree, or vfree bm_pages? */ - BM_P_VMALLOCED = 0x10000, /* internal use only, will be masked out */ - /* currently locked for bulk operation */ BM_LOCKED_MASK = 0xf, @@ -632,12 +624,6 @@ struct bm_io_work { void (*done)(struct drbd_device *device, int rv); }; -enum write_ordering_e { - WO_none, - WO_drain_io, - WO_bdev_flush, -}; - struct fifo_buffer { unsigned int head_index; unsigned int size; @@ -650,8 +636,7 @@ extern struct fifo_buffer *fifo_alloc(int fifo_size); enum { NET_CONGESTED, /* The data socket is congested */ RESOLVE_CONFLICTS, /* Set on one node, cleared on the peer! 
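 * (Below, SEND_PING loses its companion SIGNAL_ASENDER: with the
 * ack_receiver split out of the asender, wake_ack_receiver() signals
 * the thread directly whenever it is RUNNING.)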
*/ - SEND_PING, /* whether asender should send a ping asap */ - SIGNAL_ASENDER, /* whether asender wants to be interrupted */ + SEND_PING, GOT_PING_ACK, /* set when we receive a ping_ack packet, ping_wait gets woken */ CONN_WD_ST_CHG_REQ, /* A cluster wide state change on the connection is active */ CONN_WD_ST_CHG_OKAY, @@ -670,6 +655,8 @@ enum { DEVICE_WORK_PENDING, /* tell worker that some device has pending work */ }; +enum which_state { NOW, OLD = NOW, NEW }; + struct drbd_resource { char *name; #ifdef CONFIG_DEBUG_FS @@ -755,7 +742,8 @@ struct drbd_connection { unsigned long last_reconnect_jif; struct drbd_thread receiver; struct drbd_thread worker; - struct drbd_thread asender; + struct drbd_thread ack_receiver; + struct workqueue_struct *ack_sender; /* cached pointers, * so we can look up the oldest pending requests more quickly. @@ -774,6 +762,8 @@ struct drbd_connection { struct drbd_thread_timing_details r_timing_details[DRBD_THREAD_DETAILS_HIST]; struct { + unsigned long last_sent_barrier_jif; + /* whether this sender thread * has processed a single write yet. */ bool seen_any_write_yet; @@ -788,6 +778,17 @@ struct drbd_connection { } send; }; +static inline bool has_net_conf(struct drbd_connection *connection) +{ + bool has_net_conf; + + rcu_read_lock(); + has_net_conf = rcu_dereference(connection->net_conf); + rcu_read_unlock(); + + return has_net_conf; +} + void __update_timing_details( struct drbd_thread_timing_details *tdp, unsigned int *cb_nr, @@ -811,6 +812,7 @@ struct drbd_peer_device { struct list_head peer_devices; struct drbd_device *device; struct drbd_connection *connection; + struct work_struct send_acks_work; #ifdef CONFIG_DEBUG_FS struct dentry *debugfs_peer_dev; #endif @@ -829,6 +831,7 @@ struct drbd_device { struct dentry *debugfs_vol_act_log_extents; struct dentry *debugfs_vol_resync_extents; struct dentry *debugfs_vol_data_gen_id; + struct dentry *debugfs_vol_ed_gen_id; #endif unsigned int vnr; /* volume number within the connection */ @@ -873,6 +876,7 @@ struct drbd_device { atomic_t rs_pending_cnt; /* RS request/data packets on the wire */ atomic_t unacked_cnt; /* Need to send replies for */ atomic_t local_cnt; /* Waiting for local completion */ + atomic_t suspend_cnt; /* Interval tree of pending local requests */ struct rb_root read_requests; @@ -1020,6 +1024,12 @@ static inline struct drbd_peer_device *first_peer_device(struct drbd_device *dev return list_first_entry_or_null(&device->peer_devices, struct drbd_peer_device, peer_devices); } +static inline struct drbd_peer_device * +conn_peer_device(struct drbd_connection *connection, int volume_number) +{ + return idr_find(&connection->peer_devices, volume_number); +} + #define for_each_resource(resource, _resources) \ list_for_each_entry(resource, _resources, resources) @@ -1113,7 +1123,7 @@ extern int drbd_send_ov_request(struct drbd_peer_device *, sector_t sector, int extern int drbd_send_bitmap(struct drbd_device *device); extern void drbd_send_sr_reply(struct drbd_peer_device *, enum drbd_state_rv retcode); extern void conn_send_sr_reply(struct drbd_connection *connection, enum drbd_state_rv retcode); -extern void drbd_free_ldev(struct drbd_backing_dev *ldev); +extern void drbd_backing_dev_free(struct drbd_device *device, struct drbd_backing_dev *ldev); extern void drbd_device_cleanup(struct drbd_device *device); void drbd_print_uuids(struct drbd_device *device, const char *text); @@ -1424,7 +1434,7 @@ extern struct bio_set *drbd_md_io_bio_set; /* to allocate from that set */ extern struct bio 
*bio_alloc_drbd(gfp_t gfp_mask); -extern rwlock_t global_state_lock; +extern struct mutex resources_mutex; extern int conn_lowest_minor(struct drbd_connection *connection); extern enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsigned int minor); @@ -1454,6 +1464,9 @@ extern int is_valid_ar_handle(struct drbd_request *, sector_t); /* drbd_nl.c */ + +extern struct mutex notification_mutex; + extern void drbd_suspend_io(struct drbd_device *device); extern void drbd_resume_io(struct drbd_device *device); extern char *ppsize(char *buf, unsigned long long size); @@ -1536,7 +1549,9 @@ extern void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req); /* drbd_receiver.c */ extern int drbd_receiver(struct drbd_thread *thi); -extern int drbd_asender(struct drbd_thread *thi); +extern int drbd_ack_receiver(struct drbd_thread *thi); +extern void drbd_send_ping_wf(struct work_struct *ws); +extern void drbd_send_acks_wf(struct work_struct *ws); extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device); extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector, bool throttle_if_app_is_waiting); @@ -1649,7 +1664,7 @@ extern int __drbd_change_sync(struct drbd_device *device, sector_t sector, int s #define drbd_rs_failed_io(device, sector, size) \ __drbd_change_sync(device, sector, size, RECORD_RS_FAILED) extern void drbd_al_shrink(struct drbd_device *device); -extern int drbd_initialize_al(struct drbd_device *, void *); +extern int drbd_al_initialize(struct drbd_device *, void *); /* drbd_nl.c */ /* state info broadcast */ @@ -1668,6 +1683,29 @@ struct sib_info { }; void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib); +extern void notify_resource_state(struct sk_buff *, + unsigned int, + struct drbd_resource *, + struct resource_info *, + enum drbd_notification_type); +extern void notify_device_state(struct sk_buff *, + unsigned int, + struct drbd_device *, + struct device_info *, + enum drbd_notification_type); +extern void notify_connection_state(struct sk_buff *, + unsigned int, + struct drbd_connection *, + struct connection_info *, + enum drbd_notification_type); +extern void notify_peer_device_state(struct sk_buff *, + unsigned int, + struct drbd_peer_device *, + struct peer_device_info *, + enum drbd_notification_type); +extern void notify_helper(enum drbd_notification_type, struct drbd_device *, + struct drbd_connection *, const char *, int); + /* * inline helper functions *************************/ @@ -1694,19 +1732,6 @@ static inline int drbd_peer_req_has_active_page(struct drbd_peer_request *peer_r return 0; } -static inline enum drbd_state_rv -_drbd_set_state(struct drbd_device *device, union drbd_state ns, - enum chg_state_flags flags, struct completion *done) -{ - enum drbd_state_rv rv; - - read_lock(&global_state_lock); - rv = __drbd_set_state(device, ns, flags, done); - read_unlock(&global_state_lock); - - return rv; -} - static inline union drbd_state drbd_read_state(struct drbd_device *device) { struct drbd_resource *resource = device->resource; @@ -1937,16 +1962,21 @@ drbd_device_post_work(struct drbd_device *device, int work_bit) extern void drbd_flush_workqueue(struct drbd_work_queue *work_queue); -static inline void wake_asender(struct drbd_connection *connection) +/* To get the ack_receiver out of the blocking network stack, + * so it can change its sk_rcvtimeo from idle- to ping-timeout, + * and send a ping, we need to send a signal. + * Which signal we send is irrelevant. 
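+ * The signal only serves to make the blocking network receive return
+ * early; wake_ack_receiver() below delivers SIGXCPU, and request_ping()
+ * sets SEND_PING before waking the thread.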
*/ +static inline void wake_ack_receiver(struct drbd_connection *connection) { - if (test_bit(SIGNAL_ASENDER, &connection->flags)) - force_sig(DRBD_SIG, connection->asender.task); + struct task_struct *task = connection->ack_receiver.task; + if (task && get_t_state(&connection->ack_receiver) == RUNNING) + force_sig(SIGXCPU, task); } static inline void request_ping(struct drbd_connection *connection) { set_bit(SEND_PING, &connection->flags); - wake_asender(connection); + wake_ack_receiver(connection); } extern void *conn_prepare_command(struct drbd_connection *, struct drbd_socket *); @@ -2230,7 +2260,7 @@ static inline bool may_inc_ap_bio(struct drbd_device *device) if (drbd_suspended(device)) return false; - if (test_bit(SUSPEND_IO, &device->flags)) + if (atomic_read(&device->suspend_cnt)) return false; /* to avoid potential deadlock or bitmap corruption, diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 74d97f4bac34..5b43dfb79819 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -117,6 +117,7 @@ module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0 */ struct idr drbd_devices; struct list_head drbd_resources; +struct mutex resources_mutex; struct kmem_cache *drbd_request_cache; struct kmem_cache *drbd_ee_cache; /* peer requests */ @@ -1435,8 +1436,8 @@ static int we_should_drop_the_connection(struct drbd_connection *connection, str /* long elapsed = (long)(jiffies - device->last_received); */ drop_it = connection->meta.socket == sock - || !connection->asender.task - || get_t_state(&connection->asender) != RUNNING + || !connection->ack_receiver.task + || get_t_state(&connection->ack_receiver) != RUNNING || connection->cstate < C_WF_REPORT_PARAMS; if (drop_it) @@ -1793,15 +1794,6 @@ int drbd_send(struct drbd_connection *connection, struct socket *sock, drbd_update_congested(connection); } do { - /* STRANGE - * tcp_sendmsg does _not_ use its size parameter at all ? - * - * -EAGAIN on timeout, -EINTR on signal. - */ -/* THINK - * do we need to block DRBD_SIG if sock == &meta.socket ?? - * otherwise wake_asender() might interrupt some send_*Ack ! 
- */ rv = kernel_sendmsg(sock, &msg, &iov, 1, size); if (rv == -EAGAIN) { if (we_should_drop_the_connection(connection, sock)) @@ -2000,7 +1992,7 @@ void drbd_device_cleanup(struct drbd_device *device) drbd_bm_cleanup(device); } - drbd_free_ldev(device->ldev); + drbd_backing_dev_free(device, device->ldev); device->ldev = NULL; clear_bit(AL_SUSPENDED, &device->flags); @@ -2179,7 +2171,7 @@ void drbd_destroy_device(struct kref *kref) if (device->this_bdev) bdput(device->this_bdev); - drbd_free_ldev(device->ldev); + drbd_backing_dev_free(device, device->ldev); device->ldev = NULL; drbd_release_all_peer_reqs(device); @@ -2563,7 +2555,7 @@ int set_resource_options(struct drbd_resource *resource, struct res_opts *res_op cpumask_copy(resource->cpu_mask, new_cpu_mask); for_each_connection_rcu(connection, resource) { connection->receiver.reset_cpu_mask = 1; - connection->asender.reset_cpu_mask = 1; + connection->ack_receiver.reset_cpu_mask = 1; connection->worker.reset_cpu_mask = 1; } } @@ -2590,7 +2582,7 @@ struct drbd_resource *drbd_create_resource(const char *name) kref_init(&resource->kref); idr_init(&resource->devices); INIT_LIST_HEAD(&resource->connections); - resource->write_ordering = WO_bdev_flush; + resource->write_ordering = WO_BDEV_FLUSH; list_add_tail_rcu(&resource->resources, &drbd_resources); mutex_init(&resource->conf_update); mutex_init(&resource->adm_mutex); @@ -2652,8 +2644,8 @@ struct drbd_connection *conn_create(const char *name, struct res_opts *res_opts) connection->receiver.connection = connection; drbd_thread_init(resource, &connection->worker, drbd_worker, "worker"); connection->worker.connection = connection; - drbd_thread_init(resource, &connection->asender, drbd_asender, "asender"); - connection->asender.connection = connection; + drbd_thread_init(resource, &connection->ack_receiver, drbd_ack_receiver, "ack_recv"); + connection->ack_receiver.connection = connection; kref_init(&connection->kref); @@ -2702,8 +2694,8 @@ static int init_submitter(struct drbd_device *device) { /* opencoded create_singlethread_workqueue(), * to be able to say "drbd%d", ..., minor */ - device->submit.wq = alloc_workqueue("drbd%u_submit", - WQ_UNBOUND | WQ_MEM_RECLAIM, 1, device->minor); + device->submit.wq = + alloc_ordered_workqueue("drbd%u_submit", WQ_MEM_RECLAIM, device->minor); if (!device->submit.wq) return -ENOMEM; @@ -2820,6 +2812,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig goto out_idr_remove_from_resource; } kref_get(&connection->kref); + INIT_WORK(&peer_device->send_acks_work, drbd_send_acks_wf); } if (init_submitter(device)) { @@ -2923,7 +2916,7 @@ static int __init drbd_init(void) drbd_proc = NULL; /* play safe for drbd_cleanup */ idr_init(&drbd_devices); - rwlock_init(&global_state_lock); + mutex_init(&resources_mutex); INIT_LIST_HEAD(&drbd_resources); err = drbd_genl_register(); @@ -2971,18 +2964,6 @@ fail: return err; } -void drbd_free_ldev(struct drbd_backing_dev *ldev) -{ - if (ldev == NULL) - return; - - blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); - blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); - - kfree(ldev->disk_conf); - kfree(ldev); -} - static void drbd_free_one_sock(struct drbd_socket *ds) { struct socket *s; @@ -3277,6 +3258,10 @@ int drbd_md_read(struct drbd_device *device, struct drbd_backing_dev *bdev) * and read it. 
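 * (The added lines below first pin md_size_sect to 8 sectors, i.e.
 * 8 * 512 == 4096 bytes, exactly one 4k superblock, so the
 * out-of-range check in drbd_md_sync_page_io() stays confined to the
 * superblock until the real sizes have been read.)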
*/ bdev->md.meta_dev_idx = bdev->disk_conf->meta_dev_idx; bdev->md.md_offset = drbd_md_ss(bdev); + /* Even for (flexible or indexed) external meta data, + * initially restrict us to the 4k superblock for now. + * Affects the paranoia out-of-range access check in drbd_md_sync_page_io(). */ + bdev->md.md_size_sect = 8; if (drbd_md_sync_page_io(device, bdev, bdev->md.md_offset, READ)) { /* NOTE: can't do normal error processing here as this is @@ -3578,7 +3563,9 @@ void drbd_queue_bitmap_io(struct drbd_device *device, spin_lock_irq(&device->resource->req_lock); set_bit(BITMAP_IO, &device->flags); - if (atomic_read(&device->ap_bio_cnt) == 0) { + /* don't wait for pending application IO if the caller indicates that + * application IO does not conflict anyways. */ + if (flags == BM_LOCKED_CHANGE_ALLOWED || atomic_read(&device->ap_bio_cnt) == 0) { if (!test_and_set_bit(BITMAP_IO_QUEUED, &device->flags)) drbd_queue_work(&first_peer_device(device)->connection->sender_work, &device->bm_io_work.w); @@ -3746,6 +3733,27 @@ int drbd_wait_misc(struct drbd_device *device, struct drbd_interval *i) return 0; } +void lock_all_resources(void) +{ + struct drbd_resource *resource; + int __maybe_unused i = 0; + + mutex_lock(&resources_mutex); + local_irq_disable(); + for_each_resource(resource, &drbd_resources) + spin_lock_nested(&resource->req_lock, i++); +} + +void unlock_all_resources(void) +{ + struct drbd_resource *resource; + + for_each_resource(resource, &drbd_resources) + spin_unlock(&resource->req_lock); + local_irq_enable(); + mutex_unlock(&resources_mutex); +} + #ifdef CONFIG_DRBD_FAULT_INJECTION /* Fault insertion support including random number generator shamelessly * stolen from kernel/rcutorture.c */ diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index e80cbefbc2b5..c055c5e12f24 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -36,6 +36,7 @@ #include "drbd_int.h" #include "drbd_protocol.h" #include "drbd_req.h" +#include "drbd_state_change.h" #include <asm/unaligned.h> #include <linux/drbd_limits.h> #include <linux/kthread.h> @@ -75,11 +76,24 @@ int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info); int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info); /* .dumpit */ int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb); +int drbd_adm_dump_resources(struct sk_buff *skb, struct netlink_callback *cb); +int drbd_adm_dump_devices(struct sk_buff *skb, struct netlink_callback *cb); +int drbd_adm_dump_devices_done(struct netlink_callback *cb); +int drbd_adm_dump_connections(struct sk_buff *skb, struct netlink_callback *cb); +int drbd_adm_dump_connections_done(struct netlink_callback *cb); +int drbd_adm_dump_peer_devices(struct sk_buff *skb, struct netlink_callback *cb); +int drbd_adm_dump_peer_devices_done(struct netlink_callback *cb); +int drbd_adm_get_initial_state(struct sk_buff *skb, struct netlink_callback *cb); #include <linux/drbd_genl_api.h> #include "drbd_nla.h" #include <linux/genl_magic_func.h> +static atomic_t drbd_genl_seq = ATOMIC_INIT(2); /* two. */ +static atomic_t notify_genl_seq = ATOMIC_INIT(2); /* two. */ + +DEFINE_MUTEX(notification_mutex); + /* used blkdev_get_by_path, to claim our meta data device(s) */ static char *drbd_m_holder = "Hands off! 
this is DRBD's meta data device."; @@ -349,6 +363,7 @@ int drbd_khelper(struct drbd_device *device, char *cmd) sib.sib_reason = SIB_HELPER_PRE; sib.helper_name = cmd; drbd_bcast_event(device, &sib); + notify_helper(NOTIFY_CALL, device, connection, cmd, 0); ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC); if (ret) drbd_warn(device, "helper command: %s %s %s exit code %u (0x%x)\n", @@ -361,6 +376,7 @@ int drbd_khelper(struct drbd_device *device, char *cmd) sib.sib_reason = SIB_HELPER_POST; sib.helper_exit_code = ret; drbd_bcast_event(device, &sib); + notify_helper(NOTIFY_RESPONSE, device, connection, cmd, ret); if (current == connection->worker.task) clear_bit(CALLBACK_PENDING, &connection->flags); @@ -388,6 +404,7 @@ static int conn_khelper(struct drbd_connection *connection, char *cmd) drbd_info(connection, "helper command: %s %s %s\n", usermode_helper, cmd, resource_name); /* TODO: conn_bcast_event() ?? */ + notify_helper(NOTIFY_CALL, NULL, connection, cmd, 0); ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC); if (ret) @@ -399,6 +416,7 @@ static int conn_khelper(struct drbd_connection *connection, char *cmd) usermode_helper, cmd, resource_name, (ret >> 8) & 0xff, ret); /* TODO: conn_bcast_event() ?? */ + notify_helper(NOTIFY_RESPONSE, NULL, connection, cmd, ret); if (ret < 0) /* Ignore any ERRNOs we got. */ ret = 0; @@ -847,9 +865,11 @@ char *ppsize(char *buf, unsigned long long size) * and can be long lived. * This changes an device->flag, is triggered by drbd internals, * and should be short-lived. */ +/* It needs to be a counter, since multiple threads might + independently suspend and resume IO. */ void drbd_suspend_io(struct drbd_device *device) { - set_bit(SUSPEND_IO, &device->flags); + atomic_inc(&device->suspend_cnt); if (drbd_suspended(device)) return; wait_event(device->misc_wait, !atomic_read(&device->ap_bio_cnt)); @@ -857,8 +877,8 @@ void drbd_suspend_io(struct drbd_device *device) void drbd_resume_io(struct drbd_device *device) { - clear_bit(SUSPEND_IO, &device->flags); - wake_up(&device->misc_wait); + if (atomic_dec_and_test(&device->suspend_cnt)) + wake_up(&device->misc_wait); } /** @@ -871,27 +891,32 @@ void drbd_resume_io(struct drbd_device *device) enum determine_dev_size drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct resize_parms *rs) __must_hold(local) { - sector_t prev_first_sect, prev_size; /* previous meta location */ - sector_t la_size_sect, u_size; + struct md_offsets_and_sizes { + u64 last_agreed_sect; + u64 md_offset; + s32 al_offset; + s32 bm_offset; + u32 md_size_sect; + + u32 al_stripes; + u32 al_stripe_size_4k; + } prev; + sector_t u_size, size; struct drbd_md *md = &device->ldev->md; - u32 prev_al_stripe_size_4k; - u32 prev_al_stripes; - sector_t size; char ppb[10]; void *buffer; int md_moved, la_size_changed; enum determine_dev_size rv = DS_UNCHANGED; - /* race: - * application request passes inc_ap_bio, - * but then cannot get an AL-reference. - * this function later may wait on ap_bio_cnt == 0. -> deadlock. + /* We may change the on-disk offsets of our meta data below. Lock out + * anything that may cause meta data IO, to avoid acting on incomplete + * layout changes or scribbling over meta data that is in the process + * of being moved. * - * to avoid that: - * Suspend IO right here. - * still lock the act_log to not trigger ASSERTs there. 
- */ + * Move is not exactly correct, btw, currently we have all our meta + * data in core memory, to "move" it we just write it all out, there + * are no reads. */ drbd_suspend_io(device); buffer = drbd_md_get_buffer(device, __func__); /* Lock meta-data IO */ if (!buffer) { @@ -899,19 +924,17 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct return DS_ERROR; } - /* no wait necessary anymore, actually we could assert that */ - wait_event(device->al_wait, lc_try_lock(device->act_log)); - - prev_first_sect = drbd_md_first_sector(device->ldev); - prev_size = device->ldev->md.md_size_sect; - la_size_sect = device->ldev->md.la_size_sect; + /* remember current offset and sizes */ + prev.last_agreed_sect = md->la_size_sect; + prev.md_offset = md->md_offset; + prev.al_offset = md->al_offset; + prev.bm_offset = md->bm_offset; + prev.md_size_sect = md->md_size_sect; + prev.al_stripes = md->al_stripes; + prev.al_stripe_size_4k = md->al_stripe_size_4k; if (rs) { /* rs is non NULL if we should change the AL layout only */ - - prev_al_stripes = md->al_stripes; - prev_al_stripe_size_4k = md->al_stripe_size_4k; - md->al_stripes = rs->al_stripes; md->al_stripe_size_4k = rs->al_stripe_size / 4; md->al_size_4k = (u64)rs->al_stripes * rs->al_stripe_size / 4; @@ -924,7 +947,7 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct rcu_read_unlock(); size = drbd_new_dev_size(device, device->ldev, u_size, flags & DDSF_FORCED); - if (size < la_size_sect) { + if (size < prev.last_agreed_sect) { if (rs && u_size == 0) { /* Remove "rs &&" later. This check should always be active, but right now the receiver expects the permissive behavior */ @@ -945,30 +968,29 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct err = drbd_bm_resize(device, size, !(flags & DDSF_NO_RESYNC)); if (unlikely(err)) { /* currently there is only one error: ENOMEM! */ - size = drbd_bm_capacity(device)>>1; + size = drbd_bm_capacity(device); if (size == 0) { drbd_err(device, "OUT OF MEMORY! " "Could not allocate bitmap!\n"); } else { drbd_err(device, "BM resizing failed. " - "Leaving size unchanged at size = %lu KB\n", - (unsigned long)size); + "Leaving size unchanged\n"); } rv = DS_ERROR; } /* racy, see comments above. */ drbd_set_my_capacity(device, size); - device->ldev->md.la_size_sect = size; + md->la_size_sect = size; drbd_info(device, "size = %s (%llu KB)\n", ppsize(ppb, size>>1), (unsigned long long)size>>1); } if (rv <= DS_ERROR) goto err_out; - la_size_changed = (la_size_sect != device->ldev->md.la_size_sect); + la_size_changed = (prev.last_agreed_sect != md->la_size_sect); - md_moved = prev_first_sect != drbd_md_first_sector(device->ldev) - || prev_size != device->ldev->md.md_size_sect; + md_moved = prev.md_offset != md->md_offset + || prev.md_size_sect != md->md_size_sect; if (la_size_changed || md_moved || rs) { u32 prev_flags; @@ -977,20 +999,29 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct * Clear the timer, to avoid scary "timer expired!" messages, * "Superblock" is written out at least twice below, anyways. */ del_timer(&device->md_sync_timer); - drbd_al_shrink(device); /* All extents inactive. */ + /* We won't change the "al-extents" setting, we just may need + * to move the on-disk location of the activity log ringbuffer. + * Lock for transaction is good enough, it may well be "dirty" + * or even "starving". 
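+ * The flag dance that follows is deliberate: MDF_FULL_SYNC and
+ * MDF_AL_DISABLED are set and synced to disk before bitmap and AL are
+ * rewritten at their new offsets, and the previous flags are restored
+ * only afterwards, so a crash in between forces a full sync rather
+ * than trusting half-moved meta data.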
*/ + wait_event(device->al_wait, lc_try_lock_for_transaction(device->act_log)); + + /* mark current on-disk bitmap and activity log as unreliable */ prev_flags = md->flags; - md->flags &= ~MDF_PRIMARY_IND; + md->flags |= MDF_FULL_SYNC | MDF_AL_DISABLED; drbd_md_write(device, buffer); + drbd_al_initialize(device, buffer); + drbd_info(device, "Writing the whole bitmap, %s\n", la_size_changed && md_moved ? "size changed and md moved" : la_size_changed ? "size changed" : "md moved"); /* next line implicitly does drbd_suspend_io()+drbd_resume_io() */ drbd_bitmap_io(device, md_moved ? &drbd_bm_write_all : &drbd_bm_write, "size changed", BM_LOCKED_MASK); - drbd_initialize_al(device, buffer); + /* on-disk bitmap and activity log is authoritative again + * (unless there was an IO error meanwhile...) */ md->flags = prev_flags; drbd_md_write(device, buffer); @@ -999,20 +1030,22 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct md->al_stripes, md->al_stripe_size_4k * 4); } - if (size > la_size_sect) - rv = la_size_sect ? DS_GREW : DS_GREW_FROM_ZERO; - if (size < la_size_sect) + if (size > prev.last_agreed_sect) + rv = prev.last_agreed_sect ? DS_GREW : DS_GREW_FROM_ZERO; + if (size < prev.last_agreed_sect) rv = DS_SHRUNK; if (0) { err_out: - if (rs) { - md->al_stripes = prev_al_stripes; - md->al_stripe_size_4k = prev_al_stripe_size_4k; - md->al_size_4k = (u64)prev_al_stripes * prev_al_stripe_size_4k; - - drbd_md_set_sector_offsets(device, device->ldev); - } + /* restore previous offset and sizes */ + md->la_size_sect = prev.last_agreed_sect; + md->md_offset = prev.md_offset; + md->al_offset = prev.al_offset; + md->bm_offset = prev.bm_offset; + md->md_size_sect = prev.md_size_sect; + md->al_stripes = prev.al_stripes; + md->al_stripe_size_4k = prev.al_stripe_size_4k; + md->al_size_4k = (u64)prev.al_stripes * prev.al_stripe_size_4k; } lc_unlock(device->act_log); wake_up(&device->al_wait); @@ -1115,8 +1148,7 @@ static int drbd_check_al_size(struct drbd_device *device, struct disk_conf *dc) lc_destroy(n); return -EBUSY; } else { - if (t) - lc_destroy(t); + lc_destroy(t); } drbd_md_mark_dirty(device); /* we changed device->act_log->nr_elemens */ return 0; @@ -1151,21 +1183,20 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi if (b) { struct drbd_connection *connection = first_peer_device(device)->connection; + blk_queue_max_discard_sectors(q, DRBD_MAX_DISCARD_SECTORS); + if (blk_queue_discard(b) && (connection->cstate < C_CONNECTED || connection->agreed_features & FF_TRIM)) { - /* For now, don't allow more than one activity log extent worth of data - * to be discarded in one go. We may need to rework drbd_al_begin_io() - * to allow for even larger discard ranges */ - blk_queue_max_discard_sectors(q, DRBD_MAX_DISCARD_SECTORS); - + /* We don't care, stacking below should fix it for the local device. + * Whether or not it is a suitable granularity on the remote device + * is not our problem, really. If you care, you need to + * use devices with similar topology on all peers. */ + q->limits.discard_granularity = 512; queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); - /* REALLY? Is stacking secdiscard "legal"? 
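The prev_flags dance above is the crash-safety core of the resize: while the bitmap and activity log are rewritten at their new on-disk location, the superblock must claim they are unusable, so an ill-timed crash forces a full sync instead of trusting half-moved metadata. The pattern in isolation; the flag values and helpers here are illustrative, not DRBD's real ones:

    #include <stdint.h>

    enum { MDF_FULL_SYNC = 1u << 0, MDF_AL_DISABLED = 1u << 1 }; /* illustrative bits */
    struct md { uint32_t flags; };

    static void write_super(struct md *md)           { (void)md; } /* stand-in */
    static void rewrite_bitmap_and_al(struct md *md) { (void)md; } /* stand-in */

    static void move_metadata(struct md *md)
    {
        uint32_t prev_flags = md->flags;

        md->flags |= MDF_FULL_SYNC | MDF_AL_DISABLED;  /* on-disk copies untrusted */
        write_super(md);

        rewrite_bitmap_and_al(md);      /* a crash in here forces a full sync */

        md->flags = prev_flags;         /* bitmap and AL authoritative again */
        write_super(md);
    }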
*/ - if (blk_queue_secdiscard(b)) - queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, q); } else { blk_queue_max_discard_sectors(q, 0); queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q); - queue_flag_clear_unlocked(QUEUE_FLAG_SECDISCARD, q); + q->limits.discard_granularity = 0; } blk_queue_stack_limits(q, b); @@ -1177,6 +1208,12 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages; } } + /* To avoid confusion, if this queue does not support discard, clear + * max_discard_sectors, which is what lsblk -D reports to the user. */ + if (!blk_queue_discard(q)) { + blk_queue_max_discard_sectors(q, 0); + q->limits.discard_granularity = 0; + } } void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backing_dev *bdev) @@ -1241,8 +1278,8 @@ static void conn_reconfig_done(struct drbd_connection *connection) connection->cstate == C_STANDALONE; spin_unlock_irq(&connection->resource->req_lock); if (stop_threads) { - /* asender is implicitly stopped by receiver - * in conn_disconnect() */ + /* ack_receiver thread and ack_sender workqueue are implicitly + * stopped by receiver in conn_disconnect() */ drbd_thread_stop(&connection->receiver); drbd_thread_stop(&connection->worker); } @@ -1389,13 +1426,13 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) goto fail_unlock; } - write_lock_irq(&global_state_lock); + lock_all_resources(); retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after); if (retcode == NO_ERROR) { rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf); drbd_resync_after_changed(device); } - write_unlock_irq(&global_state_lock); + unlock_all_resources(); if (retcode != NO_ERROR) goto fail_unlock; @@ -1418,7 +1455,7 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) set_bit(MD_NO_FUA, &device->flags); if (write_ordering_changed(old_disk_conf, new_disk_conf)) - drbd_bump_write_ordering(device->resource, NULL, WO_bdev_flush); + drbd_bump_write_ordering(device->resource, NULL, WO_BDEV_FLUSH); drbd_md_sync(device); @@ -1449,6 +1486,88 @@ success: return 0; } +static struct block_device *open_backing_dev(struct drbd_device *device, + const char *bdev_path, void *claim_ptr, bool do_bd_link) +{ + struct block_device *bdev; + int err = 0; + + bdev = blkdev_get_by_path(bdev_path, + FMODE_READ | FMODE_WRITE | FMODE_EXCL, claim_ptr); + if (IS_ERR(bdev)) { + drbd_err(device, "open(\"%s\") failed with %ld\n", + bdev_path, PTR_ERR(bdev)); + return bdev; + } + + if (!do_bd_link) + return bdev; + + err = bd_link_disk_holder(bdev, device->vdisk); + if (err) { + blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); + drbd_err(device, "bd_link_disk_holder(\"%s\", ...) failed with %d\n", + bdev_path, err); + bdev = ERR_PTR(err); + } + return bdev; +} + +static int open_backing_devices(struct drbd_device *device, + struct disk_conf *new_disk_conf, + struct drbd_backing_dev *nbc) +{ + struct block_device *bdev; + + bdev = open_backing_dev(device, new_disk_conf->backing_dev, device, true); + if (IS_ERR(bdev)) + return ERR_OPEN_DISK; + nbc->backing_bdev = bdev; + + /* + * meta_dev_idx >= 0: external fixed size, possibly multiple + * drbd sharing one meta device. TODO in that case, paranoia + * check that [md_bdev, meta_dev_idx] is not yet used by some + * other drbd minor! 
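The new tail of drbd_setup_queue_param() enforces a simple invariant: a queue that does not advertise discard support must not advertise discard limits either, because tools such as lsblk -D read max_discard_sectors directly. Stated as a check; this helper is hypothetical, not part of the patch:

    #include <linux/blkdev.h>

    /* True iff the queue's discard limits are consistent with its
     * QUEUE_FLAG_DISCARD setting. */
    static bool discard_limits_coherent(struct request_queue *q)
    {
        if (blk_queue_discard(q))
            return true;
        return q->limits.max_discard_sectors == 0 &&
               q->limits.discard_granularity == 0;
    }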
(if you use drbd.conf + drbdadm, that + * should check it for you already; but if you don't, or + * someone fooled it, we need to double check here) + */ + bdev = open_backing_dev(device, new_disk_conf->meta_dev, + /* claim ptr: device, if claimed exclusively; shared drbd_m_holder, + * if potentially shared with other drbd minors */ + (new_disk_conf->meta_dev_idx < 0) ? (void*)device : (void*)drbd_m_holder, + /* avoid double bd_claim_by_disk() for the same (source,target) tuple, + * as would happen with internal metadata. */ + (new_disk_conf->meta_dev_idx != DRBD_MD_INDEX_FLEX_INT && + new_disk_conf->meta_dev_idx != DRBD_MD_INDEX_INTERNAL)); + if (IS_ERR(bdev)) + return ERR_OPEN_MD_DISK; + nbc->md_bdev = bdev; + return NO_ERROR; +} + +static void close_backing_dev(struct drbd_device *device, struct block_device *bdev, + bool do_bd_unlink) +{ + if (!bdev) + return; + if (do_bd_unlink) + bd_unlink_disk_holder(bdev, device->vdisk); + blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); +} + +void drbd_backing_dev_free(struct drbd_device *device, struct drbd_backing_dev *ldev) +{ + if (ldev == NULL) + return; + + close_backing_dev(device, ldev->md_bdev, ldev->md_bdev != ldev->backing_bdev); + close_backing_dev(device, ldev->backing_bdev, true); + + kfree(ldev->disk_conf); + kfree(ldev); +} + int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) { struct drbd_config_context adm_ctx; @@ -1462,7 +1581,6 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) sector_t min_md_device_sectors; struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */ struct disk_conf *new_disk_conf = NULL; - struct block_device *bdev; struct lru_cache *resync_lru = NULL; struct fifo_buffer *new_plan = NULL; union drbd_state ns, os; @@ -1478,7 +1596,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) device = adm_ctx.device; mutex_lock(&adm_ctx.resource->adm_mutex); peer_device = first_peer_device(device); - connection = peer_device ? peer_device->connection : NULL; + connection = peer_device->connection; conn_reconfig_start(connection); /* if you want to reconfigure, please tear down first */ @@ -1539,12 +1657,6 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) goto fail; } - write_lock_irq(&global_state_lock); - retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after); - write_unlock_irq(&global_state_lock); - if (retcode != NO_ERROR) - goto fail; - rcu_read_lock(); nc = rcu_dereference(connection->net_conf); if (nc) { @@ -1556,35 +1668,9 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) } rcu_read_unlock(); - bdev = blkdev_get_by_path(new_disk_conf->backing_dev, - FMODE_READ | FMODE_WRITE | FMODE_EXCL, device); - if (IS_ERR(bdev)) { - drbd_err(device, "open(\"%s\") failed with %ld\n", new_disk_conf->backing_dev, - PTR_ERR(bdev)); - retcode = ERR_OPEN_DISK; - goto fail; - } - nbc->backing_bdev = bdev; - - /* - * meta_dev_idx >= 0: external fixed size, possibly multiple - * drbd sharing one meta device. TODO in that case, paranoia - * check that [md_bdev, meta_dev_idx] is not yet used by some - * other drbd minor! (if you use drbd.conf + drbdadm, that - * should check it for you already; but if you don't, or - * someone fooled it, we need to double check here) - */ - bdev = blkdev_get_by_path(new_disk_conf->meta_dev, - FMODE_READ | FMODE_WRITE | FMODE_EXCL, - (new_disk_conf->meta_dev_idx < 0) ? 
- (void *)device : (void *)drbd_m_holder); - if (IS_ERR(bdev)) { - drbd_err(device, "open(\"%s\") failed with %ld\n", new_disk_conf->meta_dev, - PTR_ERR(bdev)); - retcode = ERR_OPEN_MD_DISK; + retcode = open_backing_devices(device, new_disk_conf, nbc); + if (retcode != NO_ERROR) goto fail; - } - nbc->md_bdev = bdev; if ((nbc->backing_bdev == nbc->md_bdev) != (new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_INTERNAL || @@ -1707,6 +1793,13 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) goto force_diskless_dec; } + lock_all_resources(); + retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after); + if (retcode != NO_ERROR) { + unlock_all_resources(); + goto force_diskless_dec; + } + /* Reset the "barriers don't work" bits here, then force meta data to * be written, to ensure we determine if barriers are supported. */ if (new_disk_conf->md_flushes) @@ -1727,7 +1820,9 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) new_disk_conf = NULL; new_plan = NULL; - drbd_bump_write_ordering(device->resource, device->ldev, WO_bdev_flush); + drbd_resync_after_changed(device); + drbd_bump_write_ordering(device->resource, device->ldev, WO_BDEV_FLUSH); + unlock_all_resources(); if (drbd_md_test_flag(device->ldev, MDF_CRASHED_PRIMARY)) set_bit(CRASHED_PRIMARY, &device->flags); @@ -1875,12 +1970,8 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) fail: conn_reconfig_done(connection); if (nbc) { - if (nbc->backing_bdev) - blkdev_put(nbc->backing_bdev, - FMODE_READ | FMODE_WRITE | FMODE_EXCL); - if (nbc->md_bdev) - blkdev_put(nbc->md_bdev, - FMODE_READ | FMODE_WRITE | FMODE_EXCL); + close_backing_dev(device, nbc->md_bdev, nbc->md_bdev != nbc->backing_bdev); + close_backing_dev(device, nbc->backing_bdev, true); kfree(nbc); } kfree(new_disk_conf); @@ -1895,6 +1986,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) static int adm_detach(struct drbd_device *device, int force) { enum drbd_state_rv retcode; + void *buffer; int ret; if (force) { @@ -1905,13 +1997,16 @@ static int adm_detach(struct drbd_device *device, int force) } drbd_suspend_io(device); /* so no-one is stuck in drbd_al_begin_io */ - drbd_md_get_buffer(device, __func__); /* make sure there is no in-flight meta-data IO */ - retcode = drbd_request_state(device, NS(disk, D_FAILED)); - drbd_md_put_buffer(device); + buffer = drbd_md_get_buffer(device, __func__); /* make sure there is no in-flight meta-data IO */ + if (buffer) { + retcode = drbd_request_state(device, NS(disk, D_FAILED)); + drbd_md_put_buffer(device); + } else /* already <= D_FAILED */ + retcode = SS_NOTHING_TO_DO; /* D_FAILED will transition to DISKLESS. 
*/ + drbd_resume_io(device); ret = wait_event_interruptible(device->misc_wait, device->state.disk != D_FAILED); - drbd_resume_io(device); if ((int)retcode == (int)SS_IS_DISKLESS) retcode = SS_NOTHING_TO_DO; if (ret) @@ -2245,8 +2340,31 @@ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info) return 0; } +static void connection_to_info(struct connection_info *info, + struct drbd_connection *connection) +{ + info->conn_connection_state = connection->cstate; + info->conn_role = conn_highest_peer(connection); +} + +static void peer_device_to_info(struct peer_device_info *info, + struct drbd_peer_device *peer_device) +{ + struct drbd_device *device = peer_device->device; + + info->peer_repl_state = + max_t(enum drbd_conns, C_WF_REPORT_PARAMS, device->state.conn); + info->peer_disk_state = device->state.pdsk; + info->peer_resync_susp_user = device->state.user_isp; + info->peer_resync_susp_peer = device->state.peer_isp; + info->peer_resync_susp_dependency = device->state.aftr_isp; +} + int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) { + struct connection_info connection_info; + enum drbd_notification_type flags; + unsigned int peer_devices = 0; struct drbd_config_context adm_ctx; struct drbd_peer_device *peer_device; struct net_conf *old_net_conf, *new_net_conf = NULL; @@ -2347,6 +2465,22 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) connection->peer_addr_len = nla_len(adm_ctx.peer_addr); memcpy(&connection->peer_addr, nla_data(adm_ctx.peer_addr), connection->peer_addr_len); + idr_for_each_entry(&connection->peer_devices, peer_device, i) { + peer_devices++; + } + + connection_to_info(&connection_info, connection); + flags = (peer_devices--) ? NOTIFY_CONTINUES : 0; + mutex_lock(¬ification_mutex); + notify_connection_state(NULL, 0, connection, &connection_info, NOTIFY_CREATE | flags); + idr_for_each_entry(&connection->peer_devices, peer_device, i) { + struct peer_device_info peer_device_info; + + peer_device_to_info(&peer_device_info, peer_device); + flags = (peer_devices--) ? NOTIFY_CONTINUES : 0; + notify_peer_device_state(NULL, 0, peer_device, &peer_device_info, NOTIFY_CREATE | flags); + } + mutex_unlock(¬ification_mutex); mutex_unlock(&adm_ctx.resource->conf_update); rcu_read_lock(); @@ -2428,6 +2562,8 @@ static enum drbd_state_rv conn_try_disconnect(struct drbd_connection *connection drbd_err(connection, "unexpected rv2=%d in conn_try_disconnect()\n", rv2); + /* Unlike in DRBD 9, the state engine has generated + * NOTIFY_DESTROY events before clearing connection->net_conf. */ } return rv; } @@ -2585,6 +2721,7 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) mutex_unlock(&device->resource->conf_update); synchronize_rcu(); kfree(old_disk_conf); + new_disk_conf = NULL; } ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0); @@ -2618,6 +2755,7 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) fail_ldev: put_ldev(device); + kfree(new_disk_conf); goto fail; } @@ -2855,7 +2993,30 @@ int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info) mutex_lock(&adm_ctx.resource->adm_mutex); device = adm_ctx.device; if (test_bit(NEW_CUR_UUID, &device->flags)) { - drbd_uuid_new_current(device); + if (get_ldev_if_state(device, D_ATTACHING)) { + drbd_uuid_new_current(device); + put_ldev(device); + } else { + /* This is effectively a multi-stage "forced down". 
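The two small additions in drbd_adm_resize() close a memory leak without risking a double free: once rcu_assign_pointer() has published new_disk_conf, the local pointer is set to NULL, so the shared fail_ldev path can kfree() it unconditionally. The publish-then-NULL idiom in a self-contained sketch; publish() and validate() are stand-ins:

    #include <stdlib.h>

    struct conf { int dummy; };

    static int  validate(struct conf *c)                     { (void)c; return 0; }
    static void publish(struct conf **slot, struct conf *c)  { *slot = c; }

    static int reconfigure(struct conf **slot, struct conf *new_conf)
    {
        if (validate(new_conf) < 0)
            goto fail;
        publish(slot, new_conf);
        new_conf = NULL;        /* ownership transferred to readers */
        /* ... later steps may still fail and jump to fail ... */
        return 0;
    fail:
        free(new_conf);         /* frees only if not yet published */
        return -1;
    }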
+ * The NEW_CUR_UUID bit is supposedly only set, if we + * lost the replication connection, and are configured + * to freeze IO and wait for some fence-peer handler. + * So we still don't have a replication connection. + * And now we don't have a local disk either. After + * resume, we will fail all pending and new IO, because + * we don't have any data anymore. Which means we will + * eventually be able to terminate all users of this + * device, and then take it down. By bumping the + * "effective" data uuid, we make sure that you really + * need to tear down before you reconfigure, we will + * the refuse to re-connect or re-attach (because no + * matching real data uuid exists). + */ + u64 val; + get_random_bytes(&val, sizeof(u64)); + drbd_set_ed_uuid(device, val); + drbd_warn(device, "Resumed without access to data; please tear down before attempting to re-configure.\n"); + } clear_bit(NEW_CUR_UUID, &device->flags); } drbd_suspend_io(device); @@ -2910,6 +3071,486 @@ nla_put_failure: } /* + * The generic netlink dump callbacks are called outside the genl_lock(), so + * they cannot use the simple attribute parsing code which uses global + * attribute tables. + */ +static struct nlattr *find_cfg_context_attr(const struct nlmsghdr *nlh, int attr) +{ + const unsigned hdrlen = GENL_HDRLEN + GENL_MAGIC_FAMILY_HDRSZ; + const int maxtype = ARRAY_SIZE(drbd_cfg_context_nl_policy) - 1; + struct nlattr *nla; + + nla = nla_find(nlmsg_attrdata(nlh, hdrlen), nlmsg_attrlen(nlh, hdrlen), + DRBD_NLA_CFG_CONTEXT); + if (!nla) + return NULL; + return drbd_nla_find_nested(maxtype, nla, __nla_type(attr)); +} + +static void resource_to_info(struct resource_info *, struct drbd_resource *); + +int drbd_adm_dump_resources(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct drbd_genlmsghdr *dh; + struct drbd_resource *resource; + struct resource_info resource_info; + struct resource_statistics resource_statistics; + int err; + + rcu_read_lock(); + if (cb->args[0]) { + for_each_resource_rcu(resource, &drbd_resources) + if (resource == (struct drbd_resource *)cb->args[0]) + goto found_resource; + err = 0; /* resource was probably deleted */ + goto out; + } + resource = list_entry(&drbd_resources, + struct drbd_resource, resources); + +found_resource: + list_for_each_entry_continue_rcu(resource, &drbd_resources, resources) { + goto put_result; + } + err = 0; + goto out; + +put_result: + dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, &drbd_genl_family, + NLM_F_MULTI, DRBD_ADM_GET_RESOURCES); + err = -ENOMEM; + if (!dh) + goto out; + dh->minor = -1U; + dh->ret_code = NO_ERROR; + err = nla_put_drbd_cfg_context(skb, resource, NULL, NULL); + if (err) + goto out; + err = res_opts_to_skb(skb, &resource->res_opts, !capable(CAP_SYS_ADMIN)); + if (err) + goto out; + resource_to_info(&resource_info, resource); + err = resource_info_to_skb(skb, &resource_info, !capable(CAP_SYS_ADMIN)); + if (err) + goto out; + resource_statistics.res_stat_write_ordering = resource->write_ordering; + err = resource_statistics_to_skb(skb, &resource_statistics, !capable(CAP_SYS_ADMIN)); + if (err) + goto out; + cb->args[0] = (long)resource; + genlmsg_end(skb, dh); + err = 0; + +out: + rcu_read_unlock(); + if (err) + return err; + return skb->len; +} + +static void device_to_statistics(struct device_statistics *s, + struct drbd_device *device) +{ + memset(s, 0, sizeof(*s)); + s->dev_upper_blocked = !may_inc_ap_bio(device); + if (get_ldev(device)) { + struct drbd_md *md = &device->ldev->md; + u64 *history_uuids = 
(u64 *)s->history_uuids; + struct request_queue *q; + int n; + + spin_lock_irq(&md->uuid_lock); + s->dev_current_uuid = md->uuid[UI_CURRENT]; + BUILD_BUG_ON(sizeof(s->history_uuids) < UI_HISTORY_END - UI_HISTORY_START + 1); + for (n = 0; n < UI_HISTORY_END - UI_HISTORY_START + 1; n++) + history_uuids[n] = md->uuid[UI_HISTORY_START + n]; + for (; n < HISTORY_UUIDS; n++) + history_uuids[n] = 0; + s->history_uuids_len = HISTORY_UUIDS; + spin_unlock_irq(&md->uuid_lock); + + s->dev_disk_flags = md->flags; + q = bdev_get_queue(device->ldev->backing_bdev); + s->dev_lower_blocked = + bdi_congested(&q->backing_dev_info, + (1 << WB_async_congested) | + (1 << WB_sync_congested)); + put_ldev(device); + } + s->dev_size = drbd_get_capacity(device->this_bdev); + s->dev_read = device->read_cnt; + s->dev_write = device->writ_cnt; + s->dev_al_writes = device->al_writ_cnt; + s->dev_bm_writes = device->bm_writ_cnt; + s->dev_upper_pending = atomic_read(&device->ap_bio_cnt); + s->dev_lower_pending = atomic_read(&device->local_cnt); + s->dev_al_suspended = test_bit(AL_SUSPENDED, &device->flags); + s->dev_exposed_data_uuid = device->ed_uuid; +} + +static int put_resource_in_arg0(struct netlink_callback *cb, int holder_nr) +{ + if (cb->args[0]) { + struct drbd_resource *resource = + (struct drbd_resource *)cb->args[0]; + kref_put(&resource->kref, drbd_destroy_resource); + } + + return 0; +} + +int drbd_adm_dump_devices_done(struct netlink_callback *cb) { + return put_resource_in_arg0(cb, 7); +} + +static void device_to_info(struct device_info *, struct drbd_device *); + +int drbd_adm_dump_devices(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nlattr *resource_filter; + struct drbd_resource *resource; + struct drbd_device *uninitialized_var(device); + int minor, err, retcode; + struct drbd_genlmsghdr *dh; + struct device_info device_info; + struct device_statistics device_statistics; + struct idr *idr_to_search; + + resource = (struct drbd_resource *)cb->args[0]; + if (!cb->args[0] && !cb->args[1]) { + resource_filter = find_cfg_context_attr(cb->nlh, T_ctx_resource_name); + if (resource_filter) { + retcode = ERR_RES_NOT_KNOWN; + resource = drbd_find_resource(nla_data(resource_filter)); + if (!resource) + goto put_result; + cb->args[0] = (long)resource; + } + } + + rcu_read_lock(); + minor = cb->args[1]; + idr_to_search = resource ? 
&resource->devices : &drbd_devices; + device = idr_get_next(idr_to_search, &minor); + if (!device) { + err = 0; + goto out; + } + idr_for_each_entry_continue(idr_to_search, device, minor) { + retcode = NO_ERROR; + goto put_result; /* only one iteration */ + } + err = 0; + goto out; /* no more devices */ + +put_result: + dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, &drbd_genl_family, + NLM_F_MULTI, DRBD_ADM_GET_DEVICES); + err = -ENOMEM; + if (!dh) + goto out; + dh->ret_code = retcode; + dh->minor = -1U; + if (retcode == NO_ERROR) { + dh->minor = device->minor; + err = nla_put_drbd_cfg_context(skb, device->resource, NULL, device); + if (err) + goto out; + if (get_ldev(device)) { + struct disk_conf *disk_conf = + rcu_dereference(device->ldev->disk_conf); + + err = disk_conf_to_skb(skb, disk_conf, !capable(CAP_SYS_ADMIN)); + put_ldev(device); + if (err) + goto out; + } + device_to_info(&device_info, device); + err = device_info_to_skb(skb, &device_info, !capable(CAP_SYS_ADMIN)); + if (err) + goto out; + + device_to_statistics(&device_statistics, device); + err = device_statistics_to_skb(skb, &device_statistics, !capable(CAP_SYS_ADMIN)); + if (err) + goto out; + cb->args[1] = minor + 1; + } + genlmsg_end(skb, dh); + err = 0; + +out: + rcu_read_unlock(); + if (err) + return err; + return skb->len; +} + +int drbd_adm_dump_connections_done(struct netlink_callback *cb) +{ + return put_resource_in_arg0(cb, 6); +} + +enum { SINGLE_RESOURCE, ITERATE_RESOURCES }; + +int drbd_adm_dump_connections(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nlattr *resource_filter; + struct drbd_resource *resource = NULL, *next_resource; + struct drbd_connection *uninitialized_var(connection); + int err = 0, retcode; + struct drbd_genlmsghdr *dh; + struct connection_info connection_info; + struct connection_statistics connection_statistics; + + rcu_read_lock(); + resource = (struct drbd_resource *)cb->args[0]; + if (!cb->args[0]) { + resource_filter = find_cfg_context_attr(cb->nlh, T_ctx_resource_name); + if (resource_filter) { + retcode = ERR_RES_NOT_KNOWN; + resource = drbd_find_resource(nla_data(resource_filter)); + if (!resource) + goto put_result; + cb->args[0] = (long)resource; + cb->args[1] = SINGLE_RESOURCE; + } + } + if (!resource) { + if (list_empty(&drbd_resources)) + goto out; + resource = list_first_entry(&drbd_resources, struct drbd_resource, resources); + kref_get(&resource->kref); + cb->args[0] = (long)resource; + cb->args[1] = ITERATE_RESOURCES; + } + + next_resource: + rcu_read_unlock(); + mutex_lock(&resource->conf_update); + rcu_read_lock(); + if (cb->args[2]) { + for_each_connection_rcu(connection, resource) + if (connection == (struct drbd_connection *)cb->args[2]) + goto found_connection; + /* connection was probably deleted */ + goto no_more_connections; + } + connection = list_entry(&resource->connections, struct drbd_connection, connections); + +found_connection: + list_for_each_entry_continue_rcu(connection, &resource->connections, connections) { + if (!has_net_conf(connection)) + continue; + retcode = NO_ERROR; + goto put_result; /* only one iteration */ + } + +no_more_connections: + if (cb->args[1] == ITERATE_RESOURCES) { + for_each_resource_rcu(next_resource, &drbd_resources) { + if (next_resource == resource) + goto found_resource; + } + /* resource was probably deleted */ + } + goto out; + +found_resource: + list_for_each_entry_continue_rcu(next_resource, &drbd_resources, resources) { + mutex_unlock(&resource->conf_update); + 
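All of the new dump callbacks follow the same generic-netlink protocol: a dump is a sequence of callback invocations, the cursor between them lives in cb->args[], a positive return value (skb->len) asks to be called again, and 0 terminates the dump. DRBD emits one object per invocation, which keeps each message small. A skeleton of that shape; lookup_from() and emit_one() are hypothetical helpers over a flat object table:

    #include <net/netlink.h>

    struct obj { long id; };

    static struct obj *lookup_from(long id)                 { (void)id; return NULL; }
    static int emit_one(struct sk_buff *skb, struct obj *o) { (void)skb; (void)o; return 0; }

    static int my_dump(struct sk_buff *skb, struct netlink_callback *cb)
    {
        /* cb->args[] persists across invocations of the same dump. */
        struct obj *obj = lookup_from(cb->args[1]);

        if (!obj)
            return 0;                   /* dump finished */
        if (emit_one(skb, obj) == 0)
            cb->args[1] = obj->id + 1;  /* advance the cursor only on success */
        return skb->len;                /* netlink core calls us again */
    }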
kref_put(&resource->kref, drbd_destroy_resource); + resource = next_resource; + kref_get(&resource->kref); + cb->args[0] = (long)resource; + cb->args[2] = 0; + goto next_resource; + } + goto out; /* no more resources */ + +put_result: + dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, &drbd_genl_family, + NLM_F_MULTI, DRBD_ADM_GET_CONNECTIONS); + err = -ENOMEM; + if (!dh) + goto out; + dh->ret_code = retcode; + dh->minor = -1U; + if (retcode == NO_ERROR) { + struct net_conf *net_conf; + + err = nla_put_drbd_cfg_context(skb, resource, connection, NULL); + if (err) + goto out; + net_conf = rcu_dereference(connection->net_conf); + if (net_conf) { + err = net_conf_to_skb(skb, net_conf, !capable(CAP_SYS_ADMIN)); + if (err) + goto out; + } + connection_to_info(&connection_info, connection); + err = connection_info_to_skb(skb, &connection_info, !capable(CAP_SYS_ADMIN)); + if (err) + goto out; + connection_statistics.conn_congested = test_bit(NET_CONGESTED, &connection->flags); + err = connection_statistics_to_skb(skb, &connection_statistics, !capable(CAP_SYS_ADMIN)); + if (err) + goto out; + cb->args[2] = (long)connection; + } + genlmsg_end(skb, dh); + err = 0; + +out: + rcu_read_unlock(); + if (resource) + mutex_unlock(&resource->conf_update); + if (err) + return err; + return skb->len; +} + +enum mdf_peer_flag { + MDF_PEER_CONNECTED = 1 << 0, + MDF_PEER_OUTDATED = 1 << 1, + MDF_PEER_FENCING = 1 << 2, + MDF_PEER_FULL_SYNC = 1 << 3, +}; + +static void peer_device_to_statistics(struct peer_device_statistics *s, + struct drbd_peer_device *peer_device) +{ + struct drbd_device *device = peer_device->device; + + memset(s, 0, sizeof(*s)); + s->peer_dev_received = device->recv_cnt; + s->peer_dev_sent = device->send_cnt; + s->peer_dev_pending = atomic_read(&device->ap_pending_cnt) + + atomic_read(&device->rs_pending_cnt); + s->peer_dev_unacked = atomic_read(&device->unacked_cnt); + s->peer_dev_out_of_sync = drbd_bm_total_weight(device) << (BM_BLOCK_SHIFT - 9); + s->peer_dev_resync_failed = device->rs_failed << (BM_BLOCK_SHIFT - 9); + if (get_ldev(device)) { + struct drbd_md *md = &device->ldev->md; + + spin_lock_irq(&md->uuid_lock); + s->peer_dev_bitmap_uuid = md->uuid[UI_BITMAP]; + spin_unlock_irq(&md->uuid_lock); + s->peer_dev_flags = + (drbd_md_test_flag(device->ldev, MDF_CONNECTED_IND) ? + MDF_PEER_CONNECTED : 0) + + (drbd_md_test_flag(device->ldev, MDF_CONSISTENT) && + !drbd_md_test_flag(device->ldev, MDF_WAS_UP_TO_DATE) ? + MDF_PEER_OUTDATED : 0) + + /* FIXME: MDF_PEER_FENCING? */ + (drbd_md_test_flag(device->ldev, MDF_FULL_SYNC) ? + MDF_PEER_FULL_SYNC : 0); + put_ldev(device); + } +} + +int drbd_adm_dump_peer_devices_done(struct netlink_callback *cb) +{ + return put_resource_in_arg0(cb, 9); +} + +int drbd_adm_dump_peer_devices(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nlattr *resource_filter; + struct drbd_resource *resource; + struct drbd_device *uninitialized_var(device); + struct drbd_peer_device *peer_device = NULL; + int minor, err, retcode; + struct drbd_genlmsghdr *dh; + struct idr *idr_to_search; + + resource = (struct drbd_resource *)cb->args[0]; + if (!cb->args[0] && !cb->args[1]) { + resource_filter = find_cfg_context_attr(cb->nlh, T_ctx_resource_name); + if (resource_filter) { + retcode = ERR_RES_NOT_KNOWN; + resource = drbd_find_resource(nla_data(resource_filter)); + if (!resource) + goto put_result; + } + cb->args[0] = (long)resource; + } + + rcu_read_lock(); + minor = cb->args[1]; + idr_to_search = resource ? 
&resource->devices : &drbd_devices; + device = idr_find(idr_to_search, minor); + if (!device) { +next_device: + minor++; + cb->args[2] = 0; + device = idr_get_next(idr_to_search, &minor); + if (!device) { + err = 0; + goto out; + } + } + if (cb->args[2]) { + for_each_peer_device(peer_device, device) + if (peer_device == (struct drbd_peer_device *)cb->args[2]) + goto found_peer_device; + /* peer device was probably deleted */ + goto next_device; + } + /* Make peer_device point to the list head (not the first entry). */ + peer_device = list_entry(&device->peer_devices, struct drbd_peer_device, peer_devices); + +found_peer_device: + list_for_each_entry_continue_rcu(peer_device, &device->peer_devices, peer_devices) { + if (!has_net_conf(peer_device->connection)) + continue; + retcode = NO_ERROR; + goto put_result; /* only one iteration */ + } + goto next_device; + +put_result: + dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, &drbd_genl_family, + NLM_F_MULTI, DRBD_ADM_GET_PEER_DEVICES); + err = -ENOMEM; + if (!dh) + goto out; + dh->ret_code = retcode; + dh->minor = -1U; + if (retcode == NO_ERROR) { + struct peer_device_info peer_device_info; + struct peer_device_statistics peer_device_statistics; + + dh->minor = minor; + err = nla_put_drbd_cfg_context(skb, device->resource, peer_device->connection, device); + if (err) + goto out; + peer_device_to_info(&peer_device_info, peer_device); + err = peer_device_info_to_skb(skb, &peer_device_info, !capable(CAP_SYS_ADMIN)); + if (err) + goto out; + peer_device_to_statistics(&peer_device_statistics, peer_device); + err = peer_device_statistics_to_skb(skb, &peer_device_statistics, !capable(CAP_SYS_ADMIN)); + if (err) + goto out; + cb->args[1] = minor; + cb->args[2] = (long)peer_device; + } + genlmsg_end(skb, dh); + err = 0; + +out: + rcu_read_unlock(); + if (err) + return err; + return skb->len; +} +/* * Return the connection of @resource if @resource has exactly one connection. 
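One detail above is easy to misread: peer_device is deliberately aimed at the list head itself, which is not a valid entry, because list_for_each_entry_continue_rcu() starts at the element after the cursor. Aiming at the head therefore starts the walk at the first real entry. The idiom on its own:

    #include <linux/list.h>

    struct item {
        struct list_head node;
        int value;
    };

    static int sum(struct list_head *items)
    {
        /* The cursor points at the head; never dereference it as an
         * item, only use it as the "continue after this" position. */
        struct item *it = list_entry(items, struct item, node);
        int s = 0;

        list_for_each_entry_continue(it, items, node)
            s += it->value;     /* first iteration sees the first element */
        return s;
    }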
*/ static struct drbd_connection *the_only_connection(struct drbd_resource *resource) @@ -3414,8 +4055,18 @@ drbd_check_resource_name(struct drbd_config_context *adm_ctx) return NO_ERROR; } +static void resource_to_info(struct resource_info *info, + struct drbd_resource *resource) +{ + info->res_role = conn_highest_role(first_connection(resource)); + info->res_susp = resource->susp; + info->res_susp_nod = resource->susp_nod; + info->res_susp_fen = resource->susp_fen; +} + int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info) { + struct drbd_connection *connection; struct drbd_config_context adm_ctx; enum drbd_ret_code retcode; struct res_opts res_opts; @@ -3449,13 +4100,33 @@ int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info) } /* not yet safe for genl_family.parallel_ops */ - if (!conn_create(adm_ctx.resource_name, &res_opts)) + mutex_lock(&resources_mutex); + connection = conn_create(adm_ctx.resource_name, &res_opts); + mutex_unlock(&resources_mutex); + + if (connection) { + struct resource_info resource_info; + + mutex_lock(¬ification_mutex); + resource_to_info(&resource_info, connection->resource); + notify_resource_state(NULL, 0, connection->resource, + &resource_info, NOTIFY_CREATE); + mutex_unlock(¬ification_mutex); + } else retcode = ERR_NOMEM; + out: drbd_adm_finish(&adm_ctx, info, retcode); return 0; } +static void device_to_info(struct device_info *info, + struct drbd_device *device) +{ + info->dev_disk_state = device->state.disk; +} + + int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info) { struct drbd_config_context adm_ctx; @@ -3490,6 +4161,36 @@ int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info) mutex_lock(&adm_ctx.resource->adm_mutex); retcode = drbd_create_device(&adm_ctx, dh->minor); + if (retcode == NO_ERROR) { + struct drbd_device *device; + struct drbd_peer_device *peer_device; + struct device_info info; + unsigned int peer_devices = 0; + enum drbd_notification_type flags; + + device = minor_to_device(dh->minor); + for_each_peer_device(peer_device, device) { + if (!has_net_conf(peer_device->connection)) + continue; + peer_devices++; + } + + device_to_info(&info, device); + mutex_lock(¬ification_mutex); + flags = (peer_devices--) ? NOTIFY_CONTINUES : 0; + notify_device_state(NULL, 0, device, &info, NOTIFY_CREATE | flags); + for_each_peer_device(peer_device, device) { + struct peer_device_info peer_device_info; + + if (!has_net_conf(peer_device->connection)) + continue; + peer_device_to_info(&peer_device_info, peer_device); + flags = (peer_devices--) ? NOTIFY_CONTINUES : 0; + notify_peer_device_state(NULL, 0, peer_device, &peer_device_info, + NOTIFY_CREATE | flags); + } + mutex_unlock(¬ification_mutex); + } mutex_unlock(&adm_ctx.resource->adm_mutex); out: drbd_adm_finish(&adm_ctx, info, retcode); @@ -3498,13 +4199,35 @@ out: static enum drbd_ret_code adm_del_minor(struct drbd_device *device) { + struct drbd_peer_device *peer_device; + if (device->state.disk == D_DISKLESS && /* no need to be device->state.conn == C_STANDALONE && * we may want to delete a minor from a live replication group. */ device->state.role == R_SECONDARY) { + struct drbd_connection *connection = + first_connection(device->resource); + _drbd_request_state(device, NS(conn, C_WF_REPORT_PARAMS), CS_VERBOSE + CS_WAIT_COMPLETE); + + /* If the state engine hasn't stopped the sender thread yet, we + * need to flush the sender work queue before generating the + * DESTROY events here. 
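Both drbd_adm_connect() earlier and drbd_adm_new_minor() above chain their NOTIFY_CREATE events with the same countdown idiom: every event except the last carries NOTIFY_CONTINUES, so a listener can treat the whole group as one consistent update. A self-contained sketch; the flag values are illustrative and emit() stands in for the notify_*_state() calls:

    #include <stdio.h>

    enum { NOTIFY_CREATE = 0x1, NOTIFY_CONTINUES = 0x8 };  /* illustrative values */

    static void emit(const char *what, unsigned int flags)
    {
        printf("%s%s\n", what, (flags & NOTIFY_CONTINUES) ? " (continues)" : "");
    }

    /* One header event followed by 'count' member events. */
    static void emit_group(const char *members[], unsigned int count)
    {
        unsigned int remaining = count;
        unsigned int flags;

        /* Post-decrement: the test sees the old value, so the header
         * continues iff at least one member event follows. */
        flags = remaining-- ? NOTIFY_CONTINUES : 0;
        emit("connection", NOTIFY_CREATE | flags);

        for (unsigned int i = 0; i < count; i++) {
            flags = remaining-- ? NOTIFY_CONTINUES : 0;  /* last member gets 0 */
            emit(members[i], NOTIFY_CREATE | flags);
        }
    }

With two members this emits three events where only the last one lacks the continuation flag, which is exactly the shape the patch produces for a connection and its peer devices.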
*/ + if (get_t_state(&connection->worker) == RUNNING) + drbd_flush_workqueue(&connection->sender_work); + + mutex_lock(¬ification_mutex); + for_each_peer_device(peer_device, device) { + if (!has_net_conf(peer_device->connection)) + continue; + notify_peer_device_state(NULL, 0, peer_device, NULL, + NOTIFY_DESTROY | NOTIFY_CONTINUES); + } + notify_device_state(NULL, 0, device, NULL, NOTIFY_DESTROY); + mutex_unlock(¬ification_mutex); + drbd_delete_device(device); return NO_ERROR; } else @@ -3541,7 +4264,16 @@ static int adm_del_resource(struct drbd_resource *resource) if (!idr_is_empty(&resource->devices)) return ERR_RES_IN_USE; + /* The state engine has stopped the sender thread, so we don't + * need to flush the sender work queue before generating the + * DESTROY event here. */ + mutex_lock(¬ification_mutex); + notify_resource_state(NULL, 0, resource, NULL, NOTIFY_DESTROY); + mutex_unlock(¬ification_mutex); + + mutex_lock(&resources_mutex); list_del_rcu(&resource->resources); + mutex_unlock(&resources_mutex); /* Make sure all threads have actually stopped: state handling only * does drbd_thread_stop_nowait(). */ list_for_each_entry(connection, &resource->connections, connections) @@ -3637,7 +4369,6 @@ finish: void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib) { - static atomic_t drbd_genl_seq = ATOMIC_INIT(2); /* two. */ struct sk_buff *msg; struct drbd_genlmsghdr *d_out; unsigned seq; @@ -3658,7 +4389,7 @@ void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib) if (nla_put_status_info(msg, device, sib)) goto nla_put_failure; genlmsg_end(msg, d_out); - err = drbd_genl_multicast_events(msg, 0); + err = drbd_genl_multicast_events(msg, GFP_NOWAIT); /* msg has been consumed or freed in netlink_broadcast() */ if (err && err != -ESRCH) goto failed; @@ -3672,3 +4403,405 @@ failed: "Event seq:%u sib_reason:%u\n", err, seq, sib->sib_reason); } + +static int nla_put_notification_header(struct sk_buff *msg, + enum drbd_notification_type type) +{ + struct drbd_notification_header nh = { + .nh_type = type, + }; + + return drbd_notification_header_to_skb(msg, &nh, true); +} + +void notify_resource_state(struct sk_buff *skb, + unsigned int seq, + struct drbd_resource *resource, + struct resource_info *resource_info, + enum drbd_notification_type type) +{ + struct resource_statistics resource_statistics; + struct drbd_genlmsghdr *dh; + bool multicast = false; + int err; + + if (!skb) { + seq = atomic_inc_return(¬ify_genl_seq); + skb = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO); + err = -ENOMEM; + if (!skb) + goto failed; + multicast = true; + } + + err = -EMSGSIZE; + dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_RESOURCE_STATE); + if (!dh) + goto nla_put_failure; + dh->minor = -1U; + dh->ret_code = NO_ERROR; + if (nla_put_drbd_cfg_context(skb, resource, NULL, NULL) || + nla_put_notification_header(skb, type) || + ((type & ~NOTIFY_FLAGS) != NOTIFY_DESTROY && + resource_info_to_skb(skb, resource_info, true))) + goto nla_put_failure; + resource_statistics.res_stat_write_ordering = resource->write_ordering; + err = resource_statistics_to_skb(skb, &resource_statistics, !capable(CAP_SYS_ADMIN)); + if (err) + goto nla_put_failure; + genlmsg_end(skb, dh); + if (multicast) { + err = drbd_genl_multicast_events(skb, GFP_NOWAIT); + /* skb has been consumed or freed in netlink_broadcast() */ + if (err && err != -ESRCH) + goto failed; + } + return; + +nla_put_failure: + nlmsg_free(skb); +failed: + drbd_err(resource, "Error %d while broadcasting event. 
Event seq:%u\n", + err, seq); +} + +void notify_device_state(struct sk_buff *skb, + unsigned int seq, + struct drbd_device *device, + struct device_info *device_info, + enum drbd_notification_type type) +{ + struct device_statistics device_statistics; + struct drbd_genlmsghdr *dh; + bool multicast = false; + int err; + + if (!skb) { + seq = atomic_inc_return(¬ify_genl_seq); + skb = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO); + err = -ENOMEM; + if (!skb) + goto failed; + multicast = true; + } + + err = -EMSGSIZE; + dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_DEVICE_STATE); + if (!dh) + goto nla_put_failure; + dh->minor = device->minor; + dh->ret_code = NO_ERROR; + if (nla_put_drbd_cfg_context(skb, device->resource, NULL, device) || + nla_put_notification_header(skb, type) || + ((type & ~NOTIFY_FLAGS) != NOTIFY_DESTROY && + device_info_to_skb(skb, device_info, true))) + goto nla_put_failure; + device_to_statistics(&device_statistics, device); + device_statistics_to_skb(skb, &device_statistics, !capable(CAP_SYS_ADMIN)); + genlmsg_end(skb, dh); + if (multicast) { + err = drbd_genl_multicast_events(skb, GFP_NOWAIT); + /* skb has been consumed or freed in netlink_broadcast() */ + if (err && err != -ESRCH) + goto failed; + } + return; + +nla_put_failure: + nlmsg_free(skb); +failed: + drbd_err(device, "Error %d while broadcasting event. Event seq:%u\n", + err, seq); +} + +void notify_connection_state(struct sk_buff *skb, + unsigned int seq, + struct drbd_connection *connection, + struct connection_info *connection_info, + enum drbd_notification_type type) +{ + struct connection_statistics connection_statistics; + struct drbd_genlmsghdr *dh; + bool multicast = false; + int err; + + if (!skb) { + seq = atomic_inc_return(¬ify_genl_seq); + skb = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO); + err = -ENOMEM; + if (!skb) + goto failed; + multicast = true; + } + + err = -EMSGSIZE; + dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_CONNECTION_STATE); + if (!dh) + goto nla_put_failure; + dh->minor = -1U; + dh->ret_code = NO_ERROR; + if (nla_put_drbd_cfg_context(skb, connection->resource, connection, NULL) || + nla_put_notification_header(skb, type) || + ((type & ~NOTIFY_FLAGS) != NOTIFY_DESTROY && + connection_info_to_skb(skb, connection_info, true))) + goto nla_put_failure; + connection_statistics.conn_congested = test_bit(NET_CONGESTED, &connection->flags); + connection_statistics_to_skb(skb, &connection_statistics, !capable(CAP_SYS_ADMIN)); + genlmsg_end(skb, dh); + if (multicast) { + err = drbd_genl_multicast_events(skb, GFP_NOWAIT); + /* skb has been consumed or freed in netlink_broadcast() */ + if (err && err != -ESRCH) + goto failed; + } + return; + +nla_put_failure: + nlmsg_free(skb); +failed: + drbd_err(connection, "Error %d while broadcasting event. 
Event seq:%u\n", + err, seq); +} + +void notify_peer_device_state(struct sk_buff *skb, + unsigned int seq, + struct drbd_peer_device *peer_device, + struct peer_device_info *peer_device_info, + enum drbd_notification_type type) +{ + struct peer_device_statistics peer_device_statistics; + struct drbd_resource *resource = peer_device->device->resource; + struct drbd_genlmsghdr *dh; + bool multicast = false; + int err; + + if (!skb) { + seq = atomic_inc_return(¬ify_genl_seq); + skb = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO); + err = -ENOMEM; + if (!skb) + goto failed; + multicast = true; + } + + err = -EMSGSIZE; + dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_PEER_DEVICE_STATE); + if (!dh) + goto nla_put_failure; + dh->minor = -1U; + dh->ret_code = NO_ERROR; + if (nla_put_drbd_cfg_context(skb, resource, peer_device->connection, peer_device->device) || + nla_put_notification_header(skb, type) || + ((type & ~NOTIFY_FLAGS) != NOTIFY_DESTROY && + peer_device_info_to_skb(skb, peer_device_info, true))) + goto nla_put_failure; + peer_device_to_statistics(&peer_device_statistics, peer_device); + peer_device_statistics_to_skb(skb, &peer_device_statistics, !capable(CAP_SYS_ADMIN)); + genlmsg_end(skb, dh); + if (multicast) { + err = drbd_genl_multicast_events(skb, GFP_NOWAIT); + /* skb has been consumed or freed in netlink_broadcast() */ + if (err && err != -ESRCH) + goto failed; + } + return; + +nla_put_failure: + nlmsg_free(skb); +failed: + drbd_err(peer_device, "Error %d while broadcasting event. Event seq:%u\n", + err, seq); +} + +void notify_helper(enum drbd_notification_type type, + struct drbd_device *device, struct drbd_connection *connection, + const char *name, int status) +{ + struct drbd_resource *resource = device ? device->resource : connection->resource; + struct drbd_helper_info helper_info; + unsigned int seq = atomic_inc_return(¬ify_genl_seq); + struct sk_buff *skb = NULL; + struct drbd_genlmsghdr *dh; + int err; + + strlcpy(helper_info.helper_name, name, sizeof(helper_info.helper_name)); + helper_info.helper_name_len = min(strlen(name), sizeof(helper_info.helper_name)); + helper_info.helper_status = status; + + skb = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO); + err = -ENOMEM; + if (!skb) + goto fail; + + err = -EMSGSIZE; + dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_HELPER); + if (!dh) + goto fail; + dh->minor = device ? device->minor : -1; + dh->ret_code = NO_ERROR; + mutex_lock(¬ification_mutex); + if (nla_put_drbd_cfg_context(skb, resource, connection, device) || + nla_put_notification_header(skb, type) || + drbd_helper_info_to_skb(skb, &helper_info, true)) + goto unlock_fail; + genlmsg_end(skb, dh); + err = drbd_genl_multicast_events(skb, GFP_NOWAIT); + skb = NULL; + /* skb has been consumed or freed in netlink_broadcast() */ + if (err && err != -ESRCH) + goto unlock_fail; + mutex_unlock(¬ification_mutex); + return; + +unlock_fail: + mutex_unlock(¬ification_mutex); +fail: + nlmsg_free(skb); + drbd_err(resource, "Error %d while broadcasting event. Event seq:%u\n", + err, seq); +} + +static void notify_initial_state_done(struct sk_buff *skb, unsigned int seq) +{ + struct drbd_genlmsghdr *dh; + int err; + + err = -EMSGSIZE; + dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_INITIAL_STATE_DONE); + if (!dh) + goto nla_put_failure; + dh->minor = -1U; + dh->ret_code = NO_ERROR; + if (nla_put_notification_header(skb, NOTIFY_EXISTS)) + goto nla_put_failure; + genlmsg_end(skb, dh); + return; + +nla_put_failure: + nlmsg_free(skb); + pr_err("Error %d sending event. 
Event seq:%u\n", err, seq); +} + +static void free_state_changes(struct list_head *list) +{ + while (!list_empty(list)) { + struct drbd_state_change *state_change = + list_first_entry(list, struct drbd_state_change, list); + list_del(&state_change->list); + forget_state_change(state_change); + } +} + +static unsigned int notifications_for_state_change(struct drbd_state_change *state_change) +{ + return 1 + + state_change->n_connections + + state_change->n_devices + + state_change->n_devices * state_change->n_connections; +} + +static int get_initial_state(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct drbd_state_change *state_change = (struct drbd_state_change *)cb->args[0]; + unsigned int seq = cb->args[2]; + unsigned int n; + enum drbd_notification_type flags = 0; + + /* There is no need for taking notification_mutex here: it doesn't + matter if the initial state events mix with later state chage + events; we can always tell the events apart by the NOTIFY_EXISTS + flag. */ + + cb->args[5]--; + if (cb->args[5] == 1) { + notify_initial_state_done(skb, seq); + goto out; + } + n = cb->args[4]++; + if (cb->args[4] < cb->args[3]) + flags |= NOTIFY_CONTINUES; + if (n < 1) { + notify_resource_state_change(skb, seq, state_change->resource, + NOTIFY_EXISTS | flags); + goto next; + } + n--; + if (n < state_change->n_connections) { + notify_connection_state_change(skb, seq, &state_change->connections[n], + NOTIFY_EXISTS | flags); + goto next; + } + n -= state_change->n_connections; + if (n < state_change->n_devices) { + notify_device_state_change(skb, seq, &state_change->devices[n], + NOTIFY_EXISTS | flags); + goto next; + } + n -= state_change->n_devices; + if (n < state_change->n_devices * state_change->n_connections) { + notify_peer_device_state_change(skb, seq, &state_change->peer_devices[n], + NOTIFY_EXISTS | flags); + goto next; + } + +next: + if (cb->args[4] == cb->args[3]) { + struct drbd_state_change *next_state_change = + list_entry(state_change->list.next, + struct drbd_state_change, list); + cb->args[0] = (long)next_state_change; + cb->args[3] = notifications_for_state_change(next_state_change); + cb->args[4] = 0; + } +out: + return skb->len; +} + +int drbd_adm_get_initial_state(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct drbd_resource *resource; + LIST_HEAD(head); + + if (cb->args[5] >= 1) { + if (cb->args[5] > 1) + return get_initial_state(skb, cb); + if (cb->args[0]) { + struct drbd_state_change *state_change = + (struct drbd_state_change *)cb->args[0]; + + /* connect list to head */ + list_add(&head, &state_change->list); + free_state_changes(&head); + } + return 0; + } + + cb->args[5] = 2; /* number of iterations */ + mutex_lock(&resources_mutex); + for_each_resource(resource, &drbd_resources) { + struct drbd_state_change *state_change; + + state_change = remember_old_state(resource, GFP_KERNEL); + if (!state_change) { + if (!list_empty(&head)) + free_state_changes(&head); + mutex_unlock(&resources_mutex); + return -ENOMEM; + } + copy_old_to_new_state_change(state_change); + list_add_tail(&state_change->list, &head); + cb->args[5] += notifications_for_state_change(state_change); + } + mutex_unlock(&resources_mutex); + + if (!list_empty(&head)) { + struct drbd_state_change *state_change = + list_entry(head.next, struct drbd_state_change, list); + cb->args[0] = (long)state_change; + cb->args[3] = notifications_for_state_change(state_change); + list_del(&head); /* detach list from head */ + } + + cb->args[2] = cb->nlh->nlmsg_seq; + return 
get_initial_state(skb, cb); +} diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c index 3b10fa6cb039..6537b25db9c1 100644 --- a/drivers/block/drbd/drbd_proc.c +++ b/drivers/block/drbd/drbd_proc.c @@ -245,9 +245,9 @@ static int drbd_seq_show(struct seq_file *seq, void *v) char wp; static char write_ordering_chars[] = { - [WO_none] = 'n', - [WO_drain_io] = 'd', - [WO_bdev_flush] = 'f', + [WO_NONE] = 'n', + [WO_DRAIN_IO] = 'd', + [WO_BDEV_FLUSH] = 'f', }; seq_printf(seq, "version: " REL_VERSION " (api:%d/proto:%d-%d)\n%s\n", diff --git a/drivers/block/drbd/drbd_protocol.h b/drivers/block/drbd/drbd_protocol.h index 2da9104a3851..ef9245363dcc 100644 --- a/drivers/block/drbd/drbd_protocol.h +++ b/drivers/block/drbd/drbd_protocol.h @@ -23,7 +23,7 @@ enum drbd_packet { P_AUTH_RESPONSE = 0x11, P_STATE_CHG_REQ = 0x12, - /* asender (meta socket */ + /* (meta socket) */ P_PING = 0x13, P_PING_ACK = 0x14, P_RECV_ACK = 0x15, /* Used in protocol B */ diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index b4b5680ac6ad..1957fe8601dc 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -215,7 +215,7 @@ static void reclaim_finished_net_peer_reqs(struct drbd_device *device, } } -static void drbd_kick_lo_and_reclaim_net(struct drbd_device *device) +static void drbd_reclaim_net_peer_reqs(struct drbd_device *device) { LIST_HEAD(reclaimed); struct drbd_peer_request *peer_req, *t; @@ -223,11 +223,30 @@ static void drbd_kick_lo_and_reclaim_net(struct drbd_device *device) spin_lock_irq(&device->resource->req_lock); reclaim_finished_net_peer_reqs(device, &reclaimed); spin_unlock_irq(&device->resource->req_lock); - list_for_each_entry_safe(peer_req, t, &reclaimed, w.list) drbd_free_net_peer_req(device, peer_req); } +static void conn_reclaim_net_peer_reqs(struct drbd_connection *connection) +{ + struct drbd_peer_device *peer_device; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + if (!atomic_read(&device->pp_in_use_by_net)) + continue; + + kref_get(&device->kref); + rcu_read_unlock(); + drbd_reclaim_net_peer_reqs(device); + kref_put(&device->kref, drbd_destroy_device); + rcu_read_lock(); + } + rcu_read_unlock(); +} + /** * drbd_alloc_pages() - Returns @number pages, retries forever (or until signalled) * @device: DRBD device. @@ -265,10 +284,15 @@ struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int if (atomic_read(&device->pp_in_use) < mxb) page = __drbd_alloc_pages(device, number); + /* Try to keep the fast path fast, but occasionally we need + * to reclaim the pages we lended to the network stack. 
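conn_reclaim_net_peer_reqs() above uses the standard recipe for calling a possibly-sleeping function on every element of an RCU-protected IDR: pin the current element with a kref, drop the RCU read lock while working, then re-take it and continue the walk. The skeleton, with release() standing in for drbd_destroy_device():

    #include <linux/idr.h>
    #include <linux/kref.h>
    #include <linux/rcupdate.h>

    struct my_device {
        struct kref kref;
    };

    static void release(struct kref *kref)        { (void)kref; /* free container_of(...) */ }
    static void do_work(struct my_device *device) { (void)device; /* may sleep */ }

    static void walk_devices(struct idr *devices)
    {
        struct my_device *device;
        int id;

        rcu_read_lock();
        idr_for_each_entry(devices, device, id) {
            kref_get(&device->kref);    /* keep it alive across the unlock */
            rcu_read_unlock();

            do_work(device);            /* sleeping is fine here */

            kref_put(&device->kref, release);
            rcu_read_lock();            /* resume the RCU-protected walk */
        }
        rcu_read_unlock();
    }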
*/ + if (page && atomic_read(&device->pp_in_use_by_net) > 512) + drbd_reclaim_net_peer_reqs(device); + while (page == NULL) { prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE); - drbd_kick_lo_and_reclaim_net(device); + drbd_reclaim_net_peer_reqs(device); if (atomic_read(&device->pp_in_use) < mxb) { page = __drbd_alloc_pages(device, number); @@ -1099,7 +1123,15 @@ randomize: return 0; } - drbd_thread_start(&connection->asender); + drbd_thread_start(&connection->ack_receiver); + /* opencoded create_singlethread_workqueue(), + * to be able to use format string arguments */ + connection->ack_sender = + alloc_ordered_workqueue("drbd_as_%s", WQ_MEM_RECLAIM, connection->resource->name); + if (!connection->ack_sender) { + drbd_err(connection, "Failed to create workqueue ack_sender\n"); + return 0; + } mutex_lock(&connection->resource->conf_update); /* The discard_my_data flag is a single-shot modifier to the next @@ -1178,7 +1210,7 @@ static void drbd_flush(struct drbd_connection *connection) struct drbd_peer_device *peer_device; int vnr; - if (connection->resource->write_ordering >= WO_bdev_flush) { + if (connection->resource->write_ordering >= WO_BDEV_FLUSH) { rcu_read_lock(); idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { struct drbd_device *device = peer_device->device; @@ -1203,7 +1235,7 @@ static void drbd_flush(struct drbd_connection *connection) /* would rather check on EOPNOTSUPP, but that is not reliable. * don't try again for ANY return value != 0 * if (rv == -EOPNOTSUPP) */ - drbd_bump_write_ordering(connection->resource, NULL, WO_drain_io); + drbd_bump_write_ordering(connection->resource, NULL, WO_DRAIN_IO); } put_ldev(device); kref_put(&device->kref, drbd_destroy_device); @@ -1299,10 +1331,10 @@ max_allowed_wo(struct drbd_backing_dev *bdev, enum write_ordering_e wo) dc = rcu_dereference(bdev->disk_conf); - if (wo == WO_bdev_flush && !dc->disk_flushes) - wo = WO_drain_io; - if (wo == WO_drain_io && !dc->disk_drain) - wo = WO_none; + if (wo == WO_BDEV_FLUSH && !dc->disk_flushes) + wo = WO_DRAIN_IO; + if (wo == WO_DRAIN_IO && !dc->disk_drain) + wo = WO_NONE; return wo; } @@ -1319,13 +1351,13 @@ void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backin enum write_ordering_e pwo; int vnr; static char *write_ordering_str[] = { - [WO_none] = "none", - [WO_drain_io] = "drain", - [WO_bdev_flush] = "flush", + [WO_NONE] = "none", + [WO_DRAIN_IO] = "drain", + [WO_BDEV_FLUSH] = "flush", }; pwo = resource->write_ordering; - if (wo != WO_bdev_flush) + if (wo != WO_BDEV_FLUSH) wo = min(pwo, wo); rcu_read_lock(); idr_for_each_entry(&resource->devices, device, vnr) { @@ -1343,7 +1375,7 @@ void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backin rcu_read_unlock(); resource->write_ordering = wo; - if (pwo != resource->write_ordering || wo == WO_bdev_flush) + if (pwo != resource->write_ordering || wo == WO_BDEV_FLUSH) drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]); } @@ -1380,7 +1412,7 @@ int drbd_submit_peer_request(struct drbd_device *device, if (peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) { /* wait for all pending IO completions, before we start * zeroing things out. 
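The renamed write-ordering constants form an ordered scale, and drbd_bump_write_ordering() only ever moves down it: per-disk settings (disk_flushes, disk_drain) or a failed flush may weaken the method from flush to drain to none, while only an explicit WO_BDEV_FLUSH request may raise it again. The policy, condensed into a self-contained sketch:

    /* Mirrors max_allowed_wo() plus the min() step in
     * drbd_bump_write_ordering(); the two ints mirror the
     * disk_flushes / disk_drain options. */
    enum write_ordering_e { WO_NONE, WO_DRAIN_IO, WO_BDEV_FLUSH };

    static enum write_ordering_e
    max_allowed(enum write_ordering_e wo, int flushes, int drain)
    {
        if (wo == WO_BDEV_FLUSH && !flushes)
            wo = WO_DRAIN_IO;
        if (wo == WO_DRAIN_IO && !drain)
            wo = WO_NONE;
        return wo;
    }

    static enum write_ordering_e
    bump(enum write_ordering_e cur, enum write_ordering_e wanted,
         int flushes, int drain)
    {
        if (wanted != WO_BDEV_FLUSH)        /* non-flush requests only lower */
            wanted = cur < wanted ? cur : wanted;
        return max_allowed(wanted, flushes, drain);
    }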
*/ - conn_wait_active_ee_empty(first_peer_device(device)->connection); + conn_wait_active_ee_empty(peer_req->peer_device->connection); /* add it to the active list now, * so we can find it to present it in debugfs */ peer_req->submit_jif = jiffies; @@ -1508,12 +1540,6 @@ static void conn_wait_active_ee_empty(struct drbd_connection *connection) rcu_read_unlock(); } -static struct drbd_peer_device * -conn_peer_device(struct drbd_connection *connection, int volume_number) -{ - return idr_find(&connection->peer_devices, volume_number); -} - static int receive_Barrier(struct drbd_connection *connection, struct packet_info *pi) { int rv; @@ -1533,7 +1559,7 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf * Therefore we must send the barrier_ack after the barrier request was * completed. */ switch (connection->resource->write_ordering) { - case WO_none: + case WO_NONE: if (rv == FE_RECYCLED) return 0; @@ -1546,8 +1572,8 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf drbd_warn(connection, "Allocation of an epoch failed, slowing down\n"); /* Fall through */ - case WO_bdev_flush: - case WO_drain_io: + case WO_BDEV_FLUSH: + case WO_DRAIN_IO: conn_wait_active_ee_empty(connection); drbd_flush(connection); @@ -1752,7 +1778,7 @@ static int recv_dless_read(struct drbd_peer_device *peer_device, struct drbd_req } /* - * e_end_resync_block() is called in asender context via + * e_end_resync_block() is called in ack_sender context via * drbd_finish_peer_reqs(). */ static int e_end_resync_block(struct drbd_work *w, int unused) @@ -1926,7 +1952,7 @@ static void restart_conflicting_writes(struct drbd_device *device, } /* - * e_end_block() is called in asender context via drbd_finish_peer_reqs(). + * e_end_block() is called in ack_sender context via drbd_finish_peer_reqs(). */ static int e_end_block(struct drbd_work *w, int cancel) { @@ -1966,7 +1992,7 @@ static int e_end_block(struct drbd_work *w, int cancel) } else D_ASSERT(device, drbd_interval_empty(&peer_req->i)); - drbd_may_finish_epoch(first_peer_device(device)->connection, peer_req->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0)); + drbd_may_finish_epoch(peer_device->connection, peer_req->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0)); return err; } @@ -2098,7 +2124,7 @@ static int wait_for_and_update_peer_seq(struct drbd_peer_device *peer_device, co } rcu_read_lock(); - tp = rcu_dereference(first_peer_device(device)->connection->net_conf)->two_primaries; + tp = rcu_dereference(peer_device->connection->net_conf)->two_primaries; rcu_read_unlock(); if (!tp) @@ -2217,7 +2243,7 @@ static int handle_write_conflicts(struct drbd_device *device, peer_req->w.cb = superseded ? 
e_send_superseded : e_send_retry_write; list_add_tail(&peer_req->w.list, &device->done_ee); - wake_asender(connection); + queue_work(connection->ack_sender, &peer_req->peer_device->send_acks_work); err = -ENOENT; goto out; @@ -2364,7 +2390,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info * if (dp_flags & DP_SEND_RECEIVE_ACK) { /* I really don't like it that the receiver thread * sends on the msock, but anyways */ - drbd_send_ack(first_peer_device(device), P_RECV_ACK, peer_req); + drbd_send_ack(peer_device, P_RECV_ACK, peer_req); } if (tp) { @@ -4056,7 +4082,7 @@ static int receive_state(struct drbd_connection *connection, struct packet_info os = ns = drbd_read_state(device); spin_unlock_irq(&device->resource->req_lock); - /* If some other part of the code (asender thread, timeout) + /* If some other part of the code (ack_receiver thread, timeout) * already decided to close the connection again, * we must not "re-establish" it here. */ if (os.conn <= C_TEAR_DOWN) @@ -4661,8 +4687,12 @@ static void conn_disconnect(struct drbd_connection *connection) */ conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD); - /* asender does not clean up anything. it must not interfere, either */ - drbd_thread_stop(&connection->asender); + /* ack_receiver does not clean up anything. it must not interfere, either */ + drbd_thread_stop(&connection->ack_receiver); + if (connection->ack_sender) { + destroy_workqueue(connection->ack_sender); + connection->ack_sender = NULL; + } drbd_free_sock(connection); rcu_read_lock(); @@ -5431,49 +5461,39 @@ static int got_skip(struct drbd_connection *connection, struct packet_info *pi) return 0; } -static int connection_finish_peer_reqs(struct drbd_connection *connection) +struct meta_sock_cmd { + size_t pkt_size; + int (*fn)(struct drbd_connection *connection, struct packet_info *); +}; + +static void set_rcvtimeo(struct drbd_connection *connection, bool ping_timeout) { - struct drbd_peer_device *peer_device; - int vnr, not_empty = 0; + long t; + struct net_conf *nc; - do { - clear_bit(SIGNAL_ASENDER, &connection->flags); - flush_signals(current); + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + t = ping_timeout ? 
nc->ping_timeo : nc->ping_int; + rcu_read_unlock(); - rcu_read_lock(); - idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { - struct drbd_device *device = peer_device->device; - kref_get(&device->kref); - rcu_read_unlock(); - if (drbd_finish_peer_reqs(device)) { - kref_put(&device->kref, drbd_destroy_device); - return 1; - } - kref_put(&device->kref, drbd_destroy_device); - rcu_read_lock(); - } - set_bit(SIGNAL_ASENDER, &connection->flags); + t *= HZ; + if (ping_timeout) + t /= 10; - spin_lock_irq(&connection->resource->req_lock); - idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { - struct drbd_device *device = peer_device->device; - not_empty = !list_empty(&device->done_ee); - if (not_empty) - break; - } - spin_unlock_irq(&connection->resource->req_lock); - rcu_read_unlock(); - } while (not_empty); + connection->meta.socket->sk->sk_rcvtimeo = t; +} - return 0; +static void set_ping_timeout(struct drbd_connection *connection) +{ + set_rcvtimeo(connection, 1); } -struct asender_cmd { - size_t pkt_size; - int (*fn)(struct drbd_connection *connection, struct packet_info *); -}; +static void set_idle_timeout(struct drbd_connection *connection) +{ + set_rcvtimeo(connection, 0); +} -static struct asender_cmd asender_tbl[] = { +static struct meta_sock_cmd ack_receiver_tbl[] = { [P_PING] = { 0, got_Ping }, [P_PING_ACK] = { 0, got_PingAck }, [P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, @@ -5493,64 +5513,40 @@ static struct asender_cmd asender_tbl[] = { [P_RETRY_WRITE] = { sizeof(struct p_block_ack), got_BlockAck }, }; -int drbd_asender(struct drbd_thread *thi) +int drbd_ack_receiver(struct drbd_thread *thi) { struct drbd_connection *connection = thi->connection; - struct asender_cmd *cmd = NULL; + struct meta_sock_cmd *cmd = NULL; struct packet_info pi; + unsigned long pre_recv_jif; int rv; void *buf = connection->meta.rbuf; int received = 0; unsigned int header_size = drbd_header_size(connection); int expect = header_size; bool ping_timeout_active = false; - struct net_conf *nc; - int ping_timeo, tcp_cork, ping_int; struct sched_param param = { .sched_priority = 2 }; rv = sched_setscheduler(current, SCHED_RR, &param); if (rv < 0) - drbd_err(connection, "drbd_asender: ERROR set priority, ret=%d\n", rv); + drbd_err(connection, "drbd_ack_receiver: ERROR set priority, ret=%d\n", rv); while (get_t_state(thi) == RUNNING) { drbd_thread_current_set_cpu(thi); - rcu_read_lock(); - nc = rcu_dereference(connection->net_conf); - ping_timeo = nc->ping_timeo; - tcp_cork = nc->tcp_cork; - ping_int = nc->ping_int; - rcu_read_unlock(); + conn_reclaim_net_peer_reqs(connection); if (test_and_clear_bit(SEND_PING, &connection->flags)) { if (drbd_send_ping(connection)) { drbd_err(connection, "drbd_send_ping has failed\n"); goto reconnect; } - connection->meta.socket->sk->sk_rcvtimeo = ping_timeo * HZ / 10; + set_ping_timeout(connection); ping_timeout_active = true; } - /* TODO: conditionally cork; it may hurt latency if we cork without - much to send */ - if (tcp_cork) - drbd_tcp_cork(connection->meta.socket); - if (connection_finish_peer_reqs(connection)) { - drbd_err(connection, "connection_finish_peer_reqs() failed\n"); - goto reconnect; - } - /* but unconditionally uncork unless disabled */ - if (tcp_cork) - drbd_tcp_uncork(connection->meta.socket); - - /* short circuit, recv_msg would return EINTR anyways.
*/ - if (signal_pending(current)) - continue; - + pre_recv_jif = jiffies; rv = drbd_recv_short(connection->meta.socket, buf, expect-received, 0); - clear_bit(SIGNAL_ASENDER, &connection->flags); - - flush_signals(current); /* Note: * -EINTR (on meta) we got a signal @@ -5562,7 +5558,6 @@ int drbd_asender(struct drbd_thread *thi) * rv < expected: "woken" by signal during receive * rv == 0 : "connection shut down by peer" */ -received_more: if (likely(rv > 0)) { received += rv; buf += rv; @@ -5584,8 +5579,7 @@ received_more: } else if (rv == -EAGAIN) { /* If the data socket received something meanwhile, * that is good enough: peer is still alive. */ - if (time_after(connection->last_received, - jiffies - connection->meta.socket->sk->sk_rcvtimeo)) + if (time_after(connection->last_received, pre_recv_jif)) continue; if (ping_timeout_active) { drbd_err(connection, "PingAck did not arrive in time.\n"); @@ -5594,6 +5588,10 @@ received_more: set_bit(SEND_PING, &connection->flags); continue; } else if (rv == -EINTR) { + /* maybe drbd_thread_stop(): the while condition will notice. + * maybe woken for send_ping: we'll send a ping above, + * and change the rcvtimeo */ + flush_signals(current); continue; } else { drbd_err(connection, "sock_recvmsg returned %d\n", rv); @@ -5603,8 +5601,8 @@ received_more: if (received == expect && cmd == NULL) { if (decode_header(connection, connection->meta.rbuf, &pi)) goto reconnect; - cmd = &asender_tbl[pi.cmd]; - if (pi.cmd >= ARRAY_SIZE(asender_tbl) || !cmd->fn) { + cmd = &ack_receiver_tbl[pi.cmd]; + if (pi.cmd >= ARRAY_SIZE(ack_receiver_tbl) || !cmd->fn) { drbd_err(connection, "Unexpected meta packet %s (0x%04x)\n", cmdname(pi.cmd), pi.cmd); goto disconnect; @@ -5627,9 +5625,8 @@ received_more: connection->last_received = jiffies; - if (cmd == &asender_tbl[P_PING_ACK]) { - /* restore idle timeout */ - connection->meta.socket->sk->sk_rcvtimeo = ping_int * HZ; + if (cmd == &ack_receiver_tbl[P_PING_ACK]) { + set_idle_timeout(connection); ping_timeout_active = false; } @@ -5638,11 +5635,6 @@ received_more: expect = header_size; cmd = NULL; } - if (test_bit(SEND_PING, &connection->flags)) - continue; - rv = drbd_recv_short(connection->meta.socket, buf, expect-received, MSG_DONTWAIT); - if (rv > 0) - goto received_more; } if (0) { @@ -5654,9 +5646,41 @@ reconnect: disconnect: conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD); } - clear_bit(SIGNAL_ASENDER, &connection->flags); - drbd_info(connection, "asender terminated\n"); + drbd_info(connection, "ack_receiver terminated\n"); return 0; } + +void drbd_send_acks_wf(struct work_struct *ws) +{ + struct drbd_peer_device *peer_device = + container_of(ws, struct drbd_peer_device, send_acks_work); + struct drbd_connection *connection = peer_device->connection; + struct drbd_device *device = peer_device->device; + struct net_conf *nc; + int tcp_cork, err; + + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + tcp_cork = nc->tcp_cork; + rcu_read_unlock(); + + if (tcp_cork) + drbd_tcp_cork(connection->meta.socket); + + err = drbd_finish_peer_reqs(device); + kref_put(&device->kref, drbd_destroy_device); + /* get is in drbd_endio_write_sec_final(). 
That is necessary to keep the + struct work_struct send_acks_work alive, which is in the peer_device object */ + + if (err) { + conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD); + return; + } + + if (tcp_cork) + drbd_tcp_uncork(connection->meta.socket); + + return; +} diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 3ae2c0086563..2255dcfebd2b 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -453,12 +453,12 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m, kref_get(&req->kref); /* wait for the DONE */ if (!(s & RQ_NET_SENT) && (set & RQ_NET_SENT)) { - /* potentially already completed in the asender thread */ + /* potentially already completed in the ack_receiver thread */ if (!(s & RQ_NET_DONE)) { atomic_add(req->i.size >> 9, &device->ap_in_flight); set_if_null_req_not_net_done(peer_device, req); } - if (s & RQ_NET_PENDING) + if (req->rq_state & RQ_NET_PENDING) set_if_null_req_ack_pending(peer_device, req); } @@ -1095,6 +1095,24 @@ static bool do_remote_read(struct drbd_request *req) return false; } +bool drbd_should_do_remote(union drbd_dev_state s) +{ + return s.pdsk == D_UP_TO_DATE || + (s.pdsk >= D_INCONSISTENT && + s.conn >= C_WF_BITMAP_T && + s.conn < C_AHEAD); + /* Before proto 96 that was >= CONNECTED instead of >= C_WF_BITMAP_T. + That is equivalent since before 96 IO was frozen in the C_WF_BITMAP* + states. */ +} + +static bool drbd_should_send_out_of_sync(union drbd_dev_state s) +{ + return s.conn == C_AHEAD || s.conn == C_WF_BITMAP_S; + /* pdsk = D_INCONSISTENT as a consequence. Protocol 96 check not necessary + since we enter state C_AHEAD only if proto >= 96 */ +} + /* returns number of connections (== 1, for drbd 8.4) * expected to actually write this data, * which does NOT include those that we are L_AHEAD for. */ @@ -1149,7 +1167,6 @@ drbd_submit_req_private_bio(struct drbd_request *req) * stable storage, and this is a WRITE, we may not even submit * this bio. */ if (get_ldev(device)) { - req->pre_submit_jif = jiffies; if (drbd_insert_fault(device, rw == WRITE ? DRBD_FAULT_DT_WR : rw == READ ? DRBD_FAULT_DT_RD @@ -1293,6 +1310,7 @@ static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request &device->pending_master_completion[rw == WRITE]); if (req->private_bio) { /* needs to be marked within the same spinlock */ + req->pre_submit_jif = jiffies; list_add_tail(&req->req_pending_local, &device->pending_completion[rw == WRITE]); _req_mod(req, TO_BE_SUBMITTED); @@ -1513,6 +1531,78 @@ blk_qc_t drbd_make_request(struct request_queue *q, struct bio *bio) return BLK_QC_T_NONE; } +static bool net_timeout_reached(struct drbd_request *net_req, + struct drbd_connection *connection, + unsigned long now, unsigned long ent, + unsigned int ko_count, unsigned int timeout) +{ + struct drbd_device *device = net_req->device; + + if (!time_after(now, net_req->pre_send_jif + ent)) + return false; + + if (time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent)) + return false; + + if (net_req->rq_state & RQ_NET_PENDING) { + drbd_warn(device, "Remote failed to finish a request within %ums > ko-count (%u) * timeout (%u * 0.1s)\n", + jiffies_to_msecs(now - net_req->pre_send_jif), ko_count, timeout); + return true; + } + + /* We received an ACK already (or are using protocol A), + * but are waiting for the epoch closing barrier ack. + * Check if we sent the barrier already. 
We should not blame the peer + * for being unresponsive, if we did not even ask it yet. */ + if (net_req->epoch == connection->send.current_epoch_nr) { + drbd_warn(device, + "We did not send a P_BARRIER for %ums > ko-count (%u) * timeout (%u * 0.1s); drbd kernel thread blocked?\n", + jiffies_to_msecs(now - net_req->pre_send_jif), ko_count, timeout); + return false; + } + + /* Worst case: we may have been blocked for whatever reason, then + * suddenly are able to send a lot of requests (and epoch separating + * barriers) in quick succession. + * The timestamp of the net_req may be much too old and not correspond + * to the sending time of the relevant unack'ed barrier packet, so + * would trigger a spurious timeout. The latest barrier packet may + * have too recent a timestamp to trigger the timeout, so we could + * miss a timeout. Right now we don't have a place to conveniently store + * these timestamps. + * But in this particular situation, the application requests are still + * completed to upper layers, DRBD should still "feel" responsive. + * No need yet to kill this connection, it may still recover. + * If not, eventually we will have queued enough into the network for + * us to block. From that point of view, the timestamp of the last sent + * barrier packet is relevant enough. + */ + if (time_after(now, connection->send.last_sent_barrier_jif + ent)) { + drbd_warn(device, "Remote failed to answer a P_BARRIER (sent at %lu jif; now=%lu jif) within %ums > ko-count (%u) * timeout (%u * 0.1s)\n", + connection->send.last_sent_barrier_jif, now, + jiffies_to_msecs(now - connection->send.last_sent_barrier_jif), ko_count, timeout); + return true; + } + return false; +} + +/* A request is considered timed out, if + * - we have some effective timeout from the configuration, + * with some state restrictions applied, + * - the oldest request is waiting for a response from the network + * resp. the local disk, + * - the oldest request is in fact older than the effective timeout, + * - the connection was established (resp. disk was attached) + * for longer than the timeout already. + * Note that for 32bit jiffies and very stable connections/disks, + * we may have a wrap around, which is caught by + * !time_in_range(now, last_..._jif, last_..._jif + timeout). + * + * Side effect: once per 32bit wrap-around interval, which means every + * ~198 days with 250 HZ, we have a window where the timeout would need + * to expire twice (worst case) to become effective. Good enough. + */ +
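For reference, the effective network timeout checked above is simply ko-count times the configured timeout, with the timeout kept in tenths of a second. A minimal sketch of that conversion, assuming a kernel context where HZ is defined (effective_net_timeout() is an illustrative name, not a function from this patch):

static inline unsigned long effective_net_timeout(unsigned int timeout_ds,
						  unsigned int ko_count)
{
	/* net_conf->timeout is configured in deciseconds (0.1s units),
	 * so scale to jiffies first, then multiply by ko-count. */
	return (unsigned long)timeout_ds * HZ / 10 * ko_count;
}

This mirrors the "ent = timeout * HZ/10 * ko_count" computation in request_timer_fn() below.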
void request_timer_fn(unsigned long data) { struct drbd_device *device = (struct drbd_device *) data; @@ -1522,11 +1612,14 @@ void request_timer_fn(unsigned long data) unsigned long oldest_submit_jif; unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */ unsigned long now; + unsigned int ko_count = 0, timeout = 0; rcu_read_lock(); nc = rcu_dereference(connection->net_conf); - if (nc && device->state.conn >= C_WF_REPORT_PARAMS) - ent = nc->timeout * HZ/10 * nc->ko_count; + if (nc && device->state.conn >= C_WF_REPORT_PARAMS) { + ko_count = nc->ko_count; + timeout = nc->timeout; + } if (get_ldev(device)) { /* implicit state.disk >= D_INCONSISTENT */ dt = rcu_dereference(device->ldev->disk_conf)->disk_timeout * HZ / 10; @@ -1534,6 +1627,8 @@ void request_timer_fn(unsigned long data) } rcu_read_unlock(); + + ent = timeout * HZ/10 * ko_count; et = min_not_zero(dt, ent); if (!et) @@ -1545,11 +1640,22 @@ void request_timer_fn(unsigned long data) spin_lock_irq(&device->resource->req_lock); req_read = list_first_entry_or_null(&device->pending_completion[0], struct drbd_request, req_pending_local); req_write = list_first_entry_or_null(&device->pending_completion[1], struct drbd_request, req_pending_local); - req_peer = connection->req_not_net_done; + /* maybe the oldest request waiting for the peer is in fact still - * blocking in tcp sendmsg */ - if (!req_peer && connection->req_next && connection->req_next->pre_send_jif) - req_peer = connection->req_next; + * blocking in tcp sendmsg. That's ok, though, that's handled via the + * socket send timeout, requesting a ping, and bumping ko-count in + * we_should_drop_the_connection(). + */ + + /* check the oldest request we successfully sent, + * but which is still waiting for an ACK. */ + req_peer = connection->req_ack_pending; + + /* if we don't have such a request (e.g. protocol A) + * check the oldest request which is still waiting on its epoch + * closing barrier ack. */ + if (!req_peer) + req_peer = connection->req_not_net_done; /* evaluate the oldest peer request only in one timer! */ if (req_peer && req_peer->device != device) @@ -1566,28 +1672,9 @@ void request_timer_fn(unsigned long data) : req_write ? req_write->pre_submit_jif : req_read ? req_read->pre_submit_jif : now; - /* The request is considered timed out, if - * - we have some effective timeout from the configuration, - * with above state restrictions applied, - * - the oldest request is waiting for a response from the network - * resp. the local disk, - * - the oldest request is in fact older than the effective timeout, - * - the connection was established (resp. disk was attached) - * for longer than the timeout already. - * Note that for 32bit jiffies and very stable connections/disks, - * we may have a wrap around, which is catched by - * !time_in_range(now, last_..._jif, last_..._jif + timeout). - * - * Side effect: once per 32bit wrap-around interval, which means every - * ~198 days with 250 HZ, we have a window where the timeout would need - * to expire twice (worst case) to become effective. Good enough.
- */ - if (ent && req_peer && - time_after(now, req_peer->pre_send_jif + ent) && - !time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent)) { - drbd_warn(device, "Remote failed to finish a request within ko-count * timeout\n"); + if (ent && req_peer && net_timeout_reached(req_peer, connection, now, ent, ko_count, timeout)) _conn_request_state(connection, NS(conn, C_TIMEOUT), CS_VERBOSE | CS_HARD); - } + if (dt && oldest_submit_jif != now && time_after(now, oldest_submit_jif + dt) && !time_in_range(now, device->last_reattach_jif, device->last_reattach_jif + dt)) { diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h index 9f6a04080e9f..bb2ef78165e5 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h @@ -331,21 +331,6 @@ static inline int req_mod(struct drbd_request *req, return rv; } -static inline bool drbd_should_do_remote(union drbd_dev_state s) -{ - return s.pdsk == D_UP_TO_DATE || - (s.pdsk >= D_INCONSISTENT && - s.conn >= C_WF_BITMAP_T && - s.conn < C_AHEAD); - /* Before proto 96 that was >= CONNECTED instead of >= C_WF_BITMAP_T. - That is equivalent since before 96 IO was frozen in the C_WF_BITMAP* - states. */ -} -static inline bool drbd_should_send_out_of_sync(union drbd_dev_state s) -{ - return s.conn == C_AHEAD || s.conn == C_WF_BITMAP_S; - /* pdsk = D_INCONSISTENT as a consequence. Protocol 96 check not necessary - since we enter state C_AHEAD only if proto >= 96 */ -} +extern bool drbd_should_do_remote(union drbd_dev_state); #endif diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c index 2d7dd269b6a8..5a7ef7873b67 100644 --- a/drivers/block/drbd/drbd_state.c +++ b/drivers/block/drbd/drbd_state.c @@ -29,6 +29,7 @@ #include "drbd_int.h" #include "drbd_protocol.h" #include "drbd_req.h" +#include "drbd_state_change.h" struct after_state_chg_work { struct drbd_work w; @@ -37,6 +38,7 @@ struct after_state_chg_work { union drbd_state ns; enum chg_state_flags flags; struct completion *done; + struct drbd_state_change *state_change; }; enum sanitize_state_warnings { @@ -48,9 +50,248 @@ enum sanitize_state_warnings { IMPLICITLY_UPGRADED_PDSK, }; +static void count_objects(struct drbd_resource *resource, + unsigned int *n_devices, + unsigned int *n_connections) +{ + struct drbd_device *device; + struct drbd_connection *connection; + int vnr; + + *n_devices = 0; + *n_connections = 0; + + idr_for_each_entry(&resource->devices, device, vnr) + (*n_devices)++; + for_each_connection(connection, resource) + (*n_connections)++; +} + +static struct drbd_state_change *alloc_state_change(unsigned int n_devices, unsigned int n_connections, gfp_t gfp) +{ + struct drbd_state_change *state_change; + unsigned int size, n; + + size = sizeof(struct drbd_state_change) + + n_devices * sizeof(struct drbd_device_state_change) + + n_connections * sizeof(struct drbd_connection_state_change) + + n_devices * n_connections * sizeof(struct drbd_peer_device_state_change); + state_change = kmalloc(size, gfp); + if (!state_change) + return NULL; + state_change->n_devices = n_devices; + state_change->n_connections = n_connections; + state_change->devices = (void *)(state_change + 1); + state_change->connections = (void *)&state_change->devices[n_devices]; + state_change->peer_devices = (void *)&state_change->connections[n_connections]; + state_change->resource->resource = NULL; + for (n = 0; n < n_devices; n++) + state_change->devices[n].device = NULL; + for (n = 0; n < n_connections; n++) + 
state_change->connections[n].connection = NULL; + return state_change; +} + +struct drbd_state_change *remember_old_state(struct drbd_resource *resource, gfp_t gfp) +{ + struct drbd_state_change *state_change; + struct drbd_device *device; + unsigned int n_devices; + struct drbd_connection *connection; + unsigned int n_connections; + int vnr; + + struct drbd_device_state_change *device_state_change; + struct drbd_peer_device_state_change *peer_device_state_change; + struct drbd_connection_state_change *connection_state_change; + + /* Caller holds req_lock spinlock. + * No state, no device IDR, no connections lists can change. */ + count_objects(resource, &n_devices, &n_connections); + state_change = alloc_state_change(n_devices, n_connections, gfp); + if (!state_change) + return NULL; + + kref_get(&resource->kref); + state_change->resource->resource = resource; + state_change->resource->role[OLD] = + conn_highest_role(first_connection(resource)); + state_change->resource->susp[OLD] = resource->susp; + state_change->resource->susp_nod[OLD] = resource->susp_nod; + state_change->resource->susp_fen[OLD] = resource->susp_fen; + + connection_state_change = state_change->connections; + for_each_connection(connection, resource) { + kref_get(&connection->kref); + connection_state_change->connection = connection; + connection_state_change->cstate[OLD] = + connection->cstate; + connection_state_change->peer_role[OLD] = + conn_highest_peer(connection); + connection_state_change++; + } + + device_state_change = state_change->devices; + peer_device_state_change = state_change->peer_devices; + idr_for_each_entry(&resource->devices, device, vnr) { + kref_get(&device->kref); + device_state_change->device = device; + device_state_change->disk_state[OLD] = device->state.disk; + + /* The peer_devices for each device have to be enumerated in + the order of the connections. We may not use for_each_peer_device() here. 
*/ + for_each_connection(connection, resource) { + struct drbd_peer_device *peer_device; + + peer_device = conn_peer_device(connection, device->vnr); + peer_device_state_change->peer_device = peer_device; + peer_device_state_change->disk_state[OLD] = + device->state.pdsk; + peer_device_state_change->repl_state[OLD] = + max_t(enum drbd_conns, + C_WF_REPORT_PARAMS, device->state.conn); + peer_device_state_change->resync_susp_user[OLD] = + device->state.user_isp; + peer_device_state_change->resync_susp_peer[OLD] = + device->state.peer_isp; + peer_device_state_change->resync_susp_dependency[OLD] = + device->state.aftr_isp; + peer_device_state_change++; + } + device_state_change++; + } + + return state_change; +} + +static void remember_new_state(struct drbd_state_change *state_change) +{ + struct drbd_resource_state_change *resource_state_change; + struct drbd_resource *resource; + unsigned int n; + + if (!state_change) + return; + + resource_state_change = &state_change->resource[0]; + resource = resource_state_change->resource; + + resource_state_change->role[NEW] = + conn_highest_role(first_connection(resource)); + resource_state_change->susp[NEW] = resource->susp; + resource_state_change->susp_nod[NEW] = resource->susp_nod; + resource_state_change->susp_fen[NEW] = resource->susp_fen; + + for (n = 0; n < state_change->n_devices; n++) { + struct drbd_device_state_change *device_state_change = + &state_change->devices[n]; + struct drbd_device *device = device_state_change->device; + + device_state_change->disk_state[NEW] = device->state.disk; + } + + for (n = 0; n < state_change->n_connections; n++) { + struct drbd_connection_state_change *connection_state_change = + &state_change->connections[n]; + struct drbd_connection *connection = + connection_state_change->connection; + + connection_state_change->cstate[NEW] = connection->cstate; + connection_state_change->peer_role[NEW] = + conn_highest_peer(connection); + } + + for (n = 0; n < state_change->n_devices * state_change->n_connections; n++) { + struct drbd_peer_device_state_change *peer_device_state_change = + &state_change->peer_devices[n]; + struct drbd_device *device = + peer_device_state_change->peer_device->device; + union drbd_dev_state state = device->state; + + peer_device_state_change->disk_state[NEW] = state.pdsk; + peer_device_state_change->repl_state[NEW] = + max_t(enum drbd_conns, C_WF_REPORT_PARAMS, state.conn); + peer_device_state_change->resync_susp_user[NEW] = + state.user_isp; + peer_device_state_change->resync_susp_peer[NEW] = + state.peer_isp; + peer_device_state_change->resync_susp_dependency[NEW] = + state.aftr_isp; + } +} + +void copy_old_to_new_state_change(struct drbd_state_change *state_change) +{ + struct drbd_resource_state_change *resource_state_change = &state_change->resource[0]; + unsigned int n_device, n_connection, n_peer_device, n_peer_devices; + +#define OLD_TO_NEW(x) \ + (x[NEW] = x[OLD]) + + OLD_TO_NEW(resource_state_change->role); + OLD_TO_NEW(resource_state_change->susp); + OLD_TO_NEW(resource_state_change->susp_nod); + OLD_TO_NEW(resource_state_change->susp_fen); + + for (n_connection = 0; n_connection < state_change->n_connections; n_connection++) { + struct drbd_connection_state_change *connection_state_change = + &state_change->connections[n_connection]; + + OLD_TO_NEW(connection_state_change->peer_role); + OLD_TO_NEW(connection_state_change->cstate); + } + + for (n_device = 0; n_device < state_change->n_devices; n_device++) { + struct drbd_device_state_change *device_state_change = + 
&state_change->devices[n_device]; + + OLD_TO_NEW(device_state_change->disk_state); + } + + n_peer_devices = state_change->n_devices * state_change->n_connections; + for (n_peer_device = 0; n_peer_device < n_peer_devices; n_peer_device++) { + struct drbd_peer_device_state_change *p = + &state_change->peer_devices[n_peer_device]; + + OLD_TO_NEW(p->disk_state); + OLD_TO_NEW(p->repl_state); + OLD_TO_NEW(p->resync_susp_user); + OLD_TO_NEW(p->resync_susp_peer); + OLD_TO_NEW(p->resync_susp_dependency); + } + +#undef OLD_TO_NEW +} + +void forget_state_change(struct drbd_state_change *state_change) +{ + unsigned int n; + + if (!state_change) + return; + + if (state_change->resource->resource) + kref_put(&state_change->resource->resource->kref, drbd_destroy_resource); + for (n = 0; n < state_change->n_devices; n++) { + struct drbd_device *device = state_change->devices[n].device; + + if (device) + kref_put(&device->kref, drbd_destroy_device); + } + for (n = 0; n < state_change->n_connections; n++) { + struct drbd_connection *connection = + state_change->connections[n].connection; + + if (connection) + kref_put(&connection->kref, drbd_destroy_connection); + } + kfree(state_change); +} + static int w_after_state_ch(struct drbd_work *w, int unused); static void after_state_ch(struct drbd_device *device, union drbd_state os, - union drbd_state ns, enum chg_state_flags flags); + union drbd_state ns, enum chg_state_flags flags, + struct drbd_state_change *); static enum drbd_state_rv is_valid_state(struct drbd_device *, union drbd_state); static enum drbd_state_rv is_valid_soft_transition(union drbd_state, union drbd_state, struct drbd_connection *); static enum drbd_state_rv is_valid_transition(union drbd_state os, union drbd_state ns); @@ -93,6 +334,7 @@ static enum drbd_role max_role(enum drbd_role role1, enum drbd_role role2) return R_SECONDARY; return R_UNKNOWN; } + static enum drbd_role min_role(enum drbd_role role1, enum drbd_role role2) { if (role1 == R_UNKNOWN || role2 == R_UNKNOWN) @@ -937,7 +1179,7 @@ void drbd_resume_al(struct drbd_device *device) drbd_info(device, "Resumed AL updates\n"); } -/* helper for __drbd_set_state */ +/* helper for _drbd_set_state */ static void set_ov_position(struct drbd_device *device, enum drbd_conns cs) { if (first_peer_device(device)->connection->agreed_pro_version < 90) @@ -965,17 +1207,17 @@ static void set_ov_position(struct drbd_device *device, enum drbd_conns cs) } /** - * __drbd_set_state() - Set a new DRBD state + * _drbd_set_state() - Set a new DRBD state * @device: DRBD device. * @ns: new state. * @flags: Flags * @done: Optional completion, that will get completed after the after_state_ch() finished * - * Caller needs to hold req_lock, and global_state_lock. Do not call directly. + * Caller needs to hold req_lock. Do not call directly. */ enum drbd_state_rv -__drbd_set_state(struct drbd_device *device, union drbd_state ns, - enum chg_state_flags flags, struct completion *done) +_drbd_set_state(struct drbd_device *device, union drbd_state ns, + enum chg_state_flags flags, struct completion *done) { struct drbd_peer_device *peer_device = first_peer_device(device); struct drbd_connection *connection = peer_device ? 
peer_device->connection : NULL; @@ -983,6 +1225,7 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns, enum drbd_state_rv rv = SS_SUCCESS; enum sanitize_state_warnings ssw; struct after_state_chg_work *ascw; + struct drbd_state_change *state_change; os = drbd_read_state(device); @@ -1037,6 +1280,9 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns, if (!is_sync_state(os.conn) && is_sync_state(ns.conn)) clear_bit(RS_DONE, &device->flags); + /* FIXME: Have any flags been set earlier in this function already? */ + state_change = remember_old_state(device->resource, GFP_ATOMIC); + /* changes to local_cnt and device flags should be visible before * changes to state, which again should be visible before anything else * depending on that change happens. */ @@ -1047,6 +1293,8 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns, device->resource->susp_fen = ns.susp_fen; smp_wmb(); + remember_new_state(state_change); + /* put replicated vs not-replicated requests in separate epochs */ if (drbd_should_do_remote((union drbd_dev_state)os.i) != drbd_should_do_remote((union drbd_dev_state)ns.i)) @@ -1184,6 +1432,7 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns, ascw->w.cb = w_after_state_ch; ascw->device = device; ascw->done = done; + ascw->state_change = state_change; drbd_queue_work(&connection->sender_work, &ascw->w); } else { @@ -1199,7 +1448,8 @@ static int w_after_state_ch(struct drbd_work *w, int unused) container_of(w, struct after_state_chg_work, w); struct drbd_device *device = ascw->device; - after_state_ch(device, ascw->os, ascw->ns, ascw->flags); + after_state_ch(device, ascw->os, ascw->ns, ascw->flags, ascw->state_change); + forget_state_change(ascw->state_change); if (ascw->flags & CS_WAIT_COMPLETE) complete(ascw->done); kfree(ascw); @@ -1234,7 +1484,7 @@ int drbd_bitmap_io_from_worker(struct drbd_device *device, D_ASSERT(device, current == first_peer_device(device)->connection->worker.task); /* open coded non-blocking drbd_suspend_io(device); */ - set_bit(SUSPEND_IO, &device->flags); + atomic_inc(&device->suspend_cnt); drbd_bm_lock(device, why, flags); rv = io_fn(device); @@ -1245,6 +1495,139 @@ int drbd_bitmap_io_from_worker(struct drbd_device *device, return rv; } +void notify_resource_state_change(struct sk_buff *skb, + unsigned int seq, + struct drbd_resource_state_change *resource_state_change, + enum drbd_notification_type type) +{ + struct drbd_resource *resource = resource_state_change->resource; + struct resource_info resource_info = { + .res_role = resource_state_change->role[NEW], + .res_susp = resource_state_change->susp[NEW], + .res_susp_nod = resource_state_change->susp_nod[NEW], + .res_susp_fen = resource_state_change->susp_fen[NEW], + }; + + notify_resource_state(skb, seq, resource, &resource_info, type); +} + +void notify_connection_state_change(struct sk_buff *skb, + unsigned int seq, + struct drbd_connection_state_change *connection_state_change, + enum drbd_notification_type type) +{ + struct drbd_connection *connection = connection_state_change->connection; + struct connection_info connection_info = { + .conn_connection_state = connection_state_change->cstate[NEW], + .conn_role = connection_state_change->peer_role[NEW], + }; + + notify_connection_state(skb, seq, connection, &connection_info, type); +} +
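Each of these notify_* wrappers reads only the [NEW] half of a snapshot record; every record keeps its values in a two-element array indexed by OLD and NEW. A self-contained sketch of that pairing, using stand-in names (example_*, and a local OLD/NEW definition) rather than the driver's real types:

enum { OLD, NEW };	/* index into the per-field state pairs */

struct example_device_state_change {
	int disk_state[2];	/* [OLD] captured before the transition, [NEW] after */
};

static int example_has_changed(const struct example_device_state_change *sc)
{
	return sc->disk_state[OLD] != sc->disk_state[NEW];
}

broadcast_state_change() below uses exactly this kind of comparison (its HAS_CHANGED() macro) to skip objects whose state did not move.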
+void notify_device_state_change(struct sk_buff *skb, + unsigned int seq, + struct drbd_device_state_change *device_state_change, + enum drbd_notification_type type) +{ + struct drbd_device *device = device_state_change->device; + struct device_info device_info = { + .dev_disk_state = device_state_change->disk_state[NEW], + }; + + notify_device_state(skb, seq, device, &device_info, type); +} + +void notify_peer_device_state_change(struct sk_buff *skb, + unsigned int seq, + struct drbd_peer_device_state_change *p, + enum drbd_notification_type type) +{ + struct drbd_peer_device *peer_device = p->peer_device; + struct peer_device_info peer_device_info = { + .peer_repl_state = p->repl_state[NEW], + .peer_disk_state = p->disk_state[NEW], + .peer_resync_susp_user = p->resync_susp_user[NEW], + .peer_resync_susp_peer = p->resync_susp_peer[NEW], + .peer_resync_susp_dependency = p->resync_susp_dependency[NEW], + }; + + notify_peer_device_state(skb, seq, peer_device, &peer_device_info, type); +} + +static void broadcast_state_change(struct drbd_state_change *state_change) +{ + struct drbd_resource_state_change *resource_state_change = &state_change->resource[0]; + bool resource_state_has_changed; + unsigned int n_device, n_connection, n_peer_device, n_peer_devices; + void (*last_func)(struct sk_buff *, unsigned int, void *, + enum drbd_notification_type) = NULL; + void *uninitialized_var(last_arg); + +#define HAS_CHANGED(state) ((state)[OLD] != (state)[NEW]) +#define FINAL_STATE_CHANGE(type) \ + ({ if (last_func) \ + last_func(NULL, 0, last_arg, type); \ + }) +#define REMEMBER_STATE_CHANGE(func, arg, type) \ + ({ FINAL_STATE_CHANGE(type | NOTIFY_CONTINUES); \ + last_func = (typeof(last_func))func; \ + last_arg = arg; \ + }) + + mutex_lock(&notification_mutex); + + resource_state_has_changed = + HAS_CHANGED(resource_state_change->role) || + HAS_CHANGED(resource_state_change->susp) || + HAS_CHANGED(resource_state_change->susp_nod) || + HAS_CHANGED(resource_state_change->susp_fen); + + if (resource_state_has_changed) + REMEMBER_STATE_CHANGE(notify_resource_state_change, + resource_state_change, NOTIFY_CHANGE); + + for (n_connection = 0; n_connection < state_change->n_connections; n_connection++) { + struct drbd_connection_state_change *connection_state_change = + &state_change->connections[n_connection]; + + if (HAS_CHANGED(connection_state_change->peer_role) || + HAS_CHANGED(connection_state_change->cstate)) + REMEMBER_STATE_CHANGE(notify_connection_state_change, + connection_state_change, NOTIFY_CHANGE); + } + + for (n_device = 0; n_device < state_change->n_devices; n_device++) { + struct drbd_device_state_change *device_state_change = + &state_change->devices[n_device]; + + if (HAS_CHANGED(device_state_change->disk_state)) + REMEMBER_STATE_CHANGE(notify_device_state_change, + device_state_change, NOTIFY_CHANGE); + } + + n_peer_devices = state_change->n_devices * state_change->n_connections; + for (n_peer_device = 0; n_peer_device < n_peer_devices; n_peer_device++) { + struct drbd_peer_device_state_change *p = + &state_change->peer_devices[n_peer_device]; + + if (HAS_CHANGED(p->disk_state) || + HAS_CHANGED(p->repl_state) || + HAS_CHANGED(p->resync_susp_user) || + HAS_CHANGED(p->resync_susp_peer) || + HAS_CHANGED(p->resync_susp_dependency)) + REMEMBER_STATE_CHANGE(notify_peer_device_state_change, + p, NOTIFY_CHANGE); + } + + FINAL_STATE_CHANGE(NOTIFY_CHANGE); + mutex_unlock(&notification_mutex); + +#undef HAS_CHANGED +#undef FINAL_STATE_CHANGE +#undef REMEMBER_STATE_CHANGE +} + /** * after_state_ch() - Perform after state change actions that may sleep * @device: DRBD device.
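The REMEMBER_STATE_CHANGE()/FINAL_STATE_CHANGE() pair above delays each notification by one step: an event is only emitted once the next one is known to follow (with NOTIFY_CONTINUES set), or at the very end without the flag, so userspace can tell which events belong to one atomic state transition. A stripped-down, runnable sketch of the same hold-back pattern (EXAMPLE_CONTINUES and all example_* names are stand-ins, not part of the patch):

#include <stdio.h>

#define EXAMPLE_CONTINUES 1

static void (*last_func)(const char *, int);
static const char *last_arg;

static void example_emit(const char *what, int flags)
{
	printf("%s%s\n", what, (flags & EXAMPLE_CONTINUES) ? " (continues)" : "");
}

static void example_remember(const char *what)
{
	if (last_func)
		last_func(last_arg, EXAMPLE_CONTINUES);	/* flush the held-back event */
	last_func = example_emit;
	last_arg = what;
}

static void example_final(void)
{
	if (last_func)
		last_func(last_arg, 0);	/* last event of the group: no flag */
}

int main(void)
{
	example_remember("resource changed");
	example_remember("device changed");
	example_final();	/* prints: resource changed (continues), then device changed */
	return 0;
}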
@@ -1253,13 +1636,16 @@ int drbd_bitmap_io_from_worker(struct drbd_device *device, * @flags: Flags */ static void after_state_ch(struct drbd_device *device, union drbd_state os, - union drbd_state ns, enum chg_state_flags flags) + union drbd_state ns, enum chg_state_flags flags, + struct drbd_state_change *state_change) { struct drbd_resource *resource = device->resource; struct drbd_peer_device *peer_device = first_peer_device(device); struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; struct sib_info sib; + broadcast_state_change(state_change); + sib.sib_reason = SIB_STATE_CHANGE; sib.os = os; sib.ns = ns; @@ -1377,7 +1763,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os, } if (ns.pdsk < D_INCONSISTENT && get_ldev(device)) { - if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY && + if (os.peer != R_PRIMARY && ns.peer == R_PRIMARY && device->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { drbd_uuid_new_current(device); drbd_send_uuids(peer_device); @@ -1444,7 +1830,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os, if (os.disk != D_FAILED && ns.disk == D_FAILED) { enum drbd_io_error_p eh = EP_PASS_ON; int was_io_error = 0; - /* corresponding get_ldev was in __drbd_set_state, to serialize + /* corresponding get_ldev was in _drbd_set_state, to serialize * our cleanup here with the transition to D_DISKLESS. * But it is still not safe to dereference ldev here, since * we might come from a failed Attach before ldev was set. */ was_io_error = test_and_clear_bit(WAS_IO_ERROR, &device->flags); + /* Intentionally call this handler first, before drbd_send_state(). + * See: 2932204 drbd: call local-io-error handler early + * People may choose to hard-reset the box from this handler. + * It is useful if this looks like a "regular node crash".
*/ if (was_io_error && eh == EP_CALL_HELPER) drbd_khelper(device, "local-io-error"); @@ -1572,6 +1962,7 @@ struct after_conn_state_chg_work { union drbd_state ns_max; /* new, max state, over all devices */ enum chg_state_flags flags; struct drbd_connection *connection; + struct drbd_state_change *state_change; }; static int w_after_conn_state_ch(struct drbd_work *w, int unused) @@ -1584,6 +1975,8 @@ static int w_after_conn_state_ch(struct drbd_work *w, int unused) struct drbd_peer_device *peer_device; int vnr; + broadcast_state_change(acscw->state_change); + forget_state_change(acscw->state_change); kfree(acscw); /* Upon network configuration, we need to start the receiver */ @@ -1593,6 +1986,13 @@ static int w_after_conn_state_ch(struct drbd_work *w, int unused) if (oc == C_DISCONNECTING && ns_max.conn == C_STANDALONE) { struct net_conf *old_conf; + mutex_lock(&notification_mutex); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) + notify_peer_device_state(NULL, 0, peer_device, NULL, + NOTIFY_DESTROY | NOTIFY_CONTINUES); + notify_connection_state(NULL, 0, connection, NULL, NOTIFY_DESTROY); + mutex_unlock(&notification_mutex); + mutex_lock(&connection->resource->conf_update); old_conf = connection->net_conf; connection->my_addr_len = 0; @@ -1759,7 +2159,7 @@ conn_set_state(struct drbd_connection *connection, union drbd_state mask, union if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED) ns.disk = os.disk; - rv = __drbd_set_state(device, ns, flags, NULL); + rv = _drbd_set_state(device, ns, flags, NULL); if (rv < SS_SUCCESS) BUG(); @@ -1823,6 +2223,7 @@ _conn_request_state(struct drbd_connection *connection, union drbd_state mask, u enum drbd_conns oc = connection->cstate; union drbd_state ns_max, ns_min, os; bool have_mutex = false; + struct drbd_state_change *state_change; if (mask.conn) { rv = is_valid_conn_transition(oc, val.conn); @@ -1868,10 +2269,12 @@ _conn_request_state(struct drbd_connection *connection, union drbd_state mask, u goto abort; } + state_change = remember_old_state(connection->resource, GFP_ATOMIC); conn_old_common_state(connection, &os, &flags); flags |= CS_DC_SUSP; conn_set_state(connection, mask, val, &ns_min, &ns_max, flags); conn_pr_state_change(connection, os, ns_max, flags); + remember_new_state(state_change); acscw = kmalloc(sizeof(*acscw), GFP_ATOMIC); if (acscw) { @@ -1882,6 +2285,7 @@ _conn_request_state(struct drbd_connection *connection, union drbd_state mask, u acscw->w.cb = w_after_conn_state_ch; kref_get(&connection->kref); acscw->connection = connection; + acscw->state_change = state_change; drbd_queue_work(&connection->sender_work, &acscw->w); } else { drbd_err(connection, "Could not kmalloc an acscw\n"); diff --git a/drivers/block/drbd/drbd_state.h b/drivers/block/drbd/drbd_state.h index 7f53c40823cd..bd989536f888 100644 --- a/drivers/block/drbd/drbd_state.h +++ b/drivers/block/drbd/drbd_state.h @@ -122,9 +122,9 @@ extern enum drbd_state_rv _drbd_request_state_holding_state_mutex(struct drbd_device *, union drbd_state, union drbd_state, enum chg_state_flags); -extern enum drbd_state_rv __drbd_set_state(struct drbd_device *, union drbd_state, - enum chg_state_flags, - struct completion *done); +extern enum drbd_state_rv _drbd_set_state(struct drbd_device *, union drbd_state, + enum chg_state_flags, + struct completion *done); extern void print_st_err(struct drbd_device *, union drbd_state, union drbd_state, int);
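The new drbd_state_change.h below declares the snapshot container whose devices, connections, and peer_devices pointers all refer into one single allocation, carved up by alloc_state_change() earlier in this patch. A compact, self-contained sketch of that one-block layout technique (the example_* names are illustrative only):

#include <stdlib.h>

struct example_dev_change  { int state[2]; };
struct example_conn_change { int state[2]; };

struct example_state_change {
	unsigned int n_devs, n_conns;
	struct example_dev_change *devs;	/* points into this same allocation */
	struct example_conn_change *conns;	/* likewise */
};

static struct example_state_change *example_alloc(unsigned int n_devs,
						  unsigned int n_conns)
{
	struct example_state_change *sc;
	size_t size = sizeof(*sc)
		    + n_devs * sizeof(struct example_dev_change)
		    + n_conns * sizeof(struct example_conn_change);

	sc = malloc(size);	/* the kernel code uses kmalloc(size, gfp) */
	if (!sc)
		return NULL;
	sc->n_devs = n_devs;
	sc->n_conns = n_conns;
	sc->devs = (void *)(sc + 1);		/* first array right after the header */
	sc->conns = (void *)&sc->devs[n_devs];	/* second array after the first */
	return sc;
}

One free() (or kfree()) releases the header and both arrays together, which is why forget_state_change() can end in a single kfree(state_change) after dropping its krefs.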
diff --git a/drivers/block/drbd/drbd_state_change.h b/drivers/block/drbd/drbd_state_change.h new file mode 100644 index 000000000000..9e503a1a0bfb --- /dev/null +++ b/drivers/block/drbd/drbd_state_change.h @@ -0,0 +1,63 @@ +#ifndef DRBD_STATE_CHANGE_H +#define DRBD_STATE_CHANGE_H + +struct drbd_resource_state_change { + struct drbd_resource *resource; + enum drbd_role role[2]; + bool susp[2]; + bool susp_nod[2]; + bool susp_fen[2]; +}; + +struct drbd_device_state_change { + struct drbd_device *device; + enum drbd_disk_state disk_state[2]; +}; + +struct drbd_connection_state_change { + struct drbd_connection *connection; + enum drbd_conns cstate[2]; /* drbd9: enum drbd_conn_state */ + enum drbd_role peer_role[2]; +}; + +struct drbd_peer_device_state_change { + struct drbd_peer_device *peer_device; + enum drbd_disk_state disk_state[2]; + enum drbd_conns repl_state[2]; /* drbd9: enum drbd_repl_state */ + bool resync_susp_user[2]; + bool resync_susp_peer[2]; + bool resync_susp_dependency[2]; +}; + +struct drbd_state_change { + struct list_head list; + unsigned int n_devices; + unsigned int n_connections; + struct drbd_resource_state_change resource[1]; + struct drbd_device_state_change *devices; + struct drbd_connection_state_change *connections; + struct drbd_peer_device_state_change *peer_devices; +}; + +extern struct drbd_state_change *remember_old_state(struct drbd_resource *, gfp_t); +extern void copy_old_to_new_state_change(struct drbd_state_change *); +extern void forget_state_change(struct drbd_state_change *); + +extern void notify_resource_state_change(struct sk_buff *, + unsigned int, + struct drbd_resource_state_change *, + enum drbd_notification_type type); +extern void notify_connection_state_change(struct sk_buff *, + unsigned int, + struct drbd_connection_state_change *, + enum drbd_notification_type type); +extern void notify_device_state_change(struct sk_buff *, + unsigned int, + struct drbd_device_state_change *, + enum drbd_notification_type type); +extern void notify_peer_device_state_change(struct sk_buff *, + unsigned int, + struct drbd_peer_device_state_change *, + enum drbd_notification_type type); + +#endif /* DRBD_STATE_CHANGE_H */ diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 5578c1477ba6..eff716c27b1f 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -55,13 +55,6 @@ static int make_resync_request(struct drbd_device *, int); * */ - -/* About the global_state_lock Each state transition on an device holds a read lock. In case we have - to evaluate the resync after dependencies, we grab a write lock, because - we need stable states on all devices for that.
*/ -rwlock_t global_state_lock; - /* used for synchronous meta data and bitmap IO * submitted by drbd_md_sync_page_io() */ @@ -120,6 +113,7 @@ void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(l unsigned long flags = 0; struct drbd_peer_device *peer_device = peer_req->peer_device; struct drbd_device *device = peer_device->device; + struct drbd_connection *connection = peer_device->connection; struct drbd_interval i; int do_wake; u64 block_id; @@ -152,6 +146,12 @@ void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(l * ((peer_req->flags & (EE_WAS_ERROR|EE_IS_TRIM)) == EE_WAS_ERROR) */ if (peer_req->flags & EE_WAS_ERROR) __drbd_chk_io_error(device, DRBD_WRITE_ERROR); + + if (connection->cstate >= C_WF_REPORT_PARAMS) { + kref_get(&device->kref); /* put is in drbd_send_acks_wf() */ + if (!queue_work(connection->ack_sender, &peer_device->send_acks_work)) + kref_put(&device->kref, drbd_destroy_device); + } spin_unlock_irqrestore(&device->resource->req_lock, flags); if (block_id == ID_SYNCER) @@ -163,7 +163,6 @@ void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(l if (do_al_complete_io) drbd_al_complete_io(device, &i); - wake_asender(peer_device->connection); put_ldev(device); } @@ -195,6 +194,12 @@ void drbd_peer_request_endio(struct bio *bio) } } +void drbd_panic_after_delayed_completion_of_aborted_request(struct drbd_device *device) +{ + panic("drbd%u %s/%u potential random memory corruption caused by delayed completion of aborted local request\n", + device->minor, device->resource->name, device->vnr); +} + /* read, readA or write requests on R_PRIMARY coming from drbd_make_request */ void drbd_request_endio(struct bio *bio) @@ -238,7 +243,7 @@ void drbd_request_endio(struct bio *bio) drbd_emerg(device, "delayed completion of aborted local request; disk-timeout may be too aggressive\n"); if (!bio->bi_error) - panic("possible random memory corruption caused by delayed completion of aborted local request\n"); + drbd_panic_after_delayed_completion_of_aborted_request(device); } /* to avoid recursion in __req_mod */ @@ -1291,6 +1296,7 @@ static int drbd_send_barrier(struct drbd_connection *connection) p->barrier = connection->send.current_epoch_nr; p->pad = 0; connection->send.current_epoch_writes = 0; + connection->send.last_sent_barrier_jif = jiffies; return conn_send_command(connection, sock, P_BARRIER, sizeof(*p), NULL, 0); } @@ -1315,6 +1321,7 @@ static void re_init_if_first_write(struct drbd_connection *connection, unsigned connection->send.seen_any_write_yet = true; connection->send.current_epoch_nr = epoch; connection->send.current_epoch_writes = 0; + connection->send.last_sent_barrier_jif = jiffies; } } @@ -1456,70 +1463,73 @@ static int _drbd_may_sync_now(struct drbd_device *device) } /** - * _drbd_pause_after() - Pause resync on all devices that may not resync now + * drbd_pause_after() - Pause resync on all devices that may not resync now * @device: DRBD device. * * Called from process context only (admin command and after_state_ch). 
*/ -static int _drbd_pause_after(struct drbd_device *device) +static bool drbd_pause_after(struct drbd_device *device) { + bool changed = false; struct drbd_device *odev; - int i, rv = 0; + int i; rcu_read_lock(); idr_for_each_entry(&drbd_devices, odev, i) { if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS) continue; - if (!_drbd_may_sync_now(odev)) - rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL) - != SS_NOTHING_TO_DO); + if (!_drbd_may_sync_now(odev) && + _drbd_set_state(_NS(odev, aftr_isp, 1), + CS_HARD, NULL) != SS_NOTHING_TO_DO) + changed = true; } rcu_read_unlock(); - return rv; + return changed; } /** - * _drbd_resume_next() - Resume resync on all devices that may resync now + * drbd_resume_next() - Resume resync on all devices that may resync now * @device: DRBD device. * * Called from process context only (admin command and worker). */ -static int _drbd_resume_next(struct drbd_device *device) +static bool drbd_resume_next(struct drbd_device *device) { + bool changed = false; struct drbd_device *odev; - int i, rv = 0; + int i; rcu_read_lock(); idr_for_each_entry(&drbd_devices, odev, i) { if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS) continue; if (odev->state.aftr_isp) { - if (_drbd_may_sync_now(odev)) - rv |= (__drbd_set_state(_NS(odev, aftr_isp, 0), - CS_HARD, NULL) - != SS_NOTHING_TO_DO) ; + if (_drbd_may_sync_now(odev) && + _drbd_set_state(_NS(odev, aftr_isp, 0), + CS_HARD, NULL) != SS_NOTHING_TO_DO) + changed = true; } } rcu_read_unlock(); - return rv; + return changed; } void resume_next_sg(struct drbd_device *device) { - write_lock_irq(&global_state_lock); - _drbd_resume_next(device); - write_unlock_irq(&global_state_lock); + lock_all_resources(); + drbd_resume_next(device); + unlock_all_resources(); } void suspend_other_sg(struct drbd_device *device) { - write_lock_irq(&global_state_lock); - _drbd_pause_after(device); - write_unlock_irq(&global_state_lock); + lock_all_resources(); + drbd_pause_after(device); + unlock_all_resources(); } -/* caller must hold global_state_lock */ +/* caller must lock_all_resources() */ enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_minor) { struct drbd_device *odev; @@ -1557,15 +1567,15 @@ enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_min } } -/* caller must hold global_state_lock */ +/* caller must lock_all_resources() */ void drbd_resync_after_changed(struct drbd_device *device) { - int changes; + int changed; do { - changes = _drbd_pause_after(device); - changes |= _drbd_resume_next(device); - } while (changes); + changed = drbd_pause_after(device); + changed |= drbd_resume_next(device); + } while (changed); } void drbd_rs_controller_reset(struct drbd_device *device) @@ -1685,19 +1695,14 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) } else { mutex_lock(device->state_mutex); } - clear_bit(B_RS_H_DONE, &device->flags); - /* req_lock: serialize with drbd_send_and_submit() and others - * global_state_lock: for stable sync-after dependencies */ - spin_lock_irq(&device->resource->req_lock); - write_lock(&global_state_lock); + lock_all_resources(); + clear_bit(B_RS_H_DONE, &device->flags); /* Did some connection breakage or IO error race with us? 
*/ if (device->state.conn < C_CONNECTED || !get_ldev_if_state(device, D_NEGOTIATING)) { - write_unlock(&global_state_lock); - spin_unlock_irq(&device->resource->req_lock); - mutex_unlock(device->state_mutex); - return; + unlock_all_resources(); + goto out; } ns = drbd_read_state(device); @@ -1711,7 +1716,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) else /* side == C_SYNC_SOURCE */ ns.pdsk = D_INCONSISTENT; - r = __drbd_set_state(device, ns, CS_VERBOSE, NULL); + r = _drbd_set_state(device, ns, CS_VERBOSE, NULL); ns = drbd_read_state(device); if (ns.conn < C_CONNECTED) @@ -1732,7 +1737,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) device->rs_mark_left[i] = tw; device->rs_mark_time[i] = now; } - _drbd_pause_after(device); + drbd_pause_after(device); /* Forget potentially stale cached per resync extent bit-counts. * Open coded drbd_rs_cancel_all(device), we already have IRQs * disabled, and know the disk state is ok. */ @@ -1742,8 +1747,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) device->resync_wenr = LC_FREE; spin_unlock(&device->al_lock); } - write_unlock(&global_state_lock); - spin_unlock_irq(&device->resource->req_lock); + unlock_all_resources(); if (r == SS_SUCCESS) { wake_up(&device->al_wait); /* for lc_reset() above */ @@ -1807,6 +1811,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) drbd_md_sync(device); } put_ldev(device); +out: mutex_unlock(device->state_mutex); } @@ -1836,7 +1841,7 @@ static void drbd_ldev_destroy(struct drbd_device *device) device->act_log = NULL; __acquire(local); - drbd_free_ldev(device->ldev); + drbd_backing_dev_free(device, device->ldev); device->ldev = NULL; __release(local); diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 15bec407ac37..9b180dbbd03c 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -104,9 +104,9 @@ /* Device instance number, incremented each time a device is probed. 
*/ static int instance; -struct list_head online_list; -struct list_head removing_list; -spinlock_t dev_lock; +static struct list_head online_list; +static struct list_head removing_list; +static spinlock_t dev_lock; /* * Global variable used to hold the major block device number diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 95dff91135ad..8ba1e97d573c 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -436,9 +436,8 @@ static void null_del_dev(struct nullb *nullb) static void null_lnvm_end_io(struct request *rq, int error) { struct nvm_rq *rqd = rq->end_io_data; - struct nvm_dev *dev = rqd->dev; - dev->mt->end_io(rqd, error); + nvm_end_io(rqd, error); blk_put_request(rq); } @@ -495,17 +494,17 @@ static int null_lnvm_id(struct nvm_dev *dev, struct nvm_id *id) id->ppaf.ch_offset = 56; id->ppaf.ch_len = 8; - do_div(size, bs); /* convert size to pages */ - do_div(size, 256); /* concert size to pgs pr blk */ + sector_div(size, bs); /* convert size to pages */ + size >>= 8; /* convert size to pgs per blk */ grp = &id->groups[0]; grp->mtype = 0; grp->fmtype = 0; grp->num_ch = 1; grp->num_pg = 256; blksize = size; - do_div(size, (1 << 16)); + size >>= 16; grp->num_lun = size + 1; - do_div(blksize, grp->num_lun); + sector_div(blksize, grp->num_lun); grp->num_blk = blksize; grp->num_pln = 1; diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 81ea69fee7ca..4a876785b68c 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -5185,8 +5185,7 @@ static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth) out_err: rbd_dev_unparent(rbd_dev); - if (parent) - rbd_dev_destroy(parent); + rbd_dev_destroy(parent); return ret; } diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c index 59c91d49b14b..ba4bfe933276 100644 --- a/drivers/block/sx8.c +++ b/drivers/block/sx8.c @@ -23,7 +23,7 @@ #include <linux/workqueue.h> #include <linux/bitops.h> #include <linux/delay.h> -#include <linux/time.h> +#include <linux/ktime.h> #include <linux/hdreg.h> #include <linux/dma-mapping.h> #include <linux/completion.h> @@ -671,16 +671,15 @@ static int carm_send_special (struct carm_host *host, carm_sspc_t func) static unsigned int carm_fill_sync_time(struct carm_host *host, unsigned int idx, void *mem) { - struct timeval tv; struct carm_msg_sync_time *st = mem; - do_gettimeofday(&tv); + time64_t tv = ktime_get_real_seconds(); memset(st, 0, sizeof(*st)); st->type = CARM_MSG_MISC; st->subtype = MISC_SET_TIME; st->handle = cpu_to_le32(TAG_ENCODE(idx)); - st->timestamp = cpu_to_le32(tv.tv_sec); + st->timestamp = cpu_to_le32(tv); return sizeof(struct carm_msg_sync_time); } diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 41fb1a917b17..4809c1501d7e 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -84,6 +84,16 @@ MODULE_PARM_DESC(max_persistent_grants, "Maximum number of grants to map persistently"); /* + * Maximum number of rings/queues blkback supports, allow as many queues as there + * are CPUs if the user has not specified a value. + */ +unsigned int xenblk_max_queues; +module_param_named(max_queues, xenblk_max_queues, uint, 0644); +MODULE_PARM_DESC(max_queues, + "Maximum number of hardware queues per virtual disk. " \ + "By default it is the number of online CPUs."); + +/* * Maximum order of pages to be used for the shared ring between front and * backend, 4KB page granularity is used.
*/ @@ -113,71 +123,71 @@ module_param(log_stats, int, 0644); /* Number of free pages to remove on each call to gnttab_free_pages */ #define NUM_BATCH_FREE_PAGES 10 -static inline int get_free_page(struct xen_blkif *blkif, struct page **page) +static inline int get_free_page(struct xen_blkif_ring *ring, struct page **page) { unsigned long flags; - spin_lock_irqsave(&blkif->free_pages_lock, flags); - if (list_empty(&blkif->free_pages)) { - BUG_ON(blkif->free_pages_num != 0); - spin_unlock_irqrestore(&blkif->free_pages_lock, flags); + spin_lock_irqsave(&ring->free_pages_lock, flags); + if (list_empty(&ring->free_pages)) { + BUG_ON(ring->free_pages_num != 0); + spin_unlock_irqrestore(&ring->free_pages_lock, flags); return gnttab_alloc_pages(1, page); } - BUG_ON(blkif->free_pages_num == 0); - page[0] = list_first_entry(&blkif->free_pages, struct page, lru); + BUG_ON(ring->free_pages_num == 0); + page[0] = list_first_entry(&ring->free_pages, struct page, lru); list_del(&page[0]->lru); - blkif->free_pages_num--; - spin_unlock_irqrestore(&blkif->free_pages_lock, flags); + ring->free_pages_num--; + spin_unlock_irqrestore(&ring->free_pages_lock, flags); return 0; } -static inline void put_free_pages(struct xen_blkif *blkif, struct page **page, +static inline void put_free_pages(struct xen_blkif_ring *ring, struct page **page, int num) { unsigned long flags; int i; - spin_lock_irqsave(&blkif->free_pages_lock, flags); + spin_lock_irqsave(&ring->free_pages_lock, flags); for (i = 0; i < num; i++) - list_add(&page[i]->lru, &blkif->free_pages); - blkif->free_pages_num += num; - spin_unlock_irqrestore(&blkif->free_pages_lock, flags); + list_add(&page[i]->lru, &ring->free_pages); + ring->free_pages_num += num; + spin_unlock_irqrestore(&ring->free_pages_lock, flags); } -static inline void shrink_free_pagepool(struct xen_blkif *blkif, int num) +static inline void shrink_free_pagepool(struct xen_blkif_ring *ring, int num) { /* Remove requested pages in batches of NUM_BATCH_FREE_PAGES */ struct page *page[NUM_BATCH_FREE_PAGES]; unsigned int num_pages = 0; unsigned long flags; - spin_lock_irqsave(&blkif->free_pages_lock, flags); - while (blkif->free_pages_num > num) { - BUG_ON(list_empty(&blkif->free_pages)); - page[num_pages] = list_first_entry(&blkif->free_pages, + spin_lock_irqsave(&ring->free_pages_lock, flags); + while (ring->free_pages_num > num) { + BUG_ON(list_empty(&ring->free_pages)); + page[num_pages] = list_first_entry(&ring->free_pages, struct page, lru); list_del(&page[num_pages]->lru); - blkif->free_pages_num--; + ring->free_pages_num--; if (++num_pages == NUM_BATCH_FREE_PAGES) { - spin_unlock_irqrestore(&blkif->free_pages_lock, flags); + spin_unlock_irqrestore(&ring->free_pages_lock, flags); gnttab_free_pages(num_pages, page); - spin_lock_irqsave(&blkif->free_pages_lock, flags); + spin_lock_irqsave(&ring->free_pages_lock, flags); num_pages = 0; } } - spin_unlock_irqrestore(&blkif->free_pages_lock, flags); + spin_unlock_irqrestore(&ring->free_pages_lock, flags); if (num_pages != 0) gnttab_free_pages(num_pages, page); } #define vaddr(page) ((unsigned long)pfn_to_kaddr(page_to_pfn(page))) -static int do_block_io_op(struct xen_blkif *blkif); -static int dispatch_rw_block_io(struct xen_blkif *blkif, +static int do_block_io_op(struct xen_blkif_ring *ring); +static int dispatch_rw_block_io(struct xen_blkif_ring *ring, struct blkif_request *req, struct pending_req *pending_req); -static void make_response(struct xen_blkif *blkif, u64 id, +static void make_response(struct xen_blkif_ring *ring, u64 id, 
unsigned short op, int st); #define foreach_grant_safe(pos, n, rbtree, node) \ @@ -190,7 +200,7 @@ static void make_response(struct xen_blkif *blkif, u64 id, /* * We don't need locking around the persistent grant helpers - * because blkback uses a single-thread for each backed, so we + * because blkback uses a single-thread for each backend, so we * can be sure that this functions will never be called recursively. * * The only exception to that is put_persistent_grant, that can be called @@ -198,19 +208,20 @@ static void make_response(struct xen_blkif *blkif, u64 id, * bit operations to modify the flags of a persistent grant and to count * the number of used grants. */ -static int add_persistent_gnt(struct xen_blkif *blkif, +static int add_persistent_gnt(struct xen_blkif_ring *ring, struct persistent_gnt *persistent_gnt) { struct rb_node **new = NULL, *parent = NULL; struct persistent_gnt *this; + struct xen_blkif *blkif = ring->blkif; - if (blkif->persistent_gnt_c >= xen_blkif_max_pgrants) { + if (ring->persistent_gnt_c >= xen_blkif_max_pgrants) { if (!blkif->vbd.overflow_max_grants) blkif->vbd.overflow_max_grants = 1; return -EBUSY; } /* Figure out where to put new node */ - new = &blkif->persistent_gnts.rb_node; + new = &ring->persistent_gnts.rb_node; while (*new) { this = container_of(*new, struct persistent_gnt, node); @@ -229,19 +240,19 @@ static int add_persistent_gnt(struct xen_blkif *blkif, set_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags); /* Add new node and rebalance tree. */ rb_link_node(&(persistent_gnt->node), parent, new); - rb_insert_color(&(persistent_gnt->node), &blkif->persistent_gnts); - blkif->persistent_gnt_c++; - atomic_inc(&blkif->persistent_gnt_in_use); + rb_insert_color(&(persistent_gnt->node), &ring->persistent_gnts); + ring->persistent_gnt_c++; + atomic_inc(&ring->persistent_gnt_in_use); return 0; } -static struct persistent_gnt *get_persistent_gnt(struct xen_blkif *blkif, +static struct persistent_gnt *get_persistent_gnt(struct xen_blkif_ring *ring, grant_ref_t gref) { struct persistent_gnt *data; struct rb_node *node = NULL; - node = blkif->persistent_gnts.rb_node; + node = ring->persistent_gnts.rb_node; while (node) { data = container_of(node, struct persistent_gnt, node); @@ -255,24 +266,24 @@ static struct persistent_gnt *get_persistent_gnt(struct xen_blkif *blkif, return NULL; } set_bit(PERSISTENT_GNT_ACTIVE, data->flags); - atomic_inc(&blkif->persistent_gnt_in_use); + atomic_inc(&ring->persistent_gnt_in_use); return data; } } return NULL; } -static void put_persistent_gnt(struct xen_blkif *blkif, +static void put_persistent_gnt(struct xen_blkif_ring *ring, struct persistent_gnt *persistent_gnt) { if(!test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags)) pr_alert_ratelimited("freeing a grant already unused\n"); set_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags); clear_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags); - atomic_dec(&blkif->persistent_gnt_in_use); + atomic_dec(&ring->persistent_gnt_in_use); } -static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root, +static void free_persistent_gnts(struct xen_blkif_ring *ring, struct rb_root *root, unsigned int num) { struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; @@ -303,7 +314,7 @@ static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root, unmap_data.count = segs_to_unmap; BUG_ON(gnttab_unmap_refs_sync(&unmap_data)); - put_free_pages(blkif, pages, segs_to_unmap); + put_free_pages(ring, pages, segs_to_unmap); segs_to_unmap = 0; 
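		/*
		 * Note: the unmaps above are flushed in batches of
		 * BLKIF_MAX_SEGMENTS_PER_REQUEST, which is why segs_to_unmap
		 * restarts from zero before the next batch is gathered.
		 */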
} @@ -320,15 +331,15 @@ void xen_blkbk_unmap_purged_grants(struct work_struct *work) struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; struct persistent_gnt *persistent_gnt; int segs_to_unmap = 0; - struct xen_blkif *blkif = container_of(work, typeof(*blkif), persistent_purge_work); + struct xen_blkif_ring *ring = container_of(work, typeof(*ring), persistent_purge_work); struct gntab_unmap_queue_data unmap_data; unmap_data.pages = pages; unmap_data.unmap_ops = unmap; unmap_data.kunmap_ops = NULL; - while(!list_empty(&blkif->persistent_purge_list)) { - persistent_gnt = list_first_entry(&blkif->persistent_purge_list, + while(!list_empty(&ring->persistent_purge_list)) { + persistent_gnt = list_first_entry(&ring->persistent_purge_list, struct persistent_gnt, remove_node); list_del(&persistent_gnt->remove_node); @@ -343,7 +354,7 @@ void xen_blkbk_unmap_purged_grants(struct work_struct *work) if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST) { unmap_data.count = segs_to_unmap; BUG_ON(gnttab_unmap_refs_sync(&unmap_data)); - put_free_pages(blkif, pages, segs_to_unmap); + put_free_pages(ring, pages, segs_to_unmap); segs_to_unmap = 0; } kfree(persistent_gnt); @@ -351,11 +362,11 @@ void xen_blkbk_unmap_purged_grants(struct work_struct *work) if (segs_to_unmap > 0) { unmap_data.count = segs_to_unmap; BUG_ON(gnttab_unmap_refs_sync(&unmap_data)); - put_free_pages(blkif, pages, segs_to_unmap); + put_free_pages(ring, pages, segs_to_unmap); } } -static void purge_persistent_gnt(struct xen_blkif *blkif) +static void purge_persistent_gnt(struct xen_blkif_ring *ring) { struct persistent_gnt *persistent_gnt; struct rb_node *n; @@ -363,23 +374,23 @@ static void purge_persistent_gnt(struct xen_blkif *blkif) bool scan_used = false, clean_used = false; struct rb_root *root; - if (blkif->persistent_gnt_c < xen_blkif_max_pgrants || - (blkif->persistent_gnt_c == xen_blkif_max_pgrants && - !blkif->vbd.overflow_max_grants)) { - return; + if (ring->persistent_gnt_c < xen_blkif_max_pgrants || + (ring->persistent_gnt_c == xen_blkif_max_pgrants && + !ring->blkif->vbd.overflow_max_grants)) { + goto out; } - if (work_busy(&blkif->persistent_purge_work)) { + if (work_busy(&ring->persistent_purge_work)) { pr_alert_ratelimited("Scheduled work from previous purge is still busy, cannot purge list\n"); - return; + goto out; } num_clean = (xen_blkif_max_pgrants / 100) * LRU_PERCENT_CLEAN; - num_clean = blkif->persistent_gnt_c - xen_blkif_max_pgrants + num_clean; - num_clean = min(blkif->persistent_gnt_c, num_clean); + num_clean = ring->persistent_gnt_c - xen_blkif_max_pgrants + num_clean; + num_clean = min(ring->persistent_gnt_c, num_clean); if ((num_clean == 0) || - (num_clean > (blkif->persistent_gnt_c - atomic_read(&blkif->persistent_gnt_in_use)))) - return; + (num_clean > (ring->persistent_gnt_c - atomic_read(&ring->persistent_gnt_in_use)))) + goto out; /* * At this point, we can assure that there will be no calls @@ -394,8 +405,8 @@ static void purge_persistent_gnt(struct xen_blkif *blkif) pr_debug("Going to purge %u persistent grants\n", num_clean); - BUG_ON(!list_empty(&blkif->persistent_purge_list)); - root = &blkif->persistent_gnts; + BUG_ON(!list_empty(&ring->persistent_purge_list)); + root = &ring->persistent_gnts; purge_list: foreach_grant_safe(persistent_gnt, n, root, node) { BUG_ON(persistent_gnt->handle == @@ -414,7 +425,7 @@ purge_list: rb_erase(&persistent_gnt->node, root); list_add(&persistent_gnt->remove_node, - &blkif->persistent_purge_list); + &ring->persistent_purge_list); if (--num_clean == 0) goto 
finished; } @@ -435,30 +446,32 @@ finished: goto purge_list; } - blkif->persistent_gnt_c -= (total - num_clean); - blkif->vbd.overflow_max_grants = 0; + ring->persistent_gnt_c -= (total - num_clean); + ring->blkif->vbd.overflow_max_grants = 0; /* We can defer this work */ - schedule_work(&blkif->persistent_purge_work); + schedule_work(&ring->persistent_purge_work); pr_debug("Purged %u/%u\n", (total - num_clean), total); + +out: return; } /* * Retrieve from the 'pending_reqs' a free pending_req structure to be used. */ -static struct pending_req *alloc_req(struct xen_blkif *blkif) +static struct pending_req *alloc_req(struct xen_blkif_ring *ring) { struct pending_req *req = NULL; unsigned long flags; - spin_lock_irqsave(&blkif->pending_free_lock, flags); - if (!list_empty(&blkif->pending_free)) { - req = list_entry(blkif->pending_free.next, struct pending_req, + spin_lock_irqsave(&ring->pending_free_lock, flags); + if (!list_empty(&ring->pending_free)) { + req = list_entry(ring->pending_free.next, struct pending_req, free_list); list_del(&req->free_list); } - spin_unlock_irqrestore(&blkif->pending_free_lock, flags); + spin_unlock_irqrestore(&ring->pending_free_lock, flags); return req; } @@ -466,17 +479,17 @@ static struct pending_req *alloc_req(struct xen_blkif *blkif) * Return the 'pending_req' structure back to the freepool. We also * wake up the thread if it was waiting for a free page. */ -static void free_req(struct xen_blkif *blkif, struct pending_req *req) +static void free_req(struct xen_blkif_ring *ring, struct pending_req *req) { unsigned long flags; int was_empty; - spin_lock_irqsave(&blkif->pending_free_lock, flags); - was_empty = list_empty(&blkif->pending_free); - list_add(&req->free_list, &blkif->pending_free); - spin_unlock_irqrestore(&blkif->pending_free_lock, flags); + spin_lock_irqsave(&ring->pending_free_lock, flags); + was_empty = list_empty(&ring->pending_free); + list_add(&req->free_list, &ring->pending_free); + spin_unlock_irqrestore(&ring->pending_free_lock, flags); if (was_empty) - wake_up(&blkif->pending_free_wq); + wake_up(&ring->pending_free_wq); } /* @@ -556,10 +569,10 @@ abort: /* * Notification from the guest OS. 
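 *
 * The event-channel interrupt only marks the ring as having work and
 * wakes its kthread; the actual request processing happens in
 * xen_blkif_schedule() below.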
*/ -static void blkif_notify_work(struct xen_blkif *blkif) +static void blkif_notify_work(struct xen_blkif_ring *ring) { - blkif->waiting_reqs = 1; - wake_up(&blkif->wq); + ring->waiting_reqs = 1; + wake_up(&ring->wq); } irqreturn_t xen_blkif_be_int(int irq, void *dev_id) @@ -572,31 +585,33 @@ irqreturn_t xen_blkif_be_int(int irq, void *dev_id) * SCHEDULER FUNCTIONS */ -static void print_stats(struct xen_blkif *blkif) +static void print_stats(struct xen_blkif_ring *ring) { pr_info("(%s): oo %3llu | rd %4llu | wr %4llu | f %4llu" " | ds %4llu | pg: %4u/%4d\n", - current->comm, blkif->st_oo_req, - blkif->st_rd_req, blkif->st_wr_req, - blkif->st_f_req, blkif->st_ds_req, - blkif->persistent_gnt_c, + current->comm, ring->st_oo_req, + ring->st_rd_req, ring->st_wr_req, + ring->st_f_req, ring->st_ds_req, + ring->persistent_gnt_c, xen_blkif_max_pgrants); - blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000); - blkif->st_rd_req = 0; - blkif->st_wr_req = 0; - blkif->st_oo_req = 0; - blkif->st_ds_req = 0; + ring->st_print = jiffies + msecs_to_jiffies(10 * 1000); + ring->st_rd_req = 0; + ring->st_wr_req = 0; + ring->st_oo_req = 0; + ring->st_ds_req = 0; } int xen_blkif_schedule(void *arg) { - struct xen_blkif *blkif = arg; + struct xen_blkif_ring *ring = arg; + struct xen_blkif *blkif = ring->blkif; struct xen_vbd *vbd = &blkif->vbd; unsigned long timeout; int ret; xen_blkif_get(blkif); + set_freezable(); while (!kthread_should_stop()) { if (try_to_freeze()) continue; @@ -606,50 +621,50 @@ int xen_blkif_schedule(void *arg) timeout = msecs_to_jiffies(LRU_INTERVAL); timeout = wait_event_interruptible_timeout( - blkif->wq, - blkif->waiting_reqs || kthread_should_stop(), + ring->wq, + ring->waiting_reqs || kthread_should_stop(), timeout); if (timeout == 0) goto purge_gnt_list; timeout = wait_event_interruptible_timeout( - blkif->pending_free_wq, - !list_empty(&blkif->pending_free) || + ring->pending_free_wq, + !list_empty(&ring->pending_free) || kthread_should_stop(), timeout); if (timeout == 0) goto purge_gnt_list; - blkif->waiting_reqs = 0; + ring->waiting_reqs = 0; smp_mb(); /* clear flag *before* checking for work */ - ret = do_block_io_op(blkif); + ret = do_block_io_op(ring); if (ret > 0) - blkif->waiting_reqs = 1; + ring->waiting_reqs = 1; if (ret == -EACCES) - wait_event_interruptible(blkif->shutdown_wq, + wait_event_interruptible(ring->shutdown_wq, kthread_should_stop()); purge_gnt_list: if (blkif->vbd.feature_gnt_persistent && - time_after(jiffies, blkif->next_lru)) { - purge_persistent_gnt(blkif); - blkif->next_lru = jiffies + msecs_to_jiffies(LRU_INTERVAL); + time_after(jiffies, ring->next_lru)) { + purge_persistent_gnt(ring); + ring->next_lru = jiffies + msecs_to_jiffies(LRU_INTERVAL); } /* Shrink if we have more than xen_blkif_max_buffer_pages */ - shrink_free_pagepool(blkif, xen_blkif_max_buffer_pages); + shrink_free_pagepool(ring, xen_blkif_max_buffer_pages); - if (log_stats && time_after(jiffies, blkif->st_print)) - print_stats(blkif); + if (log_stats && time_after(jiffies, ring->st_print)) + print_stats(ring); } /* Drain pending purge work */ - flush_work(&blkif->persistent_purge_work); + flush_work(&ring->persistent_purge_work); if (log_stats) - print_stats(blkif); + print_stats(ring); - blkif->xenblkd = NULL; + ring->xenblkd = NULL; xen_blkif_put(blkif); return 0; @@ -658,22 +673,22 @@ purge_gnt_list: /* * Remove persistent grants and empty the pool of free pages */ -void xen_blkbk_free_caches(struct xen_blkif *blkif) +void xen_blkbk_free_caches(struct xen_blkif_ring *ring) { /* Free 
all persistent grant pages */ - if (!RB_EMPTY_ROOT(&blkif->persistent_gnts)) - free_persistent_gnts(blkif, &blkif->persistent_gnts, - blkif->persistent_gnt_c); + if (!RB_EMPTY_ROOT(&ring->persistent_gnts)) + free_persistent_gnts(ring, &ring->persistent_gnts, + ring->persistent_gnt_c); - BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts)); - blkif->persistent_gnt_c = 0; + BUG_ON(!RB_EMPTY_ROOT(&ring->persistent_gnts)); + ring->persistent_gnt_c = 0; /* Since we are shutting down remove all pages from the buffer */ - shrink_free_pagepool(blkif, 0 /* All */); + shrink_free_pagepool(ring, 0 /* All */); } static unsigned int xen_blkbk_unmap_prepare( - struct xen_blkif *blkif, + struct xen_blkif_ring *ring, struct grant_page **pages, unsigned int num, struct gnttab_unmap_grant_ref *unmap_ops, @@ -683,7 +698,7 @@ static unsigned int xen_blkbk_unmap_prepare( for (i = 0; i < num; i++) { if (pages[i]->persistent_gnt != NULL) { - put_persistent_gnt(blkif, pages[i]->persistent_gnt); + put_persistent_gnt(ring, pages[i]->persistent_gnt); continue; } if (pages[i]->handle == BLKBACK_INVALID_HANDLE) @@ -700,17 +715,18 @@ static unsigned int xen_blkbk_unmap_prepare( static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_queue_data *data) { - struct pending_req* pending_req = (struct pending_req*) (data->data); - struct xen_blkif *blkif = pending_req->blkif; + struct pending_req *pending_req = (struct pending_req *)(data->data); + struct xen_blkif_ring *ring = pending_req->ring; + struct xen_blkif *blkif = ring->blkif; /* BUG_ON used to reproduce existing behaviour, but is this the best way to deal with this? */ BUG_ON(result); - put_free_pages(blkif, data->pages, data->count); - make_response(blkif, pending_req->id, + put_free_pages(ring, data->pages, data->count); + make_response(ring, pending_req->id, pending_req->operation, pending_req->status); - free_req(blkif, pending_req); + free_req(ring, pending_req); /* * Make sure the request is freed before releasing blkif, * or there could be a race between free_req and the @@ -723,7 +739,7 @@ static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_ * pending_free_wq if there's a drain going on, but it has * to be taken into account if the current model is changed. */ - if (atomic_dec_and_test(&blkif->inflight) && atomic_read(&blkif->drain)) { + if (atomic_dec_and_test(&ring->inflight) && atomic_read(&blkif->drain)) { complete(&blkif->drain_complete); } xen_blkif_put(blkif); @@ -732,11 +748,11 @@ static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_ static void xen_blkbk_unmap_and_respond(struct pending_req *req) { struct gntab_unmap_queue_data* work = &req->gnttab_unmap_data; - struct xen_blkif *blkif = req->blkif; + struct xen_blkif_ring *ring = req->ring; struct grant_page **pages = req->segments; unsigned int invcount; - invcount = xen_blkbk_unmap_prepare(blkif, pages, req->nr_segs, + invcount = xen_blkbk_unmap_prepare(ring, pages, req->nr_segs, req->unmap, req->unmap_pages); work->data = req; @@ -757,7 +773,7 @@ static void xen_blkbk_unmap_and_respond(struct pending_req *req) * of hypercalls, but since this is only used in error paths there's * no real need. 
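 * (The normal completion path uses the asynchronous
 * xen_blkbk_unmap_and_respond() variant above instead.)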
*/ -static void xen_blkbk_unmap(struct xen_blkif *blkif, +static void xen_blkbk_unmap(struct xen_blkif_ring *ring, struct grant_page *pages[], int num) { @@ -768,20 +784,20 @@ static void xen_blkbk_unmap(struct xen_blkif *blkif, while (num) { unsigned int batch = min(num, BLKIF_MAX_SEGMENTS_PER_REQUEST); - - invcount = xen_blkbk_unmap_prepare(blkif, pages, batch, + + invcount = xen_blkbk_unmap_prepare(ring, pages, batch, unmap, unmap_pages); if (invcount) { ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, invcount); BUG_ON(ret); - put_free_pages(blkif, unmap_pages, invcount); + put_free_pages(ring, unmap_pages, invcount); } pages += batch; num -= batch; } } -static int xen_blkbk_map(struct xen_blkif *blkif, +static int xen_blkbk_map(struct xen_blkif_ring *ring, struct grant_page *pages[], int num, bool ro) { @@ -794,6 +810,7 @@ static int xen_blkbk_map(struct xen_blkif *blkif, int ret = 0; int last_map = 0, map_until = 0; int use_persistent_gnts; + struct xen_blkif *blkif = ring->blkif; use_persistent_gnts = (blkif->vbd.feature_gnt_persistent); @@ -806,10 +823,11 @@ again: for (i = map_until; i < num; i++) { uint32_t flags; - if (use_persistent_gnts) + if (use_persistent_gnts) { persistent_gnt = get_persistent_gnt( - blkif, + ring, pages[i]->gref); + } if (persistent_gnt) { /* @@ -819,7 +837,7 @@ again: pages[i]->page = persistent_gnt->page; pages[i]->persistent_gnt = persistent_gnt; } else { - if (get_free_page(blkif, &pages[i]->page)) + if (get_free_page(ring, &pages[i]->page)) goto out_of_memory; addr = vaddr(pages[i]->page); pages_to_gnt[segs_to_map] = pages[i]->page; @@ -852,7 +870,7 @@ again: BUG_ON(new_map_idx >= segs_to_map); if (unlikely(map[new_map_idx].status != 0)) { pr_debug("invalid buffer -- could not remap it\n"); - put_free_pages(blkif, &pages[seg_idx]->page, 1); + put_free_pages(ring, &pages[seg_idx]->page, 1); pages[seg_idx]->handle = BLKBACK_INVALID_HANDLE; ret |= 1; goto next; @@ -862,7 +880,7 @@ again: continue; } if (use_persistent_gnts && - blkif->persistent_gnt_c < xen_blkif_max_pgrants) { + ring->persistent_gnt_c < xen_blkif_max_pgrants) { /* * We are using persistent grants, the grant is * not mapped but we might have room for it. 
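 * If there is room, the grant that was just mapped is promoted into
 * the ring's rb-tree cache (add_persistent_gnt() in the next hunk) so
 * later requests can reuse the mapping without another map hypercall.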
@@ -880,7 +898,7 @@ again: persistent_gnt->gnt = map[new_map_idx].ref; persistent_gnt->handle = map[new_map_idx].handle; persistent_gnt->page = pages[seg_idx]->page; - if (add_persistent_gnt(blkif, + if (add_persistent_gnt(ring, persistent_gnt)) { kfree(persistent_gnt); persistent_gnt = NULL; @@ -888,7 +906,7 @@ again: } pages[seg_idx]->persistent_gnt = persistent_gnt; pr_debug("grant %u added to the tree of persistent grants, using %u/%u\n", - persistent_gnt->gnt, blkif->persistent_gnt_c, + persistent_gnt->gnt, ring->persistent_gnt_c, xen_blkif_max_pgrants); goto next; } @@ -913,7 +931,7 @@ next: out_of_memory: pr_alert("%s: out of memory\n", __func__); - put_free_pages(blkif, pages_to_gnt, segs_to_map); + put_free_pages(ring, pages_to_gnt, segs_to_map); return -ENOMEM; } @@ -921,7 +939,7 @@ static int xen_blkbk_map_seg(struct pending_req *pending_req) { int rc; - rc = xen_blkbk_map(pending_req->blkif, pending_req->segments, + rc = xen_blkbk_map(pending_req->ring, pending_req->segments, pending_req->nr_segs, (pending_req->operation != BLKIF_OP_READ)); @@ -934,7 +952,7 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req, struct phys_req *preq) { struct grant_page **pages = pending_req->indirect_pages; - struct xen_blkif *blkif = pending_req->blkif; + struct xen_blkif_ring *ring = pending_req->ring; int indirect_grefs, rc, n, nseg, i; struct blkif_request_segment *segments = NULL; @@ -945,7 +963,7 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req, for (i = 0; i < indirect_grefs; i++) pages[i]->gref = req->u.indirect.indirect_grefs[i]; - rc = xen_blkbk_map(blkif, pages, indirect_grefs, true); + rc = xen_blkbk_map(ring, pages, indirect_grefs, true); if (rc) goto unmap; @@ -977,15 +995,16 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req, unmap: if (segments) kunmap_atomic(segments); - xen_blkbk_unmap(blkif, pages, indirect_grefs); + xen_blkbk_unmap(ring, pages, indirect_grefs); return rc; } -static int dispatch_discard_io(struct xen_blkif *blkif, +static int dispatch_discard_io(struct xen_blkif_ring *ring, struct blkif_request *req) { int err = 0; int status = BLKIF_RSP_OKAY; + struct xen_blkif *blkif = ring->blkif; struct block_device *bdev = blkif->vbd.bdev; unsigned long secure; struct phys_req preq; @@ -1002,7 +1021,7 @@ static int dispatch_discard_io(struct xen_blkif *blkif, preq.sector_number + preq.nr_sects, blkif->vbd.pdevice); goto fail_response; } - blkif->st_ds_req++; + ring->st_ds_req++; secure = (blkif->vbd.discard_secure && (req->u.discard.flag & BLKIF_DISCARD_SECURE)) ? 
@@ -1018,26 +1037,28 @@ fail_response: } else if (err) status = BLKIF_RSP_ERROR; - make_response(blkif, req->u.discard.id, req->operation, status); + make_response(ring, req->u.discard.id, req->operation, status); xen_blkif_put(blkif); return err; } -static int dispatch_other_io(struct xen_blkif *blkif, +static int dispatch_other_io(struct xen_blkif_ring *ring, struct blkif_request *req, struct pending_req *pending_req) { - free_req(blkif, pending_req); - make_response(blkif, req->u.other.id, req->operation, + free_req(ring, pending_req); + make_response(ring, req->u.other.id, req->operation, BLKIF_RSP_EOPNOTSUPP); return -EIO; } -static void xen_blk_drain_io(struct xen_blkif *blkif) +static void xen_blk_drain_io(struct xen_blkif_ring *ring) { + struct xen_blkif *blkif = ring->blkif; + atomic_set(&blkif->drain, 1); do { - if (atomic_read(&blkif->inflight) == 0) + if (atomic_read(&ring->inflight) == 0) break; wait_for_completion_interruptible_timeout( &blkif->drain_complete, HZ); @@ -1058,12 +1079,12 @@ static void __end_block_io_op(struct pending_req *pending_req, int error) if ((pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE) && (error == -EOPNOTSUPP)) { pr_debug("flush diskcache op failed, not supported\n"); - xen_blkbk_flush_diskcache(XBT_NIL, pending_req->blkif->be, 0); + xen_blkbk_flush_diskcache(XBT_NIL, pending_req->ring->blkif->be, 0); pending_req->status = BLKIF_RSP_EOPNOTSUPP; } else if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) && (error == -EOPNOTSUPP)) { pr_debug("write barrier op failed, not supported\n"); - xen_blkbk_barrier(XBT_NIL, pending_req->blkif->be, 0); + xen_blkbk_barrier(XBT_NIL, pending_req->ring->blkif->be, 0); pending_req->status = BLKIF_RSP_EOPNOTSUPP; } else if (error) { pr_debug("Buffer not up-to-date at end of operation," @@ -1097,9 +1118,9 @@ static void end_block_io_op(struct bio *bio) * and transmute it to the block API to hand it over to the proper block disk. */ static int -__do_block_io_op(struct xen_blkif *blkif) +__do_block_io_op(struct xen_blkif_ring *ring) { - union blkif_back_rings *blk_rings = &blkif->blk_rings; + union blkif_back_rings *blk_rings = &ring->blk_rings; struct blkif_request req; struct pending_req *pending_req; RING_IDX rc, rp; @@ -1112,7 +1133,7 @@ __do_block_io_op(struct xen_blkif *blkif) if (RING_REQUEST_PROD_OVERFLOW(&blk_rings->common, rp)) { rc = blk_rings->common.rsp_prod_pvt; pr_warn("Frontend provided bogus ring requests (%d - %d = %d). 
Halting ring processing on dev=%04x\n", - rp, rc, rp - rc, blkif->vbd.pdevice); + rp, rc, rp - rc, ring->blkif->vbd.pdevice); return -EACCES; } while (rc != rp) { @@ -1125,14 +1146,14 @@ __do_block_io_op(struct xen_blkif *blkif) break; } - pending_req = alloc_req(blkif); + pending_req = alloc_req(ring); if (NULL == pending_req) { - blkif->st_oo_req++; + ring->st_oo_req++; more_to_do = 1; break; } - switch (blkif->blk_protocol) { + switch (ring->blkif->blk_protocol) { case BLKIF_PROTOCOL_NATIVE: memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req)); break; @@ -1156,16 +1177,16 @@ __do_block_io_op(struct xen_blkif *blkif) case BLKIF_OP_WRITE_BARRIER: case BLKIF_OP_FLUSH_DISKCACHE: case BLKIF_OP_INDIRECT: - if (dispatch_rw_block_io(blkif, &req, pending_req)) + if (dispatch_rw_block_io(ring, &req, pending_req)) goto done; break; case BLKIF_OP_DISCARD: - free_req(blkif, pending_req); - if (dispatch_discard_io(blkif, &req)) + free_req(ring, pending_req); + if (dispatch_discard_io(ring, &req)) goto done; break; default: - if (dispatch_other_io(blkif, &req, pending_req)) + if (dispatch_other_io(ring, &req, pending_req)) goto done; break; } @@ -1178,13 +1199,13 @@ done: } static int -do_block_io_op(struct xen_blkif *blkif) +do_block_io_op(struct xen_blkif_ring *ring) { - union blkif_back_rings *blk_rings = &blkif->blk_rings; + union blkif_back_rings *blk_rings = &ring->blk_rings; int more_to_do; do { - more_to_do = __do_block_io_op(blkif); + more_to_do = __do_block_io_op(ring); if (more_to_do) break; @@ -1197,7 +1218,7 @@ do_block_io_op(struct xen_blkif *blkif) * Transmutation of the 'struct blkif_request' to a proper 'struct bio' * and call the 'submit_bio' to pass it to the underlying storage. */ -static int dispatch_rw_block_io(struct xen_blkif *blkif, +static int dispatch_rw_block_io(struct xen_blkif_ring *ring, struct blkif_request *req, struct pending_req *pending_req) { @@ -1225,17 +1246,17 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, switch (req_operation) { case BLKIF_OP_READ: - blkif->st_rd_req++; + ring->st_rd_req++; operation = READ; break; case BLKIF_OP_WRITE: - blkif->st_wr_req++; + ring->st_wr_req++; operation = WRITE_ODIRECT; break; case BLKIF_OP_WRITE_BARRIER: drain = true; case BLKIF_OP_FLUSH_DISKCACHE: - blkif->st_f_req++; + ring->st_f_req++; operation = WRITE_FLUSH; break; default: @@ -1260,7 +1281,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, preq.nr_sects = 0; - pending_req->blkif = blkif; + pending_req->ring = ring; pending_req->id = req->u.rw.id; pending_req->operation = req_operation; pending_req->status = BLKIF_RSP_OKAY; @@ -1287,12 +1308,12 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, goto fail_response; } - if (xen_vbd_translate(&preq, blkif, operation) != 0) { + if (xen_vbd_translate(&preq, ring->blkif, operation) != 0) { pr_debug("access denied: %s of [%llu,%llu] on dev=%04x\n", operation == READ ? "read" : "write", preq.sector_number, preq.sector_number + preq.nr_sects, - blkif->vbd.pdevice); + ring->blkif->vbd.pdevice); goto fail_response; } @@ -1304,7 +1325,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, if (((int)preq.sector_number|(int)seg[i].nsec) & ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) { pr_debug("Misaligned I/O request from domain %d\n", - blkif->domid); + ring->blkif->domid); goto fail_response; } } @@ -1313,7 +1334,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, * issue the WRITE_FLUSH. 
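 * Draining waits for every in-flight request on this ring to complete
 * before the flush is submitted, preserving barrier ordering.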
*/ if (drain) - xen_blk_drain_io(pending_req->blkif); + xen_blk_drain_io(pending_req->ring); /* * If we have failed at this point, we need to undo the M2P override, @@ -1328,8 +1349,8 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, * This corresponding xen_blkif_put is done in __end_block_io_op, or * below (in "!bio") if we are handling a BLKIF_OP_DISCARD. */ - xen_blkif_get(blkif); - atomic_inc(&blkif->inflight); + xen_blkif_get(ring->blkif); + atomic_inc(&ring->inflight); for (i = 0; i < nseg; i++) { while ((bio == NULL) || @@ -1377,19 +1398,19 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, blk_finish_plug(&plug); if (operation == READ) - blkif->st_rd_sect += preq.nr_sects; + ring->st_rd_sect += preq.nr_sects; else if (operation & WRITE) - blkif->st_wr_sect += preq.nr_sects; + ring->st_wr_sect += preq.nr_sects; return 0; fail_flush: - xen_blkbk_unmap(blkif, pending_req->segments, + xen_blkbk_unmap(ring, pending_req->segments, pending_req->nr_segs); fail_response: /* Haven't submitted any bio's yet. */ - make_response(blkif, req->u.rw.id, req_operation, BLKIF_RSP_ERROR); - free_req(blkif, pending_req); + make_response(ring, req->u.rw.id, req_operation, BLKIF_RSP_ERROR); + free_req(ring, pending_req); msleep(1); /* back off a bit */ return -EIO; @@ -1407,21 +1428,22 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, /* * Put a response on the ring on how the operation fared. */ -static void make_response(struct xen_blkif *blkif, u64 id, +static void make_response(struct xen_blkif_ring *ring, u64 id, unsigned short op, int st) { struct blkif_response resp; unsigned long flags; - union blkif_back_rings *blk_rings = &blkif->blk_rings; + union blkif_back_rings *blk_rings; int notify; resp.id = id; resp.operation = op; resp.status = st; - spin_lock_irqsave(&blkif->blk_ring_lock, flags); + spin_lock_irqsave(&ring->blk_ring_lock, flags); + blk_rings = &ring->blk_rings; /* Place on the response ring for the relevant domain. */ - switch (blkif->blk_protocol) { + switch (ring->blkif->blk_protocol) { case BLKIF_PROTOCOL_NATIVE: memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt), &resp, sizeof(resp)); @@ -1439,9 +1461,9 @@ static void make_response(struct xen_blkif *blkif, u64 id, } blk_rings->common.rsp_prod_pvt++; RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify); - spin_unlock_irqrestore(&blkif->blk_ring_lock, flags); + spin_unlock_irqrestore(&ring->blk_ring_lock, flags); if (notify) - notify_remote_via_irq(blkif->irq); + notify_remote_via_irq(ring->irq); } static int __init xen_blkif_init(void) @@ -1457,6 +1479,9 @@ static int __init xen_blkif_init(void) xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER; } + if (xenblk_max_queues == 0) + xenblk_max_queues = num_online_cpus(); + rc = xen_blkif_interface_init(); if (rc) goto failed_init; diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index c929ae22764c..dea61f6ab8cb 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -46,6 +46,7 @@ #include <xen/interface/io/protocols.h> extern unsigned int xen_blkif_max_ring_order; +extern unsigned int xenblk_max_queues; /* * This is the maximum number of segments that would be allowed in indirect * requests. This value will also be passed to the frontend. @@ -269,68 +270,79 @@ struct persistent_gnt { struct list_head remove_node; }; -struct xen_blkif { - /* Unique identifier for this interface. 
*/ - domid_t domid; - unsigned int handle; +/* Per-ring information. */ +struct xen_blkif_ring { /* Physical parameters of the comms window. */ unsigned int irq; - /* Comms information. */ - enum blkif_protocol blk_protocol; union blkif_back_rings blk_rings; void *blk_ring; - /* The VBD attached to this interface. */ - struct xen_vbd vbd; - /* Back pointer to the backend_info. */ - struct backend_info *be; /* Private fields. */ spinlock_t blk_ring_lock; - atomic_t refcnt; wait_queue_head_t wq; - /* for barrier (drain) requests */ - struct completion drain_complete; - atomic_t drain; atomic_t inflight; - /* One thread per one blkif. */ + /* One thread per blkif ring. */ struct task_struct *xenblkd; unsigned int waiting_reqs; - /* tree to store persistent grants */ + /* List of all 'pending_req' available */ + struct list_head pending_free; + /* And its spinlock. */ + spinlock_t pending_free_lock; + wait_queue_head_t pending_free_wq; + + /* Tree to store persistent grants. */ + spinlock_t pers_gnts_lock; struct rb_root persistent_gnts; unsigned int persistent_gnt_c; atomic_t persistent_gnt_in_use; unsigned long next_lru; - /* used by the kworker that offload work from the persistent purge */ + /* Statistics. */ + unsigned long st_print; + unsigned long long st_rd_req; + unsigned long long st_wr_req; + unsigned long long st_oo_req; + unsigned long long st_f_req; + unsigned long long st_ds_req; + unsigned long long st_rd_sect; + unsigned long long st_wr_sect; + + /* Used by the kworker that offload work from the persistent purge. */ struct list_head persistent_purge_list; struct work_struct persistent_purge_work; - /* buffer of free pages to map grant refs */ + /* Buffer of free pages to map grant refs. */ spinlock_t free_pages_lock; int free_pages_num; struct list_head free_pages; - /* List of all 'pending_req' available */ - struct list_head pending_free; - /* And its spinlock. */ - spinlock_t pending_free_lock; - wait_queue_head_t pending_free_wq; - - /* statistics */ - unsigned long st_print; - unsigned long long st_rd_req; - unsigned long long st_wr_req; - unsigned long long st_oo_req; - unsigned long long st_f_req; - unsigned long long st_ds_req; - unsigned long long st_rd_sect; - unsigned long long st_wr_sect; - struct work_struct free_work; /* Thread shutdown wait queue. */ wait_queue_head_t shutdown_wq; - unsigned int nr_ring_pages; + struct xen_blkif *blkif; +}; + +struct xen_blkif { + /* Unique identifier for this interface. */ + domid_t domid; + unsigned int handle; + /* Comms information. */ + enum blkif_protocol blk_protocol; + /* The VBD attached to this interface. */ + struct xen_vbd vbd; + /* Back pointer to the backend_info. */ + struct backend_info *be; + atomic_t refcnt; + /* for barrier (drain) requests */ + struct completion drain_complete; + atomic_t drain; + + struct work_struct free_work; + unsigned int nr_ring_pages; + /* All rings for this device. */ + struct xen_blkif_ring *rings; + unsigned int nr_rings; }; struct seg_buf { @@ -352,7 +364,7 @@ struct grant_page { * response queued for it, with the saved 'id' passed back. 
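 * With the multi-queue split, each pending_req records the ring it was
 * allocated from rather than the device as a whole.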
*/ struct pending_req { - struct xen_blkif *blkif; + struct xen_blkif_ring *ring; u64 id; int nr_segs; atomic_t pendcnt; @@ -394,7 +406,7 @@ int xen_blkif_xenbus_init(void); irqreturn_t xen_blkif_be_int(int irq, void *dev_id); int xen_blkif_schedule(void *arg); int xen_blkif_purge_persistent(void *arg); -void xen_blkbk_free_caches(struct xen_blkif *blkif); +void xen_blkbk_free_caches(struct xen_blkif_ring *ring); int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt, struct backend_info *be, int state); diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index f53cff42f8da..876763f7f13e 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -86,9 +86,11 @@ static void xen_update_blkif_status(struct xen_blkif *blkif) { int err; char name[BLKBACK_NAME_LEN]; + struct xen_blkif_ring *ring; + int i; /* Not ready to connect? */ - if (!blkif->irq || !blkif->vbd.bdev) + if (!blkif->rings || !blkif->rings[0].irq || !blkif->vbd.bdev) return; /* Already connected? */ @@ -113,13 +115,55 @@ static void xen_update_blkif_status(struct xen_blkif *blkif) } invalidate_inode_pages2(blkif->vbd.bdev->bd_inode->i_mapping); - blkif->xenblkd = kthread_run(xen_blkif_schedule, blkif, "%s", name); - if (IS_ERR(blkif->xenblkd)) { - err = PTR_ERR(blkif->xenblkd); - blkif->xenblkd = NULL; - xenbus_dev_error(blkif->be->dev, err, "start xenblkd"); - return; + for (i = 0; i < blkif->nr_rings; i++) { + ring = &blkif->rings[i]; + ring->xenblkd = kthread_run(xen_blkif_schedule, ring, "%s-%d", name, i); + if (IS_ERR(ring->xenblkd)) { + err = PTR_ERR(ring->xenblkd); + ring->xenblkd = NULL; + xenbus_dev_fatal(blkif->be->dev, err, + "start %s-%d xenblkd", name, i); + goto out; + } + } + return; + +out: + while (--i >= 0) { + ring = &blkif->rings[i]; + kthread_stop(ring->xenblkd); + } + return; +} + +static int xen_blkif_alloc_rings(struct xen_blkif *blkif) +{ + unsigned int r; + + blkif->rings = kzalloc(blkif->nr_rings * sizeof(struct xen_blkif_ring), GFP_KERNEL); + if (!blkif->rings) + return -ENOMEM; + + for (r = 0; r < blkif->nr_rings; r++) { + struct xen_blkif_ring *ring = &blkif->rings[r]; + + spin_lock_init(&ring->blk_ring_lock); + init_waitqueue_head(&ring->wq); + INIT_LIST_HEAD(&ring->pending_free); + INIT_LIST_HEAD(&ring->persistent_purge_list); + INIT_WORK(&ring->persistent_purge_work, xen_blkbk_unmap_purged_grants); + spin_lock_init(&ring->free_pages_lock); + INIT_LIST_HEAD(&ring->free_pages); + + spin_lock_init(&ring->pending_free_lock); + init_waitqueue_head(&ring->pending_free_wq); + init_waitqueue_head(&ring->shutdown_wq); + ring->blkif = blkif; + ring->st_print = jiffies; + xen_blkif_get(blkif); } + + return 0; } static struct xen_blkif *xen_blkif_alloc(domid_t domid) @@ -133,41 +177,25 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid) return ERR_PTR(-ENOMEM); blkif->domid = domid; - spin_lock_init(&blkif->blk_ring_lock); atomic_set(&blkif->refcnt, 1); - init_waitqueue_head(&blkif->wq); init_completion(&blkif->drain_complete); - atomic_set(&blkif->drain, 0); - blkif->st_print = jiffies; - blkif->persistent_gnts.rb_node = NULL; - spin_lock_init(&blkif->free_pages_lock); - INIT_LIST_HEAD(&blkif->free_pages); - INIT_LIST_HEAD(&blkif->persistent_purge_list); - blkif->free_pages_num = 0; - atomic_set(&blkif->persistent_gnt_in_use, 0); - atomic_set(&blkif->inflight, 0); - INIT_WORK(&blkif->persistent_purge_work, xen_blkbk_unmap_purged_grants); - - INIT_LIST_HEAD(&blkif->pending_free); INIT_WORK(&blkif->free_work, 
xen_blkif_deferred_free); - spin_lock_init(&blkif->pending_free_lock); - init_waitqueue_head(&blkif->pending_free_wq); - init_waitqueue_head(&blkif->shutdown_wq); return blkif; } -static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref, +static int xen_blkif_map(struct xen_blkif_ring *ring, grant_ref_t *gref, unsigned int nr_grefs, unsigned int evtchn) { int err; + struct xen_blkif *blkif = ring->blkif; /* Already connected through? */ - if (blkif->irq) + if (ring->irq) return 0; err = xenbus_map_ring_valloc(blkif->be->dev, gref, nr_grefs, - &blkif->blk_ring); + &ring->blk_ring); if (err < 0) return err; @@ -175,24 +203,24 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref, case BLKIF_PROTOCOL_NATIVE: { struct blkif_sring *sring; - sring = (struct blkif_sring *)blkif->blk_ring; - BACK_RING_INIT(&blkif->blk_rings.native, sring, + sring = (struct blkif_sring *)ring->blk_ring; + BACK_RING_INIT(&ring->blk_rings.native, sring, XEN_PAGE_SIZE * nr_grefs); break; } case BLKIF_PROTOCOL_X86_32: { struct blkif_x86_32_sring *sring_x86_32; - sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring; - BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, + sring_x86_32 = (struct blkif_x86_32_sring *)ring->blk_ring; + BACK_RING_INIT(&ring->blk_rings.x86_32, sring_x86_32, XEN_PAGE_SIZE * nr_grefs); break; } case BLKIF_PROTOCOL_X86_64: { struct blkif_x86_64_sring *sring_x86_64; - sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring; - BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, + sring_x86_64 = (struct blkif_x86_64_sring *)ring->blk_ring; + BACK_RING_INIT(&ring->blk_rings.x86_64, sring_x86_64, XEN_PAGE_SIZE * nr_grefs); break; } @@ -202,13 +230,13 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref, err = bind_interdomain_evtchn_to_irqhandler(blkif->domid, evtchn, xen_blkif_be_int, 0, - "blkif-backend", blkif); + "blkif-backend", ring); if (err < 0) { - xenbus_unmap_ring_vfree(blkif->be->dev, blkif->blk_ring); - blkif->blk_rings.common.sring = NULL; + xenbus_unmap_ring_vfree(blkif->be->dev, ring->blk_ring); + ring->blk_rings.common.sring = NULL; return err; } - blkif->irq = err; + ring->irq = err; return 0; } @@ -216,50 +244,69 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref, static int xen_blkif_disconnect(struct xen_blkif *blkif) { struct pending_req *req, *n; - int i = 0, j; + unsigned int j, r; - if (blkif->xenblkd) { - kthread_stop(blkif->xenblkd); - wake_up(&blkif->shutdown_wq); - blkif->xenblkd = NULL; - } + for (r = 0; r < blkif->nr_rings; r++) { + struct xen_blkif_ring *ring = &blkif->rings[r]; + unsigned int i = 0; - /* The above kthread_stop() guarantees that at this point we - * don't have any discard_io or other_io requests. So, checking - * for inflight IO is enough. - */ - if (atomic_read(&blkif->inflight) > 0) - return -EBUSY; + if (ring->xenblkd) { + kthread_stop(ring->xenblkd); + wake_up(&ring->shutdown_wq); + ring->xenblkd = NULL; + } - if (blkif->irq) { - unbind_from_irqhandler(blkif->irq, blkif); - blkif->irq = 0; - } + /* The above kthread_stop() guarantees that at this point we + * don't have any discard_io or other_io requests. So, checking + * for inflight IO is enough. 
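+	 * With multiple rings the check is now per ring: each ring's
+	 * kthread is stopped first, then its inflight counter is verified.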
+ */ + if (atomic_read(&ring->inflight) > 0) + return -EBUSY; - if (blkif->blk_rings.common.sring) { - xenbus_unmap_ring_vfree(blkif->be->dev, blkif->blk_ring); - blkif->blk_rings.common.sring = NULL; - } + if (ring->irq) { + unbind_from_irqhandler(ring->irq, ring); + ring->irq = 0; + } - /* Remove all persistent grants and the cache of ballooned pages. */ - xen_blkbk_free_caches(blkif); + if (ring->blk_rings.common.sring) { + xenbus_unmap_ring_vfree(blkif->be->dev, ring->blk_ring); + ring->blk_rings.common.sring = NULL; + } - /* Check that there is no request in use */ - list_for_each_entry_safe(req, n, &blkif->pending_free, free_list) { - list_del(&req->free_list); + /* Remove all persistent grants and the cache of ballooned pages. */ + xen_blkbk_free_caches(ring); - for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) - kfree(req->segments[j]); + /* Check that there is no request in use */ + list_for_each_entry_safe(req, n, &ring->pending_free, free_list) { + list_del(&req->free_list); - for (j = 0; j < MAX_INDIRECT_PAGES; j++) - kfree(req->indirect_pages[j]); + for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) + kfree(req->segments[j]); - kfree(req); - i++; - } + for (j = 0; j < MAX_INDIRECT_PAGES; j++) + kfree(req->indirect_pages[j]); + + kfree(req); + i++; + } - WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages)); + BUG_ON(atomic_read(&ring->persistent_gnt_in_use) != 0); + BUG_ON(!list_empty(&ring->persistent_purge_list)); + BUG_ON(!RB_EMPTY_ROOT(&ring->persistent_gnts)); + BUG_ON(!list_empty(&ring->free_pages)); + BUG_ON(ring->free_pages_num != 0); + BUG_ON(ring->persistent_gnt_c != 0); + WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages)); + xen_blkif_put(blkif); + } blkif->nr_ring_pages = 0; + /* + * blkif->rings was allocated in connect_ring, so we should free it in + * here. + */ + kfree(blkif->rings); + blkif->rings = NULL; + blkif->nr_rings = 0; return 0; } @@ -271,13 +318,6 @@ static void xen_blkif_free(struct xen_blkif *blkif) xen_vbd_free(&blkif->vbd); /* Make sure everything is drained before shutting down */ - BUG_ON(blkif->persistent_gnt_c != 0); - BUG_ON(atomic_read(&blkif->persistent_gnt_in_use) != 0); - BUG_ON(blkif->free_pages_num != 0); - BUG_ON(!list_empty(&blkif->persistent_purge_list)); - BUG_ON(!list_empty(&blkif->free_pages)); - BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts)); - kmem_cache_free(xen_blkif_cachep, blkif); } @@ -296,25 +336,38 @@ int __init xen_blkif_interface_init(void) * sysfs interface for VBD I/O requests */ -#define VBD_SHOW(name, format, args...) 
\ +#define VBD_SHOW_ALLRING(name, format) \ static ssize_t show_##name(struct device *_dev, \ struct device_attribute *attr, \ char *buf) \ { \ struct xenbus_device *dev = to_xenbus_device(_dev); \ struct backend_info *be = dev_get_drvdata(&dev->dev); \ + struct xen_blkif *blkif = be->blkif; \ + unsigned int i; \ + unsigned long long result = 0; \ \ - return sprintf(buf, format, ##args); \ + if (!blkif->rings) \ + goto out; \ + \ + for (i = 0; i < blkif->nr_rings; i++) { \ + struct xen_blkif_ring *ring = &blkif->rings[i]; \ + \ + result += ring->st_##name; \ + } \ + \ +out: \ + return sprintf(buf, format, result); \ } \ static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) -VBD_SHOW(oo_req, "%llu\n", be->blkif->st_oo_req); -VBD_SHOW(rd_req, "%llu\n", be->blkif->st_rd_req); -VBD_SHOW(wr_req, "%llu\n", be->blkif->st_wr_req); -VBD_SHOW(f_req, "%llu\n", be->blkif->st_f_req); -VBD_SHOW(ds_req, "%llu\n", be->blkif->st_ds_req); -VBD_SHOW(rd_sect, "%llu\n", be->blkif->st_rd_sect); -VBD_SHOW(wr_sect, "%llu\n", be->blkif->st_wr_sect); +VBD_SHOW_ALLRING(oo_req, "%llu\n"); +VBD_SHOW_ALLRING(rd_req, "%llu\n"); +VBD_SHOW_ALLRING(wr_req, "%llu\n"); +VBD_SHOW_ALLRING(f_req, "%llu\n"); +VBD_SHOW_ALLRING(ds_req, "%llu\n"); +VBD_SHOW_ALLRING(rd_sect, "%llu\n"); +VBD_SHOW_ALLRING(wr_sect, "%llu\n"); static struct attribute *xen_vbdstat_attrs[] = { &dev_attr_oo_req.attr, @@ -332,6 +385,18 @@ static struct attribute_group xen_vbdstat_group = { .attrs = xen_vbdstat_attrs, }; +#define VBD_SHOW(name, format, args...) \ + static ssize_t show_##name(struct device *_dev, \ + struct device_attribute *attr, \ + char *buf) \ + { \ + struct xenbus_device *dev = to_xenbus_device(_dev); \ + struct backend_info *be = dev_get_drvdata(&dev->dev); \ + \ + return sprintf(buf, format, ##args); \ + } \ + static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) + VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor); VBD_SHOW(mode, "%s\n", be->mode); @@ -440,11 +505,11 @@ static int xen_blkbk_remove(struct xenbus_device *dev) dev_set_drvdata(&dev->dev, NULL); - if (be->blkif) { + if (be->blkif) xen_blkif_disconnect(be->blkif); - xen_blkif_put(be->blkif); - } + /* Put the reference we set in xen_blkif_alloc(). */ + xen_blkif_put(be->blkif); kfree(be->mode); kfree(be); return 0; @@ -553,6 +618,12 @@ static int xen_blkbk_probe(struct xenbus_device *dev, goto fail; } + /* Multi-queue: advertise how many queues are supported by us. */ + err = xenbus_printf(XBT_NIL, dev->nodename, + "multi-queue-max-queues", "%u", xenblk_max_queues); + if (err) + pr_warn("Error writing multi-queue-max-queues\n"); + /* setup back pointer */ be->blkif->be = be; @@ -708,8 +779,14 @@ static void frontend_changed(struct xenbus_device *dev, } err = connect_ring(be); - if (err) + if (err) { + /* + * Clean up so that memory resources can be used by + * other devices. connect_ring has already reported the error. + */ + xen_blkif_disconnect(be->blkif); break; + } xen_update_blkif_status(be->blkif); break; @@ -825,50 +902,43 @@ again: xenbus_transaction_end(xbt, 1); } - -static int connect_ring(struct backend_info *be) +/* + * Each ring may use multiple pages, depending on "ring-page-order". 
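+ * Ring references and the event channel are read from @dir, which is
+ * either the frontend's root xenstore node (single queue) or one of
+ * its per-queue "queue-%u" subdirectories.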
+ */ +static int read_per_ring_refs(struct xen_blkif_ring *ring, const char *dir) { - struct xenbus_device *dev = be->dev; unsigned int ring_ref[XENBUS_MAX_RING_GRANTS]; - unsigned int evtchn, nr_grefs, ring_page_order; - unsigned int pers_grants; - char protocol[64] = ""; struct pending_req *req, *n; int err, i, j; + struct xen_blkif *blkif = ring->blkif; + struct xenbus_device *dev = blkif->be->dev; + unsigned int ring_page_order, nr_grefs, evtchn; - pr_debug("%s %s\n", __func__, dev->otherend); - - err = xenbus_scanf(XBT_NIL, dev->otherend, "event-channel", "%u", + err = xenbus_scanf(XBT_NIL, dir, "event-channel", "%u", &evtchn); if (err != 1) { err = -EINVAL; - xenbus_dev_fatal(dev, err, "reading %s/event-channel", - dev->otherend); + xenbus_dev_fatal(dev, err, "reading %s/event-channel", dir); return err; } - pr_info("event-channel %u\n", evtchn); err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-page-order", "%u", &ring_page_order); if (err != 1) { - err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-ref", - "%u", &ring_ref[0]); + err = xenbus_scanf(XBT_NIL, dir, "ring-ref", "%u", &ring_ref[0]); if (err != 1) { err = -EINVAL; - xenbus_dev_fatal(dev, err, "reading %s/ring-ref", - dev->otherend); + xenbus_dev_fatal(dev, err, "reading %s/ring-ref", dir); return err; } nr_grefs = 1; - pr_info("%s:using single page: ring-ref %d\n", dev->otherend, - ring_ref[0]); } else { unsigned int i; if (ring_page_order > xen_blkif_max_ring_order) { err = -EINVAL; xenbus_dev_fatal(dev, err, "%s/request %d ring page order exceed max:%d", - dev->otherend, ring_page_order, + dir, ring_page_order, xen_blkif_max_ring_order); return err; } @@ -878,52 +948,23 @@ static int connect_ring(struct backend_info *be) char ring_ref_name[RINGREF_NAME_LEN]; snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i); - err = xenbus_scanf(XBT_NIL, dev->otherend, ring_ref_name, + err = xenbus_scanf(XBT_NIL, dir, ring_ref_name, "%u", &ring_ref[i]); if (err != 1) { err = -EINVAL; xenbus_dev_fatal(dev, err, "reading %s/%s", - dev->otherend, ring_ref_name); + dir, ring_ref_name); return err; } - pr_info("ring-ref%u: %u\n", i, ring_ref[i]); } } - - be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT; - err = xenbus_gather(XBT_NIL, dev->otherend, "protocol", - "%63s", protocol, NULL); - if (err) - strcpy(protocol, "unspecified, assuming default"); - else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE)) - be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE; - else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32)) - be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32; - else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64)) - be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64; - else { - xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol); - return -1; - } - err = xenbus_gather(XBT_NIL, dev->otherend, - "feature-persistent", "%u", - &pers_grants, NULL); - if (err) - pers_grants = 0; - - be->blkif->vbd.feature_gnt_persistent = pers_grants; - be->blkif->vbd.overflow_max_grants = 0; - be->blkif->nr_ring_pages = nr_grefs; - - pr_info("ring-pages:%d, event-channel %d, protocol %d (%s) %s\n", - nr_grefs, evtchn, be->blkif->blk_protocol, protocol, - pers_grants ? 
"persistent grants" : ""); + blkif->nr_ring_pages = nr_grefs; for (i = 0; i < nr_grefs * XEN_BLKIF_REQS_PER_PAGE; i++) { req = kzalloc(sizeof(*req), GFP_KERNEL); if (!req) goto fail; - list_add_tail(&req->free_list, &be->blkif->pending_free); + list_add_tail(&req->free_list, &ring->pending_free); for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) { req->segments[j] = kzalloc(sizeof(*req->segments[0]), GFP_KERNEL); if (!req->segments[j]) @@ -938,7 +979,7 @@ static int connect_ring(struct backend_info *be) } /* Map the shared frame, irq etc. */ - err = xen_blkif_map(be->blkif, ring_ref, nr_grefs, evtchn); + err = xen_blkif_map(ring, ring_ref, nr_grefs, evtchn); if (err) { xenbus_dev_fatal(dev, err, "mapping ring-ref port %u", evtchn); return err; @@ -947,7 +988,7 @@ static int connect_ring(struct backend_info *be) return 0; fail: - list_for_each_entry_safe(req, n, &be->blkif->pending_free, free_list) { + list_for_each_entry_safe(req, n, &ring->pending_free, free_list) { list_del(&req->free_list); for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) { if (!req->segments[j]) @@ -962,6 +1003,93 @@ fail: kfree(req); } return -ENOMEM; + +} + +static int connect_ring(struct backend_info *be) +{ + struct xenbus_device *dev = be->dev; + unsigned int pers_grants; + char protocol[64] = ""; + int err, i; + char *xspath; + size_t xspathsize; + const size_t xenstore_path_ext_size = 11; /* sufficient for "/queue-NNN" */ + unsigned int requested_num_queues = 0; + + pr_debug("%s %s\n", __func__, dev->otherend); + + be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT; + err = xenbus_gather(XBT_NIL, dev->otherend, "protocol", + "%63s", protocol, NULL); + if (err) + strcpy(protocol, "unspecified, assuming default"); + else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE)) + be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE; + else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32)) + be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32; + else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64)) + be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64; + else { + xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol); + return -ENOSYS; + } + err = xenbus_gather(XBT_NIL, dev->otherend, + "feature-persistent", "%u", + &pers_grants, NULL); + if (err) + pers_grants = 0; + + be->blkif->vbd.feature_gnt_persistent = pers_grants; + be->blkif->vbd.overflow_max_grants = 0; + + /* + * Read the number of hardware queues from frontend. + */ + err = xenbus_scanf(XBT_NIL, dev->otherend, "multi-queue-num-queues", + "%u", &requested_num_queues); + if (err < 0) { + requested_num_queues = 1; + } else { + if (requested_num_queues > xenblk_max_queues + || requested_num_queues == 0) { + /* Buggy or malicious guest. */ + xenbus_dev_fatal(dev, err, + "guest requested %u queues, exceeding the maximum of %u.", + requested_num_queues, xenblk_max_queues); + return -ENOSYS; + } + } + be->blkif->nr_rings = requested_num_queues; + if (xen_blkif_alloc_rings(be->blkif)) + return -ENOMEM; + + pr_info("%s: using %d queues, protocol %d (%s) %s\n", dev->nodename, + be->blkif->nr_rings, be->blkif->blk_protocol, protocol, + pers_grants ? 
"persistent grants" : ""); + + if (be->blkif->nr_rings == 1) + return read_per_ring_refs(&be->blkif->rings[0], dev->otherend); + else { + xspathsize = strlen(dev->otherend) + xenstore_path_ext_size; + xspath = kmalloc(xspathsize, GFP_KERNEL); + if (!xspath) { + xenbus_dev_fatal(dev, -ENOMEM, "reading ring references"); + return -ENOMEM; + } + + for (i = 0; i < be->blkif->nr_rings; i++) { + memset(xspath, 0, xspathsize); + snprintf(xspath, xspathsize, "%s/queue-%u", dev->otherend, i); + err = read_per_ring_refs(&be->blkif->rings[i], xspath); + if (err) { + kfree(xspath); + return err; + } + } + kfree(xspath); + } + return 0; } static const struct xenbus_device_id xen_blkbk_ids[] = { diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 2fee2eef988d..8a8dc91c39f7 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -60,6 +60,20 @@ #include <asm/xen/hypervisor.h> +/* + * The minimal size of segment supported by the block framework is PAGE_SIZE. + * When Linux is using a different page size than Xen, it may not be possible + * to put all the data in a single segment. + * This can happen when the backend doesn't support indirect descriptor and + * therefore the maximum amount of data that a request can carry is + * BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE = 44KB + * + * Note that we only support one extra request. So the Linux page size + * should be <= ( 2 * BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE) = + * 88KB. + */ +#define HAS_EXTRA_REQ (BLKIF_MAX_SEGMENTS_PER_REQUEST < XEN_PFN_PER_PAGE) + enum blkif_state { BLKIF_STATE_DISCONNECTED, BLKIF_STATE_CONNECTED, @@ -72,6 +86,13 @@ struct grant { struct list_head node; }; +enum blk_req_status { + REQ_WAITING, + REQ_DONE, + REQ_ERROR, + REQ_EOPNOTSUPP, +}; + struct blk_shadow { struct blkif_request req; struct request *request; @@ -79,6 +100,14 @@ struct blk_shadow { struct grant **indirect_grants; struct scatterlist *sg; unsigned int num_sg; + enum blk_req_status status; + + #define NO_ASSOCIATED_ID ~0UL + /* + * Id of the sibling if we ever need 2 requests when handling a + * block I/O request + */ + unsigned long associated_id; }; struct split_bio { @@ -99,6 +128,10 @@ static unsigned int xen_blkif_max_segments = 32; module_param_named(max, xen_blkif_max_segments, int, S_IRUGO); MODULE_PARM_DESC(max, "Maximum amount of segments in indirect requests (default is 32)"); +static unsigned int xen_blkif_max_queues = 4; +module_param_named(max_queues, xen_blkif_max_queues, uint, S_IRUGO); +MODULE_PARM_DESC(max_queues, "Maximum number of hardware queues/rings used per virtual disk"); + /* * Maximum order of pages to be used for the shared ring between front and * backend, 4KB page granularity is used. @@ -114,10 +147,35 @@ MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the __CONST_RING_SIZE(blkif, XEN_PAGE_SIZE * XENBUS_MAX_RING_GRANTS) /* - * ring-ref%i i=(-1UL) would take 11 characters + 'ring-ref' is 8, so 19 - * characters are enough. Define to 20 to keep consist with backend. + * ring-ref%u i=(-1UL) would take 11 characters + 'ring-ref' is 8, so 19 + * characters are enough. Define to 20 to keep consistent with backend. */ #define RINGREF_NAME_LEN (20) +/* + * queue-%u would take 7 + 10(UINT_MAX) = 17 characters. + */ +#define QUEUE_NAME_LEN (17) + +/* + * Per-ring info. + * Every blkfront device can associate with one or more blkfront_ring_info, + * depending on how many hardware queues/rings to be used. 
+ */ +struct blkfront_ring_info { + /* Lock to protect data in every ring buffer. */ + spinlock_t ring_lock; + struct blkif_front_ring ring; + unsigned int ring_ref[XENBUS_MAX_RING_GRANTS]; + unsigned int evtchn, irq; + struct work_struct work; + struct gnttab_free_callback callback; + struct blk_shadow shadow[BLK_MAX_RING_SIZE]; + struct list_head indirect_pages; + struct list_head grants; + unsigned int persistent_gnts_c; + unsigned long shadow_free; + struct blkfront_info *dev_info; +}; /* * We have one of these per vbd, whether ide, scsi or 'other'. They @@ -126,25 +184,15 @@ MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the */ struct blkfront_info { - spinlock_t io_lock; struct mutex mutex; struct xenbus_device *xbdev; struct gendisk *gd; int vdevice; blkif_vdev_t handle; enum blkif_state connected; - int ring_ref[XENBUS_MAX_RING_GRANTS]; + /* Number of pages per ring buffer. */ unsigned int nr_ring_pages; - struct blkif_front_ring ring; - unsigned int evtchn, irq; struct request_queue *rq; - struct work_struct work; - struct gnttab_free_callback callback; - struct blk_shadow shadow[BLK_MAX_RING_SIZE]; - struct list_head grants; - struct list_head indirect_pages; - unsigned int persistent_gnts_c; - unsigned long shadow_free; unsigned int feature_flush; unsigned int feature_discard:1; unsigned int feature_secdiscard:1; @@ -155,6 +203,8 @@ struct blkfront_info unsigned int max_indirect_segments; int is_ready; struct blk_mq_tag_set tag_set; + struct blkfront_ring_info *rinfo; + unsigned int nr_rings; }; static unsigned int nr_minors; @@ -198,38 +248,40 @@ static DEFINE_SPINLOCK(minor_lock); #define GREFS(_psegs) ((_psegs) * GRANTS_PER_PSEG) -static int blkfront_setup_indirect(struct blkfront_info *info); -static int blkfront_gather_backend_features(struct blkfront_info *info); +static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo); +static void blkfront_gather_backend_features(struct blkfront_info *info); -static int get_id_from_freelist(struct blkfront_info *info) +static int get_id_from_freelist(struct blkfront_ring_info *rinfo) { - unsigned long free = info->shadow_free; - BUG_ON(free >= BLK_RING_SIZE(info)); - info->shadow_free = info->shadow[free].req.u.rw.id; - info->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */ + unsigned long free = rinfo->shadow_free; + + BUG_ON(free >= BLK_RING_SIZE(rinfo->dev_info)); + rinfo->shadow_free = rinfo->shadow[free].req.u.rw.id; + rinfo->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */ return free; } -static int add_id_to_freelist(struct blkfront_info *info, - unsigned long id) +static int add_id_to_freelist(struct blkfront_ring_info *rinfo, + unsigned long id) { - if (info->shadow[id].req.u.rw.id != id) + if (rinfo->shadow[id].req.u.rw.id != id) return -EINVAL; - if (info->shadow[id].request == NULL) + if (rinfo->shadow[id].request == NULL) return -EINVAL; - info->shadow[id].req.u.rw.id = info->shadow_free; - info->shadow[id].request = NULL; - info->shadow_free = id; + rinfo->shadow[id].req.u.rw.id = rinfo->shadow_free; + rinfo->shadow[id].request = NULL; + rinfo->shadow_free = id; return 0; } -static int fill_grant_buffer(struct blkfront_info *info, int num) +static int fill_grant_buffer(struct blkfront_ring_info *rinfo, int num) { + struct blkfront_info *info = rinfo->dev_info; struct page *granted_page; struct grant *gnt_list_entry, *n; int i = 0; - while(i < num) { + while (i < num) { gnt_list_entry = kzalloc(sizeof(struct grant), GFP_NOIO); if (!gnt_list_entry) goto out_of_memory; @@ -244,7 
+296,7 @@ static int fill_grant_buffer(struct blkfront_info *info, int num) } gnt_list_entry->gref = GRANT_INVALID_REF; - list_add(&gnt_list_entry->node, &info->grants); + list_add(&gnt_list_entry->node, &rinfo->grants); i++; } @@ -252,7 +304,7 @@ static int fill_grant_buffer(struct blkfront_info *info, int num) out_of_memory: list_for_each_entry_safe(gnt_list_entry, n, - &info->grants, node) { + &rinfo->grants, node) { list_del(&gnt_list_entry->node); if (info->feature_persistent) __free_page(gnt_list_entry->page); @@ -263,17 +315,17 @@ out_of_memory: return -ENOMEM; } -static struct grant *get_free_grant(struct blkfront_info *info) +static struct grant *get_free_grant(struct blkfront_ring_info *rinfo) { struct grant *gnt_list_entry; - BUG_ON(list_empty(&info->grants)); - gnt_list_entry = list_first_entry(&info->grants, struct grant, + BUG_ON(list_empty(&rinfo->grants)); + gnt_list_entry = list_first_entry(&rinfo->grants, struct grant, node); list_del(&gnt_list_entry->node); if (gnt_list_entry->gref != GRANT_INVALID_REF) - info->persistent_gnts_c--; + rinfo->persistent_gnts_c--; return gnt_list_entry; } @@ -289,9 +341,10 @@ static inline void grant_foreign_access(const struct grant *gnt_list_entry, static struct grant *get_grant(grant_ref_t *gref_head, unsigned long gfn, - struct blkfront_info *info) + struct blkfront_ring_info *rinfo) { - struct grant *gnt_list_entry = get_free_grant(info); + struct grant *gnt_list_entry = get_free_grant(rinfo); + struct blkfront_info *info = rinfo->dev_info; if (gnt_list_entry->gref != GRANT_INVALID_REF) return gnt_list_entry; @@ -312,9 +365,10 @@ static struct grant *get_grant(grant_ref_t *gref_head, } static struct grant *get_indirect_grant(grant_ref_t *gref_head, - struct blkfront_info *info) + struct blkfront_ring_info *rinfo) { - struct grant *gnt_list_entry = get_free_grant(info); + struct grant *gnt_list_entry = get_free_grant(rinfo); + struct blkfront_info *info = rinfo->dev_info; if (gnt_list_entry->gref != GRANT_INVALID_REF) return gnt_list_entry; @@ -326,8 +380,8 @@ static struct grant *get_indirect_grant(grant_ref_t *gref_head, struct page *indirect_page; /* Fetch a pre-allocated page to use for indirect grefs */ - BUG_ON(list_empty(&info->indirect_pages)); - indirect_page = list_first_entry(&info->indirect_pages, + BUG_ON(list_empty(&rinfo->indirect_pages)); + indirect_page = list_first_entry(&rinfo->indirect_pages, struct page, lru); list_del(&indirect_page->lru); gnt_list_entry->page = indirect_page; @@ -403,8 +457,8 @@ static void xlbd_release_minors(unsigned int minor, unsigned int nr) static void blkif_restart_queue_callback(void *arg) { - struct blkfront_info *info = (struct blkfront_info *)arg; - schedule_work(&info->work); + struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)arg; + schedule_work(&rinfo->work); } static int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg) @@ -456,16 +510,33 @@ static int blkif_ioctl(struct block_device *bdev, fmode_t mode, return 0; } -static int blkif_queue_discard_req(struct request *req) +static unsigned long blkif_ring_get_request(struct blkfront_ring_info *rinfo, + struct request *req, + struct blkif_request **ring_req) { - struct blkfront_info *info = req->rq_disk->private_data; + unsigned long id; + + *ring_req = RING_GET_REQUEST(&rinfo->ring, rinfo->ring.req_prod_pvt); + rinfo->ring.req_prod_pvt++; + + id = get_id_from_freelist(rinfo); + rinfo->shadow[id].request = req; + rinfo->shadow[id].status = REQ_WAITING; + rinfo->shadow[id].associated_id = NO_ASSOCIATED_ID; + 
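A note on the free-list idiom used by get_id_from_freelist()/add_id_to_freelist() above: the driver threads a free list through the otherwise-unused request-id field of each shadow slot, so taking or returning a slot needs no allocation on the I/O path. A self-contained sketch of the same technique, with simplified types assumed purely for illustration:

#include <assert.h>
#include <stdio.h>

#define RING_SIZE 8

struct slot {
	unsigned long next_free; /* overlays the request id while the slot is free */
	void *request;           /* NULL while the slot is free */
};

static struct slot shadow[RING_SIZE];
static unsigned long shadow_free;

static void freelist_init(void)
{
	unsigned long i;

	for (i = 0; i < RING_SIZE; i++)
		shadow[i].next_free = i + 1; /* the last entry points past the end */
	shadow_free = 0;
}

static unsigned long get_id(void *req)
{
	unsigned long free = shadow_free;

	assert(free < RING_SIZE); /* mirrors the BUG_ON(): never exhaust the ring */
	shadow_free = shadow[free].next_free;
	shadow[free].request = req;
	return free;
}

static void put_id(unsigned long id)
{
	shadow[id].request = NULL;
	shadow[id].next_free = shadow_free; /* push the slot back on the list head */
	shadow_free = id;
}

int main(void)
{
	int dummy;
	unsigned long a, b;

	freelist_init();
	a = get_id(&dummy);
	b = get_id(&dummy);
	put_id(a);
	printf("got %lu and %lu, next free is %lu\n", a, b, shadow_free);
	return 0;
}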
+ (*ring_req)->u.rw.id = id; + + return id; +} + +static int blkif_queue_discard_req(struct request *req, struct blkfront_ring_info *rinfo) +{ + struct blkfront_info *info = rinfo->dev_info; struct blkif_request *ring_req; unsigned long id; /* Fill out a communications ring structure. */ - ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); - id = get_id_from_freelist(info); - info->shadow[id].request = req; + id = blkif_ring_get_request(rinfo, req, &ring_req); ring_req->operation = BLKIF_OP_DISCARD; ring_req->u.discard.nr_sectors = blk_rq_sectors(req); @@ -476,10 +547,8 @@ static int blkif_queue_discard_req(struct request *req) else ring_req->u.discard.flag = 0; - info->ring.req_prod_pvt++; - /* Keep a private copy so we can reissue requests when recovering. */ - info->shadow[id].req = *ring_req; + rinfo->shadow[id].req = *ring_req; return 0; } @@ -487,7 +556,7 @@ static int blkif_queue_discard_req(struct request *req) struct setup_rw_req { unsigned int grant_idx; struct blkif_request_segment *segments; - struct blkfront_info *info; + struct blkfront_ring_info *rinfo; struct blkif_request *ring_req; grant_ref_t gref_head; unsigned int id; @@ -495,6 +564,9 @@ struct setup_rw_req { bool need_copy; unsigned int bvec_off; char *bvec_data; + + bool require_extra_req; + struct blkif_request *extra_ring_req; }; static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset, @@ -507,8 +579,24 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset, /* Convenient aliases */ unsigned int grant_idx = setup->grant_idx; struct blkif_request *ring_req = setup->ring_req; - struct blkfront_info *info = setup->info; - struct blk_shadow *shadow = &info->shadow[setup->id]; + struct blkfront_ring_info *rinfo = setup->rinfo; + /* + * We always use the shadow of the first request to store the list + * of grants associated with the block I/O request. This makes the + * completion easier to handle even if the block I/O request is + * split. + */ + struct blk_shadow *shadow = &rinfo->shadow[setup->id]; + + if (unlikely(setup->require_extra_req && + grant_idx >= BLKIF_MAX_SEGMENTS_PER_REQUEST)) { + /* + * We are using the second request; set grant_idx + * to be the index into its segment array. + */ + grant_idx -= BLKIF_MAX_SEGMENTS_PER_REQUEST; + ring_req = setup->extra_ring_req; + } if ((ring_req->operation == BLKIF_OP_INDIRECT) && (grant_idx % GRANTS_PER_INDIRECT_FRAME == 0)) { @@ -516,15 +604,19 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset, kunmap_atomic(setup->segments); n = grant_idx / GRANTS_PER_INDIRECT_FRAME; - gnt_list_entry = get_indirect_grant(&setup->gref_head, info); + gnt_list_entry = get_indirect_grant(&setup->gref_head, rinfo); shadow->indirect_grants[n] = gnt_list_entry; setup->segments = kmap_atomic(gnt_list_entry->page); ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref; } - gnt_list_entry = get_grant(&setup->gref_head, gfn, info); + gnt_list_entry = get_grant(&setup->gref_head, gfn, rinfo); ref = gnt_list_entry->gref; - shadow->grants_used[grant_idx] = gnt_list_entry; + /* + * All the grants are stored in the shadow of the first + * request. Therefore we have to use the global index.
+ */ + shadow->grants_used[setup->grant_idx] = gnt_list_entry; if (setup->need_copy) { void *shared_data; @@ -566,16 +658,36 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset, (setup->grant_idx)++; } -static int blkif_queue_rw_req(struct request *req) +static void blkif_setup_extra_req(struct blkif_request *first, + struct blkif_request *second) { - struct blkfront_info *info = req->rq_disk->private_data; - struct blkif_request *ring_req; - unsigned long id; + uint16_t nr_segments = first->u.rw.nr_segments; + + /* + * The second request is only present when the first request uses + * all its segments. It is always a continuation of the first one. + */ + first->u.rw.nr_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST; + + second->u.rw.nr_segments = nr_segments - BLKIF_MAX_SEGMENTS_PER_REQUEST; + second->u.rw.sector_number = first->u.rw.sector_number + + (BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE) / 512; + + second->u.rw.handle = first->u.rw.handle; + second->operation = first->operation; +} + +static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *rinfo) +{ + struct blkfront_info *info = rinfo->dev_info; + struct blkif_request *ring_req, *extra_ring_req = NULL; + unsigned long id, extra_id = NO_ASSOCIATED_ID; + bool require_extra_req = false; int i; struct setup_rw_req setup = { .grant_idx = 0, .segments = NULL, - .info = info, + .rinfo = rinfo, .need_copy = rq_data_dir(req) && info->feature_persistent, }; @@ -584,7 +696,6 @@ static int blkif_queue_rw_req(struct request *req) * existing persistent grants, or if we have to get new grants, * as there are not sufficiently many free. */ - bool new_persistent_gnts; struct scatterlist *sg; int num_sg, max_grefs, num_grant; @@ -596,41 +707,36 @@ */ max_grefs += INDIRECT_GREFS(max_grefs); - /* Check if we have enough grants to allocate a requests */ - if (info->persistent_gnts_c < max_grefs) { - new_persistent_gnts = 1; - if (gnttab_alloc_grant_references( - max_grefs - info->persistent_gnts_c, - &setup.gref_head) < 0) { + /* + * We have to reserve 'max_grefs' grants because persistent + * grants are shared by all rings. + */ + if (max_grefs > 0) + if (gnttab_alloc_grant_references(max_grefs, &setup.gref_head) < 0) { gnttab_request_free_callback( - &info->callback, + &rinfo->callback, blkif_restart_queue_callback, - info, + rinfo, max_grefs); return 1; } - } else - new_persistent_gnts = 0; /* Fill out a communications ring structure.
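blkif_setup_extra_req() above only fixes up segment counts and the starting sector of the second request. A worked example of that arithmetic, with hypothetical numbers (4KB Xen pages and 11 segments per request assumed as stand-ins for XEN_PAGE_SIZE and BLKIF_MAX_SEGMENTS_PER_REQUEST):

#include <stdio.h>

#define XEN_PAGE_SIZE 4096UL
#define MAX_SEGS      11U /* stand-in for BLKIF_MAX_SEGMENTS_PER_REQUEST */

int main(void)
{
	/* A 64KB I/O needs 16 Xen-page segments: 11 fit in the first
	 * request, the remaining 5 go into the second one. */
	unsigned int nr_segments = 16;
	unsigned long long first_sector = 2048;

	unsigned int first_segs = MAX_SEGS;
	unsigned int second_segs = nr_segments - MAX_SEGS;

	/* The second request continues right after the first one's data:
	 * 11 segments * 4096 bytes / 512 bytes per sector = 88 sectors. */
	unsigned long long second_sector =
		first_sector + (MAX_SEGS * XEN_PAGE_SIZE) / 512;

	printf("first:  %u segments starting at sector %llu\n",
	       first_segs, first_sector);
	printf("second: %u segments starting at sector %llu\n",
	       second_segs, second_sector);
	return 0;
}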
*/ - ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); - id = get_id_from_freelist(info); - info->shadow[id].request = req; - - BUG_ON(info->max_indirect_segments == 0 && - GREFS(req->nr_phys_segments) > BLKIF_MAX_SEGMENTS_PER_REQUEST); - BUG_ON(info->max_indirect_segments && - GREFS(req->nr_phys_segments) > info->max_indirect_segments); + id = blkif_ring_get_request(rinfo, req, &ring_req); - num_sg = blk_rq_map_sg(req->q, req, info->shadow[id].sg); + num_sg = blk_rq_map_sg(req->q, req, rinfo->shadow[id].sg); num_grant = 0; /* Calculate the number of grant used */ - for_each_sg(info->shadow[id].sg, sg, num_sg, i) + for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i) num_grant += gnttab_count_grant(sg->offset, sg->length); - ring_req->u.rw.id = id; - info->shadow[id].num_sg = num_sg; - if (num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST) { + require_extra_req = info->max_indirect_segments == 0 && + num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST; + BUG_ON(!HAS_EXTRA_REQ && require_extra_req); + + rinfo->shadow[id].num_sg = num_sg; + if (num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST && + likely(!require_extra_req)) { /* * The indirect operation can only be a BLKIF_OP_READ or * BLKIF_OP_WRITE @@ -670,11 +776,31 @@ static int blkif_queue_rw_req(struct request *req) } } ring_req->u.rw.nr_segments = num_grant; + if (unlikely(require_extra_req)) { + extra_id = blkif_ring_get_request(rinfo, req, + &extra_ring_req); + /* + * Only the first request contains the scatter-gather + * list. + */ + rinfo->shadow[extra_id].num_sg = 0; + + blkif_setup_extra_req(ring_req, extra_ring_req); + + /* Link the 2 requests together */ + rinfo->shadow[extra_id].associated_id = id; + rinfo->shadow[id].associated_id = extra_id; + } } setup.ring_req = ring_req; setup.id = id; - for_each_sg(info->shadow[id].sg, sg, num_sg, i) { + + setup.require_extra_req = require_extra_req; + if (unlikely(require_extra_req)) + setup.extra_ring_req = extra_ring_req; + + for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i) { BUG_ON(sg->offset + sg->length > PAGE_SIZE); if (setup.need_copy) { @@ -694,12 +820,12 @@ static int blkif_queue_rw_req(struct request *req) if (setup.segments) kunmap_atomic(setup.segments); - info->ring.req_prod_pvt++; - /* Keep a private copy so we can reissue requests when recovering. 
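The require_extra_req computation above can only come out true when HAS_EXTRA_REQ holds, i.e. when a single ring request cannot map a whole Linux page. A quick check of the 44KB/88KB arithmetic from the header comment, using hypothetical values for a 64KB-page kernel (not part of the patch):

#include <stdio.h>

#define XEN_PAGE_SIZE    4096UL  /* Xen always uses 4KB pages */
#define MAX_SEGS         11UL    /* stand-in for BLKIF_MAX_SEGMENTS_PER_REQUEST */
#define LINUX_PAGE_SIZE  65536UL /* e.g. a 64KB-page arm64 kernel */
#define XEN_PFN_PER_PAGE (LINUX_PAGE_SIZE / XEN_PAGE_SIZE)

/* Mirrors HAS_EXTRA_REQ: a single request cannot map a whole Linux page. */
#define HAS_EXTRA_REQ (MAX_SEGS < XEN_PFN_PER_PAGE)

int main(void)
{
	printf("one request carries %lu bytes (44KB)\n", MAX_SEGS * XEN_PAGE_SIZE);
	printf("two requests carry %lu bytes (88KB)\n", 2 * MAX_SEGS * XEN_PAGE_SIZE);
	printf("extra request needed for %luKB pages: %s\n",
	       LINUX_PAGE_SIZE / 1024, HAS_EXTRA_REQ ? "yes" : "no");
	return 0;
}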
*/ - info->shadow[id].req = *ring_req; + rinfo->shadow[id].req = *ring_req; + if (unlikely(require_extra_req)) + rinfo->shadow[extra_id].req = *extra_ring_req; - if (new_persistent_gnts) + if (max_grefs > 0) gnttab_free_grant_references(setup.gref_head); return 0; @@ -711,27 +837,25 @@ static int blkif_queue_rw_req(struct request *req) * * @req: a request struct */ -static int blkif_queue_request(struct request *req) +static int blkif_queue_request(struct request *req, struct blkfront_ring_info *rinfo) { - struct blkfront_info *info = req->rq_disk->private_data; - - if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) + if (unlikely(rinfo->dev_info->connected != BLKIF_STATE_CONNECTED)) return 1; if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) - return blkif_queue_discard_req(req); + return blkif_queue_discard_req(req, rinfo); else - return blkif_queue_rw_req(req); + return blkif_queue_rw_req(req, rinfo); } -static inline void flush_requests(struct blkfront_info *info) +static inline void flush_requests(struct blkfront_ring_info *rinfo) { int notify; - RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify); + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&rinfo->ring, notify); if (notify) - notify_remote_via_irq(info->irq); + notify_remote_via_irq(rinfo->irq); } static inline bool blkif_request_flush_invalid(struct request *req, @@ -745,38 +869,50 @@ } static int blkif_queue_rq(struct blk_mq_hw_ctx *hctx, - const struct blk_mq_queue_data *qd) + const struct blk_mq_queue_data *qd) { - struct blkfront_info *info = qd->rq->rq_disk->private_data; + unsigned long flags; + struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)hctx->driver_data; blk_mq_start_request(qd->rq); - spin_lock_irq(&info->io_lock); - if (RING_FULL(&info->ring)) + spin_lock_irqsave(&rinfo->ring_lock, flags); + if (RING_FULL(&rinfo->ring)) goto out_busy; - if (blkif_request_flush_invalid(qd->rq, info)) + if (blkif_request_flush_invalid(qd->rq, rinfo->dev_info)) goto out_err; - if (blkif_queue_request(qd->rq)) + if (blkif_queue_request(qd->rq, rinfo)) goto out_busy; - flush_requests(info); - spin_unlock_irq(&info->io_lock); + flush_requests(rinfo); + spin_unlock_irqrestore(&rinfo->ring_lock, flags); return BLK_MQ_RQ_QUEUE_OK; out_err: - spin_unlock_irq(&info->io_lock); + spin_unlock_irqrestore(&rinfo->ring_lock, flags); return BLK_MQ_RQ_QUEUE_ERROR; out_busy: - spin_unlock_irq(&info->io_lock); + spin_unlock_irqrestore(&rinfo->ring_lock, flags); blk_mq_stop_hw_queue(hctx); return BLK_MQ_RQ_QUEUE_BUSY; } +static int blk_mq_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int index) +{ + struct blkfront_info *info = (struct blkfront_info *)data; + + BUG_ON(info->nr_rings <= index); + hctx->driver_data = &info->rinfo[index]; + return 0; +} + static struct blk_mq_ops blkfront_mq_ops = { .queue_rq = blkif_queue_rq, .map_queue = blk_mq_map_queue, + .init_hctx = blk_mq_init_hctx, }; static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, @@ -788,19 +924,28 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, memset(&info->tag_set, 0, sizeof(info->tag_set)); info->tag_set.ops = &blkfront_mq_ops; - info->tag_set.nr_hw_queues = 1; - info->tag_set.queue_depth = BLK_RING_SIZE(info); + info->tag_set.nr_hw_queues = info->nr_rings; + if (HAS_EXTRA_REQ && info->max_indirect_segments == 0) { + /* + * When indirect descriptors are not supported, the I/O request + * will be split between multiple requests in the ring.
+ * To avoid problems when sending the request, halve the + * depth of the queue. + */ + info->tag_set.queue_depth = BLK_RING_SIZE(info) / 2; + } else + info->tag_set.queue_depth = BLK_RING_SIZE(info); info->tag_set.numa_node = NUMA_NO_NODE; info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; info->tag_set.cmd_size = 0; info->tag_set.driver_data = info; if (blk_mq_alloc_tag_set(&info->tag_set)) - return -1; + return -EINVAL; rq = blk_mq_init_queue(&info->tag_set); if (IS_ERR(rq)) { blk_mq_free_tag_set(&info->tag_set); - return -1; + return PTR_ERR(rq); } queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq); @@ -1028,7 +1173,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, static void xlvbd_release_gendisk(struct blkfront_info *info) { - unsigned int minor, nr_minors; + unsigned int minor, nr_minors, i; if (info->rq == NULL) return; @@ -1036,11 +1181,15 @@ static void xlvbd_release_gendisk(struct blkfront_info *info) /* No more blkif_request(). */ blk_mq_stop_hw_queues(info->rq); - /* No more gnttab callback work. */ - gnttab_cancel_free_callback(&info->callback); + for (i = 0; i < info->nr_rings; i++) { + struct blkfront_ring_info *rinfo = &info->rinfo[i]; - /* Flush gnttab callback work. Must be done with no locks held. */ - flush_work(&info->work); + /* No more gnttab callback work. */ + gnttab_cancel_free_callback(&rinfo->callback); + + /* Flush gnttab callback work. Must be done with no locks held. */ + flush_work(&rinfo->work); + } del_gendisk(info->gd); @@ -1056,88 +1205,87 @@ static void xlvbd_release_gendisk(struct blkfront_info *info) info->gd = NULL; } -/* Must be called with io_lock holded */ -static void kick_pending_request_queues(struct blkfront_info *info) +/* The caller must already hold rinfo->ring_lock. */ +static inline void kick_pending_request_queues_locked(struct blkfront_ring_info *rinfo) { - if (!RING_FULL(&info->ring)) - blk_mq_start_stopped_hw_queues(info->rq, true); + if (!RING_FULL(&rinfo->ring)) + blk_mq_start_stopped_hw_queues(rinfo->dev_info->rq, true); } -static void blkif_restart_queue(struct work_struct *work) +static void kick_pending_request_queues(struct blkfront_ring_info *rinfo) { - struct blkfront_info *info = container_of(work, struct blkfront_info, work); + unsigned long flags; - spin_lock_irq(&info->io_lock); - if (info->connected == BLKIF_STATE_CONNECTED) - kick_pending_request_queues(info); - spin_unlock_irq(&info->io_lock); + spin_lock_irqsave(&rinfo->ring_lock, flags); + kick_pending_request_queues_locked(rinfo); + spin_unlock_irqrestore(&rinfo->ring_lock, flags); } -static void blkif_free(struct blkfront_info *info, int suspend) +static void blkif_restart_queue(struct work_struct *work) { - struct grant *persistent_gnt; - struct grant *n; - int i, j, segs; + struct blkfront_ring_info *rinfo = container_of(work, struct blkfront_ring_info, work); - /* Prevent new requests being issued until we fix things up. */ - spin_lock_irq(&info->io_lock); - info->connected = suspend ? - BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED; - /* No more blkif_request().
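The halved queue depth set above keeps blk-mq from ever admitting more requests than the ring can hold when each I/O may occupy two slots. A quick sanity check of the arithmetic with a hypothetical 32-slot (single-page) ring:

#include <stdio.h>

int main(void)
{
	unsigned int ring_size = 32;              /* hypothetical one-page ring */
	unsigned int queue_depth = ring_size / 2; /* 16 in-flight requests */

	/* Worst case: every in-flight request is split in two, so 16
	 * requests consume at most 16 * 2 = 32 ring slots and the ring
	 * can never overflow. */
	printf("depth %u -> worst case %u of %u slots\n",
	       queue_depth, queue_depth * 2, ring_size);
	return 0;
}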
*/ - if (info->rq) - blk_mq_stop_hw_queues(info->rq); + if (rinfo->dev_info->connected == BLKIF_STATE_CONNECTED) + kick_pending_request_queues(rinfo); +} - /* Remove all persistent grants */ - if (!list_empty(&info->grants)) { - list_for_each_entry_safe(persistent_gnt, n, - &info->grants, node) { - list_del(&persistent_gnt->node); - if (persistent_gnt->gref != GRANT_INVALID_REF) { - gnttab_end_foreign_access(persistent_gnt->gref, - 0, 0UL); - info->persistent_gnts_c--; - } - if (info->feature_persistent) - __free_page(persistent_gnt->page); - kfree(persistent_gnt); - } - } - BUG_ON(info->persistent_gnts_c != 0); +static void blkif_free_ring(struct blkfront_ring_info *rinfo) +{ + struct grant *persistent_gnt, *n; + struct blkfront_info *info = rinfo->dev_info; + int i, j, segs; /* * Remove indirect pages, this only happens when using indirect * descriptors but not persistent grants */ - if (!list_empty(&info->indirect_pages)) { + if (!list_empty(&rinfo->indirect_pages)) { struct page *indirect_page, *n; BUG_ON(info->feature_persistent); - list_for_each_entry_safe(indirect_page, n, &info->indirect_pages, lru) { + list_for_each_entry_safe(indirect_page, n, &rinfo->indirect_pages, lru) { list_del(&indirect_page->lru); __free_page(indirect_page); } } + /* Remove all persistent grants. */ + if (!list_empty(&rinfo->grants)) { + list_for_each_entry_safe(persistent_gnt, n, + &rinfo->grants, node) { + list_del(&persistent_gnt->node); + if (persistent_gnt->gref != GRANT_INVALID_REF) { + gnttab_end_foreign_access(persistent_gnt->gref, + 0, 0UL); + rinfo->persistent_gnts_c--; + } + if (info->feature_persistent) + __free_page(persistent_gnt->page); + kfree(persistent_gnt); + } + } + BUG_ON(rinfo->persistent_gnts_c != 0); + for (i = 0; i < BLK_RING_SIZE(info); i++) { /* * Clear persistent grants present in requests already * on the shared ring */ - if (!info->shadow[i].request) + if (!rinfo->shadow[i].request) goto free_shadow; - segs = info->shadow[i].req.operation == BLKIF_OP_INDIRECT ? - info->shadow[i].req.u.indirect.nr_segments : - info->shadow[i].req.u.rw.nr_segments; + segs = rinfo->shadow[i].req.operation == BLKIF_OP_INDIRECT ? 
+ rinfo->shadow[i].req.u.indirect.nr_segments : + rinfo->shadow[i].req.u.rw.nr_segments; for (j = 0; j < segs; j++) { - persistent_gnt = info->shadow[i].grants_used[j]; + persistent_gnt = rinfo->shadow[i].grants_used[j]; gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); if (info->feature_persistent) __free_page(persistent_gnt->page); kfree(persistent_gnt); } - if (info->shadow[i].req.operation != BLKIF_OP_INDIRECT) + if (rinfo->shadow[i].req.operation != BLKIF_OP_INDIRECT) /* * If this is not an indirect operation don't try to * free indirect segments @@ -1145,42 +1293,59 @@ static void blkif_free(struct blkfront_info *info, int suspend) goto free_shadow; for (j = 0; j < INDIRECT_GREFS(segs); j++) { - persistent_gnt = info->shadow[i].indirect_grants[j]; + persistent_gnt = rinfo->shadow[i].indirect_grants[j]; gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); __free_page(persistent_gnt->page); kfree(persistent_gnt); } free_shadow: - kfree(info->shadow[i].grants_used); - info->shadow[i].grants_used = NULL; - kfree(info->shadow[i].indirect_grants); - info->shadow[i].indirect_grants = NULL; - kfree(info->shadow[i].sg); - info->shadow[i].sg = NULL; + kfree(rinfo->shadow[i].grants_used); + rinfo->shadow[i].grants_used = NULL; + kfree(rinfo->shadow[i].indirect_grants); + rinfo->shadow[i].indirect_grants = NULL; + kfree(rinfo->shadow[i].sg); + rinfo->shadow[i].sg = NULL; } /* No more gnttab callback work. */ - gnttab_cancel_free_callback(&info->callback); - spin_unlock_irq(&info->io_lock); + gnttab_cancel_free_callback(&rinfo->callback); /* Flush gnttab callback work. Must be done with no locks held. */ - flush_work(&info->work); + flush_work(&rinfo->work); /* Free resources associated with old device channel. */ for (i = 0; i < info->nr_ring_pages; i++) { - if (info->ring_ref[i] != GRANT_INVALID_REF) { - gnttab_end_foreign_access(info->ring_ref[i], 0, 0); - info->ring_ref[i] = GRANT_INVALID_REF; + if (rinfo->ring_ref[i] != GRANT_INVALID_REF) { + gnttab_end_foreign_access(rinfo->ring_ref[i], 0, 0); + rinfo->ring_ref[i] = GRANT_INVALID_REF; } } - free_pages((unsigned long)info->ring.sring, get_order(info->nr_ring_pages * PAGE_SIZE)); - info->ring.sring = NULL; + free_pages((unsigned long)rinfo->ring.sring, get_order(info->nr_ring_pages * PAGE_SIZE)); + rinfo->ring.sring = NULL; - if (info->irq) - unbind_from_irqhandler(info->irq, info); - info->evtchn = info->irq = 0; + if (rinfo->irq) + unbind_from_irqhandler(rinfo->irq, rinfo); + rinfo->evtchn = rinfo->irq = 0; +} +static void blkif_free(struct blkfront_info *info, int suspend) +{ + unsigned int i; + + /* Prevent new requests being issued until we fix things up. */ + info->connected = suspend ? + BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED; + /* No more blkif_request(). */ + if (info->rq) + blk_mq_stop_hw_queues(info->rq); + + for (i = 0; i < info->nr_rings; i++) + blkif_free_ring(&info->rinfo[i]); + + kfree(info->rinfo); + info->rinfo = NULL; + info->nr_rings = 0; } struct copy_from_grant { @@ -1209,19 +1374,93 @@ static void blkif_copy_from_grant(unsigned long gfn, unsigned int offset, kunmap_atomic(shared_data); } -static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, +static enum blk_req_status blkif_rsp_to_req_status(int rsp) +{ + switch (rsp) + { + case BLKIF_RSP_OKAY: + return REQ_DONE; + case BLKIF_RSP_EOPNOTSUPP: + return REQ_EOPNOTSUPP; + case BLKIF_RSP_ERROR: + /* Fallthrough. 
*/ + default: + return REQ_ERROR; + } +} + +/* + * Get the final status of the block request based on the two ring responses + */ +static int blkif_get_final_status(enum blk_req_status s1, + enum blk_req_status s2) +{ + BUG_ON(s1 == REQ_WAITING); + BUG_ON(s2 == REQ_WAITING); + + if (s1 == REQ_ERROR || s2 == REQ_ERROR) + return BLKIF_RSP_ERROR; + else if (s1 == REQ_EOPNOTSUPP || s2 == REQ_EOPNOTSUPP) + return BLKIF_RSP_EOPNOTSUPP; + return BLKIF_RSP_OKAY; +} + +static bool blkif_completion(unsigned long *id, + struct blkfront_ring_info *rinfo, struct blkif_response *bret) { int i = 0; struct scatterlist *sg; int num_sg, num_grant; + struct blkfront_info *info = rinfo->dev_info; + struct blk_shadow *s = &rinfo->shadow[*id]; struct copy_from_grant data = { - .s = s, .grant_idx = 0, }; num_grant = s->req.operation == BLKIF_OP_INDIRECT ? s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments; + + /* The I/O request may be split in two. */ + if (unlikely(s->associated_id != NO_ASSOCIATED_ID)) { + struct blk_shadow *s2 = &rinfo->shadow[s->associated_id]; + + /* Keep the status of the current response in shadow. */ + s->status = blkif_rsp_to_req_status(bret->status); + + /* Wait for the second response if it is not here yet. */ + if (s2->status == REQ_WAITING) + return 0; + + bret->status = blkif_get_final_status(s->status, + s2->status); + + /* + * All the grants are stored in the first shadow in order + * to make the completion code simpler. + */ + num_grant += s2->req.u.rw.nr_segments; + + /* + * The two responses may not come in order. Only the + * first request will store the scatter-gather list. + */ + if (s2->num_sg != 0) { + /* Update "id" with the ID of the first response. */ + *id = s->associated_id; + s = s2; + } + + /* + * We don't need the second request anymore, so recycle + * it now.
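blkif_get_final_status() above reduces the two per-half statuses with a fixed precedence: error beats not-supported, which beats success. A standalone sketch of the same reduction, with illustrative names only:

#include <stdio.h>

enum req_status { REQ_DONE, REQ_ERROR, REQ_EOPNOTSUPP };

/* Error dominates, then "operation not supported", then success. */
static const char *final_status(enum req_status s1, enum req_status s2)
{
	if (s1 == REQ_ERROR || s2 == REQ_ERROR)
		return "error";
	if (s1 == REQ_EOPNOTSUPP || s2 == REQ_EOPNOTSUPP)
		return "not supported";
	return "okay";
}

int main(void)
{
	/* Both halves of a split request must succeed for the whole I/O. */
	printf("%s\n", final_status(REQ_DONE, REQ_DONE));       /* okay */
	printf("%s\n", final_status(REQ_DONE, REQ_ERROR));      /* error */
	printf("%s\n", final_status(REQ_EOPNOTSUPP, REQ_DONE)); /* not supported */
	return 0;
}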
+ */ + if (add_id_to_freelist(rinfo, s->associated_id)) + WARN(1, "%s: can't recycle the second part (id = %ld) of the request\n", + info->gd->disk_name, s->associated_id); + } + + data.s = s; num_sg = s->num_sg; if (bret->operation == BLKIF_OP_READ && info->feature_persistent) { @@ -1252,8 +1491,8 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, if (!info->feature_persistent) pr_alert_ratelimited("backed has not unmapped grant: %u\n", s->grants_used[i]->gref); - list_add(&s->grants_used[i]->node, &info->grants); - info->persistent_gnts_c++; + list_add(&s->grants_used[i]->node, &rinfo->grants); + rinfo->persistent_gnts_c++; } else { /* * If the grant is not mapped by the backend we end the @@ -1263,7 +1502,7 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, */ gnttab_end_foreign_access(s->grants_used[i]->gref, 0, 0UL); s->grants_used[i]->gref = GRANT_INVALID_REF; - list_add_tail(&s->grants_used[i]->node, &info->grants); + list_add_tail(&s->grants_used[i]->node, &rinfo->grants); } } if (s->req.operation == BLKIF_OP_INDIRECT) { @@ -1272,8 +1511,8 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, if (!info->feature_persistent) pr_alert_ratelimited("backed has not unmapped grant: %u\n", s->indirect_grants[i]->gref); - list_add(&s->indirect_grants[i]->node, &info->grants); - info->persistent_gnts_c++; + list_add(&s->indirect_grants[i]->node, &rinfo->grants); + rinfo->persistent_gnts_c++; } else { struct page *indirect_page; @@ -1284,13 +1523,15 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, */ if (!info->feature_persistent) { indirect_page = s->indirect_grants[i]->page; - list_add(&indirect_page->lru, &info->indirect_pages); + list_add(&indirect_page->lru, &rinfo->indirect_pages); } s->indirect_grants[i]->gref = GRANT_INVALID_REF; - list_add_tail(&s->indirect_grants[i]->node, &info->grants); + list_add_tail(&s->indirect_grants[i]->node, &rinfo->grants); } } } + + return 1; } static irqreturn_t blkif_interrupt(int irq, void *dev_id) @@ -1299,24 +1540,22 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) struct blkif_response *bret; RING_IDX i, rp; unsigned long flags; - struct blkfront_info *info = (struct blkfront_info *)dev_id; + struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)dev_id; + struct blkfront_info *info = rinfo->dev_info; int error; - spin_lock_irqsave(&info->io_lock, flags); - - if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) { - spin_unlock_irqrestore(&info->io_lock, flags); + if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) return IRQ_HANDLED; - } + spin_lock_irqsave(&rinfo->ring_lock, flags); again: - rp = info->ring.sring->rsp_prod; + rp = rinfo->ring.sring->rsp_prod; rmb(); /* Ensure we see queued responses up to 'rp'. */ - for (i = info->ring.rsp_cons; i != rp; i++) { + for (i = rinfo->ring.rsp_cons; i != rp; i++) { unsigned long id; - bret = RING_GET_RESPONSE(&info->ring, i); + bret = RING_GET_RESPONSE(&rinfo->ring, i); id = bret->id; /* * The backend has messed up and given us an id that we would @@ -1330,12 +1569,18 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) * the id is busted. 
*/ continue; } - req = info->shadow[id].request; + req = rinfo->shadow[id].request; - if (bret->operation != BLKIF_OP_DISCARD) - blkif_completion(&info->shadow[id], info, bret); + if (bret->operation != BLKIF_OP_DISCARD) { + /* + * We may need to wait for an extra response if the + * I/O request is split in two. + */ + if (!blkif_completion(&id, rinfo, bret)) + continue; + } - if (add_id_to_freelist(info, id)) { + if (add_id_to_freelist(rinfo, id)) { WARN(1, "%s: response to %s (id %ld) couldn't be recycled!\n", info->gd->disk_name, op_name(bret->operation), id); continue; @@ -1364,7 +1609,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) error = -EOPNOTSUPP; } if (unlikely(bret->status == BLKIF_RSP_ERROR && - info->shadow[id].req.u.rw.nr_segments == 0)) { + rinfo->shadow[id].req.u.rw.nr_segments == 0)) { printk(KERN_WARNING "blkfront: %s: empty %s op failed\n", info->gd->disk_name, op_name(bret->operation)); error = -EOPNOTSUPP; @@ -1389,34 +1634,35 @@ } } - info->ring.rsp_cons = i; + rinfo->ring.rsp_cons = i; - if (i != info->ring.req_prod_pvt) { + if (i != rinfo->ring.req_prod_pvt) { int more_to_do; - RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do); + RING_FINAL_CHECK_FOR_RESPONSES(&rinfo->ring, more_to_do); if (more_to_do) goto again; } else - info->ring.sring->rsp_event = i + 1; + rinfo->ring.sring->rsp_event = i + 1; - kick_pending_request_queues(info); + kick_pending_request_queues_locked(rinfo); - spin_unlock_irqrestore(&info->io_lock, flags); + spin_unlock_irqrestore(&rinfo->ring_lock, flags); return IRQ_HANDLED; } static int setup_blkring(struct xenbus_device *dev, - struct blkfront_info *info) + struct blkfront_ring_info *rinfo) { struct blkif_sring *sring; int err, i; + struct blkfront_info *info = rinfo->dev_info; unsigned long ring_size = info->nr_ring_pages * XEN_PAGE_SIZE; grant_ref_t gref[XENBUS_MAX_RING_GRANTS]; for (i = 0; i < info->nr_ring_pages; i++) - info->ring_ref[i] = GRANT_INVALID_REF; + rinfo->ring_ref[i] = GRANT_INVALID_REF; sring = (struct blkif_sring *)__get_free_pages(GFP_NOIO | __GFP_HIGH, get_order(ring_size)); @@ -1425,29 +1671,29 @@ static int setup_blkring(struct xenbus_device *dev, return -ENOMEM; } SHARED_RING_INIT(sring); - FRONT_RING_INIT(&info->ring, sring, ring_size); + FRONT_RING_INIT(&rinfo->ring, sring, ring_size); - err = xenbus_grant_ring(dev, info->ring.sring, info->nr_ring_pages, gref); + err = xenbus_grant_ring(dev, rinfo->ring.sring, info->nr_ring_pages, gref); if (err < 0) { free_pages((unsigned long)sring, get_order(ring_size)); - info->ring.sring = NULL; + rinfo->ring.sring = NULL; goto fail; } for (i = 0; i < info->nr_ring_pages; i++) - info->ring_ref[i] = gref[i]; + rinfo->ring_ref[i] = gref[i]; - err = xenbus_alloc_evtchn(dev, &info->evtchn); + err = xenbus_alloc_evtchn(dev, &rinfo->evtchn); if (err) goto fail; - err = bind_evtchn_to_irqhandler(info->evtchn, blkif_interrupt, 0, - "blkif", info); + err = bind_evtchn_to_irqhandler(rinfo->evtchn, blkif_interrupt, 0, + "blkif", rinfo); if (err <= 0) { xenbus_dev_fatal(dev, err, "bind_evtchn_to_irqhandler failed"); goto fail; } - info->irq = err; + rinfo->irq = err; return 0; fail: @@ -1455,6 +1701,53 @@ fail: return err; } +/* + * Write out per-ring/queue nodes including ring-ref and event-channel; each + * ring buffer may have multiple pages depending on ->nr_ring_pages.
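For a frontend negotiating, say, two queues with two ring pages each, write_per_ring_nodes() below publishes one xenstore subtree per queue. A small sketch that prints the resulting key layout (the nodename is hypothetical; the real paths come from xenbus):

#include <stdio.h>

int main(void)
{
	const char *nodename = "device/vbd/51712"; /* hypothetical */
	unsigned int nr_rings = 2, nr_ring_pages = 2, q, p;
	char path[64], key[20];

	for (q = 0; q < nr_rings; q++) {
		snprintf(path, sizeof(path), "%s/queue-%u", nodename, q);
		for (p = 0; p < nr_ring_pages; p++) {
			snprintf(key, sizeof(key), "ring-ref%u", p);
			printf("%s/%s\n", path, key);
		}
		printf("%s/event-channel\n", path);
	}
	return 0;
}

With a single queue the same keys are written directly under the device node instead, which is what keeps single-queue backends working unchanged.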
+ */ +static int write_per_ring_nodes(struct xenbus_transaction xbt, + struct blkfront_ring_info *rinfo, const char *dir) +{ + int err; + unsigned int i; + const char *message = NULL; + struct blkfront_info *info = rinfo->dev_info; + + if (info->nr_ring_pages == 1) { + err = xenbus_printf(xbt, dir, "ring-ref", "%u", rinfo->ring_ref[0]); + if (err) { + message = "writing ring-ref"; + goto abort_transaction; + } + } else { + for (i = 0; i < info->nr_ring_pages; i++) { + char ring_ref_name[RINGREF_NAME_LEN]; + + snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i); + err = xenbus_printf(xbt, dir, ring_ref_name, + "%u", rinfo->ring_ref[i]); + if (err) { + message = "writing ring-ref"; + goto abort_transaction; + } + } + } + + err = xenbus_printf(xbt, dir, "event-channel", "%u", rinfo->evtchn); + if (err) { + message = "writing event-channel"; + goto abort_transaction; + } + + return 0; + +abort_transaction: + xenbus_transaction_end(xbt, 1); + if (message) + xenbus_dev_fatal(info->xbdev, err, "%s", message); + + return err; +} /* Common code used when first setting up, and when resuming. */ static int talk_to_blkback(struct xenbus_device *dev, @@ -1462,8 +1755,8 @@ static int talk_to_blkback(struct xenbus_device *dev, { const char *message = NULL; struct xenbus_transaction xbt; - int err, i; - unsigned int max_page_order = 0; + int err; + unsigned int i, max_page_order = 0; unsigned int ring_page_order = 0; err = xenbus_scanf(XBT_NIL, info->xbdev->otherend, @@ -1475,10 +1768,14 @@ static int talk_to_blkback(struct xenbus_device *dev, info->nr_ring_pages = 1 << ring_page_order; } - /* Create shared ring, alloc event channel. */ - err = setup_blkring(dev, info); - if (err) - goto out; + for (i = 0; i < info->nr_rings; i++) { + struct blkfront_ring_info *rinfo = &info->rinfo[i]; + + /* Create shared ring, alloc event channel. 
*/ + err = setup_blkring(dev, rinfo); + if (err) + goto destroy_blkring; + } again: err = xenbus_transaction_start(&xbt); @@ -1487,38 +1784,49 @@ again: goto destroy_blkring; } - if (info->nr_ring_pages == 1) { - err = xenbus_printf(xbt, dev->nodename, - "ring-ref", "%u", info->ring_ref[0]); + if (info->nr_ring_pages > 1) { + err = xenbus_printf(xbt, dev->nodename, "ring-page-order", "%u", + ring_page_order); if (err) { - message = "writing ring-ref"; + message = "writing ring-page-order"; goto abort_transaction; } + } + + /* We already got the number of queues/rings in _probe */ + if (info->nr_rings == 1) { + err = write_per_ring_nodes(xbt, &info->rinfo[0], dev->nodename); + if (err) + goto destroy_blkring; } else { - err = xenbus_printf(xbt, dev->nodename, - "ring-page-order", "%u", ring_page_order); + char *path; + size_t pathsize; + + err = xenbus_printf(xbt, dev->nodename, "multi-queue-num-queues", "%u", + info->nr_rings); if (err) { - message = "writing ring-page-order"; + message = "writing multi-queue-num-queues"; goto abort_transaction; } - for (i = 0; i < info->nr_ring_pages; i++) { - char ring_ref_name[RINGREF_NAME_LEN]; + pathsize = strlen(dev->nodename) + QUEUE_NAME_LEN; + path = kmalloc(pathsize, GFP_KERNEL); + if (!path) { + err = -ENOMEM; + message = "ENOMEM while writing ring references"; + goto abort_transaction; + } - snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i); - err = xenbus_printf(xbt, dev->nodename, ring_ref_name, - "%u", info->ring_ref[i]); + for (i = 0; i < info->nr_rings; i++) { + memset(path, 0, pathsize); + snprintf(path, pathsize, "%s/queue-%u", dev->nodename, i); + err = write_per_ring_nodes(xbt, &info->rinfo[i], path); if (err) { - message = "writing ring-ref"; - goto abort_transaction; + kfree(path); + goto destroy_blkring; } } - } - err = xenbus_printf(xbt, dev->nodename, - "event-channel", "%u", info->evtchn); - if (err) { - message = "writing event-channel"; - goto abort_transaction; + kfree(path); } err = xenbus_printf(xbt, dev->nodename, "protocol", "%s", XEN_IO_PROTO_ABI_NATIVE); @@ -1540,9 +1848,14 @@ again: goto destroy_blkring; } - for (i = 0; i < BLK_RING_SIZE(info); i++) - info->shadow[i].req.u.rw.id = i+1; - info->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff; + for (i = 0; i < info->nr_rings; i++) { + unsigned int j; + struct blkfront_ring_info *rinfo = &info->rinfo[i]; + + for (j = 0; j < BLK_RING_SIZE(info); j++) + rinfo->shadow[j].req.u.rw.id = j + 1; + rinfo->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff; + } xenbus_switch_state(dev, XenbusStateInitialised); return 0; @@ -1553,7 +1866,10 @@ again: xenbus_dev_fatal(dev, err, "%s", message); destroy_blkring: blkif_free(info, 0); - out: + + kfree(info); + dev_set_drvdata(&dev->dev, NULL); + return err; } @@ -1567,7 +1883,9 @@ static int blkfront_probe(struct xenbus_device *dev, const struct xenbus_device_id *id) { int err, vdevice; + unsigned int r_index; struct blkfront_info *info; + unsigned int backend_max_queues = 0; /* FIXME: Use dynamic device id if this is not set. */ err = xenbus_scanf(XBT_NIL, dev->nodename, @@ -1617,15 +1935,39 @@ static int blkfront_probe(struct xenbus_device *dev, return -ENOMEM; } - mutex_init(&info->mutex); - spin_lock_init(&info->io_lock); info->xbdev = dev; + /* Check if backend supports multiple queues. 
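The probe code below negotiates the ring count as the minimum of what the backend advertises and the max_queues module parameter, clamped to at least one. Sketched on its own with hypothetical values:

#include <stdio.h>

static unsigned int min_u(unsigned int a, unsigned int b)
{
	return a < b ? a : b;
}

int main(void)
{
	/* Hypothetical negotiation: the backend advertises 8 queues,
	 * the frontend's max_queues parameter allows only 4. */
	unsigned int backend_max_queues = 8;
	unsigned int xen_blkif_max_queues = 4;
	unsigned int nr_rings = min_u(backend_max_queues, xen_blkif_max_queues);

	if (!nr_rings)          /* the backend published nothing usable */
		nr_rings = 1;   /* we always need at least one ring */

	printf("using %u ring(s)\n", nr_rings);
	return 0;
}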
*/ + err = xenbus_scanf(XBT_NIL, info->xbdev->otherend, + "multi-queue-max-queues", "%u", &backend_max_queues); + if (err < 0) + backend_max_queues = 1; + + info->nr_rings = min(backend_max_queues, xen_blkif_max_queues); + /* We need at least one ring. */ + if (!info->nr_rings) + info->nr_rings = 1; + + info->rinfo = kzalloc(sizeof(struct blkfront_ring_info) * info->nr_rings, GFP_KERNEL); + if (!info->rinfo) { + xenbus_dev_fatal(dev, -ENOMEM, "allocating ring_info structure"); + kfree(info); + return -ENOMEM; + } + + for (r_index = 0; r_index < info->nr_rings; r_index++) { + struct blkfront_ring_info *rinfo; + + rinfo = &info->rinfo[r_index]; + INIT_LIST_HEAD(&rinfo->indirect_pages); + INIT_LIST_HEAD(&rinfo->grants); + rinfo->dev_info = info; + INIT_WORK(&rinfo->work, blkif_restart_queue); + spin_lock_init(&rinfo->ring_lock); + } + + mutex_init(&info->mutex); info->vdevice = vdevice; - INIT_LIST_HEAD(&info->grants); - INIT_LIST_HEAD(&info->indirect_pages); - info->persistent_gnts_c = 0; info->connected = BLKIF_STATE_DISCONNECTED; - INIT_WORK(&info->work, blkif_restart_queue); /* Front end dir is a number, which is used as the id. */ info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0); @@ -1649,7 +1991,7 @@ static void split_bio_end(struct bio *bio) static int blkif_recover(struct blkfront_info *info) { - int i; + unsigned int i, r_index; struct request *req, *n; struct blk_shadow *copy; int rc; @@ -1660,64 +2002,73 @@ static int blkif_recover(struct blkfront_info *info) struct split_bio *split_bio; struct list_head requests; - /* Stage 1: Make a safe copy of the shadow state. */ - copy = kmemdup(info->shadow, sizeof(info->shadow), - GFP_NOIO | __GFP_REPEAT | __GFP_HIGH); - if (!copy) - return -ENOMEM; - - /* Stage 2: Set up free list. */ - memset(&info->shadow, 0, sizeof(info->shadow)); - for (i = 0; i < BLK_RING_SIZE(info); i++) - info->shadow[i].req.u.rw.id = i+1; - info->shadow_free = info->ring.req_prod_pvt; - info->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff; - - rc = blkfront_gather_backend_features(info); - if (rc) { - kfree(copy); - return rc; - } - + blkfront_gather_backend_features(info); segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST; blk_queue_max_segments(info->rq, segs); bio_list_init(&bio_list); INIT_LIST_HEAD(&requests); - for (i = 0; i < BLK_RING_SIZE(info); i++) { - /* Not in use? */ - if (!copy[i].request) - continue; - /* - * Get the bios in the request so we can re-queue them. - */ - if (copy[i].request->cmd_flags & - (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) { + for (r_index = 0; r_index < info->nr_rings; r_index++) { + struct blkfront_ring_info *rinfo; + + rinfo = &info->rinfo[r_index]; + /* Stage 1: Make a safe copy of the shadow state. */ + copy = kmemdup(rinfo->shadow, sizeof(rinfo->shadow), + GFP_NOIO | __GFP_REPEAT | __GFP_HIGH); + if (!copy) + return -ENOMEM; + + /* Stage 2: Set up free list. */ + memset(&rinfo->shadow, 0, sizeof(rinfo->shadow)); + for (i = 0; i < BLK_RING_SIZE(info); i++) + rinfo->shadow[i].req.u.rw.id = i+1; + rinfo->shadow_free = rinfo->ring.req_prod_pvt; + rinfo->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff; + + rc = blkfront_setup_indirect(rinfo); + if (rc) { + kfree(copy); + return rc; + } + + for (i = 0; i < BLK_RING_SIZE(info); i++) { + /* Not in use? */ + if (!copy[i].request) + continue; + /* - * Flush operations don't contain bios, so - * we need to requeue the whole request + * Get the bios in the request so we can re-queue them. 
*/ - list_add(&copy[i].request->queuelist, &requests); - continue; + if (copy[i].request->cmd_flags & + (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) { + /* + * Flush operations don't contain bios, so + * we need to requeue the whole request + */ + list_add(&copy[i].request->queuelist, &requests); + continue; + } + merge_bio.head = copy[i].request->bio; + merge_bio.tail = copy[i].request->biotail; + bio_list_merge(&bio_list, &merge_bio); + copy[i].request->bio = NULL; + blk_end_request_all(copy[i].request, 0); } - merge_bio.head = copy[i].request->bio; - merge_bio.tail = copy[i].request->biotail; - bio_list_merge(&bio_list, &merge_bio); - copy[i].request->bio = NULL; - blk_end_request_all(copy[i].request, 0); - } - - kfree(copy); + kfree(copy); + } xenbus_switch_state(info->xbdev, XenbusStateConnected); - spin_lock_irq(&info->io_lock); - /* Now safe for us to use the shared ring */ info->connected = BLKIF_STATE_CONNECTED; - /* Kick any other new requests queued since we resumed */ - kick_pending_request_queues(info); + for (r_index = 0; r_index < info->nr_rings; r_index++) { + struct blkfront_ring_info *rinfo; + + rinfo = &info->rinfo[r_index]; + /* Kick any other new requests queued since we resumed */ + kick_pending_request_queues(rinfo); + } list_for_each_entry_safe(req, n, &requests, queuelist) { /* Requeue pending requests (flush or discard) */ @@ -1725,7 +2076,6 @@ BUG_ON(req->nr_phys_segments > segs); blk_mq_requeue_request(req); } - spin_unlock_irq(&info->io_lock); blk_mq_kick_requeue_list(info->rq); while ((bio = bio_list_pop(&bio_list)) != NULL) { @@ -1790,8 +2140,7 @@ static int blkfront_resume(struct xenbus_device *dev) return err; } -static void -blkfront_closing(struct blkfront_info *info) +static void blkfront_closing(struct blkfront_info *info) { struct xenbus_device *xbdev = info->xbdev; struct block_device *bdev = NULL; @@ -1851,18 +2200,29 @@ static void blkfront_setup_discard(struct blkfront_info *info) info->feature_secdiscard = !!discard_secure; } -static int blkfront_setup_indirect(struct blkfront_info *info) +static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo) { unsigned int psegs, grants; int err, i; + struct blkfront_info *info = rinfo->dev_info; - if (info->max_indirect_segments == 0) - grants = BLKIF_MAX_SEGMENTS_PER_REQUEST; + if (info->max_indirect_segments == 0) { + if (!HAS_EXTRA_REQ) + grants = BLKIF_MAX_SEGMENTS_PER_REQUEST; + else { + /* + * When an extra req is required, the maximum + * number of grants supported is related to the + * size of the Linux block segment.
+ */ + grants = GRANTS_PER_PSEG; + } + } else grants = info->max_indirect_segments; psegs = grants / GRANTS_PER_PSEG; - err = fill_grant_buffer(info, + err = fill_grant_buffer(rinfo, (grants + INDIRECT_GREFS(grants)) * BLK_RING_SIZE(info)); if (err) goto out_of_memory; @@ -1875,31 +2235,31 @@ static int blkfront_setup_indirect(struct blkfront_info *info) */ int num = INDIRECT_GREFS(grants) * BLK_RING_SIZE(info); - BUG_ON(!list_empty(&info->indirect_pages)); + BUG_ON(!list_empty(&rinfo->indirect_pages)); for (i = 0; i < num; i++) { struct page *indirect_page = alloc_page(GFP_NOIO); if (!indirect_page) goto out_of_memory; - list_add(&indirect_page->lru, &info->indirect_pages); + list_add(&indirect_page->lru, &rinfo->indirect_pages); } } for (i = 0; i < BLK_RING_SIZE(info); i++) { - info->shadow[i].grants_used = kzalloc( - sizeof(info->shadow[i].grants_used[0]) * grants, + rinfo->shadow[i].grants_used = kzalloc( + sizeof(rinfo->shadow[i].grants_used[0]) * grants, GFP_NOIO); - info->shadow[i].sg = kzalloc(sizeof(info->shadow[i].sg[0]) * psegs, GFP_NOIO); + rinfo->shadow[i].sg = kzalloc(sizeof(rinfo->shadow[i].sg[0]) * psegs, GFP_NOIO); if (info->max_indirect_segments) - info->shadow[i].indirect_grants = kzalloc( - sizeof(info->shadow[i].indirect_grants[0]) * + rinfo->shadow[i].indirect_grants = kzalloc( + sizeof(rinfo->shadow[i].indirect_grants[0]) * INDIRECT_GREFS(grants), GFP_NOIO); - if ((info->shadow[i].grants_used == NULL) || - (info->shadow[i].sg == NULL) || + if ((rinfo->shadow[i].grants_used == NULL) || + (rinfo->shadow[i].sg == NULL) || (info->max_indirect_segments && - (info->shadow[i].indirect_grants == NULL))) + (rinfo->shadow[i].indirect_grants == NULL))) goto out_of_memory; - sg_init_table(info->shadow[i].sg, psegs); + sg_init_table(rinfo->shadow[i].sg, psegs); } @@ -1907,16 +2267,16 @@ static int blkfront_setup_indirect(struct blkfront_info *info) out_of_memory: for (i = 0; i < BLK_RING_SIZE(info); i++) { - kfree(info->shadow[i].grants_used); - info->shadow[i].grants_used = NULL; - kfree(info->shadow[i].sg); - info->shadow[i].sg = NULL; - kfree(info->shadow[i].indirect_grants); - info->shadow[i].indirect_grants = NULL; - } - if (!list_empty(&info->indirect_pages)) { + kfree(rinfo->shadow[i].grants_used); + rinfo->shadow[i].grants_used = NULL; + kfree(rinfo->shadow[i].sg); + rinfo->shadow[i].sg = NULL; + kfree(rinfo->shadow[i].indirect_grants); + rinfo->shadow[i].indirect_grants = NULL; + } + if (!list_empty(&rinfo->indirect_pages)) { struct page *indirect_page, *n; - list_for_each_entry_safe(indirect_page, n, &info->indirect_pages, lru) { + list_for_each_entry_safe(indirect_page, n, &rinfo->indirect_pages, lru) { list_del(&indirect_page->lru); __free_page(indirect_page); } @@ -1927,7 +2287,7 @@ out_of_memory: /* * Gather all backend feature-* */ -static int blkfront_gather_backend_features(struct blkfront_info *info) +static void blkfront_gather_backend_features(struct blkfront_info *info) { int err; int barrier, flush, discard, persistent; @@ -1982,8 +2342,6 @@ static int blkfront_gather_backend_features(struct blkfront_info *info) else info->max_indirect_segments = min(indirect_segments, xen_blkif_max_segments); - - return blkfront_setup_indirect(info); } /* @@ -1996,7 +2354,7 @@ static void blkfront_connect(struct blkfront_info *info) unsigned long sector_size; unsigned int physical_sector_size; unsigned int binfo; - int err; + int err, i; switch (info->connected) { case BLKIF_STATE_CONNECTED: @@ -2053,11 +2411,15 @@ static void blkfront_connect(struct blkfront_info *info) if 
(err != 1) physical_sector_size = sector_size; - err = blkfront_gather_backend_features(info); - if (err) { - xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s", - info->xbdev->otherend); - return; + blkfront_gather_backend_features(info); + for (i = 0; i < info->nr_rings; i++) { + err = blkfront_setup_indirect(&info->rinfo[i]); + if (err) { + xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s", + info->xbdev->otherend); + blkif_free(info, 0); + break; + } } err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size, @@ -2071,10 +2433,9 @@ static void blkfront_connect(struct blkfront_info *info) xenbus_switch_state(info->xbdev, XenbusStateConnected); /* Kick pending requests. */ - spin_lock_irq(&info->io_lock); info->connected = BLKIF_STATE_CONNECTED; - kick_pending_request_queues(info); - spin_unlock_irq(&info->io_lock); + for (i = 0; i < info->nr_rings; i++) + kick_pending_request_queues(&info->rinfo[i]); add_disk(info->gd); @@ -2095,11 +2456,8 @@ static void blkback_changed(struct xenbus_device *dev, case XenbusStateInitWait: if (dev->state != XenbusStateInitialising) break; - if (talk_to_blkback(dev, info)) { - kfree(info); - dev_set_drvdata(&dev->dev, NULL); + if (talk_to_blkback(dev, info)) break; - } case XenbusStateInitialising: case XenbusStateInitialised: case XenbusStateReconfiguring: @@ -2108,6 +2466,10 @@ static void blkback_changed(struct xenbus_device *dev, break; case XenbusStateConnected: + if (dev->state != XenbusStateInitialised) { + if (talk_to_blkback(dev, info)) + break; + } blkfront_connect(info); break; @@ -2281,6 +2643,7 @@ static struct xenbus_driver blkfront_driver = { static int __init xlblk_init(void) { int ret; + int nr_cpus = num_online_cpus(); if (!xen_domain()) return -ENODEV; @@ -2288,7 +2651,13 @@ static int __init xlblk_init(void) if (xen_blkif_max_ring_order > XENBUS_MAX_RING_GRANT_ORDER) { pr_info("Invalid max_ring_order (%d), will use default max: %d.\n", xen_blkif_max_ring_order, XENBUS_MAX_RING_GRANT_ORDER); - xen_blkif_max_ring_order = 0; + xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER; + } + + if (xen_blkif_max_queues > nr_cpus) { + pr_info("Invalid max_queues (%d), will use default max: %d.\n", + xen_blkif_max_queues, nr_cpus); + xen_blkif_max_queues = nr_cpus; } if (!xen_has_pv_disk_devices()) diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 6b1721f978c2..4f6f94c43412 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -689,7 +689,7 @@ static loff_t memory_lseek(struct file *file, loff_t offset, int orig) { loff_t ret; - mutex_lock(&file_inode(file)->i_mutex); + inode_lock(file_inode(file)); switch (orig) { case SEEK_CUR: offset += file->f_pos; @@ -706,7 +706,7 @@ static loff_t memory_lseek(struct file *file, loff_t offset, int orig) default: ret = -EINVAL; } - mutex_unlock(&file_inode(file)->i_mutex); + inode_unlock(file_inode(file)); return ret; } diff --git a/drivers/char/mspec.c b/drivers/char/mspec.c index f1d7fa45c275..f3f92d5fcda0 100644 --- a/drivers/char/mspec.c +++ b/drivers/char/mspec.c @@ -93,14 +93,11 @@ struct vma_data { spinlock_t lock; /* Serialize access to this structure. */ int count; /* Number of pages allocated. */ enum mspec_page_type type; /* Type of pages allocated. */ - int flags; /* See VMD_xxx below. */ unsigned long vm_start; /* Original (unsplit) base. */ unsigned long vm_end; /* Original (unsplit) end. */ unsigned long maddr[0]; /* Array of MSPEC addresses. 
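The mspec and drm_hashtab hunks below drop their hand-rolled "was it vmalloc'd?" bookkeeping because kvfree() frees either kind of allocation. A kernel-style sketch of the idiom they adopt (illustrative only, not a standalone program):

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

/* Small tables come from the slab, big ones from vmalloc space. */
static void *alloc_table(size_t size)
{
	if (size <= PAGE_SIZE)
		return kzalloc(size, GFP_KERNEL);
	return vzalloc(size);
}

/* kvfree() inspects the address and picks kfree() or vfree(), so the
 * caller no longer needs a flag recording which allocator was used. */
static void free_table(void *table)
{
	kvfree(table);
}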
*/ }; -#define VMD_VMALLOCED 0x1 /* vmalloc'd rather than kmalloc'd */ - /* used on shub2 to clear FOP cache in the HUB */ static unsigned long scratch_page[MAX_NUMNODES]; #define SH2_AMO_CACHE_ENTRIES 4 @@ -185,10 +182,7 @@ mspec_close(struct vm_area_struct *vma) "failed to zero page %ld\n", my_page); } - if (vdata->flags & VMD_VMALLOCED) - vfree(vdata); - else - kfree(vdata); + kvfree(vdata); } /* @@ -256,7 +250,7 @@ mspec_mmap(struct file *file, struct vm_area_struct *vma, enum mspec_page_type type) { struct vma_data *vdata; - int pages, vdata_size, flags = 0; + int pages, vdata_size; if (vma->vm_pgoff != 0) return -EINVAL; @@ -271,16 +265,13 @@ mspec_mmap(struct file *file, struct vm_area_struct *vma, vdata_size = sizeof(struct vma_data) + pages * sizeof(long); if (vdata_size <= PAGE_SIZE) vdata = kzalloc(vdata_size, GFP_KERNEL); - else { + else vdata = vzalloc(vdata_size); - flags = VMD_VMALLOCED; - } if (!vdata) return -ENOMEM; vdata->vm_start = vma->vm_start; vdata->vm_end = vma->vm_end; - vdata->flags = flags; vdata->type = type; spin_lock_init(&vdata->lock); atomic_set(&vdata->refcnt, 1); diff --git a/drivers/char/ps3flash.c b/drivers/char/ps3flash.c index 0b311fa277ef..b526dc15c271 100644 --- a/drivers/char/ps3flash.c +++ b/drivers/char/ps3flash.c @@ -290,9 +290,9 @@ static int ps3flash_fsync(struct file *file, loff_t start, loff_t end, int datas { struct inode *inode = file_inode(file); int err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); err = ps3flash_writeback(ps3flash_dev); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig index 3dd69df9c970..07d494276aad 100644 --- a/drivers/crypto/Kconfig +++ b/drivers/crypto/Kconfig @@ -381,6 +381,7 @@ config CRYPTO_DEV_BFIN_CRC config CRYPTO_DEV_ATMEL_AES tristate "Support for Atmel AES hw accelerator" + depends on HAS_DMA depends on AT_XDMAC || AT_HDMAC || COMPILE_TEST select CRYPTO_AES select CRYPTO_AEAD diff --git a/drivers/crypto/atmel-aes.c b/drivers/crypto/atmel-aes.c index 5621612ee921..6dd3317ca365 100644 --- a/drivers/crypto/atmel-aes.c +++ b/drivers/crypto/atmel-aes.c @@ -280,6 +280,7 @@ static const char *atmel_aes_reg_name(u32 offset, char *tmp, size_t sz) case AES_GCMHR(2): case AES_GCMHR(3): snprintf(tmp, sz, "GCMHR[%u]", (offset - AES_GCMHR(0)) >> 2); + break; default: snprintf(tmp, sz, "0x%02x", offset); diff --git a/drivers/crypto/qat/qat_common/qat_hal.c b/drivers/crypto/qat/qat_common/qat_hal.c index 0ac0ba867611..1e480f140663 100644 --- a/drivers/crypto/qat/qat_common/qat_hal.c +++ b/drivers/crypto/qat/qat_common/qat_hal.c @@ -389,7 +389,7 @@ static int qat_hal_check_ae_alive(struct icp_qat_fw_loader_handle *handle) { unsigned int base_cnt, cur_cnt; unsigned char ae; - unsigned int times = MAX_RETRY_TIMES; + int times = MAX_RETRY_TIMES; for (ae = 0; ae < handle->hal_handle->ae_max_num; ae++) { qat_hal_rd_ae_csr(handle, ae, PROFILE_COUNT, @@ -402,7 +402,7 @@ static int qat_hal_check_ae_alive(struct icp_qat_fw_loader_handle *handle) cur_cnt &= 0xffff; } while (times-- && (cur_cnt == base_cnt)); - if (!times) { + if (times < 0) { pr_err("QAT: AE%d is inactive!!\n", ae); return -EFAULT; } @@ -453,7 +453,11 @@ static int qat_hal_init_esram(struct icp_qat_fw_loader_handle *handle) void __iomem *csr_addr = (void __iomem *)((uintptr_t)handle->hal_ep_csr_addr_v + ESRAM_AUTO_INIT_CSR_OFFSET); - unsigned int csr_val, times = 30; + unsigned int csr_val; + int times = 30; + + if (handle->pci_dev->device == ADF_C3XXX_PCI_DEVICE_ID) + return 0; 
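The qat_hal hunks around here change the retry counters from unsigned to signed and test `times < 0`. The reason is that the counter is post-decremented inside the loop condition, so it ends at -1 on exhaustion and at 0 on a success during the final try; the old `!times` test got both cases wrong, and an unsigned counter wraps to UINT_MAX instead of going negative. A small demonstration with illustrative values:

#include <stdio.h>

int main(void)
{
	int times, tries, done;

	/* Case 1: the condition never comes true. The post-decrement in
	 * the loop test leaves the signed counter at -1 on exhaustion. */
	times = 3;
	done = 0;
	do {
		/* stand-in for polling some hardware register */
	} while (times-- && !done);
	printf("exhausted: %s (times = %d)\n", times < 0 ? "yes" : "no", times);

	/* Case 2: success exactly on the last allowed try leaves times at
	 * 0, so the old "!times" test reported a bogus timeout here. */
	times = 3;
	for (tries = 0; ; tries++) {
		done = (tries == 2); /* succeeds on the third try */
		if (!(times-- && !done))
			break;
	}
	printf("old !times says timeout: %d, new times < 0 says: %d\n",
	       !times, times < 0);
	return 0;
}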
csr_val = ADF_CSR_RD(csr_addr, 0); if ((csr_val & ESRAM_AUTO_TINIT) && (csr_val & ESRAM_AUTO_TINIT_DONE)) @@ -467,7 +471,7 @@ static int qat_hal_init_esram(struct icp_qat_fw_loader_handle *handle) qat_hal_wait_cycles(handle, 0, ESRAM_AUTO_INIT_USED_CYCLES, 0); csr_val = ADF_CSR_RD(csr_addr, 0); } while (!(csr_val & ESRAM_AUTO_TINIT_DONE) && times--); - if ((!times)) { + if ((times < 0)) { pr_err("QAT: Fail to init eSram!\n"); return -EFAULT; } @@ -658,7 +662,7 @@ static int qat_hal_clear_gpr(struct icp_qat_fw_loader_handle *handle) ret = qat_hal_wait_cycles(handle, ae, 20, 1); } while (ret && times--); - if (!times) { + if (times < 0) { pr_err("QAT: clear GPR of AE %d failed", ae); return -EINVAL; } @@ -693,14 +697,12 @@ int qat_hal_init(struct adf_accel_dev *accel_dev) struct adf_hw_device_data *hw_data = accel_dev->hw_device; struct adf_bar *misc_bar = &pci_info->pci_bars[hw_data->get_misc_bar_id(hw_data)]; - struct adf_bar *sram_bar = - &pci_info->pci_bars[hw_data->get_sram_bar_id(hw_data)]; + struct adf_bar *sram_bar; handle = kzalloc(sizeof(*handle), GFP_KERNEL); if (!handle) return -ENOMEM; - handle->hal_sram_addr_v = sram_bar->virt_addr; handle->hal_cap_g_ctl_csr_addr_v = (void __iomem *)((uintptr_t)misc_bar->virt_addr + ICP_QAT_CAP_OFFSET); @@ -714,6 +716,11 @@ int qat_hal_init(struct adf_accel_dev *accel_dev) (void __iomem *)((uintptr_t)handle->hal_cap_ae_xfer_csr_addr_v + LOCAL_TO_XFER_REG_OFFSET); handle->pci_dev = pci_info->pci_dev; + if (handle->pci_dev->device != ADF_C3XXX_PCI_DEVICE_ID) { + sram_bar = + &pci_info->pci_bars[hw_data->get_sram_bar_id(hw_data)]; + handle->hal_sram_addr_v = sram_bar->virt_addr; + } handle->fw_auth = (handle->pci_dev->device == ADF_DH895XCC_PCI_DEVICE_ID) ? false : true; handle->hal_handle = kzalloc(sizeof(*handle->hal_handle), GFP_KERNEL); diff --git a/drivers/gpu/drm/drm_hashtab.c b/drivers/gpu/drm/drm_hashtab.c index c3b80fd65d62..7b30b307674b 100644 --- a/drivers/gpu/drm/drm_hashtab.c +++ b/drivers/gpu/drm/drm_hashtab.c @@ -198,10 +198,7 @@ EXPORT_SYMBOL(drm_ht_remove_item); void drm_ht_remove(struct drm_open_hash *ht) { if (ht->table) { - if ((PAGE_SIZE / sizeof(*ht->table)) >> ht->order) - kfree(ht->table); - else - vfree(ht->table); + kvfree(ht->table); ht->table = NULL; } } diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index aa26f3c3416b..8a8440c0eed1 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -5,6 +5,7 @@ menuconfig INFINIBAND depends on NET depends on INET depends on m || IPV6 != m + select IRQ_POLL ---help--- Core support for InfiniBand (IB). Make sure to also select any protocols you wish to use as well as drivers for your @@ -54,6 +55,15 @@ config INFINIBAND_ADDR_TRANS depends on INFINIBAND default y +config INFINIBAND_ADDR_TRANS_CONFIGFS + bool + depends on INFINIBAND_ADDR_TRANS && CONFIGFS_FS && !(INFINIBAND=y && CONFIGFS_FS=m) + default y + ---help--- + ConfigFS support for RDMA communication manager (CM). + This allows the user to configure the default GID type that the CM + uses for each device when initiating new connections. +
+ source "drivers/infiniband/hw/mthca/Kconfig" source "drivers/infiniband/hw/qib/Kconfig" source "drivers/infiniband/hw/cxgb3/Kconfig" diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index d43a8994ac5c..f818538a7f4e 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -8,7 +8,7 @@ obj-$(CONFIG_INFINIBAND_USER_MAD) += ib_umad.o obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o \ $(user_access-y) -ib_core-y := packer.o ud_header.o verbs.o sysfs.o \ +ib_core-y := packer.o ud_header.o verbs.o cq.o sysfs.o \ device.o fmr_pool.o cache.o netlink.o \ roce_gid_mgmt.o ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o @@ -24,6 +24,8 @@ iw_cm-y := iwcm.o iwpm_util.o iwpm_msg.o rdma_cm-y := cma.o +rdma_cm-$(CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS) += cma_configfs.o + rdma_ucm-y := ucma.o ib_addr-y := addr.o diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 34b1adad07aa..337353d86cfa 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -121,7 +121,8 @@ int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev, } EXPORT_SYMBOL(rdma_copy_addr); -int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr, +int rdma_translate_ip(const struct sockaddr *addr, + struct rdma_dev_addr *dev_addr, u16 *vlan_id) { struct net_device *dev; @@ -139,7 +140,7 @@ int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr, switch (addr->sa_family) { case AF_INET: dev = ip_dev_find(dev_addr->net, - ((struct sockaddr_in *) addr)->sin_addr.s_addr); + ((const struct sockaddr_in *)addr)->sin_addr.s_addr); if (!dev) return ret; @@ -154,7 +155,7 @@ int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr, rcu_read_lock(); for_each_netdev_rcu(dev_addr->net, dev) { if (ipv6_chk_addr(dev_addr->net, - &((struct sockaddr_in6 *) addr)->sin6_addr, + &((const struct sockaddr_in6 *)addr)->sin6_addr, dev, 1)) { ret = rdma_copy_addr(dev_addr, dev, NULL); if (vlan_id) @@ -198,7 +199,8 @@ static void queue_req(struct addr_req *req) mutex_unlock(&lock); } -static int dst_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr, void *daddr) +static int dst_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr, + const void *daddr) { struct neighbour *n; int ret; @@ -222,8 +224,9 @@ static int dst_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr, v } static int addr4_resolve(struct sockaddr_in *src_in, - struct sockaddr_in *dst_in, - struct rdma_dev_addr *addr) + const struct sockaddr_in *dst_in, + struct rdma_dev_addr *addr, + struct rtable **prt) { __be32 src_ip = src_in->sin_addr.s_addr; __be32 dst_ip = dst_in->sin_addr.s_addr; @@ -243,33 +246,29 @@ static int addr4_resolve(struct sockaddr_in *src_in, src_in->sin_family = AF_INET; src_in->sin_addr.s_addr = fl4.saddr; - if (rt->dst.dev->flags & IFF_LOOPBACK) { - ret = rdma_translate_ip((struct sockaddr *)dst_in, addr, NULL); - if (!ret) - memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN); - goto put; - } + /* If there's a gateway, we're definitely in RoCE v2 (as RoCE v1 isn't + * routable) and we could set the network type accordingly. 
+ */ + if (rt->rt_uses_gateway) + addr->network = RDMA_NETWORK_IPV4; - /* If the device does ARP internally, return 'done' */ - if (rt->dst.dev->flags & IFF_NOARP) { - ret = rdma_copy_addr(addr, rt->dst.dev, NULL); - goto put; - } + addr->hoplimit = ip4_dst_hoplimit(&rt->dst); - ret = dst_fetch_ha(&rt->dst, addr, &fl4.daddr); -put: - ip_rt_put(rt); + *prt = rt; + return 0; out: return ret; } #if IS_ENABLED(CONFIG_IPV6) static int addr6_resolve(struct sockaddr_in6 *src_in, - struct sockaddr_in6 *dst_in, - struct rdma_dev_addr *addr) + const struct sockaddr_in6 *dst_in, + struct rdma_dev_addr *addr, + struct dst_entry **pdst) { struct flowi6 fl6; struct dst_entry *dst; + struct rt6_info *rt; int ret; memset(&fl6, 0, sizeof fl6); @@ -281,6 +280,7 @@ static int addr6_resolve(struct sockaddr_in6 *src_in, if ((ret = dst->error)) goto put; + rt = (struct rt6_info *)dst; if (ipv6_addr_any(&fl6.saddr)) { ret = ipv6_dev_get_saddr(addr->net, ip6_dst_idev(dst)->dev, &fl6.daddr, 0, &fl6.saddr); @@ -291,43 +291,111 @@ static int addr6_resolve(struct sockaddr_in6 *src_in, src_in->sin6_addr = fl6.saddr; } - if (dst->dev->flags & IFF_LOOPBACK) { - ret = rdma_translate_ip((struct sockaddr *)dst_in, addr, NULL); - if (!ret) - memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN); - goto put; - } + /* If there's a gateway, we're definitely in RoCE v2 (as RoCE v1 isn't + * routable) and we could set the network type accordingly. + */ + if (rt->rt6i_flags & RTF_GATEWAY) + addr->network = RDMA_NETWORK_IPV6; - /* If the device does ARP internally, return 'done' */ - if (dst->dev->flags & IFF_NOARP) { - ret = rdma_copy_addr(addr, dst->dev, NULL); - goto put; - } + addr->hoplimit = ip6_dst_hoplimit(dst); - ret = dst_fetch_ha(dst, addr, &fl6.daddr); + *pdst = dst; + return 0; put: dst_release(dst); return ret; } #else static int addr6_resolve(struct sockaddr_in6 *src_in, - struct sockaddr_in6 *dst_in, - struct rdma_dev_addr *addr) + const struct sockaddr_in6 *dst_in, + struct rdma_dev_addr *addr, + struct dst_entry **pdst) { return -EADDRNOTAVAIL; } #endif +static int addr_resolve_neigh(struct dst_entry *dst, + const struct sockaddr *dst_in, + struct rdma_dev_addr *addr) +{ + if (dst->dev->flags & IFF_LOOPBACK) { + int ret; + + ret = rdma_translate_ip(dst_in, addr, NULL); + if (!ret) + memcpy(addr->dst_dev_addr, addr->src_dev_addr, + MAX_ADDR_LEN); + + return ret; + } + + /* If the device doesn't do ARP internally */ + if (!(dst->dev->flags & IFF_NOARP)) { + const struct sockaddr_in *dst_in4 = + (const struct sockaddr_in *)dst_in; + const struct sockaddr_in6 *dst_in6 = + (const struct sockaddr_in6 *)dst_in; + + return dst_fetch_ha(dst, addr, + dst_in->sa_family == AF_INET ? 
+ (const void *)&dst_in4->sin_addr.s_addr : + (const void *)&dst_in6->sin6_addr); + } + + return rdma_copy_addr(addr, dst->dev, NULL); +} + static int addr_resolve(struct sockaddr *src_in, - struct sockaddr *dst_in, - struct rdma_dev_addr *addr) + const struct sockaddr *dst_in, + struct rdma_dev_addr *addr, + bool resolve_neigh) { + struct net_device *ndev; + struct dst_entry *dst; + int ret; + if (src_in->sa_family == AF_INET) { - return addr4_resolve((struct sockaddr_in *) src_in, - (struct sockaddr_in *) dst_in, addr); - } else - return addr6_resolve((struct sockaddr_in6 *) src_in, - (struct sockaddr_in6 *) dst_in, addr); + struct rtable *rt = NULL; + const struct sockaddr_in *dst_in4 = + (const struct sockaddr_in *)dst_in; + + ret = addr4_resolve((struct sockaddr_in *)src_in, + dst_in4, addr, &rt); + if (ret) + return ret; + + if (resolve_neigh) + ret = addr_resolve_neigh(&rt->dst, dst_in, addr); + + ndev = rt->dst.dev; + dev_hold(ndev); + + ip_rt_put(rt); + } else { + const struct sockaddr_in6 *dst_in6 = + (const struct sockaddr_in6 *)dst_in; + + ret = addr6_resolve((struct sockaddr_in6 *)src_in, + dst_in6, addr, + &dst); + if (ret) + return ret; + + if (resolve_neigh) + ret = addr_resolve_neigh(dst, dst_in, addr); + + ndev = dst->dev; + dev_hold(ndev); + + dst_release(dst); + } + + addr->bound_dev_if = ndev->ifindex; + addr->net = dev_net(ndev); + dev_put(ndev); + + return ret; } static void process_req(struct work_struct *work) @@ -343,7 +411,8 @@ static void process_req(struct work_struct *work) if (req->status == -ENODATA) { src_in = (struct sockaddr *) &req->src_addr; dst_in = (struct sockaddr *) &req->dst_addr; - req->status = addr_resolve(src_in, dst_in, req->addr); + req->status = addr_resolve(src_in, dst_in, req->addr, + true); if (req->status && time_after_eq(jiffies, req->timeout)) req->status = -ETIMEDOUT; else if (req->status == -ENODATA) @@ -403,7 +472,7 @@ int rdma_resolve_ip(struct rdma_addr_client *client, req->client = client; atomic_inc(&client->refcount); - req->status = addr_resolve(src_in, dst_in, addr); + req->status = addr_resolve(src_in, dst_in, addr, true); switch (req->status) { case 0: req->timeout = jiffies; @@ -425,6 +494,26 @@ err: } EXPORT_SYMBOL(rdma_resolve_ip); +int rdma_resolve_ip_route(struct sockaddr *src_addr, + const struct sockaddr *dst_addr, + struct rdma_dev_addr *addr) +{ + struct sockaddr_storage ssrc_addr = {}; + struct sockaddr *src_in = (struct sockaddr *)&ssrc_addr; + + if (src_addr) { + if (src_addr->sa_family != dst_addr->sa_family) + return -EINVAL; + + memcpy(src_in, src_addr, rdma_addr_size(src_addr)); + } else { + src_in->sa_family = dst_addr->sa_family; + } + + return addr_resolve(src_in, dst_addr, addr, false); +} +EXPORT_SYMBOL(rdma_resolve_ip_route); + void rdma_addr_cancel(struct rdma_dev_addr *addr) { struct addr_req *req, *temp_req; @@ -456,8 +545,10 @@ static void resolve_cb(int status, struct sockaddr *src_addr, complete(&((struct resolve_cb_context *)context)->comp); } -int rdma_addr_find_dmac_by_grh(const union ib_gid *sgid, const union ib_gid *dgid, - u8 *dmac, u16 *vlan_id, int if_index) +int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, + const union ib_gid *dgid, + u8 *dmac, u16 *vlan_id, int *if_index, + int *hoplimit) { int ret = 0; struct rdma_dev_addr dev_addr; @@ -475,7 +566,8 @@ int rdma_addr_find_dmac_by_grh(const union ib_gid *sgid, const union ib_gid *dgi rdma_gid2ip(&dgid_addr._sockaddr, dgid); memset(&dev_addr, 0, sizeof(dev_addr)); - dev_addr.bound_dev_if = if_index; + if (if_index) + 
dev_addr.bound_dev_if = *if_index; dev_addr.net = &init_net; ctx.addr = &dev_addr; @@ -491,12 +583,16 @@ int rdma_addr_find_dmac_by_grh(const union ib_gid *sgid, const union ib_gid *dgi dev = dev_get_by_index(&init_net, dev_addr.bound_dev_if); if (!dev) return -ENODEV; + if (if_index) + *if_index = dev_addr.bound_dev_if; if (vlan_id) *vlan_id = rdma_vlan_dev_vlan_id(dev); + if (hoplimit) + *hoplimit = dev_addr.hoplimit; dev_put(dev); return ret; } -EXPORT_SYMBOL(rdma_addr_find_dmac_by_grh); +EXPORT_SYMBOL(rdma_addr_find_l2_eth_by_grh); int rdma_addr_find_smac_by_sgid(union ib_gid *sgid, u8 *smac, u16 *vlan_id) { diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 89bebeada38b..53343ffbff7a 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -64,6 +64,7 @@ enum gid_attr_find_mask { GID_ATTR_FIND_MASK_GID = 1UL << 0, GID_ATTR_FIND_MASK_NETDEV = 1UL << 1, GID_ATTR_FIND_MASK_DEFAULT = 1UL << 2, + GID_ATTR_FIND_MASK_GID_TYPE = 1UL << 3, }; enum gid_table_entry_props { @@ -81,10 +82,6 @@ enum gid_table_write_action { }; struct ib_gid_table_entry { - /* This lock protects an entry from being - * read and written simultaneously. - */ - rwlock_t lock; unsigned long props; union ib_gid gid; struct ib_gid_attr attr; @@ -109,28 +106,86 @@ struct ib_gid_table { * are locked by this lock. **/ struct mutex lock; + /* This lock protects the table entries from being + * read and written simultaneously. + */ + rwlock_t rwlock; struct ib_gid_table_entry *data_vec; }; +static void dispatch_gid_change_event(struct ib_device *ib_dev, u8 port) +{ + if (rdma_cap_roce_gid_table(ib_dev, port)) { + struct ib_event event; + + event.device = ib_dev; + event.element.port_num = port; + event.event = IB_EVENT_GID_CHANGE; + + ib_dispatch_event(&event); + } +} + +static const char * const gid_type_str[] = { + [IB_GID_TYPE_IB] = "IB/RoCE v1", + [IB_GID_TYPE_ROCE_UDP_ENCAP] = "RoCE v2", +}; + +const char *ib_cache_gid_type_str(enum ib_gid_type gid_type) +{ + if (gid_type < ARRAY_SIZE(gid_type_str) && gid_type_str[gid_type]) + return gid_type_str[gid_type]; + + return "Invalid GID type"; +} +EXPORT_SYMBOL(ib_cache_gid_type_str); + +int ib_cache_gid_parse_type_str(const char *buf) +{ + unsigned int i; + size_t len; + int err = -EINVAL; + + len = strlen(buf); + if (len == 0) + return -EINVAL; + + if (buf[len - 1] == '\n') + len--; + + for (i = 0; i < ARRAY_SIZE(gid_type_str); ++i) + if (gid_type_str[i] && !strncmp(buf, gid_type_str[i], len) && + len == strlen(gid_type_str[i])) { + err = i; + break; + } + + return err; +} +EXPORT_SYMBOL(ib_cache_gid_parse_type_str); + +/* This function expects that rwlock will be write locked in all + * scenarios and that lock will be locked in sleep-able (RoCE) + * scenarios. + */ static int write_gid(struct ib_device *ib_dev, u8 port, struct ib_gid_table *table, int ix, const union ib_gid *gid, const struct ib_gid_attr *attr, enum gid_table_write_action action, bool default_gid) + __releases(&table->rwlock) __acquires(&table->rwlock) { int ret = 0; struct net_device *old_net_dev; - unsigned long flags; /* in rdma_cap_roce_gid_table, this function should be protected by a * sleep-able lock.
*/ - write_lock_irqsave(&table->data_vec[ix].lock, flags); if (rdma_cap_roce_gid_table(ib_dev, port)) { table->data_vec[ix].props |= GID_TABLE_ENTRY_INVALID; - write_unlock_irqrestore(&table->data_vec[ix].lock, flags); + write_unlock_irq(&table->rwlock); /* GID_TABLE_WRITE_ACTION_MODIFY currently isn't supported by * RoCE providers and thus only updates the cache. */ @@ -140,7 +195,7 @@ static int write_gid(struct ib_device *ib_dev, u8 port, else if (action == GID_TABLE_WRITE_ACTION_DEL) ret = ib_dev->del_gid(ib_dev, port, ix, &table->data_vec[ix].context); - write_lock_irqsave(&table->data_vec[ix].lock, flags); + write_lock_irq(&table->rwlock); } old_net_dev = table->data_vec[ix].attr.ndev; @@ -162,17 +217,6 @@ static int write_gid(struct ib_device *ib_dev, u8 port, table->data_vec[ix].props &= ~GID_TABLE_ENTRY_INVALID; - write_unlock_irqrestore(&table->data_vec[ix].lock, flags); - - if (!ret && rdma_cap_roce_gid_table(ib_dev, port)) { - struct ib_event event; - - event.device = ib_dev; - event.element.port_num = port; - event.event = IB_EVENT_GID_CHANGE; - - ib_dispatch_event(&event); - } return ret; } @@ -201,41 +245,58 @@ static int del_gid(struct ib_device *ib_dev, u8 port, GID_TABLE_WRITE_ACTION_DEL, default_gid); } +/* rwlock should be read locked */ static int find_gid(struct ib_gid_table *table, const union ib_gid *gid, const struct ib_gid_attr *val, bool default_gid, - unsigned long mask) + unsigned long mask, int *pempty) { - int i; + int i = 0; + int found = -1; + int empty = pempty ? -1 : 0; - for (i = 0; i < table->sz; i++) { - unsigned long flags; - struct ib_gid_attr *attr = &table->data_vec[i].attr; + while (i < table->sz && (found < 0 || empty < 0)) { + struct ib_gid_table_entry *data = &table->data_vec[i]; + struct ib_gid_attr *attr = &data->attr; + int curr_index = i; - read_lock_irqsave(&table->data_vec[i].lock, flags); + i++; - if (table->data_vec[i].props & GID_TABLE_ENTRY_INVALID) - goto next; + if (data->props & GID_TABLE_ENTRY_INVALID) + continue; + + if (empty < 0) + if (!memcmp(&data->gid, &zgid, sizeof(*gid)) && + !memcmp(attr, &zattr, sizeof(*attr)) && + !data->props) + empty = curr_index; + + if (found >= 0) + continue; + + if (mask & GID_ATTR_FIND_MASK_GID_TYPE && + attr->gid_type != val->gid_type) + continue; if (mask & GID_ATTR_FIND_MASK_GID && - memcmp(gid, &table->data_vec[i].gid, sizeof(*gid))) - goto next; + memcmp(gid, &data->gid, sizeof(*gid))) + continue; if (mask & GID_ATTR_FIND_MASK_NETDEV && attr->ndev != val->ndev) - goto next; + continue; if (mask & GID_ATTR_FIND_MASK_DEFAULT && - !!(table->data_vec[i].props & GID_TABLE_ENTRY_DEFAULT) != + !!(data->props & GID_TABLE_ENTRY_DEFAULT) != default_gid) - goto next; + continue; - read_unlock_irqrestore(&table->data_vec[i].lock, flags); - return i; -next: - read_unlock_irqrestore(&table->data_vec[i].lock, flags); + found = curr_index; } - return -1; + if (pempty) + *pempty = empty; + + return found; } static void make_default_gid(struct net_device *dev, union ib_gid *gid) @@ -252,6 +313,7 @@ int ib_cache_gid_add(struct ib_device *ib_dev, u8 port, int ix; int ret = 0; struct net_device *idev; + int empty; table = ports_table[port - rdma_start_port(ib_dev)]; @@ -275,22 +337,25 @@ int ib_cache_gid_add(struct ib_device *ib_dev, u8 port, } mutex_lock(&table->lock); + write_lock_irq(&table->rwlock); ix = find_gid(table, gid, attr, false, GID_ATTR_FIND_MASK_GID | - GID_ATTR_FIND_MASK_NETDEV); + GID_ATTR_FIND_MASK_GID_TYPE | + GID_ATTR_FIND_MASK_NETDEV, &empty); if (ix >= 0) goto out_unlock; - ix = 
find_gid(table, &zgid, NULL, false, GID_ATTR_FIND_MASK_GID | - GID_ATTR_FIND_MASK_DEFAULT); - if (ix < 0) { + if (empty < 0) { ret = -ENOSPC; goto out_unlock; } - add_gid(ib_dev, port, table, ix, gid, attr, false); + ret = add_gid(ib_dev, port, table, empty, gid, attr, false); + if (!ret) + dispatch_gid_change_event(ib_dev, port); out_unlock: + write_unlock_irq(&table->rwlock); mutex_unlock(&table->lock); return ret; } @@ -305,17 +370,22 @@ int ib_cache_gid_del(struct ib_device *ib_dev, u8 port, table = ports_table[port - rdma_start_port(ib_dev)]; mutex_lock(&table->lock); + write_lock_irq(&table->rwlock); ix = find_gid(table, gid, attr, false, GID_ATTR_FIND_MASK_GID | + GID_ATTR_FIND_MASK_GID_TYPE | GID_ATTR_FIND_MASK_NETDEV | - GID_ATTR_FIND_MASK_DEFAULT); + GID_ATTR_FIND_MASK_DEFAULT, + NULL); if (ix < 0) goto out_unlock; - del_gid(ib_dev, port, table, ix, false); + if (!del_gid(ib_dev, port, table, ix, false)) + dispatch_gid_change_event(ib_dev, port); out_unlock: + write_unlock_irq(&table->rwlock); mutex_unlock(&table->lock); return 0; } @@ -326,16 +396,24 @@ int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port, struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; struct ib_gid_table *table; int ix; + bool deleted = false; table = ports_table[port - rdma_start_port(ib_dev)]; mutex_lock(&table->lock); + write_lock_irq(&table->rwlock); for (ix = 0; ix < table->sz; ix++) if (table->data_vec[ix].attr.ndev == ndev) - del_gid(ib_dev, port, table, ix, false); + if (!del_gid(ib_dev, port, table, ix, false)) + deleted = true; + write_unlock_irq(&table->rwlock); mutex_unlock(&table->lock); + + if (deleted) + dispatch_gid_change_event(ib_dev, port); + return 0; } @@ -344,18 +422,14 @@ static int __ib_cache_gid_get(struct ib_device *ib_dev, u8 port, int index, { struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; struct ib_gid_table *table; - unsigned long flags; table = ports_table[port - rdma_start_port(ib_dev)]; if (index < 0 || index >= table->sz) return -EINVAL; - read_lock_irqsave(&table->data_vec[index].lock, flags); - if (table->data_vec[index].props & GID_TABLE_ENTRY_INVALID) { - read_unlock_irqrestore(&table->data_vec[index].lock, flags); + if (table->data_vec[index].props & GID_TABLE_ENTRY_INVALID) return -EAGAIN; - } memcpy(gid, &table->data_vec[index].gid, sizeof(*gid)); if (attr) { @@ -364,7 +438,6 @@ static int __ib_cache_gid_get(struct ib_device *ib_dev, u8 port, int index, dev_hold(attr->ndev); } - read_unlock_irqrestore(&table->data_vec[index].lock, flags); return 0; } @@ -378,17 +451,21 @@ static int _ib_cache_gid_table_find(struct ib_device *ib_dev, struct ib_gid_table *table; u8 p; int local_index; + unsigned long flags; for (p = 0; p < ib_dev->phys_port_cnt; p++) { table = ports_table[p]; - local_index = find_gid(table, gid, val, false, mask); + read_lock_irqsave(&table->rwlock, flags); + local_index = find_gid(table, gid, val, false, mask, NULL); if (local_index >= 0) { if (index) *index = local_index; if (port) *port = p + rdma_start_port(ib_dev); + read_unlock_irqrestore(&table->rwlock, flags); return 0; } + read_unlock_irqrestore(&table->rwlock, flags); } return -ENOENT; @@ -396,11 +473,13 @@ static int _ib_cache_gid_table_find(struct ib_device *ib_dev, static int ib_cache_gid_find(struct ib_device *ib_dev, const union ib_gid *gid, + enum ib_gid_type gid_type, struct net_device *ndev, u8 *port, u16 *index) { - unsigned long mask = GID_ATTR_FIND_MASK_GID; - struct ib_gid_attr gid_attr_val = {.ndev = ndev}; + unsigned long mask = 
GID_ATTR_FIND_MASK_GID | + GID_ATTR_FIND_MASK_GID_TYPE; + struct ib_gid_attr gid_attr_val = {.ndev = ndev, .gid_type = gid_type}; if (ndev) mask |= GID_ATTR_FIND_MASK_NETDEV; @@ -411,14 +490,17 @@ static int ib_cache_gid_find(struct ib_device *ib_dev, int ib_find_cached_gid_by_port(struct ib_device *ib_dev, const union ib_gid *gid, + enum ib_gid_type gid_type, u8 port, struct net_device *ndev, u16 *index) { int local_index; struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; struct ib_gid_table *table; - unsigned long mask = GID_ATTR_FIND_MASK_GID; - struct ib_gid_attr val = {.ndev = ndev}; + unsigned long mask = GID_ATTR_FIND_MASK_GID | + GID_ATTR_FIND_MASK_GID_TYPE; + struct ib_gid_attr val = {.ndev = ndev, .gid_type = gid_type}; + unsigned long flags; if (port < rdma_start_port(ib_dev) || port > rdma_end_port(ib_dev)) @@ -429,13 +511,16 @@ int ib_find_cached_gid_by_port(struct ib_device *ib_dev, if (ndev) mask |= GID_ATTR_FIND_MASK_NETDEV; - local_index = find_gid(table, gid, &val, false, mask); + read_lock_irqsave(&table->rwlock, flags); + local_index = find_gid(table, gid, &val, false, mask, NULL); if (local_index >= 0) { if (index) *index = local_index; + read_unlock_irqrestore(&table->rwlock, flags); return 0; } + read_unlock_irqrestore(&table->rwlock, flags); return -ENOENT; } EXPORT_SYMBOL(ib_find_cached_gid_by_port); @@ -472,6 +557,7 @@ static int ib_cache_gid_find_by_filter(struct ib_device *ib_dev, struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; struct ib_gid_table *table; unsigned int i; + unsigned long flags; bool found = false; if (!ports_table) @@ -484,11 +570,10 @@ static int ib_cache_gid_find_by_filter(struct ib_device *ib_dev, table = ports_table[port - rdma_start_port(ib_dev)]; + read_lock_irqsave(&table->rwlock, flags); for (i = 0; i < table->sz; i++) { struct ib_gid_attr attr; - unsigned long flags; - read_lock_irqsave(&table->data_vec[i].lock, flags); if (table->data_vec[i].props & GID_TABLE_ENTRY_INVALID) goto next; @@ -501,11 +586,10 @@ static int ib_cache_gid_find_by_filter(struct ib_device *ib_dev, found = true; next: - read_unlock_irqrestore(&table->data_vec[i].lock, flags); - if (found) break; } + read_unlock_irqrestore(&table->rwlock, flags); if (!found) return -ENOENT; @@ -517,9 +601,9 @@ next: static struct ib_gid_table *alloc_gid_table(int sz) { - unsigned int i; struct ib_gid_table *table = kzalloc(sizeof(struct ib_gid_table), GFP_KERNEL); + if (!table) return NULL; @@ -530,9 +614,7 @@ static struct ib_gid_table *alloc_gid_table(int sz) mutex_init(&table->lock); table->sz = sz; - - for (i = 0; i < sz; i++) - rwlock_init(&table->data_vec[i].lock); + rwlock_init(&table->rwlock); return table; @@ -553,30 +635,37 @@ static void cleanup_gid_table_port(struct ib_device *ib_dev, u8 port, struct ib_gid_table *table) { int i; + bool deleted = false; if (!table) return; + write_lock_irq(&table->rwlock); for (i = 0; i < table->sz; ++i) { if (memcmp(&table->data_vec[i].gid, &zgid, sizeof(table->data_vec[i].gid))) - del_gid(ib_dev, port, table, i, - table->data_vec[i].props & - GID_ATTR_FIND_MASK_DEFAULT); + if (!del_gid(ib_dev, port, table, i, + table->data_vec[i].props & + GID_ATTR_FIND_MASK_DEFAULT)) + deleted = true; } + write_unlock_irq(&table->rwlock); + + if (deleted) + dispatch_gid_change_event(ib_dev, port); } void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port, struct net_device *ndev, + unsigned long gid_type_mask, enum ib_cache_gid_default_mode mode) { struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; union 
ib_gid gid; struct ib_gid_attr gid_attr; + struct ib_gid_attr zattr_type = zattr; struct ib_gid_table *table; - int ix; - union ib_gid current_gid; - struct ib_gid_attr current_gid_attr = {}; + unsigned int gid_type; table = ports_table[port - rdma_start_port(ib_dev)]; @@ -584,46 +673,82 @@ void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port, memset(&gid_attr, 0, sizeof(gid_attr)); gid_attr.ndev = ndev; - mutex_lock(&table->lock); - ix = find_gid(table, NULL, NULL, true, GID_ATTR_FIND_MASK_DEFAULT); - - /* Couldn't find default GID location */ - WARN_ON(ix < 0); - - if (!__ib_cache_gid_get(ib_dev, port, ix, - &current_gid, &current_gid_attr) && - mode == IB_CACHE_GID_DEFAULT_MODE_SET && - !memcmp(&gid, &current_gid, sizeof(gid)) && - !memcmp(&gid_attr, &current_gid_attr, sizeof(gid_attr))) - goto unlock; - - if ((memcmp(&current_gid, &zgid, sizeof(current_gid)) || - memcmp(&current_gid_attr, &zattr, - sizeof(current_gid_attr))) && - del_gid(ib_dev, port, table, ix, true)) { - pr_warn("ib_cache_gid: can't delete index %d for default gid %pI6\n", - ix, gid.raw); - goto unlock; - } + for (gid_type = 0; gid_type < IB_GID_TYPE_SIZE; ++gid_type) { + int ix; + union ib_gid current_gid; + struct ib_gid_attr current_gid_attr = {}; + + if (1UL << gid_type & ~gid_type_mask) + continue; + + gid_attr.gid_type = gid_type; + + mutex_lock(&table->lock); + write_lock_irq(&table->rwlock); + ix = find_gid(table, NULL, &gid_attr, true, + GID_ATTR_FIND_MASK_GID_TYPE | + GID_ATTR_FIND_MASK_DEFAULT, + NULL); + + /* Couldn't find default GID location */ + WARN_ON(ix < 0); + + zattr_type.gid_type = gid_type; + + if (!__ib_cache_gid_get(ib_dev, port, ix, + &current_gid, &current_gid_attr) && + mode == IB_CACHE_GID_DEFAULT_MODE_SET && + !memcmp(&gid, &current_gid, sizeof(gid)) && + !memcmp(&gid_attr, &current_gid_attr, sizeof(gid_attr))) + goto release; + + if (memcmp(&current_gid, &zgid, sizeof(current_gid)) || + memcmp(&current_gid_attr, &zattr_type, + sizeof(current_gid_attr))) { + if (del_gid(ib_dev, port, table, ix, true)) { + pr_warn("ib_cache_gid: can't delete index %d for default gid %pI6\n", + ix, gid.raw); + goto release; + } else { + dispatch_gid_change_event(ib_dev, port); + } + } - if (mode == IB_CACHE_GID_DEFAULT_MODE_SET) - if (add_gid(ib_dev, port, table, ix, &gid, &gid_attr, true)) - pr_warn("ib_cache_gid: unable to add default gid %pI6\n", - gid.raw); + if (mode == IB_CACHE_GID_DEFAULT_MODE_SET) { + if (add_gid(ib_dev, port, table, ix, &gid, &gid_attr, true)) + pr_warn("ib_cache_gid: unable to add default gid %pI6\n", + gid.raw); + else + dispatch_gid_change_event(ib_dev, port); + } -unlock: - if (current_gid_attr.ndev) - dev_put(current_gid_attr.ndev); - mutex_unlock(&table->lock); +release: + if (current_gid_attr.ndev) + dev_put(current_gid_attr.ndev); + write_unlock_irq(&table->rwlock); + mutex_unlock(&table->lock); + } } static int gid_table_reserve_default(struct ib_device *ib_dev, u8 port, struct ib_gid_table *table) { - if (rdma_protocol_roce(ib_dev, port)) { - struct ib_gid_table_entry *entry = &table->data_vec[0]; + unsigned int i; + unsigned long roce_gid_type_mask; + unsigned int num_default_gids; + unsigned int current_gid = 0; + + roce_gid_type_mask = roce_gid_type_mask_support(ib_dev, port); + num_default_gids = hweight_long(roce_gid_type_mask); + for (i = 0; i < num_default_gids && i < table->sz; i++) { + struct ib_gid_table_entry *entry = + &table->data_vec[i]; entry->props |= GID_TABLE_ENTRY_DEFAULT; + current_gid = find_next_bit(&roce_gid_type_mask, + BITS_PER_LONG, + current_gid); + entry->attr.gid_type = current_gid++; } return 0; @@ 
-728,20 +853,30 @@ int ib_get_cached_gid(struct ib_device *device, union ib_gid *gid, struct ib_gid_attr *gid_attr) { + int res; + unsigned long flags; + struct ib_gid_table **ports_table = device->cache.gid_cache; + struct ib_gid_table *table = ports_table[port_num - rdma_start_port(device)]; + if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device)) return -EINVAL; - return __ib_cache_gid_get(device, port_num, index, gid, gid_attr); + read_lock_irqsave(&table->rwlock, flags); + res = __ib_cache_gid_get(device, port_num, index, gid, gid_attr); + read_unlock_irqrestore(&table->rwlock, flags); + + return res; } EXPORT_SYMBOL(ib_get_cached_gid); int ib_find_cached_gid(struct ib_device *device, const union ib_gid *gid, + enum ib_gid_type gid_type, struct net_device *ndev, u8 *port_num, u16 *index) { - return ib_cache_gid_find(device, gid, ndev, port_num, index); + return ib_cache_gid_find(device, gid, gid_type, ndev, port_num, index); } EXPORT_SYMBOL(ib_find_cached_gid); @@ -956,10 +1091,12 @@ static void ib_cache_update(struct ib_device *device, device->cache.pkey_cache[port - rdma_start_port(device)] = pkey_cache; if (!use_roce_gid_table) { + write_lock(&table->rwlock); for (i = 0; i < gid_cache->table_len; i++) { modify_gid(device, port, table, i, gid_cache->table + i, &zattr, false); } + write_unlock(&table->rwlock); } device->cache.lmc_cache[port - rdma_start_port(device)] = tprops->lmc; diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 0a26dd6d9b19..1d92e091e22e 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -364,7 +364,7 @@ static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av) read_lock_irqsave(&cm.device_lock, flags); list_for_each_entry(cm_dev, &cm.device_list, list) { if (!ib_find_cached_gid(cm_dev->ib_device, &path->sgid, - ndev, &p, NULL)) { + path->gid_type, ndev, &p, NULL)) { port = cm_dev->port[p-1]; break; } @@ -782,11 +782,11 @@ static void cm_enter_timewait(struct cm_id_private *cm_id_priv) wait_time = cm_convert_to_ms(cm_id_priv->av.timeout); /* Check if the device started its remove_one */ - spin_lock_irq(&cm.lock); + spin_lock_irqsave(&cm.lock, flags); if (!cm_dev->going_down) queue_delayed_work(cm.wq, &cm_id_priv->timewait_info->work.work, msecs_to_jiffies(wait_time)); - spin_unlock_irq(&cm.lock); + spin_unlock_irqrestore(&cm.lock, flags); cm_id_priv->timewait_info = NULL; } @@ -1600,6 +1600,8 @@ static int cm_req_handler(struct cm_work *work) struct ib_cm_id *cm_id; struct cm_id_private *cm_id_priv, *listen_cm_id_priv; struct cm_req_msg *req_msg; + union ib_gid gid; + struct ib_gid_attr gid_attr; int ret; req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad; @@ -1639,11 +1641,31 @@ static int cm_req_handler(struct cm_work *work) cm_format_paths_from_req(req_msg, &work->path[0], &work->path[1]); memcpy(work->path[0].dmac, cm_id_priv->av.ah_attr.dmac, ETH_ALEN); - ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av); + work->path[0].hop_limit = cm_id_priv->av.ah_attr.grh.hop_limit; + ret = ib_get_cached_gid(work->port->cm_dev->ib_device, + work->port->port_num, + cm_id_priv->av.ah_attr.grh.sgid_index, + &gid, &gid_attr); + if (!ret) { + if (gid_attr.ndev) { + work->path[0].ifindex = gid_attr.ndev->ifindex; + work->path[0].net = dev_net(gid_attr.ndev); + dev_put(gid_attr.ndev); + } + work->path[0].gid_type = gid_attr.gid_type; + ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av); + } if (ret) { - ib_get_cached_gid(work->port->cm_dev->ib_device, - 
work->port->port_num, 0, &work->path[0].sgid, - NULL); + int err = ib_get_cached_gid(work->port->cm_dev->ib_device, + work->port->port_num, 0, + &work->path[0].sgid, + &gid_attr); + if (!err && gid_attr.ndev) { + work->path[0].ifindex = gid_attr.ndev->ifindex; + work->path[0].net = dev_net(gid_attr.ndev); + dev_put(gid_attr.ndev); + } + work->path[0].gid_type = gid_attr.gid_type; ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_GID, &work->path[0].sgid, sizeof work->path[0].sgid, NULL, 0); @@ -3482,6 +3504,7 @@ int ib_cm_notify(struct ib_cm_id *cm_id, enum ib_event_type event) EXPORT_SYMBOL(ib_cm_notify); static void cm_recv_handler(struct ib_mad_agent *mad_agent, + struct ib_mad_send_buf *send_buf, struct ib_mad_recv_wc *mad_recv_wc) { struct cm_port *port = mad_agent->context; @@ -3731,16 +3754,6 @@ int ib_cm_init_qp_attr(struct ib_cm_id *cm_id, } EXPORT_SYMBOL(ib_cm_init_qp_attr); -static void cm_get_ack_delay(struct cm_device *cm_dev) -{ - struct ib_device_attr attr; - - if (ib_query_device(cm_dev->ib_device, &attr)) - cm_dev->ack_delay = 0; /* acks will rely on packet life time */ - else - cm_dev->ack_delay = attr.local_ca_ack_delay; -} - static ssize_t cm_show_counter(struct kobject *obj, struct attribute *attr, char *buf) { @@ -3852,7 +3865,7 @@ static void cm_add_one(struct ib_device *ib_device) return; cm_dev->ib_device = ib_device; - cm_get_ack_delay(cm_dev); + cm_dev->ack_delay = ib_device->attrs.local_ca_ack_delay; cm_dev->going_down = 0; cm_dev->device = device_create(&cm_class, &ib_device->dev, MKDEV(0, 0), NULL, diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 2d762a2ecd81..9729639df407 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -38,6 +38,7 @@ #include <linux/in6.h> #include <linux/mutex.h> #include <linux/random.h> +#include <linux/igmp.h> #include <linux/idr.h> #include <linux/inetdevice.h> #include <linux/slab.h> @@ -60,6 +61,8 @@ #include <rdma/ib_sa.h> #include <rdma/iw_cm.h> +#include "core_priv.h" + MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("Generic RDMA CM Agent"); MODULE_LICENSE("Dual BSD/GPL"); @@ -150,6 +153,7 @@ struct cma_device { struct completion comp; atomic_t refcount; struct list_head id_list; + enum ib_gid_type *default_gid_type; }; struct rdma_bind_list { @@ -185,6 +189,67 @@ enum { CMA_OPTION_AFONLY, }; +void cma_ref_dev(struct cma_device *cma_dev) +{ + atomic_inc(&cma_dev->refcount); +} + +struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter filter, + void *cookie) +{ + struct cma_device *cma_dev; + struct cma_device *found_cma_dev = NULL; + + mutex_lock(&lock); + + list_for_each_entry(cma_dev, &dev_list, list) + if (filter(cma_dev->device, cookie)) { + found_cma_dev = cma_dev; + break; + } + + if (found_cma_dev) + cma_ref_dev(found_cma_dev); + mutex_unlock(&lock); + return found_cma_dev; +} + +int cma_get_default_gid_type(struct cma_device *cma_dev, + unsigned int port) +{ + if (port < rdma_start_port(cma_dev->device) || + port > rdma_end_port(cma_dev->device)) + return -EINVAL; + + return cma_dev->default_gid_type[port - rdma_start_port(cma_dev->device)]; +} + +int cma_set_default_gid_type(struct cma_device *cma_dev, + unsigned int port, + enum ib_gid_type default_gid_type) +{ + unsigned long supported_gids; + + if (port < rdma_start_port(cma_dev->device) || + port > rdma_end_port(cma_dev->device)) + return -EINVAL; + + supported_gids = roce_gid_type_mask_support(cma_dev->device, port); + + if (!(supported_gids & 1 << default_gid_type)) + return -EINVAL; + + 
cma_dev->default_gid_type[port - rdma_start_port(cma_dev->device)] = + default_gid_type; + + return 0; +} + +struct ib_device *cma_get_ib_dev(struct cma_device *cma_dev) +{ + return cma_dev->device; +} + /* * Device removal can occur at anytime, so we need extra handling to * serialize notifying the user of device removal with other callbacks. @@ -228,6 +293,7 @@ struct rdma_id_private { u8 tos; u8 reuseaddr; u8 afonly; + enum ib_gid_type gid_type; }; struct cma_multicast { @@ -239,6 +305,7 @@ struct cma_multicast { void *context; struct sockaddr_storage addr; struct kref mcref; + bool igmp_joined; }; struct cma_work { @@ -335,18 +402,48 @@ static inline void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver) hdr->ip_version = (ip_ver << 4) | (hdr->ip_version & 0xF); } -static void cma_attach_to_dev(struct rdma_id_private *id_priv, - struct cma_device *cma_dev) +static int cma_igmp_send(struct net_device *ndev, union ib_gid *mgid, bool join) { - atomic_inc(&cma_dev->refcount); + struct in_device *in_dev = NULL; + + if (ndev) { + rtnl_lock(); + in_dev = __in_dev_get_rtnl(ndev); + if (in_dev) { + if (join) + ip_mc_inc_group(in_dev, + *(__be32 *)(mgid->raw + 12)); + else + ip_mc_dec_group(in_dev, + *(__be32 *)(mgid->raw + 12)); + } + rtnl_unlock(); + } + return (in_dev) ? 0 : -ENODEV; +} + +static void _cma_attach_to_dev(struct rdma_id_private *id_priv, + struct cma_device *cma_dev) +{ + cma_ref_dev(cma_dev); id_priv->cma_dev = cma_dev; + id_priv->gid_type = 0; id_priv->id.device = cma_dev->device; id_priv->id.route.addr.dev_addr.transport = rdma_node_get_transport(cma_dev->device->node_type); list_add_tail(&id_priv->list, &cma_dev->id_list); } -static inline void cma_deref_dev(struct cma_device *cma_dev) +static void cma_attach_to_dev(struct rdma_id_private *id_priv, + struct cma_device *cma_dev) +{ + _cma_attach_to_dev(id_priv, cma_dev); + id_priv->gid_type = + cma_dev->default_gid_type[id_priv->id.port_num - + rdma_start_port(cma_dev->device)]; +} + +void cma_deref_dev(struct cma_device *cma_dev) { if (atomic_dec_and_test(&cma_dev->refcount)) complete(&cma_dev->comp); @@ -441,6 +538,7 @@ static int cma_translate_addr(struct sockaddr *addr, struct rdma_dev_addr *dev_a } static inline int cma_validate_port(struct ib_device *device, u8 port, + enum ib_gid_type gid_type, union ib_gid *gid, int dev_type, int bound_if_index) { @@ -453,10 +551,25 @@ static inline int cma_validate_port(struct ib_device *device, u8 port, if ((dev_type != ARPHRD_INFINIBAND) && rdma_protocol_ib(device, port)) return ret; - if (dev_type == ARPHRD_ETHER) + if (dev_type == ARPHRD_ETHER && rdma_protocol_roce(device, port)) { ndev = dev_get_by_index(&init_net, bound_if_index); + if (ndev && ndev->flags & IFF_LOOPBACK) { + pr_info("detected loopback device\n"); + dev_put(ndev); - ret = ib_find_cached_gid_by_port(device, gid, port, ndev, NULL); + if (!device->get_netdev) + return -EOPNOTSUPP; + + ndev = device->get_netdev(device, port); + if (!ndev) + return -ENODEV; + } + } else { + gid_type = IB_GID_TYPE_IB; + } + + ret = ib_find_cached_gid_by_port(device, gid, gid_type, port, + ndev, NULL); if (ndev) dev_put(ndev); @@ -490,7 +603,10 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv, gidp = rdma_protocol_roce(cma_dev->device, port) ? &iboe_gid : &gid; - ret = cma_validate_port(cma_dev->device, port, gidp, + ret = cma_validate_port(cma_dev->device, port, + rdma_protocol_ib(cma_dev->device, port) ? 
+ IB_GID_TYPE_IB : + listen_id_priv->gid_type, gidp, dev_addr->dev_type, dev_addr->bound_dev_if); if (!ret) { @@ -509,8 +625,11 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv, gidp = rdma_protocol_roce(cma_dev->device, port) ? &iboe_gid : &gid; - ret = cma_validate_port(cma_dev->device, port, gidp, - dev_addr->dev_type, + ret = cma_validate_port(cma_dev->device, port, + rdma_protocol_ib(cma_dev->device, port) ? + IB_GID_TYPE_IB : + cma_dev->default_gid_type[port - 1], + gidp, dev_addr->dev_type, dev_addr->bound_dev_if); if (!ret) { id_priv->id.port_num = port; @@ -1437,8 +1556,24 @@ static void cma_leave_mc_groups(struct rdma_id_private *id_priv) id_priv->id.port_num)) { ib_sa_free_multicast(mc->multicast.ib); kfree(mc); - } else + } else { + if (mc->igmp_joined) { + struct rdma_dev_addr *dev_addr = + &id_priv->id.route.addr.dev_addr; + struct net_device *ndev = NULL; + + if (dev_addr->bound_dev_if) + ndev = dev_get_by_index(&init_net, + dev_addr->bound_dev_if); + if (ndev) { + cma_igmp_send(ndev, + &mc->multicast.ib->rec.mgid, + false); + dev_put(ndev); + } + } kref_put(&mc->mcref, release_mc); + } } } @@ -1896,7 +2031,6 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, struct rdma_id_private *listen_id, *conn_id; struct rdma_cm_event event; int ret; - struct ib_device_attr attr; struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr; struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr; @@ -1938,13 +2072,6 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, memcpy(cma_src_addr(conn_id), laddr, rdma_addr_size(laddr)); memcpy(cma_dst_addr(conn_id), raddr, rdma_addr_size(raddr)); - ret = ib_query_device(conn_id->id.device, &attr); - if (ret) { - mutex_unlock(&conn_id->handler_mutex); - rdma_destroy_id(new_cm_id); - goto out; - } - memset(&event, 0, sizeof event); event.event = RDMA_CM_EVENT_CONNECT_REQUEST; event.param.conn.private_data = iw_event->private_data; @@ -2051,7 +2178,7 @@ static void cma_listen_on_dev(struct rdma_id_private *id_priv, memcpy(cma_src_addr(dev_id_priv), cma_src_addr(id_priv), rdma_addr_size(cma_src_addr(id_priv))); - cma_attach_to_dev(dev_id_priv, cma_dev); + _cma_attach_to_dev(dev_id_priv, cma_dev); list_add_tail(&dev_id_priv->listen_list, &id_priv->listen_list); atomic_inc(&id_priv->refcount); dev_id_priv->internal_id = 1; @@ -2321,8 +2448,23 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) if (addr->dev_addr.bound_dev_if) { ndev = dev_get_by_index(&init_net, addr->dev_addr.bound_dev_if); + if (!ndev) + return -ENODEV; + + if (ndev->flags & IFF_LOOPBACK) { + dev_put(ndev); + if (!id_priv->id.device->get_netdev) + return -EOPNOTSUPP; + + ndev = id_priv->id.device->get_netdev(id_priv->id.device, + id_priv->id.port_num); + if (!ndev) + return -ENODEV; + } + route->path_rec->net = &init_net; - route->path_rec->ifindex = addr->dev_addr.bound_dev_if; + route->path_rec->ifindex = ndev->ifindex; + route->path_rec->gid_type = id_priv->gid_type; } if (!ndev) { ret = -ENODEV; @@ -2336,7 +2478,14 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.dst_addr, &route->path_rec->dgid); - route->path_rec->hop_limit = 1; + /* Use the hint from IP Stack to select GID Type */ + if (route->path_rec->gid_type < ib_network_to_gid_type(addr->dev_addr.network)) + route->path_rec->gid_type = ib_network_to_gid_type(addr->dev_addr.network); + if (((struct sockaddr *)&id_priv->id.route.addr.dst_addr)->sa_family != AF_IB) + /* TODO: get the 
hoplimit from the inet/inet6 device */ + route->path_rec->hop_limit = addr->dev_addr.hoplimit; + else + route->path_rec->hop_limit = 1; route->path_rec->reversible = 1; route->path_rec->pkey = cpu_to_be16(0xffff); route->path_rec->mtu_selector = IB_SA_EQ; @@ -3534,12 +3683,23 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) event.status = status; event.param.ud.private_data = mc->context; if (!status) { + struct rdma_dev_addr *dev_addr = + &id_priv->id.route.addr.dev_addr; + struct net_device *ndev = + dev_get_by_index(&init_net, dev_addr->bound_dev_if); + enum ib_gid_type gid_type = + id_priv->cma_dev->default_gid_type[id_priv->id.port_num - + rdma_start_port(id_priv->cma_dev->device)]; + event.event = RDMA_CM_EVENT_MULTICAST_JOIN; ib_init_ah_from_mcmember(id_priv->id.device, id_priv->id.port_num, &multicast->rec, + ndev, gid_type, &event.param.ud.ah_attr); event.param.ud.qp_num = 0xFFFFFF; event.param.ud.qkey = be32_to_cpu(multicast->rec.qkey); + if (ndev) + dev_put(ndev); } else event.event = RDMA_CM_EVENT_MULTICAST_ERROR; @@ -3672,9 +3832,10 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, { struct iboe_mcast_work *work; struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; - int err; + int err = 0; struct sockaddr *addr = (struct sockaddr *)&mc->addr; struct net_device *ndev = NULL; + enum ib_gid_type gid_type; if (cma_zero_addr((struct sockaddr *)&mc->addr)) return -EINVAL; @@ -3704,9 +3865,25 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, mc->multicast.ib->rec.rate = iboe_get_rate(ndev); mc->multicast.ib->rec.hop_limit = 1; mc->multicast.ib->rec.mtu = iboe_get_mtu(ndev->mtu); + + gid_type = id_priv->cma_dev->default_gid_type[id_priv->id.port_num - + rdma_start_port(id_priv->cma_dev->device)]; + if (addr->sa_family == AF_INET) { + if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) + err = cma_igmp_send(ndev, &mc->multicast.ib->rec.mgid, + true); + if (!err) { + mc->igmp_joined = true; + mc->multicast.ib->rec.hop_limit = IPV6_DEFAULT_HOPLIMIT; + } + } else { + if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) + err = -ENOTSUPP; + } dev_put(ndev); - if (!mc->multicast.ib->rec.mtu) { - err = -EINVAL; + if (err || !mc->multicast.ib->rec.mtu) { + if (!err) + err = -EINVAL; goto out2; } rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, @@ -3745,7 +3922,7 @@ int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, memcpy(&mc->addr, addr, rdma_addr_size(addr)); mc->context = context; mc->id_priv = id_priv; - + mc->igmp_joined = false; spin_lock(&id_priv->lock); list_add(&mc->list, &id_priv->mc_list); spin_unlock(&id_priv->lock); @@ -3790,9 +3967,25 @@ void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr) if (rdma_cap_ib_mcast(id->device, id->port_num)) { ib_sa_free_multicast(mc->multicast.ib); kfree(mc); - } else if (rdma_protocol_roce(id->device, id->port_num)) + } else if (rdma_protocol_roce(id->device, id->port_num)) { + if (mc->igmp_joined) { + struct rdma_dev_addr *dev_addr = + &id->route.addr.dev_addr; + struct net_device *ndev = NULL; + + if (dev_addr->bound_dev_if) + ndev = dev_get_by_index(&init_net, + dev_addr->bound_dev_if); + if (ndev) { + cma_igmp_send(ndev, + &mc->multicast.ib->rec.mgid, + false); + dev_put(ndev); + } + mc->igmp_joined = false; + } kref_put(&mc->mcref, release_mc); - + } return; } } @@ -3861,12 +4054,27 @@ static void cma_add_one(struct ib_device *device) { struct cma_device *cma_dev; struct rdma_id_private *id_priv; + unsigned int i; + 
unsigned long supported_gids = 0; cma_dev = kmalloc(sizeof *cma_dev, GFP_KERNEL); if (!cma_dev) return; cma_dev->device = device; + cma_dev->default_gid_type = kcalloc(device->phys_port_cnt, + sizeof(*cma_dev->default_gid_type), + GFP_KERNEL); + if (!cma_dev->default_gid_type) { + kfree(cma_dev); + return; + } + for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) { + supported_gids = roce_gid_type_mask_support(device, i); + WARN_ON(!supported_gids); + cma_dev->default_gid_type[i - rdma_start_port(device)] = + find_first_bit(&supported_gids, BITS_PER_LONG); + } init_completion(&cma_dev->comp); atomic_set(&cma_dev->refcount, 1); @@ -3946,6 +4154,7 @@ static void cma_remove_one(struct ib_device *device, void *client_data) mutex_unlock(&lock); cma_process_remove(cma_dev); + kfree(cma_dev->default_gid_type); kfree(cma_dev); } @@ -4079,6 +4288,7 @@ static int __init cma_init(void) if (ibnl_add_client(RDMA_NL_RDMA_CM, RDMA_NL_RDMA_CM_NUM_OPS, cma_cb_table)) printk(KERN_WARNING "RDMA CMA: failed to add netlink callback\n"); + cma_configfs_init(); return 0; @@ -4093,6 +4303,7 @@ err_wq: static void __exit cma_cleanup(void) { + cma_configfs_exit(); ibnl_remove_client(RDMA_NL_RDMA_CM); ib_unregister_client(&cma_client); unregister_netdevice_notifier(&cma_nb); diff --git a/drivers/infiniband/core/cma_configfs.c b/drivers/infiniband/core/cma_configfs.c new file mode 100644 index 000000000000..18b112aa577e --- /dev/null +++ b/drivers/infiniband/core/cma_configfs.c @@ -0,0 +1,321 @@ +/* + * Copyright (c) 2015, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <linux/module.h> +#include <linux/configfs.h> +#include <rdma/ib_verbs.h> +#include "core_priv.h" + +struct cma_device; + +struct cma_dev_group; + +struct cma_dev_port_group { + unsigned int port_num; + struct cma_dev_group *cma_dev_group; + struct config_group group; +}; + +struct cma_dev_group { + char name[IB_DEVICE_NAME_MAX]; + struct config_group device_group; + struct config_group ports_group; + struct config_group *default_dev_group[2]; + struct config_group **default_ports_group; + struct cma_dev_port_group *ports; +}; + +static struct cma_dev_port_group *to_dev_port_group(struct config_item *item) +{ + struct config_group *group; + + if (!item) + return NULL; + + group = container_of(item, struct config_group, cg_item); + return container_of(group, struct cma_dev_port_group, group); +} + +static bool filter_by_name(struct ib_device *ib_dev, void *cookie) +{ + return !strcmp(ib_dev->name, cookie); +} + +static int cma_configfs_params_get(struct config_item *item, + struct cma_device **pcma_dev, + struct cma_dev_port_group **pgroup) +{ + struct cma_dev_port_group *group = to_dev_port_group(item); + struct cma_device *cma_dev; + + if (!group) + return -ENODEV; + + cma_dev = cma_enum_devices_by_ibdev(filter_by_name, + group->cma_dev_group->name); + if (!cma_dev) + return -ENODEV; + + *pcma_dev = cma_dev; + *pgroup = group; + + return 0; +} + +static void cma_configfs_params_put(struct cma_device *cma_dev) +{ + cma_deref_dev(cma_dev); +} + +static ssize_t default_roce_mode_show(struct config_item *item, + char *buf) +{ + struct cma_device *cma_dev; + struct cma_dev_port_group *group; + int gid_type; + ssize_t ret; + + ret = cma_configfs_params_get(item, &cma_dev, &group); + if (ret) + return ret; + + gid_type = cma_get_default_gid_type(cma_dev, group->port_num); + cma_configfs_params_put(cma_dev); + + if (gid_type < 0) + return gid_type; + + return sprintf(buf, "%s\n", ib_cache_gid_type_str(gid_type)); +} + +static ssize_t default_roce_mode_store(struct config_item *item, + const char *buf, size_t count) +{ + struct cma_device *cma_dev; + struct cma_dev_port_group *group; + int gid_type = ib_cache_gid_parse_type_str(buf); + ssize_t ret; + + if (gid_type < 0) + return -EINVAL; + + ret = cma_configfs_params_get(item, &cma_dev, &group); + if (ret) + return ret; + + ret = cma_set_default_gid_type(cma_dev, group->port_num, gid_type); + + cma_configfs_params_put(cma_dev); + + return !ret ? 
strnlen(buf, count) : ret; +} + +CONFIGFS_ATTR(, default_roce_mode); + +static struct configfs_attribute *cma_configfs_attributes[] = { + &attr_default_roce_mode, + NULL, +}; + +static struct config_item_type cma_port_group_type = { + .ct_attrs = cma_configfs_attributes, + .ct_owner = THIS_MODULE +}; + +static int make_cma_ports(struct cma_dev_group *cma_dev_group, + struct cma_device *cma_dev) +{ + struct ib_device *ibdev; + unsigned int i; + unsigned int ports_num; + struct cma_dev_port_group *ports; + struct config_group **ports_group; + int err; + + ibdev = cma_get_ib_dev(cma_dev); + + if (!ibdev) + return -ENODEV; + + ports_num = ibdev->phys_port_cnt; + ports = kcalloc(ports_num, sizeof(*cma_dev_group->ports), + GFP_KERNEL); + ports_group = kcalloc(ports_num + 1, sizeof(*ports_group), GFP_KERNEL); + + if (!ports || !ports_group) { + err = -ENOMEM; + goto free; + } + + for (i = 0; i < ports_num; i++) { + char port_str[10]; + + ports[i].port_num = i + 1; + snprintf(port_str, sizeof(port_str), "%u", i + 1); + ports[i].cma_dev_group = cma_dev_group; + config_group_init_type_name(&ports[i].group, + port_str, + &cma_port_group_type); + ports_group[i] = &ports[i].group; + } + ports_group[i] = NULL; + cma_dev_group->default_ports_group = ports_group; + cma_dev_group->ports = ports; + + return 0; +free: + kfree(ports); + kfree(ports_group); + cma_dev_group->ports = NULL; + cma_dev_group->default_ports_group = NULL; + return err; +} + +static void release_cma_dev(struct config_item *item) +{ + struct config_group *group = container_of(item, struct config_group, + cg_item); + struct cma_dev_group *cma_dev_group = container_of(group, + struct cma_dev_group, + device_group); + + kfree(cma_dev_group); +}; + +static void release_cma_ports_group(struct config_item *item) +{ + struct config_group *group = container_of(item, struct config_group, + cg_item); + struct cma_dev_group *cma_dev_group = container_of(group, + struct cma_dev_group, + ports_group); + + kfree(cma_dev_group->ports); + kfree(cma_dev_group->default_ports_group); + cma_dev_group->ports = NULL; + cma_dev_group->default_ports_group = NULL; +}; + +static struct configfs_item_operations cma_ports_item_ops = { + .release = release_cma_ports_group +}; + +static struct config_item_type cma_ports_group_type = { + .ct_item_ops = &cma_ports_item_ops, + .ct_owner = THIS_MODULE +}; + +static struct configfs_item_operations cma_device_item_ops = { + .release = release_cma_dev +}; + +static struct config_item_type cma_device_group_type = { + .ct_item_ops = &cma_device_item_ops, + .ct_owner = THIS_MODULE +}; + +static struct config_group *make_cma_dev(struct config_group *group, + const char *name) +{ + int err = -ENODEV; + struct cma_device *cma_dev = cma_enum_devices_by_ibdev(filter_by_name, + (void *)name); + struct cma_dev_group *cma_dev_group = NULL; + + if (!cma_dev) + goto fail; + + cma_dev_group = kzalloc(sizeof(*cma_dev_group), GFP_KERNEL); + + if (!cma_dev_group) { + err = -ENOMEM; + goto fail; + } + + strncpy(cma_dev_group->name, name, sizeof(cma_dev_group->name)); + + err = make_cma_ports(cma_dev_group, cma_dev); + if (err) + goto fail; + + cma_dev_group->ports_group.default_groups = + cma_dev_group->default_ports_group; + config_group_init_type_name(&cma_dev_group->ports_group, "ports", + &cma_ports_group_type); + + cma_dev_group->device_group.default_groups + = cma_dev_group->default_dev_group; + cma_dev_group->default_dev_group[0] = &cma_dev_group->ports_group; + cma_dev_group->default_dev_group[1] = NULL; + + 
config_group_init_type_name(&cma_dev_group->device_group, name, + &cma_device_group_type); + + cma_deref_dev(cma_dev); + return &cma_dev_group->device_group; + +fail: + if (cma_dev) + cma_deref_dev(cma_dev); + kfree(cma_dev_group); + return ERR_PTR(err); +} + +static struct configfs_group_operations cma_subsys_group_ops = { + .make_group = make_cma_dev, +}; + +static struct config_item_type cma_subsys_type = { + .ct_group_ops = &cma_subsys_group_ops, + .ct_owner = THIS_MODULE, +}; + +static struct configfs_subsystem cma_subsys = { + .su_group = { + .cg_item = { + .ci_namebuf = "rdma_cm", + .ci_type = &cma_subsys_type, + }, + }, +}; + +int __init cma_configfs_init(void) +{ + config_group_init(&cma_subsys.su_group); + mutex_init(&cma_subsys.su_mutex); + return configfs_register_subsystem(&cma_subsys); +} + +void __exit cma_configfs_exit(void) +{ + configfs_unregister_subsystem(&cma_subsys); +} diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index 5cf6eb716f00..eab32215756b 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -38,6 +38,32 @@ #include <rdma/ib_verbs.h> +#if IS_ENABLED(CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS) +int cma_configfs_init(void); +void cma_configfs_exit(void); +#else +static inline int cma_configfs_init(void) +{ + return 0; +} + +static inline void cma_configfs_exit(void) +{ +} +#endif +struct cma_device; +void cma_ref_dev(struct cma_device *cma_dev); +void cma_deref_dev(struct cma_device *cma_dev); +typedef bool (*cma_device_filter)(struct ib_device *, void *); +struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter filter, + void *cookie); +int cma_get_default_gid_type(struct cma_device *cma_dev, + unsigned int port); +int cma_set_default_gid_type(struct cma_device *cma_dev, + unsigned int port, + enum ib_gid_type default_gid_type); +struct ib_device *cma_get_ib_dev(struct cma_device *cma_dev); + int ib_device_register_sysfs(struct ib_device *device, int (*port_callback)(struct ib_device *, u8, struct kobject *)); @@ -70,8 +96,13 @@ enum ib_cache_gid_default_mode { IB_CACHE_GID_DEFAULT_MODE_DELETE }; +int ib_cache_gid_parse_type_str(const char *buf); + +const char *ib_cache_gid_type_str(enum ib_gid_type gid_type); + void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port, struct net_device *ndev, + unsigned long gid_type_mask, enum ib_cache_gid_default_mode mode); int ib_cache_gid_add(struct ib_device *ib_dev, u8 port, @@ -87,9 +118,23 @@ int roce_gid_mgmt_init(void); void roce_gid_mgmt_cleanup(void); int roce_rescan_device(struct ib_device *ib_dev); +unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u8 port); int ib_cache_setup_one(struct ib_device *device); void ib_cache_cleanup_one(struct ib_device *device); void ib_cache_release_one(struct ib_device *device); +static inline bool rdma_is_upper_dev_rcu(struct net_device *dev, + struct net_device *upper) +{ + struct net_device *_upper = NULL; + struct list_head *iter; + + netdev_for_each_all_upper_dev_rcu(dev, _upper, iter) + if (_upper == upper) + break; + + return _upper == upper; +} + #endif /* _CORE_PRIV_H */ diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c new file mode 100644 index 000000000000..a754fc727de5 --- /dev/null +++ b/drivers/infiniband/core/cq.c @@ -0,0 +1,209 @@ +/* + * Copyright (c) 2015 HGST, a Western Digital Company. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#include <linux/module.h> +#include <linux/err.h> +#include <linux/slab.h> +#include <rdma/ib_verbs.h> + +/* # of WCs to poll for with a single call to ib_poll_cq */ +#define IB_POLL_BATCH 16 + +/* # of WCs to iterate over before yielding */ +#define IB_POLL_BUDGET_IRQ 256 +#define IB_POLL_BUDGET_WORKQUEUE 65536 + +#define IB_POLL_FLAGS \ + (IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS) + +static int __ib_process_cq(struct ib_cq *cq, int budget) +{ + int i, n, completed = 0; + + while ((n = ib_poll_cq(cq, IB_POLL_BATCH, cq->wc)) > 0) { + for (i = 0; i < n; i++) { + struct ib_wc *wc = &cq->wc[i]; + + if (wc->wr_cqe) + wc->wr_cqe->done(cq, wc); + else + WARN_ON_ONCE(wc->status == IB_WC_SUCCESS); + } + + completed += n; + + if (n != IB_POLL_BATCH || + (budget != -1 && completed >= budget)) + break; + } + + return completed; +} + +/** + * ib_process_cq_direct - process a CQ in caller context + * @cq: CQ to process + * @budget: number of CQEs to poll for + * + * This function is used to process all outstanding CQ entries on a + * %IB_POLL_DIRECT CQ. It does not offload CQ processing to a different + * context and does not ask for completion interrupts from the HCA. + * + * Note: for compatibility reasons -1 can be passed in %budget for unlimited + * polling. Do not use this feature in new code, it will be removed soon. + */ +int ib_process_cq_direct(struct ib_cq *cq, int budget) +{ + WARN_ON_ONCE(cq->poll_ctx != IB_POLL_DIRECT); + + return __ib_process_cq(cq, budget); +} +EXPORT_SYMBOL(ib_process_cq_direct); + +static void ib_cq_completion_direct(struct ib_cq *cq, void *private) +{ + WARN_ONCE(1, "got unsolicited completion for CQ 0x%p\n", cq); +} + +static int ib_poll_handler(struct irq_poll *iop, int budget) +{ + struct ib_cq *cq = container_of(iop, struct ib_cq, iop); + int completed; + + completed = __ib_process_cq(cq, budget); + if (completed < budget) { + irq_poll_complete(&cq->iop); + if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) + irq_poll_sched(&cq->iop); + } + + return completed; +} + +static void ib_cq_completion_softirq(struct ib_cq *cq, void *private) +{ + irq_poll_sched(&cq->iop); +} + +static void ib_cq_poll_work(struct work_struct *work) +{ + struct ib_cq *cq = container_of(work, struct ib_cq, work); + int completed; + + completed = __ib_process_cq(cq, IB_POLL_BUDGET_WORKQUEUE); + if (completed >= IB_POLL_BUDGET_WORKQUEUE || + ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) + queue_work(ib_comp_wq, &cq->work); +} + +static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private) +{ + queue_work(ib_comp_wq, &cq->work); +} + +/** + * ib_alloc_cq - allocate a completion queue + * @dev: device to allocate the CQ for + * @private: driver private data, accessible from cq->cq_context + * @nr_cqe: number of CQEs to allocate + * @comp_vector: HCA completion vectors for this CQ + * @poll_ctx: context to poll the CQ from. + * + * This is the proper interface to allocate a CQ for in-kernel users. A + * CQ allocated with this interface will automatically be polled from the + * specified context. 
The ULP needs must use wr->wr_cqe instead of wr->wr_id + * to use this CQ abstraction. + */ +struct ib_cq *ib_alloc_cq(struct ib_device *dev, void *private, + int nr_cqe, int comp_vector, enum ib_poll_context poll_ctx) +{ + struct ib_cq_init_attr cq_attr = { + .cqe = nr_cqe, + .comp_vector = comp_vector, + }; + struct ib_cq *cq; + int ret = -ENOMEM; + + cq = dev->create_cq(dev, &cq_attr, NULL, NULL); + if (IS_ERR(cq)) + return cq; + + cq->device = dev; + cq->uobject = NULL; + cq->event_handler = NULL; + cq->cq_context = private; + cq->poll_ctx = poll_ctx; + atomic_set(&cq->usecnt, 0); + + cq->wc = kmalloc_array(IB_POLL_BATCH, sizeof(*cq->wc), GFP_KERNEL); + if (!cq->wc) + goto out_destroy_cq; + + switch (cq->poll_ctx) { + case IB_POLL_DIRECT: + cq->comp_handler = ib_cq_completion_direct; + break; + case IB_POLL_SOFTIRQ: + cq->comp_handler = ib_cq_completion_softirq; + + irq_poll_init(&cq->iop, IB_POLL_BUDGET_IRQ, ib_poll_handler); + ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + break; + case IB_POLL_WORKQUEUE: + cq->comp_handler = ib_cq_completion_workqueue; + INIT_WORK(&cq->work, ib_cq_poll_work); + ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + break; + default: + ret = -EINVAL; + goto out_free_wc; + } + + return cq; + +out_free_wc: + kfree(cq->wc); +out_destroy_cq: + cq->device->destroy_cq(cq); + return ERR_PTR(ret); +} +EXPORT_SYMBOL(ib_alloc_cq); + +/** + * ib_free_cq - free a completion queue + * @cq: completion queue to free. + */ +void ib_free_cq(struct ib_cq *cq) +{ + int ret; + + if (WARN_ON_ONCE(atomic_read(&cq->usecnt))) + return; + + switch (cq->poll_ctx) { + case IB_POLL_DIRECT: + break; + case IB_POLL_SOFTIRQ: + irq_poll_disable(&cq->iop); + break; + case IB_POLL_WORKQUEUE: + flush_work(&cq->work); + break; + default: + WARN_ON_ONCE(1); + } + + kfree(cq->wc); + ret = cq->device->destroy_cq(cq); + WARN_ON_ONCE(ret); +} +EXPORT_SYMBOL(ib_free_cq); diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 179e8134d57f..00da80e02154 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -58,6 +58,7 @@ struct ib_client_data { bool going_down; }; +struct workqueue_struct *ib_comp_wq; struct workqueue_struct *ib_wq; EXPORT_SYMBOL_GPL(ib_wq); @@ -325,6 +326,7 @@ int ib_register_device(struct ib_device *device, { int ret; struct ib_client *client; + struct ib_udata uhw = {.outlen = 0, .inlen = 0}; mutex_lock(&device_mutex); @@ -352,6 +354,13 @@ int ib_register_device(struct ib_device *device, goto out; } + memset(&device->attrs, 0, sizeof(device->attrs)); + ret = device->query_device(device, &device->attrs, &uhw); + if (ret) { + printk(KERN_WARNING "Couldn't query the device attributes\n"); + goto out; + } + ret = ib_device_register_sysfs(device, port_callback); if (ret) { printk(KERN_WARNING "Couldn't register device %s with driver model\n", @@ -628,25 +637,6 @@ void ib_dispatch_event(struct ib_event *event) EXPORT_SYMBOL(ib_dispatch_event); /** - * ib_query_device - Query IB device attributes - * @device:Device to query - * @device_attr:Device attributes - * - * ib_query_device() returns the attributes of a device through the - * @device_attr pointer. 
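(A minimal sketch of what replaces the removed ib_query_device() call: the attributes are now queried once at registration time and cached in struct ib_device, so a ULP capability check collapses into a plain field read. The helper name my_dev_has_odp is a hypothetical example, not part of this patch.)

	#include <rdma/ib_verbs.h>

	/* Previously: ib_query_device(dev, &attr) followed by a flags test;
	 * now the attributes cached by ib_register_device() are read directly.
	 */
	static bool my_dev_has_odp(struct ib_device *dev)
	{
		return dev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING;
	}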
- */ -int ib_query_device(struct ib_device *device, - struct ib_device_attr *device_attr) -{ - struct ib_udata uhw = {.outlen = 0, .inlen = 0}; - - memset(device_attr, 0, sizeof(*device_attr)); - - return device->query_device(device, device_attr, &uhw); -} -EXPORT_SYMBOL(ib_query_device); - -/** * ib_query_port - Query IB port attributes * @device:Device to query * @port_num:Port number to query @@ -825,26 +815,31 @@ EXPORT_SYMBOL(ib_modify_port); * a specified GID value occurs. * @device: The device to query. * @gid: The GID value to search for. + * @gid_type: Type of GID. * @ndev: The ndev related to the GID to search for. * @port_num: The port number of the device where the GID value was found. * @index: The index into the GID table where the GID was found. This * parameter may be NULL. */ int ib_find_gid(struct ib_device *device, union ib_gid *gid, - struct net_device *ndev, u8 *port_num, u16 *index) + enum ib_gid_type gid_type, struct net_device *ndev, + u8 *port_num, u16 *index) { union ib_gid tmp_gid; int ret, port, i; for (port = rdma_start_port(device); port <= rdma_end_port(device); ++port) { if (rdma_cap_roce_gid_table(device, port)) { - if (!ib_find_cached_gid_by_port(device, gid, port, + if (!ib_find_cached_gid_by_port(device, gid, gid_type, port, ndev, index)) { *port_num = port; return 0; } } + if (gid_type != IB_GID_TYPE_IB) + continue; + for (i = 0; i < device->port_immutable[port].gid_tbl_len; ++i) { ret = ib_query_gid(device, port, i, &tmp_gid, NULL); if (ret) @@ -954,10 +949,18 @@ static int __init ib_core_init(void) if (!ib_wq) return -ENOMEM; + ib_comp_wq = alloc_workqueue("ib-comp-wq", + WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM, + WQ_UNBOUND_MAX_ACTIVE); + if (!ib_comp_wq) { + ret = -ENOMEM; + goto err; + } + ret = class_register(&ib_class); if (ret) { printk(KERN_WARNING "Couldn't create InfiniBand device class\n"); - goto err; + goto err_comp; } ret = ibnl_init(); @@ -972,7 +975,8 @@ static int __init ib_core_init(void) err_sysfs: class_unregister(&ib_class); - +err_comp: + destroy_workqueue(ib_comp_wq); err: destroy_workqueue(ib_wq); return ret; @@ -983,6 +987,7 @@ static void __exit ib_core_cleanup(void) ib_cache_cleanup(); ibnl_cleanup(); class_unregister(&ib_class); + destroy_workqueue(ib_comp_wq); /* Make sure that any pending umem accounting work is done. 
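(A minimal consumer-side sketch of the new CQ abstraction from cq.c above, whose IB_POLL_WORKQUEUE mode is served by the ib-comp-wq workqueue created here. The my_* names, the CQE count and the completion-vector choice are illustrative assumptions; ib_alloc_cq(), struct ib_cqe and wr_cqe are the interface this series adds.)

	#include <rdma/ib_verbs.h>

	/* Embed an ib_cqe in the per-request state; the core then dispatches
	 * each completion through wc->wr_cqe->done instead of the old
	 * hand-rolled wr_id decoding.
	 */
	struct my_request {
		struct ib_cqe cqe;
		void *buf;
	};

	static void my_send_done(struct ib_cq *cq, struct ib_wc *wc)
	{
		struct my_request *req =
			container_of(wc->wr_cqe, struct my_request, cqe);

		if (wc->status != IB_WC_SUCCESS)
			pr_err("send failed: %d\n", wc->status);
		/* release req->buf and complete the request here */
	}

	static struct ib_cq *my_create_cq(struct ib_device *dev)
	{
		/* 128 CQEs on completion vector 0, polled from a workqueue */
		return ib_alloc_cq(dev, NULL, 128, 0, IB_POLL_WORKQUEUE);
	}

	static int my_post_send(struct ib_qp *qp, struct my_request *req,
				struct ib_sge *sge)
	{
		struct ib_send_wr wr = { }, *bad_wr;

		req->cqe.done = my_send_done;
		wr.wr_cqe = &req->cqe;	/* instead of wr.wr_id */
		wr.sg_list = sge;
		wr.num_sge = 1;
		wr.opcode = IB_WR_SEND;
		wr.send_flags = IB_SEND_SIGNALED;
		return ib_post_send(qp, &wr, &bad_wr);
	}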
*/ destroy_workqueue(ib_wq); } diff --git a/drivers/infiniband/core/fmr_pool.c b/drivers/infiniband/core/fmr_pool.c index 9f5ad7cc33c8..6ac3683c144b 100644 --- a/drivers/infiniband/core/fmr_pool.c +++ b/drivers/infiniband/core/fmr_pool.c @@ -212,7 +212,6 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, { struct ib_device *device; struct ib_fmr_pool *pool; - struct ib_device_attr *attr; int i; int ret; int max_remaps; @@ -228,25 +227,10 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, return ERR_PTR(-ENOSYS); } - attr = kmalloc(sizeof *attr, GFP_KERNEL); - if (!attr) { - printk(KERN_WARNING PFX "couldn't allocate device attr struct\n"); - return ERR_PTR(-ENOMEM); - } - - ret = ib_query_device(device, attr); - if (ret) { - printk(KERN_WARNING PFX "couldn't query device: %d\n", ret); - kfree(attr); - return ERR_PTR(ret); - } - - if (!attr->max_map_per_fmr) + if (!device->attrs.max_map_per_fmr) max_remaps = IB_FMR_MAX_REMAPS; else - max_remaps = attr->max_map_per_fmr; - - kfree(attr); + max_remaps = device->attrs.max_map_per_fmr; pool = kmalloc(sizeof *pool, GFP_KERNEL); if (!pool) { diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index 2281de122038..9fa5bf33f5a3 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -84,6 +84,9 @@ static int add_nonoui_reg_req(struct ib_mad_reg_req *mad_reg_req, u8 mgmt_class); static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req, struct ib_mad_agent_private *agent_priv); +static bool ib_mad_send_error(struct ib_mad_port_private *port_priv, + struct ib_wc *wc); +static void ib_mad_send_done(struct ib_cq *cq, struct ib_wc *wc); /* * Returns a ib_mad_port_private structure or NULL for a device/port @@ -681,7 +684,7 @@ static void snoop_recv(struct ib_mad_qp_info *qp_info, atomic_inc(&mad_snoop_priv->refcount); spin_unlock_irqrestore(&qp_info->snoop_lock, flags); - mad_snoop_priv->agent.recv_handler(&mad_snoop_priv->agent, + mad_snoop_priv->agent.recv_handler(&mad_snoop_priv->agent, NULL, mad_recv_wc); deref_snoop_agent(mad_snoop_priv); spin_lock_irqsave(&qp_info->snoop_lock, flags); @@ -689,12 +692,11 @@ static void snoop_recv(struct ib_mad_qp_info *qp_info, spin_unlock_irqrestore(&qp_info->snoop_lock, flags); } -static void build_smp_wc(struct ib_qp *qp, - u64 wr_id, u16 slid, u16 pkey_index, u8 port_num, - struct ib_wc *wc) +static void build_smp_wc(struct ib_qp *qp, struct ib_cqe *cqe, u16 slid, + u16 pkey_index, u8 port_num, struct ib_wc *wc) { memset(wc, 0, sizeof *wc); - wc->wr_id = wr_id; + wc->wr_cqe = cqe; wc->status = IB_WC_SUCCESS; wc->opcode = IB_WC_RECV; wc->pkey_index = pkey_index; @@ -832,7 +834,7 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv, } build_smp_wc(mad_agent_priv->agent.qp, - send_wr->wr.wr_id, drslid, + send_wr->wr.wr_cqe, drslid, send_wr->pkey_index, send_wr->port_num, &mad_wc); @@ -1039,7 +1041,9 @@ struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent, mad_send_wr->sg_list[1].lkey = mad_agent->qp->pd->local_dma_lkey; - mad_send_wr->send_wr.wr.wr_id = (unsigned long) mad_send_wr; + mad_send_wr->mad_list.cqe.done = ib_mad_send_done; + + mad_send_wr->send_wr.wr.wr_cqe = &mad_send_wr->mad_list.cqe; mad_send_wr->send_wr.wr.sg_list = mad_send_wr->sg_list; mad_send_wr->send_wr.wr.num_sge = 2; mad_send_wr->send_wr.wr.opcode = IB_WR_SEND; @@ -1151,8 +1155,9 @@ int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr) /* Set WR ID to find mad_send_wr upon completion */ qp_info = 
mad_send_wr->mad_agent_priv->qp_info; - mad_send_wr->send_wr.wr.wr_id = (unsigned long)&mad_send_wr->mad_list; mad_send_wr->mad_list.mad_queue = &qp_info->send_queue; + mad_send_wr->mad_list.cqe.done = ib_mad_send_done; + mad_send_wr->send_wr.wr.wr_cqe = &mad_send_wr->mad_list.cqe; mad_agent = mad_send_wr->send_buf.mad_agent; sge = mad_send_wr->sg_list; @@ -1982,9 +1987,9 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv, /* user rmpp is in effect * and this is an active RMPP MAD */ - mad_recv_wc->wc->wr_id = 0; - mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent, - mad_recv_wc); + mad_agent_priv->agent.recv_handler( + &mad_agent_priv->agent, NULL, + mad_recv_wc); atomic_dec(&mad_agent_priv->refcount); } else { /* not user rmpp, revert to normal behavior and @@ -1998,9 +2003,10 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv, spin_unlock_irqrestore(&mad_agent_priv->lock, flags); /* Defined behavior is to complete response before request */ - mad_recv_wc->wc->wr_id = (unsigned long) &mad_send_wr->send_buf; - mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent, - mad_recv_wc); + mad_agent_priv->agent.recv_handler( + &mad_agent_priv->agent, + &mad_send_wr->send_buf, + mad_recv_wc); atomic_dec(&mad_agent_priv->refcount); mad_send_wc.status = IB_WC_SUCCESS; @@ -2009,7 +2015,7 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv, ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc); } } else { - mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent, + mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent, NULL, mad_recv_wc); deref_mad_agent(mad_agent_priv); } @@ -2172,13 +2178,14 @@ handle_smi(struct ib_mad_port_private *port_priv, return handle_ib_smi(port_priv, qp_info, wc, port_num, recv, response); } -static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv, - struct ib_wc *wc) +static void ib_mad_recv_done(struct ib_cq *cq, struct ib_wc *wc) { + struct ib_mad_port_private *port_priv = cq->cq_context; + struct ib_mad_list_head *mad_list = + container_of(wc->wr_cqe, struct ib_mad_list_head, cqe); struct ib_mad_qp_info *qp_info; struct ib_mad_private_header *mad_priv_hdr; struct ib_mad_private *recv, *response = NULL; - struct ib_mad_list_head *mad_list; struct ib_mad_agent_private *mad_agent; int port_num; int ret = IB_MAD_RESULT_SUCCESS; @@ -2186,7 +2193,17 @@ static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv, u16 resp_mad_pkey_index = 0; bool opa; - mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id; + if (list_empty_careful(&port_priv->port_list)) + return; + + if (wc->status != IB_WC_SUCCESS) { + /* + * Receive errors indicate that the QP has entered the error + * state - error handling/shutdown code will cleanup + */ + return; + } + qp_info = mad_list->mad_queue->qp_info; dequeue_mad(mad_list); @@ -2227,7 +2244,7 @@ static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv, response = alloc_mad_private(mad_size, GFP_KERNEL); if (!response) { dev_err(&port_priv->device->dev, - "ib_mad_recv_done_handler no memory for response buffer\n"); + "%s: no memory for response buffer\n", __func__); goto out; } @@ -2413,11 +2430,12 @@ done: spin_unlock_irqrestore(&mad_agent_priv->lock, flags); } -static void ib_mad_send_done_handler(struct ib_mad_port_private *port_priv, - struct ib_wc *wc) +static void ib_mad_send_done(struct ib_cq *cq, struct ib_wc *wc) { + struct ib_mad_port_private *port_priv = cq->cq_context; + struct 
ib_mad_list_head *mad_list = + container_of(wc->wr_cqe, struct ib_mad_list_head, cqe); struct ib_mad_send_wr_private *mad_send_wr, *queued_send_wr; - struct ib_mad_list_head *mad_list; struct ib_mad_qp_info *qp_info; struct ib_mad_queue *send_queue; struct ib_send_wr *bad_send_wr; @@ -2425,7 +2443,14 @@ static void ib_mad_send_done_handler(struct ib_mad_port_private *port_priv, unsigned long flags; int ret; - mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id; + if (list_empty_careful(&port_priv->port_list)) + return; + + if (wc->status != IB_WC_SUCCESS) { + if (!ib_mad_send_error(port_priv, wc)) + return; + } + mad_send_wr = container_of(mad_list, struct ib_mad_send_wr_private, mad_list); send_queue = mad_list->mad_queue; @@ -2490,24 +2515,15 @@ static void mark_sends_for_retry(struct ib_mad_qp_info *qp_info) spin_unlock_irqrestore(&qp_info->send_queue.lock, flags); } -static void mad_error_handler(struct ib_mad_port_private *port_priv, - struct ib_wc *wc) +static bool ib_mad_send_error(struct ib_mad_port_private *port_priv, + struct ib_wc *wc) { - struct ib_mad_list_head *mad_list; - struct ib_mad_qp_info *qp_info; + struct ib_mad_list_head *mad_list = + container_of(wc->wr_cqe, struct ib_mad_list_head, cqe); + struct ib_mad_qp_info *qp_info = mad_list->mad_queue->qp_info; struct ib_mad_send_wr_private *mad_send_wr; int ret; - /* Determine if failure was a send or receive */ - mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id; - qp_info = mad_list->mad_queue->qp_info; - if (mad_list->mad_queue == &qp_info->recv_queue) - /* - * Receive errors indicate that the QP has entered the error - * state - error handling/shutdown code will cleanup - */ - return; - /* * Send errors will transition the QP to SQE - move * QP to RTS and repost flushed work requests @@ -2522,10 +2538,9 @@ static void mad_error_handler(struct ib_mad_port_private *port_priv, mad_send_wr->retry = 0; ret = ib_post_send(qp_info->qp, &mad_send_wr->send_wr.wr, &bad_send_wr); - if (ret) - ib_mad_send_done_handler(port_priv, wc); - } else - ib_mad_send_done_handler(port_priv, wc); + if (!ret) + return false; + } } else { struct ib_qp_attr *attr; @@ -2539,42 +2554,14 @@ static void mad_error_handler(struct ib_mad_port_private *port_priv, kfree(attr); if (ret) dev_err(&port_priv->device->dev, - "mad_error_handler - ib_modify_qp to RTS : %d\n", - ret); + "%s - ib_modify_qp to RTS: %d\n", + __func__, ret); else mark_sends_for_retry(qp_info); } - ib_mad_send_done_handler(port_priv, wc); } -} -/* - * IB MAD completion callback - */ -static void ib_mad_completion_handler(struct work_struct *work) -{ - struct ib_mad_port_private *port_priv; - struct ib_wc wc; - - port_priv = container_of(work, struct ib_mad_port_private, work); - ib_req_notify_cq(port_priv->cq, IB_CQ_NEXT_COMP); - - while (ib_poll_cq(port_priv->cq, 1, &wc) == 1) { - if (wc.status == IB_WC_SUCCESS) { - switch (wc.opcode) { - case IB_WC_SEND: - ib_mad_send_done_handler(port_priv, &wc); - break; - case IB_WC_RECV: - ib_mad_recv_done_handler(port_priv, &wc); - break; - default: - BUG_ON(1); - break; - } - } else - mad_error_handler(port_priv, &wc); - } + return true; } static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv) @@ -2716,7 +2703,7 @@ static void local_completions(struct work_struct *work) * before request */ build_smp_wc(recv_mad_agent->agent.qp, - (unsigned long) local->mad_send_wr, + local->mad_send_wr->send_wr.wr.wr_cqe, be16_to_cpu(IB_LID_PERMISSIVE), local->mad_send_wr->send_wr.pkey_index, 
recv_mad_agent->agent.port_num, &wc); @@ -2744,6 +2731,7 @@ static void local_completions(struct work_struct *work) IB_MAD_SNOOP_RECVS); recv_mad_agent->agent.recv_handler( &recv_mad_agent->agent, + &local->mad_send_wr->send_buf, &local->mad_priv->header.recv_wc); spin_lock_irqsave(&recv_mad_agent->lock, flags); atomic_dec(&recv_mad_agent->refcount); @@ -2855,17 +2843,6 @@ static void timeout_sends(struct work_struct *work) spin_unlock_irqrestore(&mad_agent_priv->lock, flags); } -static void ib_mad_thread_completion_handler(struct ib_cq *cq, void *arg) -{ - struct ib_mad_port_private *port_priv = cq->cq_context; - unsigned long flags; - - spin_lock_irqsave(&ib_mad_port_list_lock, flags); - if (!list_empty(&port_priv->port_list)) - queue_work(port_priv->wq, &port_priv->work); - spin_unlock_irqrestore(&ib_mad_port_list_lock, flags); -} - /* * Allocate receive MADs and post receive WRs for them */ @@ -2913,8 +2890,9 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info, break; } mad_priv->header.mapping = sg_list.addr; - recv_wr.wr_id = (unsigned long)&mad_priv->header.mad_list; mad_priv->header.mad_list.mad_queue = recv_queue; + mad_priv->header.mad_list.cqe.done = ib_mad_recv_done; + recv_wr.wr_cqe = &mad_priv->header.mad_list.cqe; /* Post receive WR */ spin_lock_irqsave(&recv_queue->lock, flags); @@ -3151,7 +3129,6 @@ static int ib_mad_port_open(struct ib_device *device, unsigned long flags; char name[sizeof "ib_mad123"]; int has_smi; - struct ib_cq_init_attr cq_attr = {}; if (WARN_ON(rdma_max_mad_size(device, port_num) < IB_MGMT_MAD_SIZE)) return -EFAULT; @@ -3179,10 +3156,8 @@ static int ib_mad_port_open(struct ib_device *device, if (has_smi) cq_size *= 2; - cq_attr.cqe = cq_size; - port_priv->cq = ib_create_cq(port_priv->device, - ib_mad_thread_completion_handler, - NULL, port_priv, &cq_attr); + port_priv->cq = ib_alloc_cq(port_priv->device, port_priv, cq_size, 0, + IB_POLL_WORKQUEUE); if (IS_ERR(port_priv->cq)) { dev_err(&device->dev, "Couldn't create ib_mad CQ\n"); ret = PTR_ERR(port_priv->cq); @@ -3211,7 +3186,6 @@ static int ib_mad_port_open(struct ib_device *device, ret = -ENOMEM; goto error8; } - INIT_WORK(&port_priv->work, ib_mad_completion_handler); spin_lock_irqsave(&ib_mad_port_list_lock, flags); list_add_tail(&port_priv->port_list, &ib_mad_port_list); @@ -3238,7 +3212,7 @@ error7: error6: ib_dealloc_pd(port_priv->pd); error4: - ib_destroy_cq(port_priv->cq); + ib_free_cq(port_priv->cq); cleanup_recv_queue(&port_priv->qp_info[1]); cleanup_recv_queue(&port_priv->qp_info[0]); error3: @@ -3271,7 +3245,7 @@ static int ib_mad_port_close(struct ib_device *device, int port_num) destroy_mad_qp(&port_priv->qp_info[1]); destroy_mad_qp(&port_priv->qp_info[0]); ib_dealloc_pd(port_priv->pd); - ib_destroy_cq(port_priv->cq); + ib_free_cq(port_priv->cq); cleanup_recv_queue(&port_priv->qp_info[1]); cleanup_recv_queue(&port_priv->qp_info[0]); /* XXX: Handle deallocation of MAD registration tables */ diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h index 990698a6ab4b..28669f6419e1 100644 --- a/drivers/infiniband/core/mad_priv.h +++ b/drivers/infiniband/core/mad_priv.h @@ -64,6 +64,7 @@ struct ib_mad_list_head { struct list_head list; + struct ib_cqe cqe; struct ib_mad_queue *mad_queue; }; @@ -204,7 +205,6 @@ struct ib_mad_port_private { struct ib_mad_mgmt_version_table version[MAX_MGMT_VERSION]; struct list_head agent_list; struct workqueue_struct *wq; - struct work_struct work; struct ib_mad_qp_info qp_info[IB_MAD_QPS_CORE]; }; diff --git 
a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c index bb6685fb08c6..250937cb9a1a 100644 --- a/drivers/infiniband/core/multicast.c +++ b/drivers/infiniband/core/multicast.c @@ -723,14 +723,27 @@ EXPORT_SYMBOL(ib_sa_get_mcmember_rec); int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num, struct ib_sa_mcmember_rec *rec, + struct net_device *ndev, + enum ib_gid_type gid_type, struct ib_ah_attr *ah_attr) { int ret; u16 gid_index; u8 p; - ret = ib_find_cached_gid(device, &rec->port_gid, - NULL, &p, &gid_index); + if (rdma_protocol_roce(device, port_num)) { + ret = ib_find_cached_gid_by_port(device, &rec->port_gid, + gid_type, port_num, + ndev, + &gid_index); + } else if (rdma_protocol_ib(device, port_num)) { + ret = ib_find_cached_gid(device, &rec->port_gid, + IB_GID_TYPE_IB, NULL, &p, + &gid_index); + } else { + ret = -EINVAL; + } + if (ret) return ret; diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c index 178f98482e13..06556c34606d 100644 --- a/drivers/infiniband/core/roce_gid_mgmt.c +++ b/drivers/infiniband/core/roce_gid_mgmt.c @@ -67,17 +67,53 @@ struct netdev_event_work { struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ]; }; +static const struct { + bool (*is_supported)(const struct ib_device *device, u8 port_num); + enum ib_gid_type gid_type; +} PORT_CAP_TO_GID_TYPE[] = { + {rdma_protocol_roce_eth_encap, IB_GID_TYPE_ROCE}, + {rdma_protocol_roce_udp_encap, IB_GID_TYPE_ROCE_UDP_ENCAP}, +}; + +#define CAP_TO_GID_TABLE_SIZE ARRAY_SIZE(PORT_CAP_TO_GID_TYPE) + +unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u8 port) +{ + int i; + unsigned int ret_flags = 0; + + if (!rdma_protocol_roce(ib_dev, port)) + return 1UL << IB_GID_TYPE_IB; + + for (i = 0; i < CAP_TO_GID_TABLE_SIZE; i++) + if (PORT_CAP_TO_GID_TYPE[i].is_supported(ib_dev, port)) + ret_flags |= 1UL << PORT_CAP_TO_GID_TYPE[i].gid_type; + + return ret_flags; +} +EXPORT_SYMBOL(roce_gid_type_mask_support); + static void update_gid(enum gid_op_type gid_op, struct ib_device *ib_dev, u8 port, union ib_gid *gid, struct ib_gid_attr *gid_attr) { - switch (gid_op) { - case GID_ADD: - ib_cache_gid_add(ib_dev, port, gid, gid_attr); - break; - case GID_DEL: - ib_cache_gid_del(ib_dev, port, gid, gid_attr); - break; + int i; + unsigned long gid_type_mask = roce_gid_type_mask_support(ib_dev, port); + + for (i = 0; i < IB_GID_TYPE_SIZE; i++) { + if ((1UL << i) & gid_type_mask) { + gid_attr->gid_type = i; + switch (gid_op) { + case GID_ADD: + ib_cache_gid_add(ib_dev, port, + gid, gid_attr); + break; + case GID_DEL: + ib_cache_gid_del(ib_dev, port, + gid, gid_attr); + break; + } + } } } @@ -103,18 +139,6 @@ static enum bonding_slave_state is_eth_active_slave_of_bonding_rcu(struct net_de return BONDING_SLAVE_STATE_NA; } -static bool is_upper_dev_rcu(struct net_device *dev, struct net_device *upper) -{ - struct net_device *_upper = NULL; - struct list_head *iter; - - netdev_for_each_all_upper_dev_rcu(dev, _upper, iter) - if (_upper == upper) - break; - - return _upper == upper; -} - #define REQUIRED_BOND_STATES (BONDING_SLAVE_STATE_ACTIVE | \ BONDING_SLAVE_STATE_NA) static int is_eth_port_of_netdev(struct ib_device *ib_dev, u8 port, @@ -132,7 +156,7 @@ static int is_eth_port_of_netdev(struct ib_device *ib_dev, u8 port, if (!real_dev) real_dev = event_ndev; - res = ((is_upper_dev_rcu(rdma_ndev, event_ndev) && + res = ((rdma_is_upper_dev_rcu(rdma_ndev, event_ndev) && (is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) & REQUIRED_BOND_STATES)) 
|| real_dev == rdma_ndev); @@ -178,7 +202,7 @@ static int upper_device_filter(struct ib_device *ib_dev, u8 port, return 1; rcu_read_lock(); - res = is_upper_dev_rcu(rdma_ndev, event_ndev); + res = rdma_is_upper_dev_rcu(rdma_ndev, event_ndev); rcu_read_unlock(); return res; @@ -203,10 +227,12 @@ static void enum_netdev_default_gids(struct ib_device *ib_dev, u8 port, struct net_device *event_ndev, struct net_device *rdma_ndev) { + unsigned long gid_type_mask; + rcu_read_lock(); if (!rdma_ndev || ((rdma_ndev != event_ndev && - !is_upper_dev_rcu(rdma_ndev, event_ndev)) || + !rdma_is_upper_dev_rcu(rdma_ndev, event_ndev)) || is_eth_active_slave_of_bonding_rcu(rdma_ndev, netdev_master_upper_dev_get_rcu(rdma_ndev)) == BONDING_SLAVE_STATE_INACTIVE)) { @@ -215,7 +241,9 @@ static void enum_netdev_default_gids(struct ib_device *ib_dev, } rcu_read_unlock(); - ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev, + gid_type_mask = roce_gid_type_mask_support(ib_dev, port); + + ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev, gid_type_mask, IB_CACHE_GID_DEFAULT_MODE_SET); } @@ -234,12 +262,17 @@ static void bond_delete_netdev_default_gids(struct ib_device *ib_dev, rcu_read_lock(); - if (is_upper_dev_rcu(rdma_ndev, event_ndev) && + if (rdma_is_upper_dev_rcu(rdma_ndev, event_ndev) && is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) == BONDING_SLAVE_STATE_INACTIVE) { + unsigned long gid_type_mask; + rcu_read_unlock(); + gid_type_mask = roce_gid_type_mask_support(ib_dev, port); + ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev, + gid_type_mask, IB_CACHE_GID_DEFAULT_MODE_DELETE); } else { rcu_read_unlock(); diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index a95a32ba596e..f334090bb612 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -49,7 +49,9 @@ #include <net/netlink.h> #include <uapi/rdma/ib_user_sa.h> #include <rdma/ib_marshall.h> +#include <rdma/ib_addr.h> #include "sa.h" +#include "core_priv.h" MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("InfiniBand subnet administration query support"); @@ -715,7 +717,9 @@ static int ib_nl_handle_set_timeout(struct sk_buff *skb, struct nlattr *tb[LS_NLA_TYPE_MAX]; int ret; - if (!netlink_capable(skb, CAP_NET_ADMIN)) + if (!(nlh->nlmsg_flags & NLM_F_REQUEST) || + !(NETLINK_CB(skb).sk) || + !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh), @@ -789,7 +793,9 @@ static int ib_nl_handle_resolve_resp(struct sk_buff *skb, int found = 0; int ret; - if (!netlink_capable(skb, CAP_NET_ADMIN)) + if ((nlh->nlmsg_flags & NLM_F_REQUEST) || + !(NETLINK_CB(skb).sk) || + !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; spin_lock_irqsave(&ib_nl_request_lock, flags); @@ -996,7 +1002,8 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num, { int ret; u16 gid_index; - int force_grh; + int use_roce; + struct net_device *ndev = NULL; memset(ah_attr, 0, sizeof *ah_attr); ah_attr->dlid = be16_to_cpu(rec->dlid); @@ -1006,16 +1013,71 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num, ah_attr->port_num = port_num; ah_attr->static_rate = rec->rate; - force_grh = rdma_cap_eth_ah(device, port_num); + use_roce = rdma_cap_eth_ah(device, port_num); + + if (use_roce) { + struct net_device *idev; + struct net_device *resolved_dev; + struct rdma_dev_addr dev_addr = {.bound_dev_if = rec->ifindex, + .net = rec->net ? 
rec->net : + &init_net}; + union { + struct sockaddr _sockaddr; + struct sockaddr_in _sockaddr_in; + struct sockaddr_in6 _sockaddr_in6; + } sgid_addr, dgid_addr; + + if (!device->get_netdev) + return -EOPNOTSUPP; + + rdma_gid2ip(&sgid_addr._sockaddr, &rec->sgid); + rdma_gid2ip(&dgid_addr._sockaddr, &rec->dgid); + + /* validate the route */ + ret = rdma_resolve_ip_route(&sgid_addr._sockaddr, + &dgid_addr._sockaddr, &dev_addr); + if (ret) + return ret; - if (rec->hop_limit > 1 || force_grh) { - struct net_device *ndev = ib_get_ndev_from_path(rec); + if ((dev_addr.network == RDMA_NETWORK_IPV4 || + dev_addr.network == RDMA_NETWORK_IPV6) && + rec->gid_type != IB_GID_TYPE_ROCE_UDP_ENCAP) + return -EINVAL; + + idev = device->get_netdev(device, port_num); + if (!idev) + return -ENODEV; + + resolved_dev = dev_get_by_index(dev_addr.net, + dev_addr.bound_dev_if); + if (!resolved_dev) { + dev_put(idev); + return -ENODEV; + } + if (resolved_dev->flags & IFF_LOOPBACK) { + dev_put(resolved_dev); + resolved_dev = idev; + dev_hold(resolved_dev); + } + ndev = ib_get_ndev_from_path(rec); + rcu_read_lock(); + if ((ndev && ndev != resolved_dev) || + (resolved_dev != idev && + !rdma_is_upper_dev_rcu(idev, resolved_dev))) + ret = -EHOSTUNREACH; + rcu_read_unlock(); + dev_put(idev); + dev_put(resolved_dev); + if (ret) { + if (ndev) + dev_put(ndev); + return ret; + } + } + if (rec->hop_limit > 1 || use_roce) { ah_attr->ah_flags = IB_AH_GRH; ah_attr->grh.dgid = rec->dgid; - ret = ib_find_cached_gid(device, &rec->sgid, ndev, &port_num, - &gid_index); + ret = ib_find_cached_gid_by_port(device, &rec->sgid, + rec->gid_type, port_num, ndev, + &gid_index); if (ret) { if (ndev) dev_put(ndev); @@ -1029,9 +1091,10 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num, if (ndev) dev_put(ndev); } - if (force_grh) { + + if (use_roce) memcpy(ah_attr->dmac, rec->dmac, ETH_ALEN); - } + return 0; } EXPORT_SYMBOL(ib_init_ah_from_path); @@ -1157,6 +1220,7 @@ static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query, mad->data, &rec); rec.net = NULL; rec.ifindex = 0; + rec.gid_type = IB_GID_TYPE_IB; memset(rec.dmac, 0, ETH_ALEN); query->callback(status, &rec, query->context); } else @@ -1609,14 +1673,15 @@ static void send_handler(struct ib_mad_agent *agent, } static void recv_handler(struct ib_mad_agent *mad_agent, + struct ib_mad_send_buf *send_buf, struct ib_mad_recv_wc *mad_recv_wc) { struct ib_sa_query *query; - struct ib_mad_send_buf *mad_buf; - mad_buf = (void *) (unsigned long) mad_recv_wc->wc->wr_id; - query = mad_buf->context[0]; + if (!send_buf) + return; + query = send_buf->context[0]; if (query->callback) { if (mad_recv_wc->wc->status == IB_WC_SUCCESS) query->callback(query, diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index b1f37d4095fa..3de93517efe4 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -37,15 +37,27 @@ #include <linux/slab.h> #include <linux/stat.h> #include <linux/string.h> +#include <linux/netdevice.h> #include <rdma/ib_mad.h> +#include <rdma/ib_pma.h> +struct ib_port; + +struct gid_attr_group { + struct ib_port *port; + struct kobject kobj; + struct attribute_group ndev; + struct attribute_group type; +}; struct ib_port { struct kobject kobj; struct ib_device *ibdev; + struct gid_attr_group *gid_attr_group; struct attribute_group gid_group; struct attribute_group pkey_group; u8 port_num; + struct attribute_group *pma_table; }; struct port_attribute { @@ -65,6 +77,7 @@ struct port_table_attribute { struct port_attribute attr; char name[8]; int index; + __be16 attr_id; }; static
ssize_t port_attr_show(struct kobject *kobj, @@ -84,6 +97,24 @@ static const struct sysfs_ops port_sysfs_ops = { .show = port_attr_show }; +static ssize_t gid_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct port_attribute *port_attr = + container_of(attr, struct port_attribute, attr); + struct ib_port *p = container_of(kobj, struct gid_attr_group, + kobj)->port; + + if (!port_attr->show) + return -EIO; + + return port_attr->show(p, port_attr, buf); +} + +static const struct sysfs_ops gid_attr_sysfs_ops = { + .show = gid_attr_show +}; + static ssize_t state_show(struct ib_port *p, struct port_attribute *unused, char *buf) { @@ -281,6 +312,46 @@ static struct attribute *port_default_attrs[] = { NULL }; +static ssize_t print_ndev(struct ib_gid_attr *gid_attr, char *buf) +{ + if (!gid_attr->ndev) + return -EINVAL; + + return sprintf(buf, "%s\n", gid_attr->ndev->name); +} + +static ssize_t print_gid_type(struct ib_gid_attr *gid_attr, char *buf) +{ + return sprintf(buf, "%s\n", ib_cache_gid_type_str(gid_attr->gid_type)); +} + +static ssize_t _show_port_gid_attr(struct ib_port *p, + struct port_attribute *attr, + char *buf, + ssize_t (*print)(struct ib_gid_attr *gid_attr, + char *buf)) +{ + struct port_table_attribute *tab_attr = + container_of(attr, struct port_table_attribute, attr); + union ib_gid gid; + struct ib_gid_attr gid_attr = {}; + ssize_t ret; + + ret = ib_query_gid(p->ibdev, p->port_num, tab_attr->index, &gid, + &gid_attr); + if (ret) + goto err; + + ret = print(&gid_attr, buf); + +err: + if (gid_attr.ndev) + dev_put(gid_attr.ndev); + return ret; +} + static ssize_t show_port_gid(struct ib_port *p, struct port_attribute *attr, char *buf) { @@ -296,6 +367,19 @@ static ssize_t show_port_gid(struct ib_port *p, struct port_attribute *attr, return sprintf(buf, "%pI6\n", gid.raw); } +static ssize_t show_port_gid_attr_ndev(struct ib_port *p, + struct port_attribute *attr, char *buf) +{ + return _show_port_gid_attr(p, attr, buf, print_ndev); +} + +static ssize_t show_port_gid_attr_gid_type(struct ib_port *p, + struct port_attribute *attr, + char *buf) +{ + return _show_port_gid_attr(p, attr, buf, print_gid_type); +} + static ssize_t show_port_pkey(struct ib_port *p, struct port_attribute *attr, char *buf) { @@ -314,24 +398,32 @@ static ssize_t show_port_pkey(struct ib_port *p, struct port_attribute *attr, #define PORT_PMA_ATTR(_name, _counter, _width, _offset) \ struct port_table_attribute port_pma_attr_##_name = { \ .attr = __ATTR(_name, S_IRUGO, show_pma_counter, NULL), \ - .index = (_offset) | ((_width) << 16) | ((_counter) << 24) \ + .index = (_offset) | ((_width) << 16) | ((_counter) << 24), \ + .attr_id = IB_PMA_PORT_COUNTERS, \ } -static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr, - char *buf) +#define PORT_PMA_ATTR_EXT(_name, _width, _offset) \ +struct port_table_attribute port_pma_attr_ext_##_name = { \ + .attr = __ATTR(_name, S_IRUGO, show_pma_counter, NULL), \ + .index = (_offset) | ((_width) << 16), \ + .attr_id = IB_PMA_PORT_COUNTERS_EXT, \ +} + +/* + * Get a Perfmgmt MAD block of data. + * Returns error code or the number of bytes retrieved.
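+ * + * As a worked example of the .index encoding used by the PORT_PMA_ATTR + * macros above: PORT_PMA_ATTR_EXT(port_xmit_data, 64, 64) declares a 64-bit + * counter at bit offset 64 of the extended attribute, so show_pma_counter() + * below calls get_perf_mad(dev, port, IB_PMA_PORT_COUNTERS_EXT, data, + * 40 + 64 / 8, 8) and prints the value with be64_to_cpup().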
+ */ +static int get_perf_mad(struct ib_device *dev, int port_num, __be16 attr, + void *data, int offset, size_t size) { - struct port_table_attribute *tab_attr = - container_of(attr, struct port_table_attribute, attr); - int offset = tab_attr->index & 0xffff; - int width = (tab_attr->index >> 16) & 0xff; - struct ib_mad *in_mad = NULL; - struct ib_mad *out_mad = NULL; + struct ib_mad *in_mad; + struct ib_mad *out_mad; size_t mad_size = sizeof(*out_mad); u16 out_mad_pkey_index = 0; ssize_t ret; - if (!p->ibdev->process_mad) - return sprintf(buf, "N/A (no PMA)\n"); + if (!dev->process_mad) + return -ENOSYS; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); @@ -344,12 +436,13 @@ static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr, in_mad->mad_hdr.mgmt_class = IB_MGMT_CLASS_PERF_MGMT; in_mad->mad_hdr.class_version = 1; in_mad->mad_hdr.method = IB_MGMT_METHOD_GET; - in_mad->mad_hdr.attr_id = cpu_to_be16(0x12); /* PortCounters */ + in_mad->mad_hdr.attr_id = attr; - in_mad->data[41] = p->port_num; /* PortSelect field */ + if (attr != IB_PMA_CLASS_PORT_INFO) + in_mad->data[41] = port_num; /* PortSelect field */ - if ((p->ibdev->process_mad(p->ibdev, IB_MAD_IGNORE_MKEY, - p->port_num, NULL, NULL, + if ((dev->process_mad(dev, IB_MAD_IGNORE_MKEY, + port_num, NULL, NULL, (const struct ib_mad_hdr *)in_mad, mad_size, (struct ib_mad_hdr *)out_mad, &mad_size, &out_mad_pkey_index) & @@ -358,31 +451,54 @@ static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr, ret = -EINVAL; goto out; } + memcpy(data, out_mad->data + offset, size); + ret = size; +out: + kfree(in_mad); + kfree(out_mad); + return ret; +} + +static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr, + char *buf) +{ + struct port_table_attribute *tab_attr = + container_of(attr, struct port_table_attribute, attr); + int offset = tab_attr->index & 0xffff; + int width = (tab_attr->index >> 16) & 0xff; + ssize_t ret; + u8 data[8]; + + ret = get_perf_mad(p->ibdev, p->port_num, tab_attr->attr_id, &data, + 40 + offset / 8, sizeof(data)); + if (ret < 0) + return sprintf(buf, "N/A (no PMA)\n"); switch (width) { case 4: - ret = sprintf(buf, "%u\n", (out_mad->data[40 + offset / 8] >> + ret = sprintf(buf, "%u\n", (*data >> (4 - (offset % 8))) & 0xf); break; case 8: - ret = sprintf(buf, "%u\n", out_mad->data[40 + offset / 8]); + ret = sprintf(buf, "%u\n", *data); break; case 16: ret = sprintf(buf, "%u\n", - be16_to_cpup((__be16 *)(out_mad->data + 40 + offset / 8))); + be16_to_cpup((__be16 *)data)); break; case 32: ret = sprintf(buf, "%u\n", - be32_to_cpup((__be32 *)(out_mad->data + 40 + offset / 8))); + be32_to_cpup((__be32 *)data)); + break; + case 64: + ret = sprintf(buf, "%llu\n", + be64_to_cpup((__be64 *)data)); break; + default: ret = 0; } -out: - kfree(in_mad); - kfree(out_mad); - return ret; } @@ -403,6 +519,18 @@ static PORT_PMA_ATTR(port_rcv_data , 13, 32, 224); static PORT_PMA_ATTR(port_xmit_packets , 14, 32, 256); static PORT_PMA_ATTR(port_rcv_packets , 15, 32, 288); +/* + * Counters added by extended set + */ +static PORT_PMA_ATTR_EXT(port_xmit_data , 64, 64); +static PORT_PMA_ATTR_EXT(port_rcv_data , 64, 128); +static PORT_PMA_ATTR_EXT(port_xmit_packets , 64, 192); +static PORT_PMA_ATTR_EXT(port_rcv_packets , 64, 256); +static PORT_PMA_ATTR_EXT(unicast_xmit_packets , 64, 320); +static PORT_PMA_ATTR_EXT(unicast_rcv_packets , 64, 384); +static PORT_PMA_ATTR_EXT(multicast_xmit_packets , 64, 448); +static 
PORT_PMA_ATTR_EXT(multicast_rcv_packets , 64, 512); + static struct attribute *pma_attrs[] = { &port_pma_attr_symbol_error.attr.attr, &port_pma_attr_link_error_recovery.attr.attr, @@ -423,11 +551,65 @@ static struct attribute *pma_attrs[] = { NULL }; +static struct attribute *pma_attrs_ext[] = { + &port_pma_attr_symbol_error.attr.attr, + &port_pma_attr_link_error_recovery.attr.attr, + &port_pma_attr_link_downed.attr.attr, + &port_pma_attr_port_rcv_errors.attr.attr, + &port_pma_attr_port_rcv_remote_physical_errors.attr.attr, + &port_pma_attr_port_rcv_switch_relay_errors.attr.attr, + &port_pma_attr_port_xmit_discards.attr.attr, + &port_pma_attr_port_xmit_constraint_errors.attr.attr, + &port_pma_attr_port_rcv_constraint_errors.attr.attr, + &port_pma_attr_local_link_integrity_errors.attr.attr, + &port_pma_attr_excessive_buffer_overrun_errors.attr.attr, + &port_pma_attr_VL15_dropped.attr.attr, + &port_pma_attr_ext_port_xmit_data.attr.attr, + &port_pma_attr_ext_port_rcv_data.attr.attr, + &port_pma_attr_ext_port_xmit_packets.attr.attr, + &port_pma_attr_ext_port_rcv_packets.attr.attr, + &port_pma_attr_ext_unicast_rcv_packets.attr.attr, + &port_pma_attr_ext_unicast_xmit_packets.attr.attr, + &port_pma_attr_ext_multicast_rcv_packets.attr.attr, + &port_pma_attr_ext_multicast_xmit_packets.attr.attr, + NULL +}; + +static struct attribute *pma_attrs_noietf[] = { + &port_pma_attr_symbol_error.attr.attr, + &port_pma_attr_link_error_recovery.attr.attr, + &port_pma_attr_link_downed.attr.attr, + &port_pma_attr_port_rcv_errors.attr.attr, + &port_pma_attr_port_rcv_remote_physical_errors.attr.attr, + &port_pma_attr_port_rcv_switch_relay_errors.attr.attr, + &port_pma_attr_port_xmit_discards.attr.attr, + &port_pma_attr_port_xmit_constraint_errors.attr.attr, + &port_pma_attr_port_rcv_constraint_errors.attr.attr, + &port_pma_attr_local_link_integrity_errors.attr.attr, + &port_pma_attr_excessive_buffer_overrun_errors.attr.attr, + &port_pma_attr_VL15_dropped.attr.attr, + &port_pma_attr_ext_port_xmit_data.attr.attr, + &port_pma_attr_ext_port_rcv_data.attr.attr, + &port_pma_attr_ext_port_xmit_packets.attr.attr, + &port_pma_attr_ext_port_rcv_packets.attr.attr, + NULL +}; + static struct attribute_group pma_group = { .name = "counters", .attrs = pma_attrs }; +static struct attribute_group pma_group_ext = { + .name = "counters", + .attrs = pma_attrs_ext +}; + +static struct attribute_group pma_group_noietf = { + .name = "counters", + .attrs = pma_attrs_noietf +}; + static void ib_port_release(struct kobject *kobj) { struct ib_port *p = container_of(kobj, struct ib_port, kobj); @@ -451,12 +633,41 @@ static void ib_port_release(struct kobject *kobj) kfree(p); } +static void ib_port_gid_attr_release(struct kobject *kobj) +{ + struct gid_attr_group *g = container_of(kobj, struct gid_attr_group, + kobj); + struct attribute *a; + int i; + + if (g->ndev.attrs) { + for (i = 0; (a = g->ndev.attrs[i]); ++i) + kfree(a); + + kfree(g->ndev.attrs); + } + + if (g->type.attrs) { + for (i = 0; (a = g->type.attrs[i]); ++i) + kfree(a); + + kfree(g->type.attrs); + } + + kfree(g); +} + static struct kobj_type port_type = { .release = ib_port_release, .sysfs_ops = &port_sysfs_ops, .default_attrs = port_default_attrs }; +static struct kobj_type gid_attr_type = { + .sysfs_ops = &gid_attr_sysfs_ops, + .release = ib_port_gid_attr_release +}; + static struct attribute ** alloc_group_attrs(ssize_t (*show)(struct ib_port *, struct port_attribute *, char *buf), @@ -500,6 +711,31 @@ err: return NULL; } +/* + * Figure out which counter table to use 
depending on + * the device capabilities. + */ +static struct attribute_group *get_counter_table(struct ib_device *dev, + int port_num) +{ + struct ib_class_port_info cpi; + + if (get_perf_mad(dev, port_num, IB_PMA_CLASS_PORT_INFO, + &cpi, 40, sizeof(cpi)) >= 0) { + + if (cpi.capability_mask & IB_PMA_CLASS_CAP_EXT_WIDTH) + /* We have extended counters */ + return &pma_group_ext; + + if (cpi.capability_mask & IB_PMA_CLASS_CAP_EXT_WIDTH_NOIETF) + /* But not the IETF ones */ + return &pma_group_noietf; + } + + /* Fall back to normal counters */ + return &pma_group; +} + static int add_port(struct ib_device *device, int port_num, int (*port_callback)(struct ib_device *, u8, struct kobject *)) @@ -528,9 +764,24 @@ static int add_port(struct ib_device *device, int port_num, return ret; } - ret = sysfs_create_group(&p->kobj, &pma_group); - if (ret) + p->gid_attr_group = kzalloc(sizeof(*p->gid_attr_group), GFP_KERNEL); + if (!p->gid_attr_group) { + ret = -ENOMEM; goto err_put; + } + + p->gid_attr_group->port = p; + ret = kobject_init_and_add(&p->gid_attr_group->kobj, &gid_attr_type, + &p->kobj, "gid_attrs"); + if (ret) { + kfree(p->gid_attr_group); + goto err_put; + } + + p->pma_table = get_counter_table(device, port_num); + ret = sysfs_create_group(&p->kobj, p->pma_table); + if (ret) + goto err_put_gid_attrs; p->gid_group.name = "gids"; p->gid_group.attrs = alloc_group_attrs(show_port_gid, attr.gid_tbl_len); @@ -543,12 +794,38 @@ static int add_port(struct ib_device *device, int port_num, if (ret) goto err_free_gid; + p->gid_attr_group->ndev.name = "ndevs"; + p->gid_attr_group->ndev.attrs = alloc_group_attrs(show_port_gid_attr_ndev, + attr.gid_tbl_len); + if (!p->gid_attr_group->ndev.attrs) { + ret = -ENOMEM; + goto err_remove_gid; + } + + ret = sysfs_create_group(&p->gid_attr_group->kobj, + &p->gid_attr_group->ndev); + if (ret) + goto err_free_gid_ndev; + + p->gid_attr_group->type.name = "types"; + p->gid_attr_group->type.attrs = alloc_group_attrs(show_port_gid_attr_gid_type, + attr.gid_tbl_len); + if (!p->gid_attr_group->type.attrs) { + ret = -ENOMEM; + goto err_remove_gid_ndev; + } + + ret = sysfs_create_group(&p->gid_attr_group->kobj, + &p->gid_attr_group->type); + if (ret) + goto err_free_gid_type; + p->pkey_group.name = "pkeys"; p->pkey_group.attrs = alloc_group_attrs(show_port_pkey, attr.pkey_tbl_len); if (!p->pkey_group.attrs) { ret = -ENOMEM; - goto err_remove_gid; + goto err_remove_gid_type; } ret = sysfs_create_group(&p->kobj, &p->pkey_group); @@ -576,6 +853,28 @@ err_free_pkey: kfree(p->pkey_group.attrs); p->pkey_group.attrs = NULL; +err_remove_gid_type: + sysfs_remove_group(&p->gid_attr_group->kobj, + &p->gid_attr_group->type); + +err_free_gid_type: + for (i = 0; i < attr.gid_tbl_len; ++i) + kfree(p->gid_attr_group->type.attrs[i]); + + kfree(p->gid_attr_group->type.attrs); + p->gid_attr_group->type.attrs = NULL; + +err_remove_gid_ndev: + sysfs_remove_group(&p->gid_attr_group->kobj, + &p->gid_attr_group->ndev); + +err_free_gid_ndev: + for (i = 0; i < attr.gid_tbl_len; ++i) + kfree(p->gid_attr_group->ndev.attrs[i]); + + kfree(p->gid_attr_group->ndev.attrs); + p->gid_attr_group->ndev.attrs = NULL; + err_remove_gid: sysfs_remove_group(&p->kobj, &p->gid_group); @@ -587,7 +886,10 @@ err_free_gid: p->gid_group.attrs = NULL; err_remove_pma: - sysfs_remove_group(&p->kobj, &pma_group); + sysfs_remove_group(&p->kobj, p->pma_table); + +err_put_gid_attrs: + kobject_put(&p->gid_attr_group->kobj); err_put: kobject_put(&p->kobj); @@ -614,18 +916,12 @@ static ssize_t show_sys_image_guid(struct
device *device, struct device_attribute *dev_attr, char *buf) { struct ib_device *dev = container_of(device, struct ib_device, dev); - struct ib_device_attr attr; - ssize_t ret; - - ret = ib_query_device(dev, &attr); - if (ret) - return ret; return sprintf(buf, "%04x:%04x:%04x:%04x\n", - be16_to_cpu(((__be16 *) &attr.sys_image_guid)[0]), - be16_to_cpu(((__be16 *) &attr.sys_image_guid)[1]), - be16_to_cpu(((__be16 *) &attr.sys_image_guid)[2]), - be16_to_cpu(((__be16 *) &attr.sys_image_guid)[3])); + be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[0]), + be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[1]), + be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[2]), + be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[3])); } static ssize_t show_node_guid(struct device *device, @@ -800,9 +1096,14 @@ static void free_port_list_attributes(struct ib_device *device) list_for_each_entry_safe(p, t, &device->port_list, entry) { struct ib_port *port = container_of(p, struct ib_port, kobj); list_del(&p->entry); - sysfs_remove_group(p, &pma_group); + sysfs_remove_group(p, port->pma_table); sysfs_remove_group(p, &port->pkey_group); sysfs_remove_group(p, &port->gid_group); + sysfs_remove_group(&port->gid_attr_group->kobj, + &port->gid_attr_group->ndev); + sysfs_remove_group(&port->gid_attr_group->kobj, + &port->gid_attr_group->type); + kobject_put(&port->gid_attr_group->kobj); kobject_put(p); } diff --git a/drivers/infiniband/core/ud_header.c b/drivers/infiniband/core/ud_header.c index 72feee620ebf..19837d270278 100644 --- a/drivers/infiniband/core/ud_header.c +++ b/drivers/infiniband/core/ud_header.c @@ -35,6 +35,7 @@ #include <linux/string.h> #include <linux/export.h> #include <linux/if_ether.h> +#include <linux/ip.h> #include <rdma/ib_pack.h> @@ -116,6 +117,72 @@ static const struct ib_field vlan_table[] = { .size_bits = 16 } }; +static const struct ib_field ip4_table[] = { + { STRUCT_FIELD(ip4, ver), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 4 }, + { STRUCT_FIELD(ip4, hdr_len), + .offset_words = 0, + .offset_bits = 4, + .size_bits = 4 }, + { STRUCT_FIELD(ip4, tos), + .offset_words = 0, + .offset_bits = 8, + .size_bits = 8 }, + { STRUCT_FIELD(ip4, tot_len), + .offset_words = 0, + .offset_bits = 16, + .size_bits = 16 }, + { STRUCT_FIELD(ip4, id), + .offset_words = 1, + .offset_bits = 0, + .size_bits = 16 }, + { STRUCT_FIELD(ip4, frag_off), + .offset_words = 1, + .offset_bits = 16, + .size_bits = 16 }, + { STRUCT_FIELD(ip4, ttl), + .offset_words = 2, + .offset_bits = 0, + .size_bits = 8 }, + { STRUCT_FIELD(ip4, protocol), + .offset_words = 2, + .offset_bits = 8, + .size_bits = 8 }, + { STRUCT_FIELD(ip4, check), + .offset_words = 2, + .offset_bits = 16, + .size_bits = 16 }, + { STRUCT_FIELD(ip4, saddr), + .offset_words = 3, + .offset_bits = 0, + .size_bits = 32 }, + { STRUCT_FIELD(ip4, daddr), + .offset_words = 4, + .offset_bits = 0, + .size_bits = 32 } +}; + +static const struct ib_field udp_table[] = { + { STRUCT_FIELD(udp, sport), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 16 }, + { STRUCT_FIELD(udp, dport), + .offset_words = 0, + .offset_bits = 16, + .size_bits = 16 }, + { STRUCT_FIELD(udp, length), + .offset_words = 1, + .offset_bits = 0, + .size_bits = 16 }, + { STRUCT_FIELD(udp, csum), + .offset_words = 1, + .offset_bits = 16, + .size_bits = 16 } +}; + static const struct ib_field grh_table[] = { { STRUCT_FIELD(grh, ip_version), .offset_words = 0, @@ -213,26 +280,57 @@ static const struct ib_field deth_table[] = { .size_bits = 24 } }; +__sum16 ib_ud_ip4_csum(struct 
ib_ud_header *header) +{ + struct iphdr iph; + + iph.ihl = 5; + iph.version = 4; + iph.tos = header->ip4.tos; + iph.tot_len = header->ip4.tot_len; + iph.id = header->ip4.id; + iph.frag_off = header->ip4.frag_off; + iph.ttl = header->ip4.ttl; + iph.protocol = header->ip4.protocol; + iph.check = 0; + iph.saddr = header->ip4.saddr; + iph.daddr = header->ip4.daddr; + + return ip_fast_csum((u8 *)&iph, iph.ihl); +} +EXPORT_SYMBOL(ib_ud_ip4_csum); + /** * ib_ud_header_init - Initialize UD header structure * @payload_bytes:Length of packet payload * @lrh_present: specify if LRH is present * @eth_present: specify if Eth header is present * @vlan_present: packet is tagged vlan - * @grh_present:GRH flag (if non-zero, GRH will be included) + * @grh_present: GRH flag (if non-zero, GRH will be included) + * @ip_version: if non-zero, IP header, V4 or V6, will be included + * @udp_present: if non-zero, UDP header will be included * @immediate_present: specify if immediate data is present * @header:Structure to initialize */ -void ib_ud_header_init(int payload_bytes, - int lrh_present, - int eth_present, - int vlan_present, - int grh_present, - int immediate_present, - struct ib_ud_header *header) +int ib_ud_header_init(int payload_bytes, + int lrh_present, + int eth_present, + int vlan_present, + int grh_present, + int ip_version, + int udp_present, + int immediate_present, + struct ib_ud_header *header) { + grh_present = grh_present && !ip_version; memset(header, 0, sizeof *header); + /* + * UDP header without IP header doesn't make sense + */ + if (udp_present && ip_version != 4 && ip_version != 6) + return -EINVAL; + if (lrh_present) { u16 packet_length; @@ -252,7 +350,7 @@ void ib_ud_header_init(int payload_bytes, if (vlan_present) header->eth.type = cpu_to_be16(ETH_P_8021Q); - if (grh_present) { + if (ip_version == 6 || grh_present) { header->grh.ip_version = 6; header->grh.payload_length = cpu_to_be16((IB_BTH_BYTES + @@ -260,8 +358,30 @@ void ib_ud_header_init(int payload_bytes, payload_bytes + 4 + /* ICRC */ 3) & ~3); /* round up */ - header->grh.next_header = 0x1b; + header->grh.next_header = udp_present ? IPPROTO_UDP : 0x1b; + } + + if (ip_version == 4) { + int udp_bytes = udp_present ?
IB_UDP_BYTES : 0; + + header->ip4.ver = 4; /* version 4 */ + header->ip4.hdr_len = 5; /* 5 words */ + header->ip4.tot_len = + cpu_to_be16(IB_IP4_BYTES + + udp_bytes + + IB_BTH_BYTES + + IB_DETH_BYTES + + payload_bytes + + 4); /* ICRC */ + header->ip4.protocol = IPPROTO_UDP; } + if (udp_present && ip_version) + header->udp.length = + cpu_to_be16(IB_UDP_BYTES + + IB_BTH_BYTES + + IB_DETH_BYTES + + payload_bytes + + 4); /* ICRC */ if (immediate_present) header->bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE; @@ -273,8 +393,11 @@ void ib_ud_header_init(int payload_bytes, header->lrh_present = lrh_present; header->eth_present = eth_present; header->vlan_present = vlan_present; - header->grh_present = grh_present; + header->grh_present = grh_present || (ip_version == 6); + header->ipv4_present = ip_version == 4; + header->udp_present = udp_present; header->immediate_present = immediate_present; + return 0; } EXPORT_SYMBOL(ib_ud_header_init); @@ -311,6 +434,16 @@ int ib_ud_header_pack(struct ib_ud_header *header, &header->grh, buf + len); len += IB_GRH_BYTES; } + if (header->ipv4_present) { + ib_pack(ip4_table, ARRAY_SIZE(ip4_table), + &header->ip4, buf + len); + len += IB_IP4_BYTES; + } + if (header->udp_present) { + ib_pack(udp_table, ARRAY_SIZE(udp_table), + &header->udp, buf + len); + len += IB_UDP_BYTES; + } ib_pack(bth_table, ARRAY_SIZE(bth_table), &header->bth, buf + len); diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 40becdb3196e..e69bf266049d 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -232,7 +232,7 @@ static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn, ib_ucontext_notifier_end_account(context); } -static struct mmu_notifier_ops ib_umem_notifiers = { +static const struct mmu_notifier_ops ib_umem_notifiers = { .release = ib_umem_notifier_release, .invalidate_page = ib_umem_notifier_invalidate_page, .invalidate_range_start = ib_umem_notifier_invalidate_range_start, diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index 57f281f8d686..415a3185cde7 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -210,6 +210,7 @@ static void send_handler(struct ib_mad_agent *agent, } static void recv_handler(struct ib_mad_agent *agent, + struct ib_mad_send_buf *send_buf, struct ib_mad_recv_wc *mad_recv_wc) { struct ib_umad_file *file = agent->context; diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 94bbd8c155fc..612ccfd39bf9 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -204,6 +204,8 @@ void ib_uverbs_event_handler(struct ib_event_handler *handler, struct ib_event *event); void ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev, struct ib_xrcd *xrcd); +int uverbs_dealloc_mw(struct ib_mw *mw); + struct ib_uverbs_flow_spec { union { union { diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 1c02deab068f..6ffc9c4e93af 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -291,9 +291,6 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, struct ib_uverbs_get_context cmd; struct ib_uverbs_get_context_resp resp; struct ib_udata udata; -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - struct ib_device_attr dev_attr; -#endif struct ib_ucontext *ucontext; struct file *filp; int ret; @@ -342,10 +339,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file 
*file, ucontext->odp_mrs_count = 0; INIT_LIST_HEAD(&ucontext->no_private_counters); - ret = ib_query_device(ib_dev, &dev_attr); - if (ret) - goto err_free; - if (!(dev_attr.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING)) + if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING)) ucontext->invalidate_range = NULL; #endif @@ -447,8 +441,6 @@ ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file, { struct ib_uverbs_query_device cmd; struct ib_uverbs_query_device_resp resp; - struct ib_device_attr attr; - int ret; if (out_len < sizeof resp) return -ENOSPC; @@ -456,12 +448,8 @@ ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - ret = ib_query_device(ib_dev, &attr); - if (ret) - return ret; - memset(&resp, 0, sizeof resp); - copy_query_dev_fields(file, ib_dev, &resp, &attr); + copy_query_dev_fields(file, ib_dev, &resp, &ib_dev->attrs); if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) @@ -986,11 +974,8 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, } if (cmd.access_flags & IB_ACCESS_ON_DEMAND) { - struct ib_device_attr attr; - - ret = ib_query_device(pd->device, &attr); - if (ret || !(attr.device_cap_flags & - IB_DEVICE_ON_DEMAND_PAGING)) { + if (!(pd->device->attrs.device_cap_flags & + IB_DEVICE_ON_DEMAND_PAGING)) { pr_debug("ODP support not available\n"); ret = -EINVAL; goto err_put; @@ -1008,7 +993,6 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, mr->pd = pd; mr->uobject = uobj; atomic_inc(&pd->usecnt); - atomic_set(&mr->usecnt, 0); uobj->object = mr; ret = idr_add_uobj(&ib_uverbs_mr_idr, uobj); @@ -1106,11 +1090,6 @@ ssize_t ib_uverbs_rereg_mr(struct ib_uverbs_file *file, } } - if (atomic_read(&mr->usecnt)) { - ret = -EBUSY; - goto put_uobj_pd; - } - old_pd = mr->pd; ret = mr->device->rereg_user_mr(mr, cmd.flags, cmd.start, cmd.length, cmd.hca_va, @@ -1258,7 +1237,7 @@ err_copy: idr_remove_uobj(&ib_uverbs_mw_idr, uobj); err_unalloc: - ib_dealloc_mw(mw); + uverbs_dealloc_mw(mw); err_put: put_pd_read(pd); @@ -1287,7 +1266,7 @@ ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file, mw = uobj->object; - ret = ib_dealloc_mw(mw); + ret = uverbs_dealloc_mw(mw); if (!ret) uobj->live = 0; @@ -1845,7 +1824,10 @@ static int create_qp(struct ib_uverbs_file *file, sizeof(cmd->create_flags)) attr.create_flags = cmd->create_flags; - if (attr.create_flags & ~IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) { + if (attr.create_flags & ~(IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK | + IB_QP_CREATE_CROSS_CHANNEL | + IB_QP_CREATE_MANAGED_SEND | + IB_QP_CREATE_MANAGED_RECV)) { ret = -EINVAL; goto err_put; } diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index e3ef28861be6..39680aed99dd 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -133,6 +133,17 @@ static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file, static void ib_uverbs_add_one(struct ib_device *device); static void ib_uverbs_remove_one(struct ib_device *device, void *client_data); +int uverbs_dealloc_mw(struct ib_mw *mw) +{ + struct ib_pd *pd = mw->pd; + int ret; + + ret = mw->device->dealloc_mw(mw); + if (!ret) + atomic_dec(&pd->usecnt); + return ret; +} + static void ib_uverbs_release_dev(struct kobject *kobj) { struct ib_uverbs_device *dev = @@ -224,7 +235,7 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, struct ib_mw *mw = uobj->object; idr_remove_uobj(&ib_uverbs_mw_idr, uobj); - ib_dealloc_mw(mw); + 
uverbs_dealloc_mw(mw); kfree(uobj); } diff --git a/drivers/infiniband/core/uverbs_marshall.c b/drivers/infiniband/core/uverbs_marshall.c index 7d2f14c9bbef..af020f80d50f 100644 --- a/drivers/infiniband/core/uverbs_marshall.c +++ b/drivers/infiniband/core/uverbs_marshall.c @@ -144,5 +144,6 @@ void ib_copy_path_rec_from_user(struct ib_sa_path_rec *dst, memset(dst->dmac, 0, sizeof(dst->dmac)); dst->net = NULL; dst->ifindex = 0; + dst->gid_type = IB_GID_TYPE_IB; } EXPORT_SYMBOL(ib_copy_path_rec_from_user); diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 545906dec26d..5af6d024e053 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -229,12 +229,6 @@ EXPORT_SYMBOL(rdma_port_get_link_layer); struct ib_pd *ib_alloc_pd(struct ib_device *device) { struct ib_pd *pd; - struct ib_device_attr devattr; - int rc; - - rc = ib_query_device(device, &devattr); - if (rc) - return ERR_PTR(rc); pd = device->alloc_pd(device, NULL, NULL); if (IS_ERR(pd)) @@ -245,7 +239,7 @@ struct ib_pd *ib_alloc_pd(struct ib_device *device) pd->local_mr = NULL; atomic_set(&pd->usecnt, 0); - if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) + if (device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) pd->local_dma_lkey = device->local_dma_lkey; else { struct ib_mr *mr; @@ -311,8 +305,61 @@ struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr) } EXPORT_SYMBOL(ib_create_ah); +static int ib_get_header_version(const union rdma_network_hdr *hdr) +{ + const struct iphdr *ip4h = (struct iphdr *)&hdr->roce4grh; + struct iphdr ip4h_checked; + const struct ipv6hdr *ip6h = (struct ipv6hdr *)&hdr->ibgrh; + + /* If it's IPv6, the version must be 6, otherwise, the first + * 20 bytes (before the IPv4 header) are garbled. + */ + if (ip6h->version != 6) + return (ip4h->version == 4) ? 4 : 0; + /* version may be 6 or 4 because the first 20 bytes could be garbled */ + + /* RoCE v2 requires no options, thus header length + * must be 5 words + */ + if (ip4h->ihl != 5) + return 6; + + /* Verify checksum. + * We can't write on scattered buffers so we need to copy to + * temp buffer. 
+ */ + memcpy(&ip4h_checked, ip4h, sizeof(ip4h_checked)); + ip4h_checked.check = 0; + ip4h_checked.check = ip_fast_csum((u8 *)&ip4h_checked, 5); + /* if IPv4 header checksum is OK, believe it */ + if (ip4h->check == ip4h_checked.check) + return 4; + return 6; +} + +static enum rdma_network_type ib_get_net_type_by_grh(struct ib_device *device, + u8 port_num, + const struct ib_grh *grh) +{ + int grh_version; + + if (rdma_protocol_ib(device, port_num)) + return RDMA_NETWORK_IB; + + grh_version = ib_get_header_version((union rdma_network_hdr *)grh); + + if (grh_version == 4) + return RDMA_NETWORK_IPV4; + + if (grh->next_hdr == IPPROTO_UDP) + return RDMA_NETWORK_IPV6; + + return RDMA_NETWORK_ROCE_V1; +} + struct find_gid_index_context { u16 vlan_id; + enum ib_gid_type gid_type; }; static bool find_gid_index(const union ib_gid *gid, @@ -322,6 +369,9 @@ static bool find_gid_index(const union ib_gid *gid, struct find_gid_index_context *ctx = (struct find_gid_index_context *)context; + if (ctx->gid_type != gid_attr->gid_type) + return false; + if ((!!(ctx->vlan_id != 0xffff) == !is_vlan_dev(gid_attr->ndev)) || (is_vlan_dev(gid_attr->ndev) && vlan_dev_vlan_id(gid_attr->ndev) != ctx->vlan_id)) @@ -332,14 +382,49 @@ static bool find_gid_index(const union ib_gid *gid, static int get_sgid_index_from_eth(struct ib_device *device, u8 port_num, u16 vlan_id, const union ib_gid *sgid, + enum ib_gid_type gid_type, u16 *gid_index) { - struct find_gid_index_context context = {.vlan_id = vlan_id}; + struct find_gid_index_context context = {.vlan_id = vlan_id, + .gid_type = gid_type}; return ib_find_gid_by_filter(device, sgid, port_num, find_gid_index, &context, gid_index); } +static int get_gids_from_rdma_hdr(union rdma_network_hdr *hdr, + enum rdma_network_type net_type, + union ib_gid *sgid, union ib_gid *dgid) +{ + struct sockaddr_in src_in; + struct sockaddr_in dst_in; + __be32 src_saddr, dst_saddr; + + if (!sgid || !dgid) + return -EINVAL; + + if (net_type == RDMA_NETWORK_IPV4) { + memcpy(&src_in.sin_addr.s_addr, + &hdr->roce4grh.saddr, 4); + memcpy(&dst_in.sin_addr.s_addr, + &hdr->roce4grh.daddr, 4); + src_saddr = src_in.sin_addr.s_addr; + dst_saddr = dst_in.sin_addr.s_addr; + ipv6_addr_set_v4mapped(src_saddr, + (struct in6_addr *)sgid); + ipv6_addr_set_v4mapped(dst_saddr, + (struct in6_addr *)dgid); + return 0; + } else if (net_type == RDMA_NETWORK_IPV6 || + net_type == RDMA_NETWORK_IB) { + *dgid = hdr->ibgrh.dgid; + *sgid = hdr->ibgrh.sgid; + return 0; + } else { + return -EINVAL; + } +} + int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, const struct ib_wc *wc, const struct ib_grh *grh, struct ib_ah_attr *ah_attr) @@ -347,33 +432,72 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, u32 flow_class; u16 gid_index; int ret; + enum rdma_network_type net_type = RDMA_NETWORK_IB; + enum ib_gid_type gid_type = IB_GID_TYPE_IB; + int hoplimit = 0xff; + union ib_gid dgid; + union ib_gid sgid; memset(ah_attr, 0, sizeof *ah_attr); if (rdma_cap_eth_ah(device, port_num)) { + if (wc->wc_flags & IB_WC_WITH_NETWORK_HDR_TYPE) + net_type = wc->network_hdr_type; + else + net_type = ib_get_net_type_by_grh(device, port_num, grh); + gid_type = ib_network_to_gid_type(net_type); + } + ret = get_gids_from_rdma_hdr((union rdma_network_hdr *)grh, net_type, + &sgid, &dgid); + if (ret) + return ret; + + if (rdma_protocol_roce(device, port_num)) { + int if_index = 0; u16 vlan_id = wc->wc_flags & IB_WC_WITH_VLAN ? 
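ib_get_header_version() above distinguishes RoCE v2/IPv4 from the other header formats partly by recomputing the IPv4 header checksum over a copy, the same check ip_fast_csum() performs. A minimal userspace stand-in for that verification step (the sample header bytes are a hypothetical example):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* one's-complement sum over ihl 32-bit words, like ip_fast_csum() */
static uint16_t ip_csum(const void *hdr, int ihl)
{
	const uint16_t *p = hdr;
	uint32_t sum = 0;
	int i;

	for (i = 0; i < ihl * 2; i++)
		sum += p[i];
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	uint8_t h[20] = {                /* hypothetical IPv4 header, ihl = 5 */
		0x45, 0x00, 0x00, 0x54, 0x00, 0x00, 0x40, 0x00,
		0x40, 0x11, 0x00, 0x00, 0xc0, 0x00, 0x02, 0x01,
		0xc0, 0x00, 0x02, 0x02
	};
	uint16_t c = ip_csum(h, 5);      /* checksum field starts zeroed */

	memcpy(&h[10], &c, 2);           /* patch the checksum in */
	printf("0x%04x\n", ip_csum(h, 5)); /* 0x0000: the header verifies */
	return 0;
}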
wc->vlan_id : 0xffff; + struct net_device *idev; + struct net_device *resolved_dev; if (!(wc->wc_flags & IB_WC_GRH)) return -EPROTOTYPE; - if (!(wc->wc_flags & IB_WC_WITH_SMAC) || - !(wc->wc_flags & IB_WC_WITH_VLAN)) { - ret = rdma_addr_find_dmac_by_grh(&grh->dgid, &grh->sgid, - ah_attr->dmac, - wc->wc_flags & IB_WC_WITH_VLAN ? - NULL : &vlan_id, - 0); - if (ret) - return ret; + if (!device->get_netdev) + return -EOPNOTSUPP; + + idev = device->get_netdev(device, port_num); + if (!idev) + return -ENODEV; + + ret = rdma_addr_find_l2_eth_by_grh(&dgid, &sgid, + ah_attr->dmac, + wc->wc_flags & IB_WC_WITH_VLAN ? + NULL : &vlan_id, + &if_index, &hoplimit); + if (ret) { + dev_put(idev); + return ret; } - ret = get_sgid_index_from_eth(device, port_num, vlan_id, - &grh->dgid, &gid_index); + resolved_dev = dev_get_by_index(&init_net, if_index); + if (resolved_dev->flags & IFF_LOOPBACK) { + dev_put(resolved_dev); + resolved_dev = idev; + dev_hold(resolved_dev); + } + rcu_read_lock(); + if (resolved_dev != idev && !rdma_is_upper_dev_rcu(idev, + resolved_dev)) + ret = -EHOSTUNREACH; + rcu_read_unlock(); + dev_put(idev); + dev_put(resolved_dev); if (ret) return ret; - if (wc->wc_flags & IB_WC_WITH_SMAC) - memcpy(ah_attr->dmac, wc->smac, ETH_ALEN); + ret = get_sgid_index_from_eth(device, port_num, vlan_id, + &dgid, gid_type, &gid_index); + if (ret) + return ret; } ah_attr->dlid = wc->slid; @@ -383,10 +507,11 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, if (wc->wc_flags & IB_WC_GRH) { ah_attr->ah_flags = IB_AH_GRH; - ah_attr->grh.dgid = grh->sgid; + ah_attr->grh.dgid = sgid; if (!rdma_cap_eth_ah(device, port_num)) { - ret = ib_find_cached_gid_by_port(device, &grh->dgid, + ret = ib_find_cached_gid_by_port(device, &dgid, + IB_GID_TYPE_IB, port_num, NULL, &gid_index); if (ret) @@ -396,7 +521,7 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, ah_attr->grh.sgid_index = (u8) gid_index; flow_class = be32_to_cpu(grh->version_tclass_flow); ah_attr->grh.flow_label = flow_class & 0xFFFFF; - ah_attr->grh.hop_limit = 0xFF; + ah_attr->grh.hop_limit = hoplimit; ah_attr->grh.traffic_class = (flow_class >> 20) & 0xFF; } return 0; @@ -1014,6 +1139,7 @@ int ib_resolve_eth_dmac(struct ib_qp *qp, union ib_gid sgid; struct ib_gid_attr sgid_attr; int ifindex; + int hop_limit; ret = ib_query_gid(qp->device, qp_attr->ah_attr.port_num, @@ -1028,12 +1154,14 @@ int ib_resolve_eth_dmac(struct ib_qp *qp, ifindex = sgid_attr.ndev->ifindex; - ret = rdma_addr_find_dmac_by_grh(&sgid, - &qp_attr->ah_attr.grh.dgid, - qp_attr->ah_attr.dmac, - NULL, ifindex); + ret = rdma_addr_find_l2_eth_by_grh(&sgid, + &qp_attr->ah_attr.grh.dgid, + qp_attr->ah_attr.dmac, + NULL, &ifindex, &hop_limit); dev_put(sgid_attr.ndev); + + qp_attr->ah_attr.grh.hop_limit = hop_limit; } } out: @@ -1215,29 +1343,17 @@ struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags) mr->pd = pd; mr->uobject = NULL; atomic_inc(&pd->usecnt); - atomic_set(&mr->usecnt, 0); } return mr; } EXPORT_SYMBOL(ib_get_dma_mr); -int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr) -{ - return mr->device->query_mr ? 
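get_gids_from_rdma_hdr() in the hunk above stores RoCE v2/IPv4 addresses as v4-mapped IPv6 addresses, so one 16-byte GID format serves every network type. The mapping itself, sketched in userspace with standard socket headers only (treating the GID as 16 raw bytes is the assumption here):

#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>

int main(void)
{
	unsigned char gid[16] = {0};  /* union ib_gid: 16 raw bytes */
	struct in_addr v4;
	char buf[INET6_ADDRSTRLEN];

	inet_pton(AF_INET, "192.0.2.1", &v4);
	gid[10] = 0xff;               /* the ::ffff: prefix */
	gid[11] = 0xff;
	memcpy(&gid[12], &v4, 4);     /* IPv4 address in the last 4 bytes */

	inet_ntop(AF_INET6, gid, buf, sizeof(buf));
	printf("%s\n", buf);          /* ::ffff:192.0.2.1 */
	return 0;
}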
- mr->device->query_mr(mr, mr_attr) : -ENOSYS; -} -EXPORT_SYMBOL(ib_query_mr); - int ib_dereg_mr(struct ib_mr *mr) { - struct ib_pd *pd; + struct ib_pd *pd = mr->pd; int ret; - if (atomic_read(&mr->usecnt)) - return -EBUSY; - - pd = mr->pd; ret = mr->device->dereg_mr(mr); if (!ret) atomic_dec(&pd->usecnt); @@ -1273,49 +1389,12 @@ struct ib_mr *ib_alloc_mr(struct ib_pd *pd, mr->pd = pd; mr->uobject = NULL; atomic_inc(&pd->usecnt); - atomic_set(&mr->usecnt, 0); } return mr; } EXPORT_SYMBOL(ib_alloc_mr); -/* Memory windows */ - -struct ib_mw *ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type) -{ - struct ib_mw *mw; - - if (!pd->device->alloc_mw) - return ERR_PTR(-ENOSYS); - - mw = pd->device->alloc_mw(pd, type); - if (!IS_ERR(mw)) { - mw->device = pd->device; - mw->pd = pd; - mw->uobject = NULL; - mw->type = type; - atomic_inc(&pd->usecnt); - } - - return mw; -} -EXPORT_SYMBOL(ib_alloc_mw); - -int ib_dealloc_mw(struct ib_mw *mw) -{ - struct ib_pd *pd; - int ret; - - pd = mw->pd; - ret = mw->device->dealloc_mw(mw); - if (!ret) - atomic_dec(&pd->usecnt); - - return ret; -} -EXPORT_SYMBOL(ib_dealloc_mw); - /* "Fast" memory regions */ struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd, @@ -1530,7 +1609,7 @@ int ib_sg_to_pages(struct ib_mr *mr, int (*set_page)(struct ib_mr *, u64)) { struct scatterlist *sg; - u64 last_end_dma_addr = 0, last_page_addr = 0; + u64 last_end_dma_addr = 0; unsigned int last_page_off = 0; u64 page_mask = ~((u64)mr->page_size - 1); int i, ret; @@ -1572,7 +1651,6 @@ next_page: mr->length += dma_len; last_end_dma_addr = end_dma_addr; - last_page_addr = end_dma_addr & page_mask; last_page_off = end_dma_addr & ~page_mask; } diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.c b/drivers/infiniband/hw/cxgb3/iwch_cm.c index cb78b1e9bcd9..f504ba73e5dc 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_cm.c +++ b/drivers/infiniband/hw/cxgb3/iwch_cm.c @@ -149,7 +149,7 @@ static int iwch_l2t_send(struct t3cdev *tdev, struct sk_buff *skb, struct l2t_en error = l2t_send(tdev, skb, l2e); if (error < 0) kfree_skb(skb); - return error; + return error < 0 ? error : 0; } int iwch_cxgb3_ofld_send(struct t3cdev *tdev, struct sk_buff *skb) @@ -165,7 +165,7 @@ int iwch_cxgb3_ofld_send(struct t3cdev *tdev, struct sk_buff *skb) error = cxgb3_ofld_send(tdev, skb); if (error < 0) kfree_skb(skb); - return error; + return error < 0 ? error : 0; } static void release_tid(struct t3cdev *tdev, u32 hwtid, struct sk_buff *skb) diff --git a/drivers/infiniband/hw/cxgb3/iwch_cq.c b/drivers/infiniband/hw/cxgb3/iwch_cq.c index cfe404925a39..97fbfd2c298e 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_cq.c +++ b/drivers/infiniband/hw/cxgb3/iwch_cq.c @@ -115,10 +115,6 @@ static int iwch_poll_cq_one(struct iwch_dev *rhp, struct iwch_cq *chp, case T3_SEND_WITH_SE_INV: wc->opcode = IB_WC_SEND; break; - case T3_BIND_MW: - wc->opcode = IB_WC_BIND_MW; - break; - case T3_LOCAL_INV: wc->opcode = IB_WC_LOCAL_INV; break; diff --git a/drivers/infiniband/hw/cxgb3/iwch_mem.c b/drivers/infiniband/hw/cxgb3/iwch_mem.c index 5c36ee2809ac..1d04c872c9d5 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_mem.c +++ b/drivers/infiniband/hw/cxgb3/iwch_mem.c @@ -75,37 +75,6 @@ int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php, return ret; } -int iwch_reregister_mem(struct iwch_dev *rhp, struct iwch_pd *php, - struct iwch_mr *mhp, - int shift, - int npages) -{ - u32 stag; - int ret; - - /* We could support this... 
*/ - if (npages > mhp->attr.pbl_size) - return -ENOMEM; - - stag = mhp->attr.stag; - if (cxio_reregister_phys_mem(&rhp->rdev, - &stag, mhp->attr.pdid, - mhp->attr.perms, - mhp->attr.zbva, - mhp->attr.va_fbo, - mhp->attr.len, - shift - 12, - mhp->attr.pbl_size, mhp->attr.pbl_addr)) - return -ENOMEM; - - ret = iwch_finish_mem_reg(mhp, stag); - if (ret) - cxio_dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size, - mhp->attr.pbl_addr); - - return ret; -} - int iwch_alloc_pbl(struct iwch_mr *mhp, int npages) { mhp->attr.pbl_addr = cxio_hal_pblpool_alloc(&mhp->rhp->rdev, @@ -130,74 +99,3 @@ int iwch_write_pbl(struct iwch_mr *mhp, __be64 *pages, int npages, int offset) return cxio_write_pbl(&mhp->rhp->rdev, pages, mhp->attr.pbl_addr + (offset << 3), npages); } - -int build_phys_page_list(struct ib_phys_buf *buffer_list, - int num_phys_buf, - u64 *iova_start, - u64 *total_size, - int *npages, - int *shift, - __be64 **page_list) -{ - u64 mask; - int i, j, n; - - mask = 0; - *total_size = 0; - for (i = 0; i < num_phys_buf; ++i) { - if (i != 0 && buffer_list[i].addr & ~PAGE_MASK) - return -EINVAL; - if (i != 0 && i != num_phys_buf - 1 && - (buffer_list[i].size & ~PAGE_MASK)) - return -EINVAL; - *total_size += buffer_list[i].size; - if (i > 0) - mask |= buffer_list[i].addr; - else - mask |= buffer_list[i].addr & PAGE_MASK; - if (i != num_phys_buf - 1) - mask |= buffer_list[i].addr + buffer_list[i].size; - else - mask |= (buffer_list[i].addr + buffer_list[i].size + - PAGE_SIZE - 1) & PAGE_MASK; - } - - if (*total_size > 0xFFFFFFFFULL) - return -ENOMEM; - - /* Find largest page shift we can use to cover buffers */ - for (*shift = PAGE_SHIFT; *shift < 27; ++(*shift)) - if ((1ULL << *shift) & mask) - break; - - buffer_list[0].size += buffer_list[0].addr & ((1ULL << *shift) - 1); - buffer_list[0].addr &= ~0ull << *shift; - - *npages = 0; - for (i = 0; i < num_phys_buf; ++i) - *npages += (buffer_list[i].size + - (1ULL << *shift) - 1) >> *shift; - - if (!*npages) - return -EINVAL; - - *page_list = kmalloc(sizeof(u64) * *npages, GFP_KERNEL); - if (!*page_list) - return -ENOMEM; - - n = 0; - for (i = 0; i < num_phys_buf; ++i) - for (j = 0; - j < (buffer_list[i].size + (1ULL << *shift) - 1) >> *shift; - ++j) - (*page_list)[n++] = cpu_to_be64(buffer_list[i].addr + - ((u64) j << *shift)); - - PDBG("%s va 0x%llx mask 0x%llx shift %d len %lld pbl_size %d\n", - __func__, (unsigned long long) *iova_start, - (unsigned long long) mask, *shift, (unsigned long long) *total_size, - *npages); - - return 0; - -} diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index c34725ca0bb4..2734820d291b 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -458,9 +458,6 @@ static int iwch_dereg_mr(struct ib_mr *ib_mr) u32 mmid; PDBG("%s ib_mr %p\n", __func__, ib_mr); - /* There can be no memory windows */ - if (atomic_read(&ib_mr->usecnt)) - return -EINVAL; mhp = to_iwch_mr(ib_mr); kfree(mhp->pages); @@ -479,24 +476,25 @@ static int iwch_dereg_mr(struct ib_mr *ib_mr) return 0; } -static struct ib_mr *iwch_register_phys_mem(struct ib_pd *pd, - struct ib_phys_buf *buffer_list, - int num_phys_buf, - int acc, - u64 *iova_start) +static struct ib_mr *iwch_get_dma_mr(struct ib_pd *pd, int acc) { - __be64 *page_list; - int shift; - u64 total_size; - int npages; - struct iwch_dev *rhp; - struct iwch_pd *php; + const u64 total_size = 0xffffffff; + const u64 mask = (total_size + PAGE_SIZE - 1) & PAGE_MASK; + struct iwch_pd *php = 
to_iwch_pd(pd); + struct iwch_dev *rhp = php->rhp; struct iwch_mr *mhp; - int ret; + __be64 *page_list; + int shift = 26, npages, ret, i; PDBG("%s ib_pd %p\n", __func__, pd); - php = to_iwch_pd(pd); - rhp = php->rhp; + + /* + * T3 only supports 32 bits of size. + */ + if (sizeof(phys_addr_t) > 4) { + pr_warn_once(MOD "Cannot support dma_mrs on this platform.\n"); + return ERR_PTR(-ENOTSUPP); + } mhp = kzalloc(sizeof(*mhp), GFP_KERNEL); if (!mhp) @@ -504,22 +502,23 @@ static struct ib_mr *iwch_register_phys_mem(struct ib_pd *pd, mhp->rhp = rhp; - /* First check that we have enough alignment */ - if ((*iova_start & ~PAGE_MASK) != (buffer_list[0].addr & ~PAGE_MASK)) { + npages = (total_size + (1ULL << shift) - 1) >> shift; + if (!npages) { ret = -EINVAL; goto err; } - if (num_phys_buf > 1 && - ((buffer_list[0].addr + buffer_list[0].size) & ~PAGE_MASK)) { - ret = -EINVAL; + page_list = kmalloc_array(npages, sizeof(u64), GFP_KERNEL); + if (!page_list) { + ret = -ENOMEM; goto err; } - ret = build_phys_page_list(buffer_list, num_phys_buf, iova_start, - &total_size, &npages, &shift, &page_list); - if (ret) - goto err; + for (i = 0; i < npages; i++) + page_list[i] = cpu_to_be64((u64)i << shift); + + PDBG("%s mask 0x%llx shift %d len %lld pbl_size %d\n", + __func__, mask, shift, total_size, npages); ret = iwch_alloc_pbl(mhp, npages); if (ret) { @@ -536,7 +535,7 @@ static struct ib_mr *iwch_register_phys_mem(struct ib_pd *pd, mhp->attr.zbva = 0; mhp->attr.perms = iwch_ib_to_tpt_access(acc); - mhp->attr.va_fbo = *iova_start; + mhp->attr.va_fbo = 0; mhp->attr.page_size = shift - 12; mhp->attr.len = (u32) total_size; @@ -553,76 +552,8 @@ err_pbl: err: kfree(mhp); return ERR_PTR(ret); - -} - -static int iwch_reregister_phys_mem(struct ib_mr *mr, - int mr_rereg_mask, - struct ib_pd *pd, - struct ib_phys_buf *buffer_list, - int num_phys_buf, - int acc, u64 * iova_start) -{ - - struct iwch_mr mh, *mhp; - struct iwch_pd *php; - struct iwch_dev *rhp; - __be64 *page_list = NULL; - int shift = 0; - u64 total_size; - int npages = 0; - int ret; - - PDBG("%s ib_mr %p ib_pd %p\n", __func__, mr, pd); - - /* There can be no memory windows */ - if (atomic_read(&mr->usecnt)) - return -EINVAL; - - mhp = to_iwch_mr(mr); - rhp = mhp->rhp; - php = to_iwch_pd(mr->pd); - - /* make sure we are on the same adapter */ - if (rhp != php->rhp) - return -EINVAL; - - memcpy(&mh, mhp, sizeof *mhp); - - if (mr_rereg_mask & IB_MR_REREG_PD) - php = to_iwch_pd(pd); - if (mr_rereg_mask & IB_MR_REREG_ACCESS) - mh.attr.perms = iwch_ib_to_tpt_access(acc); - if (mr_rereg_mask & IB_MR_REREG_TRANS) { - ret = build_phys_page_list(buffer_list, num_phys_buf, - iova_start, - &total_size, &npages, - &shift, &page_list); - if (ret) - return ret; - } - - ret = iwch_reregister_mem(rhp, php, &mh, shift, npages); - kfree(page_list); - if (ret) { - return ret; - } - if (mr_rereg_mask & IB_MR_REREG_PD) - mhp->attr.pdid = php->pdid; - if (mr_rereg_mask & IB_MR_REREG_ACCESS) - mhp->attr.perms = iwch_ib_to_tpt_access(acc); - if (mr_rereg_mask & IB_MR_REREG_TRANS) { - mhp->attr.zbva = 0; - mhp->attr.va_fbo = *iova_start; - mhp->attr.page_size = shift - 12; - mhp->attr.len = (u32) total_size; - mhp->attr.pbl_size = npages; - } - - return 0; } - static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt, int acc, struct ib_udata *udata) { @@ -726,28 +657,6 @@ err: return ERR_PTR(err); } -static struct ib_mr *iwch_get_dma_mr(struct ib_pd *pd, int acc) -{ - struct ib_phys_buf bl; - u64 kva; - struct ib_mr *ibmr; - - PDBG("%s ib_pd 
%p\n", __func__, pd); - - /* - * T3 only supports 32 bits of size. - */ - if (sizeof(phys_addr_t) > 4) { - pr_warn_once(MOD "Cannot support dma_mrs on this platform.\n"); - return ERR_PTR(-ENOTSUPP); - } - bl.size = 0xffffffff; - bl.addr = 0; - kva = 0; - ibmr = iwch_register_phys_mem(pd, &bl, 1, acc, &kva); - return ibmr; -} - static struct ib_mw *iwch_alloc_mw(struct ib_pd *pd, enum ib_mw_type type) { struct iwch_dev *rhp; @@ -1452,12 +1361,9 @@ int iwch_register_device(struct iwch_dev *dev) dev->ibdev.resize_cq = iwch_resize_cq; dev->ibdev.poll_cq = iwch_poll_cq; dev->ibdev.get_dma_mr = iwch_get_dma_mr; - dev->ibdev.reg_phys_mr = iwch_register_phys_mem; - dev->ibdev.rereg_phys_mr = iwch_reregister_phys_mem; dev->ibdev.reg_user_mr = iwch_reg_user_mr; dev->ibdev.dereg_mr = iwch_dereg_mr; dev->ibdev.alloc_mw = iwch_alloc_mw; - dev->ibdev.bind_mw = iwch_bind_mw; dev->ibdev.dealloc_mw = iwch_dealloc_mw; dev->ibdev.alloc_mr = iwch_alloc_mr; dev->ibdev.map_mr_sg = iwch_map_mr_sg; diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.h b/drivers/infiniband/hw/cxgb3/iwch_provider.h index 2ac85b86a680..252c464a09f6 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.h +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.h @@ -330,9 +330,6 @@ int iwch_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, struct ib_send_wr **bad_wr); int iwch_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, struct ib_recv_wr **bad_wr); -int iwch_bind_mw(struct ib_qp *qp, - struct ib_mw *mw, - struct ib_mw_bind *mw_bind); int iwch_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); int iwch_post_terminate(struct iwch_qp *qhp, struct respQ_msg_t *rsp_msg); int iwch_post_zb_read(struct iwch_ep *ep); @@ -341,21 +338,9 @@ void iwch_unregister_device(struct iwch_dev *dev); void stop_read_rep_timer(struct iwch_qp *qhp); int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php, struct iwch_mr *mhp, int shift); -int iwch_reregister_mem(struct iwch_dev *rhp, struct iwch_pd *php, - struct iwch_mr *mhp, - int shift, - int npages); int iwch_alloc_pbl(struct iwch_mr *mhp, int npages); void iwch_free_pbl(struct iwch_mr *mhp); int iwch_write_pbl(struct iwch_mr *mhp, __be64 *pages, int npages, int offset); -int build_phys_page_list(struct ib_phys_buf *buffer_list, - int num_phys_buf, - u64 *iova_start, - u64 *total_size, - int *npages, - int *shift, - __be64 **page_list); - #define IWCH_NODE_DESC "cxgb3 Chelsio Communications" diff --git a/drivers/infiniband/hw/cxgb3/iwch_qp.c b/drivers/infiniband/hw/cxgb3/iwch_qp.c index d0548fc6395e..d939980a708f 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_qp.c +++ b/drivers/infiniband/hw/cxgb3/iwch_qp.c @@ -526,88 +526,6 @@ out: return err; } -int iwch_bind_mw(struct ib_qp *qp, - struct ib_mw *mw, - struct ib_mw_bind *mw_bind) -{ - struct iwch_dev *rhp; - struct iwch_mw *mhp; - struct iwch_qp *qhp; - union t3_wr *wqe; - u32 pbl_addr; - u8 page_size; - u32 num_wrs; - unsigned long flag; - struct ib_sge sgl; - int err=0; - enum t3_wr_flags t3_wr_flags; - u32 idx; - struct t3_swsq *sqp; - - qhp = to_iwch_qp(qp); - mhp = to_iwch_mw(mw); - rhp = qhp->rhp; - - spin_lock_irqsave(&qhp->lock, flag); - if (qhp->attr.state > IWCH_QP_STATE_RTS) { - spin_unlock_irqrestore(&qhp->lock, flag); - return -EINVAL; - } - num_wrs = Q_FREECNT(qhp->wq.sq_rptr, qhp->wq.sq_wptr, - qhp->wq.sq_size_log2); - if (num_wrs == 0) { - spin_unlock_irqrestore(&qhp->lock, flag); - return -ENOMEM; - } - idx = Q_PTR2IDX(qhp->wq.wptr, qhp->wq.size_log2); - PDBG("%s: idx 0x%0x, mw 0x%p, mw_bind 0x%p\n", 
__func__, idx, - mw, mw_bind); - wqe = (union t3_wr *) (qhp->wq.queue + idx); - - t3_wr_flags = 0; - if (mw_bind->send_flags & IB_SEND_SIGNALED) - t3_wr_flags = T3_COMPLETION_FLAG; - - sgl.addr = mw_bind->bind_info.addr; - sgl.lkey = mw_bind->bind_info.mr->lkey; - sgl.length = mw_bind->bind_info.length; - wqe->bind.reserved = 0; - wqe->bind.type = TPT_VATO; - - /* TBD: check perms */ - wqe->bind.perms = iwch_ib_to_tpt_bind_access( - mw_bind->bind_info.mw_access_flags); - wqe->bind.mr_stag = cpu_to_be32(mw_bind->bind_info.mr->lkey); - wqe->bind.mw_stag = cpu_to_be32(mw->rkey); - wqe->bind.mw_len = cpu_to_be32(mw_bind->bind_info.length); - wqe->bind.mw_va = cpu_to_be64(mw_bind->bind_info.addr); - err = iwch_sgl2pbl_map(rhp, &sgl, 1, &pbl_addr, &page_size); - if (err) { - spin_unlock_irqrestore(&qhp->lock, flag); - return err; - } - wqe->send.wrid.id0.hi = qhp->wq.sq_wptr; - sqp = qhp->wq.sq + Q_PTR2IDX(qhp->wq.sq_wptr, qhp->wq.sq_size_log2); - sqp->wr_id = mw_bind->wr_id; - sqp->opcode = T3_BIND_MW; - sqp->sq_wptr = qhp->wq.sq_wptr; - sqp->complete = 0; - sqp->signaled = (mw_bind->send_flags & IB_SEND_SIGNALED); - wqe->bind.mr_pbl_addr = cpu_to_be32(pbl_addr); - wqe->bind.mr_pagesz = page_size; - build_fw_riwrh((void *)wqe, T3_WR_BIND, t3_wr_flags, - Q_GENBIT(qhp->wq.wptr, qhp->wq.size_log2), 0, - sizeof(struct t3_bind_mw_wr) >> 3, T3_SOPEOP); - ++(qhp->wq.wptr); - ++(qhp->wq.sq_wptr); - spin_unlock_irqrestore(&qhp->lock, flag); - - if (cxio_wq_db_enabled(&qhp->wq)) - ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid); - - return err; -} - static inline void build_term_codes(struct respQ_msg_t *rsp_msg, u8 *layer_type, u8 *ecode) { diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 326d07d823a5..cd2ff5f9518a 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -3271,6 +3271,12 @@ static int create_server6(struct c4iw_dev *dev, struct c4iw_listen_ep *ep) struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) &ep->com.mapped_local_addr; + if (ipv6_addr_type(&sin6->sin6_addr) != IPV6_ADDR_ANY) { + err = cxgb4_clip_get(ep->com.dev->rdev.lldi.ports[0], + (const u32 *)&sin6->sin6_addr.s6_addr, 1); + if (err) + return err; + } c4iw_init_wr_wait(&ep->com.wr_wait); err = cxgb4_create_server6(ep->com.dev->rdev.lldi.ports[0], ep->stid, &sin6->sin6_addr, @@ -3282,13 +3288,13 @@ static int create_server6(struct c4iw_dev *dev, struct c4iw_listen_ep *ep) 0, 0, __func__); else if (err > 0) err = net_xmit_errno(err); - if (err) + if (err) { + cxgb4_clip_release(ep->com.dev->rdev.lldi.ports[0], + (const u32 *)&sin6->sin6_addr.s6_addr, 1); pr_err("cxgb4_create_server6/filter failed err %d stid %d laddr %pI6 lport %d\n", err, ep->stid, sin6->sin6_addr.s6_addr, ntohs(sin6->sin6_port)); - else - cxgb4_clip_get(ep->com.dev->rdev.lldi.ports[0], - (const u32 *)&sin6->sin6_addr.s6_addr, 1); + } return err; } diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c index de9cd6901752..cf21df4a8bf5 100644 --- a/drivers/infiniband/hw/cxgb4/cq.c +++ b/drivers/infiniband/hw/cxgb4/cq.c @@ -744,9 +744,6 @@ static int c4iw_poll_cq_one(struct c4iw_cq *chp, struct ib_wc *wc) case FW_RI_SEND_WITH_SE: wc->opcode = IB_WC_SEND; break; - case FW_RI_BIND_MW: - wc->opcode = IB_WC_BIND_MW; - break; case FW_RI_LOCAL_INV: wc->opcode = IB_WC_LOCAL_INV; diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c index 58fce1742b8d..8024ea4417b8 100644 --- a/drivers/infiniband/hw/cxgb4/device.c +++ 
b/drivers/infiniband/hw/cxgb4/device.c @@ -315,14 +315,12 @@ static int qp_release(struct inode *inode, struct file *file) static int qp_open(struct inode *inode, struct file *file) { struct c4iw_debugfs_data *qpd; - int ret = 0; int count = 1; qpd = kmalloc(sizeof *qpd, GFP_KERNEL); - if (!qpd) { - ret = -ENOMEM; - goto out; - } + if (!qpd) + return -ENOMEM; + qpd->devp = inode->i_private; qpd->pos = 0; @@ -333,8 +331,8 @@ static int qp_open(struct inode *inode, struct file *file) qpd->bufsize = count * 128; qpd->buf = vmalloc(qpd->bufsize); if (!qpd->buf) { - ret = -ENOMEM; - goto err1; + kfree(qpd); + return -ENOMEM; } spin_lock_irq(&qpd->devp->lock); @@ -343,11 +341,7 @@ static int qp_open(struct inode *inode, struct file *file) qpd->buf[qpd->pos++] = 0; file->private_data = qpd; - goto out; -err1: - kfree(qpd); -out: - return ret; + return 0; } static const struct file_operations qp_debugfs_fops = { @@ -781,8 +775,7 @@ static int c4iw_rdev_open(struct c4iw_rdev *rdev) pr_err(MOD "%s: unsupported udb/ucq densities %u/%u\n", pci_name(rdev->lldi.pdev), rdev->lldi.udb_density, rdev->lldi.ucq_density); - err = -EINVAL; - goto err1; + return -EINVAL; } if (rdev->lldi.vr->qp.start != rdev->lldi.vr->cq.start || rdev->lldi.vr->qp.size != rdev->lldi.vr->cq.size) { @@ -791,8 +784,7 @@ static int c4iw_rdev_open(struct c4iw_rdev *rdev) pci_name(rdev->lldi.pdev), rdev->lldi.vr->qp.start, rdev->lldi.vr->qp.size, rdev->lldi.vr->cq.size, rdev->lldi.vr->cq.size); - err = -EINVAL; - goto err1; + return -EINVAL; } rdev->qpmask = rdev->lldi.udb_density - 1; @@ -816,10 +808,8 @@ static int c4iw_rdev_open(struct c4iw_rdev *rdev) rdev->lldi.db_reg, rdev->lldi.gts_reg, rdev->qpmask, rdev->cqmask); - if (c4iw_num_stags(rdev) == 0) { - err = -EINVAL; - goto err1; - } + if (c4iw_num_stags(rdev) == 0) + return -EINVAL; rdev->stats.pd.total = T4_MAX_NUM_PD; rdev->stats.stag.total = rdev->lldi.vr->stag.size; @@ -831,29 +821,31 @@ static int c4iw_rdev_open(struct c4iw_rdev *rdev) err = c4iw_init_resource(rdev, c4iw_num_stags(rdev), T4_MAX_NUM_PD); if (err) { printk(KERN_ERR MOD "error %d initializing resources\n", err); - goto err1; + return err; } err = c4iw_pblpool_create(rdev); if (err) { printk(KERN_ERR MOD "error %d initializing pbl pool\n", err); - goto err2; + goto destroy_resource; } err = c4iw_rqtpool_create(rdev); if (err) { printk(KERN_ERR MOD "error %d initializing rqt pool\n", err); - goto err3; + goto destroy_pblpool; } err = c4iw_ocqp_pool_create(rdev); if (err) { printk(KERN_ERR MOD "error %d initializing ocqp pool\n", err); - goto err4; + goto destroy_rqtpool; } rdev->status_page = (struct t4_dev_status_page *) __get_free_page(GFP_KERNEL); - if (!rdev->status_page) { - pr_err(MOD "error allocating status page\n"); - goto err4; - } + if (!rdev->status_page) + goto destroy_ocqp_pool; + rdev->status_page->qp_start = rdev->lldi.vr->qp.start; + rdev->status_page->qp_size = rdev->lldi.vr->qp.size; + rdev->status_page->cq_start = rdev->lldi.vr->cq.start; + rdev->status_page->cq_size = rdev->lldi.vr->cq.size; if (c4iw_wr_log) { rdev->wr_log = kzalloc((1 << c4iw_wr_log_size_order) * @@ -869,13 +861,14 @@ static int c4iw_rdev_open(struct c4iw_rdev *rdev) rdev->status_page->db_off = 0; return 0; -err4: +destroy_ocqp_pool: + c4iw_ocqp_pool_destroy(rdev); +destroy_rqtpool: c4iw_rqtpool_destroy(rdev); -err3: +destroy_pblpool: c4iw_pblpool_destroy(rdev); -err2: +destroy_resource: c4iw_destroy_resource(&rdev->resource); -err1: return err; } diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h 
b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h index 00e55faa086a..fb2de75a0392 100644 --- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h +++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h @@ -947,8 +947,6 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, struct ib_send_wr **bad_wr); int c4iw_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, struct ib_recv_wr **bad_wr); -int c4iw_bind_mw(struct ib_qp *qp, struct ib_mw *mw, - struct ib_mw_bind *mw_bind); int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param); int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog); int c4iw_destroy_listen(struct iw_cm_id *cm_id); @@ -968,17 +966,6 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt, int acc, struct ib_udata *udata); struct ib_mr *c4iw_get_dma_mr(struct ib_pd *pd, int acc); -struct ib_mr *c4iw_register_phys_mem(struct ib_pd *pd, - struct ib_phys_buf *buffer_list, - int num_phys_buf, - int acc, - u64 *iova_start); -int c4iw_reregister_phys_mem(struct ib_mr *mr, - int mr_rereg_mask, - struct ib_pd *pd, - struct ib_phys_buf *buffer_list, - int num_phys_buf, - int acc, u64 *iova_start); int c4iw_dereg_mr(struct ib_mr *ib_mr); int c4iw_destroy_cq(struct ib_cq *ib_cq); struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c index e1629ab58db7..7849890c4781 100644 --- a/drivers/infiniband/hw/cxgb4/mem.c +++ b/drivers/infiniband/hw/cxgb4/mem.c @@ -392,32 +392,6 @@ static int register_mem(struct c4iw_dev *rhp, struct c4iw_pd *php, return ret; } -static int reregister_mem(struct c4iw_dev *rhp, struct c4iw_pd *php, - struct c4iw_mr *mhp, int shift, int npages) -{ - u32 stag; - int ret; - - if (npages > mhp->attr.pbl_size) - return -ENOMEM; - - stag = mhp->attr.stag; - ret = write_tpt_entry(&rhp->rdev, 0, &stag, 1, mhp->attr.pdid, - FW_RI_STAG_NSMR, mhp->attr.perms, - mhp->attr.mw_bind_enable, mhp->attr.zbva, - mhp->attr.va_fbo, mhp->attr.len, shift - 12, - mhp->attr.pbl_size, mhp->attr.pbl_addr); - if (ret) - return ret; - - ret = finish_mem_reg(mhp, stag); - if (ret) - dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size, - mhp->attr.pbl_addr); - - return ret; -} - static int alloc_pbl(struct c4iw_mr *mhp, int npages) { mhp->attr.pbl_addr = c4iw_pblpool_alloc(&mhp->rhp->rdev, @@ -431,228 +405,6 @@ static int alloc_pbl(struct c4iw_mr *mhp, int npages) return 0; } -static int build_phys_page_list(struct ib_phys_buf *buffer_list, - int num_phys_buf, u64 *iova_start, - u64 *total_size, int *npages, - int *shift, __be64 **page_list) -{ - u64 mask; - int i, j, n; - - mask = 0; - *total_size = 0; - for (i = 0; i < num_phys_buf; ++i) { - if (i != 0 && buffer_list[i].addr & ~PAGE_MASK) - return -EINVAL; - if (i != 0 && i != num_phys_buf - 1 && - (buffer_list[i].size & ~PAGE_MASK)) - return -EINVAL; - *total_size += buffer_list[i].size; - if (i > 0) - mask |= buffer_list[i].addr; - else - mask |= buffer_list[i].addr & PAGE_MASK; - if (i != num_phys_buf - 1) - mask |= buffer_list[i].addr + buffer_list[i].size; - else - mask |= (buffer_list[i].addr + buffer_list[i].size + - PAGE_SIZE - 1) & PAGE_MASK; - } - - if (*total_size > 0xFFFFFFFFULL) - return -ENOMEM; - - /* Find largest page shift we can use to cover buffers */ - for (*shift = PAGE_SHIFT; *shift < 27; ++(*shift)) - if ((1ULL << *shift) & mask) - break; - - buffer_list[0].size += buffer_list[0].addr & ((1ULL << *shift) - 1); - buffer_list[0].addr &= ~0ull << *shift; - - *npages = 0; - for (i = 0; i < 
num_phys_buf; ++i) - *npages += (buffer_list[i].size + - (1ULL << *shift) - 1) >> *shift; - - if (!*npages) - return -EINVAL; - - *page_list = kmalloc(sizeof(u64) * *npages, GFP_KERNEL); - if (!*page_list) - return -ENOMEM; - - n = 0; - for (i = 0; i < num_phys_buf; ++i) - for (j = 0; - j < (buffer_list[i].size + (1ULL << *shift) - 1) >> *shift; - ++j) - (*page_list)[n++] = cpu_to_be64(buffer_list[i].addr + - ((u64) j << *shift)); - - PDBG("%s va 0x%llx mask 0x%llx shift %d len %lld pbl_size %d\n", - __func__, (unsigned long long)*iova_start, - (unsigned long long)mask, *shift, (unsigned long long)*total_size, - *npages); - - return 0; - -} - -int c4iw_reregister_phys_mem(struct ib_mr *mr, int mr_rereg_mask, - struct ib_pd *pd, struct ib_phys_buf *buffer_list, - int num_phys_buf, int acc, u64 *iova_start) -{ - - struct c4iw_mr mh, *mhp; - struct c4iw_pd *php; - struct c4iw_dev *rhp; - __be64 *page_list = NULL; - int shift = 0; - u64 total_size; - int npages; - int ret; - - PDBG("%s ib_mr %p ib_pd %p\n", __func__, mr, pd); - - /* There can be no memory windows */ - if (atomic_read(&mr->usecnt)) - return -EINVAL; - - mhp = to_c4iw_mr(mr); - rhp = mhp->rhp; - php = to_c4iw_pd(mr->pd); - - /* make sure we are on the same adapter */ - if (rhp != php->rhp) - return -EINVAL; - - memcpy(&mh, mhp, sizeof *mhp); - - if (mr_rereg_mask & IB_MR_REREG_PD) - php = to_c4iw_pd(pd); - if (mr_rereg_mask & IB_MR_REREG_ACCESS) { - mh.attr.perms = c4iw_ib_to_tpt_access(acc); - mh.attr.mw_bind_enable = (acc & IB_ACCESS_MW_BIND) == - IB_ACCESS_MW_BIND; - } - if (mr_rereg_mask & IB_MR_REREG_TRANS) { - ret = build_phys_page_list(buffer_list, num_phys_buf, - iova_start, - &total_size, &npages, - &shift, &page_list); - if (ret) - return ret; - } - - if (mr_exceeds_hw_limits(rhp, total_size)) { - kfree(page_list); - return -EINVAL; - } - - ret = reregister_mem(rhp, php, &mh, shift, npages); - kfree(page_list); - if (ret) - return ret; - if (mr_rereg_mask & IB_MR_REREG_PD) - mhp->attr.pdid = php->pdid; - if (mr_rereg_mask & IB_MR_REREG_ACCESS) - mhp->attr.perms = c4iw_ib_to_tpt_access(acc); - if (mr_rereg_mask & IB_MR_REREG_TRANS) { - mhp->attr.zbva = 0; - mhp->attr.va_fbo = *iova_start; - mhp->attr.page_size = shift - 12; - mhp->attr.len = (u32) total_size; - mhp->attr.pbl_size = npages; - } - - return 0; -} - -struct ib_mr *c4iw_register_phys_mem(struct ib_pd *pd, - struct ib_phys_buf *buffer_list, - int num_phys_buf, int acc, u64 *iova_start) -{ - __be64 *page_list; - int shift; - u64 total_size; - int npages; - struct c4iw_dev *rhp; - struct c4iw_pd *php; - struct c4iw_mr *mhp; - int ret; - - PDBG("%s ib_pd %p\n", __func__, pd); - php = to_c4iw_pd(pd); - rhp = php->rhp; - - mhp = kzalloc(sizeof(*mhp), GFP_KERNEL); - if (!mhp) - return ERR_PTR(-ENOMEM); - - mhp->rhp = rhp; - - /* First check that we have enough alignment */ - if ((*iova_start & ~PAGE_MASK) != (buffer_list[0].addr & ~PAGE_MASK)) { - ret = -EINVAL; - goto err; - } - - if (num_phys_buf > 1 && - ((buffer_list[0].addr + buffer_list[0].size) & ~PAGE_MASK)) { - ret = -EINVAL; - goto err; - } - - ret = build_phys_page_list(buffer_list, num_phys_buf, iova_start, - &total_size, &npages, &shift, - &page_list); - if (ret) - goto err; - - if (mr_exceeds_hw_limits(rhp, total_size)) { - kfree(page_list); - ret = -EINVAL; - goto err; - } - - ret = alloc_pbl(mhp, npages); - if (ret) { - kfree(page_list); - goto err; - } - - ret = write_pbl(&mhp->rhp->rdev, page_list, mhp->attr.pbl_addr, - npages); - kfree(page_list); - if (ret) - goto err_pbl; - - mhp->attr.pdid = 
php->pdid; - mhp->attr.zbva = 0; - - mhp->attr.perms = c4iw_ib_to_tpt_access(acc); - mhp->attr.va_fbo = *iova_start; - mhp->attr.page_size = shift - 12; - - mhp->attr.len = (u32) total_size; - mhp->attr.pbl_size = npages; - ret = register_mem(rhp, php, mhp, shift); - if (ret) - goto err_pbl; - - return &mhp->ibmr; - -err_pbl: - c4iw_pblpool_free(&mhp->rhp->rdev, mhp->attr.pbl_addr, - mhp->attr.pbl_size << 3); - -err: - kfree(mhp); - return ERR_PTR(ret); - -} - struct ib_mr *c4iw_get_dma_mr(struct ib_pd *pd, int acc) { struct c4iw_dev *rhp; @@ -952,9 +704,6 @@ int c4iw_dereg_mr(struct ib_mr *ib_mr) u32 mmid; PDBG("%s ib_mr %p\n", __func__, ib_mr); - /* There can be no memory windows */ - if (atomic_read(&ib_mr->usecnt)) - return -EINVAL; mhp = to_c4iw_mr(ib_mr); rhp = mhp->rhp; diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c index 0a7d99818b17..ec04272fbdc2 100644 --- a/drivers/infiniband/hw/cxgb4/provider.c +++ b/drivers/infiniband/hw/cxgb4/provider.c @@ -549,12 +549,9 @@ int c4iw_register_device(struct c4iw_dev *dev) dev->ibdev.resize_cq = c4iw_resize_cq; dev->ibdev.poll_cq = c4iw_poll_cq; dev->ibdev.get_dma_mr = c4iw_get_dma_mr; - dev->ibdev.reg_phys_mr = c4iw_register_phys_mem; - dev->ibdev.rereg_phys_mr = c4iw_reregister_phys_mem; dev->ibdev.reg_user_mr = c4iw_reg_user_mr; dev->ibdev.dereg_mr = c4iw_dereg_mr; dev->ibdev.alloc_mw = c4iw_alloc_mw; - dev->ibdev.bind_mw = c4iw_bind_mw; dev->ibdev.dealloc_mw = c4iw_dealloc_mw; dev->ibdev.alloc_mr = c4iw_alloc_mr; dev->ibdev.map_mr_sg = c4iw_map_mr_sg; diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index aa515afee724..e99345eb875a 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -933,11 +933,6 @@ int c4iw_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, return err; } -int c4iw_bind_mw(struct ib_qp *qp, struct ib_mw *mw, struct ib_mw_bind *mw_bind) -{ - return -ENOSYS; -} - static inline void build_term_codes(struct t4_cqe *err_cqe, u8 *layer_type, u8 *ecode) { diff --git a/drivers/infiniband/hw/cxgb4/t4.h b/drivers/infiniband/hw/cxgb4/t4.h index 1092a2d1f607..6126bbe36095 100644 --- a/drivers/infiniband/hw/cxgb4/t4.h +++ b/drivers/infiniband/hw/cxgb4/t4.h @@ -699,4 +699,11 @@ static inline void t4_set_cq_in_error(struct t4_cq *cq) struct t4_dev_status_page { u8 db_off; + u8 pad1; + u16 pad2; + u32 pad3; + u64 qp_start; + u64 qp_size; + u64 cq_start; + u64 cq_size; }; diff --git a/drivers/infiniband/hw/cxgb4/user.h b/drivers/infiniband/hw/cxgb4/user.h index cbd0ce170728..295f422b9a3a 100644 --- a/drivers/infiniband/hw/cxgb4/user.h +++ b/drivers/infiniband/hw/cxgb4/user.h @@ -32,7 +32,7 @@ #ifndef __C4IW_USER_H__ #define __C4IW_USER_H__ -#define C4IW_UVERBS_ABI_VERSION 2 +#define C4IW_UVERBS_ABI_VERSION 3 /* * Make sure that all structs defined in this file remain laid out so diff --git a/drivers/infiniband/hw/mlx4/ah.c b/drivers/infiniband/hw/mlx4/ah.c index 86af71351d9a..105246fba2e7 100644 --- a/drivers/infiniband/hw/mlx4/ah.c +++ b/drivers/infiniband/hw/mlx4/ah.c @@ -92,7 +92,7 @@ static struct ib_ah *create_iboe_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr ah_attr->grh.sgid_index, &sgid, &gid_attr); if (ret) return ERR_PTR(ret); - memset(ah->av.eth.s_mac, 0, ETH_ALEN); + eth_zero_addr(ah->av.eth.s_mac); if (gid_attr.ndev) { if (is_vlan_dev(gid_attr.ndev)) vlan_tag = vlan_dev_vlan_id(gid_attr.ndev); @@ -104,6 +104,7 @@ static struct ib_ah *create_iboe_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr 
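The t4_dev_status_page hunk above adds explicit pad1/pad2/pad3 fields so qp_start begins at an 8-byte boundary and the page reads identically from 32- and 64-bit userspace; that layout change is also why C4IW_UVERBS_ABI_VERSION moves from 2 to 3. A host-side model of the layout check (field order is taken from the hunk; the offset is what the padding is assumed to guarantee):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct status_page_model {        /* mirrors struct t4_dev_status_page */
	uint8_t  db_off;
	uint8_t  pad1;
	uint16_t pad2;
	uint32_t pad3;
	uint64_t qp_start;
	uint64_t qp_size;
	uint64_t cq_start;
	uint64_t cq_size;
};

int main(void)
{
	static_assert(offsetof(struct status_page_model, qp_start) == 8,
		      "padding no longer aligns qp_start");
	printf("qp_start at offset %zu\n",
	       offsetof(struct status_page_model, qp_start));
	return 0;
}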
ah->av.eth.port_pd = cpu_to_be32(to_mpd(pd)->pdn | (ah_attr->port_num << 24)); ah->av.eth.gid_index = mlx4_ib_gid_index_to_real_index(ibdev, ah_attr->port_num, ah_attr->grh.sgid_index); ah->av.eth.vlan = cpu_to_be16(vlan_tag); + ah->av.eth.hop_limit = ah_attr->grh.hop_limit; if (ah_attr->static_rate) { ah->av.eth.stat_rate = ah_attr->static_rate + MLX4_STAT_RATE_OFFSET; while (ah->av.eth.stat_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET && diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c index b88fc8f5ab18..9f8b516eb2b0 100644 --- a/drivers/infiniband/hw/mlx4/cq.c +++ b/drivers/infiniband/hw/mlx4/cq.c @@ -811,9 +811,6 @@ repoll: wc->opcode = IB_WC_MASKED_FETCH_ADD; wc->byte_len = 8; break; - case MLX4_OPCODE_BIND_MW: - wc->opcode = IB_WC_BIND_MW; - break; case MLX4_OPCODE_LSO: wc->opcode = IB_WC_LSO; break; diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 97d6878f9938..1c7ab6cabbb8 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -154,9 +154,9 @@ static struct net_device *mlx4_ib_get_netdev(struct ib_device *device, u8 port_n return dev; } -static int mlx4_ib_update_gids(struct gid_entry *gids, - struct mlx4_ib_dev *ibdev, - u8 port_num) +static int mlx4_ib_update_gids_v1(struct gid_entry *gids, + struct mlx4_ib_dev *ibdev, + u8 port_num) { struct mlx4_cmd_mailbox *mailbox; int err; @@ -187,6 +187,63 @@ static int mlx4_ib_update_gids(struct gid_entry *gids, return err; } +static int mlx4_ib_update_gids_v1_v2(struct gid_entry *gids, + struct mlx4_ib_dev *ibdev, + u8 port_num) +{ + struct mlx4_cmd_mailbox *mailbox; + int err; + struct mlx4_dev *dev = ibdev->dev; + int i; + struct { + union ib_gid gid; + __be32 rsrvd1[2]; + __be16 rsrvd2; + u8 type; + u8 version; + __be32 rsrvd3; + } *gid_tbl; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return -ENOMEM; + + gid_tbl = mailbox->buf; + for (i = 0; i < MLX4_MAX_PORT_GIDS; ++i) { + memcpy(&gid_tbl[i].gid, &gids[i].gid, sizeof(union ib_gid)); + if (gids[i].gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) { + gid_tbl[i].version = 2; + if (!ipv6_addr_v4mapped((struct in6_addr *)&gids[i].gid)) + gid_tbl[i].type = 1; + else + memset(&gid_tbl[i].gid, 0, 12); + } + } + + err = mlx4_cmd(dev, mailbox->dma, + MLX4_SET_PORT_ROCE_ADDR << 8 | port_num, + 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); + if (mlx4_is_bonded(dev)) + err += mlx4_cmd(dev, mailbox->dma, + MLX4_SET_PORT_ROCE_ADDR << 8 | 2, + 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + +static int mlx4_ib_update_gids(struct gid_entry *gids, + struct mlx4_ib_dev *ibdev, + u8 port_num) +{ + if (ibdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2) + return mlx4_ib_update_gids_v1_v2(gids, ibdev, port_num); + + return mlx4_ib_update_gids_v1(gids, ibdev, port_num); +} + static int mlx4_ib_add_gid(struct ib_device *device, u8 port_num, unsigned int index, @@ -215,7 +272,8 @@ static int mlx4_ib_add_gid(struct ib_device *device, port_gid_table = &iboe->gids[port_num - 1]; spin_lock_bh(&iboe->lock); for (i = 0; i < MLX4_MAX_PORT_GIDS; ++i) { - if (!memcmp(&port_gid_table->gids[i].gid, gid, sizeof(*gid))) { + if (!memcmp(&port_gid_table->gids[i].gid, gid, sizeof(*gid)) && + (port_gid_table->gids[i].gid_type == attr->gid_type)) { found = i; break; } @@ -233,6 +291,7 @@ static int mlx4_ib_add_gid(struct ib_device *device, } else { *context = port_gid_table->gids[free].ctx; 
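mlx4_ib_update_gids_v1_v2() above pushes a per-entry version/type pair to the firmware GID table: version 2 marks a RoCE v2 GID, type 1 marks a native IPv6 one, and v4-mapped GIDs get their upper 12 bytes cleared so only the IPv4 address remains. A compact host-side model of that per-entry decision (field names follow the anonymous struct in the hunk; this is a sketch, not the device ABI):

#include <stdio.h>
#include <string.h>

struct gid_tbl_model {
	unsigned char gid[16];
	unsigned char type;      /* 1: RoCE v2 over native IPv6 */
	unsigned char version;   /* 2: RoCE v2 */
};

static void fill_entry(struct gid_tbl_model *e, const unsigned char *gid,
		       int is_roce_v2)
{
	memcpy(e->gid, gid, 16);
	if (!is_roce_v2)
		return;                            /* RoCE v1: defaults */
	e->version = 2;
	if (memcmp(gid, "\0\0\0\0\0\0\0\0\0\0\xff\xff", 12))
		e->type = 1;                       /* native IPv6 GID */
	else
		memset(e->gid, 0, 12);             /* keep IPv4 bytes only */
}

int main(void)
{
	unsigned char v4gid[16] = {0, 0, 0, 0, 0, 0, 0, 0,
				   0, 0, 0xff, 0xff, 192, 0, 2, 1};
	struct gid_tbl_model e;

	memset(&e, 0, sizeof(e));
	fill_entry(&e, v4gid, 1);
	printf("version=%u type=%u\n", e.version, e.type); /* version=2 type=0 */
	return 0;
}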
memcpy(&port_gid_table->gids[free].gid, gid, sizeof(*gid)); + port_gid_table->gids[free].gid_type = attr->gid_type; port_gid_table->gids[free].ctx->real_index = free; port_gid_table->gids[free].ctx->refcount = 1; hw_update = 1; @@ -248,8 +307,10 @@ static int mlx4_ib_add_gid(struct ib_device *device, if (!gids) { ret = -ENOMEM; } else { - for (i = 0; i < MLX4_MAX_PORT_GIDS; i++) + for (i = 0; i < MLX4_MAX_PORT_GIDS; i++) { memcpy(&gids[i].gid, &port_gid_table->gids[i].gid, sizeof(union ib_gid)); + gids[i].gid_type = port_gid_table->gids[i].gid_type; + } } } spin_unlock_bh(&iboe->lock); @@ -325,6 +386,7 @@ int mlx4_ib_gid_index_to_real_index(struct mlx4_ib_dev *ibdev, int i; int ret; unsigned long flags; + struct ib_gid_attr attr; if (port_num > MLX4_MAX_PORTS) return -EINVAL; @@ -335,10 +397,13 @@ int mlx4_ib_gid_index_to_real_index(struct mlx4_ib_dev *ibdev, if (!rdma_cap_roce_gid_table(&ibdev->ib_dev, port_num)) return index; - ret = ib_get_cached_gid(&ibdev->ib_dev, port_num, index, &gid, NULL); + ret = ib_get_cached_gid(&ibdev->ib_dev, port_num, index, &gid, &attr); if (ret) return ret; + if (attr.ndev) + dev_put(attr.ndev); + if (!memcmp(&gid, &zgid, sizeof(gid))) return -EINVAL; @@ -346,7 +411,8 @@ int mlx4_ib_gid_index_to_real_index(struct mlx4_ib_dev *ibdev, port_gid_table = &iboe->gids[port_num - 1]; for (i = 0; i < MLX4_MAX_PORT_GIDS; ++i) - if (!memcmp(&port_gid_table->gids[i].gid, &gid, sizeof(gid))) { + if (!memcmp(&port_gid_table->gids[i].gid, &gid, sizeof(gid)) && + attr.gid_type == port_gid_table->gids[i].gid_type) { ctx = port_gid_table->gids[i].ctx; break; } @@ -2119,6 +2185,7 @@ static int mlx4_port_immutable(struct ib_device *ibdev, u8 port_num, struct ib_port_immutable *immutable) { struct ib_port_attr attr; + struct mlx4_ib_dev *mdev = to_mdev(ibdev); int err; err = mlx4_ib_query_port(ibdev, port_num, &attr); @@ -2128,10 +2195,15 @@ static int mlx4_port_immutable(struct ib_device *ibdev, u8 port_num, immutable->pkey_tbl_len = attr.pkey_tbl_len; immutable->gid_tbl_len = attr.gid_tbl_len; - if (mlx4_ib_port_link_layer(ibdev, port_num) == IB_LINK_LAYER_INFINIBAND) + if (mlx4_ib_port_link_layer(ibdev, port_num) == IB_LINK_LAYER_INFINIBAND) { immutable->core_cap_flags = RDMA_CORE_PORT_IBA_IB; - else - immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE; + } else { + if (mdev->dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE) + immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE; + if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2) + immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE | + RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP; + } immutable->max_mad_size = IB_MGMT_MAD_SIZE; @@ -2283,7 +2355,6 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) if (dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW || dev->caps.bmme_flags & MLX4_BMME_FLAG_TYPE_2_WIN) { ibdev->ib_dev.alloc_mw = mlx4_ib_alloc_mw; - ibdev->ib_dev.bind_mw = mlx4_ib_bind_mw; ibdev->ib_dev.dealloc_mw = mlx4_ib_dealloc_mw; ibdev->ib_dev.uverbs_cmd_mask |= @@ -2423,7 +2494,8 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) if (mlx4_ib_init_sriov(ibdev)) goto err_mad; - if (dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE) { + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE || + dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2) { if (!iboe->nb.notifier_call) { iboe->nb.notifier_call = mlx4_ib_netdev_event; err = register_netdevice_notifier(&iboe->nb); @@ -2432,6 +2504,12 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) goto err_notif; } } + if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2) { + err = mlx4_config_roce_v2_port(dev, 
ROCE_V2_UDP_DPORT); + if (err) { + goto err_notif; + } + } } for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) { diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index 1caa11edac03..52ce7b000044 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -177,11 +177,18 @@ struct mlx4_ib_wq { unsigned tail; }; +enum { + MLX4_IB_QP_CREATE_ROCE_V2_GSI = IB_QP_CREATE_RESERVED_START +}; + enum mlx4_ib_qp_flags { MLX4_IB_QP_LSO = IB_QP_CREATE_IPOIB_UD_LSO, MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK = IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK, MLX4_IB_QP_NETIF = IB_QP_CREATE_NETIF_QP, MLX4_IB_QP_CREATE_USE_GFP_NOIO = IB_QP_CREATE_USE_GFP_NOIO, + + /* Mellanox specific flags start from IB_QP_CREATE_RESERVED_START */ + MLX4_IB_ROCE_V2_GSI_QP = MLX4_IB_QP_CREATE_ROCE_V2_GSI, MLX4_IB_SRIOV_TUNNEL_QP = 1 << 30, MLX4_IB_SRIOV_SQP = 1 << 31, }; @@ -478,6 +485,7 @@ struct gid_cache_context { struct gid_entry { union ib_gid gid; + enum ib_gid_type gid_type; struct gid_cache_context *ctx; }; @@ -704,8 +712,6 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, struct ib_udata *udata); int mlx4_ib_dereg_mr(struct ib_mr *mr); struct ib_mw *mlx4_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type); -int mlx4_ib_bind_mw(struct ib_qp *qp, struct ib_mw *mw, - struct ib_mw_bind *mw_bind); int mlx4_ib_dealloc_mw(struct ib_mw *mw); struct ib_mr *mlx4_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c index 4d1e1c632603..242b94ec105b 100644 --- a/drivers/infiniband/hw/mlx4/mr.c +++ b/drivers/infiniband/hw/mlx4/mr.c @@ -366,28 +366,6 @@ err_free: return ERR_PTR(err); } -int mlx4_ib_bind_mw(struct ib_qp *qp, struct ib_mw *mw, - struct ib_mw_bind *mw_bind) -{ - struct ib_bind_mw_wr wr; - struct ib_send_wr *bad_wr; - int ret; - - memset(&wr, 0, sizeof(wr)); - wr.wr.opcode = IB_WR_BIND_MW; - wr.wr.wr_id = mw_bind->wr_id; - wr.wr.send_flags = mw_bind->send_flags; - wr.mw = mw; - wr.bind_info = mw_bind->bind_info; - wr.rkey = ib_inc_rkey(mw->rkey); - - ret = mlx4_ib_post_send(qp, &wr.wr, &bad_wr); - if (!ret) - mw->rkey = wr.rkey; - - return ret; -} - int mlx4_ib_dealloc_mw(struct ib_mw *ibmw) { struct mlx4_ib_mw *mw = to_mmw(ibmw); diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 13eaaf45288f..bc5536f00b6c 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -32,6 +32,8 @@ */ #include <linux/log2.h> +#include <linux/etherdevice.h> +#include <net/ip.h> #include <linux/slab.h> #include <linux/netdevice.h> #include <linux/vmalloc.h> @@ -85,6 +87,7 @@ struct mlx4_ib_sqp { u32 send_psn; struct ib_ud_header ud_header; u8 header_buf[MLX4_IB_UD_HEADER_SIZE]; + struct ib_qp *roce_v2_gsi; }; enum { @@ -115,7 +118,6 @@ static const __be32 mlx4_ib_opcode[] = { [IB_WR_REG_MR] = cpu_to_be32(MLX4_OPCODE_FMR), [IB_WR_MASKED_ATOMIC_CMP_AND_SWP] = cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_CS), [IB_WR_MASKED_ATOMIC_FETCH_AND_ADD] = cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_FA), - [IB_WR_BIND_MW] = cpu_to_be32(MLX4_OPCODE_BIND_MW), }; static struct mlx4_ib_sqp *to_msqp(struct mlx4_ib_qp *mqp) @@ -154,7 +156,10 @@ static int is_sqp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) } } } - return proxy_sqp; + if (proxy_sqp) + return 1; + + return !!(qp->flags & MLX4_IB_ROCE_V2_GSI_QP); } /* used for INIT/CLOSE port logic */ @@ -796,11 +801,13 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd 
*pd, if (err) goto err_mtt; - qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof(u64), gfp); + qp->sq.wrid = kmalloc_array(qp->sq.wqe_cnt, sizeof(u64), + gfp | __GFP_NOWARN); if (!qp->sq.wrid) qp->sq.wrid = __vmalloc(qp->sq.wqe_cnt * sizeof(u64), gfp, PAGE_KERNEL); - qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof(u64), gfp); + qp->rq.wrid = kmalloc_array(qp->rq.wqe_cnt, sizeof(u64), + gfp | __GFP_NOWARN); if (!qp->rq.wrid) qp->rq.wrid = __vmalloc(qp->rq.wqe_cnt * sizeof(u64), gfp, PAGE_KERNEL); @@ -1099,9 +1106,9 @@ static u32 get_sqp_num(struct mlx4_ib_dev *dev, struct ib_qp_init_attr *attr) return dev->dev->caps.qp1_proxy[attr->port_num - 1]; } -struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, - struct ib_qp_init_attr *init_attr, - struct ib_udata *udata) +static struct ib_qp *_mlx4_ib_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) { struct mlx4_ib_qp *qp = NULL; int err; @@ -1120,6 +1127,7 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, MLX4_IB_SRIOV_TUNNEL_QP | MLX4_IB_SRIOV_SQP | MLX4_IB_QP_NETIF | + MLX4_IB_QP_CREATE_ROCE_V2_GSI | MLX4_IB_QP_CREATE_USE_GFP_NOIO)) return ERR_PTR(-EINVAL); @@ -1128,15 +1136,21 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, return ERR_PTR(-EINVAL); } - if (init_attr->create_flags && - ((udata && init_attr->create_flags & ~(sup_u_create_flags)) || - ((init_attr->create_flags & ~(MLX4_IB_SRIOV_SQP | - MLX4_IB_QP_CREATE_USE_GFP_NOIO | - MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK)) && - init_attr->qp_type != IB_QPT_UD) || - ((init_attr->create_flags & MLX4_IB_SRIOV_SQP) && - init_attr->qp_type > IB_QPT_GSI))) - return ERR_PTR(-EINVAL); + if (init_attr->create_flags) { + if (udata && init_attr->create_flags & ~(sup_u_create_flags)) + return ERR_PTR(-EINVAL); + + if ((init_attr->create_flags & ~(MLX4_IB_SRIOV_SQP | + MLX4_IB_QP_CREATE_USE_GFP_NOIO | + MLX4_IB_QP_CREATE_ROCE_V2_GSI | + MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK) && + init_attr->qp_type != IB_QPT_UD) || + (init_attr->create_flags & MLX4_IB_SRIOV_SQP && + init_attr->qp_type > IB_QPT_GSI) || + (init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI && + init_attr->qp_type != IB_QPT_GSI)) + return ERR_PTR(-EINVAL); + } switch (init_attr->qp_type) { case IB_QPT_XRC_TGT: @@ -1173,19 +1187,29 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, case IB_QPT_SMI: case IB_QPT_GSI: { + int sqpn; + /* Userspace is not allowed to create special QPs: */ if (udata) return ERR_PTR(-EINVAL); + if (init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI) { + int res = mlx4_qp_reserve_range(to_mdev(pd->device)->dev, 1, 1, &sqpn, 0); + + if (res) + return ERR_PTR(res); + } else { + sqpn = get_sqp_num(to_mdev(pd->device), init_attr); + } err = create_qp_common(to_mdev(pd->device), pd, init_attr, udata, - get_sqp_num(to_mdev(pd->device), init_attr), + sqpn, &qp, gfp); if (err) return ERR_PTR(err); qp->port = init_attr->port_num; - qp->ibqp.qp_num = init_attr->qp_type == IB_QPT_SMI ? 0 : 1; - + qp->ibqp.qp_num = init_attr->qp_type == IB_QPT_SMI ? 0 : + init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI ? sqpn : 1; break; } default: @@ -1196,7 +1220,41 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, return &qp->ibqp; } -int mlx4_ib_destroy_qp(struct ib_qp *qp) +struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) { + struct ib_device *device = pd ? 
pd->device : init_attr->xrcd->device; + struct ib_qp *ibqp; + struct mlx4_ib_dev *dev = to_mdev(device); + + ibqp = _mlx4_ib_create_qp(pd, init_attr, udata); + + if (!IS_ERR(ibqp) && + (init_attr->qp_type == IB_QPT_GSI) && + !(init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI)) { + struct mlx4_ib_sqp *sqp = to_msqp((to_mqp(ibqp))); + int is_eth = rdma_cap_eth_ah(&dev->ib_dev, init_attr->port_num); + + if (is_eth && + dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2) { + init_attr->create_flags |= MLX4_IB_QP_CREATE_ROCE_V2_GSI; + sqp->roce_v2_gsi = ib_create_qp(pd, init_attr); + + if (IS_ERR(sqp->roce_v2_gsi)) { + pr_err("Failed to create GSI QP for RoCEv2 (%ld)\n", PTR_ERR(sqp->roce_v2_gsi)); + sqp->roce_v2_gsi = NULL; + } else { + sqp = to_msqp(to_mqp(sqp->roce_v2_gsi)); + sqp->qp.flags |= MLX4_IB_ROCE_V2_GSI_QP; + } + + init_attr->create_flags &= ~MLX4_IB_QP_CREATE_ROCE_V2_GSI; + } + } + return ibqp; +} + +static int _mlx4_ib_destroy_qp(struct ib_qp *qp) { struct mlx4_ib_dev *dev = to_mdev(qp->device); struct mlx4_ib_qp *mqp = to_mqp(qp); @@ -1225,6 +1283,20 @@ int mlx4_ib_destroy_qp(struct ib_qp *qp) return 0; } +int mlx4_ib_destroy_qp(struct ib_qp *qp) +{ + struct mlx4_ib_qp *mqp = to_mqp(qp); + + if (mqp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI) { + struct mlx4_ib_sqp *sqp = to_msqp(mqp); + + if (sqp->roce_v2_gsi) + ib_destroy_qp(sqp->roce_v2_gsi); + } + + return _mlx4_ib_destroy_qp(qp); +} + static int to_mlx4_st(struct mlx4_ib_dev *dev, enum mlx4_ib_qp_type type) { switch (type) { @@ -1507,6 +1579,24 @@ static int create_qp_lb_counter(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) return 0; } +enum { + MLX4_QPC_ROCE_MODE_1 = 0, + MLX4_QPC_ROCE_MODE_2 = 2, + MLX4_QPC_ROCE_MODE_UNDEFINED = 0xff +}; + +static u8 gid_type_to_qpc(enum ib_gid_type gid_type) +{ + switch (gid_type) { + case IB_GID_TYPE_ROCE: + return MLX4_QPC_ROCE_MODE_1; + case IB_GID_TYPE_ROCE_UDP_ENCAP: + return MLX4_QPC_ROCE_MODE_2; + default: + return MLX4_QPC_ROCE_MODE_UNDEFINED; + } +} + static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, int attr_mask, enum ib_qp_state cur_state, enum ib_qp_state new_state) @@ -1633,6 +1723,14 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, mlx4_ib_steer_qp_reg(dev, qp, 1); steer_qp = 1; } + + if (ibqp->qp_type == IB_QPT_GSI) { + enum ib_gid_type gid_type = qp->flags & MLX4_IB_ROCE_V2_GSI_QP ? 
+ IB_GID_TYPE_ROCE_UDP_ENCAP : IB_GID_TYPE_ROCE; + u8 qpc_roce_mode = gid_type_to_qpc(gid_type); + + context->rlkey_roce_mode |= (qpc_roce_mode << 6); + } } if (attr_mask & IB_QP_PKEY_INDEX) { @@ -1650,9 +1748,10 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, u16 vlan = 0xffff; u8 smac[ETH_ALEN]; int status = 0; + int is_eth = rdma_cap_eth_ah(&dev->ib_dev, port_num) && + attr->ah_attr.ah_flags & IB_AH_GRH; - if (rdma_cap_eth_ah(&dev->ib_dev, port_num) && - attr->ah_attr.ah_flags & IB_AH_GRH) { + if (is_eth) { int index = attr->ah_attr.grh.sgid_index; status = ib_get_cached_gid(ibqp->device, port_num, @@ -1674,6 +1773,18 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, optpar |= (MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH | MLX4_QP_OPTPAR_SCHED_QUEUE); + + if (is_eth && + (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR)) { + u8 qpc_roce_mode = gid_type_to_qpc(gid_attr.gid_type); + + if (qpc_roce_mode == MLX4_QPC_ROCE_MODE_UNDEFINED) { + err = -EINVAL; + goto out; + } + context->rlkey_roce_mode |= (qpc_roce_mode << 6); + } + } if (attr_mask & IB_QP_TIMEOUT) { @@ -1845,7 +1956,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, sqd_event = 0; if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) - context->rlkey |= (1 << 4); + context->rlkey_roce_mode |= (1 << 4); /* * Before passing a kernel QP to the HW, make sure that the @@ -2022,8 +2133,8 @@ out: return err; } -int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, - int attr_mask, struct ib_udata *udata) +static int _mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) { struct mlx4_ib_dev *dev = to_mdev(ibqp->device); struct mlx4_ib_qp *qp = to_mqp(ibqp); @@ -2126,6 +2237,27 @@ out: return err; } +int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct mlx4_ib_qp *mqp = to_mqp(ibqp); + int ret; + + ret = _mlx4_ib_modify_qp(ibqp, attr, attr_mask, udata); + + if (mqp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI) { + struct mlx4_ib_sqp *sqp = to_msqp(mqp); + int err = 0; + + if (sqp->roce_v2_gsi) + err = ib_modify_qp(sqp->roce_v2_gsi, attr, attr_mask); + if (err) + pr_err("Failed to modify GSI QP for RoCEv2 (%d)\n", + err); + } + return ret; +} + static int vf_get_qp0_qkey(struct mlx4_dev *dev, int qpn, u32 *qkey) { int i; @@ -2168,7 +2300,7 @@ static int build_sriov_qp0_header(struct mlx4_ib_sqp *sqp, if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI_OWNER) send_size += sizeof (struct mlx4_ib_tunnel_header); - ib_ud_header_init(send_size, 1, 0, 0, 0, 0, &sqp->ud_header); + ib_ud_header_init(send_size, 1, 0, 0, 0, 0, 0, 0, &sqp->ud_header); if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI_OWNER) { sqp->ud_header.lrh.service_level = @@ -2252,16 +2384,7 @@ static int build_sriov_qp0_header(struct mlx4_ib_sqp *sqp, return 0; } -static void mlx4_u64_to_smac(u8 *dst_mac, u64 src_mac) -{ - int i; - - for (i = ETH_ALEN; i; i--) { - dst_mac[i - 1] = src_mac & 0xff; - src_mac >>= 8; - } -} - +#define MLX4_ROCEV2_QP1_SPORT 0xC000 static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_ud_wr *wr, void *wqe, unsigned *mlx_seg_len) { @@ -2281,6 +2404,8 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_ud_wr *wr, bool is_eth; bool is_vlan = false; bool is_grh; + bool is_udp = false; + int ip_version = 0; send_size = 0; for (i = 0; i < wr->wr.num_sge; ++i) @@ -2289,6 +2414,8 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_ud_wr *wr, is_eth = 
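/*
 * The create/modify/destroy verbs in this patch become thin wrappers:
 * the GSI QP optionally carries a hidden companion RoCE v2 QP, and each
 * verb is replayed on it. A minimal sketch of that shadow-object shape
 * (all names below are hypothetical):
 */
#include <stdio.h>

struct qp { int id; };

struct gsi_qp {
        struct qp base;
        struct qp *roce_v2_shadow;      /* NULL unless RoCE v2 is in use */
};

static int qp_modify(struct qp *qp, int attr)
{
        (void)qp; (void)attr;           /* placeholder for the real verb */
        return 0;
}

static int gsi_modify(struct gsi_qp *gsi, int attr)
{
        int ret = qp_modify(&gsi->base, attr);

        /* Mirror onto the shadow, as mlx4_ib_modify_qp() now does; a
         * shadow failure is logged but does not fail the verb. */
        if (gsi->roce_v2_shadow && qp_modify(gsi->roce_v2_shadow, attr))
                fprintf(stderr, "shadow modify failed\n");
        return ret;
}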
rdma_port_get_link_layer(sqp->qp.ibqp.device, sqp->qp.port) == IB_LINK_LAYER_ETHERNET; is_grh = mlx4_ib_ah_grh_present(ah); if (is_eth) { + struct ib_gid_attr gid_attr; + if (mlx4_is_mfunc(to_mdev(ib_dev)->dev)) { /* When multi-function is enabled, the ib_core gid * indexes don't necessarily match the hw ones, so @@ -2302,19 +2429,35 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_ud_wr *wr, err = ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.ib.port_pd) >> 24, ah->av.ib.gid_index, &sgid, - NULL); - if (!err && !memcmp(&sgid, &zgid, sizeof(sgid))) - err = -ENOENT; - if (err) + &gid_attr); + if (!err) { + if (gid_attr.ndev) + dev_put(gid_attr.ndev); + if (!memcmp(&sgid, &zgid, sizeof(sgid))) + err = -ENOENT; + } + if (!err) { + is_udp = gid_attr.gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP; + if (is_udp) { + if (ipv6_addr_v4mapped((struct in6_addr *)&sgid)) + ip_version = 4; + else + ip_version = 6; + is_grh = false; + } + } else { return err; + } } - if (ah->av.eth.vlan != cpu_to_be16(0xffff)) { vlan = be16_to_cpu(ah->av.eth.vlan) & 0x0fff; is_vlan = 1; } } - ib_ud_header_init(send_size, !is_eth, is_eth, is_vlan, is_grh, 0, &sqp->ud_header); + err = ib_ud_header_init(send_size, !is_eth, is_eth, is_vlan, is_grh, + ip_version, is_udp, 0, &sqp->ud_header); + if (err) + return err; if (!is_eth) { sqp->ud_header.lrh.service_level = @@ -2323,7 +2466,7 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_ud_wr *wr, sqp->ud_header.lrh.source_lid = cpu_to_be16(ah->av.ib.g_slid & 0x7f); } - if (is_grh) { + if (is_grh || (ip_version == 6)) { sqp->ud_header.grh.traffic_class = (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20) & 0xff; sqp->ud_header.grh.flow_label = @@ -2352,6 +2495,25 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_ud_wr *wr, ah->av.ib.dgid, 16); } + if (ip_version == 4) { + sqp->ud_header.ip4.tos = + (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20) & 0xff; + sqp->ud_header.ip4.id = 0; + sqp->ud_header.ip4.frag_off = htons(IP_DF); + sqp->ud_header.ip4.ttl = ah->av.eth.hop_limit; + + memcpy(&sqp->ud_header.ip4.saddr, + sgid.raw + 12, 4); + memcpy(&sqp->ud_header.ip4.daddr, ah->av.ib.dgid + 12, 4); + sqp->ud_header.ip4.check = ib_ud_ip4_csum(&sqp->ud_header); + } + + if (is_udp) { + sqp->ud_header.udp.dport = htons(ROCE_V2_UDP_DPORT); + sqp->ud_header.udp.sport = htons(MLX4_ROCEV2_QP1_SPORT); + sqp->ud_header.udp.csum = 0; + } + mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE); if (!is_eth) { @@ -2380,34 +2542,27 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_ud_wr *wr, if (is_eth) { struct in6_addr in6; - + u16 ether_type; u16 pcp = (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 29) << 13; + ether_type = (!is_udp) ? MLX4_IB_IBOE_ETHERTYPE : + (ip_version == 4 ? ETH_P_IP : ETH_P_IPV6); + mlx->sched_prio = cpu_to_be16(pcp); + ether_addr_copy(sqp->ud_header.eth.smac_h, ah->av.eth.s_mac); memcpy(sqp->ud_header.eth.dmac_h, ah->av.eth.mac, 6); - /* FIXME: cache smac value? 
*/ memcpy(&ctrl->srcrb_flags16[0], ah->av.eth.mac, 2); memcpy(&ctrl->imm, ah->av.eth.mac + 2, 4); memcpy(&in6, sgid.raw, sizeof(in6)); - if (!mlx4_is_mfunc(to_mdev(ib_dev)->dev)) { - u64 mac = atomic64_read(&to_mdev(ib_dev)->iboe.mac[sqp->qp.port - 1]); - u8 smac[ETH_ALEN]; - - mlx4_u64_to_smac(smac, mac); - memcpy(sqp->ud_header.eth.smac_h, smac, ETH_ALEN); - } else { - /* use the src mac of the tunnel */ - memcpy(sqp->ud_header.eth.smac_h, ah->av.eth.s_mac, ETH_ALEN); - } if (!memcmp(sqp->ud_header.eth.smac_h, sqp->ud_header.eth.dmac_h, 6)) mlx->flags |= cpu_to_be32(MLX4_WQE_CTRL_FORCE_LOOPBACK); if (!is_vlan) { - sqp->ud_header.eth.type = cpu_to_be16(MLX4_IB_IBOE_ETHERTYPE); + sqp->ud_header.eth.type = cpu_to_be16(ether_type); } else { - sqp->ud_header.vlan.type = cpu_to_be16(MLX4_IB_IBOE_ETHERTYPE); + sqp->ud_header.vlan.type = cpu_to_be16(ether_type); sqp->ud_header.vlan.tag = cpu_to_be16(vlan | pcp); } } else { @@ -2528,25 +2683,6 @@ static void set_reg_seg(struct mlx4_wqe_fmr_seg *fseg, fseg->reserved[1] = 0; } -static void set_bind_seg(struct mlx4_wqe_bind_seg *bseg, - struct ib_bind_mw_wr *wr) -{ - bseg->flags1 = - convert_access(wr->bind_info.mw_access_flags) & - cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_READ | - MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_WRITE | - MLX4_WQE_FMR_AND_BIND_PERM_ATOMIC); - bseg->flags2 = 0; - if (wr->mw->type == IB_MW_TYPE_2) - bseg->flags2 |= cpu_to_be32(MLX4_WQE_BIND_TYPE_2); - if (wr->bind_info.mw_access_flags & IB_ZERO_BASED) - bseg->flags2 |= cpu_to_be32(MLX4_WQE_BIND_ZERO_BASED); - bseg->new_rkey = cpu_to_be32(wr->rkey); - bseg->lkey = cpu_to_be32(wr->bind_info.mr->lkey); - bseg->addr = cpu_to_be64(wr->bind_info.addr); - bseg->length = cpu_to_be64(wr->bind_info.length); -} - static void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg, u32 rkey) { memset(iseg, 0, sizeof(*iseg)); @@ -2766,6 +2902,29 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, int i; struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); + if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI) { + struct mlx4_ib_sqp *sqp = to_msqp(qp); + + if (sqp->roce_v2_gsi) { + struct mlx4_ib_ah *ah = to_mah(ud_wr(wr)->ah); + struct ib_gid_attr gid_attr; + union ib_gid gid; + + if (!ib_get_cached_gid(ibqp->device, + be32_to_cpu(ah->av.ib.port_pd) >> 24, + ah->av.ib.gid_index, &gid, + &gid_attr)) { + if (gid_attr.ndev) + dev_put(gid_attr.ndev); + qp = (gid_attr.gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) ? + to_mqp(sqp->roce_v2_gsi) : qp; + } else { + pr_err("Failed to get gid at index %d. 
RoCEv2 will not work properly\n", + ah->av.ib.gid_index); + } + } + } + spin_lock_irqsave(&qp->sq.lock, flags); if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) { err = -EIO; @@ -2867,13 +3026,6 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, size += sizeof(struct mlx4_wqe_fmr_seg) / 16; break; - case IB_WR_BIND_MW: - ctrl->srcrb_flags |= - cpu_to_be32(MLX4_WQE_CTRL_STRONG_ORDER); - set_bind_seg(wqe, bind_mw_wr(wr)); - wqe += sizeof(struct mlx4_wqe_bind_seg); - size += sizeof(struct mlx4_wqe_bind_seg) / 16; - break; default: /* No extra segments required for sends */ break; diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c index c394376ebe06..0597f3eef5d0 100644 --- a/drivers/infiniband/hw/mlx4/srq.c +++ b/drivers/infiniband/hw/mlx4/srq.c @@ -171,7 +171,8 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, if (err) goto err_mtt; - srq->wrid = kmalloc(srq->msrq.max * sizeof (u64), GFP_KERNEL); + srq->wrid = kmalloc_array(srq->msrq.max, sizeof(u64), + GFP_KERNEL | __GFP_NOWARN); if (!srq->wrid) { srq->wrid = __vmalloc(srq->msrq.max * sizeof(u64), GFP_KERNEL, PAGE_KERNEL); diff --git a/drivers/infiniband/hw/mlx5/ah.c b/drivers/infiniband/hw/mlx5/ah.c index 66080580e24d..745efa4cfc71 100644 --- a/drivers/infiniband/hw/mlx5/ah.c +++ b/drivers/infiniband/hw/mlx5/ah.c @@ -32,8 +32,10 @@ #include "mlx5_ib.h" -struct ib_ah *create_ib_ah(struct ib_ah_attr *ah_attr, - struct mlx5_ib_ah *ah) +static struct ib_ah *create_ib_ah(struct mlx5_ib_dev *dev, + struct mlx5_ib_ah *ah, + struct ib_ah_attr *ah_attr, + enum rdma_link_layer ll) { if (ah_attr->ah_flags & IB_AH_GRH) { memcpy(ah->av.rgid, &ah_attr->grh.dgid, 16); @@ -44,9 +46,20 @@ struct ib_ah *create_ib_ah(struct ib_ah_attr *ah_attr, ah->av.tclass = ah_attr->grh.traffic_class; } - ah->av.rlid = cpu_to_be16(ah_attr->dlid); - ah->av.fl_mlid = ah_attr->src_path_bits & 0x7f; - ah->av.stat_rate_sl = (ah_attr->static_rate << 4) | (ah_attr->sl & 0xf); + ah->av.stat_rate_sl = (ah_attr->static_rate << 4); + + if (ll == IB_LINK_LAYER_ETHERNET) { + memcpy(ah->av.rmac, ah_attr->dmac, sizeof(ah_attr->dmac)); + ah->av.udp_sport = + mlx5_get_roce_udp_sport(dev, + ah_attr->port_num, + ah_attr->grh.sgid_index); + ah->av.stat_rate_sl |= (ah_attr->sl & 0x7) << 1; + } else { + ah->av.rlid = cpu_to_be16(ah_attr->dlid); + ah->av.fl_mlid = ah_attr->src_path_bits & 0x7f; + ah->av.stat_rate_sl |= (ah_attr->sl & 0xf); + } return &ah->ibah; } @@ -54,12 +67,19 @@ struct ib_ah *create_ib_ah(struct ib_ah_attr *ah_attr, struct ib_ah *mlx5_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr) { struct mlx5_ib_ah *ah; + struct mlx5_ib_dev *dev = to_mdev(pd->device); + enum rdma_link_layer ll; + + ll = pd->device->get_link_layer(pd->device, ah_attr->port_num); + + if (ll == IB_LINK_LAYER_ETHERNET && !(ah_attr->ah_flags & IB_AH_GRH)) + return ERR_PTR(-EINVAL); ah = kzalloc(sizeof(*ah), GFP_ATOMIC); if (!ah) return ERR_PTR(-ENOMEM); - return create_ib_ah(ah_attr, ah); /* never fails */ + return create_ib_ah(dev, ah, ah_attr, ll); /* never fails */ } int mlx5_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr) diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 92ddae101ecc..fd1de31e0611 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -154,9 +154,6 @@ static void handle_good_req(struct ib_wc *wc, struct mlx5_cqe64 *cqe, wc->opcode = IB_WC_MASKED_FETCH_ADD; wc->byte_len = 8; break; - case MLX5_OPCODE_BIND_MW: - wc->opcode = 
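/*
 * create_ib_ah() above packs the mlx5 AV's stat_rate_sl byte two ways:
 * the static rate always lands in bits 7:4; on InfiniBand the 4-bit SL
 * fills bits 3:0, while on Ethernet only a 3-bit SL is carried, shifted
 * into bits 3:1. A checked restatement:
 */
#include <assert.h>
#include <stdint.h>

static uint8_t stat_rate_sl(uint8_t static_rate, uint8_t sl, int is_eth)
{
        uint8_t v = (uint8_t)(static_rate << 4);

        if (is_eth)
                v |= (uint8_t)((sl & 0x7) << 1);
        else
                v |= sl & 0xf;
        return v;
}

int main(void)
{
        assert(stat_rate_sl(3, 5, 1) == 0x3a);  /* 0x30 | (5 << 1) */
        assert(stat_rate_sl(3, 5, 0) == 0x35);  /* 0x30 | 5 */
        return 0;
}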
IB_WC_BIND_MW; - break; case MLX5_OPCODE_UMR: wc->opcode = get_umr_comp(wq, idx); break; @@ -171,6 +168,7 @@ enum { static void handle_responder(struct ib_wc *wc, struct mlx5_cqe64 *cqe, struct mlx5_ib_qp *qp) { + enum rdma_link_layer ll = rdma_port_get_link_layer(qp->ibqp.device, 1); struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.device); struct mlx5_ib_srq *srq; struct mlx5_ib_wq *wq; @@ -236,6 +234,22 @@ static void handle_responder(struct ib_wc *wc, struct mlx5_cqe64 *cqe, } else { wc->pkey_index = 0; } + + if (ll != IB_LINK_LAYER_ETHERNET) + return; + + switch (wc->sl & 0x3) { + case MLX5_CQE_ROCE_L3_HEADER_TYPE_GRH: + wc->network_hdr_type = RDMA_NETWORK_IB; + break; + case MLX5_CQE_ROCE_L3_HEADER_TYPE_IPV6: + wc->network_hdr_type = RDMA_NETWORK_IPV6; + break; + case MLX5_CQE_ROCE_L3_HEADER_TYPE_IPV4: + wc->network_hdr_type = RDMA_NETWORK_IPV4; + break; + } + wc->wc_flags |= IB_WC_WITH_NETWORK_HDR_TYPE; } static void dump_cqe(struct mlx5_ib_dev *dev, struct mlx5_err_cqe *cqe) @@ -760,12 +774,12 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, int eqn; int err; - if (attr->flags) - return ERR_PTR(-EINVAL); - if (entries < 0) return ERR_PTR(-EINVAL); + if (check_cq_create_flags(attr->flags)) + return ERR_PTR(-EOPNOTSUPP); + entries = roundup_pow_of_two(entries + 1); if (entries > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz))) return ERR_PTR(-EINVAL); @@ -779,6 +793,7 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, spin_lock_init(&cq->lock); cq->resize_buf = NULL; cq->resize_umem = NULL; + cq->create_flags = attr->flags; if (context) { err = create_cq_user(dev, udata, context, cq, entries, @@ -796,6 +811,10 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, cq->cqe_size = cqe_size; cqb->ctx.cqe_sz_flags = cqe_sz_to_mlx_sz(cqe_size) << 5; + + if (cq->create_flags & IB_CQ_FLAGS_IGNORE_OVERRUN) + cqb->ctx.cqe_sz_flags |= (1 << 1); + cqb->ctx.log_sz_usr_page = cpu_to_be32((ilog2(entries) << 24) | index); err = mlx5_vector2eqn(dev->mdev, vector, &eqn, &irqn); if (err) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index b0ec175cc6ba..ec737e2287fe 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -40,6 +40,8 @@ #include <linux/io-mapping.h> #include <linux/sched.h> #include <rdma/ib_user_verbs.h> +#include <rdma/ib_addr.h> +#include <rdma/ib_cache.h> #include <linux/mlx5/vport.h> #include <rdma/ib_smi.h> #include <rdma/ib_umem.h> @@ -66,12 +68,14 @@ static char mlx5_version[] = DRIVER_NAME ": Mellanox Connect-IB Infiniband driver v" DRIVER_VERSION " (" DRIVER_RELDATE ")\n"; +enum { + MLX5_ATOMIC_SIZE_QP_8BYTES = 1 << 3, +}; + static enum rdma_link_layer -mlx5_ib_port_link_layer(struct ib_device *device) +mlx5_port_type_cap_to_rdma_ll(int port_type_cap) { - struct mlx5_ib_dev *dev = to_mdev(device); - - switch (MLX5_CAP_GEN(dev->mdev, port_type)) { + switch (port_type_cap) { case MLX5_CAP_PORT_TYPE_IB: return IB_LINK_LAYER_INFINIBAND; case MLX5_CAP_PORT_TYPE_ETH: @@ -81,6 +85,202 @@ mlx5_ib_port_link_layer(struct ib_device *device) } } +static enum rdma_link_layer +mlx5_ib_port_link_layer(struct ib_device *device, u8 port_num) +{ + struct mlx5_ib_dev *dev = to_mdev(device); + int port_type_cap = MLX5_CAP_GEN(dev->mdev, port_type); + + return mlx5_port_type_cap_to_rdma_ll(port_type_cap); +} + +static int mlx5_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *ndev = netdev_notifier_info_to_dev(ptr); + struct mlx5_ib_dev *ibdev = 
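/*
 * handle_responder() above reports which L3 header a RoCE packet
 * carried by decoding the low two bits of the CQE's sl field. A
 * standalone restatement of that mapping; the enum values here are
 * illustrative, the kernel takes the real ones from the mlx5 CQE
 * layout:
 */
enum l3_hdr_type { L3_GRH, L3_IPV6, L3_IPV4 };
enum network_type { NET_IB, NET_IPV6, NET_IPV4 };

static enum network_type network_hdr_type(unsigned int cqe_sl)
{
        switch (cqe_sl & 0x3) {
        case L3_IPV6:
                return NET_IPV6;
        case L3_IPV4:
                return NET_IPV4;
        case L3_GRH:
        default:
                return NET_IB;
        }
}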
container_of(this, struct mlx5_ib_dev, + roce.nb); + + if ((event != NETDEV_UNREGISTER) && (event != NETDEV_REGISTER)) + return NOTIFY_DONE; + + write_lock(&ibdev->roce.netdev_lock); + if (ndev->dev.parent == &ibdev->mdev->pdev->dev) + ibdev->roce.netdev = (event == NETDEV_UNREGISTER) ? NULL : ndev; + write_unlock(&ibdev->roce.netdev_lock); + + return NOTIFY_DONE; +} + +static struct net_device *mlx5_ib_get_netdev(struct ib_device *device, + u8 port_num) +{ + struct mlx5_ib_dev *ibdev = to_mdev(device); + struct net_device *ndev; + + /* Ensure ndev does not disappear before we invoke dev_hold() + */ + read_lock(&ibdev->roce.netdev_lock); + ndev = ibdev->roce.netdev; + if (ndev) + dev_hold(ndev); + read_unlock(&ibdev->roce.netdev_lock); + + return ndev; +} + +static int mlx5_query_port_roce(struct ib_device *device, u8 port_num, + struct ib_port_attr *props) +{ + struct mlx5_ib_dev *dev = to_mdev(device); + struct net_device *ndev; + enum ib_mtu ndev_ib_mtu; + u16 qkey_viol_cntr; + + memset(props, 0, sizeof(*props)); + + props->port_cap_flags |= IB_PORT_CM_SUP; + props->port_cap_flags |= IB_PORT_IP_BASED_GIDS; + + props->gid_tbl_len = MLX5_CAP_ROCE(dev->mdev, + roce_address_table_size); + props->max_mtu = IB_MTU_4096; + props->max_msg_sz = 1 << MLX5_CAP_GEN(dev->mdev, log_max_msg); + props->pkey_tbl_len = 1; + props->state = IB_PORT_DOWN; + props->phys_state = 3; + + mlx5_query_nic_vport_qkey_viol_cntr(dev->mdev, &qkey_viol_cntr); + props->qkey_viol_cntr = qkey_viol_cntr; + + ndev = mlx5_ib_get_netdev(device, port_num); + if (!ndev) + return 0; + + if (netif_running(ndev) && netif_carrier_ok(ndev)) { + props->state = IB_PORT_ACTIVE; + props->phys_state = 5; + } + + ndev_ib_mtu = iboe_get_mtu(ndev->mtu); + + dev_put(ndev); + + props->active_mtu = min(props->max_mtu, ndev_ib_mtu); + + props->active_width = IB_WIDTH_4X; /* TODO */ + props->active_speed = IB_SPEED_QDR; /* TODO */ + + return 0; +} + +static void ib_gid_to_mlx5_roce_addr(const union ib_gid *gid, + const struct ib_gid_attr *attr, + void *mlx5_addr) +{ +#define MLX5_SET_RA(p, f, v) MLX5_SET(roce_addr_layout, p, f, v) + char *mlx5_addr_l3_addr = MLX5_ADDR_OF(roce_addr_layout, mlx5_addr, + source_l3_address); + void *mlx5_addr_mac = MLX5_ADDR_OF(roce_addr_layout, mlx5_addr, + source_mac_47_32); + + if (!gid) + return; + + ether_addr_copy(mlx5_addr_mac, attr->ndev->dev_addr); + + if (is_vlan_dev(attr->ndev)) { + MLX5_SET_RA(mlx5_addr, vlan_valid, 1); + MLX5_SET_RA(mlx5_addr, vlan_id, vlan_dev_vlan_id(attr->ndev)); + } + + switch (attr->gid_type) { + case IB_GID_TYPE_IB: + MLX5_SET_RA(mlx5_addr, roce_version, MLX5_ROCE_VERSION_1); + break; + case IB_GID_TYPE_ROCE_UDP_ENCAP: + MLX5_SET_RA(mlx5_addr, roce_version, MLX5_ROCE_VERSION_2); + break; + + default: + WARN_ON(true); + } + + if (attr->gid_type != IB_GID_TYPE_IB) { + if (ipv6_addr_v4mapped((void *)gid)) + MLX5_SET_RA(mlx5_addr, roce_l3_type, + MLX5_ROCE_L3_TYPE_IPV4); + else + MLX5_SET_RA(mlx5_addr, roce_l3_type, + MLX5_ROCE_L3_TYPE_IPV6); + } + + if ((attr->gid_type == IB_GID_TYPE_IB) || + !ipv6_addr_v4mapped((void *)gid)) + memcpy(mlx5_addr_l3_addr, gid, sizeof(*gid)); + else + memcpy(&mlx5_addr_l3_addr[12], &gid->raw[12], 4); +} + +static int set_roce_addr(struct ib_device *device, u8 port_num, + unsigned int index, + const union ib_gid *gid, + const struct ib_gid_attr *attr) +{ + struct mlx5_ib_dev *dev = to_mdev(device); + u32 in[MLX5_ST_SZ_DW(set_roce_address_in)]; + u32 out[MLX5_ST_SZ_DW(set_roce_address_out)]; + void *in_addr = MLX5_ADDR_OF(set_roce_address_in, in, 
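/*
 * mlx5_ib_get_netdev() above pairs a notifier that updates a cached
 * pointer under a write lock with readers that take a reference while
 * still holding the read lock, so the device cannot be freed between
 * the load and the hold. A userspace sketch with pthreads; the _Atomic
 * counter stands in for dev_hold():
 */
#include <pthread.h>
#include <stddef.h>

struct netdev { _Atomic int refcnt; };

static pthread_rwlock_t nd_lock = PTHREAD_RWLOCK_INITIALIZER;
static struct netdev *cached_ndev;

static void netdev_event(struct netdev *nd, int registered)
{
        pthread_rwlock_wrlock(&nd_lock);
        cached_ndev = registered ? nd : NULL;   /* NETDEV_(UN)REGISTER */
        pthread_rwlock_unlock(&nd_lock);
}

static struct netdev *get_netdev(void)
{
        struct netdev *nd;

        pthread_rwlock_rdlock(&nd_lock);        /* pin the pointer... */
        nd = cached_ndev;
        if (nd)
                nd->refcnt++;                   /* ...until we hold a ref */
        pthread_rwlock_unlock(&nd_lock);
        return nd;
}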
roce_address); + enum rdma_link_layer ll = mlx5_ib_port_link_layer(device, port_num); + + if (ll != IB_LINK_LAYER_ETHERNET) + return -EINVAL; + + memset(in, 0, sizeof(in)); + + ib_gid_to_mlx5_roce_addr(gid, attr, in_addr); + + MLX5_SET(set_roce_address_in, in, roce_address_index, index); + MLX5_SET(set_roce_address_in, in, opcode, MLX5_CMD_OP_SET_ROCE_ADDRESS); + + memset(out, 0, sizeof(out)); + return mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out)); +} + +static int mlx5_ib_add_gid(struct ib_device *device, u8 port_num, + unsigned int index, const union ib_gid *gid, + const struct ib_gid_attr *attr, + __always_unused void **context) +{ + return set_roce_addr(device, port_num, index, gid, attr); +} + +static int mlx5_ib_del_gid(struct ib_device *device, u8 port_num, + unsigned int index, __always_unused void **context) +{ + return set_roce_addr(device, port_num, index, NULL, NULL); +} + +__be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, u8 port_num, + int index) +{ + struct ib_gid_attr attr; + union ib_gid gid; + + if (ib_get_cached_gid(&dev->ib_dev, port_num, index, &gid, &attr)) + return 0; + + if (!attr.ndev) + return 0; + + dev_put(attr.ndev); + + if (attr.gid_type != IB_GID_TYPE_ROCE_UDP_ENCAP) + return 0; + + return cpu_to_be16(MLX5_CAP_ROCE(dev->mdev, r_roce_min_src_udp_port)); +} + static int mlx5_use_mad_ifc(struct mlx5_ib_dev *dev) { return !dev->mdev->issi; @@ -97,13 +297,35 @@ static int mlx5_get_vport_access_method(struct ib_device *ibdev) if (mlx5_use_mad_ifc(to_mdev(ibdev))) return MLX5_VPORT_ACCESS_METHOD_MAD; - if (mlx5_ib_port_link_layer(ibdev) == + if (mlx5_ib_port_link_layer(ibdev, 1) == IB_LINK_LAYER_ETHERNET) return MLX5_VPORT_ACCESS_METHOD_NIC; return MLX5_VPORT_ACCESS_METHOD_HCA; } +static void get_atomic_caps(struct mlx5_ib_dev *dev, + struct ib_device_attr *props) +{ + u8 tmp; + u8 atomic_operations = MLX5_CAP_ATOMIC(dev->mdev, atomic_operations); + u8 atomic_size_qp = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp); + u8 atomic_req_8B_endianness_mode = + MLX5_CAP_ATOMIC(dev->mdev, atomic_req_8B_endianess_mode); + + /* Check if HW supports 8 bytes standard atomic operations and capable + * of host endianness respond + */ + tmp = MLX5_ATOMIC_OPS_CMP_SWAP | MLX5_ATOMIC_OPS_FETCH_ADD; + if (((atomic_operations & tmp) == tmp) && + (atomic_size_qp & MLX5_ATOMIC_SIZE_QP_8BYTES) && + (atomic_req_8B_endianness_mode)) { + props->atomic_cap = IB_ATOMIC_HCA; + } else { + props->atomic_cap = IB_ATOMIC_NONE; + } +} + static int mlx5_query_system_image_guid(struct ib_device *ibdev, __be64 *sys_image_guid) { @@ -119,13 +341,21 @@ static int mlx5_query_system_image_guid(struct ib_device *ibdev, case MLX5_VPORT_ACCESS_METHOD_HCA: err = mlx5_query_hca_vport_system_image_guid(mdev, &tmp); - if (!err) - *sys_image_guid = cpu_to_be64(tmp); - return err; + break; + + case MLX5_VPORT_ACCESS_METHOD_NIC: + err = mlx5_query_nic_vport_system_image_guid(mdev, &tmp); + break; default: return -EINVAL; } + + if (!err) + *sys_image_guid = cpu_to_be64(tmp); + + return err; + } static int mlx5_query_max_pkeys(struct ib_device *ibdev, @@ -179,13 +409,20 @@ static int mlx5_query_node_guid(struct mlx5_ib_dev *dev, case MLX5_VPORT_ACCESS_METHOD_HCA: err = mlx5_query_hca_vport_node_guid(dev->mdev, &tmp); - if (!err) - *node_guid = cpu_to_be64(tmp); - return err; + break; + + case MLX5_VPORT_ACCESS_METHOD_NIC: + err = mlx5_query_nic_vport_node_guid(dev->mdev, &tmp); + break; default: return -EINVAL; } + + if (!err) + *node_guid = cpu_to_be64(tmp); + + return err; } struct mlx5_reg_node_desc 
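/*
 * get_atomic_caps() above advertises IB_ATOMIC_HCA only when every
 * required bit is present: both atomic ops, 8-byte operands, and
 * host-endian responses. Note "(ops & want) == want" checks all wanted
 * bits, not just any. MLX5_ATOMIC_SIZE_QP_8BYTES = 1 << 3 is visible in
 * the patch; the op bit values below are illustrative:
 */
#include <stdbool.h>

#define OPS_CMP_SWAP    (1u << 0)
#define OPS_FETCH_ADD   (1u << 1)
#define SIZE_QP_8BYTES  (1u << 3)

static bool atomic_cap_hca(unsigned int ops, unsigned int sizes,
                           bool host_endian_resp)
{
        unsigned int want = OPS_CMP_SWAP | OPS_FETCH_ADD;

        return (ops & want) == want &&
               (sizes & SIZE_QP_8BYTES) &&
               host_endian_resp;
}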
{ @@ -263,6 +500,10 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, if (MLX5_CAP_GEN(mdev, block_lb_mc)) props->device_cap_flags |= IB_DEVICE_BLOCK_MULTICAST_LOOPBACK; + if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) && + (MLX5_CAP_ETH(dev->mdev, csum_cap))) + props->device_cap_flags |= IB_DEVICE_RAW_IP_CSUM; + props->vendor_part_id = mdev->pdev->device; props->hw_ver = mdev->pdev->revision; @@ -278,7 +519,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, props->max_sge = min(max_rq_sg, max_sq_sg); props->max_sge_rd = props->max_sge; props->max_cq = 1 << MLX5_CAP_GEN(mdev, log_max_cq); - props->max_cqe = (1 << MLX5_CAP_GEN(mdev, log_max_eq_sz)) - 1; + props->max_cqe = (1 << MLX5_CAP_GEN(mdev, log_max_cq_sz)) - 1; props->max_mr = 1 << MLX5_CAP_GEN(mdev, log_max_mkey); props->max_pd = 1 << MLX5_CAP_GEN(mdev, log_max_pd); props->max_qp_rd_atom = 1 << MLX5_CAP_GEN(mdev, log_max_ra_req_qp); @@ -289,13 +530,15 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp; props->max_srq_sge = max_rq_sg - 1; props->max_fast_reg_page_list_len = (unsigned int)-1; - props->atomic_cap = IB_ATOMIC_NONE; + get_atomic_caps(dev, props); props->masked_atomic_cap = IB_ATOMIC_NONE; props->max_mcast_grp = 1 << MLX5_CAP_GEN(mdev, log_max_mcg); props->max_mcast_qp_attach = MLX5_CAP_GEN(mdev, max_qp_mcg); props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * props->max_mcast_grp; props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */ + props->hca_core_clock = MLX5_CAP_GEN(mdev, device_frequency_khz); + props->timestamp_mask = 0x7FFFFFFFFFFFFFFFULL; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING if (MLX5_CAP_GEN(mdev, pg)) @@ -303,6 +546,9 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, props->odp_caps = dev->odp_caps; #endif + if (MLX5_CAP_GEN(mdev, cd)) + props->device_cap_flags |= IB_DEVICE_CROSS_CHANNEL; + return 0; } @@ -483,6 +729,9 @@ int mlx5_ib_query_port(struct ib_device *ibdev, u8 port, case MLX5_VPORT_ACCESS_METHOD_HCA: return mlx5_query_hca_port(ibdev, port, props); + case MLX5_VPORT_ACCESS_METHOD_NIC: + return mlx5_query_port_roce(ibdev, port, props); + default: return -EINVAL; } @@ -583,8 +832,8 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, struct ib_udata *udata) { struct mlx5_ib_dev *dev = to_mdev(ibdev); - struct mlx5_ib_alloc_ucontext_req_v2 req; - struct mlx5_ib_alloc_ucontext_resp resp; + struct mlx5_ib_alloc_ucontext_req_v2 req = {}; + struct mlx5_ib_alloc_ucontext_resp resp = {}; struct mlx5_ib_ucontext *context; struct mlx5_uuar_info *uuari; struct mlx5_uar *uars; @@ -599,20 +848,22 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, if (!dev->ib_active) return ERR_PTR(-EAGAIN); - memset(&req, 0, sizeof(req)); + if (udata->inlen < sizeof(struct ib_uverbs_cmd_hdr)) + return ERR_PTR(-EINVAL); + reqlen = udata->inlen - sizeof(struct ib_uverbs_cmd_hdr); if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req)) ver = 0; - else if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req_v2)) + else if (reqlen >= sizeof(struct mlx5_ib_alloc_ucontext_req_v2)) ver = 2; else return ERR_PTR(-EINVAL); - err = ib_copy_from_udata(&req, udata, reqlen); + err = ib_copy_from_udata(&req, udata, min(reqlen, sizeof(req))); if (err) return ERR_PTR(err); - if (req.flags || req.reserved) + if (req.flags) return ERR_PTR(-EINVAL); if (req.total_num_uuars > MLX5_MAX_UUARS) @@ -621,6 +872,14 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct 
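/*
 * The rework of mlx5_ib_alloc_ucontext() here accepts requests both
 * shorter and longer than the structure the kernel knows: it copies
 * min(reqlen, sizeof(req)) and rejects a longer request only if the
 * bytes it cannot interpret are non-zero, which is what
 * ib_is_udata_cleared() verifies just below. A self-contained model of
 * that contract:
 */
#include <stddef.h>
#include <string.h>

struct ucontext_req { unsigned int flags, total_num_uuars; };

static int parse_ucontext_req(const unsigned char *udata, size_t reqlen,
                              struct ucontext_req *req)
{
        size_t copy = reqlen < sizeof(*req) ? reqlen : sizeof(*req);
        size_t i;

        memset(req, 0, sizeof(*req));
        memcpy(req, udata, copy);

        for (i = sizeof(*req); i < reqlen; i++) /* ib_is_udata_cleared() */
                if (udata[i])
                        return -1;              /* -EOPNOTSUPP in the patch */
        return 0;
}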
ib_device *ibdev, if (req.total_num_uuars == 0) return ERR_PTR(-EINVAL); + if (req.comp_mask || req.reserved0 || req.reserved1 || req.reserved2) + return ERR_PTR(-EOPNOTSUPP); + + if (reqlen > sizeof(req) && + !ib_is_udata_cleared(udata, sizeof(req), + reqlen - sizeof(req))) + return ERR_PTR(-EOPNOTSUPP); + req.total_num_uuars = ALIGN(req.total_num_uuars, MLX5_NON_FP_BF_REGS_PER_PAGE); if (req.num_low_latency_uuars > req.total_num_uuars - 1) @@ -636,6 +895,11 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, resp.max_send_wqebb = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz); resp.max_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz); resp.max_srq_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz); + resp.cqe_version = min_t(__u8, + (__u8)MLX5_CAP_GEN(dev->mdev, cqe_version), + req.max_cqe_version); + resp.response_length = min(offsetof(typeof(resp), response_length) + + sizeof(resp.response_length), udata->outlen); context = kzalloc(sizeof(*context), GFP_KERNEL); if (!context) @@ -681,22 +945,49 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, context->ibucontext.invalidate_range = &mlx5_ib_invalidate_range; #endif + if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) { + err = mlx5_core_alloc_transport_domain(dev->mdev, + &context->tdn); + if (err) + goto out_uars; + } + INIT_LIST_HEAD(&context->db_page_list); mutex_init(&context->db_page_mutex); resp.tot_uuars = req.total_num_uuars; resp.num_ports = MLX5_CAP_GEN(dev->mdev, num_ports); - err = ib_copy_to_udata(udata, &resp, - sizeof(resp) - sizeof(resp.reserved)); + + if (field_avail(typeof(resp), cqe_version, udata->outlen)) + resp.response_length += sizeof(resp.cqe_version); + + if (field_avail(typeof(resp), hca_core_clock_offset, udata->outlen)) { + resp.comp_mask |= + MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET; + resp.hca_core_clock_offset = + offsetof(struct mlx5_init_seg, internal_timer_h) % + PAGE_SIZE; + resp.response_length += sizeof(resp.hca_core_clock_offset) + + sizeof(resp.reserved2) + + sizeof(resp.reserved3); + } + + err = ib_copy_to_udata(udata, &resp, resp.response_length); if (err) - goto out_uars; + goto out_td; uuari->ver = ver; uuari->num_low_latency_uuars = req.num_low_latency_uuars; uuari->uars = uars; uuari->num_uars = num_uars; + context->cqe_version = resp.cqe_version; + return &context->ibucontext; +out_td: + if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) + mlx5_core_dealloc_transport_domain(dev->mdev, context->tdn); + out_uars: for (i--; i >= 0; i--) mlx5_cmd_free_uar(dev->mdev, uars[i].index); @@ -721,6 +1012,9 @@ static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) struct mlx5_uuar_info *uuari = &context->uuari; int i; + if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) + mlx5_core_dealloc_transport_domain(dev->mdev, context->tdn); + for (i = 0; i < uuari->num_uars; i++) { if (mlx5_cmd_free_uar(dev->mdev, uuari->uars[i].index)) mlx5_ib_warn(dev, "failed to free UAR 0x%x\n", uuari->uars[i].index); @@ -790,6 +1084,30 @@ static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vm case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES: return -ENOSYS; + case MLX5_IB_MMAP_CORE_CLOCK: + if (vma->vm_end - vma->vm_start != PAGE_SIZE) + return -EINVAL; + + if (vma->vm_flags & (VM_WRITE | VM_EXEC)) + return -EPERM; + + /* Don't expose to user-space information it shouldn't have */ + if (PAGE_SIZE > 4096) + return -EOPNOTSUPP; + + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + pfn = 
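/*
 * The response building above only writes fields the caller's output
 * buffer can hold, using field_avail() (defined later in mlx5_ib.h in
 * this patch) and reporting the bytes actually produced via
 * response_length. A simplified standalone model of that growth
 * pattern:
 */
#include <stddef.h>

#define field_avail(type, fld, sz) \
        (offsetof(type, fld) + sizeof(((type *)0)->fld) <= (sz))

struct uresp {
        size_t response_length;
        unsigned int tot_uuars;
        unsigned int cqe_version;       /* optional: newer-ABI field */
};

static void build_uresp(struct uresp *r, size_t outlen)
{
        size_t mandatory = offsetof(struct uresp, cqe_version);

        r->response_length = mandatory < outlen ? mandatory : outlen;
        if (field_avail(struct uresp, cqe_version, outlen)) {
                r->cqe_version = 1;
                r->response_length += sizeof(r->cqe_version);
        }
}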
(dev->mdev->iseg_base + + offsetof(struct mlx5_init_seg, internal_timer_h)) >> + PAGE_SHIFT; + if (io_remap_pfn_range(vma, vma->vm_start, pfn, + PAGE_SIZE, vma->vm_page_prot)) + return -EAGAIN; + + mlx5_ib_dbg(dev, "mapped internal timer at 0x%lx, PA 0x%llx\n", + vma->vm_start, + (unsigned long long)pfn << PAGE_SHIFT); + break; + default: return -EINVAL; } @@ -1758,6 +2076,32 @@ static void destroy_dev_resources(struct mlx5_ib_resources *devr) mlx5_ib_dealloc_pd(devr->p0); } +static u32 get_core_cap_flags(struct ib_device *ibdev) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, 1); + u8 l3_type_cap = MLX5_CAP_ROCE(dev->mdev, l3_type); + u8 roce_version_cap = MLX5_CAP_ROCE(dev->mdev, roce_version); + u32 ret = 0; + + if (ll == IB_LINK_LAYER_INFINIBAND) + return RDMA_CORE_PORT_IBA_IB; + + if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV4_CAP)) + return 0; + + if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV6_CAP)) + return 0; + + if (roce_version_cap & MLX5_ROCE_VERSION_1_CAP) + ret |= RDMA_CORE_PORT_IBA_ROCE; + + if (roce_version_cap & MLX5_ROCE_VERSION_2_CAP) + ret |= RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP; + + return ret; +} + static int mlx5_port_immutable(struct ib_device *ibdev, u8 port_num, struct ib_port_immutable *immutable) { @@ -1770,20 +2114,50 @@ static int mlx5_port_immutable(struct ib_device *ibdev, u8 port_num, immutable->pkey_tbl_len = attr.pkey_tbl_len; immutable->gid_tbl_len = attr.gid_tbl_len; - immutable->core_cap_flags = RDMA_CORE_PORT_IBA_IB; + immutable->core_cap_flags = get_core_cap_flags(ibdev); immutable->max_mad_size = IB_MGMT_MAD_SIZE; return 0; } +static int mlx5_enable_roce(struct mlx5_ib_dev *dev) +{ + int err; + + dev->roce.nb.notifier_call = mlx5_netdev_event; + err = register_netdevice_notifier(&dev->roce.nb); + if (err) + return err; + + err = mlx5_nic_vport_enable_roce(dev->mdev); + if (err) + goto err_unregister_netdevice_notifier; + + return 0; + +err_unregister_netdevice_notifier: + unregister_netdevice_notifier(&dev->roce.nb); + return err; +} + +static void mlx5_disable_roce(struct mlx5_ib_dev *dev) +{ + mlx5_nic_vport_disable_roce(dev->mdev); + unregister_netdevice_notifier(&dev->roce.nb); +} + static void *mlx5_ib_add(struct mlx5_core_dev *mdev) { struct mlx5_ib_dev *dev; + enum rdma_link_layer ll; + int port_type_cap; int err; int i; - /* don't create IB instance over Eth ports, no RoCE yet! 
*/ - if (MLX5_CAP_GEN(mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH) + port_type_cap = MLX5_CAP_GEN(mdev, port_type); + ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap); + + if ((ll == IB_LINK_LAYER_ETHERNET) && !MLX5_CAP_GEN(mdev, roce)) return NULL; printk_once(KERN_INFO "%s", mlx5_version); @@ -1794,6 +2168,7 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) dev->mdev = mdev; + rwlock_init(&dev->roce.netdev_lock); err = get_port_caps(dev); if (err) goto err_dealloc; @@ -1843,7 +2218,12 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) dev->ib_dev.query_device = mlx5_ib_query_device; dev->ib_dev.query_port = mlx5_ib_query_port; + dev->ib_dev.get_link_layer = mlx5_ib_port_link_layer; + if (ll == IB_LINK_LAYER_ETHERNET) + dev->ib_dev.get_netdev = mlx5_ib_get_netdev; dev->ib_dev.query_gid = mlx5_ib_query_gid; + dev->ib_dev.add_gid = mlx5_ib_add_gid; + dev->ib_dev.del_gid = mlx5_ib_del_gid; dev->ib_dev.query_pkey = mlx5_ib_query_pkey; dev->ib_dev.modify_device = mlx5_ib_modify_device; dev->ib_dev.modify_port = mlx5_ib_modify_port; @@ -1893,7 +2273,7 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) (1ull << IB_USER_VERBS_CMD_CLOSE_XRCD); } - if (mlx5_ib_port_link_layer(&dev->ib_dev) == + if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) == IB_LINK_LAYER_ETHERNET) { dev->ib_dev.create_flow = mlx5_ib_create_flow; dev->ib_dev.destroy_flow = mlx5_ib_destroy_flow; @@ -1908,9 +2288,15 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) mutex_init(&dev->flow_db.lock); mutex_init(&dev->cap_mask_mutex); + if (ll == IB_LINK_LAYER_ETHERNET) { + err = mlx5_enable_roce(dev); + if (err) + goto err_dealloc; + } + err = create_dev_resources(&dev->devr); if (err) - goto err_dealloc; + goto err_disable_roce; err = mlx5_ib_odp_init_one(dev); if (err) @@ -1947,6 +2333,10 @@ err_odp: err_rsrc: destroy_dev_resources(&dev->devr); +err_disable_roce: + if (ll == IB_LINK_LAYER_ETHERNET) + mlx5_disable_roce(dev); + err_dealloc: ib_dealloc_device((struct ib_device *)dev); @@ -1956,11 +2346,14 @@ err_dealloc: static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context) { struct mlx5_ib_dev *dev = context; + enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev, 1); ib_unregister_device(&dev->ib_dev); destroy_umrc_res(dev); mlx5_ib_odp_remove_one(dev); destroy_dev_resources(&dev->devr); + if (ll == IB_LINK_LAYER_ETHERNET) + mlx5_disable_roce(dev); ib_dealloc_device(&dev->ib_dev); } diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 1474cccd1e0f..d2b9737baa36 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -42,6 +42,7 @@ #include <linux/mlx5/qp.h> #include <linux/mlx5/srq.h> #include <linux/types.h> +#include <linux/mlx5/transobj.h> #define mlx5_ib_dbg(dev, format, arg...) 
\ pr_debug("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ @@ -55,6 +56,11 @@ pr_err("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ pr_warn("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ __LINE__, current->pid, ##arg) +#define field_avail(type, fld, sz) (offsetof(type, fld) + \ + sizeof(((type *)0)->fld) <= (sz)) +#define MLX5_IB_DEFAULT_UIDX 0xffffff +#define MLX5_USER_ASSIGNED_UIDX_MASK __mlx5_mask(qpc, user_index) + enum { MLX5_IB_MMAP_CMD_SHIFT = 8, MLX5_IB_MMAP_CMD_MASK = 0xff, @@ -62,7 +68,9 @@ enum { enum mlx5_ib_mmap_cmd { MLX5_IB_MMAP_REGULAR_PAGE = 0, - MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES = 1, /* always last */ + MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES = 1, + /* 5 is chosen in order to be compatible with old versions of libmlx5 */ + MLX5_IB_MMAP_CORE_CLOCK = 5, }; enum { @@ -85,6 +93,15 @@ enum mlx5_ib_mad_ifc_flags { MLX5_MAD_IFC_NET_VIEW = 4, }; +enum { + MLX5_CROSS_CHANNEL_UUAR = 0, +}; + +enum { + MLX5_CQE_VERSION_V0, + MLX5_CQE_VERSION_V1, +}; + struct mlx5_ib_ucontext { struct ib_ucontext ibucontext; struct list_head db_page_list; @@ -93,6 +110,9 @@ struct mlx5_ib_ucontext { */ struct mutex db_page_mutex; struct mlx5_uuar_info uuari; + u8 cqe_version; + /* Transport Domain number */ + u32 tdn; }; static inline struct mlx5_ib_ucontext *to_mucontext(struct ib_ucontext *ibucontext) @@ -201,47 +221,70 @@ struct mlx5_ib_pfault { struct mlx5_pagefault mpfault; }; +struct mlx5_ib_ubuffer { + struct ib_umem *umem; + int buf_size; + u64 buf_addr; +}; + +struct mlx5_ib_qp_base { + struct mlx5_ib_qp *container_mibqp; + struct mlx5_core_qp mqp; + struct mlx5_ib_ubuffer ubuffer; +}; + +struct mlx5_ib_qp_trans { + struct mlx5_ib_qp_base base; + u16 xrcdn; + u8 alt_port; + u8 atomic_rd_en; + u8 resp_depth; +}; + struct mlx5_ib_rq { + struct mlx5_ib_qp_base base; + struct mlx5_ib_wq *rq; + struct mlx5_ib_ubuffer ubuffer; + struct mlx5_db *doorbell; u32 tirn; + u8 state; +}; + +struct mlx5_ib_sq { + struct mlx5_ib_qp_base base; + struct mlx5_ib_wq *sq; + struct mlx5_ib_ubuffer ubuffer; + struct mlx5_db *doorbell; + u32 tisn; + u8 state; }; struct mlx5_ib_raw_packet_qp { + struct mlx5_ib_sq sq; struct mlx5_ib_rq rq; }; struct mlx5_ib_qp { struct ib_qp ibqp; union { - struct mlx5_core_qp mqp; - struct mlx5_ib_raw_packet_qp raw_packet_qp; + struct mlx5_ib_qp_trans trans_qp; + struct mlx5_ib_raw_packet_qp raw_packet_qp; }; - struct mlx5_buf buf; struct mlx5_db db; struct mlx5_ib_wq rq; - u32 doorbell_qpn; u8 sq_signal_bits; u8 fm_cache; - int sq_max_wqes_per_wr; - int sq_spare_wqes; struct mlx5_ib_wq sq; - struct ib_umem *umem; - int buf_size; - /* serialize qp state modifications */ struct mutex mutex; - u16 xrcdn; u32 flags; u8 port; - u8 alt_port; - u8 atomic_rd_en; - u8 resp_depth; u8 state; - int mlx_type; int wq_sig; int scat_cqe; int max_inline_data; @@ -284,6 +327,9 @@ struct mlx5_ib_cq_buf { enum mlx5_ib_qp_flags { MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK = 1 << 0, MLX5_IB_QP_SIGNATURE_HANDLING = 1 << 1, + MLX5_IB_QP_CROSS_CHANNEL = 1 << 2, + MLX5_IB_QP_MANAGED_SEND = 1 << 3, + MLX5_IB_QP_MANAGED_RECV = 1 << 4, }; struct mlx5_umr_wr { @@ -326,6 +372,7 @@ struct mlx5_ib_cq { struct mlx5_ib_cq_buf *resize_buf; struct ib_umem *resize_umem; int cqe_size; + u32 create_flags; }; struct mlx5_ib_srq { @@ -449,9 +496,19 @@ struct mlx5_ib_resources { struct ib_srq *s1; }; +struct mlx5_roce { + /* Protect mlx5_ib_get_netdev from invoking dev_hold() with a NULL + * netdev pointer + */ + rwlock_t netdev_lock; + struct net_device *netdev; + struct notifier_block nb; 
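/*
 * The mlx5_ib_qp_base introduced above replaces a direct container_of()
 * with an explicit owner back-pointer: a raw-packet QP owns *two* bases
 * (one for the SQ, one for the RQ), so a single container_of() to
 * struct mlx5_ib_qp can no longer be valid for both. A checked model of
 * the indirection that to_mibqp() switches to:
 */
#include <assert.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct core_qp { int qpn; };
struct qp;

struct qp_base {
        struct qp *container_mibqp;     /* explicit owner back-pointer */
        struct core_qp mqp;
};

struct qp {
        struct qp_base sq_base;         /* a raw-packet QP owns two bases */
        struct qp_base rq_base;
};

static struct qp *to_qp(struct core_qp *mqp)
{
        return container_of(mqp, struct qp_base, mqp)->container_mibqp;
}

int main(void)
{
        struct qp q;

        q.sq_base.container_mibqp = &q;
        q.rq_base.container_mibqp = &q;
        assert(to_qp(&q.sq_base.mqp) == &q);
        assert(to_qp(&q.rq_base.mqp) == &q);
        return 0;
}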
+}; + struct mlx5_ib_dev { struct ib_device ib_dev; struct mlx5_core_dev *mdev; + struct mlx5_roce roce; MLX5_DECLARE_DOORBELL_LOCK(uar_lock); int num_ports; /* serialize update of capability mask @@ -498,7 +555,7 @@ static inline struct mlx5_ib_cq *to_mcq(struct ib_cq *ibcq) static inline struct mlx5_ib_qp *to_mibqp(struct mlx5_core_qp *mqp) { - return container_of(mqp, struct mlx5_ib_qp, mqp); + return container_of(mqp, struct mlx5_ib_qp_base, mqp)->container_mibqp; } static inline struct mlx5_ib_mr *to_mibmr(struct mlx5_core_mr *mmr) @@ -550,8 +607,6 @@ void mlx5_ib_free_srq_wqe(struct mlx5_ib_srq *srq, int wqe_index); int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey, int ignore_bkey, u8 port, const struct ib_wc *in_wc, const struct ib_grh *in_grh, const void *in_mad, void *response_mad); -struct ib_ah *create_ib_ah(struct ib_ah_attr *ah_attr, - struct mlx5_ib_ah *ah); struct ib_ah *mlx5_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr); int mlx5_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr); int mlx5_ib_destroy_ah(struct ib_ah *ah); @@ -578,7 +633,8 @@ int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, struct ib_recv_wr **bad_wr); void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n); int mlx5_ib_read_user_wqe(struct mlx5_ib_qp *qp, int send, int wqe_index, - void *buffer, u32 length); + void *buffer, u32 length, + struct mlx5_ib_qp_base *base); struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, const struct ib_cq_init_attr *attr, struct ib_ucontext *context, @@ -680,6 +736,9 @@ static inline void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp) {} #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ +__be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, u8 port_num, + int index); + static inline void init_query_mad(struct ib_smp *mad) { mad->base_version = 1; @@ -705,4 +764,28 @@ static inline int is_qp1(enum ib_qp_type qp_type) #define MLX5_MAX_UMR_SHIFT 16 #define MLX5_MAX_UMR_PAGES (1 << MLX5_MAX_UMR_SHIFT) +static inline u32 check_cq_create_flags(u32 flags) +{ + /* + * It returns non-zero value for unsupported CQ + * create flags, otherwise it returns zero. 
+ */ + return (flags & ~(IB_CQ_FLAGS_IGNORE_OVERRUN | + IB_CQ_FLAGS_TIMESTAMP_COMPLETION)); +} + +static inline int verify_assign_uidx(u8 cqe_version, u32 cmd_uidx, + u32 *user_index) +{ + if (cqe_version) { + if ((cmd_uidx == MLX5_IB_DEFAULT_UIDX) || + (cmd_uidx & ~MLX5_USER_ASSIGNED_UIDX_MASK)) + return -EINVAL; + *user_index = cmd_uidx; + } else { + *user_index = MLX5_IB_DEFAULT_UIDX; + } + + return 0; +} #endif /* MLX5_IB_H */ diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index aa8391e75385..b8d76361a48d 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -153,14 +153,16 @@ static struct mlx5_ib_mr *mlx5_ib_odp_find_mr_lkey(struct mlx5_ib_dev *dev, static void mlx5_ib_page_fault_resume(struct mlx5_ib_qp *qp, struct mlx5_ib_pfault *pfault, - int error) { + int error) +{ struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device); - int ret = mlx5_core_page_fault_resume(dev->mdev, qp->mqp.qpn, + u32 qpn = qp->trans_qp.base.mqp.qpn; + int ret = mlx5_core_page_fault_resume(dev->mdev, + qpn, pfault->mpfault.flags, error); if (ret) - pr_err("Failed to resolve the page fault on QP 0x%x\n", - qp->mqp.qpn); + pr_err("Failed to resolve the page fault on QP 0x%x\n", qpn); } /* @@ -391,6 +393,7 @@ static int mlx5_ib_mr_initiator_pfault_handler( #if defined(DEBUG) u32 ctrl_wqe_index, ctrl_qpn; #endif + u32 qpn = qp->trans_qp.base.mqp.qpn; ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK; if (ds * MLX5_WQE_DS_UNITS > wqe_length) { @@ -401,7 +404,7 @@ static int mlx5_ib_mr_initiator_pfault_handler( if (ds == 0) { mlx5_ib_err(dev, "Got WQE with zero DS. wqe_index=%x, qpn=%x\n", - wqe_index, qp->mqp.qpn); + wqe_index, qpn); return -EFAULT; } @@ -411,16 +414,16 @@ static int mlx5_ib_mr_initiator_pfault_handler( MLX5_WQE_CTRL_WQE_INDEX_SHIFT; if (wqe_index != ctrl_wqe_index) { mlx5_ib_err(dev, "Got WQE with invalid wqe_index. wqe_index=0x%x, qpn=0x%x ctrl->wqe_index=0x%x\n", - wqe_index, qp->mqp.qpn, + wqe_index, qpn, ctrl_wqe_index); return -EFAULT; } ctrl_qpn = (be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_QPN_MASK) >> MLX5_WQE_CTRL_QPN_SHIFT; - if (qp->mqp.qpn != ctrl_qpn) { + if (qpn != ctrl_qpn) { mlx5_ib_err(dev, "Got WQE with incorrect QP number. wqe_index=0x%x, qpn=0x%x ctrl->qpn=0x%x\n", - wqe_index, qp->mqp.qpn, + wqe_index, qpn, ctrl_qpn); return -EFAULT; } @@ -537,6 +540,7 @@ static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_qp *qp, int resume_with_error = 0; u16 wqe_index = pfault->mpfault.wqe.wqe_index; int requestor = pfault->mpfault.flags & MLX5_PFAULT_REQUESTOR; + u32 qpn = qp->trans_qp.base.mqp.qpn; buffer = (char *)__get_free_page(GFP_KERNEL); if (!buffer) { @@ -546,10 +550,10 @@ static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_qp *qp, } ret = mlx5_ib_read_user_wqe(qp, requestor, wqe_index, buffer, - PAGE_SIZE); + PAGE_SIZE, &qp->trans_qp.base); if (ret < 0) { mlx5_ib_err(dev, "Failed reading a WQE following page fault, error=%x, wqe_index=%x, qpn=%x\n", - -ret, wqe_index, qp->mqp.qpn); + -ret, wqe_index, qpn); resume_with_error = 1; goto resolve_page_fault; } @@ -586,7 +590,8 @@ static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_qp *qp, resolve_page_fault: mlx5_ib_page_fault_resume(qp, pfault, resume_with_error); mlx5_ib_dbg(dev, "PAGE FAULT completed. 
QP 0x%x resume_with_error=%d, flags: 0x%x\n", - qp->mqp.qpn, resume_with_error, pfault->mpfault.flags); + qpn, resume_with_error, + pfault->mpfault.flags); free_page((unsigned long)buffer); } @@ -753,7 +758,7 @@ void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp) qp->disable_page_faults = 1; spin_lock_init(&qp->disable_page_faults_lock); - qp->mqp.pfault_handler = mlx5_ib_pfault_handler; + qp->trans_qp.base.mqp.pfault_handler = mlx5_ib_pfault_handler; for (i = 0; i < MLX5_IB_PAGEFAULT_CONTEXTS; ++i) INIT_WORK(&qp->pagefaults[i].work, mlx5_ib_qp_pfault_action); diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 307bdbca8938..8fb9c27485e1 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -32,6 +32,8 @@ #include <linux/module.h> #include <rdma/ib_umem.h> +#include <rdma/ib_cache.h> +#include <rdma/ib_user_verbs.h> #include "mlx5_ib.h" #include "user.h" @@ -114,14 +116,15 @@ void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n) * Return: the number of bytes copied, or an error code. */ int mlx5_ib_read_user_wqe(struct mlx5_ib_qp *qp, int send, int wqe_index, - void *buffer, u32 length) + void *buffer, u32 length, + struct mlx5_ib_qp_base *base) { struct ib_device *ibdev = qp->ibqp.device; struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_ib_wq *wq = send ? &qp->sq : &qp->rq; size_t offset; size_t wq_end; - struct ib_umem *umem = qp->umem; + struct ib_umem *umem = base->ubuffer.umem; u32 first_copy_length; int wqe_length; int ret; @@ -172,8 +175,10 @@ static void mlx5_ib_qp_event(struct mlx5_core_qp *qp, int type) struct ib_qp *ibqp = &to_mibqp(qp)->ibqp; struct ib_event event; - if (type == MLX5_EVENT_TYPE_PATH_MIG) - to_mibqp(qp)->port = to_mibqp(qp)->alt_port; + if (type == MLX5_EVENT_TYPE_PATH_MIG) { + /* This event is only valid for trans_qps */ + to_mibqp(qp)->port = to_mibqp(qp)->trans_qp.alt_port; + } if (ibqp->event_handler) { event.device = ibqp->device; @@ -366,7 +371,9 @@ static int calc_sq_size(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr, static int set_user_buf_size(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, - struct mlx5_ib_create_qp *ucmd) + struct mlx5_ib_create_qp *ucmd, + struct mlx5_ib_qp_base *base, + struct ib_qp_init_attr *attr) { int desc_sz = 1 << qp->sq.wqe_shift; @@ -391,8 +398,13 @@ static int set_user_buf_size(struct mlx5_ib_dev *dev, return -EINVAL; } - qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + - (qp->sq.wqe_cnt << 6); + if (attr->qp_type == IB_QPT_RAW_PACKET) { + base->ubuffer.buf_size = qp->rq.wqe_cnt << qp->rq.wqe_shift; + qp->raw_packet_qp.sq.ubuffer.buf_size = qp->sq.wqe_cnt << 6; + } else { + base->ubuffer.buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + + (qp->sq.wqe_cnt << 6); + } return 0; } @@ -578,8 +590,8 @@ static int to_mlx5_st(enum ib_qp_type type) case IB_QPT_SMI: return MLX5_QP_ST_QP0; case IB_QPT_GSI: return MLX5_QP_ST_QP1; case IB_QPT_RAW_IPV6: return MLX5_QP_ST_RAW_IPV6; - case IB_QPT_RAW_ETHERTYPE: return MLX5_QP_ST_RAW_ETHERTYPE; case IB_QPT_RAW_PACKET: + case IB_QPT_RAW_ETHERTYPE: return MLX5_QP_ST_RAW_ETHERTYPE; case IB_QPT_MAX: default: return -EINVAL; } @@ -590,13 +602,51 @@ static int uuarn_to_uar_index(struct mlx5_uuar_info *uuari, int uuarn) return uuari->uars[uuarn / MLX5_BF_REGS_PER_PAGE].index; } +static int mlx5_ib_umem_get(struct mlx5_ib_dev *dev, + struct ib_pd *pd, + unsigned long addr, size_t size, + struct ib_umem **umem, + int *npages, int *page_shift, int *ncont, + u32 *offset) +{ + int err; + + *umem = 
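/*
 * The set_user_buf_size() change above splits the user buffer for
 * IB_QPT_RAW_PACKET: the RQ stays in the base ubuffer while the SQ gets
 * its own buffer (each send WQE basic block is 64 bytes, hence the
 * "<< 6"); every other QP type keeps the single combined buffer. A
 * checked restatement of the arithmetic:
 */
#include <assert.h>
#include <stddef.h>

#define SEND_WQE_BB_SHIFT 6             /* 64-byte send WQE basic blocks */

static void buf_sizes(int raw_packet, size_t rq_cnt, int rq_shift,
                      size_t sq_cnt, size_t *base_sz, size_t *sq_sz)
{
        if (raw_packet) {
                *base_sz = rq_cnt << rq_shift;
                *sq_sz = sq_cnt << SEND_WQE_BB_SHIFT;
        } else {
                *base_sz = (rq_cnt << rq_shift) +
                           (sq_cnt << SEND_WQE_BB_SHIFT);
                *sq_sz = 0;
        }
}

int main(void)
{
        size_t base, sq;

        buf_sizes(1, 256, 6, 128, &base, &sq);
        assert(base == 16384 && sq == 8192);
        return 0;
}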
ib_umem_get(pd->uobject->context, addr, size, 0, 0); + if (IS_ERR(*umem)) { + mlx5_ib_dbg(dev, "umem_get failed\n"); + return PTR_ERR(*umem); + } + + mlx5_ib_cont_pages(*umem, addr, npages, page_shift, ncont, NULL); + + err = mlx5_ib_get_buf_offset(addr, *page_shift, offset); + if (err) { + mlx5_ib_warn(dev, "bad offset\n"); + goto err_umem; + } + + mlx5_ib_dbg(dev, "addr 0x%lx, size %zu, npages %d, page_shift %d, ncont %d, offset %d\n", + addr, size, *npages, *page_shift, *ncont, *offset); + + return 0; + +err_umem: + ib_umem_release(*umem); + *umem = NULL; + + return err; +} + static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, struct mlx5_ib_qp *qp, struct ib_udata *udata, + struct ib_qp_init_attr *attr, struct mlx5_create_qp_mbox_in **in, - struct mlx5_ib_create_qp_resp *resp, int *inlen) + struct mlx5_ib_create_qp_resp *resp, int *inlen, + struct mlx5_ib_qp_base *base) { struct mlx5_ib_ucontext *context; struct mlx5_ib_create_qp ucmd; + struct mlx5_ib_ubuffer *ubuffer = &base->ubuffer; int page_shift = 0; int uar_index; int npages; @@ -615,18 +665,23 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, /* * TBD: should come from the verbs when we have the API */ - uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_HIGH); - if (uuarn < 0) { - mlx5_ib_dbg(dev, "failed to allocate low latency UUAR\n"); - mlx5_ib_dbg(dev, "reverting to medium latency\n"); - uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_MEDIUM); + if (qp->flags & MLX5_IB_QP_CROSS_CHANNEL) + /* In CROSS_CHANNEL CQ and QP must use the same UAR */ + uuarn = MLX5_CROSS_CHANNEL_UUAR; + else { + uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_HIGH); if (uuarn < 0) { - mlx5_ib_dbg(dev, "failed to allocate medium latency UUAR\n"); - mlx5_ib_dbg(dev, "reverting to high latency\n"); - uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_LOW); + mlx5_ib_dbg(dev, "failed to allocate low latency UUAR\n"); + mlx5_ib_dbg(dev, "reverting to medium latency\n"); + uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_MEDIUM); if (uuarn < 0) { - mlx5_ib_warn(dev, "uuar allocation failed\n"); - return uuarn; + mlx5_ib_dbg(dev, "failed to allocate medium latency UUAR\n"); + mlx5_ib_dbg(dev, "reverting to high latency\n"); + uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_LOW); + if (uuarn < 0) { + mlx5_ib_warn(dev, "uuar allocation failed\n"); + return uuarn; + } } } } @@ -638,32 +693,20 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, qp->sq.wqe_shift = ilog2(MLX5_SEND_WQE_BB); qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift; - err = set_user_buf_size(dev, qp, &ucmd); + err = set_user_buf_size(dev, qp, &ucmd, base, attr); if (err) goto err_uuar; - if (ucmd.buf_addr && qp->buf_size) { - qp->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, - qp->buf_size, 0, 0); - if (IS_ERR(qp->umem)) { - mlx5_ib_dbg(dev, "umem_get failed\n"); - err = PTR_ERR(qp->umem); + if (ucmd.buf_addr && ubuffer->buf_size) { + ubuffer->buf_addr = ucmd.buf_addr; + err = mlx5_ib_umem_get(dev, pd, ubuffer->buf_addr, + ubuffer->buf_size, + &ubuffer->umem, &npages, &page_shift, + &ncont, &offset); + if (err) goto err_uuar; - } } else { - qp->umem = NULL; - } - - if (qp->umem) { - mlx5_ib_cont_pages(qp->umem, ucmd.buf_addr, &npages, &page_shift, - &ncont, NULL); - err = mlx5_ib_get_buf_offset(ucmd.buf_addr, page_shift, &offset); - if (err) { - mlx5_ib_warn(dev, "bad offset\n"); - goto err_umem; - } - mlx5_ib_dbg(dev, "addr 0x%llx, size %d, npages %d, page_shift 
%d, ncont %d, offset %d\n", - ucmd.buf_addr, qp->buf_size, npages, page_shift, ncont, offset); + ubuffer->umem = NULL; } *inlen = sizeof(**in) + sizeof(*(*in)->pas) * ncont; @@ -672,8 +715,9 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, err = -ENOMEM; goto err_umem; } - if (qp->umem) - mlx5_ib_populate_pas(dev, qp->umem, page_shift, (*in)->pas, 0); + if (ubuffer->umem) + mlx5_ib_populate_pas(dev, ubuffer->umem, page_shift, + (*in)->pas, 0); (*in)->ctx.log_pg_sz_remote_qpn = cpu_to_be32((page_shift - MLX5_ADAPTER_PAGE_SHIFT) << 24); (*in)->ctx.params2 = cpu_to_be32(offset << 6); @@ -704,29 +748,31 @@ err_free: kvfree(*in); err_umem: - if (qp->umem) - ib_umem_release(qp->umem); + if (ubuffer->umem) + ib_umem_release(ubuffer->umem); err_uuar: free_uuar(&context->uuari, uuarn); return err; } -static void destroy_qp_user(struct ib_pd *pd, struct mlx5_ib_qp *qp) +static void destroy_qp_user(struct ib_pd *pd, struct mlx5_ib_qp *qp, + struct mlx5_ib_qp_base *base) { struct mlx5_ib_ucontext *context; context = to_mucontext(pd->uobject->context); mlx5_ib_db_unmap_user(context, &qp->db); - if (qp->umem) - ib_umem_release(qp->umem); + if (base->ubuffer.umem) + ib_umem_release(base->ubuffer.umem); free_uuar(&context->uuari, qp->uuarn); } static int create_kernel_qp(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *init_attr, struct mlx5_ib_qp *qp, - struct mlx5_create_qp_mbox_in **in, int *inlen) + struct mlx5_create_qp_mbox_in **in, int *inlen, + struct mlx5_ib_qp_base *base) { enum mlx5_ib_latency_class lc = MLX5_IB_LATENCY_CLASS_LOW; struct mlx5_uuar_info *uuari; @@ -758,9 +804,9 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev, qp->rq.offset = 0; qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift; - qp->buf_size = err + (qp->rq.wqe_cnt << qp->rq.wqe_shift); + base->ubuffer.buf_size = err + (qp->rq.wqe_cnt << qp->rq.wqe_shift); - err = mlx5_buf_alloc(dev->mdev, qp->buf_size, &qp->buf); + err = mlx5_buf_alloc(dev->mdev, base->ubuffer.buf_size, &qp->buf); if (err) { mlx5_ib_dbg(dev, "err %d\n", err); goto err_uuar; @@ -853,19 +899,304 @@ static int is_connected(enum ib_qp_type qp_type) return 0; } +static int create_raw_packet_qp_tis(struct mlx5_ib_dev *dev, + struct mlx5_ib_sq *sq, u32 tdn) +{ + u32 in[MLX5_ST_SZ_DW(create_tis_in)]; + void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx); + + memset(in, 0, sizeof(in)); + + MLX5_SET(tisc, tisc, transport_domain, tdn); + + return mlx5_core_create_tis(dev->mdev, in, sizeof(in), &sq->tisn); +} + +static void destroy_raw_packet_qp_tis(struct mlx5_ib_dev *dev, + struct mlx5_ib_sq *sq) +{ + mlx5_core_destroy_tis(dev->mdev, sq->tisn); +} + +static int create_raw_packet_qp_sq(struct mlx5_ib_dev *dev, + struct mlx5_ib_sq *sq, void *qpin, + struct ib_pd *pd) +{ + struct mlx5_ib_ubuffer *ubuffer = &sq->ubuffer; + __be64 *pas; + void *in; + void *sqc; + void *qpc = MLX5_ADDR_OF(create_qp_in, qpin, qpc); + void *wq; + int inlen; + int err; + int page_shift = 0; + int npages; + int ncont = 0; + u32 offset = 0; + + err = mlx5_ib_umem_get(dev, pd, ubuffer->buf_addr, ubuffer->buf_size, + &sq->ubuffer.umem, &npages, &page_shift, + &ncont, &offset); + if (err) + return err; + + inlen = MLX5_ST_SZ_BYTES(create_sq_in) + sizeof(u64) * ncont; + in = mlx5_vzalloc(inlen); + if (!in) { + err = -ENOMEM; + goto err_umem; + } + + sqc = MLX5_ADDR_OF(create_sq_in, in, ctx); + MLX5_SET(sqc, sqc, flush_in_error_en, 1); + MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RST); + MLX5_SET(sqc, sqc, user_index, MLX5_GET(qpc, qpc, user_index)); + MLX5_SET(sqc, sqc, cqn, 
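/*
 * A compact model of the UUAR selection rewritten in create_user_qp()
 * above: a cross-channel QP must share UUAR 0 with its CQs, every other
 * QP walks a best-effort ladder of latency classes until one allocation
 * succeeds. alloc_uuar() below is a stub standing in for the driver's
 * allocator.
 */
enum lat_class { CLASS_HIGH, CLASS_MEDIUM, CLASS_LOW, CLASS_COUNT };

static int alloc_uuar(enum lat_class c)
{
        return c == CLASS_LOW ? 1 : -1; /* stub: only the last class works */
}

static int pick_uuarn(int cross_channel)
{
        int c, uuarn = -1;

        if (cross_channel)
                return 0;               /* MLX5_CROSS_CHANNEL_UUAR */

        for (c = CLASS_HIGH; c < CLASS_COUNT && uuarn < 0; c++)
                uuarn = alloc_uuar(c);
        return uuarn;
}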
MLX5_GET(qpc, qpc, cqn_snd)); + MLX5_SET(sqc, sqc, tis_lst_sz, 1); + MLX5_SET(sqc, sqc, tis_num_0, sq->tisn); + + wq = MLX5_ADDR_OF(sqc, sqc, wq); + MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC); + MLX5_SET(wq, wq, pd, MLX5_GET(qpc, qpc, pd)); + MLX5_SET(wq, wq, uar_page, MLX5_GET(qpc, qpc, uar_page)); + MLX5_SET64(wq, wq, dbr_addr, MLX5_GET64(qpc, qpc, dbr_addr)); + MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB)); + MLX5_SET(wq, wq, log_wq_sz, MLX5_GET(qpc, qpc, log_sq_size)); + MLX5_SET(wq, wq, log_wq_pg_sz, page_shift - MLX5_ADAPTER_PAGE_SHIFT); + MLX5_SET(wq, wq, page_offset, offset); + + pas = (__be64 *)MLX5_ADDR_OF(wq, wq, pas); + mlx5_ib_populate_pas(dev, sq->ubuffer.umem, page_shift, pas, 0); + + err = mlx5_core_create_sq_tracked(dev->mdev, in, inlen, &sq->base.mqp); + + kvfree(in); + + if (err) + goto err_umem; + + return 0; + +err_umem: + ib_umem_release(sq->ubuffer.umem); + sq->ubuffer.umem = NULL; + + return err; +} + +static void destroy_raw_packet_qp_sq(struct mlx5_ib_dev *dev, + struct mlx5_ib_sq *sq) +{ + mlx5_core_destroy_sq_tracked(dev->mdev, &sq->base.mqp); + ib_umem_release(sq->ubuffer.umem); +} + +static int get_rq_pas_size(void *qpc) +{ + u32 log_page_size = MLX5_GET(qpc, qpc, log_page_size) + 12; + u32 log_rq_stride = MLX5_GET(qpc, qpc, log_rq_stride); + u32 log_rq_size = MLX5_GET(qpc, qpc, log_rq_size); + u32 page_offset = MLX5_GET(qpc, qpc, page_offset); + u32 po_quanta = 1 << (log_page_size - 6); + u32 rq_sz = 1 << (log_rq_size + 4 + log_rq_stride); + u32 page_size = 1 << log_page_size; + u32 rq_sz_po = rq_sz + (page_offset * po_quanta); + u32 rq_num_pas = (rq_sz_po + page_size - 1) / page_size; + + return rq_num_pas * sizeof(u64); +} + +static int create_raw_packet_qp_rq(struct mlx5_ib_dev *dev, + struct mlx5_ib_rq *rq, void *qpin) +{ + __be64 *pas; + __be64 *qp_pas; + void *in; + void *rqc; + void *wq; + void *qpc = MLX5_ADDR_OF(create_qp_in, qpin, qpc); + int inlen; + int err; + u32 rq_pas_size = get_rq_pas_size(qpc); + + inlen = MLX5_ST_SZ_BYTES(create_rq_in) + rq_pas_size; + in = mlx5_vzalloc(inlen); + if (!in) + return -ENOMEM; + + rqc = MLX5_ADDR_OF(create_rq_in, in, ctx); + MLX5_SET(rqc, rqc, vsd, 1); + MLX5_SET(rqc, rqc, mem_rq_type, MLX5_RQC_MEM_RQ_TYPE_MEMORY_RQ_INLINE); + MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RST); + MLX5_SET(rqc, rqc, flush_in_error_en, 1); + MLX5_SET(rqc, rqc, user_index, MLX5_GET(qpc, qpc, user_index)); + MLX5_SET(rqc, rqc, cqn, MLX5_GET(qpc, qpc, cqn_rcv)); + + wq = MLX5_ADDR_OF(rqc, rqc, wq); + MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC); + MLX5_SET(wq, wq, end_padding_mode, + MLX5_GET64(qpc, qpc, end_padding_mode)); + MLX5_SET(wq, wq, page_offset, MLX5_GET(qpc, qpc, page_offset)); + MLX5_SET(wq, wq, pd, MLX5_GET(qpc, qpc, pd)); + MLX5_SET64(wq, wq, dbr_addr, MLX5_GET64(qpc, qpc, dbr_addr)); + MLX5_SET(wq, wq, log_wq_stride, MLX5_GET(qpc, qpc, log_rq_stride) + 4); + MLX5_SET(wq, wq, log_wq_pg_sz, MLX5_GET(qpc, qpc, log_page_size)); + MLX5_SET(wq, wq, log_wq_sz, MLX5_GET(qpc, qpc, log_rq_size)); + + pas = (__be64 *)MLX5_ADDR_OF(wq, wq, pas); + qp_pas = (__be64 *)MLX5_ADDR_OF(create_qp_in, qpin, pas); + memcpy(pas, qp_pas, rq_pas_size); + + err = mlx5_core_create_rq_tracked(dev->mdev, in, inlen, &rq->base.mqp); + + kvfree(in); + + return err; +} + +static void destroy_raw_packet_qp_rq(struct mlx5_ib_dev *dev, + struct mlx5_ib_rq *rq) +{ + mlx5_core_destroy_rq_tracked(dev->mdev, &rq->base.mqp); +} + +static int create_raw_packet_qp_tir(struct mlx5_ib_dev *dev, + struct mlx5_ib_rq *rq, u32 tdn) +{ + u32 *in; + void 
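/*
 * get_rq_pas_size() above, restated with a worked example: with the
 * qpc log_page_size field 0 (so 4 KiB pages), log_rq_stride 2 and
 * log_rq_size 8, the RQ spans 1 << (8 + 4 + 2) = 16 KiB; with
 * page_offset 0 that is 4 pages, hence 4 PAS entries of 8 bytes each.
 */
#include <assert.h>
#include <stdint.h>

static uint32_t rq_pas_size(uint32_t log_pg, uint32_t log_rq_stride,
                            uint32_t log_rq_size, uint32_t page_offset)
{
        uint32_t log_page_size = log_pg + 12;
        uint32_t po_quanta = 1u << (log_page_size - 6);
        uint32_t rq_sz = 1u << (log_rq_size + 4 + log_rq_stride);
        uint32_t page_size = 1u << log_page_size;
        uint32_t rq_sz_po = rq_sz + page_offset * po_quanta;
        uint32_t num_pas = (rq_sz_po + page_size - 1) / page_size;

        return num_pas * (uint32_t)sizeof(uint64_t);
}

int main(void)
{
        assert(rq_pas_size(0, 2, 8, 0) == 4 * 8);
        return 0;
}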
*tirc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(create_tir_in); + in = mlx5_vzalloc(inlen); + if (!in) + return -ENOMEM; + + tirc = MLX5_ADDR_OF(create_tir_in, in, ctx); + MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_DIRECT); + MLX5_SET(tirc, tirc, inline_rqn, rq->base.mqp.qpn); + MLX5_SET(tirc, tirc, transport_domain, tdn); + + err = mlx5_core_create_tir(dev->mdev, in, inlen, &rq->tirn); + + kvfree(in); + + return err; +} + +static void destroy_raw_packet_qp_tir(struct mlx5_ib_dev *dev, + struct mlx5_ib_rq *rq) +{ + mlx5_core_destroy_tir(dev->mdev, rq->tirn); +} + +static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + struct mlx5_create_qp_mbox_in *in, + struct ib_pd *pd) +{ + struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp; + struct mlx5_ib_sq *sq = &raw_packet_qp->sq; + struct mlx5_ib_rq *rq = &raw_packet_qp->rq; + struct ib_uobject *uobj = pd->uobject; + struct ib_ucontext *ucontext = uobj->context; + struct mlx5_ib_ucontext *mucontext = to_mucontext(ucontext); + int err; + u32 tdn = mucontext->tdn; + + if (qp->sq.wqe_cnt) { + err = create_raw_packet_qp_tis(dev, sq, tdn); + if (err) + return err; + + err = create_raw_packet_qp_sq(dev, sq, in, pd); + if (err) + goto err_destroy_tis; + + sq->base.container_mibqp = qp; + } + + if (qp->rq.wqe_cnt) { + err = create_raw_packet_qp_rq(dev, rq, in); + if (err) + goto err_destroy_sq; + + rq->base.container_mibqp = qp; + + err = create_raw_packet_qp_tir(dev, rq, tdn); + if (err) + goto err_destroy_rq; + } + + qp->trans_qp.base.mqp.qpn = qp->sq.wqe_cnt ? sq->base.mqp.qpn : + rq->base.mqp.qpn; + + return 0; + +err_destroy_rq: + destroy_raw_packet_qp_rq(dev, rq); +err_destroy_sq: + if (!qp->sq.wqe_cnt) + return err; + destroy_raw_packet_qp_sq(dev, sq); +err_destroy_tis: + destroy_raw_packet_qp_tis(dev, sq); + + return err; +} + +static void destroy_raw_packet_qp(struct mlx5_ib_dev *dev, + struct mlx5_ib_qp *qp) +{ + struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp; + struct mlx5_ib_sq *sq = &raw_packet_qp->sq; + struct mlx5_ib_rq *rq = &raw_packet_qp->rq; + + if (qp->rq.wqe_cnt) { + destroy_raw_packet_qp_tir(dev, rq); + destroy_raw_packet_qp_rq(dev, rq); + } + + if (qp->sq.wqe_cnt) { + destroy_raw_packet_qp_sq(dev, sq); + destroy_raw_packet_qp_tis(dev, sq); + } +} + +static void raw_packet_qp_copy_info(struct mlx5_ib_qp *qp, + struct mlx5_ib_raw_packet_qp *raw_packet_qp) +{ + struct mlx5_ib_sq *sq = &raw_packet_qp->sq; + struct mlx5_ib_rq *rq = &raw_packet_qp->rq; + + sq->sq = &qp->sq; + rq->rq = &qp->rq; + sq->doorbell = &qp->db; + rq->doorbell = &qp->db; +} + static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, struct ib_qp_init_attr *init_attr, struct ib_udata *udata, struct mlx5_ib_qp *qp) { struct mlx5_ib_resources *devr = &dev->devr; struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_ib_qp_base *base; struct mlx5_ib_create_qp_resp resp; struct mlx5_create_qp_mbox_in *in; struct mlx5_ib_create_qp ucmd; int inlen = sizeof(*in); int err; + u32 uidx = MLX5_IB_DEFAULT_UIDX; + void *qpc; + + base = init_attr->qp_type == IB_QPT_RAW_PACKET ? 
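/*
 * For raw packet QPs there is no single QPC, so the RQ's mlx5_ib_qp_base is
 * taken as the canonical one: it carries the user-buffer bookkeeping
 * (ubuffer) for the receive ring, while the send ring arrives separately via
 * ucmd.sq_buf_addr later in this function. Every other QP type keeps its
 * state in trans_qp.base.
 */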
+ &qp->raw_packet_qp.rq.base : + &qp->trans_qp.base; - mlx5_ib_odp_create_qp(qp); + if (init_attr->qp_type != IB_QPT_RAW_PACKET) + mlx5_ib_odp_create_qp(qp); mutex_init(&qp->mutex); spin_lock_init(&qp->sq.lock); @@ -880,6 +1211,21 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, } } + if (init_attr->create_flags & + (IB_QP_CREATE_CROSS_CHANNEL | + IB_QP_CREATE_MANAGED_SEND | + IB_QP_CREATE_MANAGED_RECV)) { + if (!MLX5_CAP_GEN(mdev, cd)) { + mlx5_ib_dbg(dev, "cross-channel isn't supported\n"); + return -EINVAL; + } + if (init_attr->create_flags & IB_QP_CREATE_CROSS_CHANNEL) + qp->flags |= MLX5_IB_QP_CROSS_CHANNEL; + if (init_attr->create_flags & IB_QP_CREATE_MANAGED_SEND) + qp->flags |= MLX5_IB_QP_MANAGED_SEND; + if (init_attr->create_flags & IB_QP_CREATE_MANAGED_RECV) + qp->flags |= MLX5_IB_QP_MANAGED_RECV; + } if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) qp->sq_signal_bits = MLX5_WQE_CTRL_CQ_UPDATE; @@ -889,6 +1235,11 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, return -EFAULT; } + err = get_qp_user_index(to_mucontext(pd->uobject->context), + &ucmd, udata->inlen, &uidx); + if (err) + return err; + qp->wq_sig = !!(ucmd.flags & MLX5_QP_FLAG_SIGNATURE); qp->scat_cqe = !!(ucmd.flags & MLX5_QP_FLAG_SCATTER_CQE); } else { @@ -918,11 +1269,13 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, ucmd.sq_wqe_count, max_wqes); return -EINVAL; } - err = create_user_qp(dev, pd, qp, udata, &in, &resp, &inlen); + err = create_user_qp(dev, pd, qp, udata, init_attr, &in, + &resp, &inlen, base); if (err) mlx5_ib_dbg(dev, "err %d\n", err); } else { - err = create_kernel_qp(dev, init_attr, qp, &in, &inlen); + err = create_kernel_qp(dev, init_attr, qp, &in, &inlen, + base); if (err) mlx5_ib_dbg(dev, "err %d\n", err); } @@ -954,6 +1307,13 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, if (qp->flags & MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK) in->ctx.flags_pd |= cpu_to_be32(MLX5_QP_BLOCK_MCAST); + if (qp->flags & MLX5_IB_QP_CROSS_CHANNEL) + in->ctx.params2 |= cpu_to_be32(MLX5_QP_BIT_CC_MASTER); + if (qp->flags & MLX5_IB_QP_MANAGED_SEND) + in->ctx.params2 |= cpu_to_be32(MLX5_QP_BIT_CC_SLAVE_SEND); + if (qp->flags & MLX5_IB_QP_MANAGED_RECV) + in->ctx.params2 |= cpu_to_be32(MLX5_QP_BIT_CC_SLAVE_RECV); + if (qp->scat_cqe && is_connected(init_attr->qp_type)) { int rcqe_sz; int scqe_sz; @@ -1018,26 +1378,35 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, in->ctx.db_rec_addr = cpu_to_be64(qp->db.dma); - err = mlx5_core_create_qp(dev->mdev, &qp->mqp, in, inlen); + if (MLX5_CAP_GEN(mdev, cqe_version) == MLX5_CQE_VERSION_V1) { + qpc = MLX5_ADDR_OF(create_qp_in, in, qpc); + /* 0xffffff means we ask to work with cqe version 0 */ + MLX5_SET(qpc, qpc, user_index, uidx); + } + + if (init_attr->qp_type == IB_QPT_RAW_PACKET) { + qp->raw_packet_qp.sq.ubuffer.buf_addr = ucmd.sq_buf_addr; + raw_packet_qp_copy_info(qp, &qp->raw_packet_qp); + err = create_raw_packet_qp(dev, qp, in, pd); + } else { + err = mlx5_core_create_qp(dev->mdev, &base->mqp, in, inlen); + } + if (err) { mlx5_ib_dbg(dev, "create qp failed\n"); goto err_create; } kvfree(in); - /* Hardware wants QPN written in big-endian order (after - * shifting) for send doorbell. Precompute this value to save - * a little bit when posting sends. 
- */ - qp->doorbell_qpn = swab32(qp->mqp.qpn << 8); - qp->mqp.event = mlx5_ib_qp_event; + base->container_mibqp = qp; + base->mqp.event = mlx5_ib_qp_event; return 0; err_create: if (qp->create_type == MLX5_QP_USER) - destroy_qp_user(pd, qp); + destroy_qp_user(pd, qp, base); else if (qp->create_type == MLX5_QP_KERNEL) destroy_qp_kernel(dev, qp); @@ -1129,11 +1498,11 @@ static void get_cqs(struct mlx5_ib_qp *qp, case IB_QPT_UD: case IB_QPT_RAW_IPV6: case IB_QPT_RAW_ETHERTYPE: + case IB_QPT_RAW_PACKET: *send_cq = to_mcq(qp->ibqp.send_cq); *recv_cq = to_mcq(qp->ibqp.recv_cq); break; - case IB_QPT_RAW_PACKET: case IB_QPT_MAX: default: *send_cq = NULL; @@ -1142,45 +1511,66 @@ static void get_cqs(struct mlx5_ib_qp *qp, } } +static int modify_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + u16 operation); + static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) { struct mlx5_ib_cq *send_cq, *recv_cq; + struct mlx5_ib_qp_base *base = &qp->trans_qp.base; struct mlx5_modify_qp_mbox_in *in; int err; + base = qp->ibqp.qp_type == IB_QPT_RAW_PACKET ? + &qp->raw_packet_qp.rq.base : + &qp->trans_qp.base; + in = kzalloc(sizeof(*in), GFP_KERNEL); if (!in) return; if (qp->state != IB_QPS_RESET) { - mlx5_ib_qp_disable_pagefaults(qp); - if (mlx5_core_qp_modify(dev->mdev, to_mlx5_state(qp->state), - MLX5_QP_STATE_RST, in, 0, &qp->mqp)) - mlx5_ib_warn(dev, "mlx5_ib: modify QP %06x to RESET failed\n", - qp->mqp.qpn); + if (qp->ibqp.qp_type != IB_QPT_RAW_PACKET) { + mlx5_ib_qp_disable_pagefaults(qp); + err = mlx5_core_qp_modify(dev->mdev, + MLX5_CMD_OP_2RST_QP, in, 0, + &base->mqp); + } else { + err = modify_raw_packet_qp(dev, qp, + MLX5_CMD_OP_2RST_QP); + } + if (err) + mlx5_ib_warn(dev, "mlx5_ib: modify QP 0x%06x to RESET failed\n", + base->mqp.qpn); } get_cqs(qp, &send_cq, &recv_cq); if (qp->create_type == MLX5_QP_KERNEL) { mlx5_ib_lock_cqs(send_cq, recv_cq); - __mlx5_ib_cq_clean(recv_cq, qp->mqp.qpn, + __mlx5_ib_cq_clean(recv_cq, base->mqp.qpn, qp->ibqp.srq ? 
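/*
 * Completions are keyed by the hardware qpn, so stale CQEs are matched
 * against base->mqp.qpn rather than qp->ibqp.qp_num (the two differ for
 * special QPs such as QP0/QP1).
 */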
to_msrq(qp->ibqp.srq) : NULL); if (send_cq != recv_cq) - __mlx5_ib_cq_clean(send_cq, qp->mqp.qpn, NULL); + __mlx5_ib_cq_clean(send_cq, base->mqp.qpn, + NULL); mlx5_ib_unlock_cqs(send_cq, recv_cq); } - err = mlx5_core_destroy_qp(dev->mdev, &qp->mqp); - if (err) - mlx5_ib_warn(dev, "failed to destroy QP 0x%x\n", qp->mqp.qpn); - kfree(in); + if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) { + destroy_raw_packet_qp(dev, qp); + } else { + err = mlx5_core_destroy_qp(dev->mdev, &base->mqp); + if (err) + mlx5_ib_warn(dev, "failed to destroy QP 0x%x\n", + base->mqp.qpn); + } + kfree(in); if (qp->create_type == MLX5_QP_KERNEL) destroy_qp_kernel(dev, qp); else if (qp->create_type == MLX5_QP_USER) - destroy_qp_user(&get_pd(qp)->ibpd, qp); + destroy_qp_user(&get_pd(qp)->ibpd, qp, base); } static const char *ib_qp_type_str(enum ib_qp_type type) @@ -1234,6 +1624,16 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, return ERR_PTR(-EINVAL); } dev = to_mdev(to_mxrcd(init_attr->xrcd)->ibxrcd.device); + + if (init_attr->qp_type == IB_QPT_RAW_PACKET) { + if (!pd->uobject) { + mlx5_ib_dbg(dev, "Raw Packet QP is not supported for kernel consumers\n"); + return ERR_PTR(-EINVAL); + } else if (!to_mucontext(pd->uobject->context)->cqe_version) { + mlx5_ib_dbg(dev, "Raw Packet QP is only supported for CQE version > 0\n"); + return ERR_PTR(-EINVAL); + } + } } switch (init_attr->qp_type) { @@ -1250,6 +1650,7 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, } /* fall through */ + case IB_QPT_RAW_PACKET: case IB_QPT_RC: case IB_QPT_UC: case IB_QPT_UD: @@ -1272,19 +1673,19 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, else if (is_qp1(init_attr->qp_type)) qp->ibqp.qp_num = 1; else - qp->ibqp.qp_num = qp->mqp.qpn; + qp->ibqp.qp_num = qp->trans_qp.base.mqp.qpn; mlx5_ib_dbg(dev, "ib qpnum 0x%x, mlx qpn 0x%x, rcqn 0x%x, scqn 0x%x\n", - qp->ibqp.qp_num, qp->mqp.qpn, to_mcq(init_attr->recv_cq)->mcq.cqn, + qp->ibqp.qp_num, qp->trans_qp.base.mqp.qpn, + to_mcq(init_attr->recv_cq)->mcq.cqn, to_mcq(init_attr->send_cq)->mcq.cqn); - qp->xrcdn = xrcdn; + qp->trans_qp.xrcdn = xrcdn; break; case IB_QPT_RAW_IPV6: case IB_QPT_RAW_ETHERTYPE: - case IB_QPT_RAW_PACKET: case IB_QPT_MAX: default: mlx5_ib_dbg(dev, "unsupported qp type %d\n", @@ -1318,12 +1719,12 @@ static __be32 to_mlx5_access_flags(struct mlx5_ib_qp *qp, const struct ib_qp_att if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) dest_rd_atomic = attr->max_dest_rd_atomic; else - dest_rd_atomic = qp->resp_depth; + dest_rd_atomic = qp->trans_qp.resp_depth; if (attr_mask & IB_QP_ACCESS_FLAGS) access_flags = attr->qp_access_flags; else - access_flags = qp->atomic_rd_en; + access_flags = qp->trans_qp.atomic_rd_en; if (!dest_rd_atomic) access_flags &= IB_ACCESS_REMOTE_WRITE; @@ -1360,21 +1761,42 @@ static int ib_rate_to_mlx5(struct mlx5_ib_dev *dev, u8 rate) return rate + MLX5_STAT_RATE_OFFSET; } -static int mlx5_set_path(struct mlx5_ib_dev *dev, const struct ib_ah_attr *ah, +static int modify_raw_packet_eth_prio(struct mlx5_core_dev *dev, + struct mlx5_ib_sq *sq, u8 sl) +{ + void *in; + void *tisc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(modify_tis_in); + in = mlx5_vzalloc(inlen); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_tis_in, in, bitmask.prio, 1); + + tisc = MLX5_ADDR_OF(modify_tis_in, in, ctx); + MLX5_SET(tisc, tisc, prio, ((sl & 0x7) << 1)); + + err = mlx5_core_modify_tis(dev, sq->tisn, in, inlen); + + kvfree(in); + + return err; +} + +static int mlx5_set_path(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + const struct ib_ah_attr *ah, struct mlx5_qp_path 
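/*
 * mlx5_set_path() now takes the QP as well: a raw packet QP has no QPC to
 * carry the IB service level, so the SL is applied by modifying the TIS
 * instead; modify_raw_packet_eth_prio() above folds the low three SL bits
 * into the TIS prio field.
 */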
*path, u8 port, int attr_mask, u32 path_flags, const struct ib_qp_attr *attr) { + enum rdma_link_layer ll = rdma_port_get_link_layer(&dev->ib_dev, port); int err; - path->fl = (path_flags & MLX5_PATH_FLAG_FL) ? 0x80 : 0; - path->free_ar = (path_flags & MLX5_PATH_FLAG_FREE_AR) ? 0x80 : 0; - if (attr_mask & IB_QP_PKEY_INDEX) path->pkey_index = attr->pkey_index; - path->grh_mlid = ah->src_path_bits & 0x7f; - path->rlid = cpu_to_be16(ah->dlid); - if (ah->ah_flags & IB_AH_GRH) { if (ah->grh.sgid_index >= dev->mdev->port_caps[port - 1].gid_table_len) { @@ -1383,7 +1805,27 @@ static int mlx5_set_path(struct mlx5_ib_dev *dev, const struct ib_ah_attr *ah, dev->mdev->port_caps[port - 1].gid_table_len); return -EINVAL; } - path->grh_mlid |= 1 << 7; + } + + if (ll == IB_LINK_LAYER_ETHERNET) { + if (!(ah->ah_flags & IB_AH_GRH)) + return -EINVAL; + memcpy(path->rmac, ah->dmac, sizeof(ah->dmac)); + path->udp_sport = mlx5_get_roce_udp_sport(dev, port, + ah->grh.sgid_index); + path->dci_cfi_prio_sl = (ah->sl & 0x7) << 4; + } else { + path->fl = (path_flags & MLX5_PATH_FLAG_FL) ? 0x80 : 0; + path->free_ar = (path_flags & MLX5_PATH_FLAG_FREE_AR) ? 0x80 : + 0; + path->rlid = cpu_to_be16(ah->dlid); + path->grh_mlid = ah->src_path_bits & 0x7f; + if (ah->ah_flags & IB_AH_GRH) + path->grh_mlid |= 1 << 7; + path->dci_cfi_prio_sl = ah->sl & 0xf; + } + + if (ah->ah_flags & IB_AH_GRH) { path->mgid_index = ah->grh.sgid_index; path->hop_limit = ah->grh.hop_limit; path->tclass_flowlabel = @@ -1401,7 +1843,10 @@ static int mlx5_set_path(struct mlx5_ib_dev *dev, const struct ib_ah_attr *ah, if (attr_mask & IB_QP_TIMEOUT) path->ackto_lt = attr->timeout << 3; - path->sl = ah->sl & 0xf; + if ((qp->ibqp.qp_type == IB_QPT_RAW_PACKET) && qp->sq.wqe_cnt) + return modify_raw_packet_eth_prio(dev->mdev, + &qp->raw_packet_qp.sq, + ah->sl & 0xf); return 0; } @@ -1549,12 +1994,154 @@ static int ib_mask_to_mlx5_opt(int ib_mask) return result; } +static int modify_raw_packet_qp_rq(struct mlx5_core_dev *dev, + struct mlx5_ib_rq *rq, int new_state) +{ + void *in; + void *rqc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(modify_rq_in); + in = mlx5_vzalloc(inlen); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_rq_in, in, rq_state, rq->state); + + rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx); + MLX5_SET(rqc, rqc, state, new_state); + + err = mlx5_core_modify_rq(dev, rq->base.mqp.qpn, in, inlen); + if (err) + goto out; + + rq->state = new_state; + +out: + kvfree(in); + return err; +} + +static int modify_raw_packet_qp_sq(struct mlx5_core_dev *dev, + struct mlx5_ib_sq *sq, int new_state) +{ + void *in; + void *sqc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(modify_sq_in); + in = mlx5_vzalloc(inlen); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_sq_in, in, sq_state, sq->state); + + sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx); + MLX5_SET(sqc, sqc, state, new_state); + + err = mlx5_core_modify_sq(dev, sq->base.mqp.qpn, in, inlen); + if (err) + goto out; + + sq->state = new_state; + +out: + kvfree(in); + return err; +} + +static int modify_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + u16 operation) +{ + struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp; + struct mlx5_ib_rq *rq = &raw_packet_qp->rq; + struct mlx5_ib_sq *sq = &raw_packet_qp->sq; + int rq_state; + int sq_state; + int err; + + switch (operation) { + case MLX5_CMD_OP_RST2INIT_QP: + rq_state = MLX5_RQC_STATE_RDY; + sq_state = MLX5_SQC_STATE_RDY; + break; + case MLX5_CMD_OP_2ERR_QP: + rq_state = MLX5_RQC_STATE_ERR; + sq_state = 
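/*
 * Each verbs-level transition is decomposed into matching SQ/RQ states:
 * RST2INIT readies both rings, the 2ERR/2RST commands flush or reset them,
 * and the INIT->RTR->RTS legs below are no-ops because RDY already means
 * "able to send/receive" for an SQ or RQ.
 */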
MLX5_SQC_STATE_ERR; + break; + case MLX5_CMD_OP_2RST_QP: + rq_state = MLX5_RQC_STATE_RST; + sq_state = MLX5_SQC_STATE_RST; + break; + case MLX5_CMD_OP_INIT2INIT_QP: + case MLX5_CMD_OP_INIT2RTR_QP: + case MLX5_CMD_OP_RTR2RTS_QP: + case MLX5_CMD_OP_RTS2RTS_QP: + /* Nothing to do here... */ + return 0; + default: + WARN_ON(1); + return -EINVAL; + } + + if (qp->rq.wqe_cnt) { + err = modify_raw_packet_qp_rq(dev->mdev, rq, rq_state); + if (err) + return err; + } + + if (qp->sq.wqe_cnt) + return modify_raw_packet_qp_sq(dev->mdev, sq, sq_state); + + return 0; +} + static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, int attr_mask, enum ib_qp_state cur_state, enum ib_qp_state new_state) { + static const u16 optab[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE] = { + [MLX5_QP_STATE_RST] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + [MLX5_QP_STATE_INIT] = MLX5_CMD_OP_RST2INIT_QP, + }, + [MLX5_QP_STATE_INIT] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + [MLX5_QP_STATE_INIT] = MLX5_CMD_OP_INIT2INIT_QP, + [MLX5_QP_STATE_RTR] = MLX5_CMD_OP_INIT2RTR_QP, + }, + [MLX5_QP_STATE_RTR] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_RTR2RTS_QP, + }, + [MLX5_QP_STATE_RTS] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_RTS2RTS_QP, + }, + [MLX5_QP_STATE_SQD] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + }, + [MLX5_QP_STATE_SQER] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_SQERR2RTS_QP, + }, + [MLX5_QP_STATE_ERR] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + } + }; + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); struct mlx5_ib_qp *qp = to_mqp(ibqp); + struct mlx5_ib_qp_base *base = &qp->trans_qp.base; struct mlx5_ib_cq *send_cq, *recv_cq; struct mlx5_qp_context *context; struct mlx5_modify_qp_mbox_in *in; @@ -1564,6 +2151,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, int sqd_event; int mlx5_st; int err; + u16 op; in = kzalloc(sizeof(*in), GFP_KERNEL); if (!in) @@ -1623,7 +2211,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, context->pri_path.port = attr->port_num; if (attr_mask & IB_QP_AV) { - err = mlx5_set_path(dev, &attr->ah_attr, &context->pri_path, + err = mlx5_set_path(dev, qp, &attr->ah_attr, &context->pri_path, attr_mask & IB_QP_PORT ? attr->port_num : qp->port, attr_mask, 0, attr); if (err) @@ -1634,7 +2222,8 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, context->pri_path.ackto_lt |= attr->timeout << 3; if (attr_mask & IB_QP_ALT_PATH) { - err = mlx5_set_path(dev, &attr->alt_ah_attr, &context->alt_path, + err = mlx5_set_path(dev, qp, &attr->alt_ah_attr, + &context->alt_path, attr->alt_port_num, attr_mask, 0, attr); if (err) goto out; @@ -1706,41 +2295,51 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, * again to RTS, and may cause the driver and the device to get out of * sync. 
*/ if (cur_state != IB_QPS_RESET && cur_state != IB_QPS_ERR && - (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR)) + (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR) && + (qp->ibqp.qp_type != IB_QPT_RAW_PACKET)) mlx5_ib_qp_disable_pagefaults(qp); + if (mlx5_cur >= MLX5_QP_NUM_STATE || mlx5_new >= MLX5_QP_NUM_STATE || + !optab[mlx5_cur][mlx5_new]) + goto out; + + op = optab[mlx5_cur][mlx5_new]; optpar = ib_mask_to_mlx5_opt(attr_mask); optpar &= opt_mask[mlx5_cur][mlx5_new][mlx5_st]; in->optparam = cpu_to_be32(optpar); - err = mlx5_core_qp_modify(dev->mdev, to_mlx5_state(cur_state), - to_mlx5_state(new_state), in, sqd_event, - &qp->mqp); + + if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) + err = modify_raw_packet_qp(dev, qp, op); + else + err = mlx5_core_qp_modify(dev->mdev, op, in, sqd_event, + &base->mqp); if (err) goto out; - if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) + if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT && + (qp->ibqp.qp_type != IB_QPT_RAW_PACKET)) mlx5_ib_qp_enable_pagefaults(qp); qp->state = new_state; if (attr_mask & IB_QP_ACCESS_FLAGS) - qp->atomic_rd_en = attr->qp_access_flags; + qp->trans_qp.atomic_rd_en = attr->qp_access_flags; if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) - qp->resp_depth = attr->max_dest_rd_atomic; + qp->trans_qp.resp_depth = attr->max_dest_rd_atomic; if (attr_mask & IB_QP_PORT) qp->port = attr->port_num; if (attr_mask & IB_QP_ALT_PATH) - qp->alt_port = attr->alt_port_num; + qp->trans_qp.alt_port = attr->alt_port_num; /* * If we moved a kernel QP to RESET, clean up all old CQ * entries and reinitialize the QP. */ if (new_state == IB_QPS_RESET && !ibqp->uobject) { - mlx5_ib_cq_clean(recv_cq, qp->mqp.qpn, + mlx5_ib_cq_clean(recv_cq, base->mqp.qpn, ibqp->srq ? to_msrq(ibqp->srq) : NULL); if (send_cq != recv_cq) - mlx5_ib_cq_clean(send_cq, qp->mqp.qpn, NULL); + mlx5_ib_cq_clean(send_cq, base->mqp.qpn, NULL); qp->rq.head = 0; qp->rq.tail = 0; @@ -1765,15 +2364,21 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, enum ib_qp_state cur_state, new_state; int err = -EINVAL; int port; + enum rdma_link_layer ll = IB_LINK_LAYER_UNSPECIFIED; mutex_lock(&qp->mutex); cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state; new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; + if (!(cur_state == new_state && cur_state == IB_QPS_RESET)) { + port = attr_mask & IB_QP_PORT ? 
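/*
 * The link layer is now resolved per port (a dual-port HCA can run IB on one
 * port and Ethernet on the other) because ib_modify_qp_is_ok() applies
 * link-layer specific attribute rules; the RESET->RESET case skips the
 * lookup since no transition is validated there.
 */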
attr->port_num : qp->port; + ll = dev->ib_dev.get_link_layer(&dev->ib_dev, port); + } + if (ibqp->qp_type != MLX5_IB_QPT_REG_UMR && !ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask, - IB_LINK_LAYER_UNSPECIFIED)) + ll)) goto out; if ((attr_mask & IB_QP_PORT) && @@ -2570,7 +3175,7 @@ static void finish_wqe(struct mlx5_ib_qp *qp, ctrl->opmod_idx_opcode = cpu_to_be32(((u32)(qp->sq.cur_post) << 8) | mlx5_opcode | ((u32)opmod << 24)); - ctrl->qpn_ds = cpu_to_be32(size | (qp->mqp.qpn << 8)); + ctrl->qpn_ds = cpu_to_be32(size | (qp->trans_qp.base.mqp.qpn << 8)); ctrl->fm_ce_se |= fence; qp->fm_cache = next_fence; if (unlikely(qp->wq_sig)) @@ -3003,7 +3608,7 @@ static void to_ib_ah_attr(struct mlx5_ib_dev *ibdev, struct ib_ah_attr *ib_ah_at ib_ah_attr->port_num > MLX5_CAP_GEN(dev, num_ports)) return; - ib_ah_attr->sl = path->sl & 0xf; + ib_ah_attr->sl = path->dci_cfi_prio_sl & 0xf; ib_ah_attr->dlid = be16_to_cpu(path->rlid); ib_ah_attr->src_path_bits = path->grh_mlid & 0x7f; @@ -3021,39 +3626,153 @@ static void to_ib_ah_attr(struct mlx5_ib_dev *ibdev, struct ib_ah_attr *ib_ah_at } } -int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, - struct ib_qp_init_attr *qp_init_attr) +static int query_raw_packet_qp_sq_state(struct mlx5_ib_dev *dev, + struct mlx5_ib_sq *sq, + u8 *sq_state) +{ + void *out; + void *sqc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(query_sq_out); + out = mlx5_vzalloc(inlen); + if (!out) + return -ENOMEM; + + err = mlx5_core_query_sq(dev->mdev, sq->base.mqp.qpn, out); + if (err) + goto out; + + sqc = MLX5_ADDR_OF(query_sq_out, out, sq_context); + *sq_state = MLX5_GET(sqc, sqc, state); + sq->state = *sq_state; + +out: + kvfree(out); + return err; +} + +static int query_raw_packet_qp_rq_state(struct mlx5_ib_dev *dev, + struct mlx5_ib_rq *rq, + u8 *rq_state) +{ + void *out; + void *rqc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(query_rq_out); + out = mlx5_vzalloc(inlen); + if (!out) + return -ENOMEM; + + err = mlx5_core_query_rq(dev->mdev, rq->base.mqp.qpn, out); + if (err) + goto out; + + rqc = MLX5_ADDR_OF(query_rq_out, out, rq_context); + *rq_state = MLX5_GET(rqc, rqc, state); + rq->state = *rq_state; + +out: + kvfree(out); + return err; +} + +static int sqrq_state_to_qp_state(u8 sq_state, u8 rq_state, + struct mlx5_ib_qp *qp, u8 *qp_state) +{ + static const u8 sqrq_trans[MLX5_RQ_NUM_STATE][MLX5_SQ_NUM_STATE] = { + [MLX5_RQC_STATE_RST] = { + [MLX5_SQC_STATE_RST] = IB_QPS_RESET, + [MLX5_SQC_STATE_RDY] = MLX5_QP_STATE_BAD, + [MLX5_SQC_STATE_ERR] = MLX5_QP_STATE_BAD, + [MLX5_SQ_STATE_NA] = IB_QPS_RESET, + }, + [MLX5_RQC_STATE_RDY] = { + [MLX5_SQC_STATE_RST] = MLX5_QP_STATE_BAD, + [MLX5_SQC_STATE_RDY] = MLX5_QP_STATE, + [MLX5_SQC_STATE_ERR] = IB_QPS_SQE, + [MLX5_SQ_STATE_NA] = MLX5_QP_STATE, + }, + [MLX5_RQC_STATE_ERR] = { + [MLX5_SQC_STATE_RST] = MLX5_QP_STATE_BAD, + [MLX5_SQC_STATE_RDY] = MLX5_QP_STATE_BAD, + [MLX5_SQC_STATE_ERR] = IB_QPS_ERR, + [MLX5_SQ_STATE_NA] = IB_QPS_ERR, + }, + [MLX5_RQ_STATE_NA] = { + [MLX5_SQC_STATE_RST] = IB_QPS_RESET, + [MLX5_SQC_STATE_RDY] = MLX5_QP_STATE, + [MLX5_SQC_STATE_ERR] = MLX5_QP_STATE, + [MLX5_SQ_STATE_NA] = MLX5_QP_STATE_BAD, + }, + }; + + *qp_state = sqrq_trans[rq_state][sq_state]; + + if (*qp_state == MLX5_QP_STATE_BAD) { + WARN(1, "Buggy Raw Packet QP state, SQ 0x%x state: 0x%x, RQ 0x%x state: 0x%x", + qp->raw_packet_qp.sq.base.mqp.qpn, sq_state, + qp->raw_packet_qp.rq.base.mqp.qpn, rq_state); + return -EINVAL; + } + + if (*qp_state == MLX5_QP_STATE) + *qp_state = 
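/*
 * MLX5_QP_STATE acts as a "no opinion" sentinel in sqrq_trans above: the
 * RDY/NA combinations cannot distinguish INIT, RTR and RTS, so the state
 * last cached in software is kept.
 */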
qp->state; + + return 0; +} + +static int query_raw_packet_qp_state(struct mlx5_ib_dev *dev, + struct mlx5_ib_qp *qp, + u8 *raw_packet_qp_state) +{ + struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp; + struct mlx5_ib_sq *sq = &raw_packet_qp->sq; + struct mlx5_ib_rq *rq = &raw_packet_qp->rq; + int err; + u8 sq_state = MLX5_SQ_STATE_NA; + u8 rq_state = MLX5_RQ_STATE_NA; + + if (qp->sq.wqe_cnt) { + err = query_raw_packet_qp_sq_state(dev, sq, &sq_state); + if (err) + return err; + } + + if (qp->rq.wqe_cnt) { + err = query_raw_packet_qp_rq_state(dev, rq, &rq_state); + if (err) + return err; + } + + return sqrq_state_to_qp_state(sq_state, rq_state, qp, + raw_packet_qp_state); +} + +static int query_qp_attr(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + struct ib_qp_attr *qp_attr) { - struct mlx5_ib_dev *dev = to_mdev(ibqp->device); - struct mlx5_ib_qp *qp = to_mqp(ibqp); struct mlx5_query_qp_mbox_out *outb; struct mlx5_qp_context *context; int mlx5_state; int err = 0; -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - /* - * Wait for any outstanding page faults, in case the user frees memory - * based upon this query's result. - */ - flush_workqueue(mlx5_ib_page_fault_wq); -#endif - - mutex_lock(&qp->mutex); outb = kzalloc(sizeof(*outb), GFP_KERNEL); - if (!outb) { - err = -ENOMEM; - goto out; - } + if (!outb) + return -ENOMEM; + context = &outb->ctx; - err = mlx5_core_qp_query(dev->mdev, &qp->mqp, outb, sizeof(*outb)); + err = mlx5_core_qp_query(dev->mdev, &qp->trans_qp.base.mqp, outb, + sizeof(*outb)); if (err) - goto out_free; + goto out; mlx5_state = be32_to_cpu(context->flags) >> 28; qp->state = to_ib_qp_state(mlx5_state); - qp_attr->qp_state = qp->state; qp_attr->path_mtu = context->mtu_msgmax >> 5; qp_attr->path_mig_state = to_ib_mig_state((be32_to_cpu(context->flags) >> 11) & 0x3); @@ -3087,6 +3806,43 @@ int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr qp_attr->retry_cnt = (be32_to_cpu(context->params1) >> 16) & 0x7; qp_attr->rnr_retry = (be32_to_cpu(context->params1) >> 13) & 0x7; qp_attr->alt_timeout = context->alt_path.ackto_lt >> 3; + +out: + kfree(outb); + return err; +} + +int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) +{ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_ib_qp *qp = to_mqp(ibqp); + int err = 0; + u8 raw_packet_qp_state; + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + /* + * Wait for any outstanding page faults, in case the user frees memory + * based upon this query's result. 
+ */ + flush_workqueue(mlx5_ib_page_fault_wq); +#endif + + mutex_lock(&qp->mutex); + + if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) { + err = query_raw_packet_qp_state(dev, qp, &raw_packet_qp_state); + if (err) + goto out; + qp->state = raw_packet_qp_state; + qp_attr->port_num = 1; + } else { + err = query_qp_attr(dev, qp, qp_attr); + if (err) + goto out; + } + + qp_attr->qp_state = qp->state; qp_attr->cur_qp_state = qp_attr->qp_state; qp_attr->cap.max_recv_wr = qp->rq.wqe_cnt; qp_attr->cap.max_recv_sge = qp->rq.max_gs; @@ -3110,12 +3866,16 @@ int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr if (qp->flags & MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK) qp_init_attr->create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK; + if (qp->flags & MLX5_IB_QP_CROSS_CHANNEL) + qp_init_attr->create_flags |= IB_QP_CREATE_CROSS_CHANNEL; + if (qp->flags & MLX5_IB_QP_MANAGED_SEND) + qp_init_attr->create_flags |= IB_QP_CREATE_MANAGED_SEND; + if (qp->flags & MLX5_IB_QP_MANAGED_RECV) + qp_init_attr->create_flags |= IB_QP_CREATE_MANAGED_RECV; + qp_init_attr->sq_sig_type = qp->sq_signal_bits & MLX5_WQE_CTRL_CQ_UPDATE ? IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; -out_free: - kfree(outb); - out: mutex_unlock(&qp->mutex); return err; diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c index e008505e96e9..4659256cd95e 100644 --- a/drivers/infiniband/hw/mlx5/srq.c +++ b/drivers/infiniband/hw/mlx5/srq.c @@ -78,28 +78,41 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, struct ib_udata *udata, int buf_size, int *inlen) { struct mlx5_ib_dev *dev = to_mdev(pd->device); - struct mlx5_ib_create_srq ucmd; + struct mlx5_ib_create_srq ucmd = {}; size_t ucmdlen; + void *xsrqc; int err; int npages; int page_shift; int ncont; u32 offset; + u32 uidx = MLX5_IB_DEFAULT_UIDX; + int drv_data = udata->inlen - sizeof(struct ib_uverbs_cmd_hdr); - ucmdlen = - (udata->inlen - sizeof(struct ib_uverbs_cmd_hdr) < - sizeof(ucmd)) ? (sizeof(ucmd) - - sizeof(ucmd.reserved)) : sizeof(ucmd); + if (drv_data < 0) + return -EINVAL; + + ucmdlen = (drv_data < sizeof(ucmd)) ? 
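/*
 * ucmdlen is min(provided, expected): an older userspace may send a shorter
 * command (only that much is copied), a newer one a longer command, which is
 * accepted below only if every byte past the known struct is zero
 * (ib_is_udata_cleared) - the usual uverbs forward-compatibility contract.
 */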
+ drv_data : sizeof(ucmd); if (ib_copy_from_udata(&ucmd, udata, ucmdlen)) { mlx5_ib_dbg(dev, "failed copy udata\n"); return -EFAULT; } - if (ucmdlen == sizeof(ucmd) && - ucmd.reserved != 0) + if (ucmd.reserved0 || ucmd.reserved1) return -EINVAL; + if (drv_data > sizeof(ucmd) && + !ib_is_udata_cleared(udata, sizeof(ucmd), + drv_data - sizeof(ucmd))) + return -EINVAL; + + err = get_srq_user_index(to_mucontext(pd->uobject->context), + &ucmd, udata->inlen, &uidx); + if (err) + return err; + srq->wq_sig = !!(ucmd.flags & MLX5_SRQ_FLAG_SIGNATURE); srq->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, buf_size, @@ -138,6 +151,12 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, (*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; (*in)->ctx.pgoff_cqn = cpu_to_be32(offset << 26); + if (MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1) { + xsrqc = MLX5_ADDR_OF(create_xrc_srq_in, *in, + xrc_srq_context_entry); + MLX5_SET(xrc_srqc, xsrqc, user_index, uidx); + } + return 0; err_in: @@ -158,6 +177,7 @@ static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq, struct mlx5_wqe_srq_next_seg *next; int page_shift; int npages; + void *xsrqc; err = mlx5_db_alloc(dev->mdev, &srq->db); if (err) { @@ -204,6 +224,13 @@ static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq, (*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; + if (MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1) { + xsrqc = MLX5_ADDR_OF(create_xrc_srq_in, *in, + xrc_srq_context_entry); + /* 0xffffff means we ask to work with cqe version 0 */ + MLX5_SET(xrc_srqc, xsrqc, user_index, MLX5_IB_DEFAULT_UIDX); + } + return 0; err_in: diff --git a/drivers/infiniband/hw/mlx5/user.h b/drivers/infiniband/hw/mlx5/user.h index 76fb7b927d37..b94a55404a59 100644 --- a/drivers/infiniband/hw/mlx5/user.h +++ b/drivers/infiniband/hw/mlx5/user.h @@ -35,6 +35,8 @@ #include <linux/types.h> +#include "mlx5_ib.h" + enum { MLX5_QP_FLAG_SIGNATURE = 1 << 0, MLX5_QP_FLAG_SCATTER_CQE = 1 << 1, @@ -66,7 +68,15 @@ struct mlx5_ib_alloc_ucontext_req_v2 { __u32 total_num_uuars; __u32 num_low_latency_uuars; __u32 flags; - __u32 reserved; + __u32 comp_mask; + __u8 max_cqe_version; + __u8 reserved0; + __u16 reserved1; + __u32 reserved2; +}; + +enum mlx5_ib_alloc_ucontext_resp_mask { + MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET = 1UL << 0, }; struct mlx5_ib_alloc_ucontext_resp { @@ -80,7 +90,13 @@ struct mlx5_ib_alloc_ucontext_resp { __u32 max_recv_wr; __u32 max_srq_recv_wr; __u16 num_ports; - __u16 reserved; + __u16 reserved1; + __u32 comp_mask; + __u32 response_length; + __u8 cqe_version; + __u8 reserved2; + __u16 reserved3; + __u64 hca_core_clock_offset; }; struct mlx5_ib_alloc_pd_resp { @@ -110,7 +126,9 @@ struct mlx5_ib_create_srq { __u64 buf_addr; __u64 db_addr; __u32 flags; - __u32 reserved; /* explicit padding (optional on i386) */ + __u32 reserved0; /* explicit padding (optional on i386) */ + __u32 uidx; + __u32 reserved1; }; struct mlx5_ib_create_srq_resp { @@ -125,9 +143,48 @@ struct mlx5_ib_create_qp { __u32 rq_wqe_count; __u32 rq_wqe_shift; __u32 flags; + __u32 uidx; + __u32 reserved0; + __u64 sq_buf_addr; }; struct mlx5_ib_create_qp_resp { __u32 uuar_index; }; + +static inline int get_qp_user_index(struct mlx5_ib_ucontext *ucontext, + struct mlx5_ib_create_qp *ucmd, + int inlen, + u32 *user_index) +{ + u8 cqe_version = ucontext->cqe_version; + + if (field_avail(struct mlx5_ib_create_qp, uidx, inlen) && + !cqe_version && (ucmd->uidx == 
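/*
 * Three outcomes here: uidx present but left at the default while the
 * context runs CQE version 0 - nothing to assign; uidx present exactly when
 * the context negotiated CQE version 1 - validate and assign it; any other
 * combination means kernel and userspace disagree on the ABI, hence -EINVAL.
 */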
MLX5_IB_DEFAULT_UIDX)) + return 0; + + if (!!(field_avail(struct mlx5_ib_create_qp, uidx, inlen) != + !!cqe_version)) + return -EINVAL; + + return verify_assign_uidx(cqe_version, ucmd->uidx, user_index); +} + +static inline int get_srq_user_index(struct mlx5_ib_ucontext *ucontext, + struct mlx5_ib_create_srq *ucmd, + int inlen, + u32 *user_index) +{ + u8 cqe_version = ucontext->cqe_version; + + if (field_avail(struct mlx5_ib_create_srq, uidx, inlen) && + !cqe_version && (ucmd->uidx == MLX5_IB_DEFAULT_UIDX)) + return 0; + + if (!!(field_avail(struct mlx5_ib_create_srq, uidx, inlen) != + !!cqe_version)) + return -EINVAL; + + return verify_assign_uidx(cqe_version, ucmd->uidx, user_index); +} #endif /* MLX5_IB_USER_H */ diff --git a/drivers/infiniband/hw/mthca/mthca_cq.c b/drivers/infiniband/hw/mthca/mthca_cq.c index 40ba83338155..a6531ffe29a6 100644 --- a/drivers/infiniband/hw/mthca/mthca_cq.c +++ b/drivers/infiniband/hw/mthca/mthca_cq.c @@ -608,9 +608,6 @@ static inline int mthca_poll_one(struct mthca_dev *dev, entry->opcode = IB_WC_FETCH_ADD; entry->byte_len = MTHCA_ATOMIC_BYTE_LEN; break; - case MTHCA_OPCODE_BIND_MW: - entry->opcode = IB_WC_BIND_MW; - break; default: entry->opcode = MTHCA_OPCODE_INVALID; break; diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index dc2d48c59e62..9866c35cc977 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -898,89 +898,6 @@ static struct ib_mr *mthca_get_dma_mr(struct ib_pd *pd, int acc) return &mr->ibmr; } -static struct ib_mr *mthca_reg_phys_mr(struct ib_pd *pd, - struct ib_phys_buf *buffer_list, - int num_phys_buf, - int acc, - u64 *iova_start) -{ - struct mthca_mr *mr; - u64 *page_list; - u64 total_size; - unsigned long mask; - int shift; - int npages; - int err; - int i, j, n; - - mask = buffer_list[0].addr ^ *iova_start; - total_size = 0; - for (i = 0; i < num_phys_buf; ++i) { - if (i != 0) - mask |= buffer_list[i].addr; - if (i != num_phys_buf - 1) - mask |= buffer_list[i].addr + buffer_list[i].size; - - total_size += buffer_list[i].size; - } - - if (mask & ~PAGE_MASK) - return ERR_PTR(-EINVAL); - - shift = __ffs(mask | 1 << 31); - - buffer_list[0].size += buffer_list[0].addr & ((1ULL << shift) - 1); - buffer_list[0].addr &= ~0ull << shift; - - mr = kmalloc(sizeof *mr, GFP_KERNEL); - if (!mr) - return ERR_PTR(-ENOMEM); - - npages = 0; - for (i = 0; i < num_phys_buf; ++i) - npages += (buffer_list[i].size + (1ULL << shift) - 1) >> shift; - - if (!npages) - return &mr->ibmr; - - page_list = kmalloc(npages * sizeof *page_list, GFP_KERNEL); - if (!page_list) { - kfree(mr); - return ERR_PTR(-ENOMEM); - } - - n = 0; - for (i = 0; i < num_phys_buf; ++i) - for (j = 0; - j < (buffer_list[i].size + (1ULL << shift) - 1) >> shift; - ++j) - page_list[n++] = buffer_list[i].addr + ((u64) j << shift); - - mthca_dbg(to_mdev(pd->device), "Registering memory at %llx (iova %llx) " - "in PD %x; shift %d, npages %d.\n", - (unsigned long long) buffer_list[0].addr, - (unsigned long long) *iova_start, - to_mpd(pd)->pd_num, - shift, npages); - - err = mthca_mr_alloc_phys(to_mdev(pd->device), - to_mpd(pd)->pd_num, - page_list, shift, npages, - *iova_start, total_size, - convert_access(acc), mr); - - if (err) { - kfree(page_list); - kfree(mr); - return ERR_PTR(err); - } - - kfree(page_list); - mr->umem = NULL; - - return &mr->ibmr; -} - static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt, int acc, struct ib_udata *udata) { 
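The get_qp_user_index()/get_srq_user_index() helpers earlier in user.h both end in verify_assign_uidx(), which lives in mlx5_ib.h and is not shown in this diff. A minimal sketch of its contract, assuming a 24-bit user-index mask (the MLX5_USER_ASSIGNED_UIDX_MASK name is an assumption here):

	/* sketch: accept a caller-chosen index under CQE version 1,
	 * force the 0xffffff default under version 0 */
	static inline int verify_assign_uidx(u8 cqe_version, u32 cmd_uidx,
					     u32 *user_index)
	{
		if (cqe_version) {
			if (cmd_uidx == MLX5_IB_DEFAULT_UIDX ||
			    (cmd_uidx & ~MLX5_USER_ASSIGNED_UIDX_MASK))
				return -EINVAL;	/* reserved or out of range */
			*user_index = cmd_uidx;
		} else {
			*user_index = MLX5_IB_DEFAULT_UIDX;
		}
		return 0;
	}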
@@ -1346,7 +1263,6 @@ int mthca_register_device(struct mthca_dev *dev) dev->ib_dev.destroy_cq = mthca_destroy_cq; dev->ib_dev.poll_cq = mthca_poll_cq; dev->ib_dev.get_dma_mr = mthca_get_dma_mr; - dev->ib_dev.reg_phys_mr = mthca_reg_phys_mr; dev->ib_dev.reg_user_mr = mthca_reg_user_mr; dev->ib_dev.dereg_mr = mthca_dereg_mr; dev->ib_dev.get_port_immutable = mthca_port_immutable; diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c index 35fe506e2cfa..96e5fb91fb48 100644 --- a/drivers/infiniband/hw/mthca/mthca_qp.c +++ b/drivers/infiniband/hw/mthca/mthca_qp.c @@ -1485,7 +1485,7 @@ static int build_mlx_header(struct mthca_dev *dev, struct mthca_sqp *sqp, u16 pkey; ib_ud_header_init(256, /* assume a MAD */ 1, 0, 0, - mthca_ah_grh_present(to_mah(wr->ah)), 0, + mthca_ah_grh_present(to_mah(wr->ah)), 0, 0, 0, &sqp->ud_header); err = mthca_read_ah(dev, to_mah(wr->ah), &sqp->ud_header); diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index 8a3ad170d790..cb9f0f27308d 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -134,7 +134,7 @@ static void record_ird_ord(struct nes_cm_node *, u16, u16); /* External CM API Interface */ /* instance of function pointers for client API */ /* set address of this instance to cm_core->cm_ops at cm_core alloc */ -static struct nes_cm_ops nes_cm_api = { +static const struct nes_cm_ops nes_cm_api = { mini_cm_accelerated, mini_cm_listen, mini_cm_del_listen, @@ -3232,7 +3232,6 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) int passive_state; struct nes_ib_device *nesibdev; struct ib_mr *ibmr = NULL; - struct ib_phys_buf ibphysbuf; struct nes_pd *nespd; u64 tagged_offset; u8 mpa_frame_offset = 0; @@ -3316,21 +3315,19 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) u64temp = (unsigned long)nesqp; nesibdev = nesvnic->nesibdev; nespd = nesqp->nespd; - ibphysbuf.addr = nesqp->ietf_frame_pbase + mpa_frame_offset; - ibphysbuf.size = buff_len; tagged_offset = (u64)(unsigned long)*start_buff; - ibmr = nesibdev->ibdev.reg_phys_mr((struct ib_pd *)nespd, - &ibphysbuf, 1, - IB_ACCESS_LOCAL_WRITE, - &tagged_offset); - if (!ibmr) { + ibmr = nes_reg_phys_mr(&nespd->ibpd, + nesqp->ietf_frame_pbase + mpa_frame_offset, + buff_len, IB_ACCESS_LOCAL_WRITE, + &tagged_offset); + if (IS_ERR(ibmr)) { nes_debug(NES_DBG_CM, "Unable to register memory region" "for lSMM for cm_node = %p \n", cm_node); pci_free_consistent(nesdev->pcidev, nesqp->private_data_len + nesqp->ietf_frame_size, nesqp->ietf_frame, nesqp->ietf_frame_pbase); - return -ENOMEM; + return PTR_ERR(ibmr); } ibmr->pd = &nespd->ibpd; diff --git a/drivers/infiniband/hw/nes/nes_cm.h b/drivers/infiniband/hw/nes/nes_cm.h index 32a6420c2940..147c2c884227 100644 --- a/drivers/infiniband/hw/nes/nes_cm.h +++ b/drivers/infiniband/hw/nes/nes_cm.h @@ -423,7 +423,7 @@ struct nes_cm_core { struct timer_list tcp_timer; - struct nes_cm_ops *api; + const struct nes_cm_ops *api; int (*post_event)(struct nes_cm_event *event); atomic_t events_posted; diff --git a/drivers/infiniband/hw/nes/nes_utils.c b/drivers/infiniband/hw/nes/nes_utils.c index 2042c0f29759..6d3a169c049b 100644 --- a/drivers/infiniband/hw/nes/nes_utils.c +++ b/drivers/infiniband/hw/nes/nes_utils.c @@ -727,7 +727,7 @@ int nes_arp_table(struct nes_device *nesdev, u32 ip_addr, u8 *mac_addr, u32 acti if (action == NES_ARP_DELETE) { nes_debug(NES_DBG_NETDEV, "DELETE, arp_index=%d\n", arp_index); 
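/*
 * eth_zero_addr() below is the <linux/etherdevice.h> helper equivalent to
 * memset(mac, 0x00, ETH_ALEN); it states the intent and keeps the hard-coded
 * length out of the call site.
 */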
nesadapter->arp_table[arp_index].ip_addr = 0; - memset(nesadapter->arp_table[arp_index].mac_addr, 0x00, ETH_ALEN); + eth_zero_addr(nesadapter->arp_table[arp_index].mac_addr); nes_free_resource(nesadapter, nesadapter->allocated_arps, arp_index); return arp_index; } diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 137880a19ebe..8c4daf7f22ec 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -206,80 +206,6 @@ static int nes_dealloc_mw(struct ib_mw *ibmw) } -/** - * nes_bind_mw - */ -static int nes_bind_mw(struct ib_qp *ibqp, struct ib_mw *ibmw, - struct ib_mw_bind *ibmw_bind) -{ - u64 u64temp; - struct nes_vnic *nesvnic = to_nesvnic(ibqp->device); - struct nes_device *nesdev = nesvnic->nesdev; - /* struct nes_mr *nesmr = to_nesmw(ibmw); */ - struct nes_qp *nesqp = to_nesqp(ibqp); - struct nes_hw_qp_wqe *wqe; - unsigned long flags = 0; - u32 head; - u32 wqe_misc = 0; - u32 qsize; - - if (nesqp->ibqp_state > IB_QPS_RTS) - return -EINVAL; - - spin_lock_irqsave(&nesqp->lock, flags); - - head = nesqp->hwqp.sq_head; - qsize = nesqp->hwqp.sq_tail; - - /* Check for SQ overflow */ - if (((head + (2 * qsize) - nesqp->hwqp.sq_tail) % qsize) == (qsize - 1)) { - spin_unlock_irqrestore(&nesqp->lock, flags); - return -ENOMEM; - } - - wqe = &nesqp->hwqp.sq_vbase[head]; - /* nes_debug(NES_DBG_MR, "processing sq wqe at %p, head = %u.\n", wqe, head); */ - nes_fill_init_qp_wqe(wqe, nesqp, head); - u64temp = ibmw_bind->wr_id; - set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_SCRATCH_LOW_IDX, u64temp); - wqe_misc = NES_IWARP_SQ_OP_BIND; - - wqe_misc |= NES_IWARP_SQ_WQE_LOCAL_FENCE; - - if (ibmw_bind->send_flags & IB_SEND_SIGNALED) - wqe_misc |= NES_IWARP_SQ_WQE_SIGNALED_COMPL; - - if (ibmw_bind->bind_info.mw_access_flags & IB_ACCESS_REMOTE_WRITE) - wqe_misc |= NES_CQP_STAG_RIGHTS_REMOTE_WRITE; - if (ibmw_bind->bind_info.mw_access_flags & IB_ACCESS_REMOTE_READ) - wqe_misc |= NES_CQP_STAG_RIGHTS_REMOTE_READ; - - set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_MISC_IDX, wqe_misc); - set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_BIND_WQE_MR_IDX, - ibmw_bind->bind_info.mr->lkey); - set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_BIND_WQE_MW_IDX, ibmw->rkey); - set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_BIND_WQE_LENGTH_LOW_IDX, - ibmw_bind->bind_info.length); - wqe->wqe_words[NES_IWARP_SQ_BIND_WQE_LENGTH_HIGH_IDX] = 0; - u64temp = (u64)ibmw_bind->bind_info.addr; - set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_BIND_WQE_VA_FBO_LOW_IDX, u64temp); - - head++; - if (head >= qsize) - head = 0; - - nesqp->hwqp.sq_head = head; - barrier(); - - nes_write32(nesdev->regs+NES_WQE_ALLOC, - (1 << 24) | 0x00800000 | nesqp->hwqp.qp_id); - - spin_unlock_irqrestore(&nesqp->lock, flags); - - return 0; -} - - /* * nes_alloc_fast_mr */ @@ -2074,9 +2000,8 @@ static int nes_reg_mr(struct nes_device *nesdev, struct nes_pd *nespd, /** * nes_reg_phys_mr */ -static struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd, - struct ib_phys_buf *buffer_list, int num_phys_buf, int acc, - u64 * iova_start) +struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd, u64 addr, u64 size, + int acc, u64 *iova_start) { u64 region_length; struct nes_pd *nespd = to_nespd(ib_pd); @@ -2088,13 +2013,10 @@ static struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd, struct nes_vpbl vpbl; struct nes_root_vpbl root_vpbl; u32 stag; - u32 i; unsigned long mask; u32 stag_index = 0; u32 next_stag_index = 0; u32 driver_key = 0; - u32 root_pbl_index = 0; - u32 
cur_pbl_index = 0; int err = 0; int ret = 0; u16 pbl_count = 0; @@ -2113,11 +2035,8 @@ static struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd, next_stag_index >>= 8; next_stag_index %= nesadapter->max_mr; - if (num_phys_buf > (1024*512)) { - return ERR_PTR(-E2BIG); - } - if ((buffer_list[0].addr ^ *iova_start) & ~PAGE_MASK) + if ((addr ^ *iova_start) & ~PAGE_MASK) return ERR_PTR(-EINVAL); err = nes_alloc_resource(nesadapter, nesadapter->allocated_mrs, nesadapter->max_mr, @@ -2132,84 +2051,33 @@ static struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd, return ERR_PTR(-ENOMEM); } - for (i = 0; i < num_phys_buf; i++) { + /* Allocate a 4K buffer for the PBL */ + vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 4096, + &vpbl.pbl_pbase); + nes_debug(NES_DBG_MR, "Allocating leaf PBL, va = %p, pa = 0x%016lX\n", + vpbl.pbl_vbase, (unsigned long)vpbl.pbl_pbase); + if (!vpbl.pbl_vbase) { + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + ibmr = ERR_PTR(-ENOMEM); + kfree(nesmr); + goto reg_phys_err; + } - if ((i & 0x01FF) == 0) { - if (root_pbl_index == 1) { - /* Allocate the root PBL */ - root_vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 8192, - &root_vpbl.pbl_pbase); - nes_debug(NES_DBG_MR, "Allocating root PBL, va = %p, pa = 0x%08X\n", - root_vpbl.pbl_vbase, (unsigned int)root_vpbl.pbl_pbase); - if (!root_vpbl.pbl_vbase) { - pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, - vpbl.pbl_pbase); - nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); - kfree(nesmr); - return ERR_PTR(-ENOMEM); - } - root_vpbl.leaf_vpbl = kzalloc(sizeof(*root_vpbl.leaf_vpbl)*1024, GFP_KERNEL); - if (!root_vpbl.leaf_vpbl) { - pci_free_consistent(nesdev->pcidev, 8192, root_vpbl.pbl_vbase, - root_vpbl.pbl_pbase); - pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, - vpbl.pbl_pbase); - nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); - kfree(nesmr); - return ERR_PTR(-ENOMEM); - } - root_vpbl.pbl_vbase[0].pa_low = cpu_to_le32((u32)vpbl.pbl_pbase); - root_vpbl.pbl_vbase[0].pa_high = - cpu_to_le32((u32)((((u64)vpbl.pbl_pbase) >> 32))); - root_vpbl.leaf_vpbl[0] = vpbl; - } - /* Allocate a 4K buffer for the PBL */ - vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 4096, - &vpbl.pbl_pbase); - nes_debug(NES_DBG_MR, "Allocating leaf PBL, va = %p, pa = 0x%016lX\n", - vpbl.pbl_vbase, (unsigned long)vpbl.pbl_pbase); - if (!vpbl.pbl_vbase) { - nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); - ibmr = ERR_PTR(-ENOMEM); - kfree(nesmr); - goto reg_phys_err; - } - /* Fill in the root table */ - if (1 <= root_pbl_index) { - root_vpbl.pbl_vbase[root_pbl_index].pa_low = - cpu_to_le32((u32)vpbl.pbl_pbase); - root_vpbl.pbl_vbase[root_pbl_index].pa_high = - cpu_to_le32((u32)((((u64)vpbl.pbl_pbase) >> 32))); - root_vpbl.leaf_vpbl[root_pbl_index] = vpbl; - } - root_pbl_index++; - cur_pbl_index = 0; - } - mask = !buffer_list[i].size; - if (i != 0) - mask |= buffer_list[i].addr; - if (i != num_phys_buf - 1) - mask |= buffer_list[i].addr + buffer_list[i].size; - - if (mask & ~PAGE_MASK) { - nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); - nes_debug(NES_DBG_MR, "Invalid buffer addr or size\n"); - ibmr = ERR_PTR(-EINVAL); - kfree(nesmr); - goto reg_phys_err; - } + mask = !size; - region_length += buffer_list[i].size; - if ((i != 0) && (single_page)) { - if ((buffer_list[i-1].addr+PAGE_SIZE) != buffer_list[i].addr) - single_page = 0; - } - vpbl.pbl_vbase[cur_pbl_index].pa_low = cpu_to_le32((u32)buffer_list[i].addr 
& PAGE_MASK); - vpbl.pbl_vbase[cur_pbl_index++].pa_high = - cpu_to_le32((u32)((((u64)buffer_list[i].addr) >> 32))); + if (mask & ~PAGE_MASK) { + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + nes_debug(NES_DBG_MR, "Invalid buffer addr or size\n"); + ibmr = ERR_PTR(-EINVAL); + kfree(nesmr); + goto reg_phys_err; } + region_length += size; + vpbl.pbl_vbase[0].pa_low = cpu_to_le32((u32)addr & PAGE_MASK); + vpbl.pbl_vbase[0].pa_high = cpu_to_le32((u32)((((u64)addr) >> 32))); + stag = stag_index << 8; stag |= driver_key; stag += (u32)stag_key; @@ -2219,17 +2087,15 @@ static struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd, stag, (unsigned long)*iova_start, (unsigned long)region_length, stag_index); /* Make the leaf PBL the root if only one PBL */ - if (root_pbl_index == 1) { - root_vpbl.pbl_pbase = vpbl.pbl_pbase; - } + root_vpbl.pbl_pbase = vpbl.pbl_pbase; if (single_page) { pbl_count = 0; } else { - pbl_count = root_pbl_index; + pbl_count = 1; } ret = nes_reg_mr(nesdev, nespd, stag, region_length, &root_vpbl, - buffer_list[0].addr, pbl_count, (u16)cur_pbl_index, acc, iova_start, + addr, pbl_count, 1, acc, iova_start, &nesmr->pbls_used, &nesmr->pbl_4k); if (ret == 0) { @@ -2242,21 +2108,9 @@ static struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd, ibmr = ERR_PTR(-ENOMEM); } - reg_phys_err: - /* free the resources */ - if (root_pbl_index == 1) { - /* single PBL case */ - pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, vpbl.pbl_pbase); - } else { - for (i=0; i<root_pbl_index; i++) { - pci_free_consistent(nesdev->pcidev, 4096, root_vpbl.leaf_vpbl[i].pbl_vbase, - root_vpbl.leaf_vpbl[i].pbl_pbase); - } - kfree(root_vpbl.leaf_vpbl); - pci_free_consistent(nesdev->pcidev, 8192, root_vpbl.pbl_vbase, - root_vpbl.pbl_pbase); - } - +reg_phys_err: + /* single PBL case */ + pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, vpbl.pbl_pbase); return ibmr; } @@ -2266,17 +2120,13 @@ static struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd, */ static struct ib_mr *nes_get_dma_mr(struct ib_pd *pd, int acc) { - struct ib_phys_buf bl; u64 kva = 0; nes_debug(NES_DBG_MR, "\n"); - bl.size = (u64)0xffffffffffULL; - bl.addr = 0; - return nes_reg_phys_mr(pd, &bl, 1, acc, &kva); + return nes_reg_phys_mr(pd, 0, 0xffffffffffULL, acc, &kva); } - /** * nes_reg_user_mr */ @@ -3888,12 +3738,10 @@ struct nes_ib_device *nes_init_ofa_device(struct net_device *netdev) nesibdev->ibdev.destroy_cq = nes_destroy_cq; nesibdev->ibdev.poll_cq = nes_poll_cq; nesibdev->ibdev.get_dma_mr = nes_get_dma_mr; - nesibdev->ibdev.reg_phys_mr = nes_reg_phys_mr; nesibdev->ibdev.reg_user_mr = nes_reg_user_mr; nesibdev->ibdev.dereg_mr = nes_dereg_mr; nesibdev->ibdev.alloc_mw = nes_alloc_mw; nesibdev->ibdev.dealloc_mw = nes_dealloc_mw; - nesibdev->ibdev.bind_mw = nes_bind_mw; nesibdev->ibdev.alloc_mr = nes_alloc_mr; nesibdev->ibdev.map_mr_sg = nes_map_mr_sg; diff --git a/drivers/infiniband/hw/nes/nes_verbs.h b/drivers/infiniband/hw/nes/nes_verbs.h index a204b677af22..70290883d067 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.h +++ b/drivers/infiniband/hw/nes/nes_verbs.h @@ -190,4 +190,8 @@ struct nes_qp { u8 pau_state; __u64 nesuqp_addr; }; + +struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd, + u64 addr, u64 size, int acc, u64 *iova_start); + #endif /* NES_VERBS_H */ diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c index 9820074be59d..3790771f2baa 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c @@ -152,9 
+152,10 @@ struct ib_ah *ocrdma_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr) if ((pd->uctx) && (!rdma_is_multicast_addr((struct in6_addr *)attr->grh.dgid.raw)) && (!rdma_link_local_addr((struct in6_addr *)attr->grh.dgid.raw))) { - status = rdma_addr_find_dmac_by_grh(&sgid, &attr->grh.dgid, - attr->dmac, &vlan_tag, - sgid_attr.ndev->ifindex); + status = rdma_addr_find_l2_eth_by_grh(&sgid, &attr->grh.dgid, + attr->dmac, &vlan_tag, + &sgid_attr.ndev->ifindex, + NULL); if (status) { pr_err("%s(): Failed to resolve dmac from gid." "status = %d\n", __func__, status); diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c index 3afb40b85159..573849354cb9 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c @@ -175,7 +175,6 @@ static int ocrdma_register_device(struct ocrdma_dev *dev) dev->ibdev.req_notify_cq = ocrdma_arm_cq; dev->ibdev.get_dma_mr = ocrdma_get_dma_mr; - dev->ibdev.reg_phys_mr = ocrdma_reg_kernel_mr; dev->ibdev.dereg_mr = ocrdma_dereg_mr; dev->ibdev.reg_user_mr = ocrdma_reg_user_mr; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index 76e96f97b3f6..d4c687b548d8 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -3066,169 +3066,6 @@ pl_err: return ERR_PTR(-ENOMEM); } -#define MAX_KERNEL_PBE_SIZE 65536 -static inline int count_kernel_pbes(struct ib_phys_buf *buf_list, - int buf_cnt, u32 *pbe_size) -{ - u64 total_size = 0; - u64 buf_size = 0; - int i; - *pbe_size = roundup(buf_list[0].size, PAGE_SIZE); - *pbe_size = roundup_pow_of_two(*pbe_size); - - /* find the smallest PBE size that we can have */ - for (i = 0; i < buf_cnt; i++) { - /* first addr may not be page aligned, so ignore checking */ - if ((i != 0) && ((buf_list[i].addr & ~PAGE_MASK) || - (buf_list[i].size & ~PAGE_MASK))) { - return 0; - } - - /* if configured PBE size is greater then the chosen one, - * reduce the PBE size. - */ - buf_size = roundup(buf_list[i].size, PAGE_SIZE); - /* pbe_size has to be even multiple of 4K 1,2,4,8...*/ - buf_size = roundup_pow_of_two(buf_size); - if (*pbe_size > buf_size) - *pbe_size = buf_size; - - total_size += buf_size; - } - *pbe_size = *pbe_size > MAX_KERNEL_PBE_SIZE ? - (MAX_KERNEL_PBE_SIZE) : (*pbe_size); - - /* num_pbes = total_size / (*pbe_size); this is implemented below. */ - - return total_size >> ilog2(*pbe_size); -} - -static void build_kernel_pbes(struct ib_phys_buf *buf_list, int ib_buf_cnt, - u32 pbe_size, struct ocrdma_pbl *pbl_tbl, - struct ocrdma_hw_mr *hwmr) -{ - int i; - int idx; - int pbes_per_buf = 0; - u64 buf_addr = 0; - int num_pbes; - struct ocrdma_pbe *pbe; - int total_num_pbes = 0; - - if (!hwmr->num_pbes) - return; - - pbe = (struct ocrdma_pbe *)pbl_tbl->va; - num_pbes = 0; - - /* go through the OS phy regions & fill hw pbe entries into pbls. */ - for (i = 0; i < ib_buf_cnt; i++) { - buf_addr = buf_list[i].addr; - pbes_per_buf = - roundup_pow_of_two(roundup(buf_list[i].size, PAGE_SIZE)) / - pbe_size; - hwmr->len += buf_list[i].size; - /* number of pbes can be more for one OS buf, when - * buffers are of different sizes. - * split the ib_buf to one or more pbes. - */ - for (idx = 0; idx < pbes_per_buf; idx++) { - /* we program always page aligned addresses, - * first unaligned address is taken care by fbo. - */ - if (i == 0) { - /* for non zero fbo, assign the - * start of the page. 
- */ - pbe->pa_lo = - cpu_to_le32((u32) (buf_addr & PAGE_MASK)); - pbe->pa_hi = - cpu_to_le32((u32) upper_32_bits(buf_addr)); - } else { - pbe->pa_lo = - cpu_to_le32((u32) (buf_addr & 0xffffffff)); - pbe->pa_hi = - cpu_to_le32((u32) upper_32_bits(buf_addr)); - } - buf_addr += pbe_size; - num_pbes += 1; - total_num_pbes += 1; - pbe++; - - if (total_num_pbes == hwmr->num_pbes) - goto mr_tbl_done; - /* if the pbl is full storing the pbes, - * move to next pbl. - */ - if (num_pbes == (hwmr->pbl_size/sizeof(u64))) { - pbl_tbl++; - pbe = (struct ocrdma_pbe *)pbl_tbl->va; - num_pbes = 0; - } - } - } -mr_tbl_done: - return; -} - -struct ib_mr *ocrdma_reg_kernel_mr(struct ib_pd *ibpd, - struct ib_phys_buf *buf_list, - int buf_cnt, int acc, u64 *iova_start) -{ - int status = -ENOMEM; - struct ocrdma_mr *mr; - struct ocrdma_pd *pd = get_ocrdma_pd(ibpd); - struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device); - u32 num_pbes; - u32 pbe_size = 0; - - if ((acc & IB_ACCESS_REMOTE_WRITE) && !(acc & IB_ACCESS_LOCAL_WRITE)) - return ERR_PTR(-EINVAL); - - mr = kzalloc(sizeof(*mr), GFP_KERNEL); - if (!mr) - return ERR_PTR(status); - - num_pbes = count_kernel_pbes(buf_list, buf_cnt, &pbe_size); - if (num_pbes == 0) { - status = -EINVAL; - goto pbl_err; - } - status = ocrdma_get_pbl_info(dev, mr, num_pbes); - if (status) - goto pbl_err; - - mr->hwmr.pbe_size = pbe_size; - mr->hwmr.fbo = *iova_start - (buf_list[0].addr & PAGE_MASK); - mr->hwmr.va = *iova_start; - mr->hwmr.local_rd = 1; - mr->hwmr.remote_wr = (acc & IB_ACCESS_REMOTE_WRITE) ? 1 : 0; - mr->hwmr.remote_rd = (acc & IB_ACCESS_REMOTE_READ) ? 1 : 0; - mr->hwmr.local_wr = (acc & IB_ACCESS_LOCAL_WRITE) ? 1 : 0; - mr->hwmr.remote_atomic = (acc & IB_ACCESS_REMOTE_ATOMIC) ? 1 : 0; - mr->hwmr.mw_bind = (acc & IB_ACCESS_MW_BIND) ? 
1 : 0; - - status = ocrdma_build_pbl_tbl(dev, &mr->hwmr); - if (status) - goto pbl_err; - build_kernel_pbes(buf_list, buf_cnt, pbe_size, mr->hwmr.pbl_table, - &mr->hwmr); - status = ocrdma_reg_mr(dev, &mr->hwmr, pd->id, acc); - if (status) - goto mbx_err; - - mr->ibmr.lkey = mr->hwmr.lkey; - if (mr->hwmr.remote_wr || mr->hwmr.remote_rd) - mr->ibmr.rkey = mr->hwmr.lkey; - return &mr->ibmr; - -mbx_err: - ocrdma_free_mr_pbl_tbl(dev, &mr->hwmr); -pbl_err: - kfree(mr); - return ERR_PTR(status); -} - static int ocrdma_set_page(struct ib_mr *ibmr, u64 addr) { struct ocrdma_mr *mr = get_ocrdma_mr(ibmr); diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h index a2f3b4dc20b0..8b517fd36779 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h @@ -117,9 +117,6 @@ int ocrdma_post_srq_recv(struct ib_srq *, struct ib_recv_wr *, int ocrdma_dereg_mr(struct ib_mr *); struct ib_mr *ocrdma_get_dma_mr(struct ib_pd *, int acc); -struct ib_mr *ocrdma_reg_kernel_mr(struct ib_pd *, - struct ib_phys_buf *buffer_list, - int num_phys_buf, int acc, u64 *iova_start); struct ib_mr *ocrdma_reg_user_mr(struct ib_pd *, u64 start, u64 length, u64 virt, int acc, struct ib_udata *); struct ib_mr *ocrdma_alloc_mr(struct ib_pd *pd, diff --git a/drivers/infiniband/hw/qib/qib_fs.c b/drivers/infiniband/hw/qib/qib_fs.c index 13ef22bd9459..fcdf37913a26 100644 --- a/drivers/infiniband/hw/qib/qib_fs.c +++ b/drivers/infiniband/hw/qib/qib_fs.c @@ -89,14 +89,14 @@ static int create_file(const char *name, umode_t mode, { int error; - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); *dentry = lookup_one_len(name, parent, strlen(name)); if (!IS_ERR(*dentry)) error = qibfs_mknod(d_inode(parent), *dentry, mode, fops, data); else error = PTR_ERR(*dentry); - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); return error; } @@ -481,7 +481,7 @@ static int remove_device_files(struct super_block *sb, int ret, i; root = dget(sb->s_root); - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); snprintf(unit, sizeof(unit), "%u", dd->unit); dir = lookup_one_len(unit, root, strlen(unit)); @@ -491,7 +491,7 @@ static int remove_device_files(struct super_block *sb, goto bail; } - mutex_lock(&d_inode(dir)->i_mutex); + inode_lock(d_inode(dir)); remove_file(dir, "counters"); remove_file(dir, "counter_names"); remove_file(dir, "portcounter_names"); @@ -506,13 +506,13 @@ static int remove_device_files(struct super_block *sb, } } remove_file(dir, "flash"); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); ret = simple_rmdir(d_inode(root), dir); d_delete(dir); dput(dir); bail: - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); dput(root); return ret; } diff --git a/drivers/infiniband/hw/qib/qib_mr.c b/drivers/infiniband/hw/qib/qib_mr.c index 294f5c706be9..5f53304e8a9b 100644 --- a/drivers/infiniband/hw/qib/qib_mr.c +++ b/drivers/infiniband/hw/qib/qib_mr.c @@ -150,10 +150,7 @@ static struct qib_mr *alloc_mr(int count, struct ib_pd *pd) rval = init_qib_mregion(&mr->mr, pd, count); if (rval) goto bail; - /* - * ib_reg_phys_mr() will initialize mr->ibmr except for - * lkey and rkey. 
- */ + rval = qib_alloc_lkey(&mr->mr, 0); if (rval) goto bail_mregion; @@ -171,52 +168,6 @@ bail: } /** - * qib_reg_phys_mr - register a physical memory region - * @pd: protection domain for this memory region - * @buffer_list: pointer to the list of physical buffers to register - * @num_phys_buf: the number of physical buffers to register - * @iova_start: the starting address passed over IB which maps to this MR - * - * Returns the memory region on success, otherwise returns an errno. - */ -struct ib_mr *qib_reg_phys_mr(struct ib_pd *pd, - struct ib_phys_buf *buffer_list, - int num_phys_buf, int acc, u64 *iova_start) -{ - struct qib_mr *mr; - int n, m, i; - struct ib_mr *ret; - - mr = alloc_mr(num_phys_buf, pd); - if (IS_ERR(mr)) { - ret = (struct ib_mr *)mr; - goto bail; - } - - mr->mr.user_base = *iova_start; - mr->mr.iova = *iova_start; - mr->mr.access_flags = acc; - - m = 0; - n = 0; - for (i = 0; i < num_phys_buf; i++) { - mr->mr.map[m]->segs[n].vaddr = (void *) buffer_list[i].addr; - mr->mr.map[m]->segs[n].length = buffer_list[i].size; - mr->mr.length += buffer_list[i].size; - n++; - if (n == QIB_SEGSZ) { - m++; - n = 0; - } - } - - ret = &mr->ibmr; - -bail: - return ret; -} - -/** * qib_reg_user_mr - register a userspace memory region * @pd: protection domain for this memory region * @start: starting userspace address diff --git a/drivers/infiniband/hw/qib/qib_qp.c b/drivers/infiniband/hw/qib/qib_qp.c index 40f85bb3e0d3..3eff35c2d453 100644 --- a/drivers/infiniband/hw/qib/qib_qp.c +++ b/drivers/infiniband/hw/qib/qib_qp.c @@ -100,9 +100,10 @@ static u32 credit_table[31] = { 32768 /* 1E */ }; -static void get_map_page(struct qib_qpn_table *qpt, struct qpn_map *map) +static void get_map_page(struct qib_qpn_table *qpt, struct qpn_map *map, + gfp_t gfp) { - unsigned long page = get_zeroed_page(GFP_KERNEL); + unsigned long page = get_zeroed_page(gfp); /* * Free the page if someone raced with us installing it. @@ -121,7 +122,7 @@ static void get_map_page(struct qib_qpn_table *qpt, struct qpn_map *map) * zero/one for QP type IB_QPT_SMI/IB_QPT_GSI. */ static int alloc_qpn(struct qib_devdata *dd, struct qib_qpn_table *qpt, - enum ib_qp_type type, u8 port) + enum ib_qp_type type, u8 port, gfp_t gfp) { u32 i, offset, max_scan, qpn; struct qpn_map *map; @@ -151,7 +152,7 @@ static int alloc_qpn(struct qib_devdata *dd, struct qib_qpn_table *qpt, max_scan = qpt->nmaps - !offset; for (i = 0;;) { if (unlikely(!map->page)) { - get_map_page(qpt, map); + get_map_page(qpt, map, gfp); if (unlikely(!map->page)) break; } @@ -983,13 +984,21 @@ struct ib_qp *qib_create_qp(struct ib_pd *ibpd, size_t sz; size_t sg_list_sz; struct ib_qp *ret; + gfp_t gfp; + if (init_attr->cap.max_send_sge > ib_qib_max_sges || init_attr->cap.max_send_wr > ib_qib_max_qp_wrs || - init_attr->create_flags) { - ret = ERR_PTR(-EINVAL); - goto bail; - } + init_attr->create_flags & ~(IB_QP_CREATE_USE_GFP_NOIO)) + return ERR_PTR(-EINVAL); + + /* GFP_NOIO is applicable in RC QPs only */ + if (init_attr->create_flags & IB_QP_CREATE_USE_GFP_NOIO && + init_attr->qp_type != IB_QPT_RC) + return ERR_PTR(-EINVAL); + + gfp = init_attr->create_flags & IB_QP_CREATE_USE_GFP_NOIO ? + GFP_NOIO : GFP_KERNEL; /* Check receive queue parameters if no SRQ is specified. 
*/ if (!init_attr->srq) { @@ -1021,7 +1030,8 @@ struct ib_qp *qib_create_qp(struct ib_pd *ibpd, sz = sizeof(struct qib_sge) * init_attr->cap.max_send_sge + sizeof(struct qib_swqe); - swq = vmalloc((init_attr->cap.max_send_wr + 1) * sz); + swq = __vmalloc((init_attr->cap.max_send_wr + 1) * sz, + gfp, PAGE_KERNEL); if (swq == NULL) { ret = ERR_PTR(-ENOMEM); goto bail; @@ -1037,13 +1047,13 @@ struct ib_qp *qib_create_qp(struct ib_pd *ibpd, } else if (init_attr->cap.max_recv_sge > 1) sg_list_sz = sizeof(*qp->r_sg_list) * (init_attr->cap.max_recv_sge - 1); - qp = kzalloc(sz + sg_list_sz, GFP_KERNEL); + qp = kzalloc(sz + sg_list_sz, gfp); if (!qp) { ret = ERR_PTR(-ENOMEM); goto bail_swq; } RCU_INIT_POINTER(qp->next, NULL); - qp->s_hdr = kzalloc(sizeof(*qp->s_hdr), GFP_KERNEL); + qp->s_hdr = kzalloc(sizeof(*qp->s_hdr), gfp); if (!qp->s_hdr) { ret = ERR_PTR(-ENOMEM); goto bail_qp; @@ -1058,8 +1068,16 @@ struct ib_qp *qib_create_qp(struct ib_pd *ibpd, qp->r_rq.max_sge = init_attr->cap.max_recv_sge; sz = (sizeof(struct ib_sge) * qp->r_rq.max_sge) + sizeof(struct qib_rwqe); - qp->r_rq.wq = vmalloc_user(sizeof(struct qib_rwq) + - qp->r_rq.size * sz); + if (gfp != GFP_NOIO) + qp->r_rq.wq = vmalloc_user( + sizeof(struct qib_rwq) + + qp->r_rq.size * sz); + else + qp->r_rq.wq = __vmalloc( + sizeof(struct qib_rwq) + + qp->r_rq.size * sz, + gfp, PAGE_KERNEL); + if (!qp->r_rq.wq) { ret = ERR_PTR(-ENOMEM); goto bail_qp; @@ -1090,7 +1108,7 @@ struct ib_qp *qib_create_qp(struct ib_pd *ibpd, dev = to_idev(ibpd->device); dd = dd_from_dev(dev); err = alloc_qpn(dd, &dev->qpn_table, init_attr->qp_type, - init_attr->port_num); + init_attr->port_num, gfp); if (err < 0) { ret = ERR_PTR(err); vfree(qp->r_rq.wq); diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c index de6cb6fcda8d..baf1e42b6896 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.c +++ b/drivers/infiniband/hw/qib/qib_verbs.c @@ -346,6 +346,7 @@ static int qib_post_one_send(struct qib_qp *qp, struct ib_send_wr *wr, unsigned long flags; struct qib_lkey_table *rkt; struct qib_pd *pd; + int avoid_schedule = 0; spin_lock_irqsave(&qp->s_lock, flags); @@ -438,11 +439,15 @@ static int qib_post_one_send(struct qib_qp *qp, struct ib_send_wr *wr, qp->ibqp.qp_type == IB_QPT_RC) { if (wqe->length > 0x80000000U) goto bail_inval_free; + if (wqe->length <= qp->pmtu) + avoid_schedule = 1; } else if (wqe->length > (dd_from_ibdev(qp->ibqp.device)->pport + - qp->port_num - 1)->ibmtu) + qp->port_num - 1)->ibmtu) { goto bail_inval_free; - else + } else { atomic_inc(&to_iah(ud_wr(wr)->ah)->refcount); + avoid_schedule = 1; + } wqe->ssn = qp->s_ssn++; qp->s_head = next; @@ -458,7 +463,7 @@ bail_inval_free: bail_inval: ret = -EINVAL; bail: - if (!ret && !wr->next && + if (!ret && !wr->next && !avoid_schedule && !qib_sdma_empty( dd_from_ibdev(qp->ibqp.device)->pport + qp->port_num - 1)) { qib_schedule_send(qp); @@ -2256,7 +2261,6 @@ int qib_register_ib_device(struct qib_devdata *dd) ibdev->poll_cq = qib_poll_cq; ibdev->req_notify_cq = qib_req_notify_cq; ibdev->get_dma_mr = qib_get_dma_mr; - ibdev->reg_phys_mr = qib_reg_phys_mr; ibdev->reg_user_mr = qib_reg_user_mr; ibdev->dereg_mr = qib_dereg_mr; ibdev->alloc_mr = qib_alloc_mr; diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h index bc803f33d5f6..6c5e77753d85 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.h +++ b/drivers/infiniband/hw/qib/qib_verbs.h @@ -1032,10 +1032,6 @@ int qib_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata); struct 
ib_mr *qib_get_dma_mr(struct ib_pd *pd, int acc); -struct ib_mr *qib_reg_phys_mr(struct ib_pd *pd, - struct ib_phys_buf *buffer_list, - int num_phys_buf, int acc, u64 *iova_start); - struct ib_mr *qib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int mr_access_flags, struct ib_udata *udata); diff --git a/drivers/infiniband/hw/qib/qib_verbs_mcast.c b/drivers/infiniband/hw/qib/qib_verbs_mcast.c index f8ea069a3eaf..b2fb5286dbd9 100644 --- a/drivers/infiniband/hw/qib/qib_verbs_mcast.c +++ b/drivers/infiniband/hw/qib/qib_verbs_mcast.c @@ -286,15 +286,13 @@ int qib_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) struct qib_ibdev *dev = to_idev(ibqp->device); struct qib_ibport *ibp = to_iport(ibqp->device, qp->port_num); struct qib_mcast *mcast = NULL; - struct qib_mcast_qp *p, *tmp; + struct qib_mcast_qp *p, *tmp, *delp = NULL; struct rb_node *n; int last = 0; int ret; - if (ibqp->qp_num <= 1 || qp->state == IB_QPS_RESET) { - ret = -EINVAL; - goto bail; - } + if (ibqp->qp_num <= 1 || qp->state == IB_QPS_RESET) + return -EINVAL; spin_lock_irq(&ibp->lock); @@ -303,8 +301,7 @@ int qib_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) while (1) { if (n == NULL) { spin_unlock_irq(&ibp->lock); - ret = -EINVAL; - goto bail; + return -EINVAL; } mcast = rb_entry(n, struct qib_mcast, rb_node); @@ -328,6 +325,7 @@ int qib_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) */ list_del_rcu(&p->list); mcast->n_attached--; + delp = p; /* If this was the last attached QP, remove the GID too. */ if (list_empty(&mcast->qp_list)) { @@ -338,15 +336,16 @@ int qib_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) } spin_unlock_irq(&ibp->lock); + /* QP not attached */ + if (!delp) + return -EINVAL; + /* + * Wait for any list walkers to finish before freeing the + * list element. + */ + wait_event(mcast->wait, atomic_read(&mcast->refcount) <= 1); + qib_mcast_qp_free(delp); - if (p) { - /* - * Wait for any list walkers to finish before freeing the - * list element. 
- */ - wait_event(mcast->wait, atomic_read(&mcast->refcount) <= 1); - qib_mcast_qp_free(p); - } if (last) { atomic_dec(&mcast->refcount); wait_event(mcast->wait, !atomic_read(&mcast->refcount)); @@ -355,11 +354,7 @@ int qib_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) dev->n_mcast_grps_allocated--; spin_unlock_irq(&dev->n_mcast_grps_lock); } - - ret = 0; - -bail: - return ret; + return 0; } int qib_mcast_tree_empty(struct qib_ibport *ibp) diff --git a/drivers/infiniband/hw/usnic/usnic_debugfs.c b/drivers/infiniband/hw/usnic/usnic_debugfs.c index 5e55b8bc6fe4..92dc66cc2d50 100644 --- a/drivers/infiniband/hw/usnic/usnic_debugfs.c +++ b/drivers/infiniband/hw/usnic/usnic_debugfs.c @@ -157,8 +157,9 @@ void usnic_debugfs_flow_add(struct usnic_ib_qp_grp_flow *qp_flow) qp_flow, &flowinfo_ops); if (IS_ERR_OR_NULL(qp_flow->dbgfs_dentry)) { - usnic_err("Failed to create dbg fs entry for flow %u\n", - qp_flow->flow->flow_id); + usnic_err("Failed to create dbg fs entry for flow %u with error %ld\n", + qp_flow->flow->flow_id, + PTR_ERR(qp_flow->dbgfs_dentry)); } } diff --git a/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c b/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c index fcea3a24d3eb..5f44b66ccb86 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c @@ -521,7 +521,7 @@ int usnic_ib_qp_grp_modify(struct usnic_ib_qp_grp *qp_grp, if (!status) { qp_grp->state = new_state; - usnic_info("Transistioned %u from %s to %s", + usnic_info("Transitioned %u from %s to %s", qp_grp->grp_id, usnic_ib_qp_grp_state_to_string(old_state), usnic_ib_qp_grp_state_to_string(new_state)); @@ -575,7 +575,7 @@ alloc_res_chunk_list(struct usnic_vnic *vnic, return res_chunk_list; out_free_res: - for (i--; i > 0; i--) + for (i--; i >= 0; i--) usnic_vnic_put_resources(res_chunk_list[i]); kfree(res_chunk_list); return ERR_PTR(err); diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c index f8e3211689a3..6cdb4d23f78f 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c @@ -51,7 +51,7 @@ static void usnic_ib_fw_string_to_u64(char *fw_ver_str, u64 *fw_ver) { - *fw_ver = (u64) *fw_ver_str; + *fw_ver = *((u64 *)fw_ver_str); } static int usnic_ib_fill_create_qp_resp(struct usnic_ib_qp_grp *qp_grp, @@ -571,20 +571,20 @@ int usnic_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, qp_grp = to_uqp_grp(ibqp); - /* TODO: Future Support All States */ mutex_lock(&qp_grp->vf->pf->usdev_lock); - if ((attr_mask & IB_QP_STATE) && attr->qp_state == IB_QPS_INIT) { - status = usnic_ib_qp_grp_modify(qp_grp, IB_QPS_INIT, NULL); - } else if ((attr_mask & IB_QP_STATE) && attr->qp_state == IB_QPS_RTR) { - status = usnic_ib_qp_grp_modify(qp_grp, IB_QPS_RTR, NULL); - } else if ((attr_mask & IB_QP_STATE) && attr->qp_state == IB_QPS_RTS) { - status = usnic_ib_qp_grp_modify(qp_grp, IB_QPS_RTS, NULL); + if ((attr_mask & IB_QP_PORT) && attr->port_num != 1) { + /* usnic devices only have one port */ + status = -EINVAL; + goto out_unlock; + } + if (attr_mask & IB_QP_STATE) { + status = usnic_ib_qp_grp_modify(qp_grp, attr->qp_state, NULL); } else { - usnic_err("Unexpected combination mask: %u state: %u\n", - attr_mask & IB_QP_STATE, attr->qp_state); + usnic_err("Unhandled request, attr_mask=0x%x\n", attr_mask); status = -EINVAL; } +out_unlock: mutex_unlock(&qp_grp->vf->pf->usdev_lock); return status; } @@ -625,8 +625,8 @@ struct ib_mr *usnic_ib_reg_mr(struct ib_pd *pd, u64 
start, u64 length, virt_addr, length); mr = kzalloc(sizeof(*mr), GFP_KERNEL); - if (IS_ERR_OR_NULL(mr)) - return ERR_PTR(mr ? PTR_ERR(mr) : -ENOMEM); + if (!mr) + return ERR_PTR(-ENOMEM); mr->umem = usnic_uiom_reg_get(to_upd(pd)->umem_pd, start, length, access_flags, 0); diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h index 414eaa566bd9..0d9d2e6a14d5 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h @@ -43,8 +43,6 @@ int usnic_ib_query_device(struct ib_device *ibdev, struct ib_udata *uhw); int usnic_ib_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props); -enum rdma_protocol_type -usnic_ib_query_protocol(struct ib_device *device, u8 port_num); int usnic_ib_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr); diff --git a/drivers/infiniband/hw/usnic/usnic_vnic.c b/drivers/infiniband/hw/usnic/usnic_vnic.c index 66de93fb8ea9..887510718690 100644 --- a/drivers/infiniband/hw/usnic/usnic_vnic.c +++ b/drivers/infiniband/hw/usnic/usnic_vnic.c @@ -237,7 +237,7 @@ usnic_vnic_get_resources(struct usnic_vnic *vnic, enum usnic_vnic_res_type type, struct usnic_vnic_res *res; int i; - if (usnic_vnic_res_free_cnt(vnic, type) < cnt || cnt < 1 || !owner) + if (usnic_vnic_res_free_cnt(vnic, type) < cnt || cnt < 0 || !owner) return ERR_PTR(-EINVAL); ret = kzalloc(sizeof(*ret), GFP_ATOMIC); @@ -247,26 +247,28 @@ usnic_vnic_get_resources(struct usnic_vnic *vnic, enum usnic_vnic_res_type type, return ERR_PTR(-ENOMEM); } - ret->res = kzalloc(sizeof(*(ret->res))*cnt, GFP_ATOMIC); - if (!ret->res) { - usnic_err("Failed to allocate resources for %s. Out of memory\n", - usnic_vnic_pci_name(vnic)); - kfree(ret); - return ERR_PTR(-ENOMEM); - } + if (cnt > 0) { + ret->res = kcalloc(cnt, sizeof(*(ret->res)), GFP_ATOMIC); + if (!ret->res) { + usnic_err("Failed to allocate resources for %s. 
Out of memory\n", + usnic_vnic_pci_name(vnic)); + kfree(ret); + return ERR_PTR(-ENOMEM); + } - spin_lock(&vnic->res_lock); - src = &vnic->chunks[type]; - for (i = 0; i < src->cnt && ret->cnt < cnt; i++) { - res = src->res[i]; - if (!res->owner) { - src->free_cnt--; - res->owner = owner; - ret->res[ret->cnt++] = res; + spin_lock(&vnic->res_lock); + src = &vnic->chunks[type]; + for (i = 0; i < src->cnt && ret->cnt < cnt; i++) { + res = src->res[i]; + if (!res->owner) { + src->free_cnt--; + res->owner = owner; + ret->res[ret->cnt++] = res; + } } - } - spin_unlock(&vnic->res_lock); + spin_unlock(&vnic->res_lock); + } ret->type = type; ret->vnic = vnic; WARN_ON(ret->cnt != cnt); @@ -281,14 +283,16 @@ void usnic_vnic_put_resources(struct usnic_vnic_res_chunk *chunk) int i; struct usnic_vnic *vnic = chunk->vnic; - spin_lock(&vnic->res_lock); - while ((i = --chunk->cnt) >= 0) { - res = chunk->res[i]; - chunk->res[i] = NULL; - res->owner = NULL; - vnic->chunks[res->type].free_cnt++; + if (chunk->cnt > 0) { + spin_lock(&vnic->res_lock); + while ((i = --chunk->cnt) >= 0) { + res = chunk->res[i]; + chunk->res[i] = NULL; + res->owner = NULL; + vnic->chunks[res->type].free_cnt++; + } + spin_unlock(&vnic->res_lock); } - spin_unlock(&vnic->res_lock); kfree(chunk->res); kfree(chunk); diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index 3ede10309754..a6f3eab0f350 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -495,7 +495,6 @@ void ipoib_dev_cleanup(struct net_device *dev); void ipoib_mcast_join_task(struct work_struct *work); void ipoib_mcast_carrier_on_task(struct work_struct *work); void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb); -void ipoib_mcast_free(struct ipoib_mcast *mc); void ipoib_mcast_restart_task(struct work_struct *work); int ipoib_mcast_start_thread(struct net_device *dev); @@ -549,8 +548,9 @@ void ipoib_path_iter_read(struct ipoib_path_iter *iter, int ipoib_mcast_attach(struct net_device *dev, u16 mlid, union ib_gid *mgid, int set_qkey); -int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast); -struct ipoib_mcast *__ipoib_mcast_find(struct net_device *dev, void *mgid); +void ipoib_mcast_remove_list(struct list_head *remove_list); +void ipoib_check_and_add_mcast_sendonly(struct ipoib_dev_priv *priv, u8 *mgid, + struct list_head *remove_list); int ipoib_init_qp(struct net_device *dev); int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 3ae9726efb98..917e46ea3bf6 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -70,7 +70,6 @@ static struct ib_qp_attr ipoib_cm_err_attr = { #define IPOIB_CM_RX_DRAIN_WRID 0xffffffff static struct ib_send_wr ipoib_cm_rx_drain_wr = { - .wr_id = IPOIB_CM_RX_DRAIN_WRID, .opcode = IB_WR_SEND, }; @@ -223,6 +222,7 @@ static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv *priv) * error" WC will be immediately generated for each WR we post. 
*/ p = list_entry(priv->cm.rx_flush_list.next, typeof(*p), list); + ipoib_cm_rx_drain_wr.wr_id = IPOIB_CM_RX_DRAIN_WRID; if (ib_post_send(p->qp, &ipoib_cm_rx_drain_wr, &bad_wr)) ipoib_warn(priv, "failed to post drain wr\n"); @@ -1522,8 +1522,7 @@ static void ipoib_cm_create_srq(struct net_device *dev, int max_sge) int ipoib_cm_dev_init(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); - int i, ret; - struct ib_device_attr attr; + int max_srq_sge, i; INIT_LIST_HEAD(&priv->cm.passive_ids); INIT_LIST_HEAD(&priv->cm.reap_list); @@ -1540,19 +1539,13 @@ int ipoib_cm_dev_init(struct net_device *dev) skb_queue_head_init(&priv->cm.skb_queue); - ret = ib_query_device(priv->ca, &attr); - if (ret) { - printk(KERN_WARNING "ib_query_device() failed with %d\n", ret); - return ret; - } - - ipoib_dbg(priv, "max_srq_sge=%d\n", attr.max_srq_sge); + ipoib_dbg(priv, "max_srq_sge=%d\n", priv->ca->attrs.max_srq_sge); - attr.max_srq_sge = min_t(int, IPOIB_CM_RX_SG, attr.max_srq_sge); - ipoib_cm_create_srq(dev, attr.max_srq_sge); + max_srq_sge = min_t(int, IPOIB_CM_RX_SG, priv->ca->attrs.max_srq_sge); + ipoib_cm_create_srq(dev, max_srq_sge); if (ipoib_cm_has_srq(dev)) { - priv->cm.max_cm_mtu = attr.max_srq_sge * PAGE_SIZE - 0x10; - priv->cm.num_frags = attr.max_srq_sge; + priv->cm.max_cm_mtu = max_srq_sge * PAGE_SIZE - 0x10; + priv->cm.num_frags = max_srq_sge; ipoib_dbg(priv, "max_cm_mtu = 0x%x, num_frags=%d\n", priv->cm.max_cm_mtu, priv->cm.num_frags); } else { diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c index 078cadd6c797..a53fa5fc0dec 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c @@ -40,15 +40,11 @@ static void ipoib_get_drvinfo(struct net_device *netdev, struct ethtool_drvinfo *drvinfo) { struct ipoib_dev_priv *priv = netdev_priv(netdev); - struct ib_device_attr *attr; - - attr = kmalloc(sizeof(*attr), GFP_KERNEL); - if (attr && !ib_query_device(priv->ca, attr)) - snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), - "%d.%d.%d", (int)(attr->fw_ver >> 32), - (int)(attr->fw_ver >> 16) & 0xffff, - (int)attr->fw_ver & 0xffff); - kfree(attr); + + snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), + "%d.%d.%d", (int)(priv->ca->attrs.fw_ver >> 32), + (int)(priv->ca->attrs.fw_ver >> 16) & 0xffff, + (int)priv->ca->attrs.fw_ver & 0xffff); strlcpy(drvinfo->bus_info, dev_name(priv->ca->dma_device), sizeof(drvinfo->bus_info)); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 7d3281866ffc..25509bbd4a05 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -1150,8 +1150,6 @@ static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv) unsigned long flags; int i; LIST_HEAD(remove_list); - struct ipoib_mcast *mcast, *tmcast; - struct net_device *dev = priv->dev; if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags)) return; @@ -1179,18 +1177,8 @@ static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv) lockdep_is_held(&priv->lock))) != NULL) { /* was the neigh idle for two GC periods */ if (time_after(neigh_obsolete, neigh->alive)) { - u8 *mgid = neigh->daddr + 4; - /* Is this multicast ? 
*/ - if (*mgid == 0xff) { - mcast = __ipoib_mcast_find(dev, mgid); - - if (mcast && test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { - list_del(&mcast->list); - rb_erase(&mcast->rb_node, &priv->multicast_tree); - list_add_tail(&mcast->list, &remove_list); - } - } + ipoib_check_and_add_mcast_sendonly(priv, neigh->daddr + 4, &remove_list); rcu_assign_pointer(*np, rcu_dereference_protected(neigh->hnext, @@ -1207,10 +1195,7 @@ static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv) out_unlock: spin_unlock_irqrestore(&priv->lock, flags); - list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { - ipoib_mcast_leave(dev, mcast); - ipoib_mcast_free(mcast); - } + ipoib_mcast_remove_list(&remove_list); } static void ipoib_reap_neigh(struct work_struct *work) @@ -1777,26 +1762,7 @@ int ipoib_add_pkey_attr(struct net_device *dev) int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca) { - struct ib_device_attr *device_attr; - int result = -ENOMEM; - - device_attr = kmalloc(sizeof *device_attr, GFP_KERNEL); - if (!device_attr) { - printk(KERN_WARNING "%s: allocation of %zu bytes failed\n", - hca->name, sizeof *device_attr); - return result; - } - - result = ib_query_device(hca, device_attr); - if (result) { - printk(KERN_WARNING "%s: ib_query_device failed (ret = %d)\n", - hca->name, result); - kfree(device_attr); - return result; - } - priv->hca_caps = device_attr->device_cap_flags; - - kfree(device_attr); + priv->hca_caps = hca->attrs.device_cap_flags; if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) { priv->dev->hw_features = NETIF_F_SG | diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index f357ca67a41c..050dfa175d16 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -106,7 +106,7 @@ static void __ipoib_mcast_schedule_join_thread(struct ipoib_dev_priv *priv, queue_delayed_work(priv->wq, &priv->mcast_task, 0); } -void ipoib_mcast_free(struct ipoib_mcast *mcast) +static void ipoib_mcast_free(struct ipoib_mcast *mcast) { struct net_device *dev = mcast->dev; int tx_dropped = 0; @@ -153,7 +153,7 @@ static struct ipoib_mcast *ipoib_mcast_alloc(struct net_device *dev, return mcast; } -struct ipoib_mcast *__ipoib_mcast_find(struct net_device *dev, void *mgid) +static struct ipoib_mcast *__ipoib_mcast_find(struct net_device *dev, void *mgid) { struct ipoib_dev_priv *priv = netdev_priv(dev); struct rb_node *n = priv->multicast_tree.rb_node; @@ -677,7 +677,7 @@ int ipoib_mcast_stop_thread(struct net_device *dev) return 0; } -int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast) +static int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast) { struct ipoib_dev_priv *priv = netdev_priv(dev); int ret = 0; @@ -704,6 +704,35 @@ int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast) return 0; } +/* + * Check if the multicast group is sendonly. If so remove it from the maps + * and add to the remove list + */ +void ipoib_check_and_add_mcast_sendonly(struct ipoib_dev_priv *priv, u8 *mgid, + struct list_head *remove_list) +{ + /* Is this multicast ? 
*/ + if (*mgid == 0xff) { + struct ipoib_mcast *mcast = __ipoib_mcast_find(priv->dev, mgid); + + if (mcast && test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { + list_del(&mcast->list); + rb_erase(&mcast->rb_node, &priv->multicast_tree); + list_add_tail(&mcast->list, remove_list); + } + } +} + +void ipoib_mcast_remove_list(struct list_head *remove_list) +{ + struct ipoib_mcast *mcast, *tmcast; + + list_for_each_entry_safe(mcast, tmcast, remove_list, list) { + ipoib_mcast_leave(mcast->dev, mcast); + ipoib_mcast_free(mcast); + } +} + void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb) { struct ipoib_dev_priv *priv = netdev_priv(dev); @@ -810,10 +839,7 @@ void ipoib_mcast_dev_flush(struct net_device *dev) if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) wait_for_completion(&mcast->done); - list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { - ipoib_mcast_leave(dev, mcast); - ipoib_mcast_free(mcast); - } + ipoib_mcast_remove_list(&remove_list); } static int ipoib_mcast_addr_is_valid(const u8 *addr, const u8 *broadcast) @@ -939,10 +965,7 @@ void ipoib_mcast_restart_task(struct work_struct *work) if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) wait_for_completion(&mcast->done); - list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { - ipoib_mcast_leave(mcast->dev, mcast); - ipoib_mcast_free(mcast); - } + ipoib_mcast_remove_list(&remove_list); /* * Double check that we are still up diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c index 9080161e01af..c827c93f46c5 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.c +++ b/drivers/infiniband/ulp/iser/iscsi_iser.c @@ -644,7 +644,7 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep, ib_conn = &iser_conn->ib_conn; if (ib_conn->pi_support) { - u32 sig_caps = ib_conn->device->dev_attr.sig_prot_cap; + u32 sig_caps = ib_conn->device->ib_device->attrs.sig_prot_cap; scsi_host_set_prot(shost, iser_dif_prot_caps(sig_caps)); scsi_host_set_guard(shost, SHOST_DIX_GUARD_IP | @@ -656,7 +656,7 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep, * max fastreg page list length. 
*/ shost->sg_tablesize = min_t(unsigned short, shost->sg_tablesize, - ib_conn->device->dev_attr.max_fast_reg_page_list_len); + ib_conn->device->ib_device->attrs.max_fast_reg_page_list_len); shost->max_sectors = min_t(unsigned int, 1024, (shost->sg_tablesize * PAGE_SIZE) >> 9); @@ -1059,7 +1059,8 @@ static int __init iser_init(void) release_wq = alloc_workqueue("release workqueue", 0, 0); if (!release_wq) { iser_err("failed to allocate release workqueue\n"); - return -ENOMEM; + err = -ENOMEM; + goto err_alloc_wq; } iscsi_iser_scsi_transport = iscsi_register_transport( @@ -1067,12 +1068,14 @@ static int __init iser_init(void) if (!iscsi_iser_scsi_transport) { iser_err("iscsi_register_transport failed\n"); err = -EINVAL; - goto register_transport_failure; + goto err_reg; } return 0; -register_transport_failure: +err_reg: + destroy_workqueue(release_wq); +err_alloc_wq: kmem_cache_destroy(ig.desc_cache); return err; diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h index 8a5998e6a407..95f0a64e076b 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.h +++ b/drivers/infiniband/ulp/iser/iscsi_iser.h @@ -48,6 +48,7 @@ #include <scsi/scsi_transport_iscsi.h> #include <scsi/scsi_cmnd.h> #include <scsi/scsi_device.h> +#include <scsi/iser.h> #include <linux/interrupt.h> #include <linux/wait.h> @@ -151,46 +152,10 @@ - ISER_MAX_RX_MISC_PDUS) / \ (1 + ISER_INFLIGHT_DATAOUTS)) -#define ISER_WC_BATCH_COUNT 16 #define ISER_SIGNAL_CMD_COUNT 32 -#define ISER_VER 0x10 -#define ISER_WSV 0x08 -#define ISER_RSV 0x04 - -#define ISER_FASTREG_LI_WRID 0xffffffffffffffffULL -#define ISER_BEACON_WRID 0xfffffffffffffffeULL - -/** - * struct iser_hdr - iSER header - * - * @flags: flags support (zbva, remote_inv) - * @rsvd: reserved - * @write_stag: write rkey - * @write_va: write virtual address - * @reaf_stag: read rkey - * @read_va: read virtual address - */ -struct iser_hdr { - u8 flags; - u8 rsvd[3]; - __be32 write_stag; - __be64 write_va; - __be32 read_stag; - __be64 read_va; -} __attribute__((packed)); - - -#define ISER_ZBVA_NOT_SUPPORTED 0x80 -#define ISER_SEND_W_INV_NOT_SUPPORTED 0x40 - -struct iser_cm_hdr { - u8 flags; - u8 rsvd[3]; -} __packed; - /* Constant PDU lengths calculations */ -#define ISER_HEADERS_LEN (sizeof(struct iser_hdr) + sizeof(struct iscsi_hdr)) +#define ISER_HEADERS_LEN (sizeof(struct iser_ctrl) + sizeof(struct iscsi_hdr)) #define ISER_RECV_DATA_SEG_LEN 128 #define ISER_RX_PAYLOAD_SIZE (ISER_HEADERS_LEN + ISER_RECV_DATA_SEG_LEN) @@ -269,7 +234,7 @@ enum iser_desc_type { #define ISER_MAX_WRS 7 /** - * struct iser_tx_desc - iSER TX descriptor (for send wr_id) + * struct iser_tx_desc - iSER TX descriptor * * @iser_header: iser header * @iscsi_header: iscsi header @@ -287,12 +252,13 @@ enum iser_desc_type { * @sig_attrs: Signature attributes */ struct iser_tx_desc { - struct iser_hdr iser_header; + struct iser_ctrl iser_header; struct iscsi_hdr iscsi_header; enum iser_desc_type type; u64 dma_addr; struct ib_sge tx_sg[2]; int num_sge; + struct ib_cqe cqe; bool mapped; u8 wr_idx; union iser_wr { @@ -306,9 +272,10 @@ struct iser_tx_desc { }; #define ISER_RX_PAD_SIZE (256 - (ISER_RX_PAYLOAD_SIZE + \ - sizeof(u64) + sizeof(struct ib_sge))) + sizeof(u64) + sizeof(struct ib_sge) + \ + sizeof(struct ib_cqe))) /** - * struct iser_rx_desc - iSER RX descriptor (for recv wr_id) + * struct iser_rx_desc - iSER RX descriptor * * @iser_header: iser header * @iscsi_header: iscsi header @@ -318,12 +285,32 @@ struct iser_tx_desc { * @pad: for sense data TODO: Modify to 
maximum sense length supported */ struct iser_rx_desc { - struct iser_hdr iser_header; + struct iser_ctrl iser_header; struct iscsi_hdr iscsi_header; char data[ISER_RECV_DATA_SEG_LEN]; u64 dma_addr; struct ib_sge rx_sg; + struct ib_cqe cqe; char pad[ISER_RX_PAD_SIZE]; +} __packed; + +/** + * struct iser_login_desc - iSER login descriptor + * + * @req: pointer to login request buffer + * @resp: pointer to login response buffer + * @req_dma: DMA address of login request buffer + * @rsp_dma: DMA address of login response buffer + * @sge: IB sge for login post recv + * @cqe: completion handler + */ +struct iser_login_desc { + void *req; + void *rsp; + u64 req_dma; + u64 rsp_dma; + struct ib_sge sge; + struct ib_cqe cqe; } __attribute__((packed)); struct iser_conn; @@ -333,18 +320,12 @@ struct iscsi_iser_task; /** * struct iser_comp - iSER completion context * - * @device: pointer to device handle * @cq: completion queue - * @wcs: work completion array - * @tasklet: Tasklet handle * @active_qps: Number of active QPs attached * to completion context */ struct iser_comp { - struct iser_device *device; struct ib_cq *cq; - struct ib_wc wcs[ISER_WC_BATCH_COUNT]; - struct tasklet_struct tasklet; int active_qps; }; @@ -380,7 +361,6 @@ struct iser_reg_ops { * * @ib_device: RDMA device * @pd: Protection Domain for this device - * @dev_attr: Device attributes container * @mr: Global DMA memory region * @event_handler: IB events handle routine * @ig_list: entry in devices list @@ -389,18 +369,19 @@ struct iser_reg_ops { * cpus and device max completion vectors * @comps: Dinamically allocated array of completion handlers * @reg_ops: Registration ops + * @remote_inv_sup: Remote invalidate is supported on this device */ struct iser_device { struct ib_device *ib_device; struct ib_pd *pd; - struct ib_device_attr dev_attr; struct ib_mr *mr; struct ib_event_handler event_handler; struct list_head ig_list; int refcount; int comps_used; struct iser_comp *comps; - struct iser_reg_ops *reg_ops; + const struct iser_reg_ops *reg_ops; + bool remote_inv_sup; }; #define ISER_CHECK_GUARD 0xc0 @@ -475,10 +456,11 @@ struct iser_fr_pool { * @rx_wr: receive work request for batch posts * @device: reference to iser device * @comp: iser completion context - * @pi_support: Indicate device T10-PI support - * @beacon: beacon send wr to signal all flush errors were drained - * @flush_comp: completes when all connection completions consumed * @fr_pool: connection fast registration poool + * @pi_support: Indicate device T10-PI support + * @last: last send wr to signal all flush errors were drained + * @last_cqe: cqe handler for last wr + * @last_comp: completes when all connection completions consumed */ struct ib_conn { struct rdma_cm_id *cma_id; @@ -488,10 +470,12 @@ struct ib_conn { struct ib_recv_wr rx_wr[ISER_MIN_POSTED_RX]; struct iser_device *device; struct iser_comp *comp; - bool pi_support; - struct ib_send_wr beacon; - struct completion flush_comp; struct iser_fr_pool fr_pool; + bool pi_support; + struct ib_send_wr last; + struct ib_cqe last_cqe; + struct ib_cqe reg_cqe; + struct completion last_comp; }; /** @@ -514,11 +498,7 @@ struct ib_conn { * @up_completion: connection establishment completed * (state is ISER_CONN_UP) * @conn_list: entry in ig conn list - * @login_buf: login data buffer (stores login parameters) - * @login_req_buf: login request buffer - * @login_req_dma: login request buffer dma address - * @login_resp_buf: login response buffer - * @login_resp_dma: login response buffer dma address + * @login_desc: 
login descriptor * @rx_desc_head: head of rx_descs cyclic buffer * @rx_descs: rx buffers array (cyclic buffer) * @num_rx_descs: number of rx descriptors @@ -541,15 +521,13 @@ struct iser_conn { struct completion ib_completion; struct completion up_completion; struct list_head conn_list; - - char *login_buf; - char *login_req_buf, *login_resp_buf; - u64 login_req_dma, login_resp_dma; + struct iser_login_desc login_desc; unsigned int rx_desc_head; struct iser_rx_desc *rx_descs; u32 num_rx_descs; unsigned short scsi_sg_tablesize; unsigned int scsi_max_sectors; + bool snd_w_inv; }; /** @@ -579,9 +557,8 @@ struct iscsi_iser_task { struct iser_page_vec { u64 *pages; - int length; - int offset; - int data_size; + int npages; + struct ib_mr fake_mr; }; /** @@ -633,12 +610,14 @@ int iser_conn_terminate(struct iser_conn *iser_conn); void iser_release_work(struct work_struct *work); -void iser_rcv_completion(struct iser_rx_desc *desc, - unsigned long dto_xfer_len, - struct ib_conn *ib_conn); - -void iser_snd_completion(struct iser_tx_desc *desc, - struct ib_conn *ib_conn); +void iser_err_comp(struct ib_wc *wc, const char *type); +void iser_login_rsp(struct ib_cq *cq, struct ib_wc *wc); +void iser_task_rsp(struct ib_cq *cq, struct ib_wc *wc); +void iser_cmd_comp(struct ib_cq *cq, struct ib_wc *wc); +void iser_ctrl_comp(struct ib_cq *cq, struct ib_wc *wc); +void iser_dataout_comp(struct ib_cq *cq, struct ib_wc *wc); +void iser_reg_comp(struct ib_cq *cq, struct ib_wc *wc); +void iser_last_comp(struct ib_cq *cq, struct ib_wc *wc); void iser_task_rdma_init(struct iscsi_iser_task *task); @@ -651,7 +630,8 @@ void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, enum iser_data_dir cmd_dir); int iser_reg_rdma_mem(struct iscsi_iser_task *task, - enum iser_data_dir dir); + enum iser_data_dir dir, + bool all_imm); void iser_unreg_rdma_mem(struct iscsi_iser_task *task, enum iser_data_dir dir); @@ -719,4 +699,28 @@ iser_tx_next_wr(struct iser_tx_desc *tx_desc) return cur_wr; } +static inline struct iser_conn * +to_iser_conn(struct ib_conn *ib_conn) +{ + return container_of(ib_conn, struct iser_conn, ib_conn); +} + +static inline struct iser_rx_desc * +iser_rx(struct ib_cqe *cqe) +{ + return container_of(cqe, struct iser_rx_desc, cqe); +} + +static inline struct iser_tx_desc * +iser_tx(struct ib_cqe *cqe) +{ + return container_of(cqe, struct iser_tx_desc, cqe); +} + +static inline struct iser_login_desc * +iser_login(struct ib_cqe *cqe) +{ + return container_of(cqe, struct iser_login_desc, cqe); +} + #endif diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c index ffd00c420729..ed54b388e7ad 100644 --- a/drivers/infiniband/ulp/iser/iser_initiator.c +++ b/drivers/infiniband/ulp/iser/iser_initiator.c @@ -51,7 +51,7 @@ static int iser_prepare_read_cmd(struct iscsi_task *task) struct iscsi_iser_task *iser_task = task->dd_data; struct iser_mem_reg *mem_reg; int err; - struct iser_hdr *hdr = &iser_task->desc.iser_header; + struct iser_ctrl *hdr = &iser_task->desc.iser_header; struct iser_data_buf *buf_in = &iser_task->data[ISER_DIR_IN]; err = iser_dma_map_task_data(iser_task, @@ -72,7 +72,7 @@ static int iser_prepare_read_cmd(struct iscsi_task *task) return err; } - err = iser_reg_rdma_mem(iser_task, ISER_DIR_IN); + err = iser_reg_rdma_mem(iser_task, ISER_DIR_IN, false); if (err) { iser_err("Failed to set up Data-IN RDMA\n"); return err; @@ -104,7 +104,7 @@ iser_prepare_write_cmd(struct iscsi_task *task, struct iscsi_iser_task *iser_task = 
task->dd_data; struct iser_mem_reg *mem_reg; int err; - struct iser_hdr *hdr = &iser_task->desc.iser_header; + struct iser_ctrl *hdr = &iser_task->desc.iser_header; struct iser_data_buf *buf_out = &iser_task->data[ISER_DIR_OUT]; struct ib_sge *tx_dsg = &iser_task->desc.tx_sg[1]; @@ -126,7 +126,8 @@ iser_prepare_write_cmd(struct iscsi_task *task, return err; } - err = iser_reg_rdma_mem(iser_task, ISER_DIR_OUT); + err = iser_reg_rdma_mem(iser_task, ISER_DIR_OUT, + buf_out->data_len == imm_sz); if (err != 0) { iser_err("Failed to register write cmd RDMA mem\n"); return err; @@ -166,7 +167,7 @@ static void iser_create_send_desc(struct iser_conn *iser_conn, ib_dma_sync_single_for_cpu(device->ib_device, tx_desc->dma_addr, ISER_HEADERS_LEN, DMA_TO_DEVICE); - memset(&tx_desc->iser_header, 0, sizeof(struct iser_hdr)); + memset(&tx_desc->iser_header, 0, sizeof(struct iser_ctrl)); tx_desc->iser_header.flags = ISER_VER; tx_desc->num_sge = 1; } @@ -174,73 +175,63 @@ static void iser_create_send_desc(struct iser_conn *iser_conn, static void iser_free_login_buf(struct iser_conn *iser_conn) { struct iser_device *device = iser_conn->ib_conn.device; + struct iser_login_desc *desc = &iser_conn->login_desc; - if (!iser_conn->login_buf) + if (!desc->req) return; - if (iser_conn->login_req_dma) - ib_dma_unmap_single(device->ib_device, - iser_conn->login_req_dma, - ISCSI_DEF_MAX_RECV_SEG_LEN, DMA_TO_DEVICE); + ib_dma_unmap_single(device->ib_device, desc->req_dma, + ISCSI_DEF_MAX_RECV_SEG_LEN, DMA_TO_DEVICE); - if (iser_conn->login_resp_dma) - ib_dma_unmap_single(device->ib_device, - iser_conn->login_resp_dma, - ISER_RX_LOGIN_SIZE, DMA_FROM_DEVICE); + ib_dma_unmap_single(device->ib_device, desc->rsp_dma, + ISER_RX_LOGIN_SIZE, DMA_FROM_DEVICE); - kfree(iser_conn->login_buf); + kfree(desc->req); + kfree(desc->rsp); /* make sure we never redo any unmapping */ - iser_conn->login_req_dma = 0; - iser_conn->login_resp_dma = 0; - iser_conn->login_buf = NULL; + desc->req = NULL; + desc->rsp = NULL; } static int iser_alloc_login_buf(struct iser_conn *iser_conn) { struct iser_device *device = iser_conn->ib_conn.device; - int req_err, resp_err; - - BUG_ON(device == NULL); - - iser_conn->login_buf = kmalloc(ISCSI_DEF_MAX_RECV_SEG_LEN + - ISER_RX_LOGIN_SIZE, GFP_KERNEL); - if (!iser_conn->login_buf) - goto out_err; - - iser_conn->login_req_buf = iser_conn->login_buf; - iser_conn->login_resp_buf = iser_conn->login_buf + - ISCSI_DEF_MAX_RECV_SEG_LEN; - - iser_conn->login_req_dma = ib_dma_map_single(device->ib_device, - iser_conn->login_req_buf, - ISCSI_DEF_MAX_RECV_SEG_LEN, - DMA_TO_DEVICE); - - iser_conn->login_resp_dma = ib_dma_map_single(device->ib_device, - iser_conn->login_resp_buf, - ISER_RX_LOGIN_SIZE, - DMA_FROM_DEVICE); - - req_err = ib_dma_mapping_error(device->ib_device, - iser_conn->login_req_dma); - resp_err = ib_dma_mapping_error(device->ib_device, - iser_conn->login_resp_dma); - - if (req_err || resp_err) { - if (req_err) - iser_conn->login_req_dma = 0; - if (resp_err) - iser_conn->login_resp_dma = 0; - goto free_login_buf; - } + struct iser_login_desc *desc = &iser_conn->login_desc; + + desc->req = kmalloc(ISCSI_DEF_MAX_RECV_SEG_LEN, GFP_KERNEL); + if (!desc->req) + return -ENOMEM; + + desc->req_dma = ib_dma_map_single(device->ib_device, desc->req, + ISCSI_DEF_MAX_RECV_SEG_LEN, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(device->ib_device, + desc->req_dma)) + goto free_req; + + desc->rsp = kmalloc(ISER_RX_LOGIN_SIZE, GFP_KERNEL); + if (!desc->rsp) + goto unmap_req; + + desc->rsp_dma = 
ib_dma_map_single(device->ib_device, desc->rsp, + ISER_RX_LOGIN_SIZE, + DMA_FROM_DEVICE); + if (ib_dma_mapping_error(device->ib_device, + desc->rsp_dma)) + goto free_rsp; + return 0; -free_login_buf: - iser_free_login_buf(iser_conn); +free_rsp: + kfree(desc->rsp); +unmap_req: + ib_dma_unmap_single(device->ib_device, desc->req_dma, + ISCSI_DEF_MAX_RECV_SEG_LEN, + DMA_TO_DEVICE); +free_req: + kfree(desc->req); -out_err: - iser_err("unable to alloc or map login buf\n"); return -ENOMEM; } @@ -280,11 +271,11 @@ int iser_alloc_rx_descriptors(struct iser_conn *iser_conn, goto rx_desc_dma_map_failed; rx_desc->dma_addr = dma_addr; - + rx_desc->cqe.done = iser_task_rsp; rx_sg = &rx_desc->rx_sg; - rx_sg->addr = rx_desc->dma_addr; + rx_sg->addr = rx_desc->dma_addr; rx_sg->length = ISER_RX_PAYLOAD_SIZE; - rx_sg->lkey = device->pd->local_dma_lkey; + rx_sg->lkey = device->pd->local_dma_lkey; } iser_conn->rx_desc_head = 0; @@ -383,6 +374,7 @@ int iser_send_command(struct iscsi_conn *conn, /* build the tx desc regd header and add it to the tx desc dto */ tx_desc->type = ISCSI_TX_SCSI_COMMAND; + tx_desc->cqe.done = iser_cmd_comp; iser_create_send_desc(iser_conn, tx_desc); if (hdr->flags & ISCSI_FLAG_CMD_READ) { @@ -464,6 +456,7 @@ int iser_send_data_out(struct iscsi_conn *conn, } tx_desc->type = ISCSI_TX_DATAOUT; + tx_desc->cqe.done = iser_dataout_comp; tx_desc->iser_header.flags = ISER_VER; memcpy(&tx_desc->iscsi_header, hdr, sizeof(struct iscsi_hdr)); @@ -513,6 +506,7 @@ int iser_send_control(struct iscsi_conn *conn, /* build the tx desc regd header and add it to the tx desc dto */ mdesc->type = ISCSI_TX_CONTROL; + mdesc->cqe.done = iser_ctrl_comp; iser_create_send_desc(iser_conn, mdesc); device = iser_conn->ib_conn.device; @@ -520,25 +514,25 @@ int iser_send_control(struct iscsi_conn *conn, data_seg_len = ntoh24(task->hdr->dlength); if (data_seg_len > 0) { + struct iser_login_desc *desc = &iser_conn->login_desc; struct ib_sge *tx_dsg = &mdesc->tx_sg[1]; + if (task != conn->login_task) { iser_err("data present on non login task!!!\n"); goto send_control_error; } - ib_dma_sync_single_for_cpu(device->ib_device, - iser_conn->login_req_dma, task->data_count, - DMA_TO_DEVICE); + ib_dma_sync_single_for_cpu(device->ib_device, desc->req_dma, + task->data_count, DMA_TO_DEVICE); - memcpy(iser_conn->login_req_buf, task->data, task->data_count); + memcpy(desc->req, task->data, task->data_count); - ib_dma_sync_single_for_device(device->ib_device, - iser_conn->login_req_dma, task->data_count, - DMA_TO_DEVICE); + ib_dma_sync_single_for_device(device->ib_device, desc->req_dma, + task->data_count, DMA_TO_DEVICE); - tx_dsg->addr = iser_conn->login_req_dma; - tx_dsg->length = task->data_count; - tx_dsg->lkey = device->pd->local_dma_lkey; + tx_dsg->addr = desc->req_dma; + tx_dsg->length = task->data_count; + tx_dsg->lkey = device->pd->local_dma_lkey; mdesc->num_sge = 2; } @@ -562,41 +556,126 @@ send_control_error: return err; } -/** - * iser_rcv_dto_completion - recv DTO completion - */ -void iser_rcv_completion(struct iser_rx_desc *rx_desc, - unsigned long rx_xfer_len, - struct ib_conn *ib_conn) +void iser_login_rsp(struct ib_cq *cq, struct ib_wc *wc) { - struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn, - ib_conn); + struct ib_conn *ib_conn = wc->qp->qp_context; + struct iser_conn *iser_conn = to_iser_conn(ib_conn); + struct iser_login_desc *desc = iser_login(wc->wr_cqe); struct iscsi_hdr *hdr; - u64 rx_dma; - int rx_buflen, outstanding, count, err; + char *data; + int length; - /* differentiate between 
login to all other PDUs */ - if ((char *)rx_desc == iser_conn->login_resp_buf) { - rx_dma = iser_conn->login_resp_dma; - rx_buflen = ISER_RX_LOGIN_SIZE; - } else { - rx_dma = rx_desc->dma_addr; - rx_buflen = ISER_RX_PAYLOAD_SIZE; + if (unlikely(wc->status != IB_WC_SUCCESS)) { + iser_err_comp(wc, "login_rsp"); + return; + } + + ib_dma_sync_single_for_cpu(ib_conn->device->ib_device, + desc->rsp_dma, ISER_RX_LOGIN_SIZE, + DMA_FROM_DEVICE); + + hdr = desc->rsp + sizeof(struct iser_ctrl); + data = desc->rsp + ISER_HEADERS_LEN; + length = wc->byte_len - ISER_HEADERS_LEN; + + iser_dbg("op 0x%x itt 0x%x dlen %d\n", hdr->opcode, + hdr->itt, length); + + iscsi_iser_recv(iser_conn->iscsi_conn, hdr, data, length); + + ib_dma_sync_single_for_device(ib_conn->device->ib_device, + desc->rsp_dma, ISER_RX_LOGIN_SIZE, + DMA_FROM_DEVICE); + + ib_conn->post_recv_buf_count--; +} + +static inline void +iser_inv_desc(struct iser_fr_desc *desc, u32 rkey) +{ + if (likely(rkey == desc->rsc.mr->rkey)) + desc->rsc.mr_valid = 0; + else if (likely(rkey == desc->pi_ctx->sig_mr->rkey)) + desc->pi_ctx->sig_mr_valid = 0; +} + +static int +iser_check_remote_inv(struct iser_conn *iser_conn, + struct ib_wc *wc, + struct iscsi_hdr *hdr) +{ + if (wc->wc_flags & IB_WC_WITH_INVALIDATE) { + struct iscsi_task *task; + u32 rkey = wc->ex.invalidate_rkey; + + iser_dbg("conn %p: remote invalidation for rkey %#x\n", + iser_conn, rkey); + + if (unlikely(!iser_conn->snd_w_inv)) { + iser_err("conn %p: unexepected remote invalidation, " + "terminating connection\n", iser_conn); + return -EPROTO; + } + + task = iscsi_itt_to_ctask(iser_conn->iscsi_conn, hdr->itt); + if (likely(task)) { + struct iscsi_iser_task *iser_task = task->dd_data; + struct iser_fr_desc *desc; + + if (iser_task->dir[ISER_DIR_IN]) { + desc = iser_task->rdma_reg[ISER_DIR_IN].mem_h; + iser_inv_desc(desc, rkey); + } + + if (iser_task->dir[ISER_DIR_OUT]) { + desc = iser_task->rdma_reg[ISER_DIR_OUT].mem_h; + iser_inv_desc(desc, rkey); + } + } else { + iser_err("failed to get task for itt=%d\n", hdr->itt); + return -EINVAL; + } } - ib_dma_sync_single_for_cpu(ib_conn->device->ib_device, rx_dma, - rx_buflen, DMA_FROM_DEVICE); + return 0; +} - hdr = &rx_desc->iscsi_header; + +void iser_task_rsp(struct ib_cq *cq, struct ib_wc *wc) +{ + struct ib_conn *ib_conn = wc->qp->qp_context; + struct iser_conn *iser_conn = to_iser_conn(ib_conn); + struct iser_rx_desc *desc = iser_rx(wc->wr_cqe); + struct iscsi_hdr *hdr; + int length; + int outstanding, count, err; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + iser_err_comp(wc, "task_rsp"); + return; + } + + ib_dma_sync_single_for_cpu(ib_conn->device->ib_device, + desc->dma_addr, ISER_RX_PAYLOAD_SIZE, + DMA_FROM_DEVICE); + + hdr = &desc->iscsi_header; + length = wc->byte_len - ISER_HEADERS_LEN; iser_dbg("op 0x%x itt 0x%x dlen %d\n", hdr->opcode, - hdr->itt, (int)(rx_xfer_len - ISER_HEADERS_LEN)); + hdr->itt, length); + + if (iser_check_remote_inv(iser_conn, wc, hdr)) { + iscsi_conn_failure(iser_conn->iscsi_conn, + ISCSI_ERR_CONN_FAILED); + return; + } - iscsi_iser_recv(iser_conn->iscsi_conn, hdr, rx_desc->data, - rx_xfer_len - ISER_HEADERS_LEN); + iscsi_iser_recv(iser_conn->iscsi_conn, hdr, desc->data, length); - ib_dma_sync_single_for_device(ib_conn->device->ib_device, rx_dma, - rx_buflen, DMA_FROM_DEVICE); + ib_dma_sync_single_for_device(ib_conn->device->ib_device, + desc->dma_addr, ISER_RX_PAYLOAD_SIZE, + DMA_FROM_DEVICE); /* decrementing conn->post_recv_buf_count only --after-- freeing the * * task eliminates the need to worry on 
tasks which are completed in * @@ -604,9 +683,6 @@ void iser_rcv_completion(struct iser_rx_desc *rx_desc, * for the posted rx bufs refcount to become zero handles everything */ ib_conn->post_recv_buf_count--; - if (rx_dma == iser_conn->login_resp_dma) - return; - outstanding = ib_conn->post_recv_buf_count; if (outstanding + iser_conn->min_posted_rx <= iser_conn->qp_max_recv_dtos) { count = min(iser_conn->qp_max_recv_dtos - outstanding, @@ -617,26 +693,47 @@ void iser_rcv_completion(struct iser_rx_desc *rx_desc, } } -void iser_snd_completion(struct iser_tx_desc *tx_desc, - struct ib_conn *ib_conn) +void iser_cmd_comp(struct ib_cq *cq, struct ib_wc *wc) { + if (unlikely(wc->status != IB_WC_SUCCESS)) + iser_err_comp(wc, "command"); +} + +void iser_ctrl_comp(struct ib_cq *cq, struct ib_wc *wc) +{ + struct iser_tx_desc *desc = iser_tx(wc->wr_cqe); struct iscsi_task *task; - struct iser_device *device = ib_conn->device; - if (tx_desc->type == ISCSI_TX_DATAOUT) { - ib_dma_unmap_single(device->ib_device, tx_desc->dma_addr, - ISER_HEADERS_LEN, DMA_TO_DEVICE); - kmem_cache_free(ig.desc_cache, tx_desc); - tx_desc = NULL; + if (unlikely(wc->status != IB_WC_SUCCESS)) { + iser_err_comp(wc, "control"); + return; } - if (tx_desc && tx_desc->type == ISCSI_TX_CONTROL) { - /* this arithmetic is legal by libiscsi dd_data allocation */ - task = (void *) ((long)(void *)tx_desc - - sizeof(struct iscsi_task)); - if (task->hdr->itt == RESERVED_ITT) - iscsi_put_task(task); - } + /* this arithmetic is legal by libiscsi dd_data allocation */ + task = (void *)desc - sizeof(struct iscsi_task); + if (task->hdr->itt == RESERVED_ITT) + iscsi_put_task(task); +} + +void iser_dataout_comp(struct ib_cq *cq, struct ib_wc *wc) +{ + struct iser_tx_desc *desc = iser_tx(wc->wr_cqe); + struct ib_conn *ib_conn = wc->qp->qp_context; + struct iser_device *device = ib_conn->device; + + if (unlikely(wc->status != IB_WC_SUCCESS)) + iser_err_comp(wc, "dataout"); + + ib_dma_unmap_single(device->ib_device, desc->dma_addr, + ISER_HEADERS_LEN, DMA_TO_DEVICE); + kmem_cache_free(ig.desc_cache, desc); +} + +void iser_last_comp(struct ib_cq *cq, struct ib_wc *wc) +{ + struct ib_conn *ib_conn = wc->qp->qp_context; + + complete(&ib_conn->last_comp); } void iser_task_rdma_init(struct iscsi_iser_task *iser_task) diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c index ea765fb9664d..9a391cc5b9b3 100644 --- a/drivers/infiniband/ulp/iser/iser_memory.c +++ b/drivers/infiniband/ulp/iser/iser_memory.c @@ -49,7 +49,7 @@ int iser_fast_reg_mr(struct iscsi_iser_task *iser_task, struct iser_reg_resources *rsc, struct iser_mem_reg *mem_reg); -static struct iser_reg_ops fastreg_ops = { +static const struct iser_reg_ops fastreg_ops = { .alloc_reg_res = iser_alloc_fastreg_pool, .free_reg_res = iser_free_fastreg_pool, .reg_mem = iser_fast_reg_mr, @@ -58,7 +58,7 @@ static struct iser_reg_ops fastreg_ops = { .reg_desc_put = iser_reg_desc_put_fr, }; -static struct iser_reg_ops fmr_ops = { +static const struct iser_reg_ops fmr_ops = { .alloc_reg_res = iser_alloc_fmr_pool, .free_reg_res = iser_free_fmr_pool, .reg_mem = iser_fast_reg_fmr, @@ -67,19 +67,24 @@ static struct iser_reg_ops fmr_ops = { .reg_desc_put = iser_reg_desc_put_fmr, }; +void iser_reg_comp(struct ib_cq *cq, struct ib_wc *wc) +{ + iser_err_comp(wc, "memreg"); +} + int iser_assign_reg_ops(struct iser_device *device) { - struct ib_device_attr *dev_attr = &device->dev_attr; + struct ib_device *ib_dev = device->ib_device; /* Assign function handles - based on 
FMR support */ - if (device->ib_device->alloc_fmr && device->ib_device->dealloc_fmr && - device->ib_device->map_phys_fmr && device->ib_device->unmap_fmr) { + if (ib_dev->alloc_fmr && ib_dev->dealloc_fmr && + ib_dev->map_phys_fmr && ib_dev->unmap_fmr) { iser_info("FMR supported, using FMR for registration\n"); device->reg_ops = &fmr_ops; - } else - if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { + } else if (ib_dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { iser_info("FastReg supported, using FastReg for registration\n"); device->reg_ops = &fastreg_ops; + device->remote_inv_sup = iser_always_reg; } else { iser_err("IB device does not support FMRs nor FastRegs, can't register memory\n"); return -1; @@ -131,67 +136,6 @@ iser_reg_desc_put_fmr(struct ib_conn *ib_conn, { } -#define IS_4K_ALIGNED(addr) ((((unsigned long)addr) & ~MASK_4K) == 0) - -/** - * iser_sg_to_page_vec - Translates scatterlist entries to physical addresses - * and returns the length of resulting physical address array (may be less than - * the original due to possible compaction). - * - * we build a "page vec" under the assumption that the SG meets the RDMA - * alignment requirements. Other then the first and last SG elements, all - * the "internal" elements can be compacted into a list whose elements are - * dma addresses of physical pages. The code supports also the weird case - * where --few fragments of the same page-- are present in the SG as - * consecutive elements. Also, it handles one entry SG. - */ - -static int iser_sg_to_page_vec(struct iser_data_buf *data, - struct ib_device *ibdev, u64 *pages, - int *offset, int *data_size) -{ - struct scatterlist *sg, *sgl = data->sg; - u64 start_addr, end_addr, page, chunk_start = 0; - unsigned long total_sz = 0; - unsigned int dma_len; - int i, new_chunk, cur_page, last_ent = data->dma_nents - 1; - - /* compute the offset of first element */ - *offset = (u64) sgl[0].offset & ~MASK_4K; - - new_chunk = 1; - cur_page = 0; - for_each_sg(sgl, sg, data->dma_nents, i) { - start_addr = ib_sg_dma_address(ibdev, sg); - if (new_chunk) - chunk_start = start_addr; - dma_len = ib_sg_dma_len(ibdev, sg); - end_addr = start_addr + dma_len; - total_sz += dma_len; - - /* collect page fragments until aligned or end of SG list */ - if (!IS_4K_ALIGNED(end_addr) && i < last_ent) { - new_chunk = 0; - continue; - } - new_chunk = 1; - - /* address of the first page in the contiguous chunk; - masking relevant for the very first SG entry, - which might be unaligned */ - page = chunk_start & MASK_4K; - do { - pages[cur_page++] = page; - page += SIZE_4K; - } while (page < end_addr); - } - - *data_size = total_sz; - iser_dbg("page_vec->data_size:%d cur_page %d\n", - *data_size, cur_page); - return cur_page; -} - static void iser_data_buf_dump(struct iser_data_buf *data, struct ib_device *ibdev) { @@ -210,10 +154,10 @@ static void iser_dump_page_vec(struct iser_page_vec *page_vec) { int i; - iser_err("page vec length %d data size %d\n", - page_vec->length, page_vec->data_size); - for (i = 0; i < page_vec->length; i++) - iser_err("%d %lx\n",i,(unsigned long)page_vec->pages[i]); + iser_err("page vec npages %d data length %d\n", + page_vec->npages, page_vec->fake_mr.length); + for (i = 0; i < page_vec->npages; i++) + iser_err("vec[%d]: %llx\n", i, page_vec->pages[i]); } int iser_dma_map_task_data(struct iscsi_iser_task *iser_task, @@ -251,7 +195,11 @@ iser_reg_dma(struct iser_device *device, struct iser_data_buf *mem, struct scatterlist *sg = mem->sg; reg->sge.lkey = 
device->pd->local_dma_lkey; - reg->rkey = device->mr->rkey; + /* + * FIXME: rework the registration code path to differentiate + * rkey/lkey use cases + */ + reg->rkey = device->mr ? device->mr->rkey : 0; reg->sge.addr = ib_sg_dma_address(device->ib_device, &sg[0]); reg->sge.length = ib_sg_dma_len(device->ib_device, &sg[0]); @@ -262,11 +210,16 @@ iser_reg_dma(struct iser_device *device, struct iser_data_buf *mem, return 0; } -/** - * iser_reg_page_vec - Register physical memory - * - * returns: 0 on success, errno code on failure - */ +static int iser_set_page(struct ib_mr *mr, u64 addr) +{ + struct iser_page_vec *page_vec = + container_of(mr, struct iser_page_vec, fake_mr); + + page_vec->pages[page_vec->npages++] = addr; + + return 0; +} + static int iser_fast_reg_fmr(struct iscsi_iser_task *iser_task, struct iser_data_buf *mem, @@ -280,22 +233,19 @@ int iser_fast_reg_fmr(struct iscsi_iser_task *iser_task, struct ib_pool_fmr *fmr; int ret, plen; - plen = iser_sg_to_page_vec(mem, device->ib_device, - page_vec->pages, - &page_vec->offset, - &page_vec->data_size); - page_vec->length = plen; - if (plen * SIZE_4K < page_vec->data_size) { + page_vec->npages = 0; + page_vec->fake_mr.page_size = SIZE_4K; + plen = ib_sg_to_pages(&page_vec->fake_mr, mem->sg, + mem->size, iser_set_page); + if (unlikely(plen < mem->size)) { iser_err("page vec too short to hold this SG\n"); iser_data_buf_dump(mem, device->ib_device); iser_dump_page_vec(page_vec); return -EINVAL; } - fmr = ib_fmr_pool_map_phys(fmr_pool, - page_vec->pages, - page_vec->length, - page_vec->pages[0]); + fmr = ib_fmr_pool_map_phys(fmr_pool, page_vec->pages, + page_vec->npages, page_vec->pages[0]); if (IS_ERR(fmr)) { ret = PTR_ERR(fmr); iser_err("ib_fmr_pool_map_phys failed: %d\n", ret); @@ -304,8 +254,8 @@ int iser_fast_reg_fmr(struct iscsi_iser_task *iser_task, reg->sge.lkey = fmr->fmr->lkey; reg->rkey = fmr->fmr->rkey; - reg->sge.addr = page_vec->pages[0] + page_vec->offset; - reg->sge.length = page_vec->data_size; + reg->sge.addr = page_vec->fake_mr.iova; + reg->sge.length = page_vec->fake_mr.length; reg->mem_h = fmr; iser_dbg("fmr reg: lkey=0x%x, rkey=0x%x, addr=0x%llx," @@ -413,19 +363,16 @@ iser_set_prot_checks(struct scsi_cmnd *sc, u8 *mask) *mask |= ISER_CHECK_GUARD; } -static void -iser_inv_rkey(struct ib_send_wr *inv_wr, struct ib_mr *mr) +static inline void +iser_inv_rkey(struct ib_send_wr *inv_wr, + struct ib_mr *mr, + struct ib_cqe *cqe) { - u32 rkey; - inv_wr->opcode = IB_WR_LOCAL_INV; - inv_wr->wr_id = ISER_FASTREG_LI_WRID; + inv_wr->wr_cqe = cqe; inv_wr->ex.invalidate_rkey = mr->rkey; inv_wr->send_flags = 0; inv_wr->num_sge = 0; - - rkey = ib_inc_rkey(mr->rkey); - ib_update_fast_reg_key(mr, rkey); } static int @@ -437,7 +384,9 @@ iser_reg_sig_mr(struct iscsi_iser_task *iser_task, { struct iser_tx_desc *tx_desc = &iser_task->desc; struct ib_sig_attrs *sig_attrs = &tx_desc->sig_attrs; + struct ib_cqe *cqe = &iser_task->iser_conn->ib_conn.reg_cqe; struct ib_sig_handover_wr *wr; + struct ib_mr *mr = pi_ctx->sig_mr; int ret; memset(sig_attrs, 0, sizeof(*sig_attrs)); @@ -447,17 +396,19 @@ iser_reg_sig_mr(struct iscsi_iser_task *iser_task, iser_set_prot_checks(iser_task->sc, &sig_attrs->check_mask); - if (!pi_ctx->sig_mr_valid) - iser_inv_rkey(iser_tx_next_wr(tx_desc), pi_ctx->sig_mr); + if (pi_ctx->sig_mr_valid) + iser_inv_rkey(iser_tx_next_wr(tx_desc), mr, cqe); + + ib_update_fast_reg_key(mr, ib_inc_rkey(mr->rkey)); wr = sig_handover_wr(iser_tx_next_wr(tx_desc)); wr->wr.opcode = IB_WR_REG_SIG_MR; - wr->wr.wr_id = 
ISER_FASTREG_LI_WRID; + wr->wr.wr_cqe = cqe; wr->wr.sg_list = &data_reg->sge; wr->wr.num_sge = 1; wr->wr.send_flags = 0; wr->sig_attrs = sig_attrs; - wr->sig_mr = pi_ctx->sig_mr; + wr->sig_mr = mr; if (scsi_prot_sg_count(iser_task->sc)) wr->prot = &prot_reg->sge; else @@ -465,10 +416,10 @@ iser_reg_sig_mr(struct iscsi_iser_task *iser_task, wr->access_flags = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_WRITE; - pi_ctx->sig_mr_valid = 0; + pi_ctx->sig_mr_valid = 1; - sig_reg->sge.lkey = pi_ctx->sig_mr->lkey; - sig_reg->rkey = pi_ctx->sig_mr->rkey; + sig_reg->sge.lkey = mr->lkey; + sig_reg->rkey = mr->rkey; sig_reg->sge.addr = 0; sig_reg->sge.length = scsi_transfer_length(iser_task->sc); @@ -485,12 +436,15 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task, struct iser_mem_reg *reg) { struct iser_tx_desc *tx_desc = &iser_task->desc; + struct ib_cqe *cqe = &iser_task->iser_conn->ib_conn.reg_cqe; struct ib_mr *mr = rsc->mr; struct ib_reg_wr *wr; int n; - if (!rsc->mr_valid) - iser_inv_rkey(iser_tx_next_wr(tx_desc), mr); + if (rsc->mr_valid) + iser_inv_rkey(iser_tx_next_wr(tx_desc), mr, cqe); + + ib_update_fast_reg_key(mr, ib_inc_rkey(mr->rkey)); n = ib_map_mr_sg(mr, mem->sg, mem->size, SIZE_4K); if (unlikely(n != mem->size)) { @@ -501,7 +455,7 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task, wr = reg_wr(iser_tx_next_wr(tx_desc)); wr->wr.opcode = IB_WR_REG_MR; - wr->wr.wr_id = ISER_FASTREG_LI_WRID; + wr->wr.wr_cqe = cqe; wr->wr.send_flags = 0; wr->wr.num_sge = 0; wr->mr = mr; @@ -510,7 +464,7 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task, IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ; - rsc->mr_valid = 0; + rsc->mr_valid = 1; reg->sge.lkey = mr->lkey; reg->rkey = mr->rkey; @@ -554,7 +508,8 @@ iser_reg_data_sg(struct iscsi_iser_task *task, } int iser_reg_rdma_mem(struct iscsi_iser_task *task, - enum iser_data_dir dir) + enum iser_data_dir dir, + bool all_imm) { struct ib_conn *ib_conn = &task->iser_conn->ib_conn; struct iser_device *device = ib_conn->device; @@ -565,8 +520,8 @@ int iser_reg_rdma_mem(struct iscsi_iser_task *task, bool use_dma_key; int err; - use_dma_key = (mem->dma_nents == 1 && !iser_always_reg && - scsi_get_prot_op(task->sc) == SCSI_PROT_NORMAL); + use_dma_key = mem->dma_nents == 1 && (all_imm || !iser_always_reg) && + scsi_get_prot_op(task->sc) == SCSI_PROT_NORMAL; if (!use_dma_key) { desc = device->reg_ops->reg_desc_get(ib_conn); diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index 42f4da620f2e..40c0f4978e2f 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -44,17 +44,6 @@ #define ISER_MAX_CQ_LEN (ISER_MAX_RX_LEN + ISER_MAX_TX_LEN + \ ISCSI_ISER_MAX_CONN) -static int iser_cq_poll_limit = 512; - -static void iser_cq_tasklet_fn(unsigned long data); -static void iser_cq_callback(struct ib_cq *cq, void *cq_context); - -static void iser_cq_event_callback(struct ib_event *cause, void *context) -{ - iser_err("cq event %s (%d)\n", - ib_event_msg(cause->event), cause->event); -} - static void iser_qp_event_callback(struct ib_event *cause, void *context) { iser_err("qp event %s (%d)\n", @@ -78,59 +67,40 @@ static void iser_event_handler(struct ib_event_handler *handler, */ static int iser_create_device_ib_res(struct iser_device *device) { - struct ib_device_attr *dev_attr = &device->dev_attr; + struct ib_device *ib_dev = device->ib_device; int ret, i, max_cqe; - ret = ib_query_device(device->ib_device, dev_attr); 
- if (ret) { - pr_warn("Query device failed for %s\n", device->ib_device->name); - return ret; - } - ret = iser_assign_reg_ops(device); if (ret) return ret; device->comps_used = min_t(int, num_online_cpus(), - device->ib_device->num_comp_vectors); + ib_dev->num_comp_vectors); device->comps = kcalloc(device->comps_used, sizeof(*device->comps), GFP_KERNEL); if (!device->comps) goto comps_err; - max_cqe = min(ISER_MAX_CQ_LEN, dev_attr->max_cqe); + max_cqe = min(ISER_MAX_CQ_LEN, ib_dev->attrs.max_cqe); iser_info("using %d CQs, device %s supports %d vectors max_cqe %d\n", - device->comps_used, device->ib_device->name, - device->ib_device->num_comp_vectors, max_cqe); + device->comps_used, ib_dev->name, + ib_dev->num_comp_vectors, max_cqe); - device->pd = ib_alloc_pd(device->ib_device); + device->pd = ib_alloc_pd(ib_dev); if (IS_ERR(device->pd)) goto pd_err; for (i = 0; i < device->comps_used; i++) { - struct ib_cq_init_attr cq_attr = {}; struct iser_comp *comp = &device->comps[i]; - comp->device = device; - cq_attr.cqe = max_cqe; - cq_attr.comp_vector = i; - comp->cq = ib_create_cq(device->ib_device, - iser_cq_callback, - iser_cq_event_callback, - (void *)comp, - &cq_attr); + comp->cq = ib_alloc_cq(ib_dev, comp, max_cqe, i, + IB_POLL_SOFTIRQ); if (IS_ERR(comp->cq)) { comp->cq = NULL; goto cq_err; } - - if (ib_req_notify_cq(comp->cq, IB_CQ_NEXT_COMP)) - goto cq_err; - - tasklet_init(&comp->tasklet, iser_cq_tasklet_fn, - (unsigned long)comp); } if (!iser_always_reg) { @@ -140,11 +110,11 @@ static int iser_create_device_ib_res(struct iser_device *device) device->mr = ib_get_dma_mr(device->pd, access); if (IS_ERR(device->mr)) - goto dma_mr_err; + goto cq_err; } - INIT_IB_EVENT_HANDLER(&device->event_handler, device->ib_device, - iser_event_handler); + INIT_IB_EVENT_HANDLER(&device->event_handler, ib_dev, + iser_event_handler); if (ib_register_event_handler(&device->event_handler)) goto handler_err; @@ -153,15 +123,12 @@ static int iser_create_device_ib_res(struct iser_device *device) handler_err: if (device->mr) ib_dereg_mr(device->mr); -dma_mr_err: - for (i = 0; i < device->comps_used; i++) - tasklet_kill(&device->comps[i].tasklet); cq_err: for (i = 0; i < device->comps_used; i++) { struct iser_comp *comp = &device->comps[i]; if (comp->cq) - ib_destroy_cq(comp->cq); + ib_free_cq(comp->cq); } ib_dealloc_pd(device->pd); pd_err: @@ -182,8 +149,7 @@ static void iser_free_device_ib_res(struct iser_device *device) for (i = 0; i < device->comps_used; i++) { struct iser_comp *comp = &device->comps[i]; - tasklet_kill(&comp->tasklet); - ib_destroy_cq(comp->cq); + ib_free_cq(comp->cq); comp->cq = NULL; } @@ -299,7 +265,7 @@ iser_alloc_reg_res(struct ib_device *ib_device, iser_err("Failed to allocate ib_fast_reg_mr err=%d\n", ret); return ret; } - res->mr_valid = 1; + res->mr_valid = 0; return 0; } @@ -336,7 +302,7 @@ iser_alloc_pi_ctx(struct ib_device *ib_device, ret = PTR_ERR(pi_ctx->sig_mr); goto sig_mr_failure; } - pi_ctx->sig_mr_valid = 1; + pi_ctx->sig_mr_valid = 0; desc->pi_ctx->sig_protected = 0; return 0; @@ -461,10 +427,9 @@ void iser_free_fastreg_pool(struct ib_conn *ib_conn) */ static int iser_create_ib_conn_res(struct ib_conn *ib_conn) { - struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn, - ib_conn); + struct iser_conn *iser_conn = to_iser_conn(ib_conn); struct iser_device *device; - struct ib_device_attr *dev_attr; + struct ib_device *ib_dev; struct ib_qp_init_attr init_attr; int ret = -ENOMEM; int index, min_index = 0; @@ -472,7 +437,7 @@ static int 
iser_create_ib_conn_res(struct ib_conn *ib_conn) BUG_ON(ib_conn->device == NULL); device = ib_conn->device; - dev_attr = &device->dev_attr; + ib_dev = device->ib_device; memset(&init_attr, 0, sizeof init_attr); @@ -503,16 +468,16 @@ static int iser_create_ib_conn_res(struct ib_conn *ib_conn) iser_conn->max_cmds = ISER_GET_MAX_XMIT_CMDS(ISER_QP_SIG_MAX_REQ_DTOS); } else { - if (dev_attr->max_qp_wr > ISER_QP_MAX_REQ_DTOS) { + if (ib_dev->attrs.max_qp_wr > ISER_QP_MAX_REQ_DTOS) { init_attr.cap.max_send_wr = ISER_QP_MAX_REQ_DTOS + 1; iser_conn->max_cmds = ISER_GET_MAX_XMIT_CMDS(ISER_QP_MAX_REQ_DTOS); } else { - init_attr.cap.max_send_wr = dev_attr->max_qp_wr; + init_attr.cap.max_send_wr = ib_dev->attrs.max_qp_wr; iser_conn->max_cmds = - ISER_GET_MAX_XMIT_CMDS(dev_attr->max_qp_wr); + ISER_GET_MAX_XMIT_CMDS(ib_dev->attrs.max_qp_wr); iser_dbg("device %s supports max_send_wr %d\n", - device->ib_device->name, dev_attr->max_qp_wr); + device->ib_device->name, ib_dev->attrs.max_qp_wr); } } @@ -724,13 +689,13 @@ int iser_conn_terminate(struct iser_conn *iser_conn) iser_conn, err); /* post an indication that all flush errors were consumed */ - err = ib_post_send(ib_conn->qp, &ib_conn->beacon, &bad_wr); + err = ib_post_send(ib_conn->qp, &ib_conn->last, &bad_wr); if (err) { - iser_err("conn %p failed to post beacon", ib_conn); + iser_err("conn %p failed to post last wr", ib_conn); return 1; } - wait_for_completion(&ib_conn->flush_comp); + wait_for_completion(&ib_conn->last_comp); } return 1; @@ -756,7 +721,7 @@ iser_calc_scsi_params(struct iser_conn *iser_conn, sg_tablesize = DIV_ROUND_UP(max_sectors * 512, SIZE_4K); sup_sg_tablesize = min_t(unsigned, ISCSI_ISER_MAX_SG_TABLESIZE, - device->dev_attr.max_fast_reg_page_list_len); + device->ib_device->attrs.max_fast_reg_page_list_len); if (sg_tablesize > sup_sg_tablesize) { sg_tablesize = sup_sg_tablesize; @@ -799,7 +764,7 @@ static void iser_addr_handler(struct rdma_cm_id *cma_id) /* connection T10-PI support */ if (iser_pi_enable) { - if (!(device->dev_attr.device_cap_flags & + if (!(device->ib_device->attrs.device_cap_flags & IB_DEVICE_SIGNATURE_HANDOVER)) { iser_warn("T10-PI requested but not supported on %s, " "continue without T10-PI\n", @@ -841,16 +806,17 @@ static void iser_route_handler(struct rdma_cm_id *cma_id) goto failure; memset(&conn_param, 0, sizeof conn_param); - conn_param.responder_resources = device->dev_attr.max_qp_rd_atom; + conn_param.responder_resources = device->ib_device->attrs.max_qp_rd_atom; conn_param.initiator_depth = 1; conn_param.retry_count = 7; conn_param.rnr_retry_count = 6; memset(&req_hdr, 0, sizeof(req_hdr)); - req_hdr.flags = (ISER_ZBVA_NOT_SUPPORTED | - ISER_SEND_W_INV_NOT_SUPPORTED); - conn_param.private_data = (void *)&req_hdr; - conn_param.private_data_len = sizeof(struct iser_cm_hdr); + req_hdr.flags = ISER_ZBVA_NOT_SUP; + if (!device->remote_inv_sup) + req_hdr.flags |= ISER_SEND_W_INV_NOT_SUP; + conn_param.private_data = (void *)&req_hdr; + conn_param.private_data_len = sizeof(struct iser_cm_hdr); ret = rdma_connect(cma_id, &conn_param); if (ret) { @@ -863,7 +829,8 @@ failure: iser_connect_error(cma_id); } -static void iser_connected_handler(struct rdma_cm_id *cma_id) +static void iser_connected_handler(struct rdma_cm_id *cma_id, + const void *private_data) { struct iser_conn *iser_conn; struct ib_qp_attr attr; @@ -877,6 +844,15 @@ static void iser_connected_handler(struct rdma_cm_id *cma_id) (void)ib_query_qp(cma_id->qp, &attr, ~0, &init_attr); iser_info("remote qpn:%x my qpn:%x\n", attr.dest_qp_num, 
cma_id->qp->qp_num); + if (private_data) { + u8 flags = *(u8 *)private_data; + + iser_conn->snd_w_inv = !(flags & ISER_SEND_W_INV_NOT_SUP); + } + + iser_info("conn %p: negotiated %s invalidation\n", + iser_conn, iser_conn->snd_w_inv ? "remote" : "local"); + iser_conn->state = ISER_CONN_UP; complete(&iser_conn->up_completion); } @@ -928,7 +904,7 @@ static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *eve iser_route_handler(cma_id); break; case RDMA_CM_EVENT_ESTABLISHED: - iser_connected_handler(cma_id); + iser_connected_handler(cma_id, event->param.conn.private_data); break; case RDMA_CM_EVENT_ADDR_ERROR: case RDMA_CM_EVENT_ROUTE_ERROR: @@ -967,14 +943,21 @@ static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *eve void iser_conn_init(struct iser_conn *iser_conn) { + struct ib_conn *ib_conn = &iser_conn->ib_conn; + iser_conn->state = ISER_CONN_INIT; - iser_conn->ib_conn.post_recv_buf_count = 0; - init_completion(&iser_conn->ib_conn.flush_comp); init_completion(&iser_conn->stop_completion); init_completion(&iser_conn->ib_completion); init_completion(&iser_conn->up_completion); INIT_LIST_HEAD(&iser_conn->conn_list); mutex_init(&iser_conn->state_mutex); + + ib_conn->post_recv_buf_count = 0; + ib_conn->reg_cqe.done = iser_reg_comp; + ib_conn->last_cqe.done = iser_last_comp; + ib_conn->last.wr_cqe = &ib_conn->last_cqe; + ib_conn->last.opcode = IB_WR_SEND; + init_completion(&ib_conn->last_comp); } /** @@ -1000,9 +983,6 @@ int iser_connect(struct iser_conn *iser_conn, iser_conn->state = ISER_CONN_PENDING; - ib_conn->beacon.wr_id = ISER_BEACON_WRID; - ib_conn->beacon.opcode = IB_WR_SEND; - ib_conn->cma_id = rdma_create_id(&init_net, iser_cma_handler, (void *)iser_conn, RDMA_PS_TCP, IB_QPT_RC); @@ -1045,56 +1025,60 @@ connect_failure: int iser_post_recvl(struct iser_conn *iser_conn) { - struct ib_recv_wr rx_wr, *rx_wr_failed; struct ib_conn *ib_conn = &iser_conn->ib_conn; - struct ib_sge sge; + struct iser_login_desc *desc = &iser_conn->login_desc; + struct ib_recv_wr wr, *wr_failed; int ib_ret; - sge.addr = iser_conn->login_resp_dma; - sge.length = ISER_RX_LOGIN_SIZE; - sge.lkey = ib_conn->device->pd->local_dma_lkey; + desc->sge.addr = desc->rsp_dma; + desc->sge.length = ISER_RX_LOGIN_SIZE; + desc->sge.lkey = ib_conn->device->pd->local_dma_lkey; - rx_wr.wr_id = (uintptr_t)iser_conn->login_resp_buf; - rx_wr.sg_list = &sge; - rx_wr.num_sge = 1; - rx_wr.next = NULL; + desc->cqe.done = iser_login_rsp; + wr.wr_cqe = &desc->cqe; + wr.sg_list = &desc->sge; + wr.num_sge = 1; + wr.next = NULL; ib_conn->post_recv_buf_count++; - ib_ret = ib_post_recv(ib_conn->qp, &rx_wr, &rx_wr_failed); + ib_ret = ib_post_recv(ib_conn->qp, &wr, &wr_failed); if (ib_ret) { iser_err("ib_post_recv failed ret=%d\n", ib_ret); ib_conn->post_recv_buf_count--; } + return ib_ret; } int iser_post_recvm(struct iser_conn *iser_conn, int count) { - struct ib_recv_wr *rx_wr, *rx_wr_failed; - int i, ib_ret; struct ib_conn *ib_conn = &iser_conn->ib_conn; unsigned int my_rx_head = iser_conn->rx_desc_head; struct iser_rx_desc *rx_desc; + struct ib_recv_wr *wr, *wr_failed; + int i, ib_ret; - for (rx_wr = ib_conn->rx_wr, i = 0; i < count; i++, rx_wr++) { - rx_desc = &iser_conn->rx_descs[my_rx_head]; - rx_wr->wr_id = (uintptr_t)rx_desc; - rx_wr->sg_list = &rx_desc->rx_sg; - rx_wr->num_sge = 1; - rx_wr->next = rx_wr + 1; + for (wr = ib_conn->rx_wr, i = 0; i < count; i++, wr++) { + rx_desc = &iser_conn->rx_descs[my_rx_head]; + rx_desc->cqe.done = iser_task_rsp; + wr->wr_cqe = &rx_desc->cqe; + 
wr->sg_list = &rx_desc->rx_sg; + wr->num_sge = 1; + wr->next = wr + 1; my_rx_head = (my_rx_head + 1) & iser_conn->qp_max_recv_dtos_mask; } - rx_wr--; - rx_wr->next = NULL; /* mark end of work requests list */ + wr--; + wr->next = NULL; /* mark end of work requests list */ ib_conn->post_recv_buf_count += count; - ib_ret = ib_post_recv(ib_conn->qp, ib_conn->rx_wr, &rx_wr_failed); + ib_ret = ib_post_recv(ib_conn->qp, ib_conn->rx_wr, &wr_failed); if (ib_ret) { iser_err("ib_post_recv failed ret=%d\n", ib_ret); ib_conn->post_recv_buf_count -= count; } else iser_conn->rx_desc_head = my_rx_head; + return ib_ret; } @@ -1115,7 +1099,7 @@ int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc, DMA_TO_DEVICE); wr->next = NULL; - wr->wr_id = (uintptr_t)tx_desc; + wr->wr_cqe = &tx_desc->cqe; wr->sg_list = tx_desc->tx_sg; wr->num_sge = tx_desc->num_sge; wr->opcode = IB_WR_SEND; @@ -1129,149 +1113,6 @@ int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc, return ib_ret; } -/** - * is_iser_tx_desc - Indicate if the completion wr_id - * is a TX descriptor or not. - * @iser_conn: iser connection - * @wr_id: completion WR identifier - * - * Since we cannot rely on wc opcode in FLUSH errors - * we must work around it by checking if the wr_id address - * falls in the iser connection rx_descs buffer. If so - * it is an RX descriptor, otherwize it is a TX. - */ -static inline bool -is_iser_tx_desc(struct iser_conn *iser_conn, void *wr_id) -{ - void *start = iser_conn->rx_descs; - int len = iser_conn->num_rx_descs * sizeof(*iser_conn->rx_descs); - - if (wr_id >= start && wr_id < start + len) - return false; - - return true; -} - -/** - * iser_handle_comp_error() - Handle error completion - * @ib_conn: connection RDMA resources - * @wc: work completion - * - * Notes: We may handle a FLUSH error completion and in this case - * we only cleanup in case TX type was DATAOUT. For non-FLUSH - * error completion we should also notify iscsi layer that - * connection is failed (in case we passed bind stage). - */ -static void -iser_handle_comp_error(struct ib_conn *ib_conn, - struct ib_wc *wc) -{ - void *wr_id = (void *)(uintptr_t)wc->wr_id; - struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn, - ib_conn); - - if (wc->status != IB_WC_WR_FLUSH_ERR) - if (iser_conn->iscsi_conn) - iscsi_conn_failure(iser_conn->iscsi_conn, - ISCSI_ERR_CONN_FAILED); - - if (wc->wr_id == ISER_FASTREG_LI_WRID) - return; - - if (is_iser_tx_desc(iser_conn, wr_id)) { - struct iser_tx_desc *desc = wr_id; - - if (desc->type == ISCSI_TX_DATAOUT) - kmem_cache_free(ig.desc_cache, desc); - } else { - ib_conn->post_recv_buf_count--; - } -} - -/** - * iser_handle_wc - handle a single work completion - * @wc: work completion - * - * Soft-IRQ context, work completion can be either - * SEND or RECV, and can turn out successful or - * with error (or flush error). 
- */ -static void iser_handle_wc(struct ib_wc *wc) -{ - struct ib_conn *ib_conn; - struct iser_tx_desc *tx_desc; - struct iser_rx_desc *rx_desc; - - ib_conn = wc->qp->qp_context; - if (likely(wc->status == IB_WC_SUCCESS)) { - if (wc->opcode == IB_WC_RECV) { - rx_desc = (struct iser_rx_desc *)(uintptr_t)wc->wr_id; - iser_rcv_completion(rx_desc, wc->byte_len, - ib_conn); - } else - if (wc->opcode == IB_WC_SEND) { - tx_desc = (struct iser_tx_desc *)(uintptr_t)wc->wr_id; - iser_snd_completion(tx_desc, ib_conn); - } else { - iser_err("Unknown wc opcode %d\n", wc->opcode); - } - } else { - if (wc->status != IB_WC_WR_FLUSH_ERR) - iser_err("%s (%d): wr id %llx vend_err %x\n", - ib_wc_status_msg(wc->status), wc->status, - wc->wr_id, wc->vendor_err); - else - iser_dbg("%s (%d): wr id %llx\n", - ib_wc_status_msg(wc->status), wc->status, - wc->wr_id); - - if (wc->wr_id == ISER_BEACON_WRID) - /* all flush errors were consumed */ - complete(&ib_conn->flush_comp); - else - iser_handle_comp_error(ib_conn, wc); - } -} - -/** - * iser_cq_tasklet_fn - iSER completion polling loop - * @data: iSER completion context - * - * Soft-IRQ context, polling connection CQ until - * either CQ was empty or we exausted polling budget - */ -static void iser_cq_tasklet_fn(unsigned long data) -{ - struct iser_comp *comp = (struct iser_comp *)data; - struct ib_cq *cq = comp->cq; - struct ib_wc *const wcs = comp->wcs; - int i, n, completed = 0; - - while ((n = ib_poll_cq(cq, ARRAY_SIZE(comp->wcs), wcs)) > 0) { - for (i = 0; i < n; i++) - iser_handle_wc(&wcs[i]); - - completed += n; - if (completed >= iser_cq_poll_limit) - break; - } - - /* - * It is assumed here that arming CQ only once its empty - * would not cause interrupts to be missed. - */ - ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); - - iser_dbg("got %d completions\n", completed); -} - -static void iser_cq_callback(struct ib_cq *cq, void *cq_context) -{ - struct iser_comp *comp = cq_context; - - tasklet_schedule(&comp->tasklet); -} - u8 iser_check_task_pi_status(struct iscsi_iser_task *iser_task, enum iser_data_dir cmd_dir, sector_t *sector) { @@ -1319,3 +1160,21 @@ err: /* Not alot we can do here, return ambiguous guard error */ return 0x1; } + +void iser_err_comp(struct ib_wc *wc, const char *type) +{ + if (wc->status != IB_WC_WR_FLUSH_ERR) { + struct iser_conn *iser_conn = to_iser_conn(wc->qp->qp_context); + + iser_err("%s failure: %s (%d) vend_err %x\n", type, + ib_wc_status_msg(wc->status), wc->status, + wc->vendor_err); + + if (iser_conn->iscsi_conn) + iscsi_conn_failure(iser_conn->iscsi_conn, + ISCSI_ERR_CONN_FAILED); + } else { + iser_dbg("%s failure: %s (%d)\n", type, + ib_wc_status_msg(wc->status), wc->status); + } +} diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c index 468c5e132563..f121e6129339 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.c +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -29,7 +29,6 @@ #include <target/iscsi/iscsi_transport.h> #include <linux/semaphore.h> -#include "isert_proto.h" #include "ib_isert.h" #define ISERT_MAX_CONN 8 @@ -95,22 +94,6 @@ isert_qp_event_callback(struct ib_event *e, void *context) } } -static int -isert_query_device(struct ib_device *ib_dev, struct ib_device_attr *devattr) -{ - int ret; - - ret = ib_query_device(ib_dev, devattr); - if (ret) { - isert_err("ib_query_device() failed: %d\n", ret); - return ret; - } - isert_dbg("devattr->max_sge: %d\n", devattr->max_sge); - isert_dbg("devattr->max_sge_rd: %d\n", devattr->max_sge_rd); - - return 0; -} - static struct 
isert_comp * isert_comp_get(struct isert_conn *isert_conn) { @@ -157,9 +140,9 @@ isert_create_qp(struct isert_conn *isert_conn, attr.recv_cq = comp->cq; attr.cap.max_send_wr = ISERT_QP_MAX_REQ_DTOS; attr.cap.max_recv_wr = ISERT_QP_MAX_RECV_DTOS + 1; - attr.cap.max_send_sge = device->dev_attr.max_sge; - isert_conn->max_sge = min(device->dev_attr.max_sge, - device->dev_attr.max_sge_rd); + attr.cap.max_send_sge = device->ib_device->attrs.max_sge; + isert_conn->max_sge = min(device->ib_device->attrs.max_sge, + device->ib_device->attrs.max_sge_rd); attr.cap.max_recv_sge = 1; attr.sq_sig_type = IB_SIGNAL_REQ_WR; attr.qp_type = IB_QPT_RC; @@ -287,8 +270,7 @@ isert_free_comps(struct isert_device *device) } static int -isert_alloc_comps(struct isert_device *device, - struct ib_device_attr *attr) +isert_alloc_comps(struct isert_device *device) { int i, max_cqe, ret = 0; @@ -308,7 +290,7 @@ isert_alloc_comps(struct isert_device *device, return -ENOMEM; } - max_cqe = min(ISER_MAX_CQ_LEN, attr->max_cqe); + max_cqe = min(ISER_MAX_CQ_LEN, device->ib_device->attrs.max_cqe); for (i = 0; i < device->comps_used; i++) { struct ib_cq_init_attr cq_attr = {}; @@ -344,17 +326,15 @@ out_cq: static int isert_create_device_ib_res(struct isert_device *device) { - struct ib_device_attr *dev_attr; + struct ib_device *ib_dev = device->ib_device; int ret; - dev_attr = &device->dev_attr; - ret = isert_query_device(device->ib_device, dev_attr); - if (ret) - goto out; + isert_dbg("devattr->max_sge: %d\n", ib_dev->attrs.max_sge); + isert_dbg("devattr->max_sge_rd: %d\n", ib_dev->attrs.max_sge_rd); /* asign function handlers */ - if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS && - dev_attr->device_cap_flags & IB_DEVICE_SIGNATURE_HANDOVER) { + if (ib_dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS && + ib_dev->attrs.device_cap_flags & IB_DEVICE_SIGNATURE_HANDOVER) { device->use_fastreg = 1; device->reg_rdma_mem = isert_reg_rdma; device->unreg_rdma_mem = isert_unreg_rdma; @@ -364,11 +344,11 @@ isert_create_device_ib_res(struct isert_device *device) device->unreg_rdma_mem = isert_unmap_cmd; } - ret = isert_alloc_comps(device, dev_attr); + ret = isert_alloc_comps(device); if (ret) goto out; - device->pd = ib_alloc_pd(device->ib_device); + device->pd = ib_alloc_pd(ib_dev); if (IS_ERR(device->pd)) { ret = PTR_ERR(device->pd); isert_err("failed to allocate pd, device %p, ret=%d\n", @@ -377,7 +357,7 @@ isert_create_device_ib_res(struct isert_device *device) } /* Check signature cap */ - device->pi_capable = dev_attr->device_cap_flags & + device->pi_capable = ib_dev->attrs.device_cap_flags & IB_DEVICE_SIGNATURE_HANDOVER ? 
true : false; return 0; @@ -676,6 +656,32 @@ out_login_buf: return ret; } +static void +isert_set_nego_params(struct isert_conn *isert_conn, + struct rdma_conn_param *param) +{ + struct ib_device_attr *attr = &isert_conn->device->ib_device->attrs; + + /* Set max inflight RDMA READ requests */ + isert_conn->initiator_depth = min_t(u8, param->initiator_depth, + attr->max_qp_init_rd_atom); + isert_dbg("Using initiator_depth: %u\n", isert_conn->initiator_depth); + + if (param->private_data) { + u8 flags = *(u8 *)param->private_data; + + /* + * use remote invalidation if both the initiator + * and the HCA support it + */ + isert_conn->snd_w_inv = !(flags & ISER_SEND_W_INV_NOT_SUP) && + (attr->device_cap_flags & + IB_DEVICE_MEM_MGT_EXTENSIONS); + if (isert_conn->snd_w_inv) + isert_info("Using remote invalidation\n"); + } +} + static int isert_connect_request(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) { @@ -714,11 +720,7 @@ isert_connect_request(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) } isert_conn->device = device; - /* Set max inflight RDMA READ requests */ - isert_conn->initiator_depth = min_t(u8, - event->param.conn.initiator_depth, - device->dev_attr.max_qp_init_rd_atom); - isert_dbg("Using initiator_depth: %u\n", isert_conn->initiator_depth); + isert_set_nego_params(isert_conn, &event->param.conn); ret = isert_conn_setup_qp(isert_conn, cma_id); if (ret) @@ -1050,8 +1052,8 @@ isert_create_send_desc(struct isert_conn *isert_conn, ib_dma_sync_single_for_cpu(ib_dev, tx_desc->dma_addr, ISER_HEADERS_LEN, DMA_TO_DEVICE); - memset(&tx_desc->iser_header, 0, sizeof(struct iser_hdr)); - tx_desc->iser_header.flags = ISER_VER; + memset(&tx_desc->iser_header, 0, sizeof(struct iser_ctrl)); + tx_desc->iser_header.flags = ISCSI_CTRL; tx_desc->num_sge = 1; tx_desc->isert_cmd = isert_cmd; @@ -1097,7 +1099,14 @@ isert_init_send_wr(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd, isert_cmd->rdma_wr.iser_ib_op = ISER_IB_SEND; send_wr->wr_id = (uintptr_t)&isert_cmd->tx_desc; - send_wr->opcode = IB_WR_SEND; + + if (isert_conn->snd_w_inv && isert_cmd->inv_rkey) { + send_wr->opcode = IB_WR_SEND_WITH_INV; + send_wr->ex.invalidate_rkey = isert_cmd->inv_rkey; + } else { + send_wr->opcode = IB_WR_SEND; + } + send_wr->sg_list = &tx_desc->tx_sg[0]; send_wr->num_sge = isert_cmd->tx_desc.num_sge; send_wr->send_flags = IB_SEND_SIGNALED; @@ -1486,6 +1495,7 @@ isert_rx_opcode(struct isert_conn *isert_conn, struct iser_rx_desc *rx_desc, isert_cmd->read_va = read_va; isert_cmd->write_stag = write_stag; isert_cmd->write_va = write_va; + isert_cmd->inv_rkey = read_stag ?
read_stag : write_stag; ret = isert_handle_scsi_cmd(isert_conn, isert_cmd, cmd, rx_desc, (unsigned char *)hdr); @@ -1543,21 +1553,21 @@ isert_rx_opcode(struct isert_conn *isert_conn, struct iser_rx_desc *rx_desc, static void isert_rx_do_work(struct iser_rx_desc *rx_desc, struct isert_conn *isert_conn) { - struct iser_hdr *iser_hdr = &rx_desc->iser_header; + struct iser_ctrl *iser_ctrl = &rx_desc->iser_header; uint64_t read_va = 0, write_va = 0; uint32_t read_stag = 0, write_stag = 0; - switch (iser_hdr->flags & 0xF0) { + switch (iser_ctrl->flags & 0xF0) { case ISCSI_CTRL: - if (iser_hdr->flags & ISER_RSV) { - read_stag = be32_to_cpu(iser_hdr->read_stag); - read_va = be64_to_cpu(iser_hdr->read_va); + if (iser_ctrl->flags & ISER_RSV) { + read_stag = be32_to_cpu(iser_ctrl->read_stag); + read_va = be64_to_cpu(iser_ctrl->read_va); isert_dbg("ISER_RSV: read_stag: 0x%x read_va: 0x%llx\n", read_stag, (unsigned long long)read_va); } - if (iser_hdr->flags & ISER_WSV) { - write_stag = be32_to_cpu(iser_hdr->write_stag); - write_va = be64_to_cpu(iser_hdr->write_va); + if (iser_ctrl->flags & ISER_WSV) { + write_stag = be32_to_cpu(iser_ctrl->write_stag); + write_va = be64_to_cpu(iser_ctrl->write_va); isert_dbg("ISER_WSV: write_stag: 0x%x write_va: 0x%llx\n", write_stag, (unsigned long long)write_va); } @@ -1568,7 +1578,7 @@ isert_rx_do_work(struct iser_rx_desc *rx_desc, struct isert_conn *isert_conn) isert_err("iSER Hello message\n"); break; default: - isert_warn("Unknown iSER hdr flags: 0x%02x\n", iser_hdr->flags); + isert_warn("Unknown iSER hdr flags: 0x%02x\n", iser_ctrl->flags); break; } @@ -3095,12 +3105,20 @@ isert_rdma_accept(struct isert_conn *isert_conn) struct rdma_cm_id *cm_id = isert_conn->cm_id; struct rdma_conn_param cp; int ret; + struct iser_cm_hdr rsp_hdr; memset(&cp, 0, sizeof(struct rdma_conn_param)); cp.initiator_depth = isert_conn->initiator_depth; cp.retry_count = 7; cp.rnr_retry_count = 7; + memset(&rsp_hdr, 0, sizeof(rsp_hdr)); + rsp_hdr.flags = ISERT_ZBVA_NOT_USED; + if (!isert_conn->snd_w_inv) + rsp_hdr.flags = rsp_hdr.flags | ISERT_SEND_W_INV_NOT_USED; + cp.private_data = (void *)&rsp_hdr; + cp.private_data_len = sizeof(rsp_hdr); + ret = rdma_accept(cm_id, &cp); if (ret) { isert_err("rdma_accept() failed with: %d\n", ret); diff --git a/drivers/infiniband/ulp/isert/ib_isert.h b/drivers/infiniband/ulp/isert/ib_isert.h index 3d7fbc47c343..8d50453eef66 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.h +++ b/drivers/infiniband/ulp/isert/ib_isert.h @@ -3,6 +3,8 @@ #include <linux/in6.h> #include <rdma/ib_verbs.h> #include <rdma/rdma_cm.h> +#include <scsi/iser.h> + #define DRV_NAME "isert" #define PFX DRV_NAME ": " @@ -31,6 +33,38 @@ #define isert_err(fmt, arg...) 
\ pr_err(PFX "%s: " fmt, __func__ , ## arg) +/* Constant PDU lengths calculations */ +#define ISER_HEADERS_LEN (sizeof(struct iser_ctrl) + \ + sizeof(struct iscsi_hdr)) +#define ISER_RECV_DATA_SEG_LEN 8192 +#define ISER_RX_PAYLOAD_SIZE (ISER_HEADERS_LEN + ISER_RECV_DATA_SEG_LEN) +#define ISER_RX_LOGIN_SIZE (ISER_HEADERS_LEN + ISCSI_DEF_MAX_RECV_SEG_LEN) + +/* QP settings */ +/* Maximal bounds on received asynchronous PDUs */ +#define ISERT_MAX_TX_MISC_PDUS 4 /* NOOP_IN(2) , ASYNC_EVENT(2) */ + +#define ISERT_MAX_RX_MISC_PDUS 6 /* + * NOOP_OUT(2), TEXT(1), + * SCSI_TMFUNC(2), LOGOUT(1) + */ + +#define ISCSI_DEF_XMIT_CMDS_MAX 128 /* from libiscsi.h, must be power of 2 */ + +#define ISERT_QP_MAX_RECV_DTOS (ISCSI_DEF_XMIT_CMDS_MAX) + +#define ISERT_MIN_POSTED_RX (ISCSI_DEF_XMIT_CMDS_MAX >> 2) + +#define ISERT_INFLIGHT_DATAOUTS 8 + +#define ISERT_QP_MAX_REQ_DTOS (ISCSI_DEF_XMIT_CMDS_MAX * \ + (1 + ISERT_INFLIGHT_DATAOUTS) + \ + ISERT_MAX_TX_MISC_PDUS + \ + ISERT_MAX_RX_MISC_PDUS) + +#define ISER_RX_PAD_SIZE (ISER_RECV_DATA_SEG_LEN + 4096 - \ + (ISER_RX_PAYLOAD_SIZE + sizeof(u64) + sizeof(struct ib_sge))) + #define ISCSI_ISER_SG_TABLESIZE 256 #define ISER_FASTREG_LI_WRID 0xffffffffffffffffULL #define ISER_BEACON_WRID 0xfffffffffffffffeULL @@ -56,7 +90,7 @@ enum iser_conn_state { }; struct iser_rx_desc { - struct iser_hdr iser_header; + struct iser_ctrl iser_header; struct iscsi_hdr iscsi_header; char data[ISER_RECV_DATA_SEG_LEN]; u64 dma_addr; @@ -65,7 +99,7 @@ struct iser_rx_desc { } __packed; struct iser_tx_desc { - struct iser_hdr iser_header; + struct iser_ctrl iser_header; struct iscsi_hdr iscsi_header; enum isert_desc_type type; u64 dma_addr; @@ -129,6 +163,7 @@ struct isert_cmd { uint32_t write_stag; uint64_t read_va; uint64_t write_va; + uint32_t inv_rkey; u64 pdu_buf_dma; u32 pdu_buf_len; struct isert_conn *conn; @@ -176,6 +211,7 @@ struct isert_conn { struct work_struct release_work; struct ib_recv_wr beacon; bool logout_posted; + bool snd_w_inv; }; #define ISERT_MAX_CQ 64 @@ -207,7 +243,6 @@ struct isert_device { struct isert_comp *comps; int comps_used; struct list_head dev_node; - struct ib_device_attr dev_attr; int (*reg_rdma_mem)(struct iscsi_conn *conn, struct iscsi_cmd *cmd, struct isert_rdma_wr *wr); diff --git a/drivers/infiniband/ulp/isert/isert_proto.h b/drivers/infiniband/ulp/isert/isert_proto.h deleted file mode 100644 index 4dccd313b777..000000000000 --- a/drivers/infiniband/ulp/isert/isert_proto.h +++ /dev/null @@ -1,47 +0,0 @@ -/* From iscsi_iser.h */ - -struct iser_hdr { - u8 flags; - u8 rsvd[3]; - __be32 write_stag; /* write rkey */ - __be64 write_va; - __be32 read_stag; /* read rkey */ - __be64 read_va; -} __packed; - -/*Constant PDU lengths calculations */ -#define ISER_HEADERS_LEN (sizeof(struct iser_hdr) + sizeof(struct iscsi_hdr)) - -#define ISER_RECV_DATA_SEG_LEN 8192 -#define ISER_RX_PAYLOAD_SIZE (ISER_HEADERS_LEN + ISER_RECV_DATA_SEG_LEN) -#define ISER_RX_LOGIN_SIZE (ISER_HEADERS_LEN + ISCSI_DEF_MAX_RECV_SEG_LEN) - -/* QP settings */ -/* Maximal bounds on received asynchronous PDUs */ -#define ISERT_MAX_TX_MISC_PDUS 4 /* NOOP_IN(2) , ASYNC_EVENT(2) */ - -#define ISERT_MAX_RX_MISC_PDUS 6 /* NOOP_OUT(2), TEXT(1), * - * SCSI_TMFUNC(2), LOGOUT(1) */ - -#define ISCSI_DEF_XMIT_CMDS_MAX 128 /* from libiscsi.h, must be power of 2 */ - -#define ISERT_QP_MAX_RECV_DTOS (ISCSI_DEF_XMIT_CMDS_MAX) - -#define ISERT_MIN_POSTED_RX (ISCSI_DEF_XMIT_CMDS_MAX >> 2) - -#define ISERT_INFLIGHT_DATAOUTS 8 - -#define ISERT_QP_MAX_REQ_DTOS (ISCSI_DEF_XMIT_CMDS_MAX * \ - (1 + 
ISERT_INFLIGHT_DATAOUTS) + \ - ISERT_MAX_TX_MISC_PDUS + \ - ISERT_MAX_RX_MISC_PDUS) - -#define ISER_RX_PAD_SIZE (ISER_RECV_DATA_SEG_LEN + 4096 - \ - (ISER_RX_PAYLOAD_SIZE + sizeof(u64) + sizeof(struct ib_sge))) - -#define ISER_VER 0x10 -#define ISER_WSV 0x08 -#define ISER_RSV 0x04 -#define ISCSI_CTRL 0x10 -#define ISER_HELLO 0x20 -#define ISER_HELLORPLY 0x30 diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 3db9a659719b..03022f6420d7 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -132,8 +132,9 @@ MODULE_PARM_DESC(ch_count, static void srp_add_one(struct ib_device *device); static void srp_remove_one(struct ib_device *device, void *client_data); -static void srp_recv_completion(struct ib_cq *cq, void *ch_ptr); -static void srp_send_completion(struct ib_cq *cq, void *ch_ptr); +static void srp_recv_done(struct ib_cq *cq, struct ib_wc *wc); +static void srp_handle_qp_err(struct ib_cq *cq, struct ib_wc *wc, + const char *opname); static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event); static struct scsi_transport_template *ib_srp_transport_template; @@ -445,6 +446,17 @@ static struct srp_fr_pool *srp_alloc_fr_pool(struct srp_target_port *target) dev->max_pages_per_mr); } +static void srp_drain_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct srp_rdma_ch *ch = cq->cq_context; + + complete(&ch->done); +} + +static struct ib_cqe srp_drain_cqe = { + .done = srp_drain_done, +}; + /** * srp_destroy_qp() - destroy an RDMA queue pair * @ch: SRP RDMA channel. @@ -457,10 +469,11 @@ static struct srp_fr_pool *srp_alloc_fr_pool(struct srp_target_port *target) static void srp_destroy_qp(struct srp_rdma_ch *ch) { static struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR }; - static struct ib_recv_wr wr = { .wr_id = SRP_LAST_WR_ID }; + static struct ib_recv_wr wr = { 0 }; struct ib_recv_wr *bad_wr; int ret; + wr.wr_cqe = &srp_drain_cqe; /* Destroying a QP and reusing ch->done is only safe if not connected */ WARN_ON_ONCE(ch->connected); @@ -489,34 +502,27 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch) struct ib_fmr_pool *fmr_pool = NULL; struct srp_fr_pool *fr_pool = NULL; const int m = dev->use_fast_reg ? 
3 : 1; - struct ib_cq_init_attr cq_attr = {}; int ret; init_attr = kzalloc(sizeof *init_attr, GFP_KERNEL); if (!init_attr) return -ENOMEM; - /* + 1 for SRP_LAST_WR_ID */ - cq_attr.cqe = target->queue_size + 1; - cq_attr.comp_vector = ch->comp_vector; - recv_cq = ib_create_cq(dev->dev, srp_recv_completion, NULL, ch, - &cq_attr); + /* queue_size + 1 for ib_drain_qp */ + recv_cq = ib_alloc_cq(dev->dev, ch, target->queue_size + 1, + ch->comp_vector, IB_POLL_SOFTIRQ); if (IS_ERR(recv_cq)) { ret = PTR_ERR(recv_cq); goto err; } - cq_attr.cqe = m * target->queue_size; - cq_attr.comp_vector = ch->comp_vector; - send_cq = ib_create_cq(dev->dev, srp_send_completion, NULL, ch, - &cq_attr); + send_cq = ib_alloc_cq(dev->dev, ch, m * target->queue_size, + ch->comp_vector, IB_POLL_DIRECT); if (IS_ERR(send_cq)) { ret = PTR_ERR(send_cq); goto err_recv_cq; } - ib_req_notify_cq(recv_cq, IB_CQ_NEXT_COMP); - init_attr->event_handler = srp_qp_event; init_attr->cap.max_send_wr = m * target->queue_size; init_attr->cap.max_recv_wr = target->queue_size + 1; @@ -558,9 +564,9 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch) if (ch->qp) srp_destroy_qp(ch); if (ch->recv_cq) - ib_destroy_cq(ch->recv_cq); + ib_free_cq(ch->recv_cq); if (ch->send_cq) - ib_destroy_cq(ch->send_cq); + ib_free_cq(ch->send_cq); ch->qp = qp; ch->recv_cq = recv_cq; @@ -580,13 +586,13 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch) return 0; err_qp: - ib_destroy_qp(qp); + srp_destroy_qp(ch); err_send_cq: - ib_destroy_cq(send_cq); + ib_free_cq(send_cq); err_recv_cq: - ib_destroy_cq(recv_cq); + ib_free_cq(recv_cq); err: kfree(init_attr); @@ -622,9 +628,10 @@ static void srp_free_ch_ib(struct srp_target_port *target, if (ch->fmr_pool) ib_destroy_fmr_pool(ch->fmr_pool); } + srp_destroy_qp(ch); - ib_destroy_cq(ch->send_cq); - ib_destroy_cq(ch->recv_cq); + ib_free_cq(ch->send_cq); + ib_free_cq(ch->recv_cq); /* * Avoid that the SCSI error handler tries to use this channel after @@ -1041,18 +1048,25 @@ out: return ret <= 0 ? 
ret : -ENODEV; } -static int srp_inv_rkey(struct srp_rdma_ch *ch, u32 rkey) +static void srp_inv_rkey_err_done(struct ib_cq *cq, struct ib_wc *wc) +{ + srp_handle_qp_err(cq, wc, "INV RKEY"); +} + +static int srp_inv_rkey(struct srp_request *req, struct srp_rdma_ch *ch, + u32 rkey) { struct ib_send_wr *bad_wr; struct ib_send_wr wr = { .opcode = IB_WR_LOCAL_INV, - .wr_id = LOCAL_INV_WR_ID_MASK, .next = NULL, .num_sge = 0, .send_flags = 0, .ex.invalidate_rkey = rkey, }; + wr.wr_cqe = &req->reg_cqe; + req->reg_cqe.done = srp_inv_rkey_err_done; return ib_post_send(ch->qp, &wr, &bad_wr); } @@ -1074,7 +1088,7 @@ static void srp_unmap_data(struct scsi_cmnd *scmnd, struct srp_fr_desc **pfr; for (i = req->nmdesc, pfr = req->fr_list; i > 0; i--, pfr++) { - res = srp_inv_rkey(ch, (*pfr)->mr->rkey); + res = srp_inv_rkey(req, ch, (*pfr)->mr->rkey); if (res < 0) { shost_printk(KERN_ERR, target->scsi_host, PFX "Queueing INV WR for rkey %#x failed (%d)\n", @@ -1312,7 +1326,13 @@ reset_state: return 0; } +static void srp_reg_mr_err_done(struct ib_cq *cq, struct ib_wc *wc) +{ + srp_handle_qp_err(cq, wc, "FAST REG"); +} + static int srp_map_finish_fr(struct srp_map_state *state, + struct srp_request *req, struct srp_rdma_ch *ch, int sg_nents) { struct srp_target_port *target = ch->target; @@ -1349,9 +1369,11 @@ static int srp_map_finish_fr(struct srp_map_state *state, if (unlikely(n < 0)) return n; + req->reg_cqe.done = srp_reg_mr_err_done; + wr.wr.next = NULL; wr.wr.opcode = IB_WR_REG_MR; - wr.wr.wr_id = FAST_REG_WR_ID_MASK; + wr.wr.wr_cqe = &req->reg_cqe; wr.wr.num_sge = 0; wr.wr.send_flags = 0; wr.mr = desc->mr; @@ -1455,7 +1477,7 @@ static int srp_map_sg_fr(struct srp_map_state *state, struct srp_rdma_ch *ch, while (count) { int i, n; - n = srp_map_finish_fr(state, ch, count); + n = srp_map_finish_fr(state, req, ch, count); if (unlikely(n < 0)) return n; @@ -1524,7 +1546,7 @@ static int srp_map_idb(struct srp_rdma_ch *ch, struct srp_request *req, #ifdef CONFIG_NEED_SG_DMA_LENGTH idb_sg->dma_length = idb_sg->length; /* hack^2 */ #endif - ret = srp_map_finish_fr(&state, ch, 1); + ret = srp_map_finish_fr(&state, req, ch, 1); if (ret < 0) return ret; } else if (dev->use_fmr) { @@ -1719,7 +1741,7 @@ static struct srp_iu *__srp_get_tx_iu(struct srp_rdma_ch *ch, s32 rsv = (iu_type == SRP_IU_TSK_MGMT) ? 
0 : SRP_TSK_MGMT_SQ_SIZE; struct srp_iu *iu; - srp_send_completion(ch->send_cq, ch); + ib_process_cq_direct(ch->send_cq, -1); if (list_empty(&ch->free_tx)) return NULL; @@ -1739,6 +1761,19 @@ static struct srp_iu *__srp_get_tx_iu(struct srp_rdma_ch *ch, return iu; } +static void srp_send_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct srp_iu *iu = container_of(wc->wr_cqe, struct srp_iu, cqe); + struct srp_rdma_ch *ch = cq->cq_context; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + srp_handle_qp_err(cq, wc, "SEND"); + return; + } + + list_add(&iu->list, &ch->free_tx); +} + static int srp_post_send(struct srp_rdma_ch *ch, struct srp_iu *iu, int len) { struct srp_target_port *target = ch->target; @@ -1749,8 +1784,10 @@ static int srp_post_send(struct srp_rdma_ch *ch, struct srp_iu *iu, int len) list.length = len; list.lkey = target->lkey; + iu->cqe.done = srp_send_done; + wr.next = NULL; - wr.wr_id = (uintptr_t) iu; + wr.wr_cqe = &iu->cqe; wr.sg_list = &list; wr.num_sge = 1; wr.opcode = IB_WR_SEND; @@ -1769,8 +1806,10 @@ static int srp_post_recv(struct srp_rdma_ch *ch, struct srp_iu *iu) list.length = iu->size; list.lkey = target->lkey; + iu->cqe.done = srp_recv_done; + wr.next = NULL; - wr.wr_id = (uintptr_t) iu; + wr.wr_cqe = &iu->cqe; wr.sg_list = &list; wr.num_sge = 1; @@ -1902,14 +1941,20 @@ static void srp_process_aer_req(struct srp_rdma_ch *ch, "problems processing SRP_AER_REQ\n"); } -static void srp_handle_recv(struct srp_rdma_ch *ch, struct ib_wc *wc) +static void srp_recv_done(struct ib_cq *cq, struct ib_wc *wc) { + struct srp_iu *iu = container_of(wc->wr_cqe, struct srp_iu, cqe); + struct srp_rdma_ch *ch = cq->cq_context; struct srp_target_port *target = ch->target; struct ib_device *dev = target->srp_host->srp_dev->dev; - struct srp_iu *iu = (struct srp_iu *) (uintptr_t) wc->wr_id; int res; u8 opcode; + if (unlikely(wc->status != IB_WC_SUCCESS)) { + srp_handle_qp_err(cq, wc, "RECV"); + return; + } + ib_dma_sync_single_for_cpu(dev, iu->dma, ch->max_ti_iu_len, DMA_FROM_DEVICE); @@ -1972,68 +2017,22 @@ static void srp_tl_err_work(struct work_struct *work) srp_start_tl_fail_timers(target->rport); } -static void srp_handle_qp_err(u64 wr_id, enum ib_wc_status wc_status, - bool send_err, struct srp_rdma_ch *ch) +static void srp_handle_qp_err(struct ib_cq *cq, struct ib_wc *wc, + const char *opname) { + struct srp_rdma_ch *ch = cq->cq_context; struct srp_target_port *target = ch->target; - if (wr_id == SRP_LAST_WR_ID) { - complete(&ch->done); - return; - } - if (ch->connected && !target->qp_in_error) { - if (wr_id & LOCAL_INV_WR_ID_MASK) { - shost_printk(KERN_ERR, target->scsi_host, PFX - "LOCAL_INV failed with status %s (%d)\n", - ib_wc_status_msg(wc_status), wc_status); - } else if (wr_id & FAST_REG_WR_ID_MASK) { - shost_printk(KERN_ERR, target->scsi_host, PFX - "FAST_REG_MR failed status %s (%d)\n", - ib_wc_status_msg(wc_status), wc_status); - } else { - shost_printk(KERN_ERR, target->scsi_host, - PFX "failed %s status %s (%d) for iu %p\n", - send_err ? 
"send" : "receive", - ib_wc_status_msg(wc_status), wc_status, - (void *)(uintptr_t)wr_id); - } + shost_printk(KERN_ERR, target->scsi_host, + PFX "failed %s status %s (%d) for CQE %p\n", + opname, ib_wc_status_msg(wc->status), wc->status, + wc->wr_cqe); queue_work(system_long_wq, &target->tl_err_work); } target->qp_in_error = true; } -static void srp_recv_completion(struct ib_cq *cq, void *ch_ptr) -{ - struct srp_rdma_ch *ch = ch_ptr; - struct ib_wc wc; - - ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); - while (ib_poll_cq(cq, 1, &wc) > 0) { - if (likely(wc.status == IB_WC_SUCCESS)) { - srp_handle_recv(ch, &wc); - } else { - srp_handle_qp_err(wc.wr_id, wc.status, false, ch); - } - } -} - -static void srp_send_completion(struct ib_cq *cq, void *ch_ptr) -{ - struct srp_rdma_ch *ch = ch_ptr; - struct ib_wc wc; - struct srp_iu *iu; - - while (ib_poll_cq(cq, 1, &wc) > 0) { - if (likely(wc.status == IB_WC_SUCCESS)) { - iu = (struct srp_iu *) (uintptr_t) wc.wr_id; - list_add(&iu->list, &ch->free_tx); - } else { - srp_handle_qp_err(wc.wr_id, wc.status, true, ch); - } - } -} - static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd) { struct srp_target_port *target = host_to_target(shost); @@ -3439,27 +3438,17 @@ free_host: static void srp_add_one(struct ib_device *device) { struct srp_device *srp_dev; - struct ib_device_attr *dev_attr; struct srp_host *host; int mr_page_shift, p; u64 max_pages_per_mr; - dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL); - if (!dev_attr) - return; - - if (ib_query_device(device, dev_attr)) { - pr_warn("Query device failed for %s\n", device->name); - goto free_attr; - } - srp_dev = kmalloc(sizeof *srp_dev, GFP_KERNEL); if (!srp_dev) - goto free_attr; + return; srp_dev->has_fmr = (device->alloc_fmr && device->dealloc_fmr && device->map_phys_fmr && device->unmap_fmr); - srp_dev->has_fr = (dev_attr->device_cap_flags & + srp_dev->has_fr = (device->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS); if (!srp_dev->has_fmr && !srp_dev->has_fr) dev_warn(&device->dev, "neither FMR nor FR is supported\n"); @@ -3473,23 +3462,23 @@ static void srp_add_one(struct ib_device *device) * minimum of 4096 bytes. We're unlikely to build large sglists * out of smaller entries. 
*/ - mr_page_shift = max(12, ffs(dev_attr->page_size_cap) - 1); + mr_page_shift = max(12, ffs(device->attrs.page_size_cap) - 1); srp_dev->mr_page_size = 1 << mr_page_shift; srp_dev->mr_page_mask = ~((u64) srp_dev->mr_page_size - 1); - max_pages_per_mr = dev_attr->max_mr_size; + max_pages_per_mr = device->attrs.max_mr_size; do_div(max_pages_per_mr, srp_dev->mr_page_size); srp_dev->max_pages_per_mr = min_t(u64, SRP_MAX_PAGES_PER_MR, max_pages_per_mr); if (srp_dev->use_fast_reg) { srp_dev->max_pages_per_mr = min_t(u32, srp_dev->max_pages_per_mr, - dev_attr->max_fast_reg_page_list_len); + device->attrs.max_fast_reg_page_list_len); } srp_dev->mr_max_size = srp_dev->mr_page_size * srp_dev->max_pages_per_mr; - pr_debug("%s: mr_page_shift = %d, dev_attr->max_mr_size = %#llx, dev_attr->max_fast_reg_page_list_len = %u, max_pages_per_mr = %d, mr_max_size = %#x\n", - device->name, mr_page_shift, dev_attr->max_mr_size, - dev_attr->max_fast_reg_page_list_len, + pr_debug("%s: mr_page_shift = %d, device->max_mr_size = %#llx, device->max_fast_reg_page_list_len = %u, max_pages_per_mr = %d, mr_max_size = %#x\n", + device->name, mr_page_shift, device->attrs.max_mr_size, + device->attrs.max_fast_reg_page_list_len, srp_dev->max_pages_per_mr, srp_dev->mr_max_size); INIT_LIST_HEAD(&srp_dev->dev_list); @@ -3517,17 +3506,13 @@ static void srp_add_one(struct ib_device *device) } ib_set_client_data(device, &srp_client, srp_dev); - - goto free_attr; + return; err_pd: ib_dealloc_pd(srp_dev->pd); free_dev: kfree(srp_dev); - -free_attr: - kfree(dev_attr); } static void srp_remove_one(struct ib_device *device, void *client_data) @@ -3587,8 +3572,6 @@ static int __init srp_init_module(void) { int ret; - BUILD_BUG_ON(FIELD_SIZEOF(struct ib_wc, wr_id) < sizeof(void *)); - if (srp_sg_tablesize) { pr_warn("srp_sg_tablesize is deprecated, please use cmd_sg_entries\n"); if (!cmd_sg_entries) diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index f6af531f9f32..9e05ce4a04fd 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -66,11 +66,6 @@ enum { SRP_TAG_TSK_MGMT = 1U << 31, SRP_MAX_PAGES_PER_MR = 512, - - LOCAL_INV_WR_ID_MASK = 1, - FAST_REG_WR_ID_MASK = 2, - - SRP_LAST_WR_ID = 0xfffffffcU, }; enum srp_target_state { @@ -128,6 +123,7 @@ struct srp_request { struct srp_direct_buf *indirect_desc; dma_addr_t indirect_dma_addr; short nmdesc; + struct ib_cqe reg_cqe; }; /** @@ -231,6 +227,7 @@ struct srp_iu { void *buf; size_t size; enum dma_data_direction direction; + struct ib_cqe cqe; }; /** diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index bc5470c43d26..0c37fee363b1 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -93,6 +93,8 @@ MODULE_PARM_DESC(srpt_service_guid, static struct ib_client srpt_client; static void srpt_release_channel(struct srpt_rdma_ch *ch); static int srpt_queue_status(struct se_cmd *cmd); +static void srpt_recv_done(struct ib_cq *cq, struct ib_wc *wc); +static void srpt_send_done(struct ib_cq *cq, struct ib_wc *wc); /** * opposite_dma_dir() - Swap DMA_TO_DEVICE and DMA_FROM_DEVICE. 
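/*
 * Editor's note: the ib_srpt hunks below apply the same completion-queue
 * conversion already seen in iser and srp above. Instead of encoding an
 * opcode/index pair into wr_id and decoding it in a hand-rolled poll loop,
 * each per-request context embeds a struct ib_cqe; the poll loop behind
 * ib_alloc_cq() dispatches straight to the ->done() callback, which
 * recovers the context with container_of(). A hedged sketch of the recipe
 * (the type and function names here are illustrative, not from the patch):
 *
 *	struct my_req {
 *		struct ib_cqe cqe;
 *		// ... request state ...
 *	};
 *
 *	static void my_send_done(struct ib_cq *cq, struct ib_wc *wc)
 *	{
 *		struct my_req *req =
 *			container_of(wc->wr_cqe, struct my_req, cqe);
 *
 *		if (unlikely(wc->status != IB_WC_SUCCESS))
 *			return;	// flush/error handling goes here
 *		// ... complete req ...
 *	}
 *
 *	struct ib_send_wr wr = {}, *bad_wr;
 *
 *	req->cqe.done = my_send_done;
 *	wr.wr_cqe = &req->cqe;	// replaces wr.wr_id = (uintptr_t)req
 *	ret = ib_post_send(qp, &wr, &bad_wr);
 */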
@@ -341,10 +343,10 @@ static void srpt_get_ioc(struct srpt_port *sport, u32 slot, memset(iocp, 0, sizeof *iocp); strcpy(iocp->id_string, SRPT_ID_STRING); iocp->guid = cpu_to_be64(srpt_service_guid); - iocp->vendor_id = cpu_to_be32(sdev->dev_attr.vendor_id); - iocp->device_id = cpu_to_be32(sdev->dev_attr.vendor_part_id); - iocp->device_version = cpu_to_be16(sdev->dev_attr.hw_ver); - iocp->subsys_vendor_id = cpu_to_be32(sdev->dev_attr.vendor_id); + iocp->vendor_id = cpu_to_be32(sdev->device->attrs.vendor_id); + iocp->device_id = cpu_to_be32(sdev->device->attrs.vendor_part_id); + iocp->device_version = cpu_to_be16(sdev->device->attrs.hw_ver); + iocp->subsys_vendor_id = cpu_to_be32(sdev->device->attrs.vendor_id); iocp->subsys_device_id = 0x0; iocp->io_class = cpu_to_be16(SRP_REV16A_IB_IO_CLASS); iocp->io_subclass = cpu_to_be16(SRP_IO_SUBCLASS); @@ -453,6 +455,7 @@ static void srpt_mad_send_handler(struct ib_mad_agent *mad_agent, * srpt_mad_recv_handler() - MAD reception callback function. */ static void srpt_mad_recv_handler(struct ib_mad_agent *mad_agent, + struct ib_mad_send_buf *send_buf, struct ib_mad_recv_wc *mad_wc) { struct srpt_port *sport = (struct srpt_port *)mad_agent->context; @@ -778,12 +781,12 @@ static int srpt_post_recv(struct srpt_device *sdev, struct ib_recv_wr wr, *bad_wr; BUG_ON(!sdev); - wr.wr_id = encode_wr_id(SRPT_RECV, ioctx->ioctx.index); - list.addr = ioctx->ioctx.dma; list.length = srp_max_req_size; list.lkey = sdev->pd->local_dma_lkey; + ioctx->ioctx.cqe.done = srpt_recv_done; + wr.wr_cqe = &ioctx->ioctx.cqe; wr.next = NULL; wr.sg_list = &list; wr.num_sge = 1; @@ -819,8 +822,9 @@ static int srpt_post_send(struct srpt_rdma_ch *ch, list.length = len; list.lkey = sdev->pd->local_dma_lkey; + ioctx->ioctx.cqe.done = srpt_send_done; wr.next = NULL; - wr.wr_id = encode_wr_id(SRPT_SEND, ioctx->ioctx.index); + wr.wr_cqe = &ioctx->ioctx.cqe; wr.sg_list = &list; wr.num_sge = 1; wr.opcode = IB_WR_SEND; @@ -1052,13 +1056,13 @@ static void srpt_unmap_sg_to_ib_sge(struct srpt_rdma_ch *ch, BUG_ON(!ch); BUG_ON(!ioctx); - BUG_ON(ioctx->n_rdma && !ioctx->rdma_ius); + BUG_ON(ioctx->n_rdma && !ioctx->rdma_wrs); while (ioctx->n_rdma) - kfree(ioctx->rdma_ius[--ioctx->n_rdma].sge); + kfree(ioctx->rdma_wrs[--ioctx->n_rdma].wr.sg_list); - kfree(ioctx->rdma_ius); - ioctx->rdma_ius = NULL; + kfree(ioctx->rdma_wrs); + ioctx->rdma_wrs = NULL; if (ioctx->mapped_sg_count) { sg = ioctx->sg; @@ -1082,7 +1086,7 @@ static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch, struct scatterlist *sg, *sg_orig; int sg_cnt; enum dma_data_direction dir; - struct rdma_iu *riu; + struct ib_rdma_wr *riu; struct srp_direct_buf *db; dma_addr_t dma_addr; struct ib_sge *sge; @@ -1109,23 +1113,24 @@ static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch, ioctx->mapped_sg_count = count; - if (ioctx->rdma_ius && ioctx->n_rdma_ius) - nrdma = ioctx->n_rdma_ius; + if (ioctx->rdma_wrs && ioctx->n_rdma_wrs) + nrdma = ioctx->n_rdma_wrs; else { nrdma = (count + SRPT_DEF_SG_PER_WQE - 1) / SRPT_DEF_SG_PER_WQE + ioctx->n_rbuf; - ioctx->rdma_ius = kzalloc(nrdma * sizeof *riu, GFP_KERNEL); - if (!ioctx->rdma_ius) + ioctx->rdma_wrs = kcalloc(nrdma, sizeof(*ioctx->rdma_wrs), + GFP_KERNEL); + if (!ioctx->rdma_wrs) goto free_mem; - ioctx->n_rdma_ius = nrdma; + ioctx->n_rdma_wrs = nrdma; } db = ioctx->rbufs; tsize = cmd->data_length; dma_len = ib_sg_dma_len(dev, &sg[0]); - riu = ioctx->rdma_ius; + riu = ioctx->rdma_wrs; /* * For each remote desc - calculate the #ib_sge. 
@@ -1139,9 +1144,9 @@ static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch, j < count && i < ioctx->n_rbuf && tsize > 0; ++i, ++riu, ++db) { rsize = be32_to_cpu(db->len); raddr = be64_to_cpu(db->va); - riu->raddr = raddr; + riu->remote_addr = raddr; riu->rkey = be32_to_cpu(db->key); - riu->sge_cnt = 0; + riu->wr.num_sge = 0; /* calculate how many sge required for this remote_buf */ while (rsize > 0 && tsize > 0) { @@ -1165,33 +1170,35 @@ static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch, rsize = 0; } - ++riu->sge_cnt; + ++riu->wr.num_sge; - if (rsize > 0 && riu->sge_cnt == SRPT_DEF_SG_PER_WQE) { + if (rsize > 0 && + riu->wr.num_sge == SRPT_DEF_SG_PER_WQE) { ++ioctx->n_rdma; - riu->sge = - kmalloc(riu->sge_cnt * sizeof *riu->sge, - GFP_KERNEL); - if (!riu->sge) + riu->wr.sg_list = kmalloc_array(riu->wr.num_sge, + sizeof(*riu->wr.sg_list), + GFP_KERNEL); + if (!riu->wr.sg_list) goto free_mem; ++riu; - riu->sge_cnt = 0; - riu->raddr = raddr; + riu->wr.num_sge = 0; + riu->remote_addr = raddr; riu->rkey = be32_to_cpu(db->key); } } ++ioctx->n_rdma; - riu->sge = kmalloc(riu->sge_cnt * sizeof *riu->sge, - GFP_KERNEL); - if (!riu->sge) + riu->wr.sg_list = kmalloc_array(riu->wr.num_sge, + sizeof(*riu->wr.sg_list), + GFP_KERNEL); + if (!riu->wr.sg_list) goto free_mem; } db = ioctx->rbufs; tsize = cmd->data_length; - riu = ioctx->rdma_ius; + riu = ioctx->rdma_wrs; sg = sg_orig; dma_len = ib_sg_dma_len(dev, &sg[0]); dma_addr = ib_sg_dma_address(dev, &sg[0]); @@ -1200,7 +1207,7 @@ static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch, for (i = 0, j = 0; j < count && i < ioctx->n_rbuf && tsize > 0; ++i, ++riu, ++db) { rsize = be32_to_cpu(db->len); - sge = riu->sge; + sge = riu->wr.sg_list; k = 0; while (rsize > 0 && tsize > 0) { @@ -1232,9 +1239,9 @@ static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch, } ++k; - if (k == riu->sge_cnt && rsize > 0 && tsize > 0) { + if (k == riu->wr.num_sge && rsize > 0 && tsize > 0) { ++riu; - sge = riu->sge; + sge = riu->wr.sg_list; k = 0; } else if (rsize > 0 && tsize > 0) ++sge; @@ -1277,8 +1284,8 @@ static struct srpt_send_ioctx *srpt_get_send_ioctx(struct srpt_rdma_ch *ch) ioctx->n_rbuf = 0; ioctx->rbufs = NULL; ioctx->n_rdma = 0; - ioctx->n_rdma_ius = 0; - ioctx->rdma_ius = NULL; + ioctx->n_rdma_wrs = 0; + ioctx->rdma_wrs = NULL; ioctx->mapped_sg_count = 0; init_completion(&ioctx->tx_done); ioctx->queue_status_only = false; @@ -1380,118 +1387,44 @@ out: } /** - * srpt_handle_send_err_comp() - Process an IB_WC_SEND error completion. - */ -static void srpt_handle_send_err_comp(struct srpt_rdma_ch *ch, u64 wr_id) -{ - struct srpt_send_ioctx *ioctx; - enum srpt_command_state state; - u32 index; - - atomic_inc(&ch->sq_wr_avail); - - index = idx_from_wr_id(wr_id); - ioctx = ch->ioctx_ring[index]; - state = srpt_get_cmd_state(ioctx); - - WARN_ON(state != SRPT_STATE_CMD_RSP_SENT - && state != SRPT_STATE_MGMT_RSP_SENT - && state != SRPT_STATE_NEED_DATA - && state != SRPT_STATE_DONE); - - /* If SRP_RSP sending failed, undo the ch->req_lim change. */ - if (state == SRPT_STATE_CMD_RSP_SENT - || state == SRPT_STATE_MGMT_RSP_SENT) - atomic_dec(&ch->req_lim); - - srpt_abort_cmd(ioctx); -} - -/** - * srpt_handle_send_comp() - Process an IB send completion notification. 
- */ -static void srpt_handle_send_comp(struct srpt_rdma_ch *ch, - struct srpt_send_ioctx *ioctx) -{ - enum srpt_command_state state; - - atomic_inc(&ch->sq_wr_avail); - - state = srpt_set_cmd_state(ioctx, SRPT_STATE_DONE); - - if (WARN_ON(state != SRPT_STATE_CMD_RSP_SENT - && state != SRPT_STATE_MGMT_RSP_SENT - && state != SRPT_STATE_DONE)) - pr_debug("state = %d\n", state); - - if (state != SRPT_STATE_DONE) { - srpt_unmap_sg_to_ib_sge(ch, ioctx); - transport_generic_free_cmd(&ioctx->cmd, 0); - } else { - pr_err("IB completion has been received too late for" - " wr_id = %u.\n", ioctx->ioctx.index); - } -} - -/** - * srpt_handle_rdma_comp() - Process an IB RDMA completion notification. - * * XXX: what is now target_execute_cmd used to be asynchronous, and unmapping * the data that has been transferred via IB RDMA had to be postponed until the * check_stop_free() callback. None of this is necessary anymore and needs to * be cleaned up. */ -static void srpt_handle_rdma_comp(struct srpt_rdma_ch *ch, - struct srpt_send_ioctx *ioctx, - enum srpt_opcode opcode) +static void srpt_rdma_read_done(struct ib_cq *cq, struct ib_wc *wc) { + struct srpt_rdma_ch *ch = cq->cq_context; + struct srpt_send_ioctx *ioctx = + container_of(wc->wr_cqe, struct srpt_send_ioctx, rdma_cqe); + WARN_ON(ioctx->n_rdma <= 0); atomic_add(ioctx->n_rdma, &ch->sq_wr_avail); - if (opcode == SRPT_RDMA_READ_LAST) { - if (srpt_test_and_set_cmd_state(ioctx, SRPT_STATE_NEED_DATA, - SRPT_STATE_DATA_IN)) - target_execute_cmd(&ioctx->cmd); - else - pr_err("%s[%d]: wrong state = %d\n", __func__, - __LINE__, srpt_get_cmd_state(ioctx)); - } else if (opcode == SRPT_RDMA_ABORT) { - ioctx->rdma_aborted = true; - } else { - WARN(true, "unexpected opcode %d\n", opcode); + if (unlikely(wc->status != IB_WC_SUCCESS)) { + pr_info("RDMA_READ for ioctx 0x%p failed with status %d\n", + ioctx, wc->status); + srpt_abort_cmd(ioctx); + return; } + + if (srpt_test_and_set_cmd_state(ioctx, SRPT_STATE_NEED_DATA, + SRPT_STATE_DATA_IN)) + target_execute_cmd(&ioctx->cmd); + else + pr_err("%s[%d]: wrong state = %d\n", __func__, + __LINE__, srpt_get_cmd_state(ioctx)); } -/** - * srpt_handle_rdma_err_comp() - Process an IB RDMA error completion. 
- */ -static void srpt_handle_rdma_err_comp(struct srpt_rdma_ch *ch, - struct srpt_send_ioctx *ioctx, - enum srpt_opcode opcode) +static void srpt_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) { - enum srpt_command_state state; + struct srpt_send_ioctx *ioctx = + container_of(wc->wr_cqe, struct srpt_send_ioctx, rdma_cqe); - state = srpt_get_cmd_state(ioctx); - switch (opcode) { - case SRPT_RDMA_READ_LAST: - if (ioctx->n_rdma <= 0) { - pr_err("Received invalid RDMA read" - " error completion with idx %d\n", - ioctx->ioctx.index); - break; - } - atomic_add(ioctx->n_rdma, &ch->sq_wr_avail); - if (state == SRPT_STATE_NEED_DATA) - srpt_abort_cmd(ioctx); - else - pr_err("%s[%d]: wrong state = %d\n", - __func__, __LINE__, state); - break; - case SRPT_RDMA_WRITE_LAST: - break; - default: - pr_err("%s[%d]: opcode = %u\n", __func__, __LINE__, opcode); - break; + if (unlikely(wc->status != IB_WC_SUCCESS)) { + pr_info("RDMA_WRITE for ioctx 0x%p failed with status %d\n", + ioctx, wc->status); + srpt_abort_cmd(ioctx); } } @@ -1926,32 +1859,26 @@ out: return; } -static void srpt_process_rcv_completion(struct ib_cq *cq, - struct srpt_rdma_ch *ch, - struct ib_wc *wc) +static void srpt_recv_done(struct ib_cq *cq, struct ib_wc *wc) { - struct srpt_device *sdev = ch->sport->sdev; - struct srpt_recv_ioctx *ioctx; - u32 index; + struct srpt_rdma_ch *ch = cq->cq_context; + struct srpt_recv_ioctx *ioctx = + container_of(wc->wr_cqe, struct srpt_recv_ioctx, ioctx.cqe); - index = idx_from_wr_id(wc->wr_id); if (wc->status == IB_WC_SUCCESS) { int req_lim; req_lim = atomic_dec_return(&ch->req_lim); if (unlikely(req_lim < 0)) pr_err("req_lim = %d < 0\n", req_lim); - ioctx = sdev->ioctx_ring[index]; srpt_handle_new_iu(ch, ioctx, NULL); } else { - pr_info("receiving failed for idx %u with status %d\n", - index, wc->status); + pr_info("receiving failed for ioctx %p with status %d\n", + ioctx, wc->status); } } /** - * srpt_process_send_completion() - Process an IB send completion. - * * Note: Although this has not yet been observed during tests, at least in * theory it is possible that the srpt_get_send_ioctx() call invoked by * srpt_handle_new_iu() fails. This is possible because the req_lim_delta @@ -1964,109 +1891,52 @@ static void srpt_process_rcv_completion(struct ib_cq *cq, * are queued on cmd_wait_list. The code below processes these delayed * requests one at a time. 
*/ -static void srpt_process_send_completion(struct ib_cq *cq, - struct srpt_rdma_ch *ch, - struct ib_wc *wc) +static void srpt_send_done(struct ib_cq *cq, struct ib_wc *wc) { - struct srpt_send_ioctx *send_ioctx; - uint32_t index; - enum srpt_opcode opcode; + struct srpt_rdma_ch *ch = cq->cq_context; + struct srpt_send_ioctx *ioctx = + container_of(wc->wr_cqe, struct srpt_send_ioctx, ioctx.cqe); + enum srpt_command_state state; - index = idx_from_wr_id(wc->wr_id); - opcode = opcode_from_wr_id(wc->wr_id); - send_ioctx = ch->ioctx_ring[index]; - if (wc->status == IB_WC_SUCCESS) { - if (opcode == SRPT_SEND) - srpt_handle_send_comp(ch, send_ioctx); - else { - WARN_ON(opcode != SRPT_RDMA_ABORT && - wc->opcode != IB_WC_RDMA_READ); - srpt_handle_rdma_comp(ch, send_ioctx, opcode); - } + state = srpt_set_cmd_state(ioctx, SRPT_STATE_DONE); + + WARN_ON(state != SRPT_STATE_CMD_RSP_SENT && + state != SRPT_STATE_MGMT_RSP_SENT); + + atomic_inc(&ch->sq_wr_avail); + + if (wc->status != IB_WC_SUCCESS) { + pr_info("sending response for ioctx 0x%p failed" + " with status %d\n", ioctx, wc->status); + + atomic_dec(&ch->req_lim); + srpt_abort_cmd(ioctx); + goto out; + } + + if (state != SRPT_STATE_DONE) { + srpt_unmap_sg_to_ib_sge(ch, ioctx); + transport_generic_free_cmd(&ioctx->cmd, 0); } else { - if (opcode == SRPT_SEND) { - pr_info("sending response for idx %u failed" - " with status %d\n", index, wc->status); - srpt_handle_send_err_comp(ch, wc->wr_id); - } else if (opcode != SRPT_RDMA_MID) { - pr_info("RDMA t %d for idx %u failed with" - " status %d\n", opcode, index, wc->status); - srpt_handle_rdma_err_comp(ch, send_ioctx, opcode); - } + pr_err("IB completion has been received too late for" + " wr_id = %u.\n", ioctx->ioctx.index); } - while (unlikely(opcode == SRPT_SEND - && !list_empty(&ch->cmd_wait_list) - && srpt_get_ch_state(ch) == CH_LIVE - && (send_ioctx = srpt_get_send_ioctx(ch)) != NULL)) { +out: + while (!list_empty(&ch->cmd_wait_list) && + srpt_get_ch_state(ch) == CH_LIVE && + (ioctx = srpt_get_send_ioctx(ch)) != NULL) { struct srpt_recv_ioctx *recv_ioctx; recv_ioctx = list_first_entry(&ch->cmd_wait_list, struct srpt_recv_ioctx, wait_list); list_del(&recv_ioctx->wait_list); - srpt_handle_new_iu(ch, recv_ioctx, send_ioctx); - } -} - -static void srpt_process_completion(struct ib_cq *cq, struct srpt_rdma_ch *ch) -{ - struct ib_wc *const wc = ch->wc; - int i, n; - - WARN_ON(cq != ch->cq); - - ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); - while ((n = ib_poll_cq(cq, ARRAY_SIZE(ch->wc), wc)) > 0) { - for (i = 0; i < n; i++) { - if (opcode_from_wr_id(wc[i].wr_id) == SRPT_RECV) - srpt_process_rcv_completion(cq, ch, &wc[i]); - else - srpt_process_send_completion(cq, ch, &wc[i]); - } + srpt_handle_new_iu(ch, recv_ioctx, ioctx); } } /** - * srpt_completion() - IB completion queue callback function. - * - * Notes: - * - It is guaranteed that a completion handler will never be invoked - * concurrently on two different CPUs for the same completion queue. See also - * Documentation/infiniband/core_locking.txt and the implementation of - * handle_edge_irq() in kernel/irq/chip.c. - * - When threaded IRQs are enabled, completion handlers are invoked in thread - * context instead of interrupt context. - */ -static void srpt_completion(struct ib_cq *cq, void *ctx) -{ - struct srpt_rdma_ch *ch = ctx; - - wake_up_interruptible(&ch->wait_queue); -} - -static int srpt_compl_thread(void *arg) -{ - struct srpt_rdma_ch *ch; - - /* Hibernation / freezing of the SRPT kernel thread is not supported. 
*/ - current->flags |= PF_NOFREEZE; - - ch = arg; - BUG_ON(!ch); - pr_info("Session %s: kernel thread %s (PID %d) started\n", - ch->sess_name, ch->thread->comm, current->pid); - while (!kthread_should_stop()) { - wait_event_interruptible(ch->wait_queue, - (srpt_process_completion(ch->cq, ch), - kthread_should_stop())); - } - pr_info("Session %s: kernel thread %s (PID %d) stopped\n", - ch->sess_name, ch->thread->comm, current->pid); - return 0; -} - -/** * srpt_create_ch_ib() - Create receive and send completion queues. */ static int srpt_create_ch_ib(struct srpt_rdma_ch *ch) @@ -2075,7 +1945,6 @@ static int srpt_create_ch_ib(struct srpt_rdma_ch *ch) struct srpt_port *sport = ch->sport; struct srpt_device *sdev = sport->sdev; u32 srp_sq_size = sport->port_attrib.srp_sq_size; - struct ib_cq_init_attr cq_attr = {}; int ret; WARN_ON(ch->rq_size < 1); @@ -2086,9 +1955,8 @@ static int srpt_create_ch_ib(struct srpt_rdma_ch *ch) goto out; retry: - cq_attr.cqe = ch->rq_size + srp_sq_size; - ch->cq = ib_create_cq(sdev->device, srpt_completion, NULL, ch, - &cq_attr); + ch->cq = ib_alloc_cq(sdev->device, ch, ch->rq_size + srp_sq_size, + 0 /* XXX: spread CQs */, IB_POLL_WORKQUEUE); if (IS_ERR(ch->cq)) { ret = PTR_ERR(ch->cq); pr_err("failed to create CQ cqe= %d ret= %d\n", @@ -2131,18 +1999,6 @@ retry: if (ret) goto err_destroy_qp; - init_waitqueue_head(&ch->wait_queue); - - pr_debug("creating thread for session %s\n", ch->sess_name); - - ch->thread = kthread_run(srpt_compl_thread, ch, "ib_srpt_compl"); - if (IS_ERR(ch->thread)) { - pr_err("failed to create kernel thread %ld\n", - PTR_ERR(ch->thread)); - ch->thread = NULL; - goto err_destroy_qp; - } - out: kfree(qp_init); return ret; @@ -2150,17 +2006,14 @@ out: err_destroy_qp: ib_destroy_qp(ch->qp); err_destroy_cq: - ib_destroy_cq(ch->cq); + ib_free_cq(ch->cq); goto out; } static void srpt_destroy_ch_ib(struct srpt_rdma_ch *ch) { - if (ch->thread) - kthread_stop(ch->thread); - ib_destroy_qp(ch->qp); - ib_destroy_cq(ch->cq); + ib_free_cq(ch->cq); } /** @@ -2808,12 +2661,8 @@ static int srpt_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) static int srpt_perform_rdmas(struct srpt_rdma_ch *ch, struct srpt_send_ioctx *ioctx) { - struct ib_rdma_wr wr; struct ib_send_wr *bad_wr; - struct rdma_iu *riu; - int i; - int ret; - int sq_wr_avail; + int sq_wr_avail, ret, i; enum dma_data_direction dir; const int n_rdma = ioctx->n_rdma; @@ -2829,59 +2678,32 @@ static int srpt_perform_rdmas(struct srpt_rdma_ch *ch, } } - ioctx->rdma_aborted = false; - ret = 0; - riu = ioctx->rdma_ius; - memset(&wr, 0, sizeof wr); - - for (i = 0; i < n_rdma; ++i, ++riu) { - if (dir == DMA_FROM_DEVICE) { - wr.wr.opcode = IB_WR_RDMA_WRITE; - wr.wr.wr_id = encode_wr_id(i == n_rdma - 1 ? - SRPT_RDMA_WRITE_LAST : - SRPT_RDMA_MID, - ioctx->ioctx.index); - } else { - wr.wr.opcode = IB_WR_RDMA_READ; - wr.wr.wr_id = encode_wr_id(i == n_rdma - 1 ? - SRPT_RDMA_READ_LAST : - SRPT_RDMA_MID, - ioctx->ioctx.index); - } - wr.wr.next = NULL; - wr.remote_addr = riu->raddr; - wr.rkey = riu->rkey; - wr.wr.num_sge = riu->sge_cnt; - wr.wr.sg_list = riu->sge; + for (i = 0; i < n_rdma; i++) { + struct ib_send_wr *wr = &ioctx->rdma_wrs[i].wr; - /* only get completion event for the last rdma write */ - if (i == (n_rdma - 1) && dir == DMA_TO_DEVICE) - wr.wr.send_flags = IB_SEND_SIGNALED; + wr->opcode = (dir == DMA_FROM_DEVICE) ? 
+ IB_WR_RDMA_WRITE : IB_WR_RDMA_READ; - ret = ib_post_send(ch->qp, &wr.wr, &bad_wr); - if (ret) - break; + if (i == n_rdma - 1) { + /* only get completion event for the last rdma read */ + if (dir == DMA_TO_DEVICE) { + wr->send_flags = IB_SEND_SIGNALED; + ioctx->rdma_cqe.done = srpt_rdma_read_done; + } else { + ioctx->rdma_cqe.done = srpt_rdma_write_done; + } + wr->wr_cqe = &ioctx->rdma_cqe; + wr->next = NULL; + } else { + wr->wr_cqe = NULL; + wr->next = &ioctx->rdma_wrs[i + 1].wr; + } } + ret = ib_post_send(ch->qp, &ioctx->rdma_wrs->wr, &bad_wr); if (ret) pr_err("%s[%d]: ib_post_send() returned %d for %d/%d\n", __func__, __LINE__, ret, i, n_rdma); - if (ret && i > 0) { - wr.wr.num_sge = 0; - wr.wr.wr_id = encode_wr_id(SRPT_RDMA_ABORT, ioctx->ioctx.index); - wr.wr.send_flags = IB_SEND_SIGNALED; - while (ch->state == CH_LIVE && - ib_post_send(ch->qp, &wr.wr, &bad_wr) != 0) { - pr_info("Trying to abort failed RDMA transfer [%d]\n", - ioctx->ioctx.index); - msleep(1000); - } - while (ch->state != CH_RELEASING && !ioctx->rdma_aborted) { - pr_info("Waiting until RDMA abort finished [%d]\n", - ioctx->ioctx.index); - msleep(1000); - } - } out: if (unlikely(dir == DMA_TO_DEVICE && ret < 0)) atomic_add(n_rdma, &ch->sq_wr_avail); @@ -3190,14 +3012,11 @@ static void srpt_add_one(struct ib_device *device) init_waitqueue_head(&sdev->ch_releaseQ); spin_lock_init(&sdev->spinlock); - if (ib_query_device(device, &sdev->dev_attr)) - goto free_dev; - sdev->pd = ib_alloc_pd(device); if (IS_ERR(sdev->pd)) goto free_dev; - sdev->srq_size = min(srpt_srq_size, sdev->dev_attr.max_srq_wr); + sdev->srq_size = min(srpt_srq_size, sdev->device->attrs.max_srq_wr); srq_attr.event_handler = srpt_srq_event; srq_attr.srq_context = (void *)sdev; @@ -3211,7 +3030,7 @@ static void srpt_add_one(struct ib_device *device) goto err_pd; pr_debug("%s: create SRQ #wr= %d max_allow=%d dev= %s\n", - __func__, sdev->srq_size, sdev->dev_attr.max_srq_wr, + __func__, sdev->srq_size, sdev->device->attrs.max_srq_wr, device->name); if (!srpt_service_guid) diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.h b/drivers/infiniband/ulp/srpt/ib_srpt.h index 5366e0a9fd6d..09037f2b0b51 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.h +++ b/drivers/infiniband/ulp/srpt/ib_srpt.h @@ -128,36 +128,6 @@ enum { DEFAULT_MAX_RDMA_SIZE = 65536, }; -enum srpt_opcode { - SRPT_RECV, - SRPT_SEND, - SRPT_RDMA_MID, - SRPT_RDMA_ABORT, - SRPT_RDMA_READ_LAST, - SRPT_RDMA_WRITE_LAST, -}; - -static inline u64 encode_wr_id(u8 opcode, u32 idx) -{ - return ((u64)opcode << 32) | idx; -} -static inline enum srpt_opcode opcode_from_wr_id(u64 wr_id) -{ - return wr_id >> 32; -} -static inline u32 idx_from_wr_id(u64 wr_id) -{ - return (u32)wr_id; -} - -struct rdma_iu { - u64 raddr; - u32 rkey; - struct ib_sge *sge; - u32 sge_cnt; - int mem_id; -}; - /** * enum srpt_command_state - SCSI command state managed by SRPT. * @SRPT_STATE_NEW: New command arrived and is being processed. @@ -189,6 +159,7 @@ enum srpt_command_state { * @index: Index of the I/O context in its ioctx_ring array. */ struct srpt_ioctx { + struct ib_cqe cqe; void *buf; dma_addr_t dma; uint32_t index; @@ -215,32 +186,30 @@ struct srpt_recv_ioctx { * @sg: Pointer to sg-list associated with this I/O context. * @sg_cnt: SG-list size. * @mapped_sg_count: ib_dma_map_sg() return value. - * @n_rdma_ius: Number of elements in the rdma_ius array. - * @rdma_ius: Array with information about the RDMA mapping. + * @n_rdma_wrs: Number of elements in the rdma_wrs array. 
+ * @rdma_wrs: Array with information about the RDMA mapping. * @tag: Tag of the received SRP information unit. * @spinlock: Protects 'state'. * @state: I/O context state. - * @rdma_aborted: If initiating a multipart RDMA transfer failed, whether - * the already initiated transfers have finished. * @cmd: Target core command data structure. * @sense_data: SCSI sense data. */ struct srpt_send_ioctx { struct srpt_ioctx ioctx; struct srpt_rdma_ch *ch; - struct rdma_iu *rdma_ius; + struct ib_rdma_wr *rdma_wrs; + struct ib_cqe rdma_cqe; struct srp_direct_buf *rbufs; struct srp_direct_buf single_rbuf; struct scatterlist *sg; struct list_head free_list; spinlock_t spinlock; enum srpt_command_state state; - bool rdma_aborted; struct se_cmd cmd; struct completion tx_done; int sg_cnt; int mapped_sg_count; - u16 n_rdma_ius; + u16 n_rdma_wrs; u8 n_rdma; u8 n_rbuf; bool queue_status_only; @@ -267,9 +236,6 @@ enum rdma_ch_state { /** * struct srpt_rdma_ch - RDMA channel. - * @wait_queue: Allows the kernel thread to wait for more work. - * @thread: Kernel thread that processes the IB queues associated with - * the channel. * @cm_id: IB CM ID associated with the channel. * @qp: IB queue pair used for communicating over this channel. * @cq: IB completion queue for this channel. @@ -288,7 +254,6 @@ enum rdma_ch_state { * @free_list: Head of list with free send I/O contexts. * @state: channel state. See also enum rdma_ch_state. * @ioctx_ring: Send ring. - * @wc: IB work completion array for srpt_process_completion(). * @list: Node for insertion in the srpt_device.rch_list list. * @cmd_wait_list: List of SCSI commands that arrived before the RTU event. This * list contains struct srpt_ioctx elements and is protected @@ -299,8 +264,6 @@ enum rdma_ch_state { * @release_done: Enables waiting for srpt_release_channel() completion. */ struct srpt_rdma_ch { - wait_queue_head_t wait_queue; - struct task_struct *thread; struct ib_cm_id *cm_id; struct ib_qp *qp; struct ib_cq *cq; @@ -317,7 +280,6 @@ struct srpt_rdma_ch { struct list_head free_list; enum rdma_ch_state state; struct srpt_send_ioctx **ioctx_ring; - struct ib_wc wc[16]; struct list_head list; struct list_head cmd_wait_list; struct se_session *sess; @@ -377,8 +339,6 @@ struct srpt_port { * @mr: L_Key (local key) with write access to all local memory. * @srq: Per-HCA SRQ (shared receive queue). * @cm_id: Connection identifier. - * @dev_attr: Attributes of the InfiniBand device as obtained during the - * ib_client.add() callback. * @srq_size: SRQ size. * @ioctx_ring: Per-HCA SRQ. * @rch_list: Per-device channel list -- see also srpt_rdma_ch.list. 
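The ib_srpt changes above are a conversion to the ib_alloc_cq()/ib_cqe completion model: instead of encoding an opcode and ring index into the 64-bit wr_id, a struct ib_cqe is embedded in the per-request context, each posted work request points at it through wr_cqe, and the core invokes its .done callback when the completion arrives. A minimal sketch of that pattern against the 4.5-era verbs API follows; my_request, my_send_done and post_my_send are illustrative names, not part of the patch.

#include <rdma/ib_verbs.h>

struct my_request {
	struct ib_cqe cqe;	/* replaces the old encode_wr_id() cookie */
	/* per-request state goes here */
};

static void my_send_done(struct ib_cq *cq, struct ib_wc *wc)
{
	/* recover the request that owns this completion */
	struct my_request *req =
		container_of(wc->wr_cqe, struct my_request, cqe);

	if (wc->status != IB_WC_SUCCESS)
		pr_info("send for req %p failed with status %d\n",
			req, wc->status);
	/* ...complete or abort the request... */
}

static int post_my_send(struct ib_qp *qp, struct my_request *req,
			struct ib_sge *sge)
{
	struct ib_send_wr wr = {}, *bad_wr;

	req->cqe.done = my_send_done;	/* run from the CQ's poll context */
	wr.wr_cqe = &req->cqe;
	wr.opcode = IB_WR_SEND;
	wr.send_flags = IB_SEND_SIGNALED;
	wr.sg_list = sge;
	wr.num_sge = 1;

	return ib_post_send(qp, &wr, &bad_wr);
}

Because the patch allocates the CQ with IB_POLL_WORKQUEUE, the .done callbacks run from workqueue context, which is what lets it delete the per-channel completion kthread and its wait queue.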
@@ -393,7 +353,6 @@ struct srpt_device { struct ib_pd *pd; struct ib_srq *srq; struct ib_cm_id *cm_id; - struct ib_device_attr dev_attr; int srq_size; struct srpt_recv_ioctx **ioctx_ring; struct list_head rch_list; diff --git a/drivers/input/joystick/xpad.c b/drivers/input/joystick/xpad.c index fd4100d56d8c..6727954ab74b 100644 --- a/drivers/input/joystick/xpad.c +++ b/drivers/input/joystick/xpad.c @@ -76,10 +76,13 @@ */ #include <linux/kernel.h> +#include <linux/input.h> +#include <linux/rcupdate.h> #include <linux/slab.h> #include <linux/stat.h> #include <linux/module.h> #include <linux/usb/input.h> +#include <linux/usb/quirks.h> #define DRIVER_AUTHOR "Marko Friedemann <mfr@bmx-chemnitz.de>" #define DRIVER_DESC "X-Box pad driver" @@ -125,7 +128,7 @@ static const struct xpad_device { { 0x045e, 0x0289, "Microsoft X-Box pad v2 (US)", 0, XTYPE_XBOX }, { 0x045e, 0x028e, "Microsoft X-Box 360 pad", 0, XTYPE_XBOX360 }, { 0x045e, 0x02d1, "Microsoft X-Box One pad", 0, XTYPE_XBOXONE }, - { 0x045e, 0x02dd, "Microsoft X-Box One pad (Covert Forces)", 0, XTYPE_XBOXONE }, + { 0x045e, 0x02dd, "Microsoft X-Box One pad (Firmware 2015)", 0, XTYPE_XBOXONE }, { 0x045e, 0x0291, "Xbox 360 Wireless Receiver (XBOX)", MAP_DPAD_TO_BUTTONS, XTYPE_XBOX360W }, { 0x045e, 0x0719, "Xbox 360 Wireless Receiver", MAP_DPAD_TO_BUTTONS, XTYPE_XBOX360W }, { 0x044f, 0x0f07, "Thrustmaster, Inc. Controller", 0, XTYPE_XBOX }, @@ -317,21 +320,42 @@ static struct usb_device_id xpad_table[] = { MODULE_DEVICE_TABLE(usb, xpad_table); +struct xpad_output_packet { + u8 data[XPAD_PKT_LEN]; + u8 len; + bool pending; +}; + +#define XPAD_OUT_CMD_IDX 0 +#define XPAD_OUT_FF_IDX 1 +#define XPAD_OUT_LED_IDX (1 + IS_ENABLED(CONFIG_JOYSTICK_XPAD_FF)) +#define XPAD_NUM_OUT_PACKETS (1 + \ + IS_ENABLED(CONFIG_JOYSTICK_XPAD_FF) + \ + IS_ENABLED(CONFIG_JOYSTICK_XPAD_LEDS)) + struct usb_xpad { struct input_dev *dev; /* input device interface */ + struct input_dev __rcu *x360w_dev; struct usb_device *udev; /* usb device */ struct usb_interface *intf; /* usb interface */ - int pad_present; + bool pad_present; + bool input_created; struct urb *irq_in; /* urb for interrupt in report */ unsigned char *idata; /* input data */ dma_addr_t idata_dma; struct urb *irq_out; /* urb for interrupt out report */ + struct usb_anchor irq_out_anchor; + bool irq_out_active; /* we must not use an active URB */ + u8 odata_serial; /* serial number for xbox one protocol */ unsigned char *odata; /* output data */ dma_addr_t odata_dma; - struct mutex odata_mutex; + spinlock_t odata_lock; + + struct xpad_output_packet out_packets[XPAD_NUM_OUT_PACKETS]; + int last_out_packet; #if defined(CONFIG_JOYSTICK_XPAD_LEDS) struct xpad_led *led; @@ -343,8 +367,12 @@ struct usb_xpad { int xtype; /* type of xbox device */ int pad_nr; /* the order x360 pads were attached */ const char *name; /* name of the device */ + struct work_struct work; /* init/remove device from callback */ }; +static int xpad_init_input(struct usb_xpad *xpad); +static void xpad_deinit_input(struct usb_xpad *xpad); + /* * xpad_process_packet * @@ -424,11 +452,9 @@ static void xpad_process_packet(struct usb_xpad *xpad, u16 cmd, unsigned char *d * http://www.free60.org/wiki/Gamepad */ -static void xpad360_process_packet(struct usb_xpad *xpad, +static void xpad360_process_packet(struct usb_xpad *xpad, struct input_dev *dev, u16 cmd, unsigned char *data) { - struct input_dev *dev = xpad->dev; - /* digital pad */ if (xpad->mapping & MAP_DPAD_TO_BUTTONS) { /* dpad as buttons (left, right, up, down) */ @@ -495,7 +521,30 @@ 
static void xpad360_process_packet(struct usb_xpad *xpad, input_sync(dev); } -static void xpad_identify_controller(struct usb_xpad *xpad); +static void xpad_presence_work(struct work_struct *work) +{ + struct usb_xpad *xpad = container_of(work, struct usb_xpad, work); + int error; + + if (xpad->pad_present) { + error = xpad_init_input(xpad); + if (error) { + /* complain only, not much else we can do here */ + dev_err(&xpad->dev->dev, + "unable to init device: %d\n", error); + } else { + rcu_assign_pointer(xpad->x360w_dev, xpad->dev); + } + } else { + RCU_INIT_POINTER(xpad->x360w_dev, NULL); + synchronize_rcu(); + /* + * Now that we are sure xpad360w_process_packet is not + * using input device we can get rid of it. + */ + xpad_deinit_input(xpad); + } +} /* * xpad360w_process_packet @@ -513,24 +562,28 @@ static void xpad_identify_controller(struct usb_xpad *xpad); */ static void xpad360w_process_packet(struct usb_xpad *xpad, u16 cmd, unsigned char *data) { + struct input_dev *dev; + bool present; + /* Presence change */ if (data[0] & 0x08) { - if (data[1] & 0x80) { - xpad->pad_present = 1; - /* - * Light up the segment corresponding to - * controller number. - */ - xpad_identify_controller(xpad); - } else - xpad->pad_present = 0; + present = (data[1] & 0x80) != 0; + + if (xpad->pad_present != present) { + xpad->pad_present = present; + schedule_work(&xpad->work); + } } /* Valid pad data */ - if (!(data[1] & 0x1)) + if (data[1] != 0x1) return; - xpad360_process_packet(xpad, cmd, &data[4]); + rcu_read_lock(); + dev = rcu_dereference(xpad->x360w_dev); + if (dev) + xpad360_process_packet(xpad, dev, cmd, &data[4]); + rcu_read_unlock(); } /* @@ -659,7 +712,7 @@ static void xpad_irq_in(struct urb *urb) switch (xpad->xtype) { case XTYPE_XBOX360: - xpad360_process_packet(xpad, 0, xpad->idata); + xpad360_process_packet(xpad, xpad->dev, 0, xpad->idata); break; case XTYPE_XBOX360W: xpad360w_process_packet(xpad, 0, xpad->idata); @@ -678,18 +731,73 @@ exit: __func__, retval); } +/* Callers must hold xpad->odata_lock spinlock */ +static bool xpad_prepare_next_out_packet(struct usb_xpad *xpad) +{ + struct xpad_output_packet *pkt, *packet = NULL; + int i; + + for (i = 0; i < XPAD_NUM_OUT_PACKETS; i++) { + if (++xpad->last_out_packet >= XPAD_NUM_OUT_PACKETS) + xpad->last_out_packet = 0; + + pkt = &xpad->out_packets[xpad->last_out_packet]; + if (pkt->pending) { + dev_dbg(&xpad->intf->dev, + "%s - found pending output packet %d\n", + __func__, xpad->last_out_packet); + packet = pkt; + break; + } + } + + if (packet) { + memcpy(xpad->odata, packet->data, packet->len); + xpad->irq_out->transfer_buffer_length = packet->len; + return true; + } + + return false; +} + +/* Callers must hold xpad->odata_lock spinlock */ +static int xpad_try_sending_next_out_packet(struct usb_xpad *xpad) +{ + int error; + + if (!xpad->irq_out_active && xpad_prepare_next_out_packet(xpad)) { + usb_anchor_urb(xpad->irq_out, &xpad->irq_out_anchor); + error = usb_submit_urb(xpad->irq_out, GFP_ATOMIC); + if (error) { + dev_err(&xpad->intf->dev, + "%s - usb_submit_urb failed with result %d\n", + __func__, error); + usb_unanchor_urb(xpad->irq_out); + return -EIO; + } + + xpad->irq_out_active = true; + } + + return 0; +} + static void xpad_irq_out(struct urb *urb) { struct usb_xpad *xpad = urb->context; struct device *dev = &xpad->intf->dev; - int retval, status; + int status = urb->status; + int error; + unsigned long flags; - status = urb->status; + spin_lock_irqsave(&xpad->odata_lock, flags); switch (status) { case 0: /* success */ - return; 
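+ /* the just-transmitted slot is free again; stage the next pending packet */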
+ xpad->out_packets[xpad->last_out_packet].pending = false; + xpad->irq_out_active = xpad_prepare_next_out_packet(xpad); + break; case -ECONNRESET: case -ENOENT: @@ -697,19 +805,28 @@ static void xpad_irq_out(struct urb *urb) /* this urb is terminated, clean up */ dev_dbg(dev, "%s - urb shutting down with status: %d\n", __func__, status); - return; + xpad->irq_out_active = false; + break; default: dev_dbg(dev, "%s - nonzero urb status received: %d\n", __func__, status); - goto exit; + break; } -exit: - retval = usb_submit_urb(urb, GFP_ATOMIC); - if (retval) - dev_err(dev, "%s - usb_submit_urb failed with result %d\n", - __func__, retval); + if (xpad->irq_out_active) { + usb_anchor_urb(urb, &xpad->irq_out_anchor); + error = usb_submit_urb(urb, GFP_ATOMIC); + if (error) { + dev_err(dev, + "%s - usb_submit_urb failed with result %d\n", + __func__, error); + usb_unanchor_urb(urb); + xpad->irq_out_active = false; + } + } + + spin_unlock_irqrestore(&xpad->odata_lock, flags); } static int xpad_init_output(struct usb_interface *intf, struct usb_xpad *xpad) @@ -721,6 +838,8 @@ static int xpad_init_output(struct usb_interface *intf, struct usb_xpad *xpad) if (xpad->xtype == XTYPE_UNKNOWN) return 0; + init_usb_anchor(&xpad->irq_out_anchor); + xpad->odata = usb_alloc_coherent(xpad->udev, XPAD_PKT_LEN, GFP_KERNEL, &xpad->odata_dma); if (!xpad->odata) { @@ -728,7 +847,7 @@ static int xpad_init_output(struct usb_interface *intf, struct usb_xpad *xpad) goto fail1; } - mutex_init(&xpad->odata_mutex); + spin_lock_init(&xpad->odata_lock); xpad->irq_out = usb_alloc_urb(0, GFP_KERNEL); if (!xpad->irq_out) { @@ -755,8 +874,14 @@ static int xpad_init_output(struct usb_interface *intf, struct usb_xpad *xpad) static void xpad_stop_output(struct usb_xpad *xpad) { - if (xpad->xtype != XTYPE_UNKNOWN) - usb_kill_urb(xpad->irq_out); + if (xpad->xtype != XTYPE_UNKNOWN) { + if (!usb_wait_anchor_empty_timeout(&xpad->irq_out_anchor, + 5000)) { + dev_warn(&xpad->intf->dev, + "timed out waiting for output URB to complete, killing\n"); + usb_kill_anchored_urbs(&xpad->irq_out_anchor); + } + } } static void xpad_deinit_output(struct usb_xpad *xpad) @@ -770,27 +895,60 @@ static void xpad_deinit_output(struct usb_xpad *xpad) static int xpad_inquiry_pad_presence(struct usb_xpad *xpad) { + struct xpad_output_packet *packet = + &xpad->out_packets[XPAD_OUT_CMD_IDX]; + unsigned long flags; int retval; - mutex_lock(&xpad->odata_mutex); + spin_lock_irqsave(&xpad->odata_lock, flags); + + packet->data[0] = 0x08; + packet->data[1] = 0x00; + packet->data[2] = 0x0F; + packet->data[3] = 0xC0; + packet->data[4] = 0x00; + packet->data[5] = 0x00; + packet->data[6] = 0x00; + packet->data[7] = 0x00; + packet->data[8] = 0x00; + packet->data[9] = 0x00; + packet->data[10] = 0x00; + packet->data[11] = 0x00; + packet->len = 12; + packet->pending = true; + + /* Reset the sequence so we send out presence first */ + xpad->last_out_packet = -1; + retval = xpad_try_sending_next_out_packet(xpad); + + spin_unlock_irqrestore(&xpad->odata_lock, flags); - xpad->odata[0] = 0x08; - xpad->odata[1] = 0x00; - xpad->odata[2] = 0x0F; - xpad->odata[3] = 0xC0; - xpad->odata[4] = 0x00; - xpad->odata[5] = 0x00; - xpad->odata[6] = 0x00; - xpad->odata[7] = 0x00; - xpad->odata[8] = 0x00; - xpad->odata[9] = 0x00; - xpad->odata[10] = 0x00; - xpad->odata[11] = 0x00; - xpad->irq_out->transfer_buffer_length = 12; + return retval; +} + +static int xpad_start_xbox_one(struct usb_xpad *xpad) +{ + struct xpad_output_packet *packet = + &xpad->out_packets[XPAD_OUT_CMD_IDX]; + unsigned 
long flags; + int retval; - retval = usb_submit_urb(xpad->irq_out, GFP_KERNEL); + spin_lock_irqsave(&xpad->odata_lock, flags); - mutex_unlock(&xpad->odata_mutex); + /* Xbox one controller needs to be initialized. */ + packet->data[0] = 0x05; + packet->data[1] = 0x20; + packet->data[2] = xpad->odata_serial++; /* packet serial */ + packet->data[3] = 0x01; /* rumble bit enable? */ + packet->data[4] = 0x00; + packet->len = 5; + packet->pending = true; + + /* Reset the sequence so we send out start packet first */ + xpad->last_out_packet = -1; + retval = xpad_try_sending_next_out_packet(xpad); + + spin_unlock_irqrestore(&xpad->odata_lock, flags); return retval; } @@ -799,8 +957,11 @@ static int xpad_inquiry_pad_presence(struct usb_xpad *xpad) static int xpad_play_effect(struct input_dev *dev, void *data, struct ff_effect *effect) { struct usb_xpad *xpad = input_get_drvdata(dev); + struct xpad_output_packet *packet = &xpad->out_packets[XPAD_OUT_FF_IDX]; __u16 strong; __u16 weak; + int retval; + unsigned long flags; if (effect->type != FF_RUMBLE) return 0; @@ -808,69 +969,81 @@ static int xpad_play_effect(struct input_dev *dev, void *data, struct ff_effect strong = effect->u.rumble.strong_magnitude; weak = effect->u.rumble.weak_magnitude; + spin_lock_irqsave(&xpad->odata_lock, flags); + switch (xpad->xtype) { case XTYPE_XBOX: - xpad->odata[0] = 0x00; - xpad->odata[1] = 0x06; - xpad->odata[2] = 0x00; - xpad->odata[3] = strong / 256; /* left actuator */ - xpad->odata[4] = 0x00; - xpad->odata[5] = weak / 256; /* right actuator */ - xpad->irq_out->transfer_buffer_length = 6; + packet->data[0] = 0x00; + packet->data[1] = 0x06; + packet->data[2] = 0x00; + packet->data[3] = strong / 256; /* left actuator */ + packet->data[4] = 0x00; + packet->data[5] = weak / 256; /* right actuator */ + packet->len = 6; + packet->pending = true; break; case XTYPE_XBOX360: - xpad->odata[0] = 0x00; - xpad->odata[1] = 0x08; - xpad->odata[2] = 0x00; - xpad->odata[3] = strong / 256; /* left actuator? */ - xpad->odata[4] = weak / 256; /* right actuator? */ - xpad->odata[5] = 0x00; - xpad->odata[6] = 0x00; - xpad->odata[7] = 0x00; - xpad->irq_out->transfer_buffer_length = 8; + packet->data[0] = 0x00; + packet->data[1] = 0x08; + packet->data[2] = 0x00; + packet->data[3] = strong / 256; /* left actuator? */ + packet->data[4] = weak / 256; /* right actuator? 
*/ + packet->data[5] = 0x00; + packet->data[6] = 0x00; + packet->data[7] = 0x00; + packet->len = 8; + packet->pending = true; break; case XTYPE_XBOX360W: - xpad->odata[0] = 0x00; - xpad->odata[1] = 0x01; - xpad->odata[2] = 0x0F; - xpad->odata[3] = 0xC0; - xpad->odata[4] = 0x00; - xpad->odata[5] = strong / 256; - xpad->odata[6] = weak / 256; - xpad->odata[7] = 0x00; - xpad->odata[8] = 0x00; - xpad->odata[9] = 0x00; - xpad->odata[10] = 0x00; - xpad->odata[11] = 0x00; - xpad->irq_out->transfer_buffer_length = 12; + packet->data[0] = 0x00; + packet->data[1] = 0x01; + packet->data[2] = 0x0F; + packet->data[3] = 0xC0; + packet->data[4] = 0x00; + packet->data[5] = strong / 256; + packet->data[6] = weak / 256; + packet->data[7] = 0x00; + packet->data[8] = 0x00; + packet->data[9] = 0x00; + packet->data[10] = 0x00; + packet->data[11] = 0x00; + packet->len = 12; + packet->pending = true; break; case XTYPE_XBOXONE: - xpad->odata[0] = 0x09; /* activate rumble */ - xpad->odata[1] = 0x08; - xpad->odata[2] = 0x00; - xpad->odata[3] = 0x08; /* continuous effect */ - xpad->odata[4] = 0x00; /* simple rumble mode */ - xpad->odata[5] = 0x03; /* L and R actuator only */ - xpad->odata[6] = 0x00; /* TODO: LT actuator */ - xpad->odata[7] = 0x00; /* TODO: RT actuator */ - xpad->odata[8] = strong / 256; /* left actuator */ - xpad->odata[9] = weak / 256; /* right actuator */ - xpad->odata[10] = 0x80; /* length of pulse */ - xpad->odata[11] = 0x00; /* stop period of pulse */ - xpad->irq_out->transfer_buffer_length = 12; + packet->data[0] = 0x09; /* activate rumble */ + packet->data[1] = 0x08; + packet->data[2] = xpad->odata_serial++; + packet->data[3] = 0x08; /* continuous effect */ + packet->data[4] = 0x00; /* simple rumble mode */ + packet->data[5] = 0x03; /* L and R actuator only */ + packet->data[6] = 0x00; /* TODO: LT actuator */ + packet->data[7] = 0x00; /* TODO: RT actuator */ + packet->data[8] = strong / 512; /* left actuator */ + packet->data[9] = weak / 512; /* right actuator */ + packet->data[10] = 0x80; /* length of pulse */ + packet->data[11] = 0x00; /* stop period of pulse */ + packet->data[12] = 0x00; + packet->len = 13; + packet->pending = true; break; default: dev_dbg(&xpad->dev->dev, "%s - rumble command sent to unsupported xpad type: %d\n", __func__, xpad->xtype); - return -EINVAL; + retval = -EINVAL; + goto out; } - return usb_submit_urb(xpad->irq_out, GFP_ATOMIC); + retval = xpad_try_sending_next_out_packet(xpad); + +out: + spin_unlock_irqrestore(&xpad->odata_lock, flags); + return retval; } static int xpad_init_ff(struct usb_xpad *xpad) @@ -921,36 +1094,44 @@ struct xpad_led { */ static void xpad_send_led_command(struct usb_xpad *xpad, int command) { + struct xpad_output_packet *packet = + &xpad->out_packets[XPAD_OUT_LED_IDX]; + unsigned long flags; + command %= 16; - mutex_lock(&xpad->odata_mutex); + spin_lock_irqsave(&xpad->odata_lock, flags); switch (xpad->xtype) { case XTYPE_XBOX360: - xpad->odata[0] = 0x01; - xpad->odata[1] = 0x03; - xpad->odata[2] = command; - xpad->irq_out->transfer_buffer_length = 3; + packet->data[0] = 0x01; + packet->data[1] = 0x03; + packet->data[2] = command; + packet->len = 3; + packet->pending = true; break; + case XTYPE_XBOX360W: - xpad->odata[0] = 0x00; - xpad->odata[1] = 0x00; - xpad->odata[2] = 0x08; - xpad->odata[3] = 0x40 + command; - xpad->odata[4] = 0x00; - xpad->odata[5] = 0x00; - xpad->odata[6] = 0x00; - xpad->odata[7] = 0x00; - xpad->odata[8] = 0x00; - xpad->odata[9] = 0x00; - xpad->odata[10] = 0x00; - xpad->odata[11] = 0x00; - 
xpad->irq_out->transfer_buffer_length = 12; + packet->data[0] = 0x00; + packet->data[1] = 0x00; + packet->data[2] = 0x08; + packet->data[3] = 0x40 + command; + packet->data[4] = 0x00; + packet->data[5] = 0x00; + packet->data[6] = 0x00; + packet->data[7] = 0x00; + packet->data[8] = 0x00; + packet->data[9] = 0x00; + packet->data[10] = 0x00; + packet->data[11] = 0x00; + packet->len = 12; + packet->pending = true; break; } - usb_submit_urb(xpad->irq_out, GFP_KERNEL); - mutex_unlock(&xpad->odata_mutex); + xpad_try_sending_next_out_packet(xpad); + + spin_unlock_irqrestore(&xpad->odata_lock, flags); } /* @@ -959,7 +1140,7 @@ static void xpad_send_led_command(struct usb_xpad *xpad, int command) */ static void xpad_identify_controller(struct usb_xpad *xpad) { - xpad_send_led_command(xpad, (xpad->pad_nr % 4) + 2); + led_set_brightness(&xpad->led->led_cdev, (xpad->pad_nr % 4) + 2); } static void xpad_led_set(struct led_classdev *led_cdev, @@ -1001,14 +1182,7 @@ static int xpad_led_probe(struct usb_xpad *xpad) if (error) goto err_free_id; - if (xpad->xtype == XTYPE_XBOX360) { - /* - * Light up the segment corresponding to controller - * number on wired devices. On wireless we'll do that - * when they respond to "presence" packet. - */ - xpad_identify_controller(xpad); - } + xpad_identify_controller(xpad); return 0; @@ -1036,37 +1210,73 @@ static void xpad_led_disconnect(struct usb_xpad *xpad) { } static void xpad_identify_controller(struct usb_xpad *xpad) { } #endif -static int xpad_open(struct input_dev *dev) +static int xpad_start_input(struct usb_xpad *xpad) { - struct usb_xpad *xpad = input_get_drvdata(dev); - - /* URB was submitted in probe */ - if (xpad->xtype == XTYPE_XBOX360W) - return 0; + int error; - xpad->irq_in->dev = xpad->udev; if (usb_submit_urb(xpad->irq_in, GFP_KERNEL)) return -EIO; if (xpad->xtype == XTYPE_XBOXONE) { - /* Xbox one controller needs to be initialized. */ - xpad->odata[0] = 0x05; - xpad->odata[1] = 0x20; - xpad->irq_out->transfer_buffer_length = 2; - return usb_submit_urb(xpad->irq_out, GFP_KERNEL); + error = xpad_start_xbox_one(xpad); + if (error) { + usb_kill_urb(xpad->irq_in); + return error; + } } return 0; } -static void xpad_close(struct input_dev *dev) +static void xpad_stop_input(struct usb_xpad *xpad) { - struct usb_xpad *xpad = input_get_drvdata(dev); + usb_kill_urb(xpad->irq_in); +} + +static int xpad360w_start_input(struct usb_xpad *xpad) +{ + int error; - if (xpad->xtype != XTYPE_XBOX360W) + error = usb_submit_urb(xpad->irq_in, GFP_KERNEL); + if (error) + return -EIO; + + /* + * Send presence packet. + * This will force the controller to resend connection packets. + * This is useful in the case we activate the module after the + * adapter has been plugged in, as it won't automatically + * send us info about the controllers. 
+ */ + error = xpad_inquiry_pad_presence(xpad); + if (error) { usb_kill_urb(xpad->irq_in); + return error; + } - xpad_stop_output(xpad); + return 0; +} + +static void xpad360w_stop_input(struct usb_xpad *xpad) +{ + usb_kill_urb(xpad->irq_in); + + /* Make sure we are done with presence work if it was scheduled */ + flush_work(&xpad->work); +} + +static int xpad_open(struct input_dev *dev) +{ + struct usb_xpad *xpad = input_get_drvdata(dev); + + return xpad_start_input(xpad); +} + +static void xpad_close(struct input_dev *dev) +{ + struct usb_xpad *xpad = input_get_drvdata(dev); + + xpad_stop_input(xpad); } static void xpad_set_up_abs(struct input_dev *input_dev, signed short abs) @@ -1097,8 +1307,11 @@ static void xpad_set_up_abs(struct input_dev *input_dev, signed short abs) static void xpad_deinit_input(struct usb_xpad *xpad) { - xpad_led_disconnect(xpad); - input_unregister_device(xpad->dev); + if (xpad->input_created) { + xpad->input_created = false; + xpad_led_disconnect(xpad); + input_unregister_device(xpad->dev); + } } static int xpad_init_input(struct usb_xpad *xpad) @@ -1118,8 +1331,10 @@ static int xpad_init_input(struct usb_xpad *xpad) input_set_drvdata(input_dev, xpad); - input_dev->open = xpad_open; - input_dev->close = xpad_close; + if (xpad->xtype != XTYPE_XBOX360W) { + input_dev->open = xpad_open; + input_dev->close = xpad_close; + } __set_bit(EV_KEY, input_dev->evbit); @@ -1181,6 +1396,7 @@ static int xpad_init_input(struct usb_xpad *xpad) if (error) goto err_disconnect_led; + xpad->input_created = true; return 0; err_disconnect_led: @@ -1241,6 +1457,7 @@ static int xpad_probe(struct usb_interface *intf, const struct usb_device_id *id xpad->mapping = xpad_device[i].mapping; xpad->xtype = xpad_device[i].xtype; xpad->name = xpad_device[i].name; + INIT_WORK(&xpad->work, xpad_presence_work); if (xpad->xtype == XTYPE_UNKNOWN) { if (intf->cur_altsetting->desc.bInterfaceClass == USB_CLASS_VENDOR_SPEC) { @@ -1277,10 +1494,6 @@ static int xpad_probe(struct usb_interface *intf, const struct usb_device_id *id usb_set_intfdata(intf, xpad); - error = xpad_init_input(xpad); - if (error) - goto err_deinit_output; - if (xpad->xtype == XTYPE_XBOX360W) { /* * Submit the int URB immediately rather than waiting for open @@ -1289,28 +1502,24 @@ static int xpad_probe(struct usb_interface *intf, const struct usb_device_id *id * exactly the message that a controller has arrived that * we're waiting for. */ - xpad->irq_in->dev = xpad->udev; - error = usb_submit_urb(xpad->irq_in, GFP_KERNEL); + error = xpad360w_start_input(xpad); if (error) - goto err_deinit_input; - + goto err_deinit_output; /* - * Send presence packet. - * This will force the controller to resend connection packets. - * This is useful in the case we activate the module after the - * adapter has been plugged in, as it won't automatically - * send us info about the controllers. + * Wireless controllers require RESET_RESUME to work properly + * after suspend. Ideally this quirk should be in usb core + * quirk list, but we have too many vendors producing these + * controllers and we'd need to maintain 2 identical lists + * here in this driver and in usb core. 
*/ - error = xpad_inquiry_pad_presence(xpad); + udev->quirks |= USB_QUIRK_RESET_RESUME; + } else { + error = xpad_init_input(xpad); if (error) - goto err_kill_in_urb; + goto err_deinit_output; } return 0; -err_kill_in_urb: - usb_kill_urb(xpad->irq_in); -err_deinit_input: - xpad_deinit_input(xpad); err_deinit_output: xpad_deinit_output(xpad); err_free_in_urb: @@ -1320,19 +1529,24 @@ err_free_idata: err_free_mem: kfree(xpad); return error; - } static void xpad_disconnect(struct usb_interface *intf) { - struct usb_xpad *xpad = usb_get_intfdata (intf); + struct usb_xpad *xpad = usb_get_intfdata(intf); + + if (xpad->xtype == XTYPE_XBOX360W) + xpad360w_stop_input(xpad); xpad_deinit_input(xpad); - xpad_deinit_output(xpad); - if (xpad->xtype == XTYPE_XBOX360W) { - usb_kill_urb(xpad->irq_in); - } + /* + * Now that both input device and LED device are gone we can + * stop output URB. + */ + xpad_stop_output(xpad); + + xpad_deinit_output(xpad); usb_free_urb(xpad->irq_in); usb_free_coherent(xpad->udev, XPAD_PKT_LEN, @@ -1343,10 +1557,55 @@ static void xpad_disconnect(struct usb_interface *intf) usb_set_intfdata(intf, NULL); } +static int xpad_suspend(struct usb_interface *intf, pm_message_t message) +{ + struct usb_xpad *xpad = usb_get_intfdata(intf); + struct input_dev *input = xpad->dev; + + if (xpad->xtype == XTYPE_XBOX360W) { + /* + * Wireless controllers always listen to input so + * they are notified when controller shows up + * or goes away. + */ + xpad360w_stop_input(xpad); + } else { + mutex_lock(&input->mutex); + if (input->users) + xpad_stop_input(xpad); + mutex_unlock(&input->mutex); + } + + xpad_stop_output(xpad); + + return 0; +} + +static int xpad_resume(struct usb_interface *intf) +{ + struct usb_xpad *xpad = usb_get_intfdata(intf); + struct input_dev *input = xpad->dev; + int retval = 0; + + if (xpad->xtype == XTYPE_XBOX360W) { + retval = xpad360w_start_input(xpad); + } else { + mutex_lock(&input->mutex); + if (input->users) + retval = xpad_start_input(xpad); + mutex_unlock(&input->mutex); + } + + return retval; +} + static struct usb_driver xpad_driver = { .name = "xpad", .probe = xpad_probe, .disconnect = xpad_disconnect, + .suspend = xpad_suspend, + .resume = xpad_resume, + .reset_resume = xpad_resume, .id_table = xpad_table, }; diff --git a/drivers/input/keyboard/gpio_keys.c b/drivers/input/keyboard/gpio_keys.c index b9f01bd1b7ef..29093657f2ef 100644 --- a/drivers/input/keyboard/gpio_keys.c +++ b/drivers/input/keyboard/gpio_keys.c @@ -630,7 +630,7 @@ gpio_keys_get_devtree_pdata(struct device *dev) if (!node) return ERR_PTR(-ENODEV); - nbuttons = of_get_child_count(node); + nbuttons = of_get_available_child_count(node); if (nbuttons == 0) return ERR_PTR(-ENODEV); @@ -645,8 +645,10 @@ gpio_keys_get_devtree_pdata(struct device *dev) pdata->rep = !!of_get_property(node, "autorepeat", NULL); + of_property_read_string(node, "label", &pdata->name); + i = 0; - for_each_child_of_node(node, pp) { + for_each_available_child_of_node(node, pp) { enum of_gpio_flags flags; button = &pdata->buttons[i++]; diff --git a/drivers/input/touchscreen/atmel_mxt_ts.c b/drivers/input/touchscreen/atmel_mxt_ts.c index 2d5794ec338b..2160512e861a 100644 --- a/drivers/input/touchscreen/atmel_mxt_ts.c +++ b/drivers/input/touchscreen/atmel_mxt_ts.c @@ -113,8 +113,8 @@ struct t7_config { #define MXT_T9_DETECT (1 << 7) struct t9_range { - u16 x; - u16 y; + __le16 x; + __le16 y; } __packed; /* MXT_TOUCH_MULTI_T9 orient */ @@ -216,6 +216,7 @@ struct mxt_data { unsigned int irq; unsigned int max_x; unsigned int max_y; 
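+ /* T9/T100 orientation switch bit; when set, max_x and max_y are swapped at input-device init */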
+ bool xy_switch; bool in_bootloader; u16 mem_size; u8 t100_aux_ampl; @@ -1665,8 +1666,8 @@ static int mxt_read_t9_resolution(struct mxt_data *data) if (error) return error; - le16_to_cpus(&range.x); - le16_to_cpus(&range.y); + data->max_x = get_unaligned_le16(&range.x); + data->max_y = get_unaligned_le16(&range.y); error = __mxt_read_reg(client, object->start_address + MXT_T9_ORIENT, @@ -1674,23 +1675,7 @@ static int mxt_read_t9_resolution(struct mxt_data *data) if (error) return error; - /* Handle default values */ - if (range.x == 0) - range.x = 1023; - - if (range.y == 0) - range.y = 1023; - - if (orient & MXT_T9_ORIENT_SWITCH) { - data->max_x = range.y; - data->max_y = range.x; - } else { - data->max_x = range.x; - data->max_y = range.y; - } - - dev_dbg(&client->dev, - "Touchscreen size X%uY%u\n", data->max_x, data->max_y); + data->xy_switch = orient & MXT_T9_ORIENT_SWITCH; return 0; } @@ -1708,13 +1693,14 @@ static int mxt_read_t100_config(struct mxt_data *data) if (!object) return -EINVAL; + /* read touchscreen dimensions */ error = __mxt_read_reg(client, object->start_address + MXT_T100_XRANGE, sizeof(range_x), &range_x); if (error) return error; - le16_to_cpus(&range_x); + data->max_x = get_unaligned_le16(&range_x); error = __mxt_read_reg(client, object->start_address + MXT_T100_YRANGE, @@ -1722,36 +1708,24 @@ static int mxt_read_t100_config(struct mxt_data *data) if (error) return error; - le16_to_cpus(&range_y); + data->max_y = get_unaligned_le16(&range_y); + /* read orientation config */ error = __mxt_read_reg(client, object->start_address + MXT_T100_CFG1, 1, &cfg); if (error) return error; + data->xy_switch = cfg & MXT_T100_CFG_SWITCHXY; + + /* allocate aux bytes */ error = __mxt_read_reg(client, object->start_address + MXT_T100_TCHAUX, 1, &tchaux); if (error) return error; - /* Handle default values */ - if (range_x == 0) - range_x = 1023; - - if (range_y == 0) - range_y = 1023; - - if (cfg & MXT_T100_CFG_SWITCHXY) { - data->max_x = range_y; - data->max_y = range_x; - } else { - data->max_x = range_x; - data->max_y = range_y; - } - - /* allocate aux bytes */ aux = 6; if (tchaux & MXT_T100_TCHAUX_VECT) @@ -1767,9 +1741,6 @@ static int mxt_read_t100_config(struct mxt_data *data) "T100 aux mappings vect:%u ampl:%u area:%u\n", data->t100_aux_vect, data->t100_aux_ampl, data->t100_aux_area); - dev_info(&client->dev, - "T100 Touchscreen size X%uY%u\n", data->max_x, data->max_y); - return 0; } @@ -1828,6 +1799,19 @@ static int mxt_initialize_input_device(struct mxt_data *data) return -EINVAL; } + /* Handle default values and orientation switch */ + if (data->max_x == 0) + data->max_x = 1023; + + if (data->max_y == 0) + data->max_y = 1023; + + if (data->xy_switch) + swap(data->max_x, data->max_y); + + dev_info(dev, "Touchscreen size X%uY%u\n", data->max_x, data->max_y); + + /* Register input device */ input_dev = input_allocate_device(); if (!input_dev) { dev_err(dev, "Failed to allocate memory\n"); diff --git a/drivers/lightnvm/Makefile b/drivers/lightnvm/Makefile index 7e0f42acb737..a7a0a22cf1a5 100644 --- a/drivers/lightnvm/Makefile +++ b/drivers/lightnvm/Makefile @@ -2,6 +2,6 @@ # Makefile for Open-Channel SSDs. 
# -obj-$(CONFIG_NVM) := core.o +obj-$(CONFIG_NVM) := core.o sysblk.o obj-$(CONFIG_NVM_GENNVM) += gennvm.o obj-$(CONFIG_NVM_RRPC) += rrpc.o diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 8f41b245cd55..33224cb91c5b 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -27,6 +27,7 @@ #include <linux/module.h> #include <linux/miscdevice.h> #include <linux/lightnvm.h> +#include <linux/sched/sysctl.h> #include <uapi/linux/lightnvm.h> static LIST_HEAD(nvm_targets); @@ -105,6 +106,9 @@ struct nvmm_type *nvm_init_mgr(struct nvm_dev *dev) lockdep_assert_held(&nvm_lock); list_for_each_entry(mt, &nvm_mgrs, list) { + if (strncmp(dev->sb.mmtype, mt->name, NVM_MMTYPE_LEN)) + continue; + ret = mt->register_mgr(dev); if (ret < 0) { pr_err("nvm: media mgr failed to init (%d) on dev %s\n", @@ -166,6 +170,20 @@ static struct nvm_dev *nvm_find_nvm_dev(const char *name) return NULL; } +struct nvm_block *nvm_get_blk_unlocked(struct nvm_dev *dev, struct nvm_lun *lun, + unsigned long flags) +{ + return dev->mt->get_blk_unlocked(dev, lun, flags); +} +EXPORT_SYMBOL(nvm_get_blk_unlocked); + +/* Assumes that all valid pages have already been moved on release to bm */ +void nvm_put_blk_unlocked(struct nvm_dev *dev, struct nvm_block *blk) +{ + return dev->mt->put_blk_unlocked(dev, blk); +} +EXPORT_SYMBOL(nvm_put_blk_unlocked); + struct nvm_block *nvm_get_blk(struct nvm_dev *dev, struct nvm_lun *lun, unsigned long flags) { @@ -192,6 +210,206 @@ int nvm_erase_blk(struct nvm_dev *dev, struct nvm_block *blk) } EXPORT_SYMBOL(nvm_erase_blk); +void nvm_addr_to_generic_mode(struct nvm_dev *dev, struct nvm_rq *rqd) +{ + int i; + + if (rqd->nr_pages > 1) { + for (i = 0; i < rqd->nr_pages; i++) + rqd->ppa_list[i] = dev_to_generic_addr(dev, + rqd->ppa_list[i]); + } else { + rqd->ppa_addr = dev_to_generic_addr(dev, rqd->ppa_addr); + } +} +EXPORT_SYMBOL(nvm_addr_to_generic_mode); + +void nvm_generic_to_addr_mode(struct nvm_dev *dev, struct nvm_rq *rqd) +{ + int i; + + if (rqd->nr_pages > 1) { + for (i = 0; i < rqd->nr_pages; i++) + rqd->ppa_list[i] = generic_to_dev_addr(dev, + rqd->ppa_list[i]); + } else { + rqd->ppa_addr = generic_to_dev_addr(dev, rqd->ppa_addr); + } +} +EXPORT_SYMBOL(nvm_generic_to_addr_mode); + +int nvm_set_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd, + struct ppa_addr *ppas, int nr_ppas) +{ + int i, plane_cnt, pl_idx; + + if (dev->plane_mode == NVM_PLANE_SINGLE && nr_ppas == 1) { + rqd->nr_pages = 1; + rqd->ppa_addr = ppas[0]; + + return 0; + } + + plane_cnt = (1 << dev->plane_mode); + rqd->nr_pages = plane_cnt * nr_ppas; + + if (dev->ops->max_phys_sect < rqd->nr_pages) + return -EINVAL; + + rqd->ppa_list = nvm_dev_dma_alloc(dev, GFP_KERNEL, &rqd->dma_ppa_list); + if (!rqd->ppa_list) { + pr_err("nvm: failed to allocate dma memory\n"); + return -ENOMEM; + } + + for (pl_idx = 0; pl_idx < plane_cnt; pl_idx++) { + for (i = 0; i < nr_ppas; i++) { + ppas[i].g.pl = pl_idx; + rqd->ppa_list[(pl_idx * nr_ppas) + i] = ppas[i]; + } + } + + return 0; +} +EXPORT_SYMBOL(nvm_set_rqd_ppalist); + +void nvm_free_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd) +{ + if (!rqd->ppa_list) + return; + + nvm_dev_dma_free(dev, rqd->ppa_list, rqd->dma_ppa_list); +} +EXPORT_SYMBOL(nvm_free_rqd_ppalist); + +int nvm_erase_ppa(struct nvm_dev *dev, struct ppa_addr *ppas, int nr_ppas) +{ + struct nvm_rq rqd; + int ret; + + if (!dev->ops->erase_block) + return 0; + + memset(&rqd, 0, sizeof(struct nvm_rq)); + + ret = nvm_set_rqd_ppalist(dev, &rqd, ppas, nr_ppas); + if (ret) + return ret; + + 
nvm_generic_to_addr_mode(dev, &rqd); + + ret = dev->ops->erase_block(dev, &rqd); + + nvm_free_rqd_ppalist(dev, &rqd); + + return ret; +} +EXPORT_SYMBOL(nvm_erase_ppa); + +void nvm_end_io(struct nvm_rq *rqd, int error) +{ + rqd->error = error; + rqd->end_io(rqd); +} +EXPORT_SYMBOL(nvm_end_io); + +static void nvm_end_io_sync(struct nvm_rq *rqd) +{ + struct completion *waiting = rqd->wait; + + rqd->wait = NULL; + + complete(waiting); +} + +int nvm_submit_ppa(struct nvm_dev *dev, struct ppa_addr *ppa, int nr_ppas, + int opcode, int flags, void *buf, int len) +{ + DECLARE_COMPLETION_ONSTACK(wait); + struct nvm_rq rqd; + struct bio *bio; + int ret; + unsigned long hang_check; + + bio = bio_map_kern(dev->q, buf, len, GFP_KERNEL); + if (IS_ERR_OR_NULL(bio)) + return -ENOMEM; + + memset(&rqd, 0, sizeof(struct nvm_rq)); + ret = nvm_set_rqd_ppalist(dev, &rqd, ppa, nr_ppas); + if (ret) { + bio_put(bio); + return ret; + } + + rqd.opcode = opcode; + rqd.bio = bio; + rqd.wait = &wait; + rqd.dev = dev; + rqd.end_io = nvm_end_io_sync; + rqd.flags = flags; + nvm_generic_to_addr_mode(dev, &rqd); + + ret = dev->ops->submit_io(dev, &rqd); + + /* Prevent hang_check timer from firing at us during very long I/O */ + hang_check = sysctl_hung_task_timeout_secs; + if (hang_check) + while (!wait_for_completion_io_timeout(&wait, hang_check * (HZ/2))); + else + wait_for_completion_io(&wait); + + nvm_free_rqd_ppalist(dev, &rqd); + + return rqd.error; +} +EXPORT_SYMBOL(nvm_submit_ppa); + +static int nvm_init_slc_tbl(struct nvm_dev *dev, struct nvm_id_group *grp) +{ + int i; + + dev->lps_per_blk = dev->pgs_per_blk; + dev->lptbl = kcalloc(dev->lps_per_blk, sizeof(int), GFP_KERNEL); + if (!dev->lptbl) + return -ENOMEM; + + /* Just a linear array */ + for (i = 0; i < dev->lps_per_blk; i++) + dev->lptbl[i] = i; + + return 0; +} + +static int nvm_init_mlc_tbl(struct nvm_dev *dev, struct nvm_id_group *grp) +{ + int i, p; + struct nvm_id_lp_mlc *mlc = &grp->lptbl.mlc; + + if (!mlc->num_pairs) + return 0; + + dev->lps_per_blk = mlc->num_pairs; + dev->lptbl = kcalloc(dev->lps_per_blk, sizeof(int), GFP_KERNEL); + if (!dev->lptbl) + return -ENOMEM; + + /* The lower page table encoding consists of a list of bytes, where each + * has a lower and an upper half. 
The first half byte maintains the + * increment value and every value after is an offset added to the + * previous incrementation value */ + dev->lptbl[0] = mlc->pairs[0] & 0xF; + for (i = 1; i < dev->lps_per_blk; i++) { + p = mlc->pairs[i >> 1]; + if (i & 0x1) /* upper */ + dev->lptbl[i] = dev->lptbl[i - 1] + ((p & 0xF0) >> 4); + else /* lower */ + dev->lptbl[i] = dev->lptbl[i - 1] + (p & 0xF); + } + + return 0; +} + static int nvm_core_init(struct nvm_dev *dev) { struct nvm_id *id = &dev->identity; @@ -206,6 +424,7 @@ static int nvm_core_init(struct nvm_dev *dev) dev->sec_size = grp->csecs; dev->oob_size = grp->sos; dev->sec_per_pg = grp->fpg_sz / grp->csecs; + dev->mccap = grp->mccap; memcpy(&dev->ppaf, &id->ppaf, sizeof(struct nvm_addr_format)); dev->plane_mode = NVM_PLANE_SINGLE; @@ -216,11 +435,23 @@ static int nvm_core_init(struct nvm_dev *dev) return -EINVAL; } - if (grp->fmtype != 0 && grp->fmtype != 1) { + switch (grp->fmtype) { + case NVM_ID_FMTYPE_SLC: + if (nvm_init_slc_tbl(dev, grp)) + return -ENOMEM; + break; + case NVM_ID_FMTYPE_MLC: + if (nvm_init_mlc_tbl(dev, grp)) + return -ENOMEM; + break; + default: pr_err("nvm: flash type not supported\n"); return -EINVAL; } + if (!dev->lps_per_blk) + pr_info("nvm: lower page programming table missing\n"); + if (grp->mpos & 0x020202) dev->plane_mode = NVM_PLANE_DOUBLE; if (grp->mpos & 0x040404) @@ -238,6 +469,7 @@ static int nvm_core_init(struct nvm_dev *dev) dev->nr_chnls; dev->total_pages = dev->total_blocks * dev->pgs_per_blk; INIT_LIST_HEAD(&dev->online_targets); + mutex_init(&dev->mlock); return 0; } @@ -249,6 +481,8 @@ static void nvm_free(struct nvm_dev *dev) if (dev->mt) dev->mt->unregister_mgr(dev); + + kfree(dev->lptbl); } static int nvm_init(struct nvm_dev *dev) @@ -338,9 +572,16 @@ int nvm_register(struct request_queue *q, char *disk_name, } } + ret = nvm_get_sysblock(dev, &dev->sb); + if (!ret) + pr_err("nvm: device not initialized.\n"); + else if (ret < 0) + pr_err("nvm: err (%d) on device initialization\n", ret); + /* register device with a supported media manager */ down_write(&nvm_lock); - dev->mt = nvm_init_mgr(dev); + if (ret > 0) + dev->mt = nvm_init_mgr(dev); list_add(&dev->devices, &nvm_devices); up_write(&nvm_lock); @@ -788,6 +1029,97 @@ static long nvm_ioctl_dev_remove(struct file *file, void __user *arg) return __nvm_configure_remove(&remove); } +static void nvm_setup_nvm_sb_info(struct nvm_sb_info *info) +{ + info->seqnr = 1; + info->erase_cnt = 0; + info->version = 1; +} + +static long __nvm_ioctl_dev_init(struct nvm_ioctl_dev_init *init) +{ + struct nvm_dev *dev; + struct nvm_sb_info info; + int ret; + + down_write(&nvm_lock); + dev = nvm_find_nvm_dev(init->dev); + up_write(&nvm_lock); + if (!dev) { + pr_err("nvm: device not found\n"); + return -EINVAL; + } + + nvm_setup_nvm_sb_info(&info); + + strncpy(info.mmtype, init->mmtype, NVM_MMTYPE_LEN); + info.fs_ppa.ppa = -1; + + ret = nvm_init_sysblock(dev, &info); + if (ret) + return ret; + + memcpy(&dev->sb, &info, sizeof(struct nvm_sb_info)); + + down_write(&nvm_lock); + dev->mt = nvm_init_mgr(dev); + up_write(&nvm_lock); + + return 0; +} + +static long nvm_ioctl_dev_init(struct file *file, void __user *arg) +{ + struct nvm_ioctl_dev_init init; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&init, arg, sizeof(struct nvm_ioctl_dev_init))) + return -EFAULT; + + if (init.flags != 0) { + pr_err("nvm: no flags supported\n"); + return -EINVAL; + } + + init.dev[DISK_NAME_LEN - 1] = '\0'; + + return __nvm_ioctl_dev_init(&init); +} + +static 
long nvm_ioctl_dev_factory(struct file *file, void __user *arg) +{ + struct nvm_ioctl_dev_factory fact; + struct nvm_dev *dev; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&fact, arg, sizeof(struct nvm_ioctl_dev_factory))) + return -EFAULT; + + fact.dev[DISK_NAME_LEN - 1] = '\0'; + + if (fact.flags & ~(NVM_FACTORY_NR_BITS - 1)) + return -EINVAL; + + down_write(&nvm_lock); + dev = nvm_find_nvm_dev(fact.dev); + up_write(&nvm_lock); + if (!dev) { + pr_err("nvm: device not found\n"); + return -EINVAL; + } + + if (dev->mt) { + dev->mt->unregister_mgr(dev); + dev->mt = NULL; + } + + return nvm_dev_factory(dev, fact.flags); +} + static long nvm_ctl_ioctl(struct file *file, uint cmd, unsigned long arg) { void __user *argp = (void __user *)arg; @@ -801,6 +1133,10 @@ static long nvm_ctl_ioctl(struct file *file, uint cmd, unsigned long arg) return nvm_ioctl_dev_create(file, argp); case NVM_DEV_REMOVE: return nvm_ioctl_dev_remove(file, argp); + case NVM_DEV_INIT: + return nvm_ioctl_dev_init(file, argp); + case NVM_DEV_FACTORY: + return nvm_ioctl_dev_factory(file, argp); } return 0; } diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c index a54b339951a3..7fb725b16148 100644 --- a/drivers/lightnvm/gennvm.c +++ b/drivers/lightnvm/gennvm.c @@ -60,7 +60,8 @@ static int gennvm_luns_init(struct nvm_dev *dev, struct gen_nvm *gn) lun->vlun.lun_id = i % dev->luns_per_chnl; lun->vlun.chnl_id = i / dev->luns_per_chnl; lun->vlun.nr_free_blocks = dev->blks_per_lun; - lun->vlun.nr_inuse_blocks = 0; + lun->vlun.nr_open_blocks = 0; + lun->vlun.nr_closed_blocks = 0; lun->vlun.nr_bad_blocks = 0; } return 0; @@ -89,6 +90,7 @@ static int gennvm_block_bb(struct ppa_addr ppa, int nr_blocks, u8 *blks, list_move_tail(&blk->list, &lun->bb_list); lun->vlun.nr_bad_blocks++; + lun->vlun.nr_free_blocks--; } return 0; @@ -133,15 +135,15 @@ static int gennvm_block_map(u64 slba, u32 nlb, __le64 *entries, void *private) pba = pba - (dev->sec_per_lun * lun_id); blk = &lun->vlun.blocks[div_u64(pba, dev->sec_per_blk)]; - if (!blk->type) { + if (!blk->state) { /* at this point, we don't know anything about the * block. It's up to the FTL on top to re-establish the - * block state + * block state. The block is assumed to be open. 
*/ list_move_tail(&blk->list, &lun->used_list); - blk->type = 1; + blk->state = NVM_BLK_ST_OPEN; lun->vlun.nr_free_blocks--; - lun->vlun.nr_inuse_blocks++; + lun->vlun.nr_open_blocks++; } } @@ -255,14 +257,14 @@ static void gennvm_unregister(struct nvm_dev *dev) module_put(THIS_MODULE); } -static struct nvm_block *gennvm_get_blk(struct nvm_dev *dev, +static struct nvm_block *gennvm_get_blk_unlocked(struct nvm_dev *dev, struct nvm_lun *vlun, unsigned long flags) { struct gen_lun *lun = container_of(vlun, struct gen_lun, vlun); struct nvm_block *blk = NULL; int is_gc = flags & NVM_IOTYPE_GC; - spin_lock(&vlun->lock); + assert_spin_locked(&vlun->lock); if (list_empty(&lun->free_list)) { pr_err_ratelimited("gennvm: lun %u have no free pages available", @@ -275,83 +277,64 @@ static struct nvm_block *gennvm_get_blk(struct nvm_dev *dev, blk = list_first_entry(&lun->free_list, struct nvm_block, list); list_move_tail(&blk->list, &lun->used_list); - blk->type = 1; + blk->state = NVM_BLK_ST_OPEN; lun->vlun.nr_free_blocks--; - lun->vlun.nr_inuse_blocks++; + lun->vlun.nr_open_blocks++; out: + return blk; +} + +static struct nvm_block *gennvm_get_blk(struct nvm_dev *dev, + struct nvm_lun *vlun, unsigned long flags) +{ + struct nvm_block *blk; + + spin_lock(&vlun->lock); + blk = gennvm_get_blk_unlocked(dev, vlun, flags); spin_unlock(&vlun->lock); return blk; } -static void gennvm_put_blk(struct nvm_dev *dev, struct nvm_block *blk) +static void gennvm_put_blk_unlocked(struct nvm_dev *dev, struct nvm_block *blk) { struct nvm_lun *vlun = blk->lun; struct gen_lun *lun = container_of(vlun, struct gen_lun, vlun); - spin_lock(&vlun->lock); + assert_spin_locked(&vlun->lock); - switch (blk->type) { - case 1: + if (blk->state & NVM_BLK_ST_OPEN) { list_move_tail(&blk->list, &lun->free_list); + lun->vlun.nr_open_blocks--; lun->vlun.nr_free_blocks++; - lun->vlun.nr_inuse_blocks--; - blk->type = 0; - break; - case 2: + blk->state = NVM_BLK_ST_FREE; + } else if (blk->state & NVM_BLK_ST_CLOSED) { + list_move_tail(&blk->list, &lun->free_list); + lun->vlun.nr_closed_blocks--; + lun->vlun.nr_free_blocks++; + blk->state = NVM_BLK_ST_FREE; + } else if (blk->state & NVM_BLK_ST_BAD) { list_move_tail(&blk->list, &lun->bb_list); lun->vlun.nr_bad_blocks++; - lun->vlun.nr_inuse_blocks--; - break; - default: + blk->state = NVM_BLK_ST_BAD; + } else { WARN_ON_ONCE(1); pr_err("gennvm: erroneous block type (%lu -> %u)\n", - blk->id, blk->type); + blk->id, blk->state); list_move_tail(&blk->list, &lun->bb_list); lun->vlun.nr_bad_blocks++; - lun->vlun.nr_inuse_blocks--; - } - - spin_unlock(&vlun->lock); -} - -static void gennvm_addr_to_generic_mode(struct nvm_dev *dev, struct nvm_rq *rqd) -{ - int i; - - if (rqd->nr_pages > 1) { - for (i = 0; i < rqd->nr_pages; i++) - rqd->ppa_list[i] = dev_to_generic_addr(dev, - rqd->ppa_list[i]); - } else { - rqd->ppa_addr = dev_to_generic_addr(dev, rqd->ppa_addr); + blk->state = NVM_BLK_ST_BAD; } } -static void gennvm_generic_to_addr_mode(struct nvm_dev *dev, struct nvm_rq *rqd) -{ - int i; - - if (rqd->nr_pages > 1) { - for (i = 0; i < rqd->nr_pages; i++) - rqd->ppa_list[i] = generic_to_dev_addr(dev, - rqd->ppa_list[i]); - } else { - rqd->ppa_addr = generic_to_dev_addr(dev, rqd->ppa_addr); - } -} - -static int gennvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) +static void gennvm_put_blk(struct nvm_dev *dev, struct nvm_block *blk) { - if (!dev->ops->submit_io) - return 0; - - /* Convert address space */ - gennvm_generic_to_addr_mode(dev, rqd); + struct nvm_lun *vlun = blk->lun; - rqd->dev = 
dev; - return dev->ops->submit_io(dev, rqd); + spin_lock(&vlun->lock); + gennvm_put_blk_unlocked(dev, blk); + spin_unlock(&vlun->lock); } static void gennvm_blk_set_type(struct nvm_dev *dev, struct ppa_addr *ppa, @@ -376,7 +359,7 @@ static void gennvm_blk_set_type(struct nvm_dev *dev, struct ppa_addr *ppa, blk = &lun->vlun.blocks[ppa->g.blk]; /* will be moved to bb list on put_blk from target */ - blk->type = type; + blk->state = type; } /* mark block bad. It is expected the target recover from the error. */ @@ -390,77 +373,51 @@ static void gennvm_mark_blk_bad(struct nvm_dev *dev, struct nvm_rq *rqd) if (dev->ops->set_bb_tbl(dev, rqd, 1)) return; - gennvm_addr_to_generic_mode(dev, rqd); + nvm_addr_to_generic_mode(dev, rqd); /* look up blocks and mark them as bad */ if (rqd->nr_pages > 1) for (i = 0; i < rqd->nr_pages; i++) - gennvm_blk_set_type(dev, &rqd->ppa_list[i], 2); + gennvm_blk_set_type(dev, &rqd->ppa_list[i], + NVM_BLK_ST_BAD); else - gennvm_blk_set_type(dev, &rqd->ppa_addr, 2); + gennvm_blk_set_type(dev, &rqd->ppa_addr, NVM_BLK_ST_BAD); } -static int gennvm_end_io(struct nvm_rq *rqd, int error) +static void gennvm_end_io(struct nvm_rq *rqd) { struct nvm_tgt_instance *ins = rqd->ins; - int ret = 0; - switch (error) { + switch (rqd->error) { case NVM_RSP_SUCCESS: - break; case NVM_RSP_ERR_EMPTYPAGE: break; case NVM_RSP_ERR_FAILWRITE: gennvm_mark_blk_bad(rqd->dev, rqd); - default: - ret++; } - ret += ins->tt->end_io(rqd, error); - - return ret; + ins->tt->end_io(rqd); } -static int gennvm_erase_blk(struct nvm_dev *dev, struct nvm_block *blk, - unsigned long flags) +static int gennvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) { - int plane_cnt = 0, pl_idx, ret; - struct ppa_addr addr; - struct nvm_rq rqd; - - if (!dev->ops->erase_block) - return 0; - - addr = block_to_ppa(dev, blk); - - if (dev->plane_mode == NVM_PLANE_SINGLE) { - rqd.nr_pages = 1; - rqd.ppa_addr = addr; - } else { - plane_cnt = (1 << dev->plane_mode); - rqd.nr_pages = plane_cnt; - - rqd.ppa_list = nvm_dev_dma_alloc(dev, GFP_KERNEL, - &rqd.dma_ppa_list); - if (!rqd.ppa_list) { - pr_err("gennvm: failed to allocate dma memory\n"); - return -ENOMEM; - } - - for (pl_idx = 0; pl_idx < plane_cnt; pl_idx++) { - addr.g.pl = pl_idx; - rqd.ppa_list[pl_idx] = addr; - } - } + if (!dev->ops->submit_io) + return -ENODEV; - gennvm_generic_to_addr_mode(dev, &rqd); + /* Convert address space */ + nvm_generic_to_addr_mode(dev, rqd); - ret = dev->ops->erase_block(dev, &rqd); + rqd->dev = dev; + rqd->end_io = gennvm_end_io; + return dev->ops->submit_io(dev, rqd); +} - if (plane_cnt) - nvm_dev_dma_free(dev, rqd.ppa_list, rqd.dma_ppa_list); +static int gennvm_erase_blk(struct nvm_dev *dev, struct nvm_block *blk, + unsigned long flags) +{ + struct ppa_addr addr = block_to_ppa(dev, blk); - return ret; + return nvm_erase_ppa(dev, &addr, 1); } static struct nvm_lun *gennvm_get_lun(struct nvm_dev *dev, int lunid) @@ -480,10 +437,11 @@ static void gennvm_lun_info_print(struct nvm_dev *dev) gennvm_for_each_lun(gn, lun, i) { spin_lock(&lun->vlun.lock); - pr_info("%s: lun%8u\t%u\t%u\t%u\n", + pr_info("%s: lun%8u\t%u\t%u\t%u\t%u\n", dev->name, i, lun->vlun.nr_free_blocks, - lun->vlun.nr_inuse_blocks, + lun->vlun.nr_open_blocks, + lun->vlun.nr_closed_blocks, lun->vlun.nr_bad_blocks); spin_unlock(&lun->vlun.lock); @@ -491,21 +449,23 @@ static void gennvm_lun_info_print(struct nvm_dev *dev) } static struct nvmm_type gennvm = { - .name = "gennvm", - .version = {0, 1, 0}, + .name = "gennvm", + .version = {0, 1, 0}, + + .register_mgr = 
gennvm_register, + .unregister_mgr = gennvm_unregister, - .register_mgr = gennvm_register, - .unregister_mgr = gennvm_unregister, + .get_blk_unlocked = gennvm_get_blk_unlocked, + .put_blk_unlocked = gennvm_put_blk_unlocked, - .get_blk = gennvm_get_blk, - .put_blk = gennvm_put_blk, + .get_blk = gennvm_get_blk, + .put_blk = gennvm_put_blk, - .submit_io = gennvm_submit_io, - .end_io = gennvm_end_io, - .erase_blk = gennvm_erase_blk, + .submit_io = gennvm_submit_io, + .erase_blk = gennvm_erase_blk, - .get_lun = gennvm_get_lun, - .lun_info_print = gennvm_lun_info_print, + .get_lun = gennvm_get_lun, + .lun_info_print = gennvm_lun_info_print, }; static int __init gennvm_module_init(void) diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c index 134e4faba482..d8c75958ced3 100644 --- a/drivers/lightnvm/rrpc.c +++ b/drivers/lightnvm/rrpc.c @@ -179,16 +179,23 @@ static void rrpc_set_lun_cur(struct rrpc_lun *rlun, struct rrpc_block *rblk) static struct rrpc_block *rrpc_get_blk(struct rrpc *rrpc, struct rrpc_lun *rlun, unsigned long flags) { + struct nvm_lun *lun = rlun->parent; struct nvm_block *blk; struct rrpc_block *rblk; - blk = nvm_get_blk(rrpc->dev, rlun->parent, flags); - if (!blk) + spin_lock(&lun->lock); + blk = nvm_get_blk_unlocked(rrpc->dev, rlun->parent, flags); + if (!blk) { + pr_err("nvm: rrpc: cannot get new block from media manager\n"); + spin_unlock(&lun->lock); return NULL; + } rblk = &rlun->blocks[blk->id]; - blk->priv = rblk; + list_add_tail(&rblk->list, &rlun->open_list); + spin_unlock(&lun->lock); + blk->priv = rblk; bitmap_zero(rblk->invalid_pages, rrpc->dev->pgs_per_blk); rblk->next_page = 0; rblk->nr_invalid_pages = 0; @@ -199,7 +206,13 @@ static struct rrpc_block *rrpc_get_blk(struct rrpc *rrpc, struct rrpc_lun *rlun, static void rrpc_put_blk(struct rrpc *rrpc, struct rrpc_block *rblk) { - nvm_put_blk(rrpc->dev, rblk->parent); + struct rrpc_lun *rlun = rblk->rlun; + struct nvm_lun *lun = rlun->parent; + + spin_lock(&lun->lock); + nvm_put_blk_unlocked(rrpc->dev, rblk->parent); + list_del(&rblk->list); + spin_unlock(&lun->lock); } static void rrpc_put_blks(struct rrpc *rrpc) @@ -287,6 +300,8 @@ static int rrpc_move_valid_pages(struct rrpc *rrpc, struct rrpc_block *rblk) } page = mempool_alloc(rrpc->page_pool, GFP_NOIO); + if (!page) + return -ENOMEM; while ((slot = find_first_zero_bit(rblk->invalid_pages, nr_pgs_per_blk)) < nr_pgs_per_blk) { @@ -328,6 +343,10 @@ try: goto finished; } wait_for_completion_io(&wait); + if (bio->bi_error) { + rrpc_inflight_laddr_release(rrpc, rqd); + goto finished; + } bio_reset(bio); reinit_completion(&wait); @@ -350,6 +369,8 @@ try: wait_for_completion_io(&wait); rrpc_inflight_laddr_release(rrpc, rqd); + if (bio->bi_error) + goto finished; bio_reset(bio); } @@ -373,16 +394,26 @@ static void rrpc_block_gc(struct work_struct *work) struct rrpc *rrpc = gcb->rrpc; struct rrpc_block *rblk = gcb->rblk; struct nvm_dev *dev = rrpc->dev; + struct nvm_lun *lun = rblk->parent->lun; + struct rrpc_lun *rlun = &rrpc->luns[lun->id - rrpc->lun_offset]; + mempool_free(gcb, rrpc->gcb_pool); pr_debug("nvm: block '%lu' being reclaimed\n", rblk->parent->id); if (rrpc_move_valid_pages(rrpc, rblk)) - goto done; + goto put_back; + + if (nvm_erase_blk(dev, rblk->parent)) + goto put_back; - nvm_erase_blk(dev, rblk->parent); rrpc_put_blk(rrpc, rblk); -done: - mempool_free(gcb, rrpc->gcb_pool); + + return; + +put_back: + spin_lock(&rlun->lock); + list_add_tail(&rblk->prio, &rlun->prio_list); + spin_unlock(&rlun->lock); } /* the block with highest number of invalid 
pages, will be in the beginning @@ -427,7 +458,7 @@ static void rrpc_lun_gc(struct work_struct *work) if (nr_blocks_need < rrpc->nr_luns) nr_blocks_need = rrpc->nr_luns; - spin_lock(&lun->lock); + spin_lock(&rlun->lock); while (nr_blocks_need > lun->nr_free_blocks && !list_empty(&rlun->prio_list)) { struct rrpc_block *rblock = block_prio_find_max(rlun); @@ -436,16 +467,16 @@ static void rrpc_lun_gc(struct work_struct *work) if (!rblock->nr_invalid_pages) break; + gcb = mempool_alloc(rrpc->gcb_pool, GFP_ATOMIC); + if (!gcb) + break; + list_del_init(&rblock->prio); BUG_ON(!block_is_full(rrpc, rblock)); pr_debug("rrpc: selected block '%lu' for GC\n", block->id); - gcb = mempool_alloc(rrpc->gcb_pool, GFP_ATOMIC); - if (!gcb) - break; - gcb->rrpc = rrpc; gcb->rblk = rblock; INIT_WORK(&gcb->ws_gc, rrpc_block_gc); @@ -454,7 +485,7 @@ static void rrpc_lun_gc(struct work_struct *work) nr_blocks_need--; } - spin_unlock(&lun->lock); + spin_unlock(&rlun->lock); /* TODO: Hint that request queue can be started again */ } @@ -635,12 +666,24 @@ static void rrpc_end_io_write(struct rrpc *rrpc, struct rrpc_rq *rrqd, lun = rblk->parent->lun; cmnt_size = atomic_inc_return(&rblk->data_cmnt_size); - if (unlikely(cmnt_size == rrpc->dev->pgs_per_blk)) + if (unlikely(cmnt_size == rrpc->dev->pgs_per_blk)) { + struct nvm_block *blk = rblk->parent; + struct rrpc_lun *rlun = rblk->rlun; + + spin_lock(&lun->lock); + lun->nr_open_blocks--; + lun->nr_closed_blocks++; + blk->state &= ~NVM_BLK_ST_OPEN; + blk->state |= NVM_BLK_ST_CLOSED; + list_move_tail(&rblk->list, &rlun->closed_list); + spin_unlock(&lun->lock); + rrpc_run_gc(rrpc, rblk); + } } } -static int rrpc_end_io(struct nvm_rq *rqd, int error) +static void rrpc_end_io(struct nvm_rq *rqd) { struct rrpc *rrpc = container_of(rqd->ins, struct rrpc, instance); struct rrpc_rq *rrqd = nvm_rq_to_pdu(rqd); @@ -650,11 +693,12 @@ static int rrpc_end_io(struct nvm_rq *rqd, int error) if (bio_data_dir(rqd->bio) == WRITE) rrpc_end_io_write(rrpc, rrqd, laddr, npages); + bio_put(rqd->bio); + if (rrqd->flags & NVM_IOTYPE_GC) - return 0; + return; rrpc_unlock_rq(rrpc, rqd); - bio_put(rqd->bio); if (npages > 1) nvm_dev_dma_free(rrpc->dev, rqd->ppa_list, rqd->dma_ppa_list); @@ -662,8 +706,6 @@ static int rrpc_end_io(struct nvm_rq *rqd, int error) nvm_dev_dma_free(rrpc->dev, rqd->metadata, rqd->dma_metadata); mempool_free(rqd, rrpc->rq_pool); - - return 0; } static int rrpc_read_ppalist_rq(struct rrpc *rrpc, struct bio *bio, @@ -841,6 +883,13 @@ static int rrpc_submit_io(struct rrpc *rrpc, struct bio *bio, err = nvm_submit_io(rrpc->dev, rqd); if (err) { pr_err("rrpc: I/O submission failed: %d\n", err); + bio_put(bio); + if (!(flags & NVM_IOTYPE_GC)) { + rrpc_unlock_rq(rrpc, rqd); + if (rqd->nr_pages > 1) + nvm_dev_dma_free(rrpc->dev, + rqd->ppa_list, rqd->dma_ppa_list); + } return NVM_IO_ERR; } @@ -1090,6 +1139,11 @@ static int rrpc_luns_init(struct rrpc *rrpc, int lun_begin, int lun_end) struct rrpc_lun *rlun; int i, j; + if (dev->pgs_per_blk > MAX_INVALID_PAGES_STORAGE * BITS_PER_LONG) { + pr_err("rrpc: number of pages per block too high."); + return -EINVAL; + } + spin_lock_init(&rrpc->rev_lock); rrpc->luns = kcalloc(rrpc->nr_luns, sizeof(struct rrpc_lun), @@ -1101,16 +1155,13 @@ static int rrpc_luns_init(struct rrpc *rrpc, int lun_begin, int lun_end) for (i = 0; i < rrpc->nr_luns; i++) { struct nvm_lun *lun = dev->mt->get_lun(dev, lun_begin + i); - if (dev->pgs_per_blk > - MAX_INVALID_PAGES_STORAGE * BITS_PER_LONG) { - pr_err("rrpc: number of pages per block too high."); - goto 
err; - } - rlun = &rrpc->luns[i]; rlun->rrpc = rrpc; rlun->parent = lun; INIT_LIST_HEAD(&rlun->prio_list); + INIT_LIST_HEAD(&rlun->open_list); + INIT_LIST_HEAD(&rlun->closed_list); + INIT_WORK(&rlun->ws_gc, rrpc_lun_gc); spin_lock_init(&rlun->lock); @@ -1127,6 +1178,7 @@ static int rrpc_luns_init(struct rrpc *rrpc, int lun_begin, int lun_end) struct nvm_block *blk = &lun->blocks[j]; rblk->parent = blk; + rblk->rlun = rlun; INIT_LIST_HEAD(&rblk->prio); spin_lock_init(&rblk->lock); } diff --git a/drivers/lightnvm/rrpc.h b/drivers/lightnvm/rrpc.h index a9696a06c38c..ef13ac7700c8 100644 --- a/drivers/lightnvm/rrpc.h +++ b/drivers/lightnvm/rrpc.h @@ -54,7 +54,9 @@ struct rrpc_rq { struct rrpc_block { struct nvm_block *parent; + struct rrpc_lun *rlun; struct list_head prio; + struct list_head list; #define MAX_INVALID_PAGES_STORAGE 8 /* Bitmap for invalid page entries */ @@ -73,7 +75,16 @@ struct rrpc_lun { struct nvm_lun *parent; struct rrpc_block *cur, *gc_cur; struct rrpc_block *blocks; /* Reference to block allocation */ - struct list_head prio_list; /* Blocks that may be GC'ed */ + + struct list_head prio_list; /* Blocks that may be GC'ed */ + struct list_head open_list; /* In-use open blocks. These are blocks + * that can be both written to and read + * from + */ + struct list_head closed_list; /* In-use closed blocks. These are + * blocks that can _only_ be read from + */ + struct work_struct ws_gc; spinlock_t lock; diff --git a/drivers/lightnvm/sysblk.c b/drivers/lightnvm/sysblk.c new file mode 100644 index 000000000000..321de1f154c5 --- /dev/null +++ b/drivers/lightnvm/sysblk.c @@ -0,0 +1,741 @@ +/* + * Copyright (C) 2015 Matias Bjorling. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, + * USA. + * + */ + +#include <linux/lightnvm.h> + +#define MAX_SYSBLKS 3 /* remember to update mapping scheme on change */ +#define MAX_BLKS_PR_SYSBLK 2 /* 2 blks with 256 pages and 3000 erases + * enables ~1.5M updates per sysblk unit + */ + +struct sysblk_scan { + /* A row is a collection of flash blocks for a system block.
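The ppas[] array below is a flat MAX_SYSBLKS x MAX_BLKS_PR_SYSBLK matrix addressed through scan_ppa_idx(). A standalone sketch of the indexing, printing the layout for the defaults of three rows and two blocks per row:

#include <stdio.h>

#define MAX_SYSBLKS 3
#define MAX_BLKS_PR_SYSBLK 2

static int scan_ppa_idx(int row, int blkid)
{
	return (row * MAX_BLKS_PR_SYSBLK) + blkid;
}

int main(void)
{
	int row, blkid;

	/* prints ppas[0]..ppas[5]: each row is a contiguous run of two slots */
	for (row = 0; row < MAX_SYSBLKS; row++)
		for (blkid = 0; blkid < MAX_BLKS_PR_SYSBLK; blkid++)
			printf("row %d, blk %d -> ppas[%d]\n",
			       row, blkid, scan_ppa_idx(row, blkid));
	return 0;
}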
*/ + int nr_rows; + int row; + int act_blk[MAX_SYSBLKS]; + + int nr_ppas; + struct ppa_addr ppas[MAX_SYSBLKS * MAX_BLKS_PR_SYSBLK];/* all sysblks */ +}; + +static inline int scan_ppa_idx(int row, int blkid) +{ + return (row * MAX_BLKS_PR_SYSBLK) + blkid; +} + +void nvm_sysblk_to_cpu(struct nvm_sb_info *info, struct nvm_system_block *sb) +{ + info->seqnr = be32_to_cpu(sb->seqnr); + info->erase_cnt = be32_to_cpu(sb->erase_cnt); + info->version = be16_to_cpu(sb->version); + strncpy(info->mmtype, sb->mmtype, NVM_MMTYPE_LEN); + info->fs_ppa.ppa = be64_to_cpu(sb->fs_ppa); +} + +void nvm_cpu_to_sysblk(struct nvm_system_block *sb, struct nvm_sb_info *info) +{ + sb->magic = cpu_to_be32(NVM_SYSBLK_MAGIC); + sb->seqnr = cpu_to_be32(info->seqnr); + sb->erase_cnt = cpu_to_be32(info->erase_cnt); + sb->version = cpu_to_be16(info->version); + strncpy(sb->mmtype, info->mmtype, NVM_MMTYPE_LEN); + sb->fs_ppa = cpu_to_be64(info->fs_ppa.ppa); +} + +static int nvm_setup_sysblks(struct nvm_dev *dev, struct ppa_addr *sysblk_ppas) +{ + int nr_rows = min_t(int, MAX_SYSBLKS, dev->nr_chnls); + int i; + + for (i = 0; i < nr_rows; i++) + sysblk_ppas[i].ppa = 0; + + /* if possible, place sysblk at first channel, middle channel and last + * channel of the device. If not, create only one or two sys blocks + */ + switch (dev->nr_chnls) { + case 2: + sysblk_ppas[1].g.ch = 1; + /* fall-through */ + case 1: + sysblk_ppas[0].g.ch = 0; + break; + default: + sysblk_ppas[0].g.ch = 0; + sysblk_ppas[1].g.ch = dev->nr_chnls / 2; + sysblk_ppas[2].g.ch = dev->nr_chnls - 1; + break; + } + + return nr_rows; +} + +void nvm_setup_sysblk_scan(struct nvm_dev *dev, struct sysblk_scan *s, + struct ppa_addr *sysblk_ppas) +{ + memset(s, 0, sizeof(struct sysblk_scan)); + s->nr_rows = nvm_setup_sysblks(dev, sysblk_ppas); +} + +static int sysblk_get_host_blks(struct ppa_addr ppa, int nr_blks, u8 *blks, + void *private) +{ + struct sysblk_scan *s = private; + int i, nr_sysblk = 0; + + for (i = 0; i < nr_blks; i++) { + if (blks[i] != NVM_BLK_T_HOST) + continue; + + if (s->nr_ppas == MAX_BLKS_PR_SYSBLK * MAX_SYSBLKS) { + pr_err("nvm: too many host blks\n"); + return -EINVAL; + } + + ppa.g.blk = i; + + s->ppas[scan_ppa_idx(s->row, nr_sysblk)] = ppa; + s->nr_ppas++; + nr_sysblk++; + } + + return 0; +} + +static int nvm_get_all_sysblks(struct nvm_dev *dev, struct sysblk_scan *s, + struct ppa_addr *ppas, nvm_bb_update_fn *fn) +{ + struct ppa_addr dppa; + int i, ret; + + s->nr_ppas = 0; + + for (i = 0; i < s->nr_rows; i++) { + dppa = generic_to_dev_addr(dev, ppas[i]); + s->row = i; + + ret = dev->ops->get_bb_tbl(dev, dppa, dev->blks_per_lun, fn, s); + if (ret) { + pr_err("nvm: failed bb tbl for ppa (%u %u)\n", + ppas[i].g.ch, + ppas[i].g.blk); + return ret; + } + } + + return ret; +} + +/* + * scans a block for latest sysblk. + * Returns: + * 0 - newer sysblk not found. PPA is updated to latest page. + * 1 - newer sysblk found and stored in *cur. PPA is updated to + * next valid page. + * <0- error. + */ +static int nvm_scan_block(struct nvm_dev *dev, struct ppa_addr *ppa, + struct nvm_system_block *sblk) +{ + struct nvm_system_block *cur; + int pg, cursz, ret, found = 0; + + /* the full buffer for a flash page is allocated. 
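The scan loop that follows implements a "latest entry wins" rule: pages are read in program order, the walk stops at the first empty page or missing magic, and the entry with the highest sequence number survives. A simplified userspace model, where struct entry and MAGIC stand in for struct nvm_system_block and NVM_SYSBLK_MAGIC:

#include <stdint.h>
#include <stdio.h>

#define MAGIC 0x4e564d53u	/* stand-in for NVM_SYSBLK_MAGIC */

struct entry {
	uint32_t magic;
	uint32_t seqnr;
};

/* Walk pages in write order; stop at the first invalid page and keep
 * the entry with the highest seqnr. Returns 1 if anything was found. */
static int scan_latest(const struct entry *pages, int nr_pages,
		       struct entry *latest)
{
	int pg, found = 0;

	for (pg = 0; pg < nr_pages; pg++) {
		if (pages[pg].magic != MAGIC)
			break;	/* ran past the last written page */
		if (found && pages[pg].seqnr < latest->seqnr)
			continue;
		*latest = pages[pg];
		found = 1;
	}
	return found;
}

int main(void)
{
	struct entry blk[4] = { {MAGIC, 1}, {MAGIC, 2}, {MAGIC, 3}, {0, 0} };
	struct entry latest;

	if (scan_latest(blk, 4, &latest))
		printf("latest seqnr = %u\n", latest.seqnr);	/* 3 */
	return 0;
}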
Only the first of it + * contains the system block information + */ + cursz = dev->sec_size * dev->sec_per_pg * dev->nr_planes; + cur = kmalloc(cursz, GFP_KERNEL); + if (!cur) + return -ENOMEM; + + /* perform linear scan through the block */ + for (pg = 0; pg < dev->lps_per_blk; pg++) { + ppa->g.pg = ppa_to_slc(dev, pg); + + ret = nvm_submit_ppa(dev, ppa, 1, NVM_OP_PREAD, NVM_IO_SLC_MODE, + cur, cursz); + if (ret) { + if (ret == NVM_RSP_ERR_EMPTYPAGE) { + pr_debug("nvm: sysblk scan empty ppa (%u %u %u %u)\n", + ppa->g.ch, + ppa->g.lun, + ppa->g.blk, + ppa->g.pg); + break; + } + pr_err("nvm: read failed (%x) for ppa (%u %u %u %u)", + ret, + ppa->g.ch, + ppa->g.lun, + ppa->g.blk, + ppa->g.pg); + break; /* if we can't read a page, continue to the + * next blk + */ + } + + if (be32_to_cpu(cur->magic) != NVM_SYSBLK_MAGIC) { + pr_debug("nvm: scan break for ppa (%u %u %u %u)\n", + ppa->g.ch, + ppa->g.lun, + ppa->g.blk, + ppa->g.pg); + break; /* last valid page already found */ + } + + if (be32_to_cpu(cur->seqnr) < be32_to_cpu(sblk->seqnr)) + continue; + + memcpy(sblk, cur, sizeof(struct nvm_system_block)); + found = 1; + } + + kfree(cur); + + return found; +} + +static int nvm_set_bb_tbl(struct nvm_dev *dev, struct sysblk_scan *s, int type) +{ + struct nvm_rq rqd; + int ret; + + if (s->nr_ppas > dev->ops->max_phys_sect) { + pr_err("nvm: unable to update all sysblocks atomically\n"); + return -EINVAL; + } + + memset(&rqd, 0, sizeof(struct nvm_rq)); + + nvm_set_rqd_ppalist(dev, &rqd, s->ppas, s->nr_ppas); + nvm_generic_to_addr_mode(dev, &rqd); + + ret = dev->ops->set_bb_tbl(dev, &rqd, type); + nvm_free_rqd_ppalist(dev, &rqd); + if (ret) { + pr_err("nvm: sysblk failed bb mark\n"); + return -EINVAL; + } + + return 0; +} + +static int sysblk_get_free_blks(struct ppa_addr ppa, int nr_blks, u8 *blks, + void *private) +{ + struct sysblk_scan *s = private; + struct ppa_addr *sppa; + int i, blkid = 0; + + for (i = 0; i < nr_blks; i++) { + if (blks[i] == NVM_BLK_T_HOST) + return -EEXIST; + + if (blks[i] != NVM_BLK_T_FREE) + continue; + + sppa = &s->ppas[scan_ppa_idx(s->row, blkid)]; + sppa->g.ch = ppa.g.ch; + sppa->g.lun = ppa.g.lun; + sppa->g.blk = i; + s->nr_ppas++; + blkid++; + + pr_debug("nvm: use (%u %u %u) as sysblk\n", + sppa->g.ch, sppa->g.lun, sppa->g.blk); + if (blkid > MAX_BLKS_PR_SYSBLK - 1) + return 0; + } + + pr_err("nvm: sysblk failed get sysblk\n"); + return -EINVAL; +} + +static int nvm_write_and_verify(struct nvm_dev *dev, struct nvm_sb_info *info, + struct sysblk_scan *s) +{ + struct nvm_system_block nvmsb; + void *buf; + int i, sect, ret, bufsz; + struct ppa_addr *ppas; + + nvm_cpu_to_sysblk(&nvmsb, info); + + /* buffer for flash page */ + bufsz = dev->sec_size * dev->sec_per_pg * dev->nr_planes; + buf = kzalloc(bufsz, GFP_KERNEL); + if (!buf) + return -ENOMEM; + memcpy(buf, &nvmsb, sizeof(struct nvm_system_block)); + + ppas = kcalloc(dev->sec_per_pg, sizeof(struct ppa_addr), GFP_KERNEL); + if (!ppas) { + ret = -ENOMEM; + goto err; + } + + /* Write and verify */ + for (i = 0; i < s->nr_rows; i++) { + ppas[0] = s->ppas[scan_ppa_idx(i, s->act_blk[i])]; + + pr_debug("nvm: writing sysblk to ppa (%u %u %u %u)\n", + ppas[0].g.ch, + ppas[0].g.lun, + ppas[0].g.blk, + ppas[0].g.pg); + + /* Expand to all sectors within a flash page */ + if (dev->sec_per_pg > 1) { + for (sect = 1; sect < dev->sec_per_pg; sect++) { + ppas[sect].ppa = ppas[0].ppa; + ppas[sect].g.sec = sect; + } + } + + ret = nvm_submit_ppa(dev, ppas, dev->sec_per_pg, NVM_OP_PWRITE, + NVM_IO_SLC_MODE, buf, bufsz); + if (ret) { + 
pr_err("nvm: sysblk failed program (%u %u %u)\n", + ppas[0].g.ch, + ppas[0].g.lun, + ppas[0].g.blk); + break; + } + + ret = nvm_submit_ppa(dev, ppas, dev->sec_per_pg, NVM_OP_PREAD, + NVM_IO_SLC_MODE, buf, bufsz); + if (ret) { + pr_err("nvm: sysblk failed read (%u %u %u)\n", + ppas[0].g.ch, + ppas[0].g.lun, + ppas[0].g.blk); + break; + } + + if (memcmp(buf, &nvmsb, sizeof(struct nvm_system_block))) { + pr_err("nvm: sysblk failed verify (%u %u %u)\n", + ppas[0].g.ch, + ppas[0].g.lun, + ppas[0].g.blk); + ret = -EINVAL; + break; + } + } + + kfree(ppas); +err: + kfree(buf); + + return ret; +} + +static int nvm_prepare_new_sysblks(struct nvm_dev *dev, struct sysblk_scan *s) +{ + int i, ret; + unsigned long nxt_blk; + struct ppa_addr *ppa; + + for (i = 0; i < s->nr_rows; i++) { + nxt_blk = (s->act_blk[i] + 1) % MAX_BLKS_PR_SYSBLK; + ppa = &s->ppas[scan_ppa_idx(i, nxt_blk)]; + ppa->g.pg = ppa_to_slc(dev, 0); + + ret = nvm_erase_ppa(dev, ppa, 1); + if (ret) + return ret; + + s->act_blk[i] = nxt_blk; + } + + return 0; +} + +int nvm_get_sysblock(struct nvm_dev *dev, struct nvm_sb_info *info) +{ + struct ppa_addr sysblk_ppas[MAX_SYSBLKS]; + struct sysblk_scan s; + struct nvm_system_block *cur; + int i, j, found = 0; + int ret = -ENOMEM; + + /* + * 1. setup sysblk locations + * 2. get bad block list + * 3. filter on host-specific (type 3) + * 4. iterate through all and find the highest seq nr. + * 5. return superblock information + */ + + if (!dev->ops->get_bb_tbl) + return -EINVAL; + + nvm_setup_sysblk_scan(dev, &s, sysblk_ppas); + + mutex_lock(&dev->mlock); + ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas, sysblk_get_host_blks); + if (ret) + goto err_sysblk; + + /* no sysblocks initialized */ + if (!s.nr_ppas) + goto err_sysblk; + + cur = kzalloc(sizeof(struct nvm_system_block), GFP_KERNEL); + if (!cur) + goto err_sysblk; + + /* find the latest block across all sysblocks */ + for (i = 0; i < s.nr_rows; i++) { + for (j = 0; j < MAX_BLKS_PR_SYSBLK; j++) { + struct ppa_addr ppa = s.ppas[scan_ppa_idx(i, j)]; + + ret = nvm_scan_block(dev, &ppa, cur); + if (ret > 0) + found = 1; + else if (ret < 0) + break; + } + } + + nvm_sysblk_to_cpu(info, cur); + + kfree(cur); +err_sysblk: + mutex_unlock(&dev->mlock); + + if (found) + return 1; + return ret; +} + +int nvm_update_sysblock(struct nvm_dev *dev, struct nvm_sb_info *new) +{ + /* 1. for each latest superblock + * 2. if room + * a. write new flash page entry with the updated information + * 3. if no room + * a. find next available block on lun (linear search) + * if none, continue to next lun + * if none at all, report error. also report that it wasn't + * possible to write to all superblocks. + * c. write data to block. 
+ */ + struct ppa_addr sysblk_ppas[MAX_SYSBLKS]; + struct sysblk_scan s; + struct nvm_system_block *cur; + int i, j, ppaidx, found = 0; + int ret = -ENOMEM; + + if (!dev->ops->get_bb_tbl) + return -EINVAL; + + nvm_setup_sysblk_scan(dev, &s, sysblk_ppas); + + mutex_lock(&dev->mlock); + ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas, sysblk_get_host_blks); + if (ret) + goto err_sysblk; + + cur = kzalloc(sizeof(struct nvm_system_block), GFP_KERNEL); + if (!cur) + goto err_sysblk; + + /* Get the latest sysblk for each sysblk row */ + for (i = 0; i < s.nr_rows; i++) { + found = 0; + for (j = 0; j < MAX_BLKS_PR_SYSBLK; j++) { + ppaidx = scan_ppa_idx(i, j); + ret = nvm_scan_block(dev, &s.ppas[ppaidx], cur); + if (ret > 0) { + s.act_blk[i] = j; + found = 1; + } else if (ret < 0) + break; + } + } + + if (!found) { + pr_err("nvm: no valid sysblks found to update\n"); + ret = -EINVAL; + goto err_cur; + } + + /* + * All sysblocks found. Check that they have the same page id in their + * flash blocks + */ + for (i = 1; i < s.nr_rows; i++) { + struct ppa_addr l = s.ppas[scan_ppa_idx(0, s.act_blk[0])]; + struct ppa_addr r = s.ppas[scan_ppa_idx(i, s.act_blk[i])]; + + if (l.g.pg != r.g.pg) { + pr_err("nvm: sysblks not on same page. Previous update failed.\n"); + ret = -EINVAL; + goto err_cur; + } + } + + /* + * Check that there hasn't been another update to the seqnr since we + * began + */ + if ((new->seqnr - 1) != be32_to_cpu(cur->seqnr)) { + pr_err("nvm: seq is not sequential\n"); + ret = -EINVAL; + goto err_cur; + } + + /* + * When all pages in a block have been written, a new block is selected + * and writing is performed on the new block. + */ + if (s.ppas[scan_ppa_idx(0, s.act_blk[0])].g.pg == + dev->lps_per_blk - 1) { + ret = nvm_prepare_new_sysblks(dev, &s); + if (ret) + goto err_cur; + } + + ret = nvm_write_and_verify(dev, new, &s); +err_cur: + kfree(cur); +err_sysblk: + mutex_unlock(&dev->mlock); + + return ret; +} + +int nvm_init_sysblock(struct nvm_dev *dev, struct nvm_sb_info *info) +{ + struct ppa_addr sysblk_ppas[MAX_SYSBLKS]; + struct sysblk_scan s; + int ret; + + /* + * 1. select master blocks and select first available blks + * 2. get bad block list + * 3. mark MAX_SYSBLKS block as host-based device allocated. + * 4.
write and verify data to block + */ + + if (!dev->ops->get_bb_tbl || !dev->ops->set_bb_tbl) + return -EINVAL; + + if (!(dev->mccap & NVM_ID_CAP_SLC) || !dev->lps_per_blk) { + pr_err("nvm: memory does not support SLC access\n"); + return -EINVAL; + } + + /* Index all sysblocks and mark them as host-driven */ + nvm_setup_sysblk_scan(dev, &s, sysblk_ppas); + + mutex_lock(&dev->mlock); + ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas, sysblk_get_free_blks); + if (ret) + goto err_mark; + + ret = nvm_set_bb_tbl(dev, &s, NVM_BLK_T_HOST); + if (ret) + goto err_mark; + + /* Write to the first block of each row */ + ret = nvm_write_and_verify(dev, info, &s); +err_mark: + mutex_unlock(&dev->mlock); + return ret; +} + +struct factory_blks { + struct nvm_dev *dev; + int flags; + unsigned long *blks; +}; + +static int factory_nblks(int nblks) +{ + /* Round up to nearest BITS_PER_LONG */ + return (nblks + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1); +} + +static unsigned int factory_blk_offset(struct nvm_dev *dev, int ch, int lun) +{ + int nblks = factory_nblks(dev->blks_per_lun); + + return ((ch * dev->luns_per_chnl * nblks) + (lun * nblks)) / + BITS_PER_LONG; +} + +static int nvm_factory_blks(struct ppa_addr ppa, int nr_blks, u8 *blks, + void *private) +{ + struct factory_blks *f = private; + struct nvm_dev *dev = f->dev; + int i, lunoff; + + lunoff = factory_blk_offset(dev, ppa.g.ch, ppa.g.lun); + + /* bits that are not set correspond to blocks that must be erased */ + for (i = 0; i < nr_blks; i++) { + switch (blks[i]) { + case NVM_BLK_T_FREE: + if (f->flags & NVM_FACTORY_ERASE_ONLY_USER) + set_bit(i, &f->blks[lunoff]); + break; + case NVM_BLK_T_HOST: + if (!(f->flags & NVM_FACTORY_RESET_HOST_BLKS)) + set_bit(i, &f->blks[lunoff]); + break; + case NVM_BLK_T_GRWN_BAD: + if (!(f->flags & NVM_FACTORY_RESET_GRWN_BBLKS)) + set_bit(i, &f->blks[lunoff]); + break; + default: + set_bit(i, &f->blks[lunoff]); + break; + } + } + + return 0; +} + +static int nvm_fact_get_blks(struct nvm_dev *dev, struct ppa_addr *erase_list, + int max_ppas, struct factory_blks *f) +{ + struct ppa_addr ppa; + int ch, lun, blkid, idx, done = 0, ppa_cnt = 0; + unsigned long *offset; + + while (!done) { + done = 1; + for (ch = 0; ch < dev->nr_chnls; ch++) { + for (lun = 0; lun < dev->luns_per_chnl; lun++) { + idx = factory_blk_offset(dev, ch, lun); + offset = &f->blks[idx]; + + blkid = find_first_zero_bit(offset, + dev->blks_per_lun); + if (blkid >= dev->blks_per_lun) + continue; + set_bit(blkid, offset); + + ppa.ppa = 0; + ppa.g.ch = ch; + ppa.g.lun = lun; + ppa.g.blk = blkid; + pr_debug("nvm: erase ppa (%u %u %u)\n", + ppa.g.ch, + ppa.g.lun, + ppa.g.blk); + + erase_list[ppa_cnt] = ppa; + ppa_cnt++; + done = 0; + + if (ppa_cnt == max_ppas) + return ppa_cnt; + } + } + } + + return ppa_cnt; +} + +static int nvm_fact_get_bb_tbl(struct nvm_dev *dev, struct ppa_addr ppa, + nvm_bb_update_fn *fn, void *priv) +{ + struct ppa_addr dev_ppa; + int ret; + + dev_ppa = generic_to_dev_addr(dev, ppa); + + ret = dev->ops->get_bb_tbl(dev, dev_ppa, dev->blks_per_lun, fn, priv); + if (ret) + pr_err("nvm: failed bb tbl for ch%u lun%u\n", + ppa.g.ch, ppa.g.lun); + return ret; +} + +static int nvm_fact_select_blks(struct nvm_dev *dev, struct factory_blks *f) +{ + int ch, lun, ret; + struct ppa_addr ppa; + + ppa.ppa = 0; + for (ch = 0; ch < dev->nr_chnls; ch++) { + for (lun = 0; lun < dev->luns_per_chnl; lun++) { + ppa.g.ch = ch; + ppa.g.lun = lun; + + ret = nvm_fact_get_bb_tbl(dev, ppa, nvm_factory_blks, + f); + if (ret) + return ret; + } + } + + return 0; +} + +int
nvm_dev_factory(struct nvm_dev *dev, int flags) +{ + struct factory_blks f; + struct ppa_addr *ppas; + int ppa_cnt, ret = -ENOMEM; + int max_ppas = dev->ops->max_phys_sect / dev->nr_planes; + struct ppa_addr sysblk_ppas[MAX_SYSBLKS]; + struct sysblk_scan s; + + f.blks = kzalloc(factory_nblks(dev->blks_per_lun) * dev->nr_luns, + GFP_KERNEL); + if (!f.blks) + return ret; + + ppas = kcalloc(max_ppas, sizeof(struct ppa_addr), GFP_KERNEL); + if (!ppas) + goto err_blks; + + f.dev = dev; + f.flags = flags; + + /* create list of blks to be erased */ + ret = nvm_fact_select_blks(dev, &f); + if (ret) + goto err_ppas; + + /* continue to erase until the list of blks is empty */ + while ((ppa_cnt = nvm_fact_get_blks(dev, ppas, max_ppas, &f)) > 0) + nvm_erase_ppa(dev, ppas, ppa_cnt); + + /* mark host reserved blocks free */ + if (flags & NVM_FACTORY_RESET_HOST_BLKS) { + nvm_setup_sysblk_scan(dev, &s, sysblk_ppas); + mutex_lock(&dev->mlock); + ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas, + sysblk_get_host_blks); + if (!ret) + ret = nvm_set_bb_tbl(dev, &s, NVM_BLK_T_FREE); + mutex_unlock(&dev->mlock); + } +err_ppas: + kfree(ppas); +err_blks: + kfree(f.blks); + return ret; +} +EXPORT_SYMBOL(nvm_dev_factory); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 83392f856dfd..22b9e34ceb75 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1741,6 +1741,7 @@ static void bch_btree_gc(struct cache_set *c) do { ret = btree_root(gc_root, c, &op, &writes, &stats); closure_sync(&writes); + cond_resched(); if (ret && ret != -EAGAIN) pr_warn("gc failed!"); @@ -2162,8 +2163,10 @@ int bch_btree_insert_check_key(struct btree *b, struct btree_op *op, rw_lock(true, b, b->level); if (b->key.ptr[0] != btree_ptr || - b->seq != seq + 1) + b->seq != seq + 1) { + op->lock = b->level; goto out; + } } SET_KEY_PTRS(check_key, 1); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 679a093a3bf6..8d0ead98eb6e 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -685,6 +685,8 @@ static void bcache_device_link(struct bcache_device *d, struct cache_set *c, WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") || sysfs_create_link(&c->kobj, &d->kobj, d->name), "Couldn't create device <-> cache set symlinks"); + + clear_bit(BCACHE_DEV_UNLINK_DONE, &d->flags); } static void bcache_device_detach(struct bcache_device *d) @@ -847,8 +849,11 @@ void bch_cached_dev_run(struct cached_dev *dc) buf[SB_LABEL_SIZE] = '\0'; env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf); - if (atomic_xchg(&dc->running, 1)) + if (atomic_xchg(&dc->running, 1)) { + kfree(env[1]); + kfree(env[2]); return; + } if (!d->c && BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) { @@ -1933,6 +1938,8 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, else err = "device busy"; mutex_unlock(&bch_register_lock); + if (attr == &ksysfs_register_quiet) + goto out; } goto err; } @@ -1971,8 +1978,7 @@ out: err_close: blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); err: - if (attr != &ksysfs_register_quiet) - pr_info("error opening %s: %s", path, err); + pr_info("error opening %s: %s", path, err); ret = -EINVAL; goto out; } @@ -2066,8 +2072,10 @@ static int __init bcache_init(void) closure_debug_init(); bcache_major = register_blkdev(0, "bcache"); - if (bcache_major < 0) + if (bcache_major < 0) { + unregister_reboot_notifier(&reboot); return bcache_major; + } if (!(bcache_wq = create_workqueue("bcache")) || !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) ||
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index b23f88d9f18c..b9346cd9cda1 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -323,6 +323,10 @@ void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode, static bool dirty_pred(struct keybuf *buf, struct bkey *k) { + struct cached_dev *dc = container_of(buf, struct cached_dev, writeback_keys); + + BUG_ON(KEY_INODE(k) != dc->disk.id); + return KEY_DIRTY(k); } @@ -372,11 +376,24 @@ next: } } +/* + * Returns true if we scanned the entire disk + */ static bool refill_dirty(struct cached_dev *dc) { struct keybuf *buf = &dc->writeback_keys; + struct bkey start = KEY(dc->disk.id, 0, 0); struct bkey end = KEY(dc->disk.id, MAX_KEY_OFFSET, 0); - bool searched_from_start = false; + struct bkey start_pos; + + /* + * make sure keybuf pos is inside the range for this disk - at bringup + * we might not be attached yet so this disk's inode nr isn't + * initialized then + */ + if (bkey_cmp(&buf->last_scanned, &start) < 0 || + bkey_cmp(&buf->last_scanned, &end) > 0) + buf->last_scanned = start; if (dc->partial_stripes_expensive) { refill_full_stripes(dc); @@ -384,14 +401,20 @@ static bool refill_dirty(struct cached_dev *dc) return false; } - if (bkey_cmp(&buf->last_scanned, &end) >= 0) { - buf->last_scanned = KEY(dc->disk.id, 0, 0); - searched_from_start = true; - } - + start_pos = buf->last_scanned; bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred); - return bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start; + if (bkey_cmp(&buf->last_scanned, &end) < 0) + return false; + + /* + * If we get to the end start scanning again from the beginning, and + * only scan up to where we initially started scanning from: + */ + buf->last_scanned = start; + bch_refill_keybuf(dc->disk.c, buf, &start_pos, dirty_pred); + + return bkey_cmp(&buf->last_scanned, &start_pos) >= 0; } static int bch_writeback_thread(void *arg) diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index 0a9dab187b79..073a042aed24 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -63,7 +63,8 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, static inline void bch_writeback_queue(struct cached_dev *dc) { - wake_up_process(dc->writeback_thread); + if (!IS_ERR_OR_NULL(dc->writeback_thread)) + wake_up_process(dc->writeback_thread); } static inline void bch_writeback_add(struct cached_dev *dc) diff --git a/drivers/mmc/core/debugfs.c b/drivers/mmc/core/debugfs.c index 154aced0b91b..65cc0ac9b82d 100644 --- a/drivers/mmc/core/debugfs.c +++ b/drivers/mmc/core/debugfs.c @@ -170,7 +170,7 @@ static int mmc_ios_show(struct seq_file *s, void *data) str = "invalid"; break; } - seq_printf(s, "signal voltage:\t%u (%s)\n", ios->chip_select, str); + seq_printf(s, "signal voltage:\t%u (%s)\n", ios->signal_voltage, str); switch (ios->drv_type) { case MMC_SET_DRIVER_TYPE_A: diff --git a/drivers/mmc/core/pwrseq_simple.c b/drivers/mmc/core/pwrseq_simple.c index 2b16263458af..aba786daebca 100644 --- a/drivers/mmc/core/pwrseq_simple.c +++ b/drivers/mmc/core/pwrseq_simple.c @@ -29,15 +29,18 @@ struct mmc_pwrseq_simple { static void mmc_pwrseq_simple_set_gpios_value(struct mmc_pwrseq_simple *pwrseq, int value) { - int i; struct gpio_descs *reset_gpios = pwrseq->reset_gpios; - int values[reset_gpios->ndescs]; - for (i = 0; i < reset_gpios->ndescs; i++) - values[i] = value; + if (!IS_ERR(reset_gpios)) { + int i; + int values[reset_gpios->ndescs]; - 
gpiod_set_array_value_cansleep(reset_gpios->ndescs, reset_gpios->desc, - values); + for (i = 0; i < reset_gpios->ndescs; i++) + values[i] = value; + + gpiod_set_array_value_cansleep( + reset_gpios->ndescs, reset_gpios->desc, values); + } } static void mmc_pwrseq_simple_pre_power_on(struct mmc_host *host) @@ -79,7 +82,8 @@ static void mmc_pwrseq_simple_free(struct mmc_host *host) struct mmc_pwrseq_simple *pwrseq = container_of(host->pwrseq, struct mmc_pwrseq_simple, pwrseq); - gpiod_put_array(pwrseq->reset_gpios); + if (!IS_ERR(pwrseq->reset_gpios)) + gpiod_put_array(pwrseq->reset_gpios); if (!IS_ERR(pwrseq->ext_clk)) clk_put(pwrseq->ext_clk); @@ -112,7 +116,9 @@ struct mmc_pwrseq *mmc_pwrseq_simple_alloc(struct mmc_host *host, } pwrseq->reset_gpios = gpiod_get_array(dev, "reset", GPIOD_OUT_HIGH); - if (IS_ERR(pwrseq->reset_gpios)) { + if (IS_ERR(pwrseq->reset_gpios) && + PTR_ERR(pwrseq->reset_gpios) != -ENOENT && + PTR_ERR(pwrseq->reset_gpios) != -ENOSYS) { ret = PTR_ERR(pwrseq->reset_gpios); goto clk_put; } diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c index f2b164b214ae..bb39a29b2db6 100644 --- a/drivers/mmc/core/sd.c +++ b/drivers/mmc/core/sd.c @@ -329,6 +329,7 @@ static int mmc_read_switch(struct mmc_card *card) card->sw_caps.sd3_bus_mode = status[13]; /* Driver Strengths supported by the card */ card->sw_caps.sd3_drv_type = status[9]; + card->sw_caps.sd3_curr_limit = status[7] | status[6] << 8; } out: @@ -545,14 +546,25 @@ static int sd_set_current_limit(struct mmc_card *card, u8 *status) * when we set current limit to 200ma, the card will draw 200ma, and * when we set current limit to 400/600/800ma, the card will draw its * maximum 300ma from the host. + * + * The above is incorrect: if we try to set a current limit that is + * not supported by the card, the card can rightfully error out the + * attempt, and remain at the default current limit. This results + * in a 300mA card being limited to 200mA even though the host + * supports 800mA. Failures seen with SanDisk 8GB UHS cards with + * an iMX6 host. --rmk */ - if (max_current >= 800) + if (max_current >= 800 && + card->sw_caps.sd3_curr_limit & SD_MAX_CURRENT_800) current_limit = SD_SET_CURRENT_LIMIT_800; - else if (max_current >= 600) + else if (max_current >= 600 && + card->sw_caps.sd3_curr_limit & SD_MAX_CURRENT_600) current_limit = SD_SET_CURRENT_LIMIT_600; - else if (max_current >= 400) + else if (max_current >= 400 && + card->sw_caps.sd3_curr_limit & SD_MAX_CURRENT_400) current_limit = SD_SET_CURRENT_LIMIT_400; - else if (max_current >= 200) + else if (max_current >= 200 && + card->sw_caps.sd3_curr_limit & SD_MAX_CURRENT_200) current_limit = SD_SET_CURRENT_LIMIT_200; if (current_limit != SD_SET_CURRENT_NO_CHANGE) { @@ -626,9 +638,9 @@ static int mmc_sd_init_uhs_card(struct mmc_card *card) * SDR104 mode SD-cards. Note that tuning is mandatory for SDR104. */ if (!mmc_host_is_spi(card->host) && - (card->sd_bus_speed == UHS_SDR50_BUS_SPEED || - card->sd_bus_speed == UHS_DDR50_BUS_SPEED || - card->sd_bus_speed == UHS_SDR104_BUS_SPEED)) { + (card->host->ios.timing == MMC_TIMING_UHS_SDR50 || + card->host->ios.timing == MMC_TIMING_UHS_DDR50 || + card->host->ios.timing == MMC_TIMING_UHS_SDR104)) { err = mmc_execute_tuning(card); /* @@ -638,7 +650,7 @@ static int mmc_sd_init_uhs_card(struct mmc_card *card) * difference between v3.00 and 3.01 spec means that CMD19 * tuning is also available for DDR50 mode. 
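The sd_set_current_limit() change above can be restated compactly: the host now requests the highest limit that it can supply and that the card actually advertises in its switch status, instead of blindly asking for the host maximum. A standalone sketch, where the CARD_* masks stand in for the SD_MAX_CURRENT_* bits collected into sd3_curr_limit:

#include <stdint.h>
#include <stdio.h>

#define CARD_200MA (1u << 0)
#define CARD_400MA (1u << 1)
#define CARD_600MA (1u << 2)
#define CARD_800MA (1u << 3)

static int pick_current_limit(unsigned int host_max_ma, uint32_t card_mask)
{
	if (host_max_ma >= 800 && (card_mask & CARD_800MA))
		return 800;
	if (host_max_ma >= 600 && (card_mask & CARD_600MA))
		return 600;
	if (host_max_ma >= 400 && (card_mask & CARD_400MA))
		return 400;
	if (host_max_ma >= 200 && (card_mask & CARD_200MA))
		return 200;
	return 0;	/* leave the card at its default limit */
}

int main(void)
{
	/* host supplies 800mA but the card only advertises 200mA:
	 * request 200 rather than have the card reject 800 and stay
	 * at its default, as seen on the SanDisk cards noted above */
	printf("%d\n", pick_current_limit(800, CARD_200MA));
	return 0;
}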
*/ - if (err && card->sd_bus_speed == UHS_DDR50_BUS_SPEED) { + if (err && card->host->ios.timing == MMC_TIMING_UHS_DDR50) { pr_warn("%s: ddr50 tuning failed\n", mmc_hostname(card->host)); err = 0; diff --git a/drivers/mmc/core/sdio.c b/drivers/mmc/core/sdio.c index d61ba1a0495e..467b3cf80c44 100644 --- a/drivers/mmc/core/sdio.c +++ b/drivers/mmc/core/sdio.c @@ -535,8 +535,8 @@ static int mmc_sdio_init_uhs_card(struct mmc_card *card) * SDR104 mode SD-cards. Note that tuning is mandatory for SDR104. */ if (!mmc_host_is_spi(card->host) && - ((card->sw_caps.sd3_bus_mode & SD_MODE_UHS_SDR50) || - (card->sw_caps.sd3_bus_mode & SD_MODE_UHS_SDR104))) + ((card->host->ios.timing == MMC_TIMING_UHS_SDR50) || + (card->host->ios.timing == MMC_TIMING_UHS_SDR104))) err = mmc_execute_tuning(card); out: return err; diff --git a/drivers/mmc/core/sdio_cis.c b/drivers/mmc/core/sdio_cis.c index 8e94e555b788..6f6fc527a263 100644 --- a/drivers/mmc/core/sdio_cis.c +++ b/drivers/mmc/core/sdio_cis.c @@ -223,6 +223,7 @@ static const struct cis_tpl cis_tpl_list[] = { { 0x20, 4, cistpl_manfid }, { 0x21, 2, /* cistpl_funcid */ }, { 0x22, 0, cistpl_funce }, + { 0x91, 2, /* cistpl_sdio_std */ }, }; static int sdio_read_cis(struct mmc_card *card, struct sdio_func *func) diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c index fb266745f824..0d6ca4116f3d 100644 --- a/drivers/mmc/host/mmci.c +++ b/drivers/mmc/host/mmci.c @@ -151,6 +151,7 @@ static struct variant_data variant_nomadik = { .fifosize = 16 * 4, .fifohalfsize = 8 * 4, .clkreg = MCI_CLK_ENABLE, + .clkreg_8bit_bus_enable = MCI_ST_8BIT_BUS, .datalength_bits = 24, .datactrl_mask_sdio = MCI_ST_DPSM_SDIOEN, .st_sdio = true, @@ -1886,7 +1887,7 @@ static struct amba_id mmci_ids[] = { { .id = 0x00280180, .mask = 0x00ffffff, - .data = &variant_u300, + .data = &variant_nomadik, }, { .id = 0x00480180, diff --git a/drivers/mmc/host/tmio_mmc_dma.c b/drivers/mmc/host/tmio_mmc_dma.c index e4b05dbb9ca8..4a0d6b80eaa3 100644 --- a/drivers/mmc/host/tmio_mmc_dma.c +++ b/drivers/mmc/host/tmio_mmc_dma.c @@ -94,9 +94,9 @@ static void tmio_mmc_start_dma_rx(struct tmio_mmc_host *host) desc = NULL; ret = cookie; } + dev_dbg(&host->pdev->dev, "%s(): mapped %d -> %d, cookie %d, rq %p\n", + __func__, host->sg_len, ret, cookie, host->mrq); } - dev_dbg(&host->pdev->dev, "%s(): mapped %d -> %d, cookie %d, rq %p\n", - __func__, host->sg_len, ret, cookie, host->mrq); pio: if (!desc) { @@ -116,8 +116,8 @@ pio: "DMA failed: %d, falling back to PIO\n", ret); } - dev_dbg(&host->pdev->dev, "%s(): desc %p, cookie %d, sg[%d]\n", __func__, - desc, cookie, host->sg_len); + dev_dbg(&host->pdev->dev, "%s(): desc %p, sg[%d]\n", __func__, + desc, host->sg_len); } static void tmio_mmc_start_dma_tx(struct tmio_mmc_host *host) @@ -174,9 +174,9 @@ static void tmio_mmc_start_dma_tx(struct tmio_mmc_host *host) desc = NULL; ret = cookie; } + dev_dbg(&host->pdev->dev, "%s(): mapped %d -> %d, cookie %d, rq %p\n", + __func__, host->sg_len, ret, cookie, host->mrq); } - dev_dbg(&host->pdev->dev, "%s(): mapped %d -> %d, cookie %d, rq %p\n", - __func__, host->sg_len, ret, cookie, host->mrq); pio: if (!desc) { @@ -196,8 +196,7 @@ pio: "DMA failed: %d, falling back to PIO\n", ret); } - dev_dbg(&host->pdev->dev, "%s(): desc %p, cookie %d\n", __func__, - desc, cookie); + dev_dbg(&host->pdev->dev, "%s(): desc %p\n", __func__, desc); } void tmio_mmc_start_dma(struct tmio_mmc_host *host, diff --git a/drivers/mtd/ubi/cdev.c b/drivers/mtd/ubi/cdev.c index 54e056d3be02..ee2b74d1d1b5 100644 --- a/drivers/mtd/ubi/cdev.c +++ 
b/drivers/mtd/ubi/cdev.c @@ -174,9 +174,9 @@ static int vol_cdev_fsync(struct file *file, loff_t start, loff_t end, struct ubi_device *ubi = desc->vol->ubi; struct inode *inode = file_inode(file); int err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); err = ubi_sync(ubi->ubi_num); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c index 2c2baab9d880..d66c690a8597 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.c +++ b/drivers/net/ethernet/mellanox/mlx4/fw.c @@ -157,6 +157,7 @@ static void dump_dev_cap_flags2(struct mlx4_dev *dev, u64 flags) [29] = "802.1ad offload support", [31] = "Modifying loopback source checks using UPDATE_QP support", [32] = "Loopback source checks support", + [33] = "RoCEv2 support" }; int i; @@ -626,6 +627,8 @@ out: return err; } +static void disable_unsupported_roce_caps(void *buf); + int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) { struct mlx4_cmd_mailbox *mailbox; @@ -738,6 +741,8 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) if (err) goto out; + if (mlx4_is_mfunc(dev)) + disable_unsupported_roce_caps(outbox); MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_QP_OFFSET); dev_cap->reserved_qps = 1 << (field & 0xf); MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_QP_OFFSET); @@ -905,6 +910,8 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_EQE_STRIDE; MLX4_GET(dev_cap->bmme_flags, outbox, QUERY_DEV_CAP_BMME_FLAGS_OFFSET); + if (dev_cap->bmme_flags & MLX4_FLAG_ROCE_V1_V2) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_ROCE_V1_V2; if (dev_cap->bmme_flags & MLX4_FLAG_PORT_REMAP) dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_PORT_REMAP; MLX4_GET(field, outbox, QUERY_DEV_CAP_CONFIG_DEV_OFFSET); @@ -1161,6 +1168,7 @@ int mlx4_QUERY_DEV_CAP_wrapper(struct mlx4_dev *dev, int slave, if (err) return err; + disable_unsupported_roce_caps(outbox->buf); /* add port mng change event capability and disable mw type 1 * unconditionally to slaves */ @@ -1258,6 +1266,21 @@ int mlx4_QUERY_DEV_CAP_wrapper(struct mlx4_dev *dev, int slave, return 0; } +static void disable_unsupported_roce_caps(void *buf) +{ + u32 flags; + + MLX4_GET(flags, buf, QUERY_DEV_CAP_EXT_FLAGS_OFFSET); + flags &= ~(1UL << 31); + MLX4_PUT(buf, flags, QUERY_DEV_CAP_EXT_FLAGS_OFFSET); + MLX4_GET(flags, buf, QUERY_DEV_CAP_EXT_2_FLAGS_OFFSET); + flags &= ~(1UL << 24); + MLX4_PUT(buf, flags, QUERY_DEV_CAP_EXT_2_FLAGS_OFFSET); + MLX4_GET(flags, buf, QUERY_DEV_CAP_BMME_FLAGS_OFFSET); + flags &= ~(MLX4_FLAG_ROCE_V1_V2); + MLX4_PUT(buf, flags, QUERY_DEV_CAP_BMME_FLAGS_OFFSET); +} + int mlx4_QUERY_PORT_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, struct mlx4_cmd_mailbox *inbox, @@ -2239,7 +2262,8 @@ struct mlx4_config_dev { __be32 rsvd1[3]; __be16 vxlan_udp_dport; __be16 rsvd2; - __be32 rsvd3; + __be16 roce_v2_entropy; + __be16 roce_v2_udp_dport; __be32 roce_flags; __be32 rsvd4[25]; __be16 rsvd5; @@ -2248,6 +2272,7 @@ struct mlx4_config_dev { }; #define MLX4_VXLAN_UDP_DPORT (1 << 0) +#define MLX4_ROCE_V2_UDP_DPORT BIT(3) #define MLX4_DISABLE_RX_PORT BIT(18) static int mlx4_CONFIG_DEV_set(struct mlx4_dev *dev, struct mlx4_config_dev *config_dev) @@ -2365,6 +2390,18 @@ int mlx4_disable_rx_port_check(struct mlx4_dev *dev, bool dis) return mlx4_CONFIG_DEV_set(dev, &config_dev); } +int mlx4_config_roce_v2_port(struct mlx4_dev *dev, u16 udp_port) +{ + struct mlx4_config_dev config_dev; + + 
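RoCEv2 traffic is carried over the IANA-assigned UDP destination port 4791, so a consumer such as the mlx4 IB driver would be expected to program that value once while bringing a port up. A hypothetical caller of the helper being added here, guarded by the new capability bit (ROCE_V2_UDP_DPORT is an assumed constant, not part of this patch):

#define ROCE_V2_UDP_DPORT 4791	/* IANA-assigned RoCEv2 UDP port */

static int enable_roce_v2(struct mlx4_dev *dev)
{
	/* skip silently when the firmware only speaks RoCE v1 */
	if (!(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2))
		return 0;

	return mlx4_config_roce_v2_port(dev, ROCE_V2_UDP_DPORT);
}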
memset(&config_dev, 0, sizeof(config_dev)); + config_dev.update_flags = cpu_to_be32(MLX4_ROCE_V2_UDP_DPORT); + config_dev.roce_v2_udp_dport = cpu_to_be16(udp_port); + + return mlx4_CONFIG_DEV_set(dev, &config_dev); +} +EXPORT_SYMBOL_GPL(mlx4_config_roce_v2_port); + int mlx4_virt2phy_port_map(struct mlx4_dev *dev, u32 port1, u32 port2) { struct mlx4_cmd_mailbox *mailbox; diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h index 2404c22ad2b2..7baef52db6b7 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h @@ -780,7 +780,10 @@ struct mlx4_set_port_general_context { u16 reserved1; u8 v_ignore_fcs; u8 flags; - u8 ignore_fcs; + union { + u8 ignore_fcs; + u8 roce_mode; + }; u8 reserved2; __be16 mtu; u8 pptx; diff --git a/drivers/net/ethernet/mellanox/mlx4/port.c b/drivers/net/ethernet/mellanox/mlx4/port.c index f2550425c251..787b7bb54d52 100644 --- a/drivers/net/ethernet/mellanox/mlx4/port.c +++ b/drivers/net/ethernet/mellanox/mlx4/port.c @@ -1520,6 +1520,8 @@ int mlx4_SET_PORT(struct mlx4_dev *dev, u8 port, int pkey_tbl_sz) return err; } +#define SET_PORT_ROCE_2_FLAGS 0x10 +#define MLX4_SET_PORT_ROCE_V1_V2 0x2 int mlx4_SET_PORT_general(struct mlx4_dev *dev, u8 port, int mtu, u8 pptx, u8 pfctx, u8 pprx, u8 pfcrx) { @@ -1539,6 +1541,11 @@ int mlx4_SET_PORT_general(struct mlx4_dev *dev, u8 port, int mtu, context->pprx = (pprx * (!pfcrx)) << 7; context->pfcrx = pfcrx; + if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2) { + context->flags |= SET_PORT_ROCE_2_FLAGS; + context->roce_mode |= + MLX4_SET_PORT_ROCE_V1_V2 << 4; + } in_mod = MLX4_SET_PORT_GENERAL << 8 | port; err = mlx4_cmd(dev, mailbox->dma, in_mod, MLX4_SET_PORT_ETH_OPCODE, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, diff --git a/drivers/net/ethernet/mellanox/mlx4/qp.c b/drivers/net/ethernet/mellanox/mlx4/qp.c index 168823dde79f..d1cd9c32a9ae 100644 --- a/drivers/net/ethernet/mellanox/mlx4/qp.c +++ b/drivers/net/ethernet/mellanox/mlx4/qp.c @@ -167,6 +167,12 @@ static int __mlx4_qp_modify(struct mlx4_dev *dev, struct mlx4_mtt *mtt, context->log_page_size = mtt->page_shift - MLX4_ICM_PAGE_SHIFT; } + if ((cur_state == MLX4_QP_STATE_RTR) && + (new_state == MLX4_QP_STATE_RTS) && + dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2) + context->roce_entropy = + cpu_to_be16(mlx4_qp_roce_entropy(dev, qp->qpn)); + *(__be32 *) mailbox->buf = cpu_to_be32(optpar); memcpy(mailbox->buf + 8, context, sizeof *context); @@ -921,3 +927,23 @@ int mlx4_qp_to_ready(struct mlx4_dev *dev, struct mlx4_mtt *mtt, return 0; } EXPORT_SYMBOL_GPL(mlx4_qp_to_ready); + +u16 mlx4_qp_roce_entropy(struct mlx4_dev *dev, u32 qpn) +{ + struct mlx4_qp_context context; + struct mlx4_qp qp; + int err; + + qp.qpn = qpn; + err = mlx4_qp_query(dev, &qp, &context); + if (!err) { + u32 dest_qpn = be32_to_cpu(context.remote_qpn) & 0xffffff; + u16 folded_dst = folded_qp(dest_qpn); + u16 folded_src = folded_qp(qpn); + + return (dest_qpn != qpn) ? 
+ ((folded_dst ^ folded_src) | 0xC000) : + folded_src | 0xC000; + } + return 0xdead; +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 9ea49a893323..aac071a7e830 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -39,8 +39,8 @@ #include <linux/mlx5/qp.h> #include <linux/mlx5/cq.h> #include <linux/mlx5/vport.h> +#include <linux/mlx5/transobj.h> #include "wq.h" -#include "transobj.h" #include "mlx5_core.h" #define MLX5E_MAX_NUM_TC 8 diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index c56d91a2812b..6a3e430f1062 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -2241,7 +2241,7 @@ static void *mlx5e_create_netdev(struct mlx5_core_dev *mdev) goto err_unmap_free_uar; } - err = mlx5_alloc_transport_domain(mdev, &priv->tdn); + err = mlx5_core_alloc_transport_domain(mdev, &priv->tdn); if (err) { mlx5_core_err(mdev, "alloc td failed, %d\n", err); goto err_dealloc_pd; @@ -2324,7 +2324,7 @@ err_destroy_mkey: mlx5_core_destroy_mkey(mdev, &priv->mr); err_dealloc_transport_domain: - mlx5_dealloc_transport_domain(mdev, priv->tdn); + mlx5_core_dealloc_transport_domain(mdev, priv->tdn); err_dealloc_pd: mlx5_core_dealloc_pd(mdev, priv->pdn); @@ -2356,7 +2356,7 @@ static void mlx5e_destroy_netdev(struct mlx5_core_dev *mdev, void *vpriv) mlx5e_close_drop_rq(priv); mlx5e_destroy_tises(priv); mlx5_core_destroy_mkey(priv->mdev, &priv->mr); - mlx5_dealloc_transport_domain(priv->mdev, priv->tdn); + mlx5_core_dealloc_transport_domain(priv->mdev, priv->tdn); mlx5_core_dealloc_pd(priv->mdev, priv->pdn); mlx5_unmap_free_uar(priv->mdev, &priv->cq_uar); free_netdev(netdev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index 23c244a7e5d7..647a3ca2c2a9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -230,6 +230,7 @@ static int mlx5_eq_int(struct mlx5_core_dev *dev, struct mlx5_eq *eq) case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR: case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR: rsn = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff; + rsn |= (eqe->data.qp_srq.type << MLX5_USER_INDEX_LEN); mlx5_core_dbg(dev, "event %s(%d) arrived on resource 0x%x\n", eqe_type_str(eqe->type), eqe->type, rsn); mlx5_rsc_event(dev, rsn, eqe->type); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index b37749a3730e..1545a944c309 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -78,6 +78,11 @@ struct mlx5_device_context { void *context; }; +enum { + MLX5_ATOMIC_REQ_MODE_BE = 0x0, + MLX5_ATOMIC_REQ_MODE_HOST_ENDIANNESS = 0x1, +}; + static struct mlx5_profile profile[] = { [0] = { .mask = 0, @@ -387,7 +392,7 @@ query_ex: return err; } -static int set_caps(struct mlx5_core_dev *dev, void *in, int in_sz) +static int set_caps(struct mlx5_core_dev *dev, void *in, int in_sz, int opmod) { u32 out[MLX5_ST_SZ_DW(set_hca_cap_out)]; int err; @@ -395,6 +400,7 @@ static int set_caps(struct mlx5_core_dev *dev, void *in, int in_sz) memset(out, 0, sizeof(out)); MLX5_SET(set_hca_cap_in, in, opcode, MLX5_CMD_OP_SET_HCA_CAP); + MLX5_SET(set_hca_cap_in, in, op_mod, opmod << 1); err = mlx5_cmd_exec(dev, in, in_sz, out, sizeof(out)); if (err) return err; @@ -404,6 +410,46 @@ 
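The eq.c change above widens the resource serial number: the queue type is folded into the bits above the user index, so QPs, RQs and SQs can share one radix tree (see the qp.c changes further below). A standalone sketch of the packing, assuming MLX5_USER_INDEX_LEN is 24:

#include <stdint.h>
#include <stdio.h>

#define USER_INDEX_LEN 24	/* assumed value of MLX5_USER_INDEX_LEN */

static uint32_t rsn_encode(uint32_t qpn, uint32_t type)
{
	return (qpn & 0xffffff) | (type << USER_INDEX_LEN);
}

static void rsn_decode(uint32_t rsn, uint32_t *qpn, uint32_t *type)
{
	*qpn = rsn & 0xffffff;
	*type = rsn >> USER_INDEX_LEN;
}

int main(void)
{
	uint32_t qpn, type;

	rsn_decode(rsn_encode(0x1234, 1), &qpn, &type);
	printf("qpn=0x%x type=%u\n", qpn, type);	/* qpn=0x1234 type=1 */
	return 0;
}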
static int set_caps(struct mlx5_core_dev *dev, void *in, int in_sz) return err; } +static int handle_hca_cap_atomic(struct mlx5_core_dev *dev) +{ + void *set_ctx; + void *set_hca_cap; + int set_sz = MLX5_ST_SZ_BYTES(set_hca_cap_in); + int req_endianness; + int err; + + if (MLX5_CAP_GEN(dev, atomic)) { + err = mlx5_core_get_caps(dev, MLX5_CAP_ATOMIC, + HCA_CAP_OPMOD_GET_CUR); + if (err) + return err; + } else { + return 0; + } + + req_endianness = + MLX5_CAP_ATOMIC(dev, + supported_atomic_req_8B_endianess_mode_1); + + if (req_endianness != MLX5_ATOMIC_REQ_MODE_HOST_ENDIANNESS) + return 0; + + set_ctx = kzalloc(set_sz, GFP_KERNEL); + if (!set_ctx) + return -ENOMEM; + + set_hca_cap = MLX5_ADDR_OF(set_hca_cap_in, set_ctx, capability); + + /* Set requestor to host endianness */ + MLX5_SET(atomic_caps, set_hca_cap, atomic_req_8B_endianess_mode, + MLX5_ATOMIC_REQ_MODE_HOST_ENDIANNESS); + + err = set_caps(dev, set_ctx, set_sz, MLX5_SET_HCA_CAP_OP_MOD_ATOMIC); + + kfree(set_ctx); + return err; +} + static int handle_hca_cap(struct mlx5_core_dev *dev) { void *set_ctx = NULL; @@ -445,7 +491,8 @@ static int handle_hca_cap(struct mlx5_core_dev *dev) MLX5_SET(cmd_hca_cap, set_hca_cap, log_uar_page_sz, PAGE_SHIFT - 12); - err = set_caps(dev, set_ctx, set_sz); + err = set_caps(dev, set_ctx, set_sz, + MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE); query_ex: kfree(set_ctx); @@ -667,7 +714,6 @@ clean: return err; } -#ifdef CONFIG_MLX5_CORE_EN static int mlx5_core_set_issi(struct mlx5_core_dev *dev) { u32 query_in[MLX5_ST_SZ_DW(query_issi_in)]; @@ -720,7 +766,6 @@ static int mlx5_core_set_issi(struct mlx5_core_dev *dev) return -ENOTSUPP; } -#endif static int map_bf_area(struct mlx5_core_dev *dev) { @@ -966,13 +1011,11 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv) goto err_pagealloc_cleanup; } -#ifdef CONFIG_MLX5_CORE_EN err = mlx5_core_set_issi(dev); if (err) { dev_err(&pdev->dev, "failed to set issi\n"); goto err_disable_hca; } -#endif err = mlx5_satisfy_startup_pages(dev, 1); if (err) { @@ -992,6 +1035,12 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv) goto reclaim_boot_pages; } + err = handle_hca_cap_atomic(dev); + if (err) { + dev_err(&pdev->dev, "handle_hca_cap_atomic failed\n"); + goto reclaim_boot_pages; + } + err = mlx5_satisfy_startup_pages(dev, 0); if (err) { dev_err(&pdev->dev, "failed to allocate init pages\n"); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c b/drivers/net/ethernet/mellanox/mlx5/core/qp.c index 30e2ba3f5f16..def289375ecb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/qp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c @@ -36,6 +36,7 @@ #include <linux/mlx5/cmd.h> #include <linux/mlx5/qp.h> #include <linux/mlx5/driver.h> +#include <linux/mlx5/transobj.h> #include "mlx5_core.h" @@ -67,6 +68,52 @@ void mlx5_core_put_rsc(struct mlx5_core_rsc_common *common) complete(&common->free); } +static u64 qp_allowed_event_types(void) +{ + u64 mask; + + mask = BIT(MLX5_EVENT_TYPE_PATH_MIG) | + BIT(MLX5_EVENT_TYPE_COMM_EST) | + BIT(MLX5_EVENT_TYPE_SQ_DRAINED) | + BIT(MLX5_EVENT_TYPE_SRQ_LAST_WQE) | + BIT(MLX5_EVENT_TYPE_WQ_CATAS_ERROR) | + BIT(MLX5_EVENT_TYPE_PATH_MIG_FAILED) | + BIT(MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR) | + BIT(MLX5_EVENT_TYPE_WQ_ACCESS_ERROR); + + return mask; +} + +static u64 rq_allowed_event_types(void) +{ + u64 mask; + + mask = BIT(MLX5_EVENT_TYPE_SRQ_LAST_WQE) | + BIT(MLX5_EVENT_TYPE_WQ_CATAS_ERROR); + + return mask; +} + +static u64 sq_allowed_event_types(void) +{ + return 
BIT(MLX5_EVENT_TYPE_WQ_CATAS_ERROR); +} + +static bool is_event_type_allowed(int rsc_type, int event_type) +{ + switch (rsc_type) { + case MLX5_EVENT_QUEUE_TYPE_QP: + return BIT(event_type) & qp_allowed_event_types(); + case MLX5_EVENT_QUEUE_TYPE_RQ: + return BIT(event_type) & rq_allowed_event_types(); + case MLX5_EVENT_QUEUE_TYPE_SQ: + return BIT(event_type) & sq_allowed_event_types(); + default: + WARN(1, "Event arrived for unknown resource type"); + return false; + } +} + void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type) { struct mlx5_core_rsc_common *common = mlx5_get_rsc(dev, rsn); @@ -75,8 +122,16 @@ void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type) if (!common) return; + if (!is_event_type_allowed((rsn >> MLX5_USER_INDEX_LEN), event_type)) { + mlx5_core_warn(dev, "event 0x%.2x is not allowed on resource 0x%.8x\n", + event_type, rsn); + return; + } + switch (common->res) { case MLX5_RES_QP: + case MLX5_RES_RQ: + case MLX5_RES_SQ: qp = (struct mlx5_core_qp *)common; qp->event(qp, event_type); break; @@ -177,27 +232,56 @@ void mlx5_eq_pagefault(struct mlx5_core_dev *dev, struct mlx5_eqe *eqe) } #endif +static int create_qprqsq_common(struct mlx5_core_dev *dev, + struct mlx5_core_qp *qp, + int rsc_type) +{ + struct mlx5_qp_table *table = &dev->priv.qp_table; + int err; + + qp->common.res = rsc_type; + spin_lock_irq(&table->lock); + err = radix_tree_insert(&table->tree, + qp->qpn | (rsc_type << MLX5_USER_INDEX_LEN), + qp); + spin_unlock_irq(&table->lock); + if (err) + return err; + + atomic_set(&qp->common.refcount, 1); + init_completion(&qp->common.free); + qp->pid = current->pid; + + return 0; +} + +static void destroy_qprqsq_common(struct mlx5_core_dev *dev, + struct mlx5_core_qp *qp) +{ + struct mlx5_qp_table *table = &dev->priv.qp_table; + unsigned long flags; + + spin_lock_irqsave(&table->lock, flags); + radix_tree_delete(&table->tree, + qp->qpn | (qp->common.res << MLX5_USER_INDEX_LEN)); + spin_unlock_irqrestore(&table->lock, flags); + mlx5_core_put_rsc((struct mlx5_core_rsc_common *)qp); + wait_for_completion(&qp->common.free); +} + int mlx5_core_create_qp(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp, struct mlx5_create_qp_mbox_in *in, int inlen) { - struct mlx5_qp_table *table = &dev->priv.qp_table; struct mlx5_create_qp_mbox_out out; struct mlx5_destroy_qp_mbox_in din; struct mlx5_destroy_qp_mbox_out dout; int err; - void *qpc; memset(&out, 0, sizeof(out)); in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_CREATE_QP); - if (dev->issi) { - qpc = MLX5_ADDR_OF(create_qp_in, in, qpc); - /* 0xffffff means we ask to work with cqe version 0 */ - MLX5_SET(qpc, qpc, user_index, 0xffffff); - } - err = mlx5_cmd_exec(dev, in, inlen, &out, sizeof(out)); if (err) { mlx5_core_warn(dev, "ret %d\n", err); @@ -213,24 +297,16 @@ int mlx5_core_create_qp(struct mlx5_core_dev *dev, qp->qpn = be32_to_cpu(out.qpn) & 0xffffff; mlx5_core_dbg(dev, "qpn = 0x%x\n", qp->qpn); - qp->common.res = MLX5_RES_QP; - spin_lock_irq(&table->lock); - err = radix_tree_insert(&table->tree, qp->qpn, qp); - spin_unlock_irq(&table->lock); - if (err) { - mlx5_core_warn(dev, "err %d\n", err); + err = create_qprqsq_common(dev, qp, MLX5_RES_QP); + if (err) goto err_cmd; - } err = mlx5_debug_qp_add(dev, qp); if (err) mlx5_core_dbg(dev, "failed adding QP 0x%x to debug file system\n", qp->qpn); - qp->pid = current->pid; - atomic_set(&qp->common.refcount, 1); atomic_inc(&dev->num_qps); - init_completion(&qp->common.free); return 0; @@ -250,18 +326,11 @@ int mlx5_core_destroy_qp(struct 
mlx5_core_dev *dev, { struct mlx5_destroy_qp_mbox_in in; struct mlx5_destroy_qp_mbox_out out; - struct mlx5_qp_table *table = &dev->priv.qp_table; - unsigned long flags; int err; mlx5_debug_qp_remove(dev, qp); - spin_lock_irqsave(&table->lock, flags); - radix_tree_delete(&table->tree, qp->qpn); - spin_unlock_irqrestore(&table->lock, flags); - - mlx5_core_put_rsc((struct mlx5_core_rsc_common *)qp); - wait_for_completion(&qp->common.free); + destroy_qprqsq_common(dev, qp); memset(&in, 0, sizeof(in)); memset(&out, 0, sizeof(out)); @@ -279,59 +348,15 @@ int mlx5_core_destroy_qp(struct mlx5_core_dev *dev, } EXPORT_SYMBOL_GPL(mlx5_core_destroy_qp); -int mlx5_core_qp_modify(struct mlx5_core_dev *dev, enum mlx5_qp_state cur_state, - enum mlx5_qp_state new_state, +int mlx5_core_qp_modify(struct mlx5_core_dev *dev, u16 operation, struct mlx5_modify_qp_mbox_in *in, int sqd_event, struct mlx5_core_qp *qp) { - static const u16 optab[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE] = { - [MLX5_QP_STATE_RST] = { - [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, - [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, - [MLX5_QP_STATE_INIT] = MLX5_CMD_OP_RST2INIT_QP, - }, - [MLX5_QP_STATE_INIT] = { - [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, - [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, - [MLX5_QP_STATE_INIT] = MLX5_CMD_OP_INIT2INIT_QP, - [MLX5_QP_STATE_RTR] = MLX5_CMD_OP_INIT2RTR_QP, - }, - [MLX5_QP_STATE_RTR] = { - [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, - [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, - [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_RTR2RTS_QP, - }, - [MLX5_QP_STATE_RTS] = { - [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, - [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, - [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_RTS2RTS_QP, - }, - [MLX5_QP_STATE_SQD] = { - [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, - [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, - }, - [MLX5_QP_STATE_SQER] = { - [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, - [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, - [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_SQERR2RTS_QP, - }, - [MLX5_QP_STATE_ERR] = { - [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, - [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, - } - }; - struct mlx5_modify_qp_mbox_out out; int err = 0; - u16 op; - - if (cur_state >= MLX5_QP_NUM_STATE || new_state >= MLX5_QP_NUM_STATE || - !optab[cur_state][new_state]) - return -EINVAL; memset(&out, 0, sizeof(out)); - op = optab[cur_state][new_state]; - in->hdr.opcode = cpu_to_be16(op); + in->hdr.opcode = cpu_to_be16(operation); in->qpn = cpu_to_be32(qp->qpn); err = mlx5_cmd_exec(dev, in, sizeof(*in), &out, sizeof(out)); if (err) @@ -449,3 +474,67 @@ int mlx5_core_page_fault_resume(struct mlx5_core_dev *dev, u32 qpn, } EXPORT_SYMBOL_GPL(mlx5_core_page_fault_resume); #endif + +int mlx5_core_create_rq_tracked(struct mlx5_core_dev *dev, u32 *in, int inlen, + struct mlx5_core_qp *rq) +{ + int err; + u32 rqn; + + err = mlx5_core_create_rq(dev, in, inlen, &rqn); + if (err) + return err; + + rq->qpn = rqn; + err = create_qprqsq_common(dev, rq, MLX5_RES_RQ); + if (err) + goto err_destroy_rq; + + return 0; + +err_destroy_rq: + mlx5_core_destroy_rq(dev, rq->qpn); + + return err; +} +EXPORT_SYMBOL(mlx5_core_create_rq_tracked); + +void mlx5_core_destroy_rq_tracked(struct mlx5_core_dev *dev, + struct mlx5_core_qp *rq) +{ + destroy_qprqsq_common(dev, rq); + mlx5_core_destroy_rq(dev, rq->qpn); +} +EXPORT_SYMBOL(mlx5_core_destroy_rq_tracked); + +int mlx5_core_create_sq_tracked(struct mlx5_core_dev *dev, u32 *in, int inlen, + struct mlx5_core_qp *sq) +{ + int err; + u32 sqn; + + err = mlx5_core_create_sq(dev, in, inlen, 
&sqn); + if (err) + return err; + + sq->qpn = sqn; + err = create_qprqsq_common(dev, sq, MLX5_RES_SQ); + if (err) + goto err_destroy_sq; + + return 0; + +err_destroy_sq: + mlx5_core_destroy_sq(dev, sq->qpn); + + return err; +} +EXPORT_SYMBOL(mlx5_core_create_sq_tracked); + +void mlx5_core_destroy_sq_tracked(struct mlx5_core_dev *dev, + struct mlx5_core_qp *sq) +{ + destroy_qprqsq_common(dev, sq); + mlx5_core_destroy_sq(dev, sq->qpn); +} +EXPORT_SYMBOL(mlx5_core_destroy_sq_tracked); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/srq.c b/drivers/net/ethernet/mellanox/mlx5/core/srq.c index ffada801976b..04bc522605a0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/srq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/srq.c @@ -37,7 +37,7 @@ #include <linux/mlx5/srq.h> #include <rdma/ib_verbs.h> #include "mlx5_core.h" -#include "transobj.h" +#include <linux/mlx5/transobj.h> void mlx5_srq_event(struct mlx5_core_dev *dev, u32 srqn, int event_type) { @@ -241,8 +241,6 @@ static int create_xrc_srq_cmd(struct mlx5_core_dev *dev, memcpy(xrc_srqc, srqc, MLX5_ST_SZ_BYTES(srqc)); memcpy(pas, in->pas, pas_size); - /* 0xffffff means we ask to work with cqe version 0 */ - MLX5_SET(xrc_srqc, xrc_srqc, user_index, 0xffffff); MLX5_SET(create_xrc_srq_in, create_in, opcode, MLX5_CMD_OP_CREATE_XRC_SRQ); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/transobj.c b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c index d7068f54e800..03a5093ffeb7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/transobj.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c @@ -32,9 +32,9 @@ #include <linux/mlx5/driver.h> #include "mlx5_core.h" -#include "transobj.h" +#include <linux/mlx5/transobj.h> -int mlx5_alloc_transport_domain(struct mlx5_core_dev *dev, u32 *tdn) +int mlx5_core_alloc_transport_domain(struct mlx5_core_dev *dev, u32 *tdn) { u32 in[MLX5_ST_SZ_DW(alloc_transport_domain_in)]; u32 out[MLX5_ST_SZ_DW(alloc_transport_domain_out)]; @@ -53,8 +53,9 @@ int mlx5_alloc_transport_domain(struct mlx5_core_dev *dev, u32 *tdn) return err; } +EXPORT_SYMBOL(mlx5_core_alloc_transport_domain); -void mlx5_dealloc_transport_domain(struct mlx5_core_dev *dev, u32 tdn) +void mlx5_core_dealloc_transport_domain(struct mlx5_core_dev *dev, u32 tdn) { u32 in[MLX5_ST_SZ_DW(dealloc_transport_domain_in)]; u32 out[MLX5_ST_SZ_DW(dealloc_transport_domain_out)]; @@ -68,6 +69,7 @@ void mlx5_dealloc_transport_domain(struct mlx5_core_dev *dev, u32 tdn) mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, sizeof(out)); } +EXPORT_SYMBOL(mlx5_core_dealloc_transport_domain); int mlx5_core_create_rq(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *rqn) { @@ -94,6 +96,7 @@ int mlx5_core_modify_rq(struct mlx5_core_dev *dev, u32 rqn, u32 *in, int inlen) memset(out, 0, sizeof(out)); return mlx5_cmd_exec_check_status(dev, in, inlen, out, sizeof(out)); } +EXPORT_SYMBOL(mlx5_core_modify_rq); void mlx5_core_destroy_rq(struct mlx5_core_dev *dev, u32 rqn) { @@ -108,6 +111,18 @@ void mlx5_core_destroy_rq(struct mlx5_core_dev *dev, u32 rqn) mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, sizeof(out)); } +int mlx5_core_query_rq(struct mlx5_core_dev *dev, u32 rqn, u32 *out) +{ + u32 in[MLX5_ST_SZ_DW(query_rq_in)] = {0}; + int outlen = MLX5_ST_SZ_BYTES(query_rq_out); + + MLX5_SET(query_rq_in, in, opcode, MLX5_CMD_OP_QUERY_RQ); + MLX5_SET(query_rq_in, in, rqn, rqn); + + return mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, outlen); +} +EXPORT_SYMBOL(mlx5_core_query_rq); + int mlx5_core_create_sq(struct mlx5_core_dev *dev, u32 *in, int 
inlen, u32 *sqn) { u32 out[MLX5_ST_SZ_DW(create_sq_out)]; @@ -133,6 +148,7 @@ int mlx5_core_modify_sq(struct mlx5_core_dev *dev, u32 sqn, u32 *in, int inlen) memset(out, 0, sizeof(out)); return mlx5_cmd_exec_check_status(dev, in, inlen, out, sizeof(out)); } +EXPORT_SYMBOL(mlx5_core_modify_sq); void mlx5_core_destroy_sq(struct mlx5_core_dev *dev, u32 sqn) { @@ -147,6 +163,18 @@ void mlx5_core_destroy_sq(struct mlx5_core_dev *dev, u32 sqn) mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, sizeof(out)); } +int mlx5_core_query_sq(struct mlx5_core_dev *dev, u32 sqn, u32 *out) +{ + u32 in[MLX5_ST_SZ_DW(query_sq_in)] = {0}; + int outlen = MLX5_ST_SZ_BYTES(query_sq_out); + + MLX5_SET(query_sq_in, in, opcode, MLX5_CMD_OP_QUERY_SQ); + MLX5_SET(query_sq_in, in, sqn, sqn); + + return mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, outlen); +} +EXPORT_SYMBOL(mlx5_core_query_sq); + int mlx5_core_create_tir(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *tirn) { @@ -162,6 +190,7 @@ int mlx5_core_create_tir(struct mlx5_core_dev *dev, u32 *in, int inlen, return err; } +EXPORT_SYMBOL(mlx5_core_create_tir); int mlx5_core_modify_tir(struct mlx5_core_dev *dev, u32 tirn, u32 *in, int inlen) @@ -187,6 +216,7 @@ void mlx5_core_destroy_tir(struct mlx5_core_dev *dev, u32 tirn) mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, sizeof(out)); } +EXPORT_SYMBOL(mlx5_core_destroy_tir); int mlx5_core_create_tis(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *tisn) @@ -203,6 +233,19 @@ int mlx5_core_create_tis(struct mlx5_core_dev *dev, u32 *in, int inlen, return err; } +EXPORT_SYMBOL(mlx5_core_create_tis); + +int mlx5_core_modify_tis(struct mlx5_core_dev *dev, u32 tisn, u32 *in, + int inlen) +{ + u32 out[MLX5_ST_SZ_DW(modify_tis_out)] = {0}; + + MLX5_SET(modify_tis_in, in, tisn, tisn); + MLX5_SET(modify_tis_in, in, opcode, MLX5_CMD_OP_MODIFY_TIS); + + return mlx5_cmd_exec_check_status(dev, in, inlen, out, sizeof(out)); +} +EXPORT_SYMBOL(mlx5_core_modify_tis); void mlx5_core_destroy_tis(struct mlx5_core_dev *dev, u32 tisn) { @@ -216,6 +259,7 @@ void mlx5_core_destroy_tis(struct mlx5_core_dev *dev, u32 tisn) mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, sizeof(out)); } +EXPORT_SYMBOL(mlx5_core_destroy_tis); int mlx5_core_create_rmp(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *rmpn) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c index 076197efea9b..c7398b95aecd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c @@ -76,7 +76,7 @@ u8 mlx5_query_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport) return MLX5_GET(query_vport_state_out, out, admin_state); } -EXPORT_SYMBOL(mlx5_query_vport_admin_state); +EXPORT_SYMBOL_GPL(mlx5_query_vport_admin_state); int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport, u8 state) @@ -104,7 +104,7 @@ int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, return err; } -EXPORT_SYMBOL(mlx5_modify_vport_admin_state); +EXPORT_SYMBOL_GPL(mlx5_modify_vport_admin_state); static int mlx5_query_nic_vport_context(struct mlx5_core_dev *mdev, u16 vport, u32 *out, int outlen) @@ -151,12 +151,9 @@ int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev, nic_vport_context.permanent_address); err = mlx5_query_nic_vport_context(mdev, vport, out, outlen); - if (err) - goto out; - - ether_addr_copy(addr, &out_addr[2]); + if (!err) + ether_addr_copy(addr, &out_addr[2]); -out: kvfree(out); 
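/* note: out may be vmalloc-backed (mlx5_vzalloc() falls back to vzalloc()), so it must always be freed with kvfree(), never kfree() */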
return err; } @@ -197,7 +194,7 @@ int mlx5_modify_nic_vport_mac_address(struct mlx5_core_dev *mdev, return err; } -EXPORT_SYMBOL(mlx5_modify_nic_vport_mac_address); +EXPORT_SYMBOL_GPL(mlx5_modify_nic_vport_mac_address); int mlx5_query_nic_vport_mac_list(struct mlx5_core_dev *dev, u32 vport, @@ -430,6 +427,68 @@ int mlx5_modify_nic_vport_vlans(struct mlx5_core_dev *dev, } EXPORT_SYMBOL_GPL(mlx5_modify_nic_vport_vlans); +int mlx5_query_nic_vport_system_image_guid(struct mlx5_core_dev *mdev, + u64 *system_image_guid) +{ + u32 *out; + int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out); + + out = mlx5_vzalloc(outlen); + if (!out) + return -ENOMEM; + + mlx5_query_nic_vport_context(mdev, 0, out, outlen); + + *system_image_guid = MLX5_GET64(query_nic_vport_context_out, out, + nic_vport_context.system_image_guid); + + kvfree(out); + + return 0; +} +EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_system_image_guid); + +int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, u64 *node_guid) +{ + u32 *out; + int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out); + + out = mlx5_vzalloc(outlen); + if (!out) + return -ENOMEM; + + mlx5_query_nic_vport_context(mdev, 0, out, outlen); + + *node_guid = MLX5_GET64(query_nic_vport_context_out, out, + nic_vport_context.node_guid); + + kvfree(out); + + return 0; +} +EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_node_guid); + +int mlx5_query_nic_vport_qkey_viol_cntr(struct mlx5_core_dev *mdev, + u16 *qkey_viol_cntr) +{ + u32 *out; + int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out); + + out = mlx5_vzalloc(outlen); + if (!out) + return -ENOMEM; + + mlx5_query_nic_vport_context(mdev, 0, out, outlen); + + *qkey_viol_cntr = MLX5_GET(query_nic_vport_context_out, out, + nic_vport_context.qkey_violation_counter); + + kvfree(out); + + return 0; +} +EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_qkey_viol_cntr); + int mlx5_query_hca_vport_gid(struct mlx5_core_dev *dev, u8 other_vport, u8 port_num, u16 vf_num, u16 gid_index, union ib_gid *gid) @@ -750,3 +809,44 @@ int mlx5_modify_nic_vport_promisc(struct mlx5_core_dev *mdev, return err; } EXPORT_SYMBOL_GPL(mlx5_modify_nic_vport_promisc); + +enum mlx5_vport_roce_state { + MLX5_VPORT_ROCE_DISABLED = 0, + MLX5_VPORT_ROCE_ENABLED = 1, +}; + +static int mlx5_nic_vport_update_roce_state(struct mlx5_core_dev *mdev, + enum mlx5_vport_roce_state state) +{ + void *in; + int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in); + int err; + + in = mlx5_vzalloc(inlen); + if (!in) { + mlx5_core_warn(mdev, "failed to allocate inbox\n"); + return -ENOMEM; + } + + MLX5_SET(modify_nic_vport_context_in, in, field_select.roce_en, 1); + MLX5_SET(modify_nic_vport_context_in, in, nic_vport_context.roce_en, + state); + + err = mlx5_modify_nic_vport_context(mdev, in, inlen); + + kvfree(in); + + return err; +} + +int mlx5_nic_vport_enable_roce(struct mlx5_core_dev *mdev) +{ + return mlx5_nic_vport_update_roce_state(mdev, MLX5_VPORT_ROCE_ENABLED); +} +EXPORT_SYMBOL_GPL(mlx5_nic_vport_enable_roce); + +int mlx5_nic_vport_disable_roce(struct mlx5_core_dev *mdev) +{ + return mlx5_nic_vport_update_roce_state(mdev, MLX5_VPORT_ROCE_DISABLED); +} +EXPORT_SYMBOL_GPL(mlx5_nic_vport_disable_roce); diff --git a/drivers/ntb/hw/Kconfig b/drivers/ntb/hw/Kconfig index 4d5535c4cddf..7116472b4625 100644 --- a/drivers/ntb/hw/Kconfig +++ b/drivers/ntb/hw/Kconfig @@ -1 +1,2 @@ +source "drivers/ntb/hw/amd/Kconfig" source "drivers/ntb/hw/intel/Kconfig" diff --git a/drivers/ntb/hw/Makefile b/drivers/ntb/hw/Makefile index 175d7c92a569..532e0859b4a1 100644 ---
a/drivers/ntb/hw/Makefile +++ b/drivers/ntb/hw/Makefile @@ -1 +1,2 @@ +obj-$(CONFIG_NTB_AMD) += amd/ obj-$(CONFIG_NTB_INTEL) += intel/ diff --git a/drivers/ntb/hw/amd/Kconfig b/drivers/ntb/hw/amd/Kconfig new file mode 100644 index 000000000000..cfe903cd9514 --- /dev/null +++ b/drivers/ntb/hw/amd/Kconfig @@ -0,0 +1,7 @@ +config NTB_AMD + tristate "AMD Non-Transparent Bridge support" + depends on X86_64 + help + This driver supports AMD NTB on capable Zeppelin hardware. + + If unsure, say N. diff --git a/drivers/ntb/hw/amd/Makefile b/drivers/ntb/hw/amd/Makefile new file mode 100644 index 000000000000..ad54da917563 --- /dev/null +++ b/drivers/ntb/hw/amd/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_NTB_AMD) += ntb_hw_amd.o diff --git a/drivers/ntb/hw/amd/ntb_hw_amd.c b/drivers/ntb/hw/amd/ntb_hw_amd.c new file mode 100644 index 000000000000..588803ad6847 --- /dev/null +++ b/drivers/ntb/hw/amd/ntb_hw_amd.c @@ -0,0 +1,1143 @@ +/* + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright (C) 2016 Advanced Micro Devices, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * BSD LICENSE + * + * Copyright (C) 2016 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of AMD Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * + * AMD PCIe NTB Linux driver + * + * Contact Information: + * Xiangliang Yu <Xiangliang.Yu@amd.com> + */ + +#include <linux/debugfs.h> +#include <linux/delay.h> +#include <linux/init.h> +#include <linux/interrupt.h> +#include <linux/module.h> +#include <linux/acpi.h> +#include <linux/pci.h> +#include <linux/random.h> +#include <linux/slab.h> +#include <linux/ntb.h> + +#include "ntb_hw_amd.h" + +#define NTB_NAME "ntb_hw_amd" +#define NTB_DESC "AMD(R) PCI-E Non-Transparent Bridge Driver" +#define NTB_VER "1.0" + +MODULE_DESCRIPTION(NTB_DESC); +MODULE_VERSION(NTB_VER); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_AUTHOR("AMD Inc."); + +static const struct file_operations amd_ntb_debugfs_info; +static struct dentry *debugfs_dir; + +static int ndev_mw_to_bar(struct amd_ntb_dev *ndev, int idx) +{ + if (idx < 0 || idx >= ndev->mw_count) + return -EINVAL; + + return 1 << idx; +} + +static int amd_ntb_mw_count(struct ntb_dev *ntb) +{ + return ntb_ndev(ntb)->mw_count; +} + +static int amd_ntb_mw_get_range(struct ntb_dev *ntb, int idx, + phys_addr_t *base, + resource_size_t *size, + resource_size_t *align, + resource_size_t *align_size) +{ + struct amd_ntb_dev *ndev = ntb_ndev(ntb); + int bar; + + bar = ndev_mw_to_bar(ndev, idx); + if (bar < 0) + return bar; + + if (base) + *base = pci_resource_start(ndev->ntb.pdev, bar); + + if (size) + *size = pci_resource_len(ndev->ntb.pdev, bar); + + if (align) + *align = SZ_4K; + + if (align_size) + *align_size = 1; + + return 0; +} + +static int amd_ntb_mw_set_trans(struct ntb_dev *ntb, int idx, + dma_addr_t addr, resource_size_t size) +{ + struct amd_ntb_dev *ndev = ntb_ndev(ntb); + unsigned long xlat_reg, limit_reg = 0; + resource_size_t mw_size; + void __iomem *mmio, *peer_mmio; + u64 base_addr, limit, reg_val; + int bar; + + bar = ndev_mw_to_bar(ndev, idx); + if (bar < 0) + return bar; + + mw_size = pci_resource_len(ndev->ntb.pdev, bar); + + /* make sure the range fits in the usable mw size */ + if (size > mw_size) + return -EINVAL; + + mmio = ndev->self_mmio; + peer_mmio = ndev->peer_mmio; + + base_addr = pci_resource_start(ndev->ntb.pdev, bar); + + if (bar != 1) { + xlat_reg = AMD_BAR23XLAT_OFFSET + ((bar - 2) << 3); + limit_reg = AMD_BAR23LMT_OFFSET + ((bar - 2) << 3); + + /* Set the limit if supported */ + limit = base_addr + size; + + /* set and verify setting the translation address */ + write64(addr, peer_mmio + xlat_reg); + reg_val = read64(peer_mmio + xlat_reg); + if (reg_val != addr) { + write64(0, peer_mmio + xlat_reg); + return -EIO; + } + + /* set and verify setting the limit */ + write64(limit, mmio + limit_reg); + reg_val = read64(mmio + limit_reg); + if (reg_val != limit) { + write64(base_addr, mmio + limit_reg); + write64(0, peer_mmio + xlat_reg); + return -EIO; + } + } else { + xlat_reg = AMD_BAR1XLAT_OFFSET; + limit_reg = AMD_BAR1LMT_OFFSET; + + /* split bar addr range must all be 32 bit */ + if (addr & (~0ull << 32)) + return -EINVAL; + if ((addr + size) & (~0ull << 32)) + return -EINVAL; + + /* Set the limit if supported */ + limit = base_addr + size; + + /* set and verify setting the translation address */ + write64(addr, peer_mmio + xlat_reg); + reg_val = read64(peer_mmio + xlat_reg); + if (reg_val != addr) { + write64(0, peer_mmio + xlat_reg); + return -EIO; + } + + /* set and verify setting the limit */ + writel(limit, mmio + limit_reg); + reg_val = readl(mmio + limit_reg); + if (reg_val != limit) { + writel(base_addr, mmio + limit_reg); + writel(0, peer_mmio + xlat_reg); + return -EIO; + } + } + + return 0; +} + +static int
amd_link_is_up(struct amd_ntb_dev *ndev) +{ + if (!ndev->peer_sta) + return NTB_LNK_STA_ACTIVE(ndev->cntl_sta); + + /* If peer_sta is a reset or D0 event, the ISR has already + * started a timer to check the hardware link status, so + * just clear the status bit here. If peer_sta is D3 or + * PME_TO, a D0/reset event will happen when the system + * wakes up or powers on, so do nothing here. + */ + if (ndev->peer_sta & AMD_PEER_RESET_EVENT) + ndev->peer_sta &= ~AMD_PEER_RESET_EVENT; + else if (ndev->peer_sta & AMD_PEER_D0_EVENT) + ndev->peer_sta = 0; + + return 0; +} + +static int amd_ntb_link_is_up(struct ntb_dev *ntb, + enum ntb_speed *speed, + enum ntb_width *width) +{ + struct amd_ntb_dev *ndev = ntb_ndev(ntb); + int ret = 0; + + if (amd_link_is_up(ndev)) { + if (speed) + *speed = NTB_LNK_STA_SPEED(ndev->lnk_sta); + if (width) + *width = NTB_LNK_STA_WIDTH(ndev->lnk_sta); + + dev_dbg(ndev_dev(ndev), "link is up.\n"); + + ret = 1; + } else { + if (speed) + *speed = NTB_SPEED_NONE; + if (width) + *width = NTB_WIDTH_NONE; + + dev_dbg(ndev_dev(ndev), "link is down.\n"); + } + + return ret; +} + +static int amd_ntb_link_enable(struct ntb_dev *ntb, + enum ntb_speed max_speed, + enum ntb_width max_width) +{ + struct amd_ntb_dev *ndev = ntb_ndev(ntb); + void __iomem *mmio = ndev->self_mmio; + u32 ntb_ctl; + + /* Enable event interrupt */ + ndev->int_mask &= ~AMD_EVENT_INTMASK; + writel(ndev->int_mask, mmio + AMD_INTMASK_OFFSET); + + if (ndev->ntb.topo == NTB_TOPO_SEC) + return -EINVAL; + dev_dbg(ndev_dev(ndev), "Enabling Link.\n"); + + ntb_ctl = readl(mmio + AMD_CNTL_OFFSET); + ntb_ctl |= (PMM_REG_CTL | SMM_REG_CTL); + writel(ntb_ctl, mmio + AMD_CNTL_OFFSET); + + return 0; +} + +static int amd_ntb_link_disable(struct ntb_dev *ntb) +{ + struct amd_ntb_dev *ndev = ntb_ndev(ntb); + void __iomem *mmio = ndev->self_mmio; + u32 ntb_ctl; + + /* Disable event interrupt */ + ndev->int_mask |= AMD_EVENT_INTMASK; + writel(ndev->int_mask, mmio + AMD_INTMASK_OFFSET); + + if (ndev->ntb.topo == NTB_TOPO_SEC) + return -EINVAL; + dev_dbg(ndev_dev(ndev), "Disabling Link.\n"); + + ntb_ctl = readl(mmio + AMD_CNTL_OFFSET); + ntb_ctl &= ~(PMM_REG_CTL | SMM_REG_CTL); + writel(ntb_ctl, mmio + AMD_CNTL_OFFSET); + + return 0; +} + +static u64 amd_ntb_db_valid_mask(struct ntb_dev *ntb) +{ + return ntb_ndev(ntb)->db_valid_mask; +} + +static int amd_ntb_db_vector_count(struct ntb_dev *ntb) +{ + return ntb_ndev(ntb)->db_count; +} + +static u64 amd_ntb_db_vector_mask(struct ntb_dev *ntb, int db_vector) +{ + struct amd_ntb_dev *ndev = ntb_ndev(ntb); + + if (db_vector < 0 || db_vector >= ndev->db_count) + return 0; + + return ntb_ndev(ntb)->db_valid_mask & (1 << db_vector); +} + +static u64 amd_ntb_db_read(struct ntb_dev *ntb) +{ + struct amd_ntb_dev *ndev = ntb_ndev(ntb); + void __iomem *mmio = ndev->self_mmio; + + return (u64)readw(mmio + AMD_DBSTAT_OFFSET); +} + +static int amd_ntb_db_clear(struct ntb_dev *ntb, u64 db_bits) +{ + struct amd_ntb_dev *ndev = ntb_ndev(ntb); + void __iomem *mmio = ndev->self_mmio; + + writew((u16)db_bits, mmio + AMD_DBSTAT_OFFSET); + + return 0; +} + +static int amd_ntb_db_set_mask(struct ntb_dev *ntb, u64 db_bits) +{ + struct amd_ntb_dev *ndev = ntb_ndev(ntb); + void __iomem *mmio = ndev->self_mmio; + unsigned long flags; + + if (db_bits & ~ndev->db_valid_mask) + return -EINVAL; + + spin_lock_irqsave(&ndev->db_mask_lock, flags); + ndev->db_mask |= db_bits; + writew((u16)ndev->db_mask, mmio + AMD_DBMASK_OFFSET); + spin_unlock_irqrestore(&ndev->db_mask_lock, flags); + + return 0; +} + +static int
amd_ntb_db_clear_mask(struct ntb_dev *ntb, u64 db_bits) +{ + struct amd_ntb_dev *ndev = ntb_ndev(ntb); + void __iomem *mmio = ndev->self_mmio; + unsigned long flags; + + if (db_bits & ~ndev->db_valid_mask) + return -EINVAL; + + spin_lock_irqsave(&ndev->db_mask_lock, flags); + ndev->db_mask &= ~db_bits; + writew((u16)ndev->db_mask, mmio + AMD_DBMASK_OFFSET); + spin_unlock_irqrestore(&ndev->db_mask_lock, flags); + + return 0; +} + +static int amd_ntb_peer_db_addr(struct ntb_dev *ntb, + phys_addr_t *db_addr, + resource_size_t *db_size) +{ + struct amd_ntb_dev *ndev = ntb_ndev(ntb); + + if (db_addr) + *db_addr = (phys_addr_t)(ndev->peer_mmio + AMD_DBREQ_OFFSET); + if (db_size) + *db_size = sizeof(u32); + + return 0; +} + +static int amd_ntb_peer_db_set(struct ntb_dev *ntb, u64 db_bits) +{ + struct amd_ntb_dev *ndev = ntb_ndev(ntb); + void __iomem *mmio = ndev->self_mmio; + + writew((u16)db_bits, mmio + AMD_DBREQ_OFFSET); + + return 0; +} + +static int amd_ntb_spad_count(struct ntb_dev *ntb) +{ + return ntb_ndev(ntb)->spad_count; +} + +static u32 amd_ntb_spad_read(struct ntb_dev *ntb, int idx) +{ + struct amd_ntb_dev *ndev = ntb_ndev(ntb); + void __iomem *mmio = ndev->self_mmio; + u32 offset; + + if (idx < 0 || idx >= ndev->spad_count) + return 0; + + offset = ndev->self_spad + (idx << 2); + return readl(mmio + AMD_SPAD_OFFSET + offset); +} + +static int amd_ntb_spad_write(struct ntb_dev *ntb, + int idx, u32 val) +{ + struct amd_ntb_dev *ndev = ntb_ndev(ntb); + void __iomem *mmio = ndev->self_mmio; + u32 offset; + + if (idx < 0 || idx >= ndev->spad_count) + return -EINVAL; + + offset = ndev->self_spad + (idx << 2); + writel(val, mmio + AMD_SPAD_OFFSET + offset); + + return 0; +} + +static int amd_ntb_peer_spad_addr(struct ntb_dev *ntb, int idx, + phys_addr_t *spad_addr) +{ + struct amd_ntb_dev *ndev = ntb_ndev(ntb); + + if (idx < 0 || idx >= ndev->spad_count) + return -EINVAL; + + if (spad_addr) + *spad_addr = (phys_addr_t)(ndev->self_mmio + AMD_SPAD_OFFSET + + ndev->peer_spad + (idx << 2)); + return 0; +} + +static u32 amd_ntb_peer_spad_read(struct ntb_dev *ntb, int idx) +{ + struct amd_ntb_dev *ndev = ntb_ndev(ntb); + void __iomem *mmio = ndev->self_mmio; + u32 offset; + + if (idx < 0 || idx >= ndev->spad_count) + return -EINVAL; + + offset = ndev->peer_spad + (idx << 2); + return readl(mmio + AMD_SPAD_OFFSET + offset); +} + +static int amd_ntb_peer_spad_write(struct ntb_dev *ntb, + int idx, u32 val) +{ + struct amd_ntb_dev *ndev = ntb_ndev(ntb); + void __iomem *mmio = ndev->self_mmio; + u32 offset; + + if (idx < 0 || idx >= ndev->spad_count) + return -EINVAL; + + offset = ndev->peer_spad + (idx << 2); + writel(val, mmio + AMD_SPAD_OFFSET + offset); + + return 0; +} + +static const struct ntb_dev_ops amd_ntb_ops = { + .mw_count = amd_ntb_mw_count, + .mw_get_range = amd_ntb_mw_get_range, + .mw_set_trans = amd_ntb_mw_set_trans, + .link_is_up = amd_ntb_link_is_up, + .link_enable = amd_ntb_link_enable, + .link_disable = amd_ntb_link_disable, + .db_valid_mask = amd_ntb_db_valid_mask, + .db_vector_count = amd_ntb_db_vector_count, + .db_vector_mask = amd_ntb_db_vector_mask, + .db_read = amd_ntb_db_read, + .db_clear = amd_ntb_db_clear, + .db_set_mask = amd_ntb_db_set_mask, + .db_clear_mask = amd_ntb_db_clear_mask, + .peer_db_addr = amd_ntb_peer_db_addr, + .peer_db_set = amd_ntb_peer_db_set, + .spad_count = amd_ntb_spad_count, + .spad_read = amd_ntb_spad_read, + .spad_write = amd_ntb_spad_write, + .peer_spad_addr = amd_ntb_peer_spad_addr, + .peer_spad_read = amd_ntb_peer_spad_read, + .peer_spad_write 
= amd_ntb_peer_spad_write, +}; + +static void amd_ack_smu(struct amd_ntb_dev *ndev, u32 bit) +{ + void __iomem *mmio = ndev->self_mmio; + int reg; + + reg = readl(mmio + AMD_SMUACK_OFFSET); + reg |= bit; + writel(reg, mmio + AMD_SMUACK_OFFSET); + + ndev->peer_sta |= bit; +} + +static void amd_handle_event(struct amd_ntb_dev *ndev, int vec) +{ + void __iomem *mmio = ndev->self_mmio; + u32 status; + + status = readl(mmio + AMD_INTSTAT_OFFSET); + if (!(status & AMD_EVENT_INTMASK)) + return; + + dev_dbg(ndev_dev(ndev), "status = 0x%x and vec = %d\n", status, vec); + + status &= AMD_EVENT_INTMASK; + switch (status) { + case AMD_PEER_FLUSH_EVENT: + dev_info(ndev_dev(ndev), "Flush is done.\n"); + break; + case AMD_PEER_RESET_EVENT: + amd_ack_smu(ndev, AMD_PEER_RESET_EVENT); + + /* link down first */ + ntb_link_event(&ndev->ntb); + /* polling peer status */ + schedule_delayed_work(&ndev->hb_timer, AMD_LINK_HB_TIMEOUT); + + break; + case AMD_PEER_D3_EVENT: + case AMD_PEER_PMETO_EVENT: + amd_ack_smu(ndev, status); + + /* link down */ + ntb_link_event(&ndev->ntb); + + break; + case AMD_PEER_D0_EVENT: + mmio = ndev->peer_mmio; + status = readl(mmio + AMD_PMESTAT_OFFSET); + /* check if this is WAKEUP event */ + if (status & 0x1) + dev_info(ndev_dev(ndev), "Wakeup is done.\n"); + + amd_ack_smu(ndev, AMD_PEER_D0_EVENT); + + /* start a timer to poll link status */ + schedule_delayed_work(&ndev->hb_timer, + AMD_LINK_HB_TIMEOUT); + break; + default: + dev_info(ndev_dev(ndev), "event status = 0x%x.\n", status); + break; + } +} + +static irqreturn_t ndev_interrupt(struct amd_ntb_dev *ndev, int vec) +{ + dev_dbg(ndev_dev(ndev), "vec %d\n", vec); + + if (vec > (AMD_DB_CNT - 1) || (ndev->msix_vec_count == 1)) + amd_handle_event(ndev, vec); + + if (vec < AMD_DB_CNT) + ntb_db_event(&ndev->ntb, vec); + + return IRQ_HANDLED; +} + +static irqreturn_t ndev_vec_isr(int irq, void *dev) +{ + struct amd_ntb_vec *nvec = dev; + + return ndev_interrupt(nvec->ndev, nvec->num); +} + +static irqreturn_t ndev_irq_isr(int irq, void *dev) +{ + struct amd_ntb_dev *ndev = dev; + + return ndev_interrupt(ndev, irq - ndev_pdev(ndev)->irq); +} + +static int ndev_init_isr(struct amd_ntb_dev *ndev, + int msix_min, int msix_max) +{ + struct pci_dev *pdev; + int rc, i, msix_count, node; + + pdev = ndev_pdev(ndev); + + node = dev_to_node(&pdev->dev); + + ndev->db_mask = ndev->db_valid_mask; + + /* Try to set up msix irq */ + ndev->vec = kzalloc_node(msix_max * sizeof(*ndev->vec), + GFP_KERNEL, node); + if (!ndev->vec) + goto err_msix_vec_alloc; + + ndev->msix = kzalloc_node(msix_max * sizeof(*ndev->msix), + GFP_KERNEL, node); + if (!ndev->msix) + goto err_msix_alloc; + + for (i = 0; i < msix_max; ++i) + ndev->msix[i].entry = i; + + msix_count = pci_enable_msix_range(pdev, ndev->msix, + msix_min, msix_max); + if (msix_count < 0) + goto err_msix_enable; + + /* NOTE: Disable MSIX if msix count is less than 16 because of + * hardware limitation. 
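+ * (assumption: with fewer than AMD_DB_CNT vectors the 16 doorbells + * cannot each be given their own MSI-X entry, so a partial MSI-X + * allocation is unusable and the driver falls back to a single + * MSI or INTx vector instead)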
+ */ + if (msix_count < msix_min) { + pci_disable_msix(pdev); + goto err_msix_enable; + } + + for (i = 0; i < msix_count; ++i) { + ndev->vec[i].ndev = ndev; + ndev->vec[i].num = i; + rc = request_irq(ndev->msix[i].vector, ndev_vec_isr, 0, + "ndev_vec_isr", &ndev->vec[i]); + if (rc) + goto err_msix_request; + } + + dev_dbg(ndev_dev(ndev), "Using msix interrupts\n"); + ndev->db_count = msix_min; + ndev->msix_vec_count = msix_max; + return 0; + +err_msix_request: + while (i-- > 0) + free_irq(ndev->msix[i].vector, &ndev->vec[i]); + pci_disable_msix(pdev); +err_msix_enable: + kfree(ndev->msix); +err_msix_alloc: + kfree(ndev->vec); +err_msix_vec_alloc: + ndev->msix = NULL; + ndev->vec = NULL; + + /* Try to set up msi irq */ + rc = pci_enable_msi(pdev); + if (rc) + goto err_msi_enable; + + rc = request_irq(pdev->irq, ndev_irq_isr, 0, + "ndev_irq_isr", ndev); + if (rc) + goto err_msi_request; + + dev_dbg(ndev_dev(ndev), "Using msi interrupts\n"); + ndev->db_count = 1; + ndev->msix_vec_count = 1; + return 0; + +err_msi_request: + pci_disable_msi(pdev); +err_msi_enable: + + /* Try to set up intx irq */ + pci_intx(pdev, 1); + + rc = request_irq(pdev->irq, ndev_irq_isr, IRQF_SHARED, + "ndev_irq_isr", ndev); + if (rc) + goto err_intx_request; + + dev_dbg(ndev_dev(ndev), "Using intx interrupts\n"); + ndev->db_count = 1; + ndev->msix_vec_count = 1; + return 0; + +err_intx_request: + return rc; +} + +static void ndev_deinit_isr(struct amd_ntb_dev *ndev) +{ + struct pci_dev *pdev; + void __iomem *mmio = ndev->self_mmio; + int i; + + pdev = ndev_pdev(ndev); + + /* Mask all doorbell interrupts */ + ndev->db_mask = ndev->db_valid_mask; + writel(ndev->db_mask, mmio + AMD_DBMASK_OFFSET); + + if (ndev->msix) { + i = ndev->msix_vec_count; + while (i--) + free_irq(ndev->msix[i].vector, &ndev->vec[i]); + pci_disable_msix(pdev); + kfree(ndev->msix); + kfree(ndev->vec); + } else { + free_irq(pdev->irq, ndev); + if (pci_dev_msi_enabled(pdev)) + pci_disable_msi(pdev); + else + pci_intx(pdev, 0); + } +} + +static ssize_t ndev_debugfs_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *offp) +{ + struct amd_ntb_dev *ndev; + void __iomem *mmio; + char *buf; + size_t buf_size; + ssize_t ret, off; + union { u64 v64; u32 v32; u16 v16; } u; + + ndev = filp->private_data; + mmio = ndev->self_mmio; + + buf_size = min(count, 0x800ul); + + buf = kmalloc(buf_size, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + off = 0; + + off += scnprintf(buf + off, buf_size - off, + "NTB Device Information:\n"); + + off += scnprintf(buf + off, buf_size - off, + "Connection Topology -\t%s\n", + ntb_topo_string(ndev->ntb.topo)); + + off += scnprintf(buf + off, buf_size - off, + "LNK STA -\t\t%#06x\n", ndev->lnk_sta); + + if (!amd_link_is_up(ndev)) { + off += scnprintf(buf + off, buf_size - off, + "Link Status -\t\tDown\n"); + } else { + off += scnprintf(buf + off, buf_size - off, + "Link Status -\t\tUp\n"); + off += scnprintf(buf + off, buf_size - off, + "Link Speed -\t\tPCI-E Gen %u\n", + NTB_LNK_STA_SPEED(ndev->lnk_sta)); + off += scnprintf(buf + off, buf_size - off, + "Link Width -\t\tx%u\n", + NTB_LNK_STA_WIDTH(ndev->lnk_sta)); + } + + off += scnprintf(buf + off, buf_size - off, + "Memory Window Count -\t%u\n", ndev->mw_count); + off += scnprintf(buf + off, buf_size - off, + "Scratchpad Count -\t%u\n", ndev->spad_count); + off += scnprintf(buf + off, buf_size - off, + "Doorbell Count -\t%u\n", ndev->db_count); + off += scnprintf(buf + off, buf_size - off, + "MSIX Vector Count -\t%u\n", ndev->msix_vec_count); + + off += scnprintf(buf +
off, buf_size - off, + "Doorbell Valid Mask -\t%#llx\n", ndev->db_valid_mask); + + u.v32 = readl(ndev->self_mmio + AMD_DBMASK_OFFSET); + off += scnprintf(buf + off, buf_size - off, + "Doorbell Mask -\t\t\t%#06x\n", u.v32); + + u.v32 = readl(mmio + AMD_DBSTAT_OFFSET); + off += scnprintf(buf + off, buf_size - off, + "Doorbell Bell -\t\t\t%#06x\n", u.v32); + + off += scnprintf(buf + off, buf_size - off, + "\nNTB Incoming XLAT:\n"); + + u.v64 = read64(mmio + AMD_BAR1XLAT_OFFSET); + off += scnprintf(buf + off, buf_size - off, + "XLAT1 -\t\t%#018llx\n", u.v64); + + u.v64 = read64(ndev->self_mmio + AMD_BAR23XLAT_OFFSET); + off += scnprintf(buf + off, buf_size - off, + "XLAT23 -\t\t%#018llx\n", u.v64); + + u.v64 = read64(ndev->self_mmio + AMD_BAR45XLAT_OFFSET); + off += scnprintf(buf + off, buf_size - off, + "XLAT45 -\t\t%#018llx\n", u.v64); + + u.v32 = readl(mmio + AMD_BAR1LMT_OFFSET); + off += scnprintf(buf + off, buf_size - off, + "LMT1 -\t\t\t%#06x\n", u.v32); + + u.v64 = read64(ndev->self_mmio + AMD_BAR23LMT_OFFSET); + off += scnprintf(buf + off, buf_size - off, + "LMT23 -\t\t\t%#018llx\n", u.v64); + + u.v64 = read64(ndev->self_mmio + AMD_BAR45LMT_OFFSET); + off += scnprintf(buf + off, buf_size - off, + "LMT45 -\t\t\t%#018llx\n", u.v64); + + ret = simple_read_from_buffer(ubuf, count, offp, buf, off); + kfree(buf); + return ret; +} + +static void ndev_init_debugfs(struct amd_ntb_dev *ndev) +{ + if (!debugfs_dir) { + ndev->debugfs_dir = NULL; + ndev->debugfs_info = NULL; + } else { + ndev->debugfs_dir = + debugfs_create_dir(ndev_name(ndev), debugfs_dir); + if (!ndev->debugfs_dir) + ndev->debugfs_info = NULL; + else + ndev->debugfs_info = + debugfs_create_file("info", S_IRUSR, + ndev->debugfs_dir, ndev, + &amd_ntb_debugfs_info); + } +} + +static void ndev_deinit_debugfs(struct amd_ntb_dev *ndev) +{ + debugfs_remove_recursive(ndev->debugfs_dir); +} + +static inline void ndev_init_struct(struct amd_ntb_dev *ndev, + struct pci_dev *pdev) +{ + ndev->ntb.pdev = pdev; + ndev->ntb.topo = NTB_TOPO_NONE; + ndev->ntb.ops = &amd_ntb_ops; + ndev->int_mask = AMD_EVENT_INTMASK; + spin_lock_init(&ndev->db_mask_lock); +} + +static int amd_poll_link(struct amd_ntb_dev *ndev) +{ + void __iomem *mmio = ndev->peer_mmio; + u32 reg, stat; + int rc; + + reg = readl(mmio + AMD_SIDEINFO_OFFSET); + reg &= NTB_LIN_STA_ACTIVE_BIT; + + dev_dbg(ndev_dev(ndev), "%s: reg_val = 0x%x.\n", __func__, reg); + + if (reg == ndev->cntl_sta) + return 0; + + ndev->cntl_sta = reg; + + rc = pci_read_config_dword(ndev->ntb.pdev, + AMD_LINK_STATUS_OFFSET, &stat); + if (rc) + return 0; + ndev->lnk_sta = stat; + + return 1; +} + +static void amd_link_hb(struct work_struct *work) +{ + struct amd_ntb_dev *ndev = hb_ndev(work); + + if (amd_poll_link(ndev)) + ntb_link_event(&ndev->ntb); + + if (!amd_link_is_up(ndev)) + schedule_delayed_work(&ndev->hb_timer, AMD_LINK_HB_TIMEOUT); +} + +static int amd_init_isr(struct amd_ntb_dev *ndev) +{ + return ndev_init_isr(ndev, AMD_DB_CNT, AMD_MSIX_VECTOR_CNT); +} + +static void amd_init_side_info(struct amd_ntb_dev *ndev) +{ + void __iomem *mmio = ndev->self_mmio; + unsigned int reg; + + reg = readl(mmio + AMD_SIDEINFO_OFFSET); + if (!(reg & AMD_SIDE_READY)) { + reg |= AMD_SIDE_READY; + writel(reg, mmio + AMD_SIDEINFO_OFFSET); + } +} + +static void amd_deinit_side_info(struct amd_ntb_dev *ndev) +{ + void __iomem *mmio = ndev->self_mmio; + unsigned int reg; + + reg = readl(mmio + AMD_SIDEINFO_OFFSET); + if (reg & AMD_SIDE_READY) { + reg &= ~AMD_SIDE_READY; + writel(reg, mmio + AMD_SIDEINFO_OFFSET); + 
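+ /* read back to flush the posted write before teardown continues */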
readl(mmio + AMD_SIDEINFO_OFFSET); + } +} + +static int amd_init_ntb(struct amd_ntb_dev *ndev) +{ + void __iomem *mmio = ndev->self_mmio; + + ndev->mw_count = AMD_MW_CNT; + ndev->spad_count = AMD_SPADS_CNT; + ndev->db_count = AMD_DB_CNT; + + switch (ndev->ntb.topo) { + case NTB_TOPO_PRI: + case NTB_TOPO_SEC: + ndev->spad_count >>= 1; + if (ndev->ntb.topo == NTB_TOPO_PRI) { + ndev->self_spad = 0; + ndev->peer_spad = 0x20; + } else { + ndev->self_spad = 0x20; + ndev->peer_spad = 0; + } + + INIT_DELAYED_WORK(&ndev->hb_timer, amd_link_hb); + schedule_delayed_work(&ndev->hb_timer, AMD_LINK_HB_TIMEOUT); + + break; + default: + dev_err(ndev_dev(ndev), "AMD NTB does not support B2B mode.\n"); + return -EINVAL; + } + + ndev->db_valid_mask = BIT_ULL(ndev->db_count) - 1; + + /* Mask event interrupts */ + writel(ndev->int_mask, mmio + AMD_INTMASK_OFFSET); + + return 0; +} + +static enum ntb_topo amd_get_topo(struct amd_ntb_dev *ndev) +{ + void __iomem *mmio = ndev->self_mmio; + u32 info; + + info = readl(mmio + AMD_SIDEINFO_OFFSET); + if (info & AMD_SIDE_MASK) + return NTB_TOPO_SEC; + else + return NTB_TOPO_PRI; +} + +static int amd_init_dev(struct amd_ntb_dev *ndev) +{ + struct pci_dev *pdev; + int rc = 0; + + pdev = ndev_pdev(ndev); + + ndev->ntb.topo = amd_get_topo(ndev); + dev_dbg(ndev_dev(ndev), "AMD NTB topo is %s\n", + ntb_topo_string(ndev->ntb.topo)); + + rc = amd_init_ntb(ndev); + if (rc) + return rc; + + rc = amd_init_isr(ndev); + if (rc) { + dev_err(ndev_dev(ndev), "fail to init isr.\n"); + return rc; + } + + ndev->db_valid_mask = BIT_ULL(ndev->db_count) - 1; + + return 0; +} + +static void amd_deinit_dev(struct amd_ntb_dev *ndev) +{ + cancel_delayed_work_sync(&ndev->hb_timer); + + ndev_deinit_isr(ndev); +} + +static int amd_ntb_init_pci(struct amd_ntb_dev *ndev, + struct pci_dev *pdev) +{ + int rc; + + pci_set_drvdata(pdev, ndev); + + rc = pci_enable_device(pdev); + if (rc) + goto err_pci_enable; + + rc = pci_request_regions(pdev, NTB_NAME); + if (rc) + goto err_pci_regions; + + pci_set_master(pdev); + + rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); + if (rc) { + rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); + if (rc) + goto err_dma_mask; + dev_warn(ndev_dev(ndev), "Cannot DMA highmem\n"); + } + + rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); + if (rc) { + rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); + if (rc) + goto err_dma_mask; + dev_warn(ndev_dev(ndev), "Cannot DMA consistent highmem\n"); + } + + ndev->self_mmio = pci_iomap(pdev, 0, 0); + if (!ndev->self_mmio) { + rc = -EIO; + goto err_dma_mask; + } + ndev->peer_mmio = ndev->self_mmio + AMD_PEER_OFFSET; + + return 0; + +err_dma_mask: + pci_clear_master(pdev); +err_pci_regions: + pci_disable_device(pdev); +err_pci_enable: + pci_set_drvdata(pdev, NULL); + return rc; +} + +static void amd_ntb_deinit_pci(struct amd_ntb_dev *ndev) +{ + struct pci_dev *pdev = ndev_pdev(ndev); + + pci_iounmap(pdev, ndev->self_mmio); + + pci_clear_master(pdev); + pci_release_regions(pdev); + pci_disable_device(pdev); + pci_set_drvdata(pdev, NULL); +} + +static int amd_ntb_pci_probe(struct pci_dev *pdev, + const struct pci_device_id *id) +{ + struct amd_ntb_dev *ndev; + int rc, node; + + node = dev_to_node(&pdev->dev); + + ndev = kzalloc_node(sizeof(*ndev), GFP_KERNEL, node); + if (!ndev) { + rc = -ENOMEM; + goto err_ndev; + } + + ndev_init_struct(ndev, pdev); + + rc = amd_ntb_init_pci(ndev, pdev); + if (rc) + goto err_init_pci; + + rc = amd_init_dev(ndev); + if (rc) + goto err_init_dev; + + /* write side info */ + 
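+ /* (sets AMD_SIDE_READY in this side's SIDEINFO register; the peer + * polls the same bit, as NTB_LIN_STA_ACTIVE_BIT, through its peer + * window in amd_poll_link()) */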
amd_init_side_info(ndev); + + amd_poll_link(ndev); + + ndev_init_debugfs(ndev); + + rc = ntb_register_device(&ndev->ntb); + if (rc) + goto err_register; + + dev_info(&pdev->dev, "NTB device registered.\n"); + + return 0; + +err_register: + ndev_deinit_debugfs(ndev); + amd_deinit_dev(ndev); +err_init_dev: + amd_ntb_deinit_pci(ndev); +err_init_pci: + kfree(ndev); +err_ndev: + return rc; +} + +static void amd_ntb_pci_remove(struct pci_dev *pdev) +{ + struct amd_ntb_dev *ndev = pci_get_drvdata(pdev); + + ntb_unregister_device(&ndev->ntb); + ndev_deinit_debugfs(ndev); + amd_deinit_side_info(ndev); + amd_deinit_dev(ndev); + amd_ntb_deinit_pci(ndev); + kfree(ndev); +} + +static const struct file_operations amd_ntb_debugfs_info = { + .owner = THIS_MODULE, + .open = simple_open, + .read = ndev_debugfs_read, +}; + +static const struct pci_device_id amd_ntb_pci_tbl[] = { + {PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_NTB)}, + {0} +}; +MODULE_DEVICE_TABLE(pci, amd_ntb_pci_tbl); + +static struct pci_driver amd_ntb_pci_driver = { + .name = KBUILD_MODNAME, + .id_table = amd_ntb_pci_tbl, + .probe = amd_ntb_pci_probe, + .remove = amd_ntb_pci_remove, +}; + +static int __init amd_ntb_pci_driver_init(void) +{ + pr_info("%s %s\n", NTB_DESC, NTB_VER); + + if (debugfs_initialized()) + debugfs_dir = debugfs_create_dir(KBUILD_MODNAME, NULL); + + return pci_register_driver(&amd_ntb_pci_driver); +} +module_init(amd_ntb_pci_driver_init); + +static void __exit amd_ntb_pci_driver_exit(void) +{ + pci_unregister_driver(&amd_ntb_pci_driver); + debugfs_remove_recursive(debugfs_dir); +} +module_exit(amd_ntb_pci_driver_exit); diff --git a/drivers/ntb/hw/amd/ntb_hw_amd.h b/drivers/ntb/hw/amd/ntb_hw_amd.h new file mode 100644 index 000000000000..2eac3cd3e646 --- /dev/null +++ b/drivers/ntb/hw/amd/ntb_hw_amd.h @@ -0,0 +1,217 @@ +/* + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright (C) 2016 Advanced Micro Devices, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * BSD LICENSE + * + * Copyright (C) 2016 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of AMD Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * AMD PCIe NTB Linux driver + * + * Contact Information: + * Xiangliang Yu <Xiangliang.Yu@amd.com> + */ + +#ifndef NTB_HW_AMD_H +#define NTB_HW_AMD_H + +#include <linux/ntb.h> +#include <linux/pci.h> + +#define PCI_DEVICE_ID_AMD_NTB 0x145B +#define AMD_LINK_HB_TIMEOUT msecs_to_jiffies(1000) +#define AMD_LINK_STATUS_OFFSET 0x68 +#define NTB_LIN_STA_ACTIVE_BIT 0x00000002 +#define NTB_LNK_STA_SPEED_MASK 0x000F0000 +#define NTB_LNK_STA_WIDTH_MASK 0x03F00000 +#define NTB_LNK_STA_ACTIVE(x) (!!((x) & NTB_LIN_STA_ACTIVE_BIT)) +#define NTB_LNK_STA_SPEED(x) (((x) & NTB_LNK_STA_SPEED_MASK) >> 16) +#define NTB_LNK_STA_WIDTH(x) (((x) & NTB_LNK_STA_WIDTH_MASK) >> 20) + +#ifndef read64 +#ifdef readq +#define read64 readq +#else +#define read64 _read64 +static inline u64 _read64(void __iomem *mmio) +{ + u64 low, high; + + low = readl(mmio); + high = readl(mmio + sizeof(u32)); + return low | (high << 32); +} +#endif +#endif + +#ifndef write64 +#ifdef writeq +#define write64 writeq +#else +#define write64 _write64 +static inline void _write64(u64 val, void __iomem *mmio) +{ + writel(val, mmio); + writel(val >> 32, mmio + sizeof(u32)); +} +#endif +#endif + +enum { + /* AMD NTB Capability */ + AMD_MW_CNT = 3, + AMD_DB_CNT = 16, + AMD_MSIX_VECTOR_CNT = 24, + AMD_SPADS_CNT = 16, + + /* AMD NTB register offset */ + AMD_CNTL_OFFSET = 0x200, + + /* NTB control register bits */ + PMM_REG_CTL = BIT(21), + SMM_REG_CTL = BIT(20), + SMM_REG_ACC_PATH = BIT(18), + PMM_REG_ACC_PATH = BIT(17), + NTB_CLK_EN = BIT(16), + + AMD_STA_OFFSET = 0x204, + AMD_PGSLV_OFFSET = 0x208, + AMD_SPAD_MUX_OFFSET = 0x20C, + AMD_SPAD_OFFSET = 0x210, + AMD_RSMU_HCID = 0x250, + AMD_RSMU_SIID = 0x254, + AMD_PSION_OFFSET = 0x300, + AMD_SSION_OFFSET = 0x330, + AMD_MMINDEX_OFFSET = 0x400, + AMD_MMDATA_OFFSET = 0x404, + AMD_SIDEINFO_OFFSET = 0x408, + + AMD_SIDE_MASK = BIT(0), + AMD_SIDE_READY = BIT(1), + + /* limit register */ + AMD_ROMBARLMT_OFFSET = 0x410, + AMD_BAR1LMT_OFFSET = 0x414, + AMD_BAR23LMT_OFFSET = 0x418, + AMD_BAR45LMT_OFFSET = 0x420, + /* xlat address */ + AMD_POMBARXLAT_OFFSET = 0x428, + AMD_BAR1XLAT_OFFSET = 0x430, + AMD_BAR23XLAT_OFFSET = 0x438, + AMD_BAR45XLAT_OFFSET = 0x440, + /* doorbell and interrupt */ + AMD_DBFM_OFFSET = 0x450, + AMD_DBREQ_OFFSET = 0x454, + AMD_MIRRDBSTAT_OFFSET = 0x458, + AMD_DBMASK_OFFSET = 0x45C, + AMD_DBSTAT_OFFSET = 0x460, + AMD_INTMASK_OFFSET = 0x470, + AMD_INTSTAT_OFFSET = 0x474, + + /* event type */ + AMD_PEER_FLUSH_EVENT = BIT(0), + AMD_PEER_RESET_EVENT = BIT(1), + AMD_PEER_D3_EVENT = BIT(2), + AMD_PEER_PMETO_EVENT = BIT(3), + AMD_PEER_D0_EVENT = BIT(4), + AMD_EVENT_INTMASK = (AMD_PEER_FLUSH_EVENT | + AMD_PEER_RESET_EVENT | AMD_PEER_D3_EVENT | + AMD_PEER_PMETO_EVENT | AMD_PEER_D0_EVENT), + + AMD_PMESTAT_OFFSET = 0x480, + AMD_PMSGTRIG_OFFSET = 0x490, + AMD_LTRLATENCY_OFFSET = 0x494, + AMD_FLUSHTRIG_OFFSET = 0x498, + + /* SMU register*/ + AMD_SMUACK_OFFSET = 0x4A0, + AMD_SINRST_OFFSET = 0x4A4, + AMD_RSPNUM_OFFSET = 0x4A8, + AMD_SMU_SPADMUTEX = 0x4B0, + 
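+ /* (assumption: the SMU mailbox registers above and this scratchpad + * mutex/offset pair are for firmware handshakes; only + * AMD_SMUACK_OFFSET is referenced elsewhere in this driver) */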
AMD_SMU_SPADOFFSET = 0x4B4, + + AMD_PEER_OFFSET = 0x400, +}; + +struct amd_ntb_dev; + +struct amd_ntb_vec { + struct amd_ntb_dev *ndev; + int num; +}; + +struct amd_ntb_dev { + struct ntb_dev ntb; + + u32 ntb_side; + u32 lnk_sta; + u32 cntl_sta; + u32 peer_sta; + + unsigned char mw_count; + unsigned char spad_count; + unsigned char db_count; + unsigned char msix_vec_count; + + u64 db_valid_mask; + u64 db_mask; + u32 int_mask; + + struct msix_entry *msix; + struct amd_ntb_vec *vec; + + /* synchronize rmw access of db_mask and hw reg */ + spinlock_t db_mask_lock; + + void __iomem *self_mmio; + void __iomem *peer_mmio; + unsigned int self_spad; + unsigned int peer_spad; + + struct delayed_work hb_timer; + + struct dentry *debugfs_dir; + struct dentry *debugfs_info; +}; + +#define ndev_pdev(ndev) ((ndev)->ntb.pdev) +#define ndev_name(ndev) pci_name(ndev_pdev(ndev)) +#define ndev_dev(ndev) (&ndev_pdev(ndev)->dev) +#define ntb_ndev(__ntb) container_of(__ntb, struct amd_ntb_dev, ntb) +#define hb_ndev(__work) container_of(__work, struct amd_ntb_dev, hb_timer.work) + +#endif diff --git a/drivers/ntb/hw/intel/ntb_hw_intel.c b/drivers/ntb/hw/intel/ntb_hw_intel.c index a198f8298258..40d04ef5da9e 100644 --- a/drivers/ntb/hw/intel/ntb_hw_intel.c +++ b/drivers/ntb/hw/intel/ntb_hw_intel.c @@ -875,7 +875,7 @@ static int intel_ntb_mw_set_trans(struct ntb_dev *ntb, int idx, limit_reg = bar2_off(ndev->xlat_reg->bar2_limit, bar); if (bar < 4 || !ndev->bar4_split) { - base = ioread64(mmio + base_reg); + base = ioread64(mmio + base_reg) & NTB_BAR_MASK_64; /* Set the limit if supported, if size is not mw_size */ if (limit_reg && size != mw_size) @@ -906,7 +906,7 @@ static int intel_ntb_mw_set_trans(struct ntb_dev *ntb, int idx, if ((addr + size) & (~0ull << 32)) return -EINVAL; - base = ioread32(mmio + base_reg); + base = ioread32(mmio + base_reg) & NTB_BAR_MASK_32; /* Set the limit if supported, if size is not mw_size */ if (limit_reg && size != mw_size) diff --git a/drivers/ntb/hw/intel/ntb_hw_intel.h b/drivers/ntb/hw/intel/ntb_hw_intel.h index 2eb4addd10d0..3ec149cf6562 100644 --- a/drivers/ntb/hw/intel/ntb_hw_intel.h +++ b/drivers/ntb/hw/intel/ntb_hw_intel.h @@ -245,6 +245,9 @@ #define NTB_UNSAFE_DB BIT_ULL(0) #define NTB_UNSAFE_SPAD BIT_ULL(1) +#define NTB_BAR_MASK_64 ~(0xfull) +#define NTB_BAR_MASK_32 ~(0xfu) + struct intel_ntb_dev; struct intel_ntb_reg { @@ -334,7 +337,8 @@ struct intel_ntb_dev { #define ndev_pdev(ndev) ((ndev)->ntb.pdev) #define ndev_name(ndev) pci_name(ndev_pdev(ndev)) #define ndev_dev(ndev) (&ndev_pdev(ndev)->dev) -#define ntb_ndev(ntb) container_of(ntb, struct intel_ntb_dev, ntb) -#define hb_ndev(work) container_of(work, struct intel_ntb_dev, hb_timer.work) +#define ntb_ndev(__ntb) container_of(__ntb, struct intel_ntb_dev, ntb) +#define hb_ndev(__work) container_of(__work, struct intel_ntb_dev, \ + hb_timer.work) #endif diff --git a/drivers/ntb/ntb_transport.c b/drivers/ntb/ntb_transport.c index 60654d524858..ec4775f0ec16 100644 --- a/drivers/ntb/ntb_transport.c +++ b/drivers/ntb/ntb_transport.c @@ -171,12 +171,14 @@ struct ntb_transport_qp { u64 rx_err_ver; u64 rx_memcpy; u64 rx_async; + u64 dma_rx_prep_err; u64 tx_bytes; u64 tx_pkts; u64 tx_ring_full; u64 tx_err_no_buf; u64 tx_memcpy; u64 tx_async; + u64 dma_tx_prep_err; }; struct ntb_transport_mw { @@ -249,6 +251,8 @@ enum { #define QP_TO_MW(nt, qp) ((qp) % nt->mw_count) #define NTB_QP_DEF_NUM_ENTRIES 100 #define NTB_LINK_DOWN_TIMEOUT 10 +#define DMA_RETRIES 20 +#define DMA_OUT_RESOURCE_TO 50 static void 
ntb_transport_rxc_db(unsigned long data); static const struct ntb_ctx_ops ntb_transport_ops; @@ -501,6 +505,12 @@ static ssize_t debugfs_read(struct file *filp, char __user *ubuf, size_t count, out_offset += snprintf(buf + out_offset, out_count - out_offset, "free tx - \t%u\n", ntb_transport_tx_free_entry(qp)); + out_offset += snprintf(buf + out_offset, out_count - out_offset, + "DMA tx prep err - \t%llu\n", + qp->dma_tx_prep_err); + out_offset += snprintf(buf + out_offset, out_count - out_offset, + "DMA rx prep err - \t%llu\n", + qp->dma_rx_prep_err); out_offset += snprintf(buf + out_offset, out_count - out_offset, "\n"); @@ -726,6 +736,8 @@ static void ntb_qp_link_down_reset(struct ntb_transport_qp *qp) qp->tx_err_no_buf = 0; qp->tx_memcpy = 0; qp->tx_async = 0; + qp->dma_tx_prep_err = 0; + qp->dma_rx_prep_err = 0; } static void ntb_qp_link_cleanup(struct ntb_transport_qp *qp) @@ -1228,6 +1240,7 @@ static void ntb_async_rx(struct ntb_queue_entry *entry, void *offset) struct dmaengine_unmap_data *unmap; dma_cookie_t cookie; void *buf = entry->buf; + int retries = 0; len = entry->len; @@ -1263,11 +1276,21 @@ static void ntb_async_rx(struct ntb_queue_entry *entry, void *offset) unmap->from_cnt = 1; - txd = device->device_prep_dma_memcpy(chan, unmap->addr[1], - unmap->addr[0], len, - DMA_PREP_INTERRUPT); - if (!txd) + for (retries = 0; retries < DMA_RETRIES; retries++) { + txd = device->device_prep_dma_memcpy(chan, unmap->addr[1], + unmap->addr[0], len, + DMA_PREP_INTERRUPT); + if (txd) + break; + + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(DMA_OUT_RESOURCE_TO); + } + + if (!txd) { + qp->dma_rx_prep_err++; goto err_get_unmap; + } txd->callback = ntb_rx_copy_callback; txd->callback_param = entry; @@ -1460,6 +1483,7 @@ static void ntb_async_tx(struct ntb_transport_qp *qp, void __iomem *offset; size_t len = entry->len; void *buf = entry->buf; + int retries = 0; offset = qp->tx_mw + qp->tx_max_frame * qp->tx_index; hdr = offset + qp->tx_max_frame - sizeof(struct ntb_payload_header); @@ -1494,10 +1518,20 @@ static void ntb_async_tx(struct ntb_transport_qp *qp, unmap->to_cnt = 1; - txd = device->device_prep_dma_memcpy(chan, dest, unmap->addr[0], len, - DMA_PREP_INTERRUPT); - if (!txd) + for (retries = 0; retries < DMA_RETRIES; retries++) { + txd = device->device_prep_dma_memcpy(chan, dest, unmap->addr[0], + len, DMA_PREP_INTERRUPT); + if (txd) + break; + + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(DMA_OUT_RESOURCE_TO); + } + + if (!txd) { + qp->dma_tx_prep_err++; goto err_get_unmap; + } txd->callback = ntb_tx_copy_callback; txd->callback_param = entry; @@ -1532,7 +1566,7 @@ static int ntb_process_tx(struct ntb_transport_qp *qp, if (entry->len > qp->tx_max_frame - sizeof(struct ntb_payload_header)) { if (qp->tx_handler) - qp->tx_handler(qp->cb_data, qp, NULL, -EIO); + qp->tx_handler(qp, qp->cb_data, NULL, -EIO); ntb_list_add(&qp->ntb_tx_free_q_lock, &entry->entry, &qp->tx_free_q); diff --git a/drivers/ntb/test/Kconfig b/drivers/ntb/test/Kconfig index 01852f98a843..a5d0eda44438 100644 --- a/drivers/ntb/test/Kconfig +++ b/drivers/ntb/test/Kconfig @@ -17,3 +17,11 @@ config NTB_TOOL functioning at a basic level. If unsure, say N. + +config NTB_PERF + tristate "NTB RAW Perf Measuring Tool" + help + This is a tool to measure raw NTB performance by transferring data + to and from the window without additional software interaction. + + If unsure, say N. 
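Both ntb_transport hunks above and the perf_copy() loop in ntb_perf.c below wrap device_prep_dma_memcpy() in the same retry shape. The stand-alone sketch that follows is illustrative only: the helper name prep_dma_memcpy_retry() and its prep_err parameter are invented for this summary, while the constants and dmaengine calls are the ones the patches use.

#include <linux/dmaengine.h>
#include <linux/sched.h>

#define DMA_RETRIES		20
#define DMA_OUT_RESOURCE_TO	50

static struct dma_async_tx_descriptor *
prep_dma_memcpy_retry(struct dma_chan *chan, dma_addr_t dst, dma_addr_t src,
		      size_t len, u64 *prep_err)
{
	struct dma_async_tx_descriptor *txd;
	int retries;

	for (retries = 0; retries < DMA_RETRIES; retries++) {
		txd = chan->device->device_prep_dma_memcpy(chan, dst, src, len,
							   DMA_PREP_INTERRUPT);
		if (txd)
			return txd;

		/* descriptor ring exhausted; sleep so completions can drain */
		set_current_state(TASK_INTERRUPTIBLE);
		schedule_timeout(DMA_OUT_RESOURCE_TO);
	}

	(*prep_err)++;	/* surfaces in debugfs as dma_{rx,tx}_prep_err */
	return NULL;
}

Since schedule_timeout() sleeps, the pattern assumes the caller may sleep; ntb_perf drives it from its dedicated kthreads. On a NULL return the callers bump their prep-error counter and take the existing unmap error paths instead of failing fast.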
diff --git a/drivers/ntb/test/Makefile b/drivers/ntb/test/Makefile index 0ea32a324b6c..9e77e0b761c2 100644 --- a/drivers/ntb/test/Makefile +++ b/drivers/ntb/test/Makefile @@ -1,2 +1,3 @@ obj-$(CONFIG_NTB_PINGPONG) += ntb_pingpong.o obj-$(CONFIG_NTB_TOOL) += ntb_tool.o +obj-$(CONFIG_NTB_PERF) += ntb_perf.o diff --git a/drivers/ntb/test/ntb_perf.c b/drivers/ntb/test/ntb_perf.c new file mode 100644 index 000000000000..c8a37ba4b4f9 --- /dev/null +++ b/drivers/ntb/test/ntb_perf.c @@ -0,0 +1,748 @@ +/* + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * + * PCIe NTB Perf Linux driver + */ + +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/kthread.h> +#include <linux/time.h> +#include <linux/timer.h> +#include <linux/dma-mapping.h> +#include <linux/pci.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/debugfs.h> +#include <linux/dmaengine.h> +#include <linux/delay.h> +#include <linux/sizes.h> +#include <linux/ntb.h> + +#define DRIVER_NAME "ntb_perf" +#define DRIVER_DESCRIPTION "PCIe NTB Performance Measurement Tool" + +#define DRIVER_LICENSE "Dual BSD/GPL" +#define DRIVER_VERSION "1.0" +#define DRIVER_AUTHOR "Dave Jiang <dave.jiang@intel.com>" + +#define PERF_LINK_DOWN_TIMEOUT 10 +#define PERF_VERSION 0xffff0001 +#define MAX_THREADS 32 +#define MAX_TEST_SIZE SZ_1M +#define MAX_SRCS 32 +#define DMA_OUT_RESOURCE_TO 50 +#define DMA_RETRIES 20 +#define SZ_4G (1ULL << 32) +#define MAX_SEG_ORDER 20 /* no larger than 1M for kmalloc buffer */ + +MODULE_LICENSE(DRIVER_LICENSE); +MODULE_VERSION(DRIVER_VERSION); +MODULE_AUTHOR(DRIVER_AUTHOR); +MODULE_DESCRIPTION(DRIVER_DESCRIPTION); + +static struct dentry *perf_debugfs_dir; + +static unsigned int seg_order = 19; /* 512K */ +module_param(seg_order, uint, 0644); +MODULE_PARM_DESC(seg_order, "size order [n^2] of buffer segment for testing"); + +static unsigned int run_order = 32; /* 4G */ +module_param(run_order, uint, 0644); +MODULE_PARM_DESC(run_order, "size order [n^2] of total data to transfer"); + +static bool use_dma; /* default to 0 */ +module_param(use_dma, bool, 0644); +MODULE_PARM_DESC(use_dma, "Using DMA engine to measure performance"); + +struct perf_mw { + phys_addr_t phys_addr; + resource_size_t phys_size; + resource_size_t xlat_align; + resource_size_t xlat_align_size; + void __iomem *vbase; + size_t xlat_size; + size_t buf_size; + void *virt_addr; + dma_addr_t dma_addr; +}; + +struct perf_ctx; + +struct pthr_ctx { + struct task_struct *thread; + struct perf_ctx *perf; + atomic_t dma_sync; + struct dma_chan *dma_chan; + int dma_prep_err; + int src_idx; + void *srcs[MAX_SRCS]; +}; + +struct perf_ctx { + struct ntb_dev *ntb; + spinlock_t db_lock; + struct perf_mw mw; + bool link_is_up; + struct work_struct link_cleanup; + struct delayed_work link_work; + struct dentry *debugfs_node_dir; + struct dentry *debugfs_run; + struct dentry *debugfs_threads; + u8 perf_threads; + bool run; + struct pthr_ctx pthr_ctx[MAX_THREADS]; + atomic_t tsync; +}; + +enum { + VERSION = 0, + MW_SZ_HIGH, + MW_SZ_LOW, + SPAD_MSG, + SPAD_ACK, + MAX_SPAD +}; + +static void perf_link_event(void *ctx) +{ + struct perf_ctx *perf = ctx; + + if (ntb_link_is_up(perf->ntb, NULL, NULL) == 1) + schedule_delayed_work(&perf->link_work, 2*HZ); + else + schedule_work(&perf->link_cleanup); +} + +static void perf_db_event(void *ctx, int vec) +{ + struct perf_ctx *perf = ctx; + u64 db_bits, db_mask; + + db_mask = ntb_db_vector_mask(perf->ntb, vec); + db_bits = ntb_db_read(perf->ntb); + + dev_dbg(&perf->ntb->dev, "doorbell vec %d mask %#llx bits %#llx\n", + vec, db_mask, db_bits); +} + +static const struct ntb_ctx_ops perf_ops = { + .link_event = perf_link_event, + .db_event = perf_db_event, +}; + +static void perf_copy_callback(void *data) +{ + struct pthr_ctx *pctx = data; + + atomic_dec(&pctx->dma_sync); +} + +static ssize_t perf_copy(struct pthr_ctx *pctx, char *dst, + char *src, size_t size) +{ + struct perf_ctx *perf = pctx->perf; + struct dma_async_tx_descriptor *txd; + struct dma_chan *chan = pctx->dma_chan; + struct dma_device *device; + struct 
dmaengine_unmap_data *unmap; + dma_cookie_t cookie; + size_t src_off, dst_off; + struct perf_mw *mw = &perf->mw; + u64 vbase, dst_vaddr; + dma_addr_t dst_phys; + int retries = 0; + + if (!use_dma) { + memcpy_toio(dst, src, size); + return size; + } + + if (!chan) { + dev_err(&perf->ntb->dev, "DMA engine does not exist\n"); + return -EINVAL; + } + + device = chan->device; + src_off = (size_t)src & ~PAGE_MASK; + dst_off = (size_t)dst & ~PAGE_MASK; + + if (!is_dma_copy_aligned(device, src_off, dst_off, size)) + return -ENODEV; + + vbase = (u64)(u64 *)mw->vbase; + dst_vaddr = (u64)(u64 *)dst; + dst_phys = mw->phys_addr + (dst_vaddr - vbase); + + unmap = dmaengine_get_unmap_data(device->dev, 1, GFP_NOWAIT); + if (!unmap) + return -ENOMEM; + + unmap->len = size; + unmap->addr[0] = dma_map_page(device->dev, virt_to_page(src), + src_off, size, DMA_TO_DEVICE); + if (dma_mapping_error(device->dev, unmap->addr[0])) + goto err_get_unmap; + + unmap->to_cnt = 1; + + do { + txd = device->device_prep_dma_memcpy(chan, dst_phys, + unmap->addr[0], + size, DMA_PREP_INTERRUPT); + if (!txd) { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(DMA_OUT_RESOURCE_TO); + } + } while (!txd && (++retries < DMA_RETRIES)); + + if (!txd) { + pctx->dma_prep_err++; + goto err_get_unmap; + } + + txd->callback = perf_copy_callback; + txd->callback_param = pctx; + dma_set_unmap(txd, unmap); + + cookie = dmaengine_submit(txd); + if (dma_submit_error(cookie)) + goto err_set_unmap; + + atomic_inc(&pctx->dma_sync); + dma_async_issue_pending(chan); + + return size; + +err_set_unmap: + dmaengine_unmap_put(unmap); +err_get_unmap: + dmaengine_unmap_put(unmap); + return 0; +} + +static int perf_move_data(struct pthr_ctx *pctx, char *dst, char *src, + u64 buf_size, u64 win_size, u64 total) +{ + int chunks, total_chunks, i; + int copied_chunks = 0; + u64 copied = 0, result; + char *tmp = dst; + u64 perf, diff_us; + ktime_t kstart, kstop, kdiff; + + chunks = div64_u64(win_size, buf_size); + total_chunks = div64_u64(total, buf_size); + kstart = ktime_get(); + + for (i = 0; i < total_chunks; i++) { + result = perf_copy(pctx, tmp, src, buf_size); + copied += result; + copied_chunks++; + if (copied_chunks == chunks) { + tmp = dst; + copied_chunks = 0; + } else + tmp += buf_size; + + /* Probably should schedule every 4GB to prevent soft hang. 
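 * (Added note: the yield matters because the memcpy_toio() path never
 * sleeps on its own — with run_order=32 a thread would otherwise copy
 * 4 GiB back to back and could trip the soft-lockup watchdog. The DMA
 * path is excluded since perf_copy() may already sleep while retrying
 * descriptor allocation.)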
*/ + if (((copied % SZ_4G) == 0) && !use_dma) { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(1); + } + } + + if (use_dma) { + pr_info("%s: All DMA descriptors submitted\n", current->comm); + while (atomic_read(&pctx->dma_sync) != 0) + msleep(20); + } + + kstop = ktime_get(); + kdiff = ktime_sub(kstop, kstart); + diff_us = ktime_to_us(kdiff); + + pr_info("%s: copied %llu bytes\n", current->comm, copied); + + pr_info("%s: lasted %llu usecs\n", current->comm, diff_us); + + perf = div64_u64(copied, diff_us); + + pr_info("%s: MBytes/s: %llu\n", current->comm, perf); + + return 0; +} + +static bool perf_dma_filter_fn(struct dma_chan *chan, void *node) +{ + return dev_to_node(&chan->dev->device) == (int)(unsigned long)node; +} + +static int ntb_perf_thread(void *data) +{ + struct pthr_ctx *pctx = data; + struct perf_ctx *perf = pctx->perf; + struct pci_dev *pdev = perf->ntb->pdev; + struct perf_mw *mw = &perf->mw; + char *dst; + u64 win_size, buf_size, total; + void *src; + int rc, node, i; + struct dma_chan *dma_chan = NULL; + + pr_info("kthread %s starting...\n", current->comm); + + node = dev_to_node(&pdev->dev); + + if (use_dma && !pctx->dma_chan) { + dma_cap_mask_t dma_mask; + + dma_cap_zero(dma_mask); + dma_cap_set(DMA_MEMCPY, dma_mask); + dma_chan = dma_request_channel(dma_mask, perf_dma_filter_fn, + (void *)(unsigned long)node); + if (!dma_chan) { + pr_warn("%s: cannot acquire DMA channel, quitting\n", + current->comm); + return -ENODEV; + } + pctx->dma_chan = dma_chan; + } + + for (i = 0; i < MAX_SRCS; i++) { + pctx->srcs[i] = kmalloc_node(MAX_TEST_SIZE, GFP_KERNEL, node); + if (!pctx->srcs[i]) { + rc = -ENOMEM; + goto err; + } + } + + win_size = mw->phys_size; + buf_size = 1ULL << seg_order; + total = 1ULL << run_order; + + if (buf_size > MAX_TEST_SIZE) + buf_size = MAX_TEST_SIZE; + + dst = (char *)mw->vbase; + + atomic_inc(&perf->tsync); + while (atomic_read(&perf->tsync) != perf->perf_threads) + schedule(); + + src = pctx->srcs[pctx->src_idx]; + pctx->src_idx = (pctx->src_idx + 1) & (MAX_SRCS - 1); + + rc = perf_move_data(pctx, dst, src, buf_size, win_size, total); + + atomic_dec(&perf->tsync); + + if (rc < 0) { + pr_err("%s: failed\n", current->comm); + rc = -ENXIO; + goto err; + } + + for (i = 0; i < MAX_SRCS; i++) { + kfree(pctx->srcs[i]); + pctx->srcs[i] = NULL; + } + + return 0; + +err: + for (i = 0; i < MAX_SRCS; i++) { + kfree(pctx->srcs[i]); + pctx->srcs[i] = NULL; + } + + if (dma_chan) { + dma_release_channel(dma_chan); + pctx->dma_chan = NULL; + } + + return rc; +} + +static void perf_free_mw(struct perf_ctx *perf) +{ + struct perf_mw *mw = &perf->mw; + struct pci_dev *pdev = perf->ntb->pdev; + + if (!mw->virt_addr) + return; + + ntb_mw_clear_trans(perf->ntb, 0); + dma_free_coherent(&pdev->dev, mw->buf_size, + mw->virt_addr, mw->dma_addr); + mw->xlat_size = 0; + mw->buf_size = 0; + mw->virt_addr = NULL; +} + +static int perf_set_mw(struct perf_ctx *perf, resource_size_t size) +{ + struct perf_mw *mw = &perf->mw; + size_t xlat_size, buf_size; + + if (!size) + return -EINVAL; + + xlat_size = round_up(size, mw->xlat_align_size); + buf_size = round_up(size, mw->xlat_align); + + if (mw->xlat_size == xlat_size) + return 0; + + if (mw->buf_size) + perf_free_mw(perf); + + mw->xlat_size = xlat_size; + mw->buf_size = buf_size; + + mw->virt_addr = dma_alloc_coherent(&perf->ntb->pdev->dev, buf_size, + &mw->dma_addr, GFP_KERNEL); + if (!mw->virt_addr) { + mw->xlat_size = 0; + mw->buf_size = 0; + } + + return 0; +} + +static void perf_link_work(struct work_struct *work) +{ 
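	/*
	 * (Added note: the body below is a small scratchpad handshake.
	 * Each side writes its memory-window size and a protocol version
	 * into the peer's scratchpad registers, then reads its own side
	 * to see whether the peer has done the same. For example, a
	 * 64 MiB window is advertised as MW_SZ_HIGH = 0x0 and
	 * MW_SZ_LOW = 0x4000000; only once VERSION reads back as
	 * PERF_VERSION (0xffff0001) does perf_set_mw() size the local
	 * buffer to match the remote window, otherwise the work is
	 * rescheduled until the link settles.)
	 */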
+ struct perf_ctx *perf = + container_of(work, struct perf_ctx, link_work.work); + struct ntb_dev *ndev = perf->ntb; + struct pci_dev *pdev = ndev->pdev; + u32 val; + u64 size; + int rc; + + dev_dbg(&perf->ntb->pdev->dev, "%s called\n", __func__); + + size = perf->mw.phys_size; + ntb_peer_spad_write(ndev, MW_SZ_HIGH, upper_32_bits(size)); + ntb_peer_spad_write(ndev, MW_SZ_LOW, lower_32_bits(size)); + ntb_peer_spad_write(ndev, VERSION, PERF_VERSION); + + /* now read what peer wrote */ + val = ntb_spad_read(ndev, VERSION); + if (val != PERF_VERSION) { + dev_dbg(&pdev->dev, "Remote version = %#x\n", val); + goto out; + } + + val = ntb_spad_read(ndev, MW_SZ_HIGH); + size = (u64)val << 32; + + val = ntb_spad_read(ndev, MW_SZ_LOW); + size |= val; + + dev_dbg(&pdev->dev, "Remote MW size = %#llx\n", size); + + rc = perf_set_mw(perf, size); + if (rc) + goto out1; + + perf->link_is_up = true; + + return; + +out1: + perf_free_mw(perf); + +out: + if (ntb_link_is_up(ndev, NULL, NULL) == 1) + schedule_delayed_work(&perf->link_work, + msecs_to_jiffies(PERF_LINK_DOWN_TIMEOUT)); +} + +static void perf_link_cleanup(struct work_struct *work) +{ + struct perf_ctx *perf = container_of(work, + struct perf_ctx, + link_cleanup); + + dev_dbg(&perf->ntb->pdev->dev, "%s called\n", __func__); + + if (!perf->link_is_up) + cancel_delayed_work_sync(&perf->link_work); +} + +static int perf_setup_mw(struct ntb_dev *ntb, struct perf_ctx *perf) +{ + struct perf_mw *mw; + int rc; + + mw = &perf->mw; + + rc = ntb_mw_get_range(ntb, 0, &mw->phys_addr, &mw->phys_size, + &mw->xlat_align, &mw->xlat_align_size); + if (rc) + return rc; + + perf->mw.vbase = ioremap_wc(mw->phys_addr, mw->phys_size); + if (!mw->vbase) + return -ENOMEM; + + return 0; +} + +static ssize_t debugfs_run_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *offp) +{ + struct perf_ctx *perf = filp->private_data; + char *buf; + ssize_t ret, out_offset; + + if (!perf) + return 0; + + buf = kmalloc(64, GFP_KERNEL); + out_offset = snprintf(buf, 64, "%d\n", perf->run); + ret = simple_read_from_buffer(ubuf, count, offp, buf, out_offset); + kfree(buf); + + return ret; +} + +static ssize_t debugfs_run_write(struct file *filp, const char __user *ubuf, + size_t count, loff_t *offp) +{ + struct perf_ctx *perf = filp->private_data; + int node, i; + + if (!perf->link_is_up) + return 0; + + if (perf->perf_threads == 0) + return 0; + + if (atomic_read(&perf->tsync) == 0) + perf->run = false; + + if (perf->run) { + /* lets stop the threads */ + perf->run = false; + for (i = 0; i < MAX_THREADS; i++) { + if (perf->pthr_ctx[i].thread) { + kthread_stop(perf->pthr_ctx[i].thread); + perf->pthr_ctx[i].thread = NULL; + } else + break; + } + } else { + perf->run = true; + + if (perf->perf_threads > MAX_THREADS) { + perf->perf_threads = MAX_THREADS; + pr_info("Reset total threads to: %u\n", MAX_THREADS); + } + + /* no greater than 1M */ + if (seg_order > MAX_SEG_ORDER) { + seg_order = MAX_SEG_ORDER; + pr_info("Fix seg_order to %u\n", seg_order); + } + + if (run_order < seg_order) { + run_order = seg_order; + pr_info("Fix run_order to %u\n", run_order); + } + + node = dev_to_node(&perf->ntb->pdev->dev); + /* launch kernel thread */ + for (i = 0; i < perf->perf_threads; i++) { + struct pthr_ctx *pctx; + + pctx = &perf->pthr_ctx[i]; + atomic_set(&pctx->dma_sync, 0); + pctx->perf = perf; + pctx->thread = + kthread_create_on_node(ntb_perf_thread, + (void *)pctx, + node, "ntb_perf %d", i); + if (pctx->thread) + wake_up_process(pctx->thread); + else { + perf->run = false; + for 
(i = 0; i < MAX_THREADS; i++) { + if (pctx->thread) { + kthread_stop(pctx->thread); + pctx->thread = NULL; + } + } + } + + if (perf->run == false) + return -ENXIO; + } + + } + + return count; +} + +static const struct file_operations ntb_perf_debugfs_run = { + .owner = THIS_MODULE, + .open = simple_open, + .read = debugfs_run_read, + .write = debugfs_run_write, +}; + +static int perf_debugfs_setup(struct perf_ctx *perf) +{ + struct pci_dev *pdev = perf->ntb->pdev; + + if (!debugfs_initialized()) + return -ENODEV; + + if (!perf_debugfs_dir) { + perf_debugfs_dir = debugfs_create_dir(KBUILD_MODNAME, NULL); + if (!perf_debugfs_dir) + return -ENODEV; + } + + perf->debugfs_node_dir = debugfs_create_dir(pci_name(pdev), + perf_debugfs_dir); + if (!perf->debugfs_node_dir) + return -ENODEV; + + perf->debugfs_run = debugfs_create_file("run", S_IRUSR | S_IWUSR, + perf->debugfs_node_dir, perf, + &ntb_perf_debugfs_run); + if (!perf->debugfs_run) + return -ENODEV; + + perf->debugfs_threads = debugfs_create_u8("threads", S_IRUSR | S_IWUSR, + perf->debugfs_node_dir, + &perf->perf_threads); + if (!perf->debugfs_threads) + return -ENODEV; + + return 0; +} + +static int perf_probe(struct ntb_client *client, struct ntb_dev *ntb) +{ + struct pci_dev *pdev = ntb->pdev; + struct perf_ctx *perf; + int node; + int rc = 0; + + node = dev_to_node(&pdev->dev); + + perf = kzalloc_node(sizeof(*perf), GFP_KERNEL, node); + if (!perf) { + rc = -ENOMEM; + goto err_perf; + } + + perf->ntb = ntb; + perf->perf_threads = 1; + atomic_set(&perf->tsync, 0); + perf->run = false; + spin_lock_init(&perf->db_lock); + perf_setup_mw(ntb, perf); + INIT_DELAYED_WORK(&perf->link_work, perf_link_work); + INIT_WORK(&perf->link_cleanup, perf_link_cleanup); + + rc = ntb_set_ctx(ntb, perf, &perf_ops); + if (rc) + goto err_ctx; + + perf->link_is_up = false; + ntb_link_enable(ntb, NTB_SPEED_AUTO, NTB_WIDTH_AUTO); + ntb_link_event(ntb); + + rc = perf_debugfs_setup(perf); + if (rc) + goto err_ctx; + + return 0; + +err_ctx: + cancel_delayed_work_sync(&perf->link_work); + cancel_work_sync(&perf->link_cleanup); + kfree(perf); +err_perf: + return rc; +} + +static void perf_remove(struct ntb_client *client, struct ntb_dev *ntb) +{ + struct perf_ctx *perf = ntb->ctx; + int i; + + dev_dbg(&perf->ntb->dev, "%s called\n", __func__); + + cancel_delayed_work_sync(&perf->link_work); + cancel_work_sync(&perf->link_cleanup); + + ntb_clear_ctx(ntb); + ntb_link_disable(ntb); + + debugfs_remove_recursive(perf_debugfs_dir); + perf_debugfs_dir = NULL; + + if (use_dma) { + for (i = 0; i < MAX_THREADS; i++) { + struct pthr_ctx *pctx = &perf->pthr_ctx[i]; + + if (pctx->dma_chan) + dma_release_channel(pctx->dma_chan); + } + } + + kfree(perf); +} + +static struct ntb_client perf_client = { + .ops = { + .probe = perf_probe, + .remove = perf_remove, + }, +}; +module_ntb_client(perf_client); diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig index 002a94abdbc4..5d6237391dcd 100644 --- a/drivers/nvme/host/Kconfig +++ b/drivers/nvme/host/Kconfig @@ -8,3 +8,14 @@ config BLK_DEV_NVME To compile this driver as a module, choose M here: the module will be called nvme. + +config BLK_DEV_NVME_SCSI + bool "SCSI emulation for NVMe device nodes" + depends on BLK_DEV_NVME + ---help--- + This adds support for the SG_IO ioctl on the NVMe character + and block devices nodes, as well a a translation for a small + number of selected SCSI commands to NVMe commands to the NVMe + driver. 
If you don't know what this means you probably want + to say N here, and if you know what it means you probably + want to say N as well. diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile index a5fe23952586..51bf90871549 100644 --- a/drivers/nvme/host/Makefile +++ b/drivers/nvme/host/Makefile @@ -1,5 +1,6 @@ obj-$(CONFIG_BLK_DEV_NVME) += nvme.o -lightnvm-$(CONFIG_NVM) := lightnvm.o -nvme-y += pci.o scsi.o $(lightnvm-y) +lightnvm-$(CONFIG_NVM) := lightnvm.o +nvme-y += core.o pci.o $(lightnvm-y) +nvme-$(CONFIG_BLK_DEV_NVME_SCSI) += scsi.o diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c new file mode 100644 index 000000000000..c5bf001af559 --- /dev/null +++ b/drivers/nvme/host/core.c @@ -0,0 +1,1472 @@ +/* + * NVM Express device driver + * Copyright (c) 2011-2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include <linux/blkdev.h> +#include <linux/blk-mq.h> +#include <linux/delay.h> +#include <linux/errno.h> +#include <linux/hdreg.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/list_sort.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/pr.h> +#include <linux/ptrace.h> +#include <linux/nvme_ioctl.h> +#include <linux/t10-pi.h> +#include <scsi/sg.h> +#include <asm/unaligned.h> + +#include "nvme.h" + +#define NVME_MINORS (1U << MINORBITS) + +static int nvme_major; +module_param(nvme_major, int, 0); + +static int nvme_char_major; +module_param(nvme_char_major, int, 0); + +static LIST_HEAD(nvme_ctrl_list); +DEFINE_SPINLOCK(dev_list_lock); + +static struct class *nvme_class; + +static void nvme_free_ns(struct kref *kref) +{ + struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref); + + if (ns->type == NVME_NS_LIGHTNVM) + nvme_nvm_unregister(ns->queue, ns->disk->disk_name); + + spin_lock(&dev_list_lock); + ns->disk->private_data = NULL; + spin_unlock(&dev_list_lock); + + nvme_put_ctrl(ns->ctrl); + put_disk(ns->disk); + kfree(ns); +} + +static void nvme_put_ns(struct nvme_ns *ns) +{ + kref_put(&ns->kref, nvme_free_ns); +} + +static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk) +{ + struct nvme_ns *ns; + + spin_lock(&dev_list_lock); + ns = disk->private_data; + if (ns && !kref_get_unless_zero(&ns->kref)) + ns = NULL; + spin_unlock(&dev_list_lock); + + return ns; +} + +void nvme_requeue_req(struct request *req) +{ + unsigned long flags; + + blk_mq_requeue_request(req); + spin_lock_irqsave(req->q->queue_lock, flags); + if (!blk_queue_stopped(req->q)) + blk_mq_kick_requeue_list(req->q); + spin_unlock_irqrestore(req->q->queue_lock, flags); +} + +struct request *nvme_alloc_request(struct request_queue *q, + struct nvme_command *cmd, unsigned int flags) +{ + bool write = cmd->common.opcode & 1; + struct request *req; + + req = blk_mq_alloc_request(q, write, flags); + if (IS_ERR(req)) + return req; + + req->cmd_type = REQ_TYPE_DRV_PRIV; + req->cmd_flags |= REQ_FAILFAST_DRIVER; + req->__data_len = 0; + req->__sector = (sector_t) -1; + req->bio = req->biotail = NULL; + + req->cmd = (unsigned char *)cmd; + req->cmd_len = sizeof(struct nvme_command); + req->special = (void 
*)0; + + return req; +} + +/* + * Returns 0 on success. If the result is negative, it's a Linux error code; + * if the result is positive, it's an NVM Express status code + */ +int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, + void *buffer, unsigned bufflen, u32 *result, unsigned timeout) +{ + struct request *req; + int ret; + + req = nvme_alloc_request(q, cmd, 0); + if (IS_ERR(req)) + return PTR_ERR(req); + + req->timeout = timeout ? timeout : ADMIN_TIMEOUT; + + if (buffer && bufflen) { + ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL); + if (ret) + goto out; + } + + blk_execute_rq(req->q, NULL, req, 0); + if (result) + *result = (u32)(uintptr_t)req->special; + ret = req->errors; + out: + blk_mq_free_request(req); + return ret; +} + +int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, + void *buffer, unsigned bufflen) +{ + return __nvme_submit_sync_cmd(q, cmd, buffer, bufflen, NULL, 0); +} + +int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, + void __user *ubuffer, unsigned bufflen, + void __user *meta_buffer, unsigned meta_len, u32 meta_seed, + u32 *result, unsigned timeout) +{ + bool write = cmd->common.opcode & 1; + struct nvme_ns *ns = q->queuedata; + struct gendisk *disk = ns ? ns->disk : NULL; + struct request *req; + struct bio *bio = NULL; + void *meta = NULL; + int ret; + + req = nvme_alloc_request(q, cmd, 0); + if (IS_ERR(req)) + return PTR_ERR(req); + + req->timeout = timeout ? timeout : ADMIN_TIMEOUT; + + if (ubuffer && bufflen) { + ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, + GFP_KERNEL); + if (ret) + goto out; + bio = req->bio; + + if (!disk) + goto submit; + bio->bi_bdev = bdget_disk(disk, 0); + if (!bio->bi_bdev) { + ret = -ENODEV; + goto out_unmap; + } + + if (meta_buffer) { + struct bio_integrity_payload *bip; + + meta = kmalloc(meta_len, GFP_KERNEL); + if (!meta) { + ret = -ENOMEM; + goto out_unmap; + } + + if (write) { + if (copy_from_user(meta, meta_buffer, + meta_len)) { + ret = -EFAULT; + goto out_free_meta; + } + } + + bip = bio_integrity_alloc(bio, GFP_KERNEL, 1); + if (IS_ERR(bip)) { + ret = PTR_ERR(bip); + goto out_free_meta; + } + + bip->bip_iter.bi_size = meta_len; + bip->bip_iter.bi_sector = meta_seed; + + ret = bio_integrity_add_page(bio, virt_to_page(meta), + meta_len, offset_in_page(meta)); + if (ret != meta_len) { + ret = -ENOMEM; + goto out_free_meta; + } + } + } + submit: + blk_execute_rq(req->q, disk, req, 0); + ret = req->errors; + if (result) + *result = (u32)(uintptr_t)req->special; + if (meta && !ret && !write) { + if (copy_to_user(meta_buffer, meta, meta_len)) + ret = -EFAULT; + } + out_free_meta: + kfree(meta); + out_unmap: + if (bio) { + if (disk && bio->bi_bdev) + bdput(bio->bi_bdev); + blk_rq_unmap_user(bio); + } + out: + blk_mq_free_request(req); + return ret; +} + +int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, + void __user *ubuffer, unsigned bufflen, u32 *result, + unsigned timeout) +{ + return __nvme_submit_user_cmd(q, cmd, ubuffer, bufflen, NULL, 0, 0, + result, timeout); +} + +int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id) +{ + struct nvme_command c = { }; + int error; + + /* gcc-4.4.4 (at least) has issues with initializers and anon unions */ + c.identify.opcode = nvme_admin_identify; + c.identify.cns = cpu_to_le32(1); + + *id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL); + if (!*id) + return -ENOMEM; + + error = nvme_submit_sync_cmd(dev->admin_q, &c, *id, + sizeof(struct 
nvme_id_ctrl)); + if (error) + kfree(*id); + return error; +} + +static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *ns_list) +{ + struct nvme_command c = { }; + + c.identify.opcode = nvme_admin_identify; + c.identify.cns = cpu_to_le32(2); + c.identify.nsid = cpu_to_le32(nsid); + return nvme_submit_sync_cmd(dev->admin_q, &c, ns_list, 0x1000); +} + +int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid, + struct nvme_id_ns **id) +{ + struct nvme_command c = { }; + int error; + + /* gcc-4.4.4 (at least) has issues with initializers and anon unions */ + c.identify.opcode = nvme_admin_identify, + c.identify.nsid = cpu_to_le32(nsid), + + *id = kmalloc(sizeof(struct nvme_id_ns), GFP_KERNEL); + if (!*id) + return -ENOMEM; + + error = nvme_submit_sync_cmd(dev->admin_q, &c, *id, + sizeof(struct nvme_id_ns)); + if (error) + kfree(*id); + return error; +} + +int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid, + dma_addr_t dma_addr, u32 *result) +{ + struct nvme_command c; + + memset(&c, 0, sizeof(c)); + c.features.opcode = nvme_admin_get_features; + c.features.nsid = cpu_to_le32(nsid); + c.features.prp1 = cpu_to_le64(dma_addr); + c.features.fid = cpu_to_le32(fid); + + return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0, result, 0); +} + +int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, + dma_addr_t dma_addr, u32 *result) +{ + struct nvme_command c; + + memset(&c, 0, sizeof(c)); + c.features.opcode = nvme_admin_set_features; + c.features.prp1 = cpu_to_le64(dma_addr); + c.features.fid = cpu_to_le32(fid); + c.features.dword11 = cpu_to_le32(dword11); + + return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0, result, 0); +} + +int nvme_get_log_page(struct nvme_ctrl *dev, struct nvme_smart_log **log) +{ + struct nvme_command c = { }; + int error; + + c.common.opcode = nvme_admin_get_log_page, + c.common.nsid = cpu_to_le32(0xFFFFFFFF), + c.common.cdw10[0] = cpu_to_le32( + (((sizeof(struct nvme_smart_log) / 4) - 1) << 16) | + NVME_LOG_SMART), + + *log = kmalloc(sizeof(struct nvme_smart_log), GFP_KERNEL); + if (!*log) + return -ENOMEM; + + error = nvme_submit_sync_cmd(dev->admin_q, &c, *log, + sizeof(struct nvme_smart_log)); + if (error) + kfree(*log); + return error; +} + +int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count) +{ + u32 q_count = (*count - 1) | ((*count - 1) << 16); + u32 result; + int status, nr_io_queues; + + status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, 0, + &result); + if (status) + return status; + + nr_io_queues = min(result & 0xffff, result >> 16) + 1; + *count = min(*count, nr_io_queues); + return 0; +} + +static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) +{ + struct nvme_user_io io; + struct nvme_command c; + unsigned length, meta_len; + void __user *metadata; + + if (copy_from_user(&io, uio, sizeof(io))) + return -EFAULT; + + switch (io.opcode) { + case nvme_cmd_write: + case nvme_cmd_read: + case nvme_cmd_compare: + break; + default: + return -EINVAL; + } + + length = (io.nblocks + 1) << ns->lba_shift; + meta_len = (io.nblocks + 1) * ns->ms; + metadata = (void __user *)(uintptr_t)io.metadata; + + if (ns->ext) { + length += meta_len; + meta_len = 0; + } else if (meta_len) { + if ((io.metadata & 3) || !io.metadata) + return -EINVAL; + } + + memset(&c, 0, sizeof(c)); + c.rw.opcode = io.opcode; + c.rw.flags = io.flags; + c.rw.nsid = cpu_to_le32(ns->ns_id); + c.rw.slba = cpu_to_le64(io.slba); + c.rw.length = cpu_to_le16(io.nblocks); + c.rw.control = 
cpu_to_le16(io.control); + c.rw.dsmgmt = cpu_to_le32(io.dsmgmt); + c.rw.reftag = cpu_to_le32(io.reftag); + c.rw.apptag = cpu_to_le16(io.apptag); + c.rw.appmask = cpu_to_le16(io.appmask); + + return __nvme_submit_user_cmd(ns->queue, &c, + (void __user *)(uintptr_t)io.addr, length, + metadata, meta_len, io.slba, NULL, 0); +} + +static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, + struct nvme_passthru_cmd __user *ucmd) +{ + struct nvme_passthru_cmd cmd; + struct nvme_command c; + unsigned timeout = 0; + int status; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + if (copy_from_user(&cmd, ucmd, sizeof(cmd))) + return -EFAULT; + + memset(&c, 0, sizeof(c)); + c.common.opcode = cmd.opcode; + c.common.flags = cmd.flags; + c.common.nsid = cpu_to_le32(cmd.nsid); + c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); + c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); + c.common.cdw10[0] = cpu_to_le32(cmd.cdw10); + c.common.cdw10[1] = cpu_to_le32(cmd.cdw11); + c.common.cdw10[2] = cpu_to_le32(cmd.cdw12); + c.common.cdw10[3] = cpu_to_le32(cmd.cdw13); + c.common.cdw10[4] = cpu_to_le32(cmd.cdw14); + c.common.cdw10[5] = cpu_to_le32(cmd.cdw15); + + if (cmd.timeout_ms) + timeout = msecs_to_jiffies(cmd.timeout_ms); + + status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, + (void __user *)(uintptr_t)cmd.addr, cmd.data_len, + &cmd.result, timeout); + if (status >= 0) { + if (put_user(cmd.result, &ucmd->result)) + return -EFAULT; + } + + return status; +} + +static int nvme_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + struct nvme_ns *ns = bdev->bd_disk->private_data; + + switch (cmd) { + case NVME_IOCTL_ID: + force_successful_syscall_return(); + return ns->ns_id; + case NVME_IOCTL_ADMIN_CMD: + return nvme_user_cmd(ns->ctrl, NULL, (void __user *)arg); + case NVME_IOCTL_IO_CMD: + return nvme_user_cmd(ns->ctrl, ns, (void __user *)arg); + case NVME_IOCTL_SUBMIT_IO: + return nvme_submit_io(ns, (void __user *)arg); +#ifdef CONFIG_BLK_DEV_NVME_SCSI + case SG_GET_VERSION_NUM: + return nvme_sg_get_version_num((void __user *)arg); + case SG_IO: + return nvme_sg_io(ns, (void __user *)arg); +#endif + default: + return -ENOTTY; + } +} + +#ifdef CONFIG_COMPAT +static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case SG_IO: + return -ENOIOCTLCMD; + } + return nvme_ioctl(bdev, mode, cmd, arg); +} +#else +#define nvme_compat_ioctl NULL +#endif + +static int nvme_open(struct block_device *bdev, fmode_t mode) +{ + return nvme_get_ns_from_disk(bdev->bd_disk) ? 
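	/* (added note) non-NULL: a namespace kref is now held; nvme_release() drops it via nvme_put_ns() */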
0 : -ENXIO; +} + +static void nvme_release(struct gendisk *disk, fmode_t mode) +{ + nvme_put_ns(disk->private_data); +} + +static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + /* some standard values */ + geo->heads = 1 << 6; + geo->sectors = 1 << 5; + geo->cylinders = get_capacity(bdev->bd_disk) >> 11; + return 0; +} + +#ifdef CONFIG_BLK_DEV_INTEGRITY +static void nvme_init_integrity(struct nvme_ns *ns) +{ + struct blk_integrity integrity; + + switch (ns->pi_type) { + case NVME_NS_DPS_PI_TYPE3: + integrity.profile = &t10_pi_type3_crc; + break; + case NVME_NS_DPS_PI_TYPE1: + case NVME_NS_DPS_PI_TYPE2: + integrity.profile = &t10_pi_type1_crc; + break; + default: + integrity.profile = NULL; + break; + } + integrity.tuple_size = ns->ms; + blk_integrity_register(ns->disk, &integrity); + blk_queue_max_integrity_segments(ns->queue, 1); +} +#else +static void nvme_init_integrity(struct nvme_ns *ns) +{ +} +#endif /* CONFIG_BLK_DEV_INTEGRITY */ + +static void nvme_config_discard(struct nvme_ns *ns) +{ + u32 logical_block_size = queue_logical_block_size(ns->queue); + ns->queue->limits.discard_zeroes_data = 0; + ns->queue->limits.discard_alignment = logical_block_size; + ns->queue->limits.discard_granularity = logical_block_size; + blk_queue_max_discard_sectors(ns->queue, 0xffffffff); + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); +} + +static int nvme_revalidate_disk(struct gendisk *disk) +{ + struct nvme_ns *ns = disk->private_data; + struct nvme_id_ns *id; + u8 lbaf, pi_type; + u16 old_ms; + unsigned short bs; + + if (nvme_identify_ns(ns->ctrl, ns->ns_id, &id)) { + dev_warn(ns->ctrl->dev, "%s: Identify failure nvme%dn%d\n", + __func__, ns->ctrl->instance, ns->ns_id); + return -ENODEV; + } + if (id->ncap == 0) { + kfree(id); + return -ENODEV; + } + + if (nvme_nvm_ns_supported(ns, id) && ns->type != NVME_NS_LIGHTNVM) { + if (nvme_nvm_register(ns->queue, disk->disk_name)) { + dev_warn(ns->ctrl->dev, + "%s: LightNVM init failure\n", __func__); + kfree(id); + return -ENODEV; + } + ns->type = NVME_NS_LIGHTNVM; + } + + if (ns->ctrl->vs >= NVME_VS(1, 1)) + memcpy(ns->eui, id->eui64, sizeof(ns->eui)); + if (ns->ctrl->vs >= NVME_VS(1, 2)) + memcpy(ns->uuid, id->nguid, sizeof(ns->uuid)); + + old_ms = ns->ms; + lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK; + ns->lba_shift = id->lbaf[lbaf].ds; + ns->ms = le16_to_cpu(id->lbaf[lbaf].ms); + ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT); + + /* + * If identify namespace failed, use default 512 byte block size so + * block layer can use before failing read/write for 0 capacity. + */ + if (ns->lba_shift == 0) + ns->lba_shift = 9; + bs = 1 << ns->lba_shift; + /* XXX: PI implementation requires metadata equal t10 pi tuple size */ + pi_type = ns->ms == sizeof(struct t10_pi_tuple) ? 
+ id->dps & NVME_NS_DPS_PI_MASK : 0; + + blk_mq_freeze_queue(disk->queue); + if (blk_get_integrity(disk) && (ns->pi_type != pi_type || + ns->ms != old_ms || + bs != queue_logical_block_size(disk->queue) || + (ns->ms && ns->ext))) + blk_integrity_unregister(disk); + + ns->pi_type = pi_type; + blk_queue_logical_block_size(ns->queue, bs); + + if (ns->ms && !blk_get_integrity(disk) && !ns->ext) + nvme_init_integrity(ns); + if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk)) + set_capacity(disk, 0); + else + set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); + + if (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM) + nvme_config_discard(ns); + blk_mq_unfreeze_queue(disk->queue); + + kfree(id); + return 0; +} + +static char nvme_pr_type(enum pr_type type) +{ + switch (type) { + case PR_WRITE_EXCLUSIVE: + return 1; + case PR_EXCLUSIVE_ACCESS: + return 2; + case PR_WRITE_EXCLUSIVE_REG_ONLY: + return 3; + case PR_EXCLUSIVE_ACCESS_REG_ONLY: + return 4; + case PR_WRITE_EXCLUSIVE_ALL_REGS: + return 5; + case PR_EXCLUSIVE_ACCESS_ALL_REGS: + return 6; + default: + return 0; + } +}; + +static int nvme_pr_command(struct block_device *bdev, u32 cdw10, + u64 key, u64 sa_key, u8 op) +{ + struct nvme_ns *ns = bdev->bd_disk->private_data; + struct nvme_command c; + u8 data[16] = { 0, }; + + put_unaligned_le64(key, &data[0]); + put_unaligned_le64(sa_key, &data[8]); + + memset(&c, 0, sizeof(c)); + c.common.opcode = op; + c.common.nsid = cpu_to_le32(ns->ns_id); + c.common.cdw10[0] = cpu_to_le32(cdw10); + + return nvme_submit_sync_cmd(ns->queue, &c, data, 16); +} + +static int nvme_pr_register(struct block_device *bdev, u64 old, + u64 new, unsigned flags) +{ + u32 cdw10; + + if (flags & ~PR_FL_IGNORE_KEY) + return -EOPNOTSUPP; + + cdw10 = old ? 2 : 0; + cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0; + cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */ + return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register); +} + +static int nvme_pr_reserve(struct block_device *bdev, u64 key, + enum pr_type type, unsigned flags) +{ + u32 cdw10; + + if (flags & ~PR_FL_IGNORE_KEY) + return -EOPNOTSUPP; + + cdw10 = nvme_pr_type(type) << 8; + cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0); + return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire); +} + +static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new, + enum pr_type type, bool abort) +{ + u32 cdw10 = nvme_pr_type(type) << 8 | abort ? 2 : 1; + return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire); +} + +static int nvme_pr_clear(struct block_device *bdev, u64 key) +{ + u32 cdw10 = 1 | (key ? 1 << 3 : 0); + return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register); +} + +static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type) +{ + u32 cdw10 = nvme_pr_type(type) << 8 | key ? 
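	/* (added note) as in nvme_pr_preempt() above, | binds tighter than ?:, so this evaluates ((nvme_pr_type(type) << 8) | key) ? (1 << 3) : 0 — the shifted type lands in the truth test, not in the resulting cdw10 */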
1 << 3 : 0; + return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release); +} + +static const struct pr_ops nvme_pr_ops = { + .pr_register = nvme_pr_register, + .pr_reserve = nvme_pr_reserve, + .pr_release = nvme_pr_release, + .pr_preempt = nvme_pr_preempt, + .pr_clear = nvme_pr_clear, +}; + +static const struct block_device_operations nvme_fops = { + .owner = THIS_MODULE, + .ioctl = nvme_ioctl, + .compat_ioctl = nvme_compat_ioctl, + .open = nvme_open, + .release = nvme_release, + .getgeo = nvme_getgeo, + .revalidate_disk= nvme_revalidate_disk, + .pr_ops = &nvme_pr_ops, +}; + +static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled) +{ + unsigned long timeout = + ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies; + u32 csts, bit = enabled ? NVME_CSTS_RDY : 0; + int ret; + + while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) { + if ((csts & NVME_CSTS_RDY) == bit) + break; + + msleep(100); + if (fatal_signal_pending(current)) + return -EINTR; + if (time_after(jiffies, timeout)) { + dev_err(ctrl->dev, + "Device not ready; aborting %s\n", enabled ? + "initialisation" : "reset"); + return -ENODEV; + } + } + + return ret; +} + +/* + * If the device has been passed off to us in an enabled state, just clear + * the enabled bit. The spec says we should set the 'shutdown notification + * bits', but doing so may cause the device to complete commands to the + * admin queue ... and we don't know what memory that might be pointing at! + */ +int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap) +{ + int ret; + + ctrl->ctrl_config &= ~NVME_CC_SHN_MASK; + ctrl->ctrl_config &= ~NVME_CC_ENABLE; + + ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); + if (ret) + return ret; + return nvme_wait_ready(ctrl, cap, false); +} + +int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap) +{ + /* + * Default to a 4K page size, with the intention to update this + * path in the future to accomodate architectures with differing + * kernel and IO page sizes. + */ + unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12, page_shift = 12; + int ret; + + if (page_shift < dev_page_min) { + dev_err(ctrl->dev, + "Minimum device page size %u too large for host (%u)\n", + 1 << dev_page_min, 1 << page_shift); + return -ENODEV; + } + + ctrl->page_size = 1 << page_shift; + + ctrl->ctrl_config = NVME_CC_CSS_NVM; + ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT; + ctrl->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE; + ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; + ctrl->ctrl_config |= NVME_CC_ENABLE; + + ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); + if (ret) + return ret; + return nvme_wait_ready(ctrl, cap, true); +} + +int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl) +{ + unsigned long timeout = SHUTDOWN_TIMEOUT + jiffies; + u32 csts; + int ret; + + ctrl->ctrl_config &= ~NVME_CC_SHN_MASK; + ctrl->ctrl_config |= NVME_CC_SHN_NORMAL; + + ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); + if (ret) + return ret; + + while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) { + if ((csts & NVME_CSTS_SHST_MASK) == NVME_CSTS_SHST_CMPLT) + break; + + msleep(100); + if (fatal_signal_pending(current)) + return -EINTR; + if (time_after(jiffies, timeout)) { + dev_err(ctrl->dev, + "Device shutdown incomplete; abort shutdown\n"); + return -ENODEV; + } + } + + return ret; +} + +/* + * Initialize the cached copies of the Identify data and various controller + * register in our nvme_ctrl structure. 
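 * (Added note: "various controller register" here means VS and CAP,
 * fetched through the transport's reg_read32()/reg_read64() ops.)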
This should be called as soon as + * the admin queue is fully up and running. + */ +int nvme_init_identify(struct nvme_ctrl *ctrl) +{ + struct nvme_id_ctrl *id; + u64 cap; + int ret, page_shift; + + ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs); + if (ret) { + dev_err(ctrl->dev, "Reading VS failed (%d)\n", ret); + return ret; + } + + ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &cap); + if (ret) { + dev_err(ctrl->dev, "Reading CAP failed (%d)\n", ret); + return ret; + } + page_shift = NVME_CAP_MPSMIN(cap) + 12; + + if (ctrl->vs >= NVME_VS(1, 1)) + ctrl->subsystem = NVME_CAP_NSSRC(cap); + + ret = nvme_identify_ctrl(ctrl, &id); + if (ret) { + dev_err(ctrl->dev, "Identify Controller failed (%d)\n", ret); + return -EIO; + } + + ctrl->oncs = le16_to_cpup(&id->oncs); + atomic_set(&ctrl->abort_limit, id->acl + 1); + ctrl->vwc = id->vwc; + memcpy(ctrl->serial, id->sn, sizeof(id->sn)); + memcpy(ctrl->model, id->mn, sizeof(id->mn)); + memcpy(ctrl->firmware_rev, id->fr, sizeof(id->fr)); + if (id->mdts) + ctrl->max_hw_sectors = 1 << (id->mdts + page_shift - 9); + else + ctrl->max_hw_sectors = UINT_MAX; + + if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) && id->vs[3]) { + unsigned int max_hw_sectors; + + ctrl->stripe_size = 1 << (id->vs[3] + page_shift); + max_hw_sectors = ctrl->stripe_size >> (page_shift - 9); + if (ctrl->max_hw_sectors) { + ctrl->max_hw_sectors = min(max_hw_sectors, + ctrl->max_hw_sectors); + } else { + ctrl->max_hw_sectors = max_hw_sectors; + } + } + + kfree(id); + return 0; +} + +static int nvme_dev_open(struct inode *inode, struct file *file) +{ + struct nvme_ctrl *ctrl; + int instance = iminor(inode); + int ret = -ENODEV; + + spin_lock(&dev_list_lock); + list_for_each_entry(ctrl, &nvme_ctrl_list, node) { + if (ctrl->instance != instance) + continue; + + if (!ctrl->admin_q) { + ret = -EWOULDBLOCK; + break; + } + if (!kref_get_unless_zero(&ctrl->kref)) + break; + file->private_data = ctrl; + ret = 0; + break; + } + spin_unlock(&dev_list_lock); + + return ret; +} + +static int nvme_dev_release(struct inode *inode, struct file *file) +{ + nvme_put_ctrl(file->private_data); + return 0; +} + +static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp) +{ + struct nvme_ns *ns; + int ret; + + mutex_lock(&ctrl->namespaces_mutex); + if (list_empty(&ctrl->namespaces)) { + ret = -ENOTTY; + goto out_unlock; + } + + ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list); + if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) { + dev_warn(ctrl->dev, + "NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n"); + ret = -EINVAL; + goto out_unlock; + } + + dev_warn(ctrl->dev, + "using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n"); + kref_get(&ns->kref); + mutex_unlock(&ctrl->namespaces_mutex); + + ret = nvme_user_cmd(ctrl, ns, argp); + nvme_put_ns(ns); + return ret; + +out_unlock: + mutex_unlock(&ctrl->namespaces_mutex); + return ret; +} + +static long nvme_dev_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct nvme_ctrl *ctrl = file->private_data; + void __user *argp = (void __user *)arg; + + switch (cmd) { + case NVME_IOCTL_ADMIN_CMD: + return nvme_user_cmd(ctrl, NULL, argp); + case NVME_IOCTL_IO_CMD: + return nvme_dev_user_cmd(ctrl, argp); + case NVME_IOCTL_RESET: + dev_warn(ctrl->dev, "resetting controller\n"); + return ctrl->ops->reset_ctrl(ctrl); + case NVME_IOCTL_SUBSYS_RESET: + return nvme_reset_subsystem(ctrl); + default: + return -ENOTTY; + } +} + +static const struct file_operations 
nvme_dev_fops = { + .owner = THIS_MODULE, + .open = nvme_dev_open, + .release = nvme_dev_release, + .unlocked_ioctl = nvme_dev_ioctl, + .compat_ioctl = nvme_dev_ioctl, +}; + +static ssize_t nvme_sysfs_reset(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + int ret; + + ret = ctrl->ops->reset_ctrl(ctrl); + if (ret < 0) + return ret; + return count; +} +static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset); + +static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_ns *ns = dev_to_disk(dev)->private_data; + return sprintf(buf, "%pU\n", ns->uuid); +} +static DEVICE_ATTR(uuid, S_IRUGO, uuid_show, NULL); + +static ssize_t eui_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_ns *ns = dev_to_disk(dev)->private_data; + return sprintf(buf, "%8phd\n", ns->eui); +} +static DEVICE_ATTR(eui, S_IRUGO, eui_show, NULL); + +static ssize_t nsid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_ns *ns = dev_to_disk(dev)->private_data; + return sprintf(buf, "%d\n", ns->ns_id); +} +static DEVICE_ATTR(nsid, S_IRUGO, nsid_show, NULL); + +static struct attribute *nvme_ns_attrs[] = { + &dev_attr_uuid.attr, + &dev_attr_eui.attr, + &dev_attr_nsid.attr, + NULL, +}; + +static umode_t nvme_attrs_are_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct nvme_ns *ns = dev_to_disk(dev)->private_data; + + if (a == &dev_attr_uuid.attr) { + if (!memchr_inv(ns->uuid, 0, sizeof(ns->uuid))) + return 0; + } + if (a == &dev_attr_eui.attr) { + if (!memchr_inv(ns->eui, 0, sizeof(ns->eui))) + return 0; + } + return a->mode; +} + +static const struct attribute_group nvme_ns_attr_group = { + .attrs = nvme_ns_attrs, + .is_visible = nvme_attrs_are_visible, +}; + +#define nvme_show_function(field) \ +static ssize_t field##_show(struct device *dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \ + return sprintf(buf, "%.*s\n", (int)sizeof(ctrl->field), ctrl->field); \ +} \ +static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); + +nvme_show_function(model); +nvme_show_function(serial); +nvme_show_function(firmware_rev); + +static struct attribute *nvme_dev_attrs[] = { + &dev_attr_reset_controller.attr, + &dev_attr_model.attr, + &dev_attr_serial.attr, + &dev_attr_firmware_rev.attr, + NULL +}; + +static struct attribute_group nvme_dev_attrs_group = { + .attrs = nvme_dev_attrs, +}; + +static const struct attribute_group *nvme_dev_attr_groups[] = { + &nvme_dev_attrs_group, + NULL, +}; + +static int ns_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct nvme_ns *nsa = container_of(a, struct nvme_ns, list); + struct nvme_ns *nsb = container_of(b, struct nvme_ns, list); + + return nsa->ns_id - nsb->ns_id; +} + +static struct nvme_ns *nvme_find_ns(struct nvme_ctrl *ctrl, unsigned nsid) +{ + struct nvme_ns *ns; + + lockdep_assert_held(&ctrl->namespaces_mutex); + + list_for_each_entry(ns, &ctrl->namespaces, list) { + if (ns->ns_id == nsid) + return ns; + if (ns->ns_id > nsid) + break; + } + return NULL; +} + +static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) +{ + struct nvme_ns *ns; + struct gendisk *disk; + int node = dev_to_node(ctrl->dev); + + lockdep_assert_held(&ctrl->namespaces_mutex); + + ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); + if (!ns) 
+ return; + + ns->queue = blk_mq_init_queue(ctrl->tagset); + if (IS_ERR(ns->queue)) + goto out_free_ns; + queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue); + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue); + ns->queue->queuedata = ns; + ns->ctrl = ctrl; + + disk = alloc_disk_node(0, node); + if (!disk) + goto out_free_queue; + + kref_init(&ns->kref); + ns->ns_id = nsid; + ns->disk = disk; + ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */ + + blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); + if (ctrl->max_hw_sectors) { + blk_queue_max_hw_sectors(ns->queue, ctrl->max_hw_sectors); + blk_queue_max_segments(ns->queue, + (ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1); + } + if (ctrl->stripe_size) + blk_queue_chunk_sectors(ns->queue, ctrl->stripe_size >> 9); + if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) + blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA); + blk_queue_virt_boundary(ns->queue, ctrl->page_size - 1); + + disk->major = nvme_major; + disk->first_minor = 0; + disk->fops = &nvme_fops; + disk->private_data = ns; + disk->queue = ns->queue; + disk->driverfs_dev = ctrl->device; + disk->flags = GENHD_FL_EXT_DEVT; + sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, nsid); + + if (nvme_revalidate_disk(ns->disk)) + goto out_free_disk; + + list_add_tail(&ns->list, &ctrl->namespaces); + kref_get(&ctrl->kref); + if (ns->type == NVME_NS_LIGHTNVM) + return; + + add_disk(ns->disk); + if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj, + &nvme_ns_attr_group)) + pr_warn("%s: failed to create sysfs group for identification\n", + ns->disk->disk_name); + return; + out_free_disk: + kfree(disk); + out_free_queue: + blk_cleanup_queue(ns->queue); + out_free_ns: + kfree(ns); +} + +static void nvme_ns_remove(struct nvme_ns *ns) +{ + bool kill = nvme_io_incapable(ns->ctrl) && + !blk_queue_dying(ns->queue); + + lockdep_assert_held(&ns->ctrl->namespaces_mutex); + + if (kill) { + blk_set_queue_dying(ns->queue); + + /* + * The controller was shutdown first if we got here through + * device removal. The shutdown may requeue outstanding + * requests. These need to be aborted immediately so + * del_gendisk doesn't block indefinitely for their completion. 
+ */ + blk_mq_abort_requeue_list(ns->queue); + } + if (ns->disk->flags & GENHD_FL_UP) { + if (blk_get_integrity(ns->disk)) + blk_integrity_unregister(ns->disk); + sysfs_remove_group(&disk_to_dev(ns->disk)->kobj, + &nvme_ns_attr_group); + del_gendisk(ns->disk); + } + if (kill || !blk_queue_dying(ns->queue)) { + blk_mq_abort_requeue_list(ns->queue); + blk_cleanup_queue(ns->queue); + } + list_del_init(&ns->list); + nvme_put_ns(ns); +} + +static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid) +{ + struct nvme_ns *ns; + + ns = nvme_find_ns(ctrl, nsid); + if (ns) { + if (revalidate_disk(ns->disk)) + nvme_ns_remove(ns); + } else + nvme_alloc_ns(ctrl, nsid); +} + +static int nvme_scan_ns_list(struct nvme_ctrl *ctrl, unsigned nn) +{ + struct nvme_ns *ns; + __le32 *ns_list; + unsigned i, j, nsid, prev = 0, num_lists = DIV_ROUND_UP(nn, 1024); + int ret = 0; + + ns_list = kzalloc(0x1000, GFP_KERNEL); + if (!ns_list) + return -ENOMEM; + + for (i = 0; i < num_lists; i++) { + ret = nvme_identify_ns_list(ctrl, prev, ns_list); + if (ret) + goto out; + + for (j = 0; j < min(nn, 1024U); j++) { + nsid = le32_to_cpu(ns_list[j]); + if (!nsid) + goto out; + + nvme_validate_ns(ctrl, nsid); + + while (++prev < nsid) { + ns = nvme_find_ns(ctrl, prev); + if (ns) + nvme_ns_remove(ns); + } + } + nn -= j; + } + out: + kfree(ns_list); + return ret; +} + +static void __nvme_scan_namespaces(struct nvme_ctrl *ctrl, unsigned nn) +{ + struct nvme_ns *ns, *next; + unsigned i; + + lockdep_assert_held(&ctrl->namespaces_mutex); + + for (i = 1; i <= nn; i++) + nvme_validate_ns(ctrl, i); + + list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) { + if (ns->ns_id > nn) + nvme_ns_remove(ns); + } +} + +void nvme_scan_namespaces(struct nvme_ctrl *ctrl) +{ + struct nvme_id_ctrl *id; + unsigned nn; + + if (nvme_identify_ctrl(ctrl, &id)) + return; + + mutex_lock(&ctrl->namespaces_mutex); + nn = le32_to_cpu(id->nn); + if (ctrl->vs >= NVME_VS(1, 1) && + !(ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)) { + if (!nvme_scan_ns_list(ctrl, nn)) + goto done; + } + __nvme_scan_namespaces(ctrl, le32_to_cpup(&id->nn)); + done: + list_sort(NULL, &ctrl->namespaces, ns_cmp); + mutex_unlock(&ctrl->namespaces_mutex); + kfree(id); +} + +void nvme_remove_namespaces(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns, *next; + + mutex_lock(&ctrl->namespaces_mutex); + list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) + nvme_ns_remove(ns); + mutex_unlock(&ctrl->namespaces_mutex); +} + +static DEFINE_IDA(nvme_instance_ida); + +static int nvme_set_instance(struct nvme_ctrl *ctrl) +{ + int instance, error; + + do { + if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL)) + return -ENODEV; + + spin_lock(&dev_list_lock); + error = ida_get_new(&nvme_instance_ida, &instance); + spin_unlock(&dev_list_lock); + } while (error == -EAGAIN); + + if (error) + return -ENODEV; + + ctrl->instance = instance; + return 0; +} + +static void nvme_release_instance(struct nvme_ctrl *ctrl) +{ + spin_lock(&dev_list_lock); + ida_remove(&nvme_instance_ida, ctrl->instance); + spin_unlock(&dev_list_lock); +} + +void nvme_uninit_ctrl(struct nvme_ctrl *ctrl) + { + device_destroy(nvme_class, MKDEV(nvme_char_major, ctrl->instance)); + + spin_lock(&dev_list_lock); + list_del(&ctrl->node); + spin_unlock(&dev_list_lock); +} + +static void nvme_free_ctrl(struct kref *kref) +{ + struct nvme_ctrl *ctrl = container_of(kref, struct nvme_ctrl, kref); + + put_device(ctrl->device); + nvme_release_instance(ctrl); + + ctrl->ops->free_ctrl(ctrl); +} + +void nvme_put_ctrl(struct nvme_ctrl 
*ctrl) +{ + kref_put(&ctrl->kref, nvme_free_ctrl); +} + +/* + * Initialize a NVMe controller structures. This needs to be called during + * earliest initialization so that we have the initialized structured around + * during probing. + */ +int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, + const struct nvme_ctrl_ops *ops, unsigned long quirks) +{ + int ret; + + INIT_LIST_HEAD(&ctrl->namespaces); + mutex_init(&ctrl->namespaces_mutex); + kref_init(&ctrl->kref); + ctrl->dev = dev; + ctrl->ops = ops; + ctrl->quirks = quirks; + + ret = nvme_set_instance(ctrl); + if (ret) + goto out; + + ctrl->device = device_create_with_groups(nvme_class, ctrl->dev, + MKDEV(nvme_char_major, ctrl->instance), + dev, nvme_dev_attr_groups, + "nvme%d", ctrl->instance); + if (IS_ERR(ctrl->device)) { + ret = PTR_ERR(ctrl->device); + goto out_release_instance; + } + get_device(ctrl->device); + dev_set_drvdata(ctrl->device, ctrl); + + spin_lock(&dev_list_lock); + list_add_tail(&ctrl->node, &nvme_ctrl_list); + spin_unlock(&dev_list_lock); + + return 0; +out_release_instance: + nvme_release_instance(ctrl); +out: + return ret; +} + +void nvme_stop_queues(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns; + + mutex_lock(&ctrl->namespaces_mutex); + list_for_each_entry(ns, &ctrl->namespaces, list) { + spin_lock_irq(ns->queue->queue_lock); + queue_flag_set(QUEUE_FLAG_STOPPED, ns->queue); + spin_unlock_irq(ns->queue->queue_lock); + + blk_mq_cancel_requeue_work(ns->queue); + blk_mq_stop_hw_queues(ns->queue); + } + mutex_unlock(&ctrl->namespaces_mutex); +} + +void nvme_start_queues(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns; + + mutex_lock(&ctrl->namespaces_mutex); + list_for_each_entry(ns, &ctrl->namespaces, list) { + queue_flag_clear_unlocked(QUEUE_FLAG_STOPPED, ns->queue); + blk_mq_start_stopped_hw_queues(ns->queue, true); + blk_mq_kick_requeue_list(ns->queue); + } + mutex_unlock(&ctrl->namespaces_mutex); +} + +int __init nvme_core_init(void) +{ + int result; + + result = register_blkdev(nvme_major, "nvme"); + if (result < 0) + return result; + else if (result > 0) + nvme_major = result; + + result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme", + &nvme_dev_fops); + if (result < 0) + goto unregister_blkdev; + else if (result > 0) + nvme_char_major = result; + + nvme_class = class_create(THIS_MODULE, "nvme"); + if (IS_ERR(nvme_class)) { + result = PTR_ERR(nvme_class); + goto unregister_chrdev; + } + + return 0; + + unregister_chrdev: + __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); + unregister_blkdev: + unregister_blkdev(nvme_major, "nvme"); + return result; +} + +void nvme_core_exit(void) +{ + unregister_blkdev(nvme_major, "nvme"); + class_destroy(nvme_class); + __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); +} diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 1af54ea20e7b..5cd3725e2fa4 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -146,6 +146,16 @@ struct nvme_nvm_command { }; }; +struct nvme_nvm_lp_mlc { + __u16 num_pairs; + __u8 pairs[886]; +}; + +struct nvme_nvm_lp_tbl { + __u8 id[8]; + struct nvme_nvm_lp_mlc mlc; +}; + struct nvme_nvm_id_group { __u8 mtype; __u8 fmtype; @@ -169,7 +179,8 @@ struct nvme_nvm_id_group { __le32 mpos; __le32 mccap; __le16 cpar; - __u8 reserved[906]; + __u8 reserved[10]; + struct nvme_nvm_lp_tbl lptbl; } __packed; struct nvme_nvm_addr_format { @@ -266,6 +277,15 @@ static int init_grps(struct nvm_id *nvm_id, struct nvme_nvm_id *nvme_nvm_id) dst->mccap = 
le32_to_cpu(src->mccap); dst->cpar = le16_to_cpu(src->cpar); + + if (dst->fmtype == NVM_ID_FMTYPE_MLC) { + memcpy(dst->lptbl.id, src->lptbl.id, 8); + dst->lptbl.mlc.num_pairs = + le16_to_cpu(src->lptbl.mlc.num_pairs); + /* 4 bits per pair */ + memcpy(dst->lptbl.mlc.pairs, src->lptbl.mlc.pairs, + dst->lptbl.mlc.num_pairs >> 1); + } } return 0; @@ -274,7 +294,6 @@ static int init_grps(struct nvm_id *nvm_id, struct nvme_nvm_id *nvme_nvm_id) static int nvme_nvm_identity(struct nvm_dev *nvmdev, struct nvm_id *nvm_id) { struct nvme_ns *ns = nvmdev->q->queuedata; - struct nvme_dev *dev = ns->dev; struct nvme_nvm_id *nvme_nvm_id; struct nvme_nvm_command c = {}; int ret; @@ -287,7 +306,7 @@ static int nvme_nvm_identity(struct nvm_dev *nvmdev, struct nvm_id *nvm_id) if (!nvme_nvm_id) return -ENOMEM; - ret = nvme_submit_sync_cmd(dev->admin_q, (struct nvme_command *)&c, + ret = nvme_submit_sync_cmd(ns->ctrl->admin_q, (struct nvme_command *)&c, nvme_nvm_id, sizeof(struct nvme_nvm_id)); if (ret) { ret = -EIO; @@ -312,9 +331,8 @@ static int nvme_nvm_get_l2p_tbl(struct nvm_dev *nvmdev, u64 slba, u32 nlb, nvm_l2p_update_fn *update_l2p, void *priv) { struct nvme_ns *ns = nvmdev->q->queuedata; - struct nvme_dev *dev = ns->dev; struct nvme_nvm_command c = {}; - u32 len = queue_max_hw_sectors(dev->admin_q) << 9; + u32 len = queue_max_hw_sectors(ns->ctrl->admin_q) << 9; u32 nlb_pr_rq = len / sizeof(u64); u64 cmd_slba = slba; void *entries; @@ -332,10 +350,10 @@ static int nvme_nvm_get_l2p_tbl(struct nvm_dev *nvmdev, u64 slba, u32 nlb, c.l2p.slba = cpu_to_le64(cmd_slba); c.l2p.nlb = cpu_to_le32(cmd_nlb); - ret = nvme_submit_sync_cmd(dev->admin_q, + ret = nvme_submit_sync_cmd(ns->ctrl->admin_q, (struct nvme_command *)&c, entries, len); if (ret) { - dev_err(dev->dev, "L2P table transfer failed (%d)\n", + dev_err(ns->ctrl->dev, "L2P table transfer failed (%d)\n", ret); ret = -EIO; goto out; @@ -361,7 +379,7 @@ static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa, { struct request_queue *q = nvmdev->q; struct nvme_ns *ns = q->queuedata; - struct nvme_dev *dev = ns->dev; + struct nvme_ctrl *ctrl = ns->ctrl; struct nvme_nvm_command c = {}; struct nvme_nvm_bb_tbl *bb_tbl; int tblsz = sizeof(struct nvme_nvm_bb_tbl) + nr_blocks; @@ -375,41 +393,36 @@ static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa, if (!bb_tbl) return -ENOMEM; - ret = nvme_submit_sync_cmd(dev->admin_q, (struct nvme_command *)&c, + ret = nvme_submit_sync_cmd(ctrl->admin_q, (struct nvme_command *)&c, bb_tbl, tblsz); if (ret) { - dev_err(dev->dev, "get bad block table failed (%d)\n", ret); + dev_err(ctrl->dev, "get bad block table failed (%d)\n", ret); ret = -EIO; goto out; } if (bb_tbl->tblid[0] != 'B' || bb_tbl->tblid[1] != 'B' || bb_tbl->tblid[2] != 'L' || bb_tbl->tblid[3] != 'T') { - dev_err(dev->dev, "bbt format mismatch\n"); + dev_err(ctrl->dev, "bbt format mismatch\n"); ret = -EINVAL; goto out; } if (le16_to_cpu(bb_tbl->verid) != 1) { ret = -EINVAL; - dev_err(dev->dev, "bbt version not supported\n"); + dev_err(ctrl->dev, "bbt version not supported\n"); goto out; } if (le32_to_cpu(bb_tbl->tblks) != nr_blocks) { ret = -EINVAL; - dev_err(dev->dev, "bbt unsuspected blocks returned (%u!=%u)", + dev_err(ctrl->dev, "bbt unsuspected blocks returned (%u!=%u)", le32_to_cpu(bb_tbl->tblks), nr_blocks); goto out; } ppa = dev_to_generic_addr(nvmdev, ppa); ret = update_bbtbl(ppa, nr_blocks, bb_tbl->blk, priv); - if (ret) { - ret = -EINTR; - goto out; - } - out: kfree(bb_tbl); return ret; @@ -419,7 +432,6 @@ static 
int nvme_nvm_set_bb_tbl(struct nvm_dev *nvmdev, struct nvm_rq *rqd, int type) { struct nvme_ns *ns = nvmdev->q->queuedata; - struct nvme_dev *dev = ns->dev; struct nvme_nvm_command c = {}; int ret = 0; @@ -429,10 +441,10 @@ static int nvme_nvm_set_bb_tbl(struct nvm_dev *nvmdev, struct nvm_rq *rqd, c.set_bb.nlb = cpu_to_le16(rqd->nr_pages - 1); c.set_bb.value = type; - ret = nvme_submit_sync_cmd(dev->admin_q, (struct nvme_command *)&c, + ret = nvme_submit_sync_cmd(ns->ctrl->admin_q, (struct nvme_command *)&c, NULL, 0); if (ret) - dev_err(dev->dev, "set bad block table failed (%d)\n", ret); + dev_err(ns->ctrl->dev, "set bad block table failed (%d)\n", ret); return ret; } @@ -453,11 +465,8 @@ static inline void nvme_nvm_rqtocmd(struct request *rq, struct nvm_rq *rqd, static void nvme_nvm_end_io(struct request *rq, int error) { struct nvm_rq *rqd = rq->end_io_data; - struct nvm_dev *dev = rqd->dev; - if (dev->mt && dev->mt->end_io(rqd, error)) - pr_err("nvme: err status: %x result: %lx\n", - rq->errors, (unsigned long)rq->special); + nvm_end_io(rqd, error); kfree(rq->cmd); blk_mq_free_request(rq); @@ -520,9 +529,8 @@ static int nvme_nvm_erase_block(struct nvm_dev *dev, struct nvm_rq *rqd) static void *nvme_nvm_create_dma_pool(struct nvm_dev *nvmdev, char *name) { struct nvme_ns *ns = nvmdev->q->queuedata; - struct nvme_dev *dev = ns->dev; - return dma_pool_create(name, dev->dev, PAGE_SIZE, PAGE_SIZE, 0); + return dma_pool_create(name, ns->ctrl->dev, PAGE_SIZE, PAGE_SIZE, 0); } static void nvme_nvm_destroy_dma_pool(void *pool) @@ -580,8 +588,9 @@ void nvme_nvm_unregister(struct request_queue *q, char *disk_name) int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *id) { - struct nvme_dev *dev = ns->dev; - struct pci_dev *pdev = to_pci_dev(dev->dev); + struct nvme_ctrl *ctrl = ns->ctrl; + /* XXX: this is poking into PCI structures from generic code! */ + struct pci_dev *pdev = to_pci_dev(ctrl->dev); /* QEMU NVMe simulator - PCI ID + Vendor specific bit */ if (pdev->vendor == PCI_VENDOR_ID_CNEX && diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 044253dca30a..4fb5bb737868 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -19,58 +19,77 @@ #include <linux/kref.h> #include <linux/blk-mq.h> +enum { + /* + * Driver internal status code for commands that were cancelled due + * to timeouts or controller shutdown. The value is negative so + * that it a) doesn't overlap with the unsigned hardware error codes, + * and b) can easily be tested for. + */ + NVME_SC_CANCELLED = -EINTR, +}; + extern unsigned char nvme_io_timeout; #define NVME_IO_TIMEOUT (nvme_io_timeout * HZ) +extern unsigned char admin_timeout; +#define ADMIN_TIMEOUT (admin_timeout * HZ) + +extern unsigned char shutdown_timeout; +#define SHUTDOWN_TIMEOUT (shutdown_timeout * HZ) + enum { NVME_NS_LBA = 0, NVME_NS_LIGHTNVM = 1, }; /* - * Represents an NVM Express device. Each nvme_dev is a PCI function. + * List of workarounds for devices that required behavior not specified in + * the standard. */ -struct nvme_dev { - struct list_head node; - struct nvme_queue **queues; +enum nvme_quirks { + /* + * Prefers I/O aligned to a stripe size specified in a vendor + * specific Identify field. + */ + NVME_QUIRK_STRIPE_SIZE = (1 << 0), + + /* + * The controller doesn't handle Identify values other than 0 or 1 + * correctly. 
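Since ctrl->quirks is a plain bitmask over enum nvme_quirks, call sites test for a workaround with a simple mask. A minimal sketch of such a check (the helper name and the fall-back-to-CNS-0 policy here are illustrative, not part of this patch):

/* Illustrative only: restrict Identify CNS values for controllers with
 * NVME_QUIRK_IDENTIFY_CNS, which only handle CNS 0 and 1 correctly. */
static u32 nvme_clamp_cns(struct nvme_ctrl *ctrl, u32 cns)
{
	if ((ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS) && cns > 1)
		return 0;
	return cns;
}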
+ */ + NVME_QUIRK_IDENTIFY_CNS = (1 << 1), +}; + +struct nvme_ctrl { + const struct nvme_ctrl_ops *ops; struct request_queue *admin_q; - struct blk_mq_tag_set tagset; - struct blk_mq_tag_set admin_tagset; - u32 __iomem *dbs; struct device *dev; - struct dma_pool *prp_page_pool; - struct dma_pool *prp_small_pool; + struct kref kref; int instance; - unsigned queue_count; - unsigned online_queues; - unsigned max_qid; - int q_depth; - u32 db_stride; - u32 ctrl_config; - struct msix_entry *entry; - struct nvme_bar __iomem *bar; + struct blk_mq_tag_set *tagset; struct list_head namespaces; - struct kref kref; - struct device *device; - struct work_struct reset_work; - struct work_struct probe_work; - struct work_struct scan_work; + struct mutex namespaces_mutex; + struct device *device; /* char device */ + struct list_head node; + char name[12]; char serial[20]; char model[40]; char firmware_rev[8]; - bool subsystem; + + u32 ctrl_config; + + u32 page_size; u32 max_hw_sectors; u32 stripe_size; - u32 page_size; - void __iomem *cmb; - dma_addr_t cmb_dma_addr; - u64 cmb_size; - u32 cmbsz; u16 oncs; - u16 abort_limit; + atomic_t abort_limit; u8 event_limit; u8 vwc; + u32 vs; + bool subsystem; + unsigned long quirks; }; /* @@ -79,11 +98,14 @@ struct nvme_dev { struct nvme_ns { struct list_head list; - struct nvme_dev *dev; + struct nvme_ctrl *ctrl; struct request_queue *queue; struct gendisk *disk; struct kref kref; + u8 eui[8]; + u8 uuid[16]; + unsigned ns_id; int lba_shift; u16 ms; @@ -94,41 +116,156 @@ struct nvme_ns { u32 mode_select_block_len; }; -/* - * The nvme_iod describes the data in an I/O, including the list of PRP - * entries. You can't see it in this data structure because C doesn't let - * me express that. Use nvme_alloc_iod to ensure there's enough space - * allocated to store the PRP list. - */ -struct nvme_iod { - unsigned long private; /* For the use of the submitter of the I/O */ - int npages; /* In the PRP list. 
0 means small pool in use */ - int offset; /* Of PRP list */ - int nents; /* Used in scatterlist */ - int length; /* Of data, in bytes */ - dma_addr_t first_dma; - struct scatterlist meta_sg[1]; /* metadata requires single contiguous buffer */ - struct scatterlist sg[0]; +struct nvme_ctrl_ops { + int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val); + int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val); + int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val); + bool (*io_incapable)(struct nvme_ctrl *ctrl); + int (*reset_ctrl)(struct nvme_ctrl *ctrl); + void (*free_ctrl)(struct nvme_ctrl *ctrl); }; +static inline bool nvme_ctrl_ready(struct nvme_ctrl *ctrl) +{ + u32 val = 0; + + if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &val)) + return false; + return val & NVME_CSTS_RDY; +} + +static inline bool nvme_io_incapable(struct nvme_ctrl *ctrl) +{ + u32 val = 0; + + if (ctrl->ops->io_incapable(ctrl)) + return true; + if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &val)) + return false; + return val & NVME_CSTS_CFS; +} + +static inline int nvme_reset_subsystem(struct nvme_ctrl *ctrl) +{ + if (!ctrl->subsystem) + return -ENOTTY; + return ctrl->ops->reg_write32(ctrl, NVME_REG_NSSR, 0x4E564D65); +} + static inline u64 nvme_block_nr(struct nvme_ns *ns, sector_t sector) { return (sector >> (ns->lba_shift - 9)); } +static inline void nvme_setup_flush(struct nvme_ns *ns, + struct nvme_command *cmnd) +{ + memset(cmnd, 0, sizeof(*cmnd)); + cmnd->common.opcode = nvme_cmd_flush; + cmnd->common.nsid = cpu_to_le32(ns->ns_id); +} + +static inline void nvme_setup_rw(struct nvme_ns *ns, struct request *req, + struct nvme_command *cmnd) +{ + u16 control = 0; + u32 dsmgmt = 0; + + if (req->cmd_flags & REQ_FUA) + control |= NVME_RW_FUA; + if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD)) + control |= NVME_RW_LR; + + if (req->cmd_flags & REQ_RAHEAD) + dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; + + memset(cmnd, 0, sizeof(*cmnd)); + cmnd->rw.opcode = (rq_data_dir(req) ? 
nvme_cmd_write : nvme_cmd_read); + cmnd->rw.command_id = req->tag; + cmnd->rw.nsid = cpu_to_le32(ns->ns_id); + cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); + cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); + + if (ns->ms) { + switch (ns->pi_type) { + case NVME_NS_DPS_PI_TYPE3: + control |= NVME_RW_PRINFO_PRCHK_GUARD; + break; + case NVME_NS_DPS_PI_TYPE1: + case NVME_NS_DPS_PI_TYPE2: + control |= NVME_RW_PRINFO_PRCHK_GUARD | + NVME_RW_PRINFO_PRCHK_REF; + cmnd->rw.reftag = cpu_to_le32( + nvme_block_nr(ns, blk_rq_pos(req))); + break; + } + if (!blk_integrity_rq(req)) + control |= NVME_RW_PRINFO_PRACT; + } + + cmnd->rw.control = cpu_to_le16(control); + cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); +} + + +static inline int nvme_error_status(u16 status) +{ + switch (status & 0x7ff) { + case NVME_SC_SUCCESS: + return 0; + case NVME_SC_CAP_EXCEEDED: + return -ENOSPC; + default: + return -EIO; + } +} + +static inline bool nvme_req_needs_retry(struct request *req, u16 status) +{ + return !(status & NVME_SC_DNR || blk_noretry_request(req)) && + (jiffies - req->start_time) < req->timeout; +} + +int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap); +int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap); +int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl); +int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, + const struct nvme_ctrl_ops *ops, unsigned long quirks); +void nvme_uninit_ctrl(struct nvme_ctrl *ctrl); +void nvme_put_ctrl(struct nvme_ctrl *ctrl); +int nvme_init_identify(struct nvme_ctrl *ctrl); + +void nvme_scan_namespaces(struct nvme_ctrl *ctrl); +void nvme_remove_namespaces(struct nvme_ctrl *ctrl); + +void nvme_stop_queues(struct nvme_ctrl *ctrl); +void nvme_start_queues(struct nvme_ctrl *ctrl); + +struct request *nvme_alloc_request(struct request_queue *q, + struct nvme_command *cmd, unsigned int flags); +void nvme_requeue_req(struct request *req); int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, void *buf, unsigned bufflen); int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, - void *buffer, void __user *ubuffer, unsigned bufflen, + void *buffer, unsigned bufflen, u32 *result, unsigned timeout); +int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, + void __user *ubuffer, unsigned bufflen, u32 *result, + unsigned timeout); +int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, + void __user *ubuffer, unsigned bufflen, + void __user *meta_buffer, unsigned meta_len, u32 meta_seed, u32 *result, unsigned timeout); -int nvme_identify_ctrl(struct nvme_dev *dev, struct nvme_id_ctrl **id); -int nvme_identify_ns(struct nvme_dev *dev, unsigned nsid, +int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id); +int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid, struct nvme_id_ns **id); -int nvme_get_log_page(struct nvme_dev *dev, struct nvme_smart_log **log); -int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid, +int nvme_get_log_page(struct nvme_ctrl *dev, struct nvme_smart_log **log); +int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid, dma_addr_t dma_addr, u32 *result); -int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11, +int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, dma_addr_t dma_addr, u32 *result); +int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); + +extern spinlock_t dev_list_lock; struct sg_io_hdr; @@ -154,4 +291,7 @@ 
static inline int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *i } #endif /* CONFIG_NVM */ +int __init nvme_core_init(void); +void nvme_core_exit(void); + #endif /* _NVME_H */ diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index f5c0e2613c7c..72ef8322d32a 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -12,6 +12,7 @@ * more details. */ +#include <linux/aer.h> #include <linux/bitops.h> #include <linux/blkdev.h> #include <linux/blk-mq.h> @@ -28,10 +29,10 @@ #include <linux/kdev_t.h> #include <linux/kthread.h> #include <linux/kernel.h> -#include <linux/list_sort.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/moduleparam.h> +#include <linux/mutex.h> #include <linux/pci.h> #include <linux/poison.h> #include <linux/ptrace.h> @@ -39,23 +40,24 @@ #include <linux/slab.h> #include <linux/t10-pi.h> #include <linux/types.h> -#include <linux/pr.h> -#include <scsi/sg.h> #include <linux/io-64-nonatomic-lo-hi.h> #include <asm/unaligned.h> -#include <uapi/linux/nvme_ioctl.h> #include "nvme.h" -#define NVME_MINORS (1U << MINORBITS) #define NVME_Q_DEPTH 1024 #define NVME_AQ_DEPTH 256 #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) -#define ADMIN_TIMEOUT (admin_timeout * HZ) -#define SHUTDOWN_TIMEOUT (shutdown_timeout * HZ) + +/* + * We handle AEN commands ourselves and don't even let the + * block layer know about them. + */ +#define NVME_NR_AEN_COMMANDS 1 +#define NVME_AQ_BLKMQ_DEPTH (NVME_AQ_DEPTH - NVME_NR_AEN_COMMANDS) -static unsigned char admin_timeout = 60; +unsigned char admin_timeout = 60; module_param(admin_timeout, byte, 0644); MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands"); @@ -63,16 +65,10 @@ unsigned char nvme_io_timeout = 30; module_param_named(io_timeout, nvme_io_timeout, byte, 0644); MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O"); -static unsigned char shutdown_timeout = 5; +unsigned char shutdown_timeout = 5; module_param(shutdown_timeout, byte, 0644); MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown"); -static int nvme_major; -module_param(nvme_major, int, 0); - -static int nvme_char_major; -module_param(nvme_char_major, int, 0); - static int use_threaded_interrupts; module_param(use_threaded_interrupts, int, 0); @@ -80,28 +76,60 @@ static bool use_cmb_sqes = true; module_param(use_cmb_sqes, bool, 0644); MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes"); -static DEFINE_SPINLOCK(dev_list_lock); static LIST_HEAD(dev_list); static struct task_struct *nvme_thread; static struct workqueue_struct *nvme_workq; static wait_queue_head_t nvme_kthread_wait; -static struct class *nvme_class; +struct nvme_dev; +struct nvme_queue; -static int __nvme_reset(struct nvme_dev *dev); static int nvme_reset(struct nvme_dev *dev); static void nvme_process_cq(struct nvme_queue *nvmeq); -static void nvme_dead_ctrl(struct nvme_dev *dev); +static void nvme_remove_dead_ctrl(struct nvme_dev *dev); +static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown); -struct async_cmd_info { - struct kthread_work work; - struct kthread_worker *worker; - struct request *req; - u32 result; - int status; - void *ctx; +/* + * Represents an NVM Express device. Each nvme_dev is a PCI function. 
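NVME_AQ_BLKMQ_DEPTH splits the admin tag space: blk-mq only ever allocates tags below it, so on the admin queue any completed command id at or above that value must be the driver's own AEN. A sketch of the classification (the helper name is illustrative; the same test appears open-coded in __nvme_process_cq() later in this patch):

/* Sketch: with the default NVME_AQ_DEPTH of 256, blk-mq admin tags always
 * stay below NVME_AQ_BLKMQ_DEPTH (255); command ids from 255 upward are
 * driver-submitted AENs. */
static bool nvme_cqe_is_aen(struct nvme_queue *nvmeq,
		const struct nvme_completion *cqe)
{
	return nvmeq->qid == 0 && cqe->command_id >= NVME_AQ_BLKMQ_DEPTH;
}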
+ */ +struct nvme_dev { + struct list_head node; + struct nvme_queue **queues; + struct blk_mq_tag_set tagset; + struct blk_mq_tag_set admin_tagset; + u32 __iomem *dbs; + struct device *dev; + struct dma_pool *prp_page_pool; + struct dma_pool *prp_small_pool; + unsigned queue_count; + unsigned online_queues; + unsigned max_qid; + int q_depth; + u32 db_stride; + struct msix_entry *entry; + void __iomem *bar; + struct work_struct reset_work; + struct work_struct scan_work; + struct work_struct remove_work; + struct mutex shutdown_lock; + bool subsystem; + void __iomem *cmb; + dma_addr_t cmb_dma_addr; + u64 cmb_size; + u32 cmbsz; + unsigned long flags; + +#define NVME_CTRL_RESETTING 0 + + struct nvme_ctrl ctrl; + struct completion ioq_wait; }; +static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl) +{ + return container_of(ctrl, struct nvme_dev, ctrl); +} + /* * An NVM Express queue. Each device has at least two (one for admin * commands and one for I/O commands). @@ -126,7 +154,24 @@ struct nvme_queue { u16 qid; u8 cq_phase; u8 cqe_seen; - struct async_cmd_info cmdinfo; +}; + +/* + * The nvme_iod describes the data in an I/O, including the list of PRP + * entries. You can't see it in this data structure because C doesn't let + * me express that. Use nvme_init_iod to ensure there's enough space + * allocated to store the PRP list. + */ +struct nvme_iod { + struct nvme_queue *nvmeq; + int aborted; + int npages; /* In the PRP list. 0 means small pool in use */ + int nents; /* Used in scatterlist */ + int length; /* Of data, in bytes */ + dma_addr_t first_dma; + struct scatterlist meta_sg; /* metadata requires single contiguous buffer */ + struct scatterlist *sg; + struct scatterlist inline_sg[0]; }; /* @@ -148,23 +193,11 @@ static inline void _nvme_check_size(void) BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512); } -typedef void (*nvme_completion_fn)(struct nvme_queue *, void *, - struct nvme_completion *); - -struct nvme_cmd_info { - nvme_completion_fn fn; - void *ctx; - int aborted; - struct nvme_queue *nvmeq; - struct nvme_iod iod[0]; -}; - /* * Max size of iod being embedded in the request payload */ #define NVME_INT_PAGES 2 -#define NVME_INT_BYTES(dev) (NVME_INT_PAGES * (dev)->page_size) -#define NVME_INT_MASK 0x01 +#define NVME_INT_BYTES(dev) (NVME_INT_PAGES * (dev)->ctrl.page_size) /* * Will slightly overestimate the number of pages needed. 
This is OK @@ -173,19 +206,22 @@ struct nvme_cmd_info { */ static int nvme_npages(unsigned size, struct nvme_dev *dev) { - unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size); + unsigned nprps = DIV_ROUND_UP(size + dev->ctrl.page_size, + dev->ctrl.page_size); return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); } -static unsigned int nvme_cmd_size(struct nvme_dev *dev) +static unsigned int nvme_iod_alloc_size(struct nvme_dev *dev, + unsigned int size, unsigned int nseg) { - unsigned int ret = sizeof(struct nvme_cmd_info); - - ret += sizeof(struct nvme_iod); - ret += sizeof(__le64 *) * nvme_npages(NVME_INT_BYTES(dev), dev); - ret += sizeof(struct scatterlist) * NVME_INT_PAGES; + return sizeof(__le64 *) * nvme_npages(size, dev) + + sizeof(struct scatterlist) * nseg; +} - return ret; +static unsigned int nvme_cmd_size(struct nvme_dev *dev) +{ + return sizeof(struct nvme_iod) + + nvme_iod_alloc_size(dev, NVME_INT_BYTES(dev), NVME_INT_PAGES); } static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, @@ -215,11 +251,11 @@ static int nvme_admin_init_request(void *data, struct request *req, unsigned int numa_node) { struct nvme_dev *dev = data; - struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); struct nvme_queue *nvmeq = dev->queues[0]; BUG_ON(!nvmeq); - cmd->nvmeq = nvmeq; + iod->nvmeq = nvmeq; return 0; } @@ -242,148 +278,36 @@ static int nvme_init_request(void *data, struct request *req, unsigned int numa_node) { struct nvme_dev *dev = data; - struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1]; BUG_ON(!nvmeq); - cmd->nvmeq = nvmeq; + iod->nvmeq = nvmeq; return 0; } -static void nvme_set_info(struct nvme_cmd_info *cmd, void *ctx, - nvme_completion_fn handler) -{ - cmd->fn = handler; - cmd->ctx = ctx; - cmd->aborted = 0; - blk_mq_start_request(blk_mq_rq_from_pdu(cmd)); -} - -static void *iod_get_private(struct nvme_iod *iod) -{ - return (void *) (iod->private & ~0x1UL); -} - -/* - * If bit 0 is set, the iod is embedded in the request payload. 
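The request PDU is sized once at tagset setup so that small I/Os never need a per-command allocation; a worked example of the arithmetic, assuming 4 KiB pages on both the host and the controller:

/* Worked example (PAGE_SIZE == ctrl.page_size == 4096):
 *   NVME_INT_BYTES(dev) = 2 * 4096 = 8192
 *   nvme_npages(8192)   = DIV_ROUND_UP(8 * 3, 4096 - 8) = 1
 *   nvme_cmd_size(dev)  = sizeof(struct nvme_iod)
 *                       + 1 * sizeof(__le64 *)            (PRP list pages)
 *                       + 2 * sizeof(struct scatterlist)  (inline SG entries)
 * so any request with at most 2 segments and 8 KiB of data is served
 * entirely from the preallocated PDU, and nvme_init_iod() only falls
 * back to kmalloc() beyond that.
 */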
- */ -static bool iod_should_kfree(struct nvme_iod *iod) -{ - return (iod->private & NVME_INT_MASK) == 0; -} - -/* Special values must be less than 0x1000 */ -#define CMD_CTX_BASE ((void *)POISON_POINTER_DELTA) -#define CMD_CTX_CANCELLED (0x30C + CMD_CTX_BASE) -#define CMD_CTX_COMPLETED (0x310 + CMD_CTX_BASE) -#define CMD_CTX_INVALID (0x314 + CMD_CTX_BASE) - -static void special_completion(struct nvme_queue *nvmeq, void *ctx, - struct nvme_completion *cqe) -{ - if (ctx == CMD_CTX_CANCELLED) - return; - if (ctx == CMD_CTX_COMPLETED) { - dev_warn(nvmeq->q_dmadev, - "completed id %d twice on queue %d\n", - cqe->command_id, le16_to_cpup(&cqe->sq_id)); - return; - } - if (ctx == CMD_CTX_INVALID) { - dev_warn(nvmeq->q_dmadev, - "invalid id %d completed on queue %d\n", - cqe->command_id, le16_to_cpup(&cqe->sq_id)); - return; - } - dev_warn(nvmeq->q_dmadev, "Unknown special completion %p\n", ctx); -} - -static void *cancel_cmd_info(struct nvme_cmd_info *cmd, nvme_completion_fn *fn) -{ - void *ctx; - - if (fn) - *fn = cmd->fn; - ctx = cmd->ctx; - cmd->fn = special_completion; - cmd->ctx = CMD_CTX_CANCELLED; - return ctx; -} - -static void async_req_completion(struct nvme_queue *nvmeq, void *ctx, - struct nvme_completion *cqe) +static void nvme_complete_async_event(struct nvme_dev *dev, + struct nvme_completion *cqe) { - u32 result = le32_to_cpup(&cqe->result); - u16 status = le16_to_cpup(&cqe->status) >> 1; + u16 status = le16_to_cpu(cqe->status) >> 1; + u32 result = le32_to_cpu(cqe->result); if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ) - ++nvmeq->dev->event_limit; + ++dev->ctrl.event_limit; if (status != NVME_SC_SUCCESS) return; switch (result & 0xff07) { case NVME_AER_NOTICE_NS_CHANGED: - dev_info(nvmeq->q_dmadev, "rescanning\n"); - schedule_work(&nvmeq->dev->scan_work); + dev_info(dev->dev, "rescanning\n"); + queue_work(nvme_workq, &dev->scan_work); default: - dev_warn(nvmeq->q_dmadev, "async event result %08x\n", result); + dev_warn(dev->dev, "async event result %08x\n", result); } } -static void abort_completion(struct nvme_queue *nvmeq, void *ctx, - struct nvme_completion *cqe) -{ - struct request *req = ctx; - - u16 status = le16_to_cpup(&cqe->status) >> 1; - u32 result = le32_to_cpup(&cqe->result); - - blk_mq_free_request(req); - - dev_warn(nvmeq->q_dmadev, "Abort status:%x result:%x", status, result); - ++nvmeq->dev->abort_limit; -} - -static void async_completion(struct nvme_queue *nvmeq, void *ctx, - struct nvme_completion *cqe) -{ - struct async_cmd_info *cmdinfo = ctx; - cmdinfo->result = le32_to_cpup(&cqe->result); - cmdinfo->status = le16_to_cpup(&cqe->status) >> 1; - queue_kthread_work(cmdinfo->worker, &cmdinfo->work); - blk_mq_free_request(cmdinfo->req); -} - -static inline struct nvme_cmd_info *get_cmd_from_tag(struct nvme_queue *nvmeq, - unsigned int tag) -{ - struct request *req = blk_mq_tag_to_rq(*nvmeq->tags, tag); - - return blk_mq_rq_to_pdu(req); -} - -/* - * Called with local interrupts disabled and the q_lock held. May not sleep. 
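The 0xff07 mask in nvme_complete_async_event() keeps the two AER completion fields acted on here: the event type in bits 2:0 and the event information in bits 15:8 of the result dword (per the NVMe 1.2 Asynchronous Event Request completion layout). A hypothetical decoder making the fields explicit:

/* Hypothetical helper: split an AER result dword into its fields. */
static void nvme_aer_decode(u32 result, u8 *event_type, u8 *event_info)
{
	*event_type = result & 0x7;		/* error, SMART, notice, ... */
	*event_info = (result >> 8) & 0xff;	/* e.g. namespace changed */
}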
- */ -static void *nvme_finish_cmd(struct nvme_queue *nvmeq, int tag, - nvme_completion_fn *fn) -{ - struct nvme_cmd_info *cmd = get_cmd_from_tag(nvmeq, tag); - void *ctx; - if (tag >= nvmeq->q_depth) { - *fn = special_completion; - return CMD_CTX_INVALID; - } - if (fn) - *fn = cmd->fn; - ctx = cmd->ctx; - cmd->fn = special_completion; - cmd->ctx = CMD_CTX_COMPLETED; - return ctx; -} - /** - * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell + * __nvme_submit_cmd() - Copy a command into a queue and ring the doorbell * @nvmeq: The queue to use * @cmd: The command to send * @@ -405,69 +329,44 @@ static void __nvme_submit_cmd(struct nvme_queue *nvmeq, nvmeq->sq_tail = tail; } -static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) -{ - unsigned long flags; - spin_lock_irqsave(&nvmeq->q_lock, flags); - __nvme_submit_cmd(nvmeq, cmd); - spin_unlock_irqrestore(&nvmeq->q_lock, flags); -} - -static __le64 **iod_list(struct nvme_iod *iod) +static __le64 **iod_list(struct request *req) { - return ((void *)iod) + iod->offset; + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + return (__le64 **)(iod->sg + req->nr_phys_segments); } -static inline void iod_init(struct nvme_iod *iod, unsigned nbytes, - unsigned nseg, unsigned long private) +static int nvme_init_iod(struct request *rq, struct nvme_dev *dev) { - iod->private = private; - iod->offset = offsetof(struct nvme_iod, sg[nseg]); - iod->npages = -1; - iod->length = nbytes; - iod->nents = 0; -} - -static struct nvme_iod * -__nvme_alloc_iod(unsigned nseg, unsigned bytes, struct nvme_dev *dev, - unsigned long priv, gfp_t gfp) -{ - struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) + - sizeof(__le64 *) * nvme_npages(bytes, dev) + - sizeof(struct scatterlist) * nseg, gfp); - - if (iod) - iod_init(iod, bytes, nseg, priv); - - return iod; -} - -static struct nvme_iod *nvme_alloc_iod(struct request *rq, struct nvme_dev *dev, - gfp_t gfp) -{ - unsigned size = !(rq->cmd_flags & REQ_DISCARD) ? 
blk_rq_bytes(rq) : - sizeof(struct nvme_dsm_range); - struct nvme_iod *iod; + struct nvme_iod *iod = blk_mq_rq_to_pdu(rq); + int nseg = rq->nr_phys_segments; + unsigned size; - if (rq->nr_phys_segments <= NVME_INT_PAGES && - size <= NVME_INT_BYTES(dev)) { - struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(rq); + if (rq->cmd_flags & REQ_DISCARD) + size = sizeof(struct nvme_dsm_range); + else + size = blk_rq_bytes(rq); - iod = cmd->iod; - iod_init(iod, size, rq->nr_phys_segments, - (unsigned long) rq | NVME_INT_MASK); - return iod; + if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) { + iod->sg = kmalloc(nvme_iod_alloc_size(dev, size, nseg), GFP_ATOMIC); + if (!iod->sg) + return BLK_MQ_RQ_QUEUE_BUSY; + } else { + iod->sg = iod->inline_sg; } - return __nvme_alloc_iod(rq->nr_phys_segments, size, dev, - (unsigned long) rq, gfp); + iod->aborted = 0; + iod->npages = -1; + iod->nents = 0; + iod->length = size; + return 0; } -static void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) +static void nvme_free_iod(struct nvme_dev *dev, struct request *req) { - const int last_prp = dev->page_size / 8 - 1; + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + const int last_prp = dev->ctrl.page_size / 8 - 1; int i; - __le64 **list = iod_list(iod); + __le64 **list = iod_list(req); dma_addr_t prp_dma = iod->first_dma; if (iod->npages == 0) @@ -479,20 +378,8 @@ static void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) prp_dma = next_prp_dma; } - if (iod_should_kfree(iod)) - kfree(iod); -} - -static int nvme_error_status(u16 status) -{ - switch (status & 0x7ff) { - case NVME_SC_SUCCESS: - return 0; - case NVME_SC_CAP_EXCEEDED: - return -ENOSPC; - default: - return -EIO; - } + if (iod->sg != iod->inline_sg) + kfree(iod->sg); } #ifdef CONFIG_BLK_DEV_INTEGRITY @@ -549,27 +436,6 @@ static void nvme_dif_remap(struct request *req, } kunmap_atomic(pmap); } - -static void nvme_init_integrity(struct nvme_ns *ns) -{ - struct blk_integrity integrity; - - switch (ns->pi_type) { - case NVME_NS_DPS_PI_TYPE3: - integrity.profile = &t10_pi_type3_crc; - break; - case NVME_NS_DPS_PI_TYPE1: - case NVME_NS_DPS_PI_TYPE2: - integrity.profile = &t10_pi_type1_crc; - break; - default: - integrity.profile = NULL; - break; - } - integrity.tuple_size = ns->ms; - blk_integrity_register(ns->disk, &integrity); - blk_queue_max_integrity_segments(ns->queue, 1); -} #else /* CONFIG_BLK_DEV_INTEGRITY */ static void nvme_dif_remap(struct request *req, void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi)) @@ -581,91 +447,27 @@ static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi) static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi) { } -static void nvme_init_integrity(struct nvme_ns *ns) -{ -} #endif -static void req_completion(struct nvme_queue *nvmeq, void *ctx, - struct nvme_completion *cqe) -{ - struct nvme_iod *iod = ctx; - struct request *req = iod_get_private(iod); - struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req); - u16 status = le16_to_cpup(&cqe->status) >> 1; - bool requeue = false; - int error = 0; - - if (unlikely(status)) { - if (!(status & NVME_SC_DNR || blk_noretry_request(req)) - && (jiffies - req->start_time) < req->timeout) { - unsigned long flags; - - requeue = true; - blk_mq_requeue_request(req); - spin_lock_irqsave(req->q->queue_lock, flags); - if (!blk_queue_stopped(req->q)) - blk_mq_kick_requeue_list(req->q); - spin_unlock_irqrestore(req->q->queue_lock, flags); - goto release_iod; - } - - if (req->cmd_type == REQ_TYPE_DRV_PRIV) { - if (cmd_rq->ctx == 
CMD_CTX_CANCELLED) - error = -EINTR; - else - error = status; - } else { - error = nvme_error_status(status); - } - } - - if (req->cmd_type == REQ_TYPE_DRV_PRIV) { - u32 result = le32_to_cpup(&cqe->result); - req->special = (void *)(uintptr_t)result; - } - - if (cmd_rq->aborted) - dev_warn(nvmeq->dev->dev, - "completing aborted command with status:%04x\n", - error); - -release_iod: - if (iod->nents) { - dma_unmap_sg(nvmeq->dev->dev, iod->sg, iod->nents, - rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); - if (blk_integrity_rq(req)) { - if (!rq_data_dir(req)) - nvme_dif_remap(req, nvme_dif_complete); - dma_unmap_sg(nvmeq->dev->dev, iod->meta_sg, 1, - rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); - } - } - nvme_free_iod(nvmeq->dev, iod); - - if (likely(!requeue)) - blk_mq_complete_request(req, error); -} - -/* length is in bytes. gfp flags indicates whether we may sleep. */ -static int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, - int total_len, gfp_t gfp) +static bool nvme_setup_prps(struct nvme_dev *dev, struct request *req, + int total_len) { + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); struct dma_pool *pool; int length = total_len; struct scatterlist *sg = iod->sg; int dma_len = sg_dma_len(sg); u64 dma_addr = sg_dma_address(sg); - u32 page_size = dev->page_size; + u32 page_size = dev->ctrl.page_size; int offset = dma_addr & (page_size - 1); __le64 *prp_list; - __le64 **list = iod_list(iod); + __le64 **list = iod_list(req); dma_addr_t prp_dma; int nprps, i; length -= (page_size - offset); if (length <= 0) - return total_len; + return true; dma_len -= (page_size - offset); if (dma_len) { @@ -678,7 +480,7 @@ static int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, if (length <= page_size) { iod->first_dma = dma_addr; - return total_len; + return true; } nprps = DIV_ROUND_UP(length, page_size); @@ -690,11 +492,11 @@ static int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, iod->npages = 1; } - prp_list = dma_pool_alloc(pool, gfp, &prp_dma); + prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); if (!prp_list) { iod->first_dma = dma_addr; iod->npages = -1; - return (total_len - length) + page_size; + return false; } list[0] = prp_list; iod->first_dma = prp_dma; @@ -702,9 +504,9 @@ static int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, for (;;) { if (i == page_size >> 3) { __le64 *old_prp_list = prp_list; - prp_list = dma_pool_alloc(pool, gfp, &prp_dma); + prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); if (!prp_list) - return total_len - length; + return false; list[iod->npages++] = prp_list; prp_list[0] = old_prp_list[i - 1]; old_prp_list[i - 1] = cpu_to_le64(prp_dma); @@ -724,115 +526,105 @@ static int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, dma_len = sg_dma_len(sg); } - return total_len; + return true; } -static void nvme_submit_priv(struct nvme_queue *nvmeq, struct request *req, - struct nvme_iod *iod) +static int nvme_map_data(struct nvme_dev *dev, struct request *req, + struct nvme_command *cmnd) { - struct nvme_command cmnd; + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct request_queue *q = req->q; + enum dma_data_direction dma_dir = rq_data_dir(req) ? 
+ DMA_TO_DEVICE : DMA_FROM_DEVICE; + int ret = BLK_MQ_RQ_QUEUE_ERROR; - memcpy(&cmnd, req->cmd, sizeof(cmnd)); - cmnd.rw.command_id = req->tag; - if (req->nr_phys_segments) { - cmnd.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); - cmnd.rw.prp2 = cpu_to_le64(iod->first_dma); - } + sg_init_table(iod->sg, req->nr_phys_segments); + iod->nents = blk_rq_map_sg(q, req, iod->sg); + if (!iod->nents) + goto out; - __nvme_submit_cmd(nvmeq, &cmnd); -} + ret = BLK_MQ_RQ_QUEUE_BUSY; + if (!dma_map_sg(dev->dev, iod->sg, iod->nents, dma_dir)) + goto out; -/* - * We reuse the small pool to allocate the 16-byte range here as it is not - * worth having a special pool for these or additional cases to handle freeing - * the iod. - */ -static void nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns, - struct request *req, struct nvme_iod *iod) -{ - struct nvme_dsm_range *range = - (struct nvme_dsm_range *)iod_list(iod)[0]; - struct nvme_command cmnd; + if (!nvme_setup_prps(dev, req, blk_rq_bytes(req))) + goto out_unmap; - range->cattr = cpu_to_le32(0); - range->nlb = cpu_to_le32(blk_rq_bytes(req) >> ns->lba_shift); - range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); + ret = BLK_MQ_RQ_QUEUE_ERROR; + if (blk_integrity_rq(req)) { + if (blk_rq_count_integrity_sg(q, req->bio) != 1) + goto out_unmap; - memset(&cmnd, 0, sizeof(cmnd)); - cmnd.dsm.opcode = nvme_cmd_dsm; - cmnd.dsm.command_id = req->tag; - cmnd.dsm.nsid = cpu_to_le32(ns->ns_id); - cmnd.dsm.prp1 = cpu_to_le64(iod->first_dma); - cmnd.dsm.nr = 0; - cmnd.dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); + sg_init_table(&iod->meta_sg, 1); + if (blk_rq_map_integrity_sg(q, req->bio, &iod->meta_sg) != 1) + goto out_unmap; - __nvme_submit_cmd(nvmeq, &cmnd); -} + if (rq_data_dir(req)) + nvme_dif_remap(req, nvme_dif_prep); -static void nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, - int cmdid) -{ - struct nvme_command cmnd; + if (!dma_map_sg(dev->dev, &iod->meta_sg, 1, dma_dir)) + goto out_unmap; + } - memset(&cmnd, 0, sizeof(cmnd)); - cmnd.common.opcode = nvme_cmd_flush; - cmnd.common.command_id = cmdid; - cmnd.common.nsid = cpu_to_le32(ns->ns_id); + cmnd->rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); + cmnd->rw.prp2 = cpu_to_le64(iod->first_dma); + if (blk_integrity_rq(req)) + cmnd->rw.metadata = cpu_to_le64(sg_dma_address(&iod->meta_sg)); + return BLK_MQ_RQ_QUEUE_OK; - __nvme_submit_cmd(nvmeq, &cmnd); +out_unmap: + dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir); +out: + return ret; } -static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod, - struct nvme_ns *ns) +static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) { - struct request *req = iod_get_private(iod); - struct nvme_command cmnd; - u16 control = 0; - u32 dsmgmt = 0; - - if (req->cmd_flags & REQ_FUA) - control |= NVME_RW_FUA; - if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD)) - control |= NVME_RW_LR; - - if (req->cmd_flags & REQ_RAHEAD) - dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; - - memset(&cmnd, 0, sizeof(cmnd)); - cmnd.rw.opcode = (rq_data_dir(req) ? 
nvme_cmd_write : nvme_cmd_read); - cmnd.rw.command_id = req->tag; - cmnd.rw.nsid = cpu_to_le32(ns->ns_id); - cmnd.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); - cmnd.rw.prp2 = cpu_to_le64(iod->first_dma); - cmnd.rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); - cmnd.rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); - - if (ns->ms) { - switch (ns->pi_type) { - case NVME_NS_DPS_PI_TYPE3: - control |= NVME_RW_PRINFO_PRCHK_GUARD; - break; - case NVME_NS_DPS_PI_TYPE1: - case NVME_NS_DPS_PI_TYPE2: - control |= NVME_RW_PRINFO_PRCHK_GUARD | - NVME_RW_PRINFO_PRCHK_REF; - cmnd.rw.reftag = cpu_to_le32( - nvme_block_nr(ns, blk_rq_pos(req))); - break; + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + enum dma_data_direction dma_dir = rq_data_dir(req) ? + DMA_TO_DEVICE : DMA_FROM_DEVICE; + + if (iod->nents) { + dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir); + if (blk_integrity_rq(req)) { + if (!rq_data_dir(req)) + nvme_dif_remap(req, nvme_dif_complete); + dma_unmap_sg(dev->dev, &iod->meta_sg, 1, dma_dir); } - if (blk_integrity_rq(req)) - cmnd.rw.metadata = - cpu_to_le64(sg_dma_address(iod->meta_sg)); - else - control |= NVME_RW_PRINFO_PRACT; } - cmnd.rw.control = cpu_to_le16(control); - cmnd.rw.dsmgmt = cpu_to_le32(dsmgmt); + nvme_free_iod(dev, req); +} - __nvme_submit_cmd(nvmeq, &cmnd); +/* + * We reuse the small pool to allocate the 16-byte range here as it is not + * worth having a special pool for these or additional cases to handle freeing + * the iod. + */ +static int nvme_setup_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns, + struct request *req, struct nvme_command *cmnd) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_dsm_range *range; - return 0; + range = dma_pool_alloc(nvmeq->dev->prp_small_pool, GFP_ATOMIC, + &iod->first_dma); + if (!range) + return BLK_MQ_RQ_QUEUE_BUSY; + iod_list(req)[0] = (__le64 *)range; + iod->npages = 0; + + range->cattr = cpu_to_le32(0); + range->nlb = cpu_to_le32(blk_rq_bytes(req) >> ns->lba_shift); + range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); + + memset(cmnd, 0, sizeof(*cmnd)); + cmnd->dsm.opcode = nvme_cmd_dsm; + cmnd->dsm.nsid = cpu_to_le32(ns->ns_id); + cmnd->dsm.prp1 = cpu_to_le64(iod->first_dma); + cmnd->dsm.nr = 0; + cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); + return BLK_MQ_RQ_QUEUE_OK; } /* @@ -845,9 +637,8 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, struct nvme_queue *nvmeq = hctx->driver_data; struct nvme_dev *dev = nvmeq->dev; struct request *req = bd->rq; - struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); - struct nvme_iod *iod; - enum dma_data_direction dma_dir; + struct nvme_command cmnd; + int ret = BLK_MQ_RQ_QUEUE_OK; /* * If formatted with metadata, require the block layer provide a buffer @@ -857,91 +648,72 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, if (ns && ns->ms && !blk_integrity_rq(req)) { if (!(ns->pi_type && ns->ms == 8) && req->cmd_type != REQ_TYPE_DRV_PRIV) { - blk_mq_complete_request(req, -EFAULT); + blk_mq_end_request(req, -EFAULT); return BLK_MQ_RQ_QUEUE_OK; } } - iod = nvme_alloc_iod(req, dev, GFP_ATOMIC); - if (!iod) - return BLK_MQ_RQ_QUEUE_BUSY; + ret = nvme_init_iod(req, dev); + if (ret) + return ret; if (req->cmd_flags & REQ_DISCARD) { - void *range; - /* - * We reuse the small pool to allocate the 16-byte range here - * as it is not worth having a special pool for these or - * additional cases to handle freeing the iod. 
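A single discard turns into exactly one 16-byte Dataset Management range descriptor, which is why the 256-byte blocks of the existing small PRP pool can be borrowed for it. For reference, the descriptor layout from linux/nvme.h (reproduced here purely for illustration):

/* One DSM range descriptor: 4 + 4 + 8 = 16 bytes. */
struct nvme_dsm_range {
	__le32	cattr;	/* context attributes */
	__le32	nlb;	/* number of logical blocks */
	__le64	slba;	/* starting LBA */
};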
- */ - range = dma_pool_alloc(dev->prp_small_pool, GFP_ATOMIC, - &iod->first_dma); - if (!range) - goto retry_cmd; - iod_list(iod)[0] = (__le64 *)range; - iod->npages = 0; - } else if (req->nr_phys_segments) { - dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE; + ret = nvme_setup_discard(nvmeq, ns, req, &cmnd); + } else { + if (req->cmd_type == REQ_TYPE_DRV_PRIV) + memcpy(&cmnd, req->cmd, sizeof(cmnd)); + else if (req->cmd_flags & REQ_FLUSH) + nvme_setup_flush(ns, &cmnd); + else + nvme_setup_rw(ns, req, &cmnd); - sg_init_table(iod->sg, req->nr_phys_segments); - iod->nents = blk_rq_map_sg(req->q, req, iod->sg); - if (!iod->nents) - goto error_cmd; + if (req->nr_phys_segments) + ret = nvme_map_data(dev, req, &cmnd); + } - if (!dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir)) - goto retry_cmd; + if (ret) + goto out; - if (blk_rq_bytes(req) != - nvme_setup_prps(dev, iod, blk_rq_bytes(req), GFP_ATOMIC)) { - dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir); - goto retry_cmd; - } - if (blk_integrity_rq(req)) { - if (blk_rq_count_integrity_sg(req->q, req->bio) != 1) { - dma_unmap_sg(dev->dev, iod->sg, iod->nents, - dma_dir); - goto error_cmd; - } + cmnd.common.command_id = req->tag; + blk_mq_start_request(req); - sg_init_table(iod->meta_sg, 1); - if (blk_rq_map_integrity_sg( - req->q, req->bio, iod->meta_sg) != 1) { - dma_unmap_sg(dev->dev, iod->sg, iod->nents, - dma_dir); - goto error_cmd; - } + spin_lock_irq(&nvmeq->q_lock); + __nvme_submit_cmd(nvmeq, &cmnd); + nvme_process_cq(nvmeq); + spin_unlock_irq(&nvmeq->q_lock); + return BLK_MQ_RQ_QUEUE_OK; +out: + nvme_free_iod(dev, req); + return ret; +} - if (rq_data_dir(req)) - nvme_dif_remap(req, nvme_dif_prep); +static void nvme_complete_rq(struct request *req) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_dev *dev = iod->nvmeq->dev; + int error = 0; - if (!dma_map_sg(nvmeq->q_dmadev, iod->meta_sg, 1, dma_dir)) { - dma_unmap_sg(dev->dev, iod->sg, iod->nents, - dma_dir); - goto error_cmd; - } + nvme_unmap_data(dev, req); + + if (unlikely(req->errors)) { + if (nvme_req_needs_retry(req, req->errors)) { + nvme_requeue_req(req); + return; } - } - nvme_set_info(cmd, iod, req_completion); - spin_lock_irq(&nvmeq->q_lock); - if (req->cmd_type == REQ_TYPE_DRV_PRIV) - nvme_submit_priv(nvmeq, req, iod); - else if (req->cmd_flags & REQ_DISCARD) - nvme_submit_discard(nvmeq, ns, req, iod); - else if (req->cmd_flags & REQ_FLUSH) - nvme_submit_flush(nvmeq, ns, req->tag); - else - nvme_submit_iod(nvmeq, iod, ns); + if (req->cmd_type == REQ_TYPE_DRV_PRIV) + error = req->errors; + else + error = nvme_error_status(req->errors); + } - nvme_process_cq(nvmeq); - spin_unlock_irq(&nvmeq->q_lock); - return BLK_MQ_RQ_QUEUE_OK; + if (unlikely(iod->aborted)) { + dev_warn(dev->dev, + "completing aborted command with status: %04x\n", + req->errors); + } - error_cmd: - nvme_free_iod(dev, iod); - return BLK_MQ_RQ_QUEUE_ERROR; - retry_cmd: - nvme_free_iod(dev, iod); - return BLK_MQ_RQ_QUEUE_BUSY; + blk_mq_end_request(req, error); } static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag) @@ -952,20 +724,47 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag) phase = nvmeq->cq_phase; for (;;) { - void *ctx; - nvme_completion_fn fn; struct nvme_completion cqe = nvmeq->cqes[head]; - if ((le16_to_cpu(cqe.status) & 1) != phase) + u16 status = le16_to_cpu(cqe.status); + struct request *req; + + if ((status & 1) != phase) break; nvmeq->sq_head = le16_to_cpu(cqe.sq_head); if (++head == nvmeq->q_depth) { 
head = 0; phase = !phase; } + if (tag && *tag == cqe.command_id) *tag = -1; - ctx = nvme_finish_cmd(nvmeq, cqe.command_id, &fn); - fn(nvmeq, ctx, &cqe); + + if (unlikely(cqe.command_id >= nvmeq->q_depth)) { + dev_warn(nvmeq->q_dmadev, + "invalid id %d completed on queue %d\n", + cqe.command_id, le16_to_cpu(cqe.sq_id)); + continue; + } + + /* + * AEN requests are special as they don't time out and can + * survive any kind of queue freeze and often don't respond to + * aborts. We don't even bother to allocate a struct request + * for them but rather special case them here. + */ + if (unlikely(nvmeq->qid == 0 && + cqe.command_id >= NVME_AQ_BLKMQ_DEPTH)) { + nvme_complete_async_event(nvmeq->dev, &cqe); + continue; + } + + req = blk_mq_tag_to_rq(*nvmeq->tags, cqe.command_id); + if (req->cmd_type == REQ_TYPE_DRV_PRIV) { + u32 result = le32_to_cpu(cqe.result); + req->special = (void *)(uintptr_t)result; + } + blk_mq_complete_request(req, status >> 1); + } /* If the controller ignores the cq head doorbell and continuously @@ -1028,112 +827,15 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) return 0; } -/* - * Returns 0 on success. If the result is negative, it's a Linux error code; - * if the result is positive, it's an NVM Express status code - */ -int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, - void *buffer, void __user *ubuffer, unsigned bufflen, - u32 *result, unsigned timeout) -{ - bool write = cmd->common.opcode & 1; - struct bio *bio = NULL; - struct request *req; - int ret; - - req = blk_mq_alloc_request(q, write, 0); - if (IS_ERR(req)) - return PTR_ERR(req); - - req->cmd_type = REQ_TYPE_DRV_PRIV; - req->cmd_flags |= REQ_FAILFAST_DRIVER; - req->__data_len = 0; - req->__sector = (sector_t) -1; - req->bio = req->biotail = NULL; - - req->timeout = timeout ? 
timeout : ADMIN_TIMEOUT; - - req->cmd = (unsigned char *)cmd; - req->cmd_len = sizeof(struct nvme_command); - req->special = (void *)0; - - if (buffer && bufflen) { - ret = blk_rq_map_kern(q, req, buffer, bufflen, - __GFP_DIRECT_RECLAIM); - if (ret) - goto out; - } else if (ubuffer && bufflen) { - ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, - __GFP_DIRECT_RECLAIM); - if (ret) - goto out; - bio = req->bio; - } - - blk_execute_rq(req->q, NULL, req, 0); - if (bio) - blk_rq_unmap_user(bio); - if (result) - *result = (u32)(uintptr_t)req->special; - ret = req->errors; - out: - blk_mq_free_request(req); - return ret; -} - -int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, - void *buffer, unsigned bufflen) -{ - return __nvme_submit_sync_cmd(q, cmd, buffer, NULL, bufflen, NULL, 0); -} - -static int nvme_submit_async_admin_req(struct nvme_dev *dev) +static void nvme_submit_async_event(struct nvme_dev *dev) { - struct nvme_queue *nvmeq = dev->queues[0]; struct nvme_command c; - struct nvme_cmd_info *cmd_info; - struct request *req; - - req = blk_mq_alloc_request(dev->admin_q, WRITE, - BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED); - if (IS_ERR(req)) - return PTR_ERR(req); - - req->cmd_flags |= REQ_NO_TIMEOUT; - cmd_info = blk_mq_rq_to_pdu(req); - nvme_set_info(cmd_info, NULL, async_req_completion); memset(&c, 0, sizeof(c)); c.common.opcode = nvme_admin_async_event; - c.common.command_id = req->tag; - - blk_mq_free_request(req); - __nvme_submit_cmd(nvmeq, &c); - return 0; -} - -static int nvme_submit_admin_async_cmd(struct nvme_dev *dev, - struct nvme_command *cmd, - struct async_cmd_info *cmdinfo, unsigned timeout) -{ - struct nvme_queue *nvmeq = dev->queues[0]; - struct request *req; - struct nvme_cmd_info *cmd_rq; - - req = blk_mq_alloc_request(dev->admin_q, WRITE, 0); - if (IS_ERR(req)) - return PTR_ERR(req); - - req->timeout = timeout; - cmd_rq = blk_mq_rq_to_pdu(req); - cmdinfo->req = req; - nvme_set_info(cmd_rq, cmdinfo, async_completion); - cmdinfo->status = -EINTR; + c.common.command_id = NVME_AQ_BLKMQ_DEPTH + --dev->ctrl.event_limit; - cmd->common.command_id = req->tag; - - nvme_submit_cmd(nvmeq, cmd); - return 0; + __nvme_submit_cmd(dev->queues[0], &c); } static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) @@ -1144,7 +846,7 @@ static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) c.delete_queue.opcode = opcode; c.delete_queue.qid = cpu_to_le16(id); - return nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0); + return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0); } static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, @@ -1165,7 +867,7 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, c.create_cq.cq_flags = cpu_to_le16(flags); c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector); - return nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0); + return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0); } static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid, @@ -1186,7 +888,7 @@ static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid, c.create_sq.sq_flags = cpu_to_le16(flags); c.create_sq.cqid = cpu_to_le16(qid); - return nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0); + return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0); } static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid) @@ -1199,195 +901,111 @@ static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid) return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid); } -int nvme_identify_ctrl(struct nvme_dev *dev, struct 
nvme_id_ctrl **id) -{ - struct nvme_command c = { }; - int error; - - /* gcc-4.4.4 (at least) has issues with initializers and anon unions */ - c.identify.opcode = nvme_admin_identify; - c.identify.cns = cpu_to_le32(1); - - *id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL); - if (!*id) - return -ENOMEM; - - error = nvme_submit_sync_cmd(dev->admin_q, &c, *id, - sizeof(struct nvme_id_ctrl)); - if (error) - kfree(*id); - return error; -} - -int nvme_identify_ns(struct nvme_dev *dev, unsigned nsid, - struct nvme_id_ns **id) -{ - struct nvme_command c = { }; - int error; - - /* gcc-4.4.4 (at least) has issues with initializers and anon unions */ - c.identify.opcode = nvme_admin_identify, - c.identify.nsid = cpu_to_le32(nsid), - - *id = kmalloc(sizeof(struct nvme_id_ns), GFP_KERNEL); - if (!*id) - return -ENOMEM; - - error = nvme_submit_sync_cmd(dev->admin_q, &c, *id, - sizeof(struct nvme_id_ns)); - if (error) - kfree(*id); - return error; -} - -int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid, - dma_addr_t dma_addr, u32 *result) -{ - struct nvme_command c; - - memset(&c, 0, sizeof(c)); - c.features.opcode = nvme_admin_get_features; - c.features.nsid = cpu_to_le32(nsid); - c.features.prp1 = cpu_to_le64(dma_addr); - c.features.fid = cpu_to_le32(fid); - - return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, NULL, 0, - result, 0); -} - -int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11, - dma_addr_t dma_addr, u32 *result) -{ - struct nvme_command c; - - memset(&c, 0, sizeof(c)); - c.features.opcode = nvme_admin_set_features; - c.features.prp1 = cpu_to_le64(dma_addr); - c.features.fid = cpu_to_le32(fid); - c.features.dword11 = cpu_to_le32(dword11); - - return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, NULL, 0, - result, 0); -} - -int nvme_get_log_page(struct nvme_dev *dev, struct nvme_smart_log **log) +static void abort_endio(struct request *req, int error) { - struct nvme_command c = { }; - int error; - - c.common.opcode = nvme_admin_get_log_page, - c.common.nsid = cpu_to_le32(0xFFFFFFFF), - c.common.cdw10[0] = cpu_to_le32( - (((sizeof(struct nvme_smart_log) / 4) - 1) << 16) | - NVME_LOG_SMART), + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = iod->nvmeq; + u32 result = (u32)(uintptr_t)req->special; + u16 status = req->errors; - *log = kmalloc(sizeof(struct nvme_smart_log), GFP_KERNEL); - if (!*log) - return -ENOMEM; + dev_warn(nvmeq->q_dmadev, "Abort status:%x result:%x", status, result); + atomic_inc(&nvmeq->dev->ctrl.abort_limit); - error = nvme_submit_sync_cmd(dev->admin_q, &c, *log, - sizeof(struct nvme_smart_log)); - if (error) - kfree(*log); - return error; + blk_mq_free_request(req); } -/** - * nvme_abort_req - Attempt aborting a request - * - * Schedule controller reset if the command was already aborted once before and - * still hasn't been returned to the driver, or if this is the admin queue. 
- */ -static void nvme_abort_req(struct request *req) +static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) { - struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req); - struct nvme_queue *nvmeq = cmd_rq->nvmeq; + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = iod->nvmeq; struct nvme_dev *dev = nvmeq->dev; struct request *abort_req; - struct nvme_cmd_info *abort_cmd; struct nvme_command cmd; - if (!nvmeq->qid || cmd_rq->aborted) { - spin_lock(&dev_list_lock); - if (!__nvme_reset(dev)) { - dev_warn(dev->dev, - "I/O %d QID %d timeout, reset controller\n", - req->tag, nvmeq->qid); - } - spin_unlock(&dev_list_lock); - return; + /* + * Shutdown immediately if controller times out while starting. The + * reset work will see the pci device disabled when it gets the forced + * cancellation error. All outstanding requests are completed on + * shutdown, so we return BLK_EH_HANDLED. + */ + if (test_bit(NVME_CTRL_RESETTING, &dev->flags)) { + dev_warn(dev->dev, + "I/O %d QID %d timeout, disable controller\n", + req->tag, nvmeq->qid); + nvme_dev_disable(dev, false); + req->errors = NVME_SC_CANCELLED; + return BLK_EH_HANDLED; } - if (!dev->abort_limit) - return; + /* + * Shutdown the controller immediately and schedule a reset if the + * command was already aborted once before and still hasn't been + * returned to the driver, or if this is the admin queue. + */ + if (!nvmeq->qid || iod->aborted) { + dev_warn(dev->dev, + "I/O %d QID %d timeout, reset controller\n", + req->tag, nvmeq->qid); + nvme_dev_disable(dev, false); + queue_work(nvme_workq, &dev->reset_work); - abort_req = blk_mq_alloc_request(dev->admin_q, WRITE, - BLK_MQ_REQ_NOWAIT); - if (IS_ERR(abort_req)) - return; + /* + * Mark the request as handled, since the inline shutdown + * forces all outstanding requests to complete. + */ + req->errors = NVME_SC_CANCELLED; + return BLK_EH_HANDLED; + } - abort_cmd = blk_mq_rq_to_pdu(abort_req); - nvme_set_info(abort_cmd, abort_req, abort_completion); + iod->aborted = 1; + + if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) { + atomic_inc(&dev->ctrl.abort_limit); + return BLK_EH_RESET_TIMER; + } memset(&cmd, 0, sizeof(cmd)); cmd.abort.opcode = nvme_admin_abort_cmd; cmd.abort.cid = req->tag; cmd.abort.sqid = cpu_to_le16(nvmeq->qid); - cmd.abort.command_id = abort_req->tag; - --dev->abort_limit; - cmd_rq->aborted = 1; + dev_warn(nvmeq->q_dmadev, "I/O %d QID %d timeout, aborting\n", + req->tag, nvmeq->qid); - dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", req->tag, - nvmeq->qid); - nvme_submit_cmd(dev->queues[0], &cmd); + abort_req = nvme_alloc_request(dev->ctrl.admin_q, &cmd, + BLK_MQ_REQ_NOWAIT); + if (IS_ERR(abort_req)) { + atomic_inc(&dev->ctrl.abort_limit); + return BLK_EH_RESET_TIMER; + } + + abort_req->timeout = ADMIN_TIMEOUT; + abort_req->end_io_data = NULL; + blk_execute_rq_nowait(abort_req->q, NULL, abort_req, 0, abort_endio); + + /* + * The aborted req will be completed on receiving the abort req. + * We enable the timer again. If hit twice, it'll cause a device reset, + * as the device then is in a faulty state. 
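Taken together, the rewritten nvme_timeout() implements a small escalation ladder; summarized as a sketch:

/*
 * Timeout escalation, in order of checking:
 *   controller already resetting      -> disable device, complete with
 *                                        NVME_SC_CANCELLED (BLK_EH_HANDLED)
 *   admin command, or an I/O that was -> disable device, queue reset_work,
 *   aborted once before                  complete with NVME_SC_CANCELLED
 *   first expiry of a normal I/O with -> submit an Abort command, re-arm
 *   abort budget left                    the timer (BLK_EH_RESET_TIMER)
 *   abort budget exhausted            -> just re-arm the timer
 */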
+ */ + return BLK_EH_RESET_TIMER; } static void nvme_cancel_queue_ios(struct request *req, void *data, bool reserved) { struct nvme_queue *nvmeq = data; - void *ctx; - nvme_completion_fn fn; - struct nvme_cmd_info *cmd; - struct nvme_completion cqe; + int status; if (!blk_mq_request_started(req)) return; - cmd = blk_mq_rq_to_pdu(req); - - if (cmd->ctx == CMD_CTX_CANCELLED) - return; + dev_warn(nvmeq->q_dmadev, + "Cancelling I/O %d QID %d\n", req->tag, nvmeq->qid); + status = NVME_SC_ABORT_REQ; if (blk_queue_dying(req->q)) - cqe.status = cpu_to_le16((NVME_SC_ABORT_REQ | NVME_SC_DNR) << 1); - else - cqe.status = cpu_to_le16(NVME_SC_ABORT_REQ << 1); - - - dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n", - req->tag, nvmeq->qid); - ctx = cancel_cmd_info(cmd, &fn); - fn(nvmeq, ctx, &cqe); -} - -static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) -{ - struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); - struct nvme_queue *nvmeq = cmd->nvmeq; - - dev_warn(nvmeq->q_dmadev, "Timeout I/O %d QID %d\n", req->tag, - nvmeq->qid); - spin_lock_irq(&nvmeq->q_lock); - nvme_abort_req(req); - spin_unlock_irq(&nvmeq->q_lock); - - /* - * The aborted req will be completed on receiving the abort req. - * We enable the timer again. If hit twice, it'll cause a device reset, - * as the device then is in a faulty state. - */ - return BLK_EH_RESET_TIMER; + status |= NVME_SC_DNR; + blk_mq_complete_request(req, status); } static void nvme_free_queue(struct nvme_queue *nvmeq) @@ -1430,8 +1048,8 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq) nvmeq->cq_vector = -1; spin_unlock_irq(&nvmeq->q_lock); - if (!nvmeq->qid && nvmeq->dev->admin_q) - blk_mq_freeze_queue_start(nvmeq->dev->admin_q); + if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q) + blk_mq_stop_hw_queues(nvmeq->dev->ctrl.admin_q); irq_set_affinity_hint(vector, NULL); free_irq(vector, nvmeq); @@ -1447,21 +1065,20 @@ static void nvme_clear_queue(struct nvme_queue *nvmeq) spin_unlock_irq(&nvmeq->q_lock); } -static void nvme_disable_queue(struct nvme_dev *dev, int qid) +static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown) { - struct nvme_queue *nvmeq = dev->queues[qid]; + struct nvme_queue *nvmeq = dev->queues[0]; if (!nvmeq) return; if (nvme_suspend_queue(nvmeq)) return; - /* Don't tell the adapter to delete the admin queue. - * Don't tell a removed adapter to delete IO queues. 
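The shutdown/disable split in nvme_disable_admin_queue() corresponds to two different controller configuration transitions; a sketch of the CC register manipulation both paths boil down to (using the NVME_CC_* definitions from linux/nvme.h; the subsequent CSTS polling is omitted):

/* Sketch: a shutdown raises CC.SHN and then waits for CSTS.SHST to report
 * completion, while a plain disable clears CC.EN and waits for CSTS.RDY
 * to drop. */
static u32 nvme_cc_for_teardown(u32 cc, bool shutdown)
{
	cc &= ~NVME_CC_SHN_MASK;
	if (shutdown)
		cc |= NVME_CC_SHN_NORMAL;
	else
		cc &= ~NVME_CC_ENABLE;
	return cc;
}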
*/ - if (qid && readl(&dev->bar->csts) != -1) { - adapter_delete_sq(dev, qid); - adapter_delete_cq(dev, qid); - } + if (shutdown) + nvme_shutdown_ctrl(&dev->ctrl); + else + nvme_disable_ctrl(&dev->ctrl, lo_hi_readq( + dev->bar + NVME_REG_CAP)); spin_lock_irq(&nvmeq->q_lock); nvme_process_cq(nvmeq); @@ -1472,11 +1089,12 @@ static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues, int entry_size) { int q_depth = dev->q_depth; - unsigned q_size_aligned = roundup(q_depth * entry_size, dev->page_size); + unsigned q_size_aligned = roundup(q_depth * entry_size, + dev->ctrl.page_size); if (q_size_aligned * nr_io_queues > dev->cmb_size) { u64 mem_per_q = div_u64(dev->cmb_size, nr_io_queues); - mem_per_q = round_down(mem_per_q, dev->page_size); + mem_per_q = round_down(mem_per_q, dev->ctrl.page_size); q_depth = div_u64(mem_per_q, entry_size); /* @@ -1495,8 +1113,8 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq, int qid, int depth) { if (qid && dev->cmb && use_cmb_sqes && NVME_CMB_SQS(dev->cmbsz)) { - unsigned offset = (qid - 1) * - roundup(SQ_SIZE(depth), dev->page_size); + unsigned offset = (qid - 1) * roundup(SQ_SIZE(depth), + dev->ctrl.page_size); nvmeq->sq_dma_addr = dev->cmb_dma_addr + offset; nvmeq->sq_cmds_io = dev->cmb + offset; } else { @@ -1527,7 +1145,7 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, nvmeq->q_dmadev = dev->dev; nvmeq->dev = dev; snprintf(nvmeq->irqname, sizeof(nvmeq->irqname), "nvme%dq%d", - dev->instance, qid); + dev->ctrl.instance, qid); spin_lock_init(&nvmeq->q_lock); nvmeq->cq_head = 0; nvmeq->cq_phase = 1; @@ -1604,79 +1222,9 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) return result; } -static int nvme_wait_ready(struct nvme_dev *dev, u64 cap, bool enabled) -{ - unsigned long timeout; - u32 bit = enabled ? NVME_CSTS_RDY : 0; - - timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies; - - while ((readl(&dev->bar->csts) & NVME_CSTS_RDY) != bit) { - msleep(100); - if (fatal_signal_pending(current)) - return -EINTR; - if (time_after(jiffies, timeout)) { - dev_err(dev->dev, - "Device not ready; aborting %s\n", enabled ? - "initialisation" : "reset"); - return -ENODEV; - } - } - - return 0; -} - -/* - * If the device has been passed off to us in an enabled state, just clear - * the enabled bit. The spec says we should set the 'shutdown notification - * bits', but doing so may cause the device to complete commands to the - * admin queue ... and we don't know what memory that might be pointing at! 
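nvme_cmb_qdepth() only shrinks queues when the CMB cannot hold all submission queues at the current depth; a worked example with hypothetical numbers:

/* Worked example (hypothetical controller, 4 KiB pages, 64-byte SQ entries,
 * q_depth = 1024, 16 I/O queues):
 *   q_size_aligned = roundup(1024 * 64, 4096) = 64 KiB
 *   64 KiB * 16 = 1 MiB <= a 2 MiB CMB       -> depth stays 1024
 * With only a 512 KiB CMB instead:
 *   mem_per_q = 512 KiB / 16 = 32 KiB (already page aligned)
 *   q_depth   = 32 KiB / 64  = 512 entries per queue
 */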
- */ -static int nvme_disable_ctrl(struct nvme_dev *dev, u64 cap) -{ - dev->ctrl_config &= ~NVME_CC_SHN_MASK; - dev->ctrl_config &= ~NVME_CC_ENABLE; - writel(dev->ctrl_config, &dev->bar->cc); - - return nvme_wait_ready(dev, cap, false); -} - -static int nvme_enable_ctrl(struct nvme_dev *dev, u64 cap) -{ - dev->ctrl_config &= ~NVME_CC_SHN_MASK; - dev->ctrl_config |= NVME_CC_ENABLE; - writel(dev->ctrl_config, &dev->bar->cc); - - return nvme_wait_ready(dev, cap, true); -} - -static int nvme_shutdown_ctrl(struct nvme_dev *dev) -{ - unsigned long timeout; - - dev->ctrl_config &= ~NVME_CC_SHN_MASK; - dev->ctrl_config |= NVME_CC_SHN_NORMAL; - - writel(dev->ctrl_config, &dev->bar->cc); - - timeout = SHUTDOWN_TIMEOUT + jiffies; - while ((readl(&dev->bar->csts) & NVME_CSTS_SHST_MASK) != - NVME_CSTS_SHST_CMPLT) { - msleep(100); - if (fatal_signal_pending(current)) - return -EINTR; - if (time_after(jiffies, timeout)) { - dev_err(dev->dev, - "Device shutdown incomplete; abort shutdown\n"); - return -ENODEV; - } - } - - return 0; -} - static struct blk_mq_ops nvme_mq_admin_ops = { .queue_rq = nvme_queue_rq, + .complete = nvme_complete_rq, .map_queue = blk_mq_map_queue, .init_hctx = nvme_admin_init_hctx, .exit_hctx = nvme_admin_exit_hctx, @@ -1686,6 +1234,7 @@ static struct blk_mq_ops nvme_mq_admin_ops = { static struct blk_mq_ops nvme_mq_ops = { .queue_rq = nvme_queue_rq, + .complete = nvme_complete_rq, .map_queue = blk_mq_map_queue, .init_hctx = nvme_init_hctx, .init_request = nvme_init_request, @@ -1695,19 +1244,23 @@ static struct blk_mq_ops nvme_mq_ops = { static void nvme_dev_remove_admin(struct nvme_dev *dev) { - if (dev->admin_q && !blk_queue_dying(dev->admin_q)) { - blk_cleanup_queue(dev->admin_q); + if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) { + blk_cleanup_queue(dev->ctrl.admin_q); blk_mq_free_tag_set(&dev->admin_tagset); } } static int nvme_alloc_admin_tags(struct nvme_dev *dev) { - if (!dev->admin_q) { + if (!dev->ctrl.admin_q) { dev->admin_tagset.ops = &nvme_mq_admin_ops; dev->admin_tagset.nr_hw_queues = 1; - dev->admin_tagset.queue_depth = NVME_AQ_DEPTH - 1; - dev->admin_tagset.reserved_tags = 1; + + /* + * Subtract one to leave an empty queue entry for 'Full Queue' + * condition. See NVM-Express 1.2 specification, section 4.1.2. 
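The "Full Queue" rule referenced in the comment above is the classic one-slot-reserved ring convention: head == tail means empty, so a queue of N entries can only ever hold N - 1 outstanding commands, and the blk-mq depth must be one less than the hardware queue depth. A self-contained illustration:

	/* Demo only: NVMe-style ring occupancy with one reserved slot. */
	#include <stdbool.h>
	#include <stdio.h>

	#define QUEUE_SIZE 32			/* example hardware depth */

	static bool sq_empty(unsigned head, unsigned tail)
	{
		return head == tail;
	}

	static bool sq_full(unsigned head, unsigned tail)
	{
		return (tail + 1) % QUEUE_SIZE == head;
	}

	int main(void)
	{
		/* with head pinned at 0, only QUEUE_SIZE - 1 entries fit */
		unsigned head = 0, tail = 0, submitted = 0;

		while (!sq_full(head, tail)) {
			tail = (tail + 1) % QUEUE_SIZE;
			submitted++;
		}
		printf("slots=%d usable=%u empty=%d\n", QUEUE_SIZE, submitted,
		       sq_empty(head, tail));
		return 0;
	}
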
+ */ + dev->admin_tagset.queue_depth = NVME_AQ_BLKMQ_DEPTH - 1; dev->admin_tagset.timeout = ADMIN_TIMEOUT; dev->admin_tagset.numa_node = dev_to_node(dev->dev); dev->admin_tagset.cmd_size = nvme_cmd_size(dev); @@ -1716,18 +1269,18 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev) if (blk_mq_alloc_tag_set(&dev->admin_tagset)) return -ENOMEM; - dev->admin_q = blk_mq_init_queue(&dev->admin_tagset); - if (IS_ERR(dev->admin_q)) { + dev->ctrl.admin_q = blk_mq_init_queue(&dev->admin_tagset); + if (IS_ERR(dev->ctrl.admin_q)) { blk_mq_free_tag_set(&dev->admin_tagset); return -ENOMEM; } - if (!blk_get_queue(dev->admin_q)) { + if (!blk_get_queue(dev->ctrl.admin_q)) { nvme_dev_remove_admin(dev); - dev->admin_q = NULL; + dev->ctrl.admin_q = NULL; return -ENODEV; } } else - blk_mq_unfreeze_queue(dev->admin_q); + blk_mq_start_stopped_hw_queues(dev->ctrl.admin_q, true); return 0; } @@ -1736,31 +1289,17 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) { int result; u32 aqa; - u64 cap = lo_hi_readq(&dev->bar->cap); + u64 cap = lo_hi_readq(dev->bar + NVME_REG_CAP); struct nvme_queue *nvmeq; - /* - * default to a 4K page size, with the intention to update this - * path in the future to accommodate architectures with differing - * kernel and IO page sizes. - */ - unsigned page_shift = 12; - unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12; - - if (page_shift < dev_page_min) { - dev_err(dev->dev, - "Minimum device page size (%u) too large for " - "host (%u)\n", 1 << dev_page_min, - 1 << page_shift); - return -ENODEV; - } - dev->subsystem = readl(&dev->bar->vs) >= NVME_VS(1, 1) ? + dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1) ? NVME_CAP_NSSRC(cap) : 0; - if (dev->subsystem && (readl(&dev->bar->csts) & NVME_CSTS_NSSRO)) - writel(NVME_CSTS_NSSRO, &dev->bar->csts); + if (dev->subsystem && + (readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO)) + writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS); - result = nvme_disable_ctrl(dev, cap); + result = nvme_disable_ctrl(&dev->ctrl, cap); if (result < 0) return result; @@ -1774,18 +1313,11 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) aqa = nvmeq->q_depth - 1; aqa |= aqa << 16; - dev->page_size = 1 << page_shift; - - dev->ctrl_config = NVME_CC_CSS_NVM; - dev->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT; - dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE; - dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; - - writel(aqa, &dev->bar->aqa); - lo_hi_writeq(nvmeq->sq_dma_addr, &dev->bar->asq); - lo_hi_writeq(nvmeq->cq_dma_addr, &dev->bar->acq); + writel(aqa, dev->bar + NVME_REG_AQA); + lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ); + lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ); - result = nvme_enable_ctrl(dev, cap); + result = nvme_enable_ctrl(&dev->ctrl, cap); if (result) goto free_nvmeq; @@ -1803,406 +1335,6 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) return result; } -static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) -{ - struct nvme_dev *dev = ns->dev; - struct nvme_user_io io; - struct nvme_command c; - unsigned length, meta_len; - int status, write; - dma_addr_t meta_dma = 0; - void *meta = NULL; - void __user *metadata; - - if (copy_from_user(&io, uio, sizeof(io))) - return -EFAULT; - - switch (io.opcode) { - case nvme_cmd_write: - case nvme_cmd_read: - case nvme_cmd_compare: - break; - default: - return -EINVAL; - } - - length = (io.nblocks + 1) << ns->lba_shift; - meta_len = (io.nblocks + 1) * ns->ms; - metadata = (void 
__user *)(uintptr_t)io.metadata; - write = io.opcode & 1; - - if (ns->ext) { - length += meta_len; - meta_len = 0; - } - if (meta_len) { - if (((io.metadata & 3) || !io.metadata) && !ns->ext) - return -EINVAL; - - meta = dma_alloc_coherent(dev->dev, meta_len, - &meta_dma, GFP_KERNEL); - - if (!meta) { - status = -ENOMEM; - goto unmap; - } - if (write) { - if (copy_from_user(meta, metadata, meta_len)) { - status = -EFAULT; - goto unmap; - } - } - } - - memset(&c, 0, sizeof(c)); - c.rw.opcode = io.opcode; - c.rw.flags = io.flags; - c.rw.nsid = cpu_to_le32(ns->ns_id); - c.rw.slba = cpu_to_le64(io.slba); - c.rw.length = cpu_to_le16(io.nblocks); - c.rw.control = cpu_to_le16(io.control); - c.rw.dsmgmt = cpu_to_le32(io.dsmgmt); - c.rw.reftag = cpu_to_le32(io.reftag); - c.rw.apptag = cpu_to_le16(io.apptag); - c.rw.appmask = cpu_to_le16(io.appmask); - c.rw.metadata = cpu_to_le64(meta_dma); - - status = __nvme_submit_sync_cmd(ns->queue, &c, NULL, - (void __user *)(uintptr_t)io.addr, length, NULL, 0); - unmap: - if (meta) { - if (status == NVME_SC_SUCCESS && !write) { - if (copy_to_user(metadata, meta, meta_len)) - status = -EFAULT; - } - dma_free_coherent(dev->dev, meta_len, meta, meta_dma); - } - return status; -} - -static int nvme_user_cmd(struct nvme_dev *dev, struct nvme_ns *ns, - struct nvme_passthru_cmd __user *ucmd) -{ - struct nvme_passthru_cmd cmd; - struct nvme_command c; - unsigned timeout = 0; - int status; - - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - if (copy_from_user(&cmd, ucmd, sizeof(cmd))) - return -EFAULT; - - memset(&c, 0, sizeof(c)); - c.common.opcode = cmd.opcode; - c.common.flags = cmd.flags; - c.common.nsid = cpu_to_le32(cmd.nsid); - c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); - c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); - c.common.cdw10[0] = cpu_to_le32(cmd.cdw10); - c.common.cdw10[1] = cpu_to_le32(cmd.cdw11); - c.common.cdw10[2] = cpu_to_le32(cmd.cdw12); - c.common.cdw10[3] = cpu_to_le32(cmd.cdw13); - c.common.cdw10[4] = cpu_to_le32(cmd.cdw14); - c.common.cdw10[5] = cpu_to_le32(cmd.cdw15); - - if (cmd.timeout_ms) - timeout = msecs_to_jiffies(cmd.timeout_ms); - - status = __nvme_submit_sync_cmd(ns ? 
ns->queue : dev->admin_q, &c, - NULL, (void __user *)(uintptr_t)cmd.addr, cmd.data_len, - &cmd.result, timeout); - if (status >= 0) { - if (put_user(cmd.result, &ucmd->result)) - return -EFAULT; - } - - return status; -} - -static int nvme_subsys_reset(struct nvme_dev *dev) -{ - if (!dev->subsystem) - return -ENOTTY; - - writel(0x4E564D65, &dev->bar->nssr); /* "NVMe" */ - return 0; -} - -static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, - unsigned long arg) -{ - struct nvme_ns *ns = bdev->bd_disk->private_data; - - switch (cmd) { - case NVME_IOCTL_ID: - force_successful_syscall_return(); - return ns->ns_id; - case NVME_IOCTL_ADMIN_CMD: - return nvme_user_cmd(ns->dev, NULL, (void __user *)arg); - case NVME_IOCTL_IO_CMD: - return nvme_user_cmd(ns->dev, ns, (void __user *)arg); - case NVME_IOCTL_SUBMIT_IO: - return nvme_submit_io(ns, (void __user *)arg); - case SG_GET_VERSION_NUM: - return nvme_sg_get_version_num((void __user *)arg); - case SG_IO: - return nvme_sg_io(ns, (void __user *)arg); - default: - return -ENOTTY; - } -} - -#ifdef CONFIG_COMPAT -static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg) -{ - switch (cmd) { - case SG_IO: - return -ENOIOCTLCMD; - } - return nvme_ioctl(bdev, mode, cmd, arg); -} -#else -#define nvme_compat_ioctl NULL -#endif - -static void nvme_free_dev(struct kref *kref); -static void nvme_free_ns(struct kref *kref) -{ - struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref); - - if (ns->type == NVME_NS_LIGHTNVM) - nvme_nvm_unregister(ns->queue, ns->disk->disk_name); - - spin_lock(&dev_list_lock); - ns->disk->private_data = NULL; - spin_unlock(&dev_list_lock); - - kref_put(&ns->dev->kref, nvme_free_dev); - put_disk(ns->disk); - kfree(ns); -} - -static int nvme_open(struct block_device *bdev, fmode_t mode) -{ - int ret = 0; - struct nvme_ns *ns; - - spin_lock(&dev_list_lock); - ns = bdev->bd_disk->private_data; - if (!ns) - ret = -ENXIO; - else if (!kref_get_unless_zero(&ns->kref)) - ret = -ENXIO; - spin_unlock(&dev_list_lock); - - return ret; -} - -static void nvme_release(struct gendisk *disk, fmode_t mode) -{ - struct nvme_ns *ns = disk->private_data; - kref_put(&ns->kref, nvme_free_ns); -} - -static int nvme_getgeo(struct block_device *bd, struct hd_geometry *geo) -{ - /* some standard values */ - geo->heads = 1 << 6; - geo->sectors = 1 << 5; - geo->cylinders = get_capacity(bd->bd_disk) >> 11; - return 0; -} - -static void nvme_config_discard(struct nvme_ns *ns) -{ - u32 logical_block_size = queue_logical_block_size(ns->queue); - ns->queue->limits.discard_zeroes_data = 0; - ns->queue->limits.discard_alignment = logical_block_size; - ns->queue->limits.discard_granularity = logical_block_size; - blk_queue_max_discard_sectors(ns->queue, 0xffffffff); - queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); -} - -static int nvme_revalidate_disk(struct gendisk *disk) -{ - struct nvme_ns *ns = disk->private_data; - struct nvme_dev *dev = ns->dev; - struct nvme_id_ns *id; - u8 lbaf, pi_type; - u16 old_ms; - unsigned short bs; - - if (nvme_identify_ns(dev, ns->ns_id, &id)) { - dev_warn(dev->dev, "%s: Identify failure nvme%dn%d\n", __func__, - dev->instance, ns->ns_id); - return -ENODEV; - } - if (id->ncap == 0) { - kfree(id); - return -ENODEV; - } - - if (nvme_nvm_ns_supported(ns, id) && ns->type != NVME_NS_LIGHTNVM) { - if (nvme_nvm_register(ns->queue, disk->disk_name)) { - dev_warn(dev->dev, - "%s: LightNVM init failure\n", __func__); - kfree(id); - return -ENODEV; - } - 
ns->type = NVME_NS_LIGHTNVM; - } - - old_ms = ns->ms; - lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK; - ns->lba_shift = id->lbaf[lbaf].ds; - ns->ms = le16_to_cpu(id->lbaf[lbaf].ms); - ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT); - - /* - * If identify namespace failed, use default 512 byte block size so - * block layer can use before failing read/write for 0 capacity. - */ - if (ns->lba_shift == 0) - ns->lba_shift = 9; - bs = 1 << ns->lba_shift; - - /* XXX: PI implementation requires metadata equal t10 pi tuple size */ - pi_type = ns->ms == sizeof(struct t10_pi_tuple) ? - id->dps & NVME_NS_DPS_PI_MASK : 0; - - blk_mq_freeze_queue(disk->queue); - if (blk_get_integrity(disk) && (ns->pi_type != pi_type || - ns->ms != old_ms || - bs != queue_logical_block_size(disk->queue) || - (ns->ms && ns->ext))) - blk_integrity_unregister(disk); - - ns->pi_type = pi_type; - blk_queue_logical_block_size(ns->queue, bs); - - if (ns->ms && !ns->ext) - nvme_init_integrity(ns); - - if ((ns->ms && !(ns->ms == 8 && ns->pi_type) && - !blk_get_integrity(disk)) || - ns->type == NVME_NS_LIGHTNVM) - set_capacity(disk, 0); - else - set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); - - if (dev->oncs & NVME_CTRL_ONCS_DSM) - nvme_config_discard(ns); - blk_mq_unfreeze_queue(disk->queue); - - kfree(id); - return 0; -} - -static char nvme_pr_type(enum pr_type type) -{ - switch (type) { - case PR_WRITE_EXCLUSIVE: - return 1; - case PR_EXCLUSIVE_ACCESS: - return 2; - case PR_WRITE_EXCLUSIVE_REG_ONLY: - return 3; - case PR_EXCLUSIVE_ACCESS_REG_ONLY: - return 4; - case PR_WRITE_EXCLUSIVE_ALL_REGS: - return 5; - case PR_EXCLUSIVE_ACCESS_ALL_REGS: - return 6; - default: - return 0; - } -}; - -static int nvme_pr_command(struct block_device *bdev, u32 cdw10, - u64 key, u64 sa_key, u8 op) -{ - struct nvme_ns *ns = bdev->bd_disk->private_data; - struct nvme_command c; - u8 data[16] = { 0, }; - - put_unaligned_le64(key, &data[0]); - put_unaligned_le64(sa_key, &data[8]); - - memset(&c, 0, sizeof(c)); - c.common.opcode = op; - c.common.nsid = cpu_to_le32(ns->ns_id); - c.common.cdw10[0] = cpu_to_le32(cdw10); - - return nvme_submit_sync_cmd(ns->queue, &c, data, 16); -} - -static int nvme_pr_register(struct block_device *bdev, u64 old, - u64 new, unsigned flags) -{ - u32 cdw10; - - if (flags & ~PR_FL_IGNORE_KEY) - return -EOPNOTSUPP; - - cdw10 = old ? 2 : 0; - cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0; - cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */ - return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register); -} - -static int nvme_pr_reserve(struct block_device *bdev, u64 key, - enum pr_type type, unsigned flags) -{ - u32 cdw10; - - if (flags & ~PR_FL_IGNORE_KEY) - return -EOPNOTSUPP; - - cdw10 = nvme_pr_type(type) << 8; - cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0); - return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire); -} - -static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new, - enum pr_type type, bool abort) -{ - u32 cdw10 = nvme_pr_type(type) << 8 | abort ? 2 : 1; - return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire); -} - -static int nvme_pr_clear(struct block_device *bdev, u64 key) -{ - u32 cdw10 = 1 | (key ? 1 << 3 : 0); - return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register); -} - -static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type) -{ - u32 cdw10 = nvme_pr_type(type) << 8 | key ? 
1 << 3 : 0; - return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release); -} - -static const struct pr_ops nvme_pr_ops = { - .pr_register = nvme_pr_register, - .pr_reserve = nvme_pr_reserve, - .pr_release = nvme_pr_release, - .pr_preempt = nvme_pr_preempt, - .pr_clear = nvme_pr_clear, -}; - -static const struct block_device_operations nvme_fops = { - .owner = THIS_MODULE, - .ioctl = nvme_ioctl, - .compat_ioctl = nvme_compat_ioctl, - .open = nvme_open, - .release = nvme_release, - .getgeo = nvme_getgeo, - .revalidate_disk= nvme_revalidate_disk, - .pr_ops = &nvme_pr_ops, -}; - static int nvme_kthread(void *data) { struct nvme_dev *dev, *next; @@ -2212,14 +1344,20 @@ static int nvme_kthread(void *data) spin_lock(&dev_list_lock); list_for_each_entry_safe(dev, next, &dev_list, node) { int i; - u32 csts = readl(&dev->bar->csts); + u32 csts = readl(dev->bar + NVME_REG_CSTS); + + /* + * Skip controllers currently under reset. + */ + if (work_pending(&dev->reset_work) || work_busy(&dev->reset_work)) + continue; if ((dev->subsystem && (csts & NVME_CSTS_NSSRO)) || csts & NVME_CSTS_CFS) { - if (!__nvme_reset(dev)) { + if (queue_work(nvme_workq, &dev->reset_work)) { dev_warn(dev->dev, "Failed status: %x, reset controller\n", - readl(&dev->bar->csts)); + readl(dev->bar + NVME_REG_CSTS)); } continue; } @@ -2230,11 +1368,8 @@ static int nvme_kthread(void *data) spin_lock_irq(&nvmeq->q_lock); nvme_process_cq(nvmeq); - while ((i == 0) && (dev->event_limit > 0)) { - if (nvme_submit_async_admin_req(dev)) - break; - dev->event_limit--; - } + while (i == 0 && dev->ctrl.event_limit > 0) + nvme_submit_async_event(dev); spin_unlock_irq(&nvmeq->q_lock); } } @@ -2244,127 +1379,33 @@ static int nvme_kthread(void *data) return 0; } -static void nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid) -{ - struct nvme_ns *ns; - struct gendisk *disk; - int node = dev_to_node(dev->dev); - - ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); - if (!ns) - return; - - ns->queue = blk_mq_init_queue(&dev->tagset); - if (IS_ERR(ns->queue)) - goto out_free_ns; - queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue); - queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue); - ns->dev = dev; - ns->queue->queuedata = ns; - - disk = alloc_disk_node(0, node); - if (!disk) - goto out_free_queue; - - kref_init(&ns->kref); - ns->ns_id = nsid; - ns->disk = disk; - ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */ - list_add_tail(&ns->list, &dev->namespaces); - - blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); - if (dev->max_hw_sectors) { - blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors); - blk_queue_max_segments(ns->queue, - (dev->max_hw_sectors / (dev->page_size >> 9)) + 1); - } - if (dev->stripe_size) - blk_queue_chunk_sectors(ns->queue, dev->stripe_size >> 9); - if (dev->vwc & NVME_CTRL_VWC_PRESENT) - blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA); - blk_queue_virt_boundary(ns->queue, dev->page_size - 1); - - disk->major = nvme_major; - disk->first_minor = 0; - disk->fops = &nvme_fops; - disk->private_data = ns; - disk->queue = ns->queue; - disk->driverfs_dev = dev->device; - disk->flags = GENHD_FL_EXT_DEVT; - sprintf(disk->disk_name, "nvme%dn%d", dev->instance, nsid); - - /* - * Initialize capacity to 0 until we establish the namespace format and - * setup integrity extensions if necessary. The revalidate_disk after - * add_disk allows the driver to register with integrity if the format - * requires it. 
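One thing a reviewer may want to flag in the removed code a little above: nvme_pr_release() and nvme_pr_preempt() build cdw10 as nvme_pr_type(type) << 8 | cond ? a : b without parentheses. Since the conditional operator binds more loosely than '|', the shifted reservation type ends up inside the condition and is dropped from the result, whereas nvme_pr_clear() parenthesizes the same pattern correctly. A self-contained demonstration of the difference:

	/* Demo only: '?:' has lower precedence than '|' in C. */
	#include <stdio.h>

	int main(void)
	{
		unsigned type = 3, key = 1;

		/* parses as ((type << 8) | key) ? (1 << 3) : 0 */
		unsigned as_written = type << 8 | key ? 1 << 3 : 0;
		/* presumably the intended value, keeping the type field */
		unsigned intended = (type << 8) | (key ? 1 << 3 : 0);

		printf("as written: 0x%x, intended: 0x%x\n",
		       as_written, intended);
		return 0;
	}
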
- */ - set_capacity(disk, 0); - if (nvme_revalidate_disk(ns->disk)) - goto out_free_disk; - - kref_get(&dev->kref); - if (ns->type != NVME_NS_LIGHTNVM) { - add_disk(ns->disk); - if (ns->ms) { - struct block_device *bd = bdget_disk(ns->disk, 0); - if (!bd) - return; - if (blkdev_get(bd, FMODE_READ, NULL)) { - bdput(bd); - return; - } - blkdev_reread_part(bd); - blkdev_put(bd, FMODE_READ); - } - } - return; - out_free_disk: - kfree(disk); - list_del(&ns->list); - out_free_queue: - blk_cleanup_queue(ns->queue); - out_free_ns: - kfree(ns); -} - -/* - * Create I/O queues. Failing to create an I/O queue is not an issue, - * we can continue with less than the desired amount of queues, and - * even a controller without I/O queues can still be used to issue - * admin commands. This might be useful to upgrade a buggy firmware - * for example. - */ -static void nvme_create_io_queues(struct nvme_dev *dev) +static int nvme_create_io_queues(struct nvme_dev *dev) { unsigned i; + int ret = 0; - for (i = dev->queue_count; i <= dev->max_qid; i++) - if (!nvme_alloc_queue(dev, i, dev->q_depth)) + for (i = dev->queue_count; i <= dev->max_qid; i++) { + if (!nvme_alloc_queue(dev, i, dev->q_depth)) { + ret = -ENOMEM; break; + } + } - for (i = dev->online_queues; i <= dev->queue_count - 1; i++) - if (nvme_create_queue(dev->queues[i], i)) { + for (i = dev->online_queues; i <= dev->queue_count - 1; i++) { + ret = nvme_create_queue(dev->queues[i], i); + if (ret) { nvme_free_queues(dev, i); break; } -} - -static int set_queue_count(struct nvme_dev *dev, int count) -{ - int status; - u32 result; - u32 q_count = (count - 1) | ((count - 1) << 16); - - status = nvme_set_features(dev, NVME_FEAT_NUM_QUEUES, q_count, 0, - &result); - if (status < 0) - return status; - if (status > 0) { - dev_err(dev->dev, "Could not set queue count (%d)\n", status); - return 0; } - return min(result & 0xffff, result >> 16) + 1; + } + + /* + * Ignore failing Create SQ/CQ commands, we can continue with less + * than the desired amount of queues, and even a controller without + * I/O queues can still be used to issue admin commands. This might + * be useful to upgrade a buggy firmware for example. + */ + return ret >= 0 ? 0 : ret; } static void __iomem *nvme_map_cmb(struct nvme_dev *dev) @@ -2379,11 +1420,11 @@ static void __iomem *nvme_map_cmb(struct nvme_dev *dev) if (!use_cmb_sqes) return NULL; - dev->cmbsz = readl(&dev->bar->cmbsz); + dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ); if (!(NVME_CMB_SZ(dev->cmbsz))) return NULL; - cmbloc = readl(&dev->bar->cmbloc); + cmbloc = readl(dev->bar + NVME_REG_CMBLOC); szu = (u64)1 << (12 + 4 * NVME_CMB_SZU(dev->cmbsz)); size = szu * NVME_CMB_SZ(dev->cmbsz); @@ -2431,11 +1472,20 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) int result, i, vecs, nr_io_queues, size; nr_io_queues = num_possible_cpus(); - result = set_queue_count(dev, nr_io_queues); - if (result <= 0) + result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues); + if (result < 0) return result; - if (result < nr_io_queues) - nr_io_queues = result; + + /* + * Degraded controllers might return an error when setting the queue + * count. We still want to be able to bring them online and offer + * access to the admin queue, as that might be the only way to fix them up. 
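The removed set_queue_count() above (and the nvme_set_queue_count() that replaces it in the core) negotiates queue counts through the Number of Queues feature, whose fields are 0's based in both directions. A self-contained demo of the arithmetic, with a made-up controller reply:

	/* Demo only: Number of Queues feature, requested vs. granted. */
	#include <stdio.h>

	static unsigned nvme_queues_granted(unsigned requested, unsigned result)
	{
		unsigned sq = (result & 0xffff) + 1;	/* SQs allowed */
		unsigned cq = (result >> 16) + 1;	/* CQs allowed */
		unsigned granted = sq < cq ? sq : cq;

		return granted < requested ? granted : requested;
	}

	int main(void)
	{
		unsigned requested = 8;
		unsigned cdw11 = (requested - 1) | ((requested - 1) << 16);
		unsigned result = 2 | (4 << 16);  /* assumed: 3 SQs, 5 CQs */

		printf("cdw11=%#x -> %u usable I/O queues\n", cdw11,
		       nvme_queues_granted(requested, result));
		return 0;
	}
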
+ */ + if (result > 0) { + dev_err(dev->dev, "Could not set queue count (%d)\n", result); + nr_io_queues = 0; + result = 0; + } if (dev->cmb && NVME_CMB_SQS(dev->cmbsz)) { result = nvme_cmb_qdepth(dev, nr_io_queues, @@ -2457,7 +1507,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) return -ENOMEM; size = db_bar_size(dev, nr_io_queues); } while (1); - dev->dbs = ((void __iomem *)dev->bar) + 4096; + dev->dbs = dev->bar + 4096; adminq->q_db = dev->dbs; } @@ -2501,115 +1551,115 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) /* Free previously allocated queues that are no longer usable */ nvme_free_queues(dev, nr_io_queues + 1); - nvme_create_io_queues(dev); - - return 0; + return nvme_create_io_queues(dev); free_queues: nvme_free_queues(dev, 1); return result; } -static int ns_cmp(void *priv, struct list_head *a, struct list_head *b) +static void nvme_set_irq_hints(struct nvme_dev *dev) { - struct nvme_ns *nsa = container_of(a, struct nvme_ns, list); - struct nvme_ns *nsb = container_of(b, struct nvme_ns, list); + struct nvme_queue *nvmeq; + int i; - return nsa->ns_id - nsb->ns_id; -} + for (i = 0; i < dev->online_queues; i++) { + nvmeq = dev->queues[i]; -static struct nvme_ns *nvme_find_ns(struct nvme_dev *dev, unsigned nsid) -{ - struct nvme_ns *ns; + if (!nvmeq->tags || !(*nvmeq->tags)) + continue; - list_for_each_entry(ns, &dev->namespaces, list) { - if (ns->ns_id == nsid) - return ns; - if (ns->ns_id > nsid) - break; + irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector, + blk_mq_tags_cpumask(*nvmeq->tags)); } - return NULL; } -static inline bool nvme_io_incapable(struct nvme_dev *dev) +static void nvme_dev_scan(struct work_struct *work) { - return (!dev->bar || readl(&dev->bar->csts) & NVME_CSTS_CFS || - dev->online_queues < 2); + struct nvme_dev *dev = container_of(work, struct nvme_dev, scan_work); + + if (!dev->tagset.tags) + return; + nvme_scan_namespaces(&dev->ctrl); + nvme_set_irq_hints(dev); } -static void nvme_ns_remove(struct nvme_ns *ns) +static void nvme_del_queue_end(struct request *req, int error) { - bool kill = nvme_io_incapable(ns->dev) && !blk_queue_dying(ns->queue); + struct nvme_queue *nvmeq = req->end_io_data; - if (kill) { - blk_set_queue_dying(ns->queue); - - /* - * The controller was shutdown first if we got here through - * device removal. The shutdown may requeue outstanding - * requests. These need to be aborted immediately so - * del_gendisk doesn't block indefinitely for their completion. 
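For reference, the CMBSZ decoding in the nvme_map_cmb() hunk a little further up: the register reports a size in units that start at 4 KiB and grow by a factor of 16 per SZU step. A self-contained demo, with the field positions assumed to match the driver's NVME_CMB_SZ/NVME_CMB_SZU macros:

	#include <stdint.h>
	#include <stdio.h>

	/* assumed CMBSZ layout: SZ in bits 31:12, SZU in bits 11:8 */
	#define CMB_SZ(cmbsz)	(((cmbsz) >> 12) & 0xfffff)
	#define CMB_SZU(cmbsz)	(((cmbsz) >> 8) & 0xf)

	int main(void)
	{
		uint32_t cmbsz = (16 << 12) | (0 << 8);	/* 16 units of 4K */
		uint64_t szu = (uint64_t)1 << (12 + 4 * CMB_SZU(cmbsz));
		uint64_t size = szu * CMB_SZ(cmbsz);

		printf("CMB: unit=%llu bytes, total=%llu bytes\n",
		       (unsigned long long)szu, (unsigned long long)size);
		return 0;
	}
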
- */ - blk_mq_abort_requeue_list(ns->queue); - } - if (ns->disk->flags & GENHD_FL_UP) - del_gendisk(ns->disk); - if (kill || !blk_queue_dying(ns->queue)) { - blk_mq_abort_requeue_list(ns->queue); - blk_cleanup_queue(ns->queue); - } - list_del_init(&ns->list); - kref_put(&ns->kref, nvme_free_ns); + blk_mq_free_request(req); + complete(&nvmeq->dev->ioq_wait); } -static void nvme_scan_namespaces(struct nvme_dev *dev, unsigned nn) +static void nvme_del_cq_end(struct request *req, int error) { - struct nvme_ns *ns, *next; - unsigned i; + struct nvme_queue *nvmeq = req->end_io_data; - for (i = 1; i <= nn; i++) { - ns = nvme_find_ns(dev, i); - if (ns) { - if (revalidate_disk(ns->disk)) - nvme_ns_remove(ns); - } else - nvme_alloc_ns(dev, i); - } - list_for_each_entry_safe(ns, next, &dev->namespaces, list) { - if (ns->ns_id > nn) - nvme_ns_remove(ns); + if (!error) { + unsigned long flags; + + spin_lock_irqsave(&nvmeq->q_lock, flags); + nvme_process_cq(nvmeq); + spin_unlock_irqrestore(&nvmeq->q_lock, flags); } - list_sort(NULL, &dev->namespaces, ns_cmp); + + nvme_del_queue_end(req, error); } -static void nvme_set_irq_hints(struct nvme_dev *dev) +static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode) { - struct nvme_queue *nvmeq; - int i; + struct request_queue *q = nvmeq->dev->ctrl.admin_q; + struct request *req; + struct nvme_command cmd; - for (i = 0; i < dev->online_queues; i++) { - nvmeq = dev->queues[i]; + memset(&cmd, 0, sizeof(cmd)); + cmd.delete_queue.opcode = opcode; + cmd.delete_queue.qid = cpu_to_le16(nvmeq->qid); - if (!nvmeq->tags || !(*nvmeq->tags)) - continue; + req = nvme_alloc_request(q, &cmd, BLK_MQ_REQ_NOWAIT); + if (IS_ERR(req)) + return PTR_ERR(req); - irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector, - blk_mq_tags_cpumask(*nvmeq->tags)); - } + req->timeout = ADMIN_TIMEOUT; + req->end_io_data = nvmeq; + + blk_execute_rq_nowait(q, NULL, req, false, + opcode == nvme_admin_delete_cq ? 
+ nvme_del_cq_end : nvme_del_queue_end); + return 0; } -static void nvme_dev_scan(struct work_struct *work) +static void nvme_disable_io_queues(struct nvme_dev *dev) { - struct nvme_dev *dev = container_of(work, struct nvme_dev, scan_work); - struct nvme_id_ctrl *ctrl; + int pass; + unsigned long timeout; + u8 opcode = nvme_admin_delete_sq; - if (!dev->tagset.tags) - return; - if (nvme_identify_ctrl(dev, &ctrl)) - return; - nvme_scan_namespaces(dev, le32_to_cpup(&ctrl->nn)); - kfree(ctrl); - nvme_set_irq_hints(dev); + for (pass = 0; pass < 2; pass++) { + int sent = 0, i = dev->queue_count - 1; + + reinit_completion(&dev->ioq_wait); + retry: + timeout = ADMIN_TIMEOUT; + for (; i > 0; i--) { + struct nvme_queue *nvmeq = dev->queues[i]; + + if (!pass) + nvme_suspend_queue(nvmeq); + if (nvme_delete_queue(nvmeq, opcode)) + break; + ++sent; + } + while (sent--) { + timeout = wait_for_completion_io_timeout(&dev->ioq_wait, timeout); + if (timeout == 0) + return; + if (i) + goto retry; + } + opcode = nvme_admin_delete_cq; + } } /* @@ -2620,42 +1670,7 @@ static void nvme_dev_scan(struct work_struct *work) */ static int nvme_dev_add(struct nvme_dev *dev) { - struct pci_dev *pdev = to_pci_dev(dev->dev); - int res; - struct nvme_id_ctrl *ctrl; - int shift = NVME_CAP_MPSMIN(lo_hi_readq(&dev->bar->cap)) + 12; - - res = nvme_identify_ctrl(dev, &ctrl); - if (res) { - dev_err(dev->dev, "Identify Controller failed (%d)\n", res); - return -EIO; - } - - dev->oncs = le16_to_cpup(&ctrl->oncs); - dev->abort_limit = ctrl->acl + 1; - dev->vwc = ctrl->vwc; - memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn)); - memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn)); - memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr)); - if (ctrl->mdts) - dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9); - else - dev->max_hw_sectors = UINT_MAX; - if ((pdev->vendor == PCI_VENDOR_ID_INTEL) && - (pdev->device == 0x0953) && ctrl->vs[3]) { - unsigned int max_hw_sectors; - - dev->stripe_size = 1 << (ctrl->vs[3] + shift); - max_hw_sectors = dev->stripe_size >> (shift - 9); - if (dev->max_hw_sectors) { - dev->max_hw_sectors = min(max_hw_sectors, - dev->max_hw_sectors); - } else - dev->max_hw_sectors = max_hw_sectors; - } - kfree(ctrl); - - if (!dev->tagset.tags) { + if (!dev->ctrl.tagset) { dev->tagset.ops = &nvme_mq_ops; dev->tagset.nr_hw_queues = dev->online_queues - 1; dev->tagset.timeout = NVME_IO_TIMEOUT; @@ -2668,8 +1683,9 @@ static int nvme_dev_add(struct nvme_dev *dev) if (blk_mq_alloc_tag_set(&dev->tagset)) return 0; + dev->ctrl.tagset = &dev->tagset; } - schedule_work(&dev->scan_work); + queue_work(nvme_workq, &dev->scan_work); return 0; } @@ -2699,7 +1715,7 @@ static int nvme_dev_map(struct nvme_dev *dev) if (!dev->bar) goto disable; - if (readl(&dev->bar->csts) == -1) { + if (readl(dev->bar + NVME_REG_CSTS) == -1) { result = -ENODEV; goto unmap; } @@ -2714,10 +1730,11 @@ static int nvme_dev_map(struct nvme_dev *dev) goto unmap; } - cap = lo_hi_readq(&dev->bar->cap); + cap = lo_hi_readq(dev->bar + NVME_REG_CAP); + dev->q_depth = min_t(int, NVME_CAP_MQES(cap) + 1, NVME_Q_DEPTH); dev->db_stride = 1 << NVME_CAP_STRIDE(cap); - dev->dbs = ((void __iomem *)dev->bar) + 4096; + dev->dbs = dev->bar + 4096; /* * Temporary fix for the Apple controller found in the MacBook8,1 and @@ -2730,9 +1747,11 @@ static int nvme_dev_map(struct nvme_dev *dev) dev->q_depth); } - if (readl(&dev->bar->vs) >= NVME_VS(1, 2)) + if (readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 2)) dev->cmb = nvme_map_cmb(dev); + pci_enable_pcie_error_reporting(pdev); + 
pci_save_state(pdev); return 0; unmap: @@ -2760,152 +1779,34 @@ static void nvme_dev_unmap(struct nvme_dev *dev) pci_release_regions(pdev); } - if (pci_is_enabled(pdev)) + if (pci_is_enabled(pdev)) { + pci_disable_pcie_error_reporting(pdev); pci_disable_device(pdev); -} - -struct nvme_delq_ctx { - struct task_struct *waiter; - struct kthread_worker *worker; - atomic_t refcount; -}; - -static void nvme_wait_dq(struct nvme_delq_ctx *dq, struct nvme_dev *dev) -{ - dq->waiter = current; - mb(); - - for (;;) { - set_current_state(TASK_KILLABLE); - if (!atomic_read(&dq->refcount)) - break; - if (!schedule_timeout(ADMIN_TIMEOUT) || - fatal_signal_pending(current)) { - /* - * Disable the controller first since we can't trust it - * at this point, but leave the admin queue enabled - * until all queue deletion requests are flushed. - * FIXME: This may take a while if there are more h/w - * queues than admin tags. - */ - set_current_state(TASK_RUNNING); - nvme_disable_ctrl(dev, lo_hi_readq(&dev->bar->cap)); - nvme_clear_queue(dev->queues[0]); - flush_kthread_worker(dq->worker); - nvme_disable_queue(dev, 0); - return; - } } - set_current_state(TASK_RUNNING); -} - -static void nvme_put_dq(struct nvme_delq_ctx *dq) -{ - atomic_dec(&dq->refcount); - if (dq->waiter) - wake_up_process(dq->waiter); -} - -static struct nvme_delq_ctx *nvme_get_dq(struct nvme_delq_ctx *dq) -{ - atomic_inc(&dq->refcount); - return dq; -} - -static void nvme_del_queue_end(struct nvme_queue *nvmeq) -{ - struct nvme_delq_ctx *dq = nvmeq->cmdinfo.ctx; - nvme_put_dq(dq); - - spin_lock_irq(&nvmeq->q_lock); - nvme_process_cq(nvmeq); - spin_unlock_irq(&nvmeq->q_lock); -} - -static int adapter_async_del_queue(struct nvme_queue *nvmeq, u8 opcode, - kthread_work_func_t fn) -{ - struct nvme_command c; - - memset(&c, 0, sizeof(c)); - c.delete_queue.opcode = opcode; - c.delete_queue.qid = cpu_to_le16(nvmeq->qid); - - init_kthread_work(&nvmeq->cmdinfo.work, fn); - return nvme_submit_admin_async_cmd(nvmeq->dev, &c, &nvmeq->cmdinfo, - ADMIN_TIMEOUT); -} - -static void nvme_del_cq_work_handler(struct kthread_work *work) -{ - struct nvme_queue *nvmeq = container_of(work, struct nvme_queue, - cmdinfo.work); - nvme_del_queue_end(nvmeq); -} - -static int nvme_delete_cq(struct nvme_queue *nvmeq) -{ - return adapter_async_del_queue(nvmeq, nvme_admin_delete_cq, - nvme_del_cq_work_handler); -} - -static void nvme_del_sq_work_handler(struct kthread_work *work) -{ - struct nvme_queue *nvmeq = container_of(work, struct nvme_queue, - cmdinfo.work); - int status = nvmeq->cmdinfo.status; - - if (!status) - status = nvme_delete_cq(nvmeq); - if (status) - nvme_del_queue_end(nvmeq); -} - -static int nvme_delete_sq(struct nvme_queue *nvmeq) -{ - return adapter_async_del_queue(nvmeq, nvme_admin_delete_sq, - nvme_del_sq_work_handler); } -static void nvme_del_queue_start(struct kthread_work *work) +static int nvme_dev_list_add(struct nvme_dev *dev) { - struct nvme_queue *nvmeq = container_of(work, struct nvme_queue, - cmdinfo.work); - if (nvme_delete_sq(nvmeq)) - nvme_del_queue_end(nvmeq); -} + bool start_thread = false; -static void nvme_disable_io_queues(struct nvme_dev *dev) -{ - int i; - DEFINE_KTHREAD_WORKER_ONSTACK(worker); - struct nvme_delq_ctx dq; - struct task_struct *kworker_task = kthread_run(kthread_worker_fn, - &worker, "nvme%d", dev->instance); - - if (IS_ERR(kworker_task)) { - dev_err(dev->dev, - "Failed to create queue del task\n"); - for (i = dev->queue_count - 1; i > 0; i--) - nvme_disable_queue(dev, i); - return; + spin_lock(&dev_list_lock); + if 
(list_empty(&dev_list) && IS_ERR_OR_NULL(nvme_thread)) { + start_thread = true; + nvme_thread = NULL; } + list_add(&dev->node, &dev_list); + spin_unlock(&dev_list_lock); - dq.waiter = NULL; - atomic_set(&dq.refcount, 0); - dq.worker = &worker; - for (i = dev->queue_count - 1; i > 0; i--) { - struct nvme_queue *nvmeq = dev->queues[i]; + if (start_thread) { + nvme_thread = kthread_run(nvme_kthread, NULL, "nvme"); + wake_up_all(&nvme_kthread_wait); + } else + wait_event_killable(nvme_kthread_wait, nvme_thread); - if (nvme_suspend_queue(nvmeq)) - continue; - nvmeq->cmdinfo.ctx = nvme_get_dq(&dq); - nvmeq->cmdinfo.worker = dq.worker; - init_kthread_work(&nvmeq->cmdinfo.work, nvme_del_queue_start); - queue_kthread_work(dq.worker, &nvmeq->cmdinfo.work); - } - nvme_wait_dq(&dq, dev); - kthread_stop(kworker_task); + if (IS_ERR_OR_NULL(nvme_thread)) + return nvme_thread ? PTR_ERR(nvme_thread) : -EINTR; + + return 0; } /* @@ -2928,44 +1829,17 @@ static void nvme_dev_list_remove(struct nvme_dev *dev) kthread_stop(tmp); } -static void nvme_freeze_queues(struct nvme_dev *dev) -{ - struct nvme_ns *ns; - - list_for_each_entry(ns, &dev->namespaces, list) { - blk_mq_freeze_queue_start(ns->queue); - - spin_lock_irq(ns->queue->queue_lock); - queue_flag_set(QUEUE_FLAG_STOPPED, ns->queue); - spin_unlock_irq(ns->queue->queue_lock); - - blk_mq_cancel_requeue_work(ns->queue); - blk_mq_stop_hw_queues(ns->queue); - } -} - -static void nvme_unfreeze_queues(struct nvme_dev *dev) -{ - struct nvme_ns *ns; - - list_for_each_entry(ns, &dev->namespaces, list) { - queue_flag_clear_unlocked(QUEUE_FLAG_STOPPED, ns->queue); - blk_mq_unfreeze_queue(ns->queue); - blk_mq_start_stopped_hw_queues(ns->queue, true); - blk_mq_kick_requeue_list(ns->queue); - } -} - -static void nvme_dev_shutdown(struct nvme_dev *dev) +static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) { int i; u32 csts = -1; nvme_dev_list_remove(dev); + mutex_lock(&dev->shutdown_lock); if (dev->bar) { - nvme_freeze_queues(dev); - csts = readl(&dev->bar->csts); + nvme_stop_queues(&dev->ctrl); + csts = readl(dev->bar + NVME_REG_CSTS); } if (csts & NVME_CSTS_CFS || !(csts & NVME_CSTS_RDY)) { for (i = dev->queue_count - 1; i >= 0; i--) { @@ -2974,30 +1848,13 @@ static void nvme_dev_shutdown(struct nvme_dev *dev) } } else { nvme_disable_io_queues(dev); - nvme_shutdown_ctrl(dev); - nvme_disable_queue(dev, 0); + nvme_disable_admin_queue(dev, shutdown); } nvme_dev_unmap(dev); for (i = dev->queue_count - 1; i >= 0; i--) nvme_clear_queue(dev->queues[i]); -} - -static void nvme_dev_remove(struct nvme_dev *dev) -{ - struct nvme_ns *ns, *next; - - if (nvme_io_incapable(dev)) { - /* - * If the device is not capable of IO (surprise hot-removal, - * for example), we need to quiesce prior to deleting the - * namespaces. This will end outstanding requests and prevent - * attempts to sync dirty data. 
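nvme_dev_disable() above chooses between two teardown paths based on CSTS: a controller that reports a fatal status, is not ready, or has vanished (its registers read back as all-ones) only gets its queues suspended locally, while a healthy one is sent proper Delete SQ/CQ commands followed by a shutdown or disable. A self-contained sketch of that decision:

	/* Demo only: CSTS bit positions per the NVMe spec. */
	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	#define CSTS_RDY	(1 << 0)
	#define CSTS_CFS	(1 << 1)

	static bool skip_orderly_shutdown(uint32_t csts)
	{
		if (csts == UINT32_MAX)	/* all-ones: surprise-removed */
			return true;
		return (csts & CSTS_CFS) || !(csts & CSTS_RDY);
	}

	int main(void)
	{
		printf("%d %d %d\n",
		       skip_orderly_shutdown(CSTS_RDY),		/* 0 */
		       skip_orderly_shutdown(CSTS_RDY | CSTS_CFS),	/* 1 */
		       skip_orderly_shutdown(UINT32_MAX));		/* 1 */
		return 0;
	}
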
- */ - nvme_dev_shutdown(dev); - } - list_for_each_entry_safe(ns, next, &dev->namespaces, list) - nvme_ns_remove(ns); + mutex_unlock(&dev->shutdown_lock); } static int nvme_setup_prp_pools(struct nvme_dev *dev) @@ -3023,119 +1880,36 @@ static void nvme_release_prp_pools(struct nvme_dev *dev) dma_pool_destroy(dev->prp_small_pool); } -static DEFINE_IDA(nvme_instance_ida); - -static int nvme_set_instance(struct nvme_dev *dev) -{ - int instance, error; - - do { - if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL)) - return -ENODEV; - - spin_lock(&dev_list_lock); - error = ida_get_new(&nvme_instance_ida, &instance); - spin_unlock(&dev_list_lock); - } while (error == -EAGAIN); - - if (error) - return -ENODEV; - - dev->instance = instance; - return 0; -} - -static void nvme_release_instance(struct nvme_dev *dev) +static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl) { - spin_lock(&dev_list_lock); - ida_remove(&nvme_instance_ida, dev->instance); - spin_unlock(&dev_list_lock); -} - -static void nvme_free_dev(struct kref *kref) -{ - struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref); + struct nvme_dev *dev = to_nvme_dev(ctrl); put_device(dev->dev); - put_device(dev->device); - nvme_release_instance(dev); if (dev->tagset.tags) blk_mq_free_tag_set(&dev->tagset); - if (dev->admin_q) - blk_put_queue(dev->admin_q); + if (dev->ctrl.admin_q) + blk_put_queue(dev->ctrl.admin_q); kfree(dev->queues); kfree(dev->entry); kfree(dev); } -static int nvme_dev_open(struct inode *inode, struct file *f) +static void nvme_reset_work(struct work_struct *work) { - struct nvme_dev *dev; - int instance = iminor(inode); - int ret = -ENODEV; - - spin_lock(&dev_list_lock); - list_for_each_entry(dev, &dev_list, node) { - if (dev->instance == instance) { - if (!dev->admin_q) { - ret = -EWOULDBLOCK; - break; - } - if (!kref_get_unless_zero(&dev->kref)) - break; - f->private_data = dev; - ret = 0; - break; - } - } - spin_unlock(&dev_list_lock); - - return ret; -} + struct nvme_dev *dev = container_of(work, struct nvme_dev, reset_work); + int result; -static int nvme_dev_release(struct inode *inode, struct file *f) -{ - struct nvme_dev *dev = f->private_data; - kref_put(&dev->kref, nvme_free_dev); - return 0; -} + if (WARN_ON(test_bit(NVME_CTRL_RESETTING, &dev->flags))) + goto out; -static long nvme_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg) -{ - struct nvme_dev *dev = f->private_data; - struct nvme_ns *ns; - - switch (cmd) { - case NVME_IOCTL_ADMIN_CMD: - return nvme_user_cmd(dev, NULL, (void __user *)arg); - case NVME_IOCTL_IO_CMD: - if (list_empty(&dev->namespaces)) - return -ENOTTY; - ns = list_first_entry(&dev->namespaces, struct nvme_ns, list); - return nvme_user_cmd(dev, ns, (void __user *)arg); - case NVME_IOCTL_RESET: - dev_warn(dev->dev, "resetting controller\n"); - return nvme_reset(dev); - case NVME_IOCTL_SUBSYS_RESET: - return nvme_subsys_reset(dev); - default: - return -ENOTTY; - } -} + /* + * If we're called to reset a live controller first shut it down before + * moving on. 
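The reset path that follows is made single-entry: nvme_reset() further down relies on queue_work() refusing to re-queue a pending work item, and the NVME_CTRL_RESETTING flag set below catches a worker that is already running. An illustrative kernel-style sketch of the combined guard, assuming the flag semantics this patch introduces:

	/* Sketch only, not the patch's code. */
	static int nvme_try_reset(struct nvme_dev *dev)
	{
		if (test_bit(NVME_CTRL_RESETTING, &dev->flags))
			return -EBUSY;	/* reset_work currently running */
		if (!queue_work(nvme_workq, &dev->reset_work))
			return -EBUSY;	/* reset_work already queued */
		return 0;
	}
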
+ */ + if (dev->bar) + nvme_dev_disable(dev, false); -static const struct file_operations nvme_dev_fops = { - .owner = THIS_MODULE, - .open = nvme_dev_open, - .release = nvme_dev_release, - .unlocked_ioctl = nvme_dev_ioctl, - .compat_ioctl = nvme_dev_ioctl, -}; - -static void nvme_probe_work(struct work_struct *work) -{ - struct nvme_dev *dev = container_of(work, struct nvme_dev, probe_work); - bool start_thread = false; - int result; + set_bit(NVME_CTRL_RESETTING, &dev->flags); result = nvme_dev_map(dev); if (result) @@ -3145,35 +1919,24 @@ static void nvme_probe_work(struct work_struct *work) if (result) goto unmap; - spin_lock(&dev_list_lock); - if (list_empty(&dev_list) && IS_ERR_OR_NULL(nvme_thread)) { - start_thread = true; - nvme_thread = NULL; - } - list_add(&dev->node, &dev_list); - spin_unlock(&dev_list_lock); - - if (start_thread) { - nvme_thread = kthread_run(nvme_kthread, NULL, "nvme"); - wake_up_all(&nvme_kthread_wait); - } else - wait_event_killable(nvme_kthread_wait, nvme_thread); - - if (IS_ERR_OR_NULL(nvme_thread)) { - result = nvme_thread ? PTR_ERR(nvme_thread) : -EINTR; - goto disable; - } - nvme_init_queue(dev->queues[0], 0); result = nvme_alloc_admin_tags(dev); if (result) goto disable; + result = nvme_init_identify(&dev->ctrl); + if (result) + goto free_tags; + result = nvme_setup_io_queues(dev); if (result) goto free_tags; - dev->event_limit = 1; + dev->ctrl.event_limit = NVME_NR_AEN_COMMANDS; + + result = nvme_dev_list_add(dev); + if (result) + goto remove; /* * Keep the controller around but remove all namespaces if we don't have @@ -3181,117 +1944,98 @@ static void nvme_probe_work(struct work_struct *work) */ if (dev->online_queues < 2) { dev_warn(dev->dev, "IO queues not created\n"); - nvme_dev_remove(dev); + nvme_remove_namespaces(&dev->ctrl); } else { - nvme_unfreeze_queues(dev); + nvme_start_queues(&dev->ctrl); nvme_dev_add(dev); } + clear_bit(NVME_CTRL_RESETTING, &dev->flags); return; + remove: + nvme_dev_list_remove(dev); free_tags: nvme_dev_remove_admin(dev); - blk_put_queue(dev->admin_q); - dev->admin_q = NULL; + blk_put_queue(dev->ctrl.admin_q); + dev->ctrl.admin_q = NULL; dev->queues[0]->tags = NULL; disable: - nvme_disable_queue(dev, 0); - nvme_dev_list_remove(dev); + nvme_disable_admin_queue(dev, false); unmap: nvme_dev_unmap(dev); out: - if (!work_busy(&dev->reset_work)) - nvme_dead_ctrl(dev); + nvme_remove_dead_ctrl(dev); } -static int nvme_remove_dead_ctrl(void *arg) +static void nvme_remove_dead_ctrl_work(struct work_struct *work) { - struct nvme_dev *dev = (struct nvme_dev *)arg; + struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work); struct pci_dev *pdev = to_pci_dev(dev->dev); if (pci_get_drvdata(pdev)) pci_stop_and_remove_bus_device_locked(pdev); - kref_put(&dev->kref, nvme_free_dev); - return 0; + nvme_put_ctrl(&dev->ctrl); } -static void nvme_dead_ctrl(struct nvme_dev *dev) +static void nvme_remove_dead_ctrl(struct nvme_dev *dev) { - dev_warn(dev->dev, "Device failed to resume\n"); - kref_get(&dev->kref); - if (IS_ERR(kthread_run(nvme_remove_dead_ctrl, dev, "nvme%d", - dev->instance))) { - dev_err(dev->dev, - "Failed to start controller remove task\n"); - kref_put(&dev->kref, nvme_free_dev); - } + dev_warn(dev->dev, "Removing after probe failure\n"); + kref_get(&dev->ctrl.kref); + if (!schedule_work(&dev->remove_work)) + nvme_put_ctrl(&dev->ctrl); } -static void nvme_reset_work(struct work_struct *ws) +static int nvme_reset(struct nvme_dev *dev) { - struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work); - 
bool in_probe = work_busy(&dev->probe_work); - - nvme_dev_shutdown(dev); + if (!dev->ctrl.admin_q || blk_queue_dying(dev->ctrl.admin_q)) + return -ENODEV; - /* Synchronize with device probe so that work will see failure status - * and exit gracefully without trying to schedule another reset */ - flush_work(&dev->probe_work); + if (!queue_work(nvme_workq, &dev->reset_work)) + return -EBUSY; - /* Fail this device if reset occurred during probe to avoid - * infinite initialization loops. */ - if (in_probe) { - nvme_dead_ctrl(dev); - return; - } - /* Schedule device resume asynchronously so the reset work is available - * to clean up errors that may occur during reinitialization */ - schedule_work(&dev->probe_work); + flush_work(&dev->reset_work); + return 0; } -static int __nvme_reset(struct nvme_dev *dev) +static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val) { - if (work_pending(&dev->reset_work)) - return -EBUSY; - list_del_init(&dev->node); - queue_work(nvme_workq, &dev->reset_work); + *val = readl(to_nvme_dev(ctrl)->bar + off); return 0; } -static int nvme_reset(struct nvme_dev *dev) +static int nvme_pci_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val) { - int ret; - - if (!dev->admin_q || blk_queue_dying(dev->admin_q)) - return -ENODEV; - - spin_lock(&dev_list_lock); - ret = __nvme_reset(dev); - spin_unlock(&dev_list_lock); - - if (!ret) { - flush_work(&dev->reset_work); - flush_work(&dev->probe_work); - return 0; - } + writel(val, to_nvme_dev(ctrl)->bar + off); + return 0; +} - return ret; +static int nvme_pci_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val) +{ + *val = readq(to_nvme_dev(ctrl)->bar + off); + return 0; } -static ssize_t nvme_sysfs_reset(struct device *dev, - struct device_attribute *attr, const char *buf, - size_t count) +static bool nvme_pci_io_incapable(struct nvme_ctrl *ctrl) { - struct nvme_dev *ndev = dev_get_drvdata(dev); - int ret; + struct nvme_dev *dev = to_nvme_dev(ctrl); - ret = nvme_reset(ndev); - if (ret < 0) - return ret; + return !dev->bar || dev->online_queues < 2; +} - return count; +static int nvme_pci_reset_ctrl(struct nvme_ctrl *ctrl) +{ + return nvme_reset(to_nvme_dev(ctrl)); } -static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset); + +static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { + .reg_read32 = nvme_pci_reg_read32, + .reg_write32 = nvme_pci_reg_write32, + .reg_read64 = nvme_pci_reg_read64, + .io_incapable = nvme_pci_io_incapable, + .reset_ctrl = nvme_pci_reset_ctrl, + .free_ctrl = nvme_pci_free_ctrl, +}; static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) { @@ -3314,46 +2058,30 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (!dev->queues) goto free; - INIT_LIST_HEAD(&dev->namespaces); - INIT_WORK(&dev->reset_work, nvme_reset_work); dev->dev = get_device(&pdev->dev); pci_set_drvdata(pdev, dev); - result = nvme_set_instance(dev); - if (result) - goto put_pci; + + INIT_LIST_HEAD(&dev->node); + INIT_WORK(&dev->scan_work, nvme_dev_scan); + INIT_WORK(&dev->reset_work, nvme_reset_work); + INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work); + mutex_init(&dev->shutdown_lock); + init_completion(&dev->ioq_wait); result = nvme_setup_prp_pools(dev); if (result) - goto release; - - kref_init(&dev->kref); - dev->device = device_create(nvme_class, &pdev->dev, - MKDEV(nvme_char_major, dev->instance), - dev, "nvme%d", dev->instance); - if (IS_ERR(dev->device)) { - result = PTR_ERR(dev->device); - goto release_pools; - } - get_device(dev->device); - 
dev_set_drvdata(dev->device, dev); + goto put_pci; - result = device_create_file(dev->device, &dev_attr_reset_controller); + result = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops, + id->driver_data); if (result) - goto put_dev; + goto release_pools; - INIT_LIST_HEAD(&dev->node); - INIT_WORK(&dev->scan_work, nvme_dev_scan); - INIT_WORK(&dev->probe_work, nvme_probe_work); - schedule_work(&dev->probe_work); + queue_work(nvme_workq, &dev->reset_work); return 0; - put_dev: - device_destroy(nvme_class, MKDEV(nvme_char_major, dev->instance)); - put_device(dev->device); release_pools: nvme_release_prp_pools(dev); - release: - nvme_release_instance(dev); put_pci: put_device(dev->dev); free: @@ -3368,15 +2096,15 @@ static void nvme_reset_notify(struct pci_dev *pdev, bool prepare) struct nvme_dev *dev = pci_get_drvdata(pdev); if (prepare) - nvme_dev_shutdown(dev); + nvme_dev_disable(dev, false); else - schedule_work(&dev->probe_work); + queue_work(nvme_workq, &dev->reset_work); } static void nvme_shutdown(struct pci_dev *pdev) { struct nvme_dev *dev = pci_get_drvdata(pdev); - nvme_dev_shutdown(dev); + nvme_dev_disable(dev, true); } static void nvme_remove(struct pci_dev *pdev) @@ -3388,34 +2116,25 @@ static void nvme_remove(struct pci_dev *pdev) spin_unlock(&dev_list_lock); pci_set_drvdata(pdev, NULL); - flush_work(&dev->probe_work); flush_work(&dev->reset_work); flush_work(&dev->scan_work); - device_remove_file(dev->device, &dev_attr_reset_controller); - nvme_dev_remove(dev); - nvme_dev_shutdown(dev); + nvme_remove_namespaces(&dev->ctrl); + nvme_uninit_ctrl(&dev->ctrl); + nvme_dev_disable(dev, true); nvme_dev_remove_admin(dev); - device_destroy(nvme_class, MKDEV(nvme_char_major, dev->instance)); nvme_free_queues(dev, 0); nvme_release_cmb(dev); nvme_release_prp_pools(dev); - kref_put(&dev->kref, nvme_free_dev); + nvme_put_ctrl(&dev->ctrl); } -/* These functions are yet to be implemented */ -#define nvme_error_detected NULL -#define nvme_dump_registers NULL -#define nvme_link_reset NULL -#define nvme_slot_reset NULL -#define nvme_error_resume NULL - #ifdef CONFIG_PM_SLEEP static int nvme_suspend(struct device *dev) { struct pci_dev *pdev = to_pci_dev(dev); struct nvme_dev *ndev = pci_get_drvdata(pdev); - nvme_dev_shutdown(ndev); + nvme_dev_disable(ndev, true); return 0; } @@ -3424,17 +2143,53 @@ static int nvme_resume(struct device *dev) struct pci_dev *pdev = to_pci_dev(dev); struct nvme_dev *ndev = pci_get_drvdata(pdev); - schedule_work(&ndev->probe_work); + queue_work(nvme_workq, &ndev->reset_work); return 0; } #endif static SIMPLE_DEV_PM_OPS(nvme_dev_pm_ops, nvme_suspend, nvme_resume); +static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev, + pci_channel_state_t state) +{ + struct nvme_dev *dev = pci_get_drvdata(pdev); + + /* + * A frozen channel requires a reset. When detected, this method will + * shutdown the controller to quiesce. The controller will be restarted + * after the slot reset through driver's slot_reset callback. 
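The AER callbacks being added here implement the standard PCI error-recovery handshake: error_detected() quiesces the device and asks for a reset, slot_reset() restores config space and schedules the controller reset, and resume() clears the uncorrectable error status. A self-contained sketch of the state-to-action mapping used by the switch that continues below (enum values assumed to mirror pci_channel_state):

	#include <stdio.h>

	enum chan_state { chan_normal = 1, chan_frozen, chan_perm_failure };

	static const char *nvme_aer_plan(enum chan_state state)
	{
		switch (state) {
		case chan_normal:	return "CAN_RECOVER";
		case chan_frozen:	return "disable controller, NEED_RESET";
		case chan_perm_failure:	return "DISCONNECT";
		}
		return "NEED_RESET";
	}

	int main(void)
	{
		printf("frozen -> %s\n", nvme_aer_plan(chan_frozen));
		return 0;
	}
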
+ */ + dev_warn(&pdev->dev, "error detected: state:%d\n", state); + switch (state) { + case pci_channel_io_normal: + return PCI_ERS_RESULT_CAN_RECOVER; + case pci_channel_io_frozen: + nvme_dev_disable(dev, false); + return PCI_ERS_RESULT_NEED_RESET; + case pci_channel_io_perm_failure: + return PCI_ERS_RESULT_DISCONNECT; + } + return PCI_ERS_RESULT_NEED_RESET; +} + +static pci_ers_result_t nvme_slot_reset(struct pci_dev *pdev) +{ + struct nvme_dev *dev = pci_get_drvdata(pdev); + + dev_info(&pdev->dev, "restart after slot reset\n"); + pci_restore_state(pdev); + queue_work(nvme_workq, &dev->reset_work); + return PCI_ERS_RESULT_RECOVERED; +} + +static void nvme_error_resume(struct pci_dev *pdev) +{ + pci_cleanup_aer_uncorrect_error_status(pdev); +} + static const struct pci_error_handlers nvme_err_handler = { .error_detected = nvme_error_detected, - .mmio_enabled = nvme_dump_registers, - .link_reset = nvme_link_reset, .slot_reset = nvme_slot_reset, .resume = nvme_error_resume, .reset_notify = nvme_reset_notify, @@ -3444,6 +2199,10 @@ static const struct pci_error_handlers nvme_err_handler = { #define PCI_CLASS_STORAGE_EXPRESS 0x010802 static const struct pci_device_id nvme_id_table[] = { + { PCI_VDEVICE(INTEL, 0x0953), + .driver_data = NVME_QUIRK_STRIPE_SIZE, }, + { PCI_VDEVICE(INTEL, 0x5845), /* Qemu emulated controller */ + .driver_data = NVME_QUIRK_IDENTIFY_CNS, }, { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) }, { 0, } @@ -3468,40 +2227,21 @@ static int __init nvme_init(void) init_waitqueue_head(&nvme_kthread_wait); - nvme_workq = create_singlethread_workqueue("nvme"); + nvme_workq = alloc_workqueue("nvme", WQ_UNBOUND | WQ_MEM_RECLAIM, 0); if (!nvme_workq) return -ENOMEM; - result = register_blkdev(nvme_major, "nvme"); + result = nvme_core_init(); if (result < 0) goto kill_workq; - else if (result > 0) - nvme_major = result; - - result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme", - &nvme_dev_fops); - if (result < 0) - goto unregister_blkdev; - else if (result > 0) - nvme_char_major = result; - - nvme_class = class_create(THIS_MODULE, "nvme"); - if (IS_ERR(nvme_class)) { - result = PTR_ERR(nvme_class); - goto unregister_chrdev; - } result = pci_register_driver(&nvme_driver); if (result) - goto destroy_class; + goto core_exit; return 0; - destroy_class: - class_destroy(nvme_class); - unregister_chrdev: - __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); - unregister_blkdev: - unregister_blkdev(nvme_major, "nvme"); + core_exit: + nvme_core_exit(); kill_workq: destroy_workqueue(nvme_workq); return result; @@ -3510,10 +2250,8 @@ static int __init nvme_init(void) static void __exit nvme_exit(void) { pci_unregister_driver(&nvme_driver); - unregister_blkdev(nvme_major, "nvme"); + nvme_core_exit(); destroy_workqueue(nvme_workq); - class_destroy(nvme_class); - __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); BUG_ON(nvme_thread && !IS_ERR(nvme_thread)); _nvme_check_size(); } diff --git a/drivers/nvme/host/scsi.c b/drivers/nvme/host/scsi.c index c3d8d3887a31..e947e298a737 100644 --- a/drivers/nvme/host/scsi.c +++ b/drivers/nvme/host/scsi.c @@ -524,7 +524,7 @@ static int nvme_trans_standard_inquiry_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 *inq_response, int alloc_len) { - struct nvme_dev *dev = ns->dev; + struct nvme_ctrl *ctrl = ns->ctrl; struct nvme_id_ns *id_ns; int res; int nvme_sc; @@ -532,10 +532,10 @@ static int nvme_trans_standard_inquiry_page(struct nvme_ns *ns, u8 resp_data_format = 0x02; 
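The scsi.c hunks that follow translate SCSI INQUIRY into NVMe Identify data. For orientation, a self-contained sketch of the standard-INQUIRY byte layout that nvme_trans_standard_inquiry_page() fills in; the model and firmware strings here are made up:

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		char inq[36] = {0};

		inq[3] = 0x02;			/* response data format */
		inq[7] = 0x01 << 1;		/* CMDQUE: queueing supported */
		memcpy(&inq[8], "NVMe    ", 8);	/* T10 vendor identification */
		memcpy(&inq[16], "ExampleModel    ", 16);   /* ctrl->model */
		memcpy(&inq[32], "1.0 ", 4);	/* fw rev, trailing blanks cut */

		printf("vendor=%.8s model=%.16s fw=%.4s\n",
		       &inq[8], &inq[16], &inq[32]);
		return 0;
	}
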
u8 protect; u8 cmdque = 0x01 << 1; - u8 fw_offset = sizeof(dev->firmware_rev); + u8 fw_offset = sizeof(ctrl->firmware_rev); /* nvme ns identify - use DPS value for PROTECT field */ - nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns); + nvme_sc = nvme_identify_ns(ctrl, ns->ns_id, &id_ns); res = nvme_trans_status_code(hdr, nvme_sc); if (res) return res; @@ -553,12 +553,12 @@ static int nvme_trans_standard_inquiry_page(struct nvme_ns *ns, inq_response[5] = protect; /* sccs=0 | acc=0 | tpgs=0 | pc3=0 */ inq_response[7] = cmdque; /* wbus16=0 | sync=0 | vs=0 */ strncpy(&inq_response[8], "NVMe ", 8); - strncpy(&inq_response[16], dev->model, 16); + strncpy(&inq_response[16], ctrl->model, 16); - while (dev->firmware_rev[fw_offset - 1] == ' ' && fw_offset > 4) + while (ctrl->firmware_rev[fw_offset - 1] == ' ' && fw_offset > 4) fw_offset--; fw_offset -= 4; - strncpy(&inq_response[32], dev->firmware_rev + fw_offset, 4); + strncpy(&inq_response[32], ctrl->firmware_rev + fw_offset, 4); xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH); return nvme_trans_copy_to_user(hdr, inq_response, xfer_len); @@ -588,82 +588,113 @@ static int nvme_trans_unit_serial_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 *inq_response, int alloc_len) { - struct nvme_dev *dev = ns->dev; int xfer_len; memset(inq_response, 0, STANDARD_INQUIRY_LENGTH); inq_response[1] = INQ_UNIT_SERIAL_NUMBER_PAGE; /* Page Code */ inq_response[3] = INQ_SERIAL_NUMBER_LENGTH; /* Page Length */ - strncpy(&inq_response[4], dev->serial, INQ_SERIAL_NUMBER_LENGTH); + strncpy(&inq_response[4], ns->ctrl->serial, INQ_SERIAL_NUMBER_LENGTH); xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH); return nvme_trans_copy_to_user(hdr, inq_response, xfer_len); } -static int nvme_trans_device_id_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *inq_response, int alloc_len) +static int nvme_fill_device_id_eui64(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *inq_response, int alloc_len) { - struct nvme_dev *dev = ns->dev; - int res; - int nvme_sc; - int xfer_len; - __be32 tmp_id = cpu_to_be32(ns->ns_id); + struct nvme_id_ns *id_ns; + int nvme_sc, res; + size_t len; + void *eui; - memset(inq_response, 0, alloc_len); - inq_response[1] = INQ_DEVICE_IDENTIFICATION_PAGE; /* Page Code */ - if (readl(&dev->bar->vs) >= NVME_VS(1, 1)) { - struct nvme_id_ns *id_ns; - void *eui; - int len; + nvme_sc = nvme_identify_ns(ns->ctrl, ns->ns_id, &id_ns); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + return res; - nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - return res; + eui = id_ns->eui64; + len = sizeof(id_ns->eui64); - eui = id_ns->eui64; - len = sizeof(id_ns->eui64); - if (readl(&dev->bar->vs) >= NVME_VS(1, 2)) { - if (bitmap_empty(eui, len * 8)) { - eui = id_ns->nguid; - len = sizeof(id_ns->nguid); - } - } + if (ns->ctrl->vs >= NVME_VS(1, 2)) { if (bitmap_empty(eui, len * 8)) { - kfree(id_ns); - goto scsi_string; + eui = id_ns->nguid; + len = sizeof(id_ns->nguid); } + } - inq_response[3] = 4 + len; /* Page Length */ - /* Designation Descriptor start */ - inq_response[4] = 0x01; /* Proto ID=0h | Code set=1h */ - inq_response[5] = 0x02; /* PIV=0b | Asso=00b | Designator Type=2h */ - inq_response[6] = 0x00; /* Rsvd */ - inq_response[7] = len; /* Designator Length */ - memcpy(&inq_response[8], eui, len); - kfree(id_ns); - } else { - scsi_string: - if (alloc_len < 72) { - return nvme_trans_completion(hdr, - SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - 
SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - } - inq_response[3] = 0x48; /* Page Length */ - /* Designation Descriptor start */ - inq_response[4] = 0x03; /* Proto ID=0h | Code set=3h */ - inq_response[5] = 0x08; /* PIV=0b | Asso=00b | Designator Type=8h */ - inq_response[6] = 0x00; /* Rsvd */ - inq_response[7] = 0x44; /* Designator Length */ - - sprintf(&inq_response[8], "%04x", to_pci_dev(dev->dev)->vendor); - memcpy(&inq_response[12], dev->model, sizeof(dev->model)); - sprintf(&inq_response[52], "%04x", tmp_id); - memcpy(&inq_response[56], dev->serial, sizeof(dev->serial)); + if (bitmap_empty(eui, len * 8)) { + res = -EOPNOTSUPP; + goto out_free_id; } - xfer_len = alloc_len; - return nvme_trans_copy_to_user(hdr, inq_response, xfer_len); + + memset(inq_response, 0, alloc_len); + inq_response[1] = INQ_DEVICE_IDENTIFICATION_PAGE; + inq_response[3] = 4 + len; /* Page Length */ + + /* Designation Descriptor start */ + inq_response[4] = 0x01; /* Proto ID=0h | Code set=1h */ + inq_response[5] = 0x02; /* PIV=0b | Asso=00b | Designator Type=2h */ + inq_response[6] = 0x00; /* Rsvd */ + inq_response[7] = len; /* Designator Length */ + memcpy(&inq_response[8], eui, len); + + res = nvme_trans_copy_to_user(hdr, inq_response, alloc_len); +out_free_id: + kfree(id_ns); + return res; +} + +static int nvme_fill_device_id_scsi_string(struct nvme_ns *ns, + struct sg_io_hdr *hdr, u8 *inq_response, int alloc_len) +{ + struct nvme_ctrl *ctrl = ns->ctrl; + struct nvme_id_ctrl *id_ctrl; + int nvme_sc, res; + + if (alloc_len < 72) { + return nvme_trans_completion(hdr, + SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + } + + nvme_sc = nvme_identify_ctrl(ctrl, &id_ctrl); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + return res; + + memset(inq_response, 0, alloc_len); + inq_response[1] = INQ_DEVICE_IDENTIFICATION_PAGE; + inq_response[3] = 0x48; /* Page Length */ + + /* Designation Descriptor start */ + inq_response[4] = 0x03; /* Proto ID=0h | Code set=3h */ + inq_response[5] = 0x08; /* PIV=0b | Asso=00b | Designator Type=8h */ + inq_response[6] = 0x00; /* Rsvd */ + inq_response[7] = 0x44; /* Designator Length */ + + sprintf(&inq_response[8], "%04x", le16_to_cpu(id_ctrl->vid)); + memcpy(&inq_response[12], ctrl->model, sizeof(ctrl->model)); + sprintf(&inq_response[52], "%04x", cpu_to_be32(ns->ns_id)); + memcpy(&inq_response[56], ctrl->serial, sizeof(ctrl->serial)); + + res = nvme_trans_copy_to_user(hdr, inq_response, alloc_len); + kfree(id_ctrl); + return res; +} + +static int nvme_trans_device_id_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *resp, int alloc_len) +{ + int res; + + if (ns->ctrl->vs >= NVME_VS(1, 1)) { + res = nvme_fill_device_id_eui64(ns, hdr, resp, alloc_len); + if (res != -EOPNOTSUPP) + return res; + } + + return nvme_fill_device_id_scsi_string(ns, hdr, resp, alloc_len); } static int nvme_trans_ext_inq_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, @@ -672,7 +703,7 @@ static int nvme_trans_ext_inq_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 *inq_response; int res; int nvme_sc; - struct nvme_dev *dev = ns->dev; + struct nvme_ctrl *ctrl = ns->ctrl; struct nvme_id_ctrl *id_ctrl; struct nvme_id_ns *id_ns; int xfer_len; @@ -688,7 +719,7 @@ static int nvme_trans_ext_inq_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, if (inq_response == NULL) return -ENOMEM; - nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns); + nvme_sc = nvme_identify_ns(ctrl, ns->ns_id, &id_ns); res = nvme_trans_status_code(hdr, nvme_sc); if (res) goto 
out_free_inq; @@ -704,7 +735,7 @@ static int nvme_trans_ext_inq_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, app_chk = protect << 1; ref_chk = protect; - nvme_sc = nvme_identify_ctrl(dev, &id_ctrl); + nvme_sc = nvme_identify_ctrl(ctrl, &id_ctrl); res = nvme_trans_status_code(hdr, nvme_sc); if (res) goto out_free_inq; @@ -815,7 +846,6 @@ static int nvme_trans_log_info_exceptions(struct nvme_ns *ns, int res; int xfer_len; u8 *log_response; - struct nvme_dev *dev = ns->dev; struct nvme_smart_log *smart_log; u8 temp_c; u16 temp_k; @@ -824,7 +854,7 @@ static int nvme_trans_log_info_exceptions(struct nvme_ns *ns, if (log_response == NULL) return -ENOMEM; - res = nvme_get_log_page(dev, &smart_log); + res = nvme_get_log_page(ns->ctrl, &smart_log); if (res < 0) goto out_free_response; @@ -862,7 +892,6 @@ static int nvme_trans_log_temperature(struct nvme_ns *ns, struct sg_io_hdr *hdr, int res; int xfer_len; u8 *log_response; - struct nvme_dev *dev = ns->dev; struct nvme_smart_log *smart_log; u32 feature_resp; u8 temp_c_cur, temp_c_thresh; @@ -872,7 +901,7 @@ static int nvme_trans_log_temperature(struct nvme_ns *ns, struct sg_io_hdr *hdr, if (log_response == NULL) return -ENOMEM; - res = nvme_get_log_page(dev, &smart_log); + res = nvme_get_log_page(ns->ctrl, &smart_log); if (res < 0) goto out_free_response; @@ -886,7 +915,7 @@ static int nvme_trans_log_temperature(struct nvme_ns *ns, struct sg_io_hdr *hdr, kfree(smart_log); /* Get Features for Temp Threshold */ - res = nvme_get_features(dev, NVME_FEAT_TEMP_THRESH, 0, 0, + res = nvme_get_features(ns->ctrl, NVME_FEAT_TEMP_THRESH, 0, 0, &feature_resp); if (res != NVME_SC_SUCCESS) temp_c_thresh = LOG_TEMP_UNKNOWN; @@ -948,7 +977,6 @@ static int nvme_trans_fill_blk_desc(struct nvme_ns *ns, struct sg_io_hdr *hdr, { int res; int nvme_sc; - struct nvme_dev *dev = ns->dev; struct nvme_id_ns *id_ns; u8 flbas; u32 lba_length; @@ -958,7 +986,7 @@ static int nvme_trans_fill_blk_desc(struct nvme_ns *ns, struct sg_io_hdr *hdr, else if (llbaa > 0 && len < MODE_PAGE_LLBAA_BLK_DES_LEN) return -EINVAL; - nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns); + nvme_sc = nvme_identify_ns(ns->ctrl, ns->ns_id, &id_ns); res = nvme_trans_status_code(hdr, nvme_sc); if (res) return res; @@ -1014,14 +1042,13 @@ static int nvme_trans_fill_caching_page(struct nvme_ns *ns, { int res = 0; int nvme_sc; - struct nvme_dev *dev = ns->dev; u32 feature_resp; u8 vwc; if (len < MODE_PAGE_CACHING_LEN) return -EINVAL; - nvme_sc = nvme_get_features(dev, NVME_FEAT_VOLATILE_WC, 0, 0, + nvme_sc = nvme_get_features(ns->ctrl, NVME_FEAT_VOLATILE_WC, 0, 0, &feature_resp); res = nvme_trans_status_code(hdr, nvme_sc); if (res) @@ -1207,12 +1234,11 @@ static int nvme_trans_power_state(struct nvme_ns *ns, struct sg_io_hdr *hdr, { int res; int nvme_sc; - struct nvme_dev *dev = ns->dev; struct nvme_id_ctrl *id_ctrl; int lowest_pow_st; /* max npss = lowest power consumption */ unsigned ps_desired = 0; - nvme_sc = nvme_identify_ctrl(dev, &id_ctrl); + nvme_sc = nvme_identify_ctrl(ns->ctrl, &id_ctrl); res = nvme_trans_status_code(hdr, nvme_sc); if (res) return res; @@ -1256,7 +1282,7 @@ static int nvme_trans_power_state(struct nvme_ns *ns, struct sg_io_hdr *hdr, SCSI_ASCQ_CAUSE_NOT_REPORTABLE); break; } - nvme_sc = nvme_set_features(dev, NVME_FEAT_POWER_MGMT, ps_desired, 0, + nvme_sc = nvme_set_features(ns->ctrl, NVME_FEAT_POWER_MGMT, ps_desired, 0, NULL); return nvme_trans_status_code(hdr, nvme_sc); } @@ -1280,7 +1306,6 @@ static int nvme_trans_send_download_fw_cmd(struct nvme_ns *ns, struct sg_io_hdr u8 
buffer_id) { int nvme_sc; - struct nvme_dev *dev = ns->dev; struct nvme_command c; if (hdr->iovec_count > 0) { @@ -1297,7 +1322,7 @@ static int nvme_trans_send_download_fw_cmd(struct nvme_ns *ns, struct sg_io_hdr c.dlfw.numd = cpu_to_le32((tot_len/BYTES_TO_DWORDS) - 1); c.dlfw.offset = cpu_to_le32(offset/BYTES_TO_DWORDS); - nvme_sc = __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, + nvme_sc = nvme_submit_user_cmd(ns->ctrl->admin_q, &c, hdr->dxferp, tot_len, NULL, 0); return nvme_trans_status_code(hdr, nvme_sc); } @@ -1364,14 +1389,13 @@ static int nvme_trans_modesel_get_mp(struct nvme_ns *ns, struct sg_io_hdr *hdr, { int res = 0; int nvme_sc; - struct nvme_dev *dev = ns->dev; unsigned dword11; switch (page_code) { case MODE_PAGE_CACHING: dword11 = ((mode_page[2] & CACHING_MODE_PAGE_WCE_MASK) ? 1 : 0); - nvme_sc = nvme_set_features(dev, NVME_FEAT_VOLATILE_WC, dword11, - 0, NULL); + nvme_sc = nvme_set_features(ns->ctrl, NVME_FEAT_VOLATILE_WC, + dword11, 0, NULL); res = nvme_trans_status_code(hdr, nvme_sc); break; case MODE_PAGE_CONTROL: @@ -1473,7 +1497,6 @@ static int nvme_trans_fmt_set_blk_size_count(struct nvme_ns *ns, { int res = 0; int nvme_sc; - struct nvme_dev *dev = ns->dev; u8 flbas; /* @@ -1486,7 +1509,7 @@ static int nvme_trans_fmt_set_blk_size_count(struct nvme_ns *ns, if (ns->mode_select_num_blocks == 0 || ns->mode_select_block_len == 0) { struct nvme_id_ns *id_ns; - nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns); + nvme_sc = nvme_identify_ns(ns->ctrl, ns->ns_id, &id_ns); res = nvme_trans_status_code(hdr, nvme_sc); if (res) return res; @@ -1570,7 +1593,6 @@ static int nvme_trans_fmt_send_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr, { int res; int nvme_sc; - struct nvme_dev *dev = ns->dev; struct nvme_id_ns *id_ns; u8 i; u8 flbas, nlbaf; @@ -1579,7 +1601,7 @@ static int nvme_trans_fmt_send_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr, struct nvme_command c; /* Loop thru LBAF's in id_ns to match reqd lbaf, put in cdw10 */ - nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns); + nvme_sc = nvme_identify_ns(ns->ctrl, ns->ns_id, &id_ns); res = nvme_trans_status_code(hdr, nvme_sc); if (res) return res; @@ -1611,7 +1633,7 @@ static int nvme_trans_fmt_send_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr, c.format.nsid = cpu_to_le32(ns->ns_id); c.format.cdw10 = cpu_to_le32(cdw10); - nvme_sc = nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0); + nvme_sc = nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, NULL, 0); res = nvme_trans_status_code(hdr, nvme_sc); kfree(id_ns); @@ -1704,7 +1726,7 @@ static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr, nvme_sc = NVME_SC_LBA_RANGE; break; } - nvme_sc = __nvme_submit_sync_cmd(ns->queue, &c, NULL, + nvme_sc = nvme_submit_user_cmd(ns->queue, &c, next_mapping_addr, unit_len, NULL, 0); if (nvme_sc) break; @@ -2040,7 +2062,6 @@ static int nvme_trans_read_capacity(struct nvme_ns *ns, struct sg_io_hdr *hdr, u32 alloc_len; u32 resp_size; u32 xfer_len; - struct nvme_dev *dev = ns->dev; struct nvme_id_ns *id_ns; u8 *response; @@ -2052,7 +2073,7 @@ static int nvme_trans_read_capacity(struct nvme_ns *ns, struct sg_io_hdr *hdr, resp_size = READ_CAP_10_RESP_SIZE; } - nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns); + nvme_sc = nvme_identify_ns(ns->ctrl, ns->ns_id, &id_ns); res = nvme_trans_status_code(hdr, nvme_sc); if (res) return res; @@ -2080,7 +2101,6 @@ static int nvme_trans_report_luns(struct nvme_ns *ns, struct sg_io_hdr *hdr, int nvme_sc; u32 alloc_len, xfer_len, resp_size; u8 *response; - struct nvme_dev *dev = ns->dev; struct 
nvme_id_ctrl *id_ctrl; u32 ll_length, lun_id; u8 lun_id_offset = REPORT_LUNS_FIRST_LUN_OFFSET; @@ -2094,7 +2114,7 @@ case ALL_LUNS_RETURNED: case ALL_WELL_KNOWN_LUNS_RETURNED: case RESTRICTED_LUNS_RETURNED: - nvme_sc = nvme_identify_ctrl(dev, &id_ctrl); + nvme_sc = nvme_identify_ctrl(ns->ctrl, &id_ctrl); res = nvme_trans_status_code(hdr, nvme_sc); if (res) return res; @@ -2295,9 +2315,7 @@ static int nvme_trans_test_unit_ready(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 *cmd) { - struct nvme_dev *dev = ns->dev; - - if (!(readl(&dev->bar->csts) & NVME_CSTS_RDY)) + if (!nvme_ctrl_ready(ns->ctrl)) return nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, NOT_READY, SCSI_ASC_LUN_NOT_READY, SCSI_ASCQ_CAUSE_NOT_REPORTABLE); diff --git a/drivers/oprofile/oprofilefs.c b/drivers/oprofile/oprofilefs.c index dd92c5edf219..b48ac6300c79 100644 --- a/drivers/oprofile/oprofilefs.c +++ b/drivers/oprofile/oprofilefs.c @@ -138,22 +138,22 @@ static int __oprofilefs_create_file(struct dentry *root, char const *name, struct dentry *dentry; struct inode *inode; - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); dentry = d_alloc_name(root, name); if (!dentry) { - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); return -ENOMEM; } inode = oprofilefs_get_inode(root->d_sb, S_IFREG | perm); if (!inode) { dput(dentry); - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); return -ENOMEM; } inode->i_fop = fops; inode->i_private = priv; d_add(dentry, inode); - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); return 0; } @@ -215,22 +215,22 @@ struct dentry *oprofilefs_mkdir(struct dentry *parent, char const *name) struct dentry *dentry; struct inode *inode; - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); dentry = d_alloc_name(parent, name); if (!dentry) { - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); return NULL; } inode = oprofilefs_get_inode(parent->d_sb, S_IFDIR | 0755); if (!inode) { dput(dentry); - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); return NULL; } inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; d_add(dentry, inode); - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); return dentry; } diff --git a/drivers/scsi/3w-xxxx.c b/drivers/scsi/3w-xxxx.c index 2940bd769936..25aba1613e21 100644 --- a/drivers/scsi/3w-xxxx.c +++ b/drivers/scsi/3w-xxxx.c @@ -1045,6 +1045,9 @@ static int tw_chrdev_open(struct inode *inode, struct file *file) static const struct file_operations tw_fops = { .owner = THIS_MODULE, .unlocked_ioctl = tw_chrdev_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = tw_chrdev_ioctl, +#endif .open = tw_chrdev_open, .release = NULL, .llseek = noop_llseek, diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig index c1fe0d2f90ca..e2f31c93717d 100644 --- a/drivers/scsi/Kconfig +++ b/drivers/scsi/Kconfig @@ -1106,6 +1106,7 @@ config SCSI_IPR tristate "IBM Power Linux RAID adapter support" depends on PCI && SCSI && ATA select FW_LOADER + select IRQ_POLL ---help--- This driver supports the IBM Power Linux family RAID adapters. This includes IBM pSeries 5712, 5703, 5709, and 570A, as well @@ -1620,23 +1621,6 @@ config ATARI_SCSI ST-DMA, replacing ACSI). It does NOT support other schemes, like in the Hades (without DMA).
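The oprofilefs hunks above are part of the tree-wide switch away from touching ->i_mutex directly. At the point of this series the new helpers are thin wrappers around the old mutex, roughly as defined in include/linux/fs.h:

	static inline void inode_lock(struct inode *inode)
	{
		mutex_lock(&inode->i_mutex);
	}

	static inline void inode_unlock(struct inode *inode)
	{
		mutex_unlock(&inode->i_mutex);
	}

The indirection is what lets the VFS later change the underlying primitive without touching every caller again.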
-config ATARI_SCSI_TOSHIBA_DELAY - bool "Long delays for Toshiba CD-ROMs" - depends on ATARI_SCSI - help - This option increases the delay after a SCSI arbitration to - accommodate some flaky Toshiba CD-ROM drives. Say Y if you intend to - use a Toshiba CD-ROM drive; otherwise, the option is not needed and - would impact performance a bit, so say N. - -config ATARI_SCSI_RESET_BOOT - bool "Reset SCSI-devices at boottime" - depends on ATARI_SCSI - help - Reset the devices on your Atari whenever it boots. This makes the - boot process fractionally longer but may assist recovery from errors - that leave the devices with SCSI operations partway completed. - config MAC_SCSI tristate "Macintosh NCR5380 SCSI" depends on MAC && SCSI=y diff --git a/drivers/scsi/NCR5380.c b/drivers/scsi/NCR5380.c index a777e5c412df..d72867257346 100644 --- a/drivers/scsi/NCR5380.c +++ b/drivers/scsi/NCR5380.c @@ -1,17 +1,17 @@ -/* +/* * NCR 5380 generic driver routines. These should make it *trivial* - * to implement 5380 SCSI drivers under Linux with a non-trantor - * architecture. + * to implement 5380 SCSI drivers under Linux with a non-trantor + * architecture. * - * Note that these routines also work with NR53c400 family chips. + * Note that these routines also work with NR53c400 family chips. * * Copyright 1993, Drew Eckhardt - * Visionary Computing - * (Unix and Linux consulting and custom programming) - * drew@colorado.edu - * +1 (303) 666-5836 + * Visionary Computing + * (Unix and Linux consulting and custom programming) + * drew@colorado.edu + * +1 (303) 666-5836 * - * For more information, please consult + * For more information, please consult * * NCR 5380 Family * SCSI Protocol Controller @@ -25,84 +25,28 @@ */ /* - * Revision 1.10 1998/9/2 Alan Cox - * (alan@lxorguk.ukuu.org.uk) - * Fixed up the timer lockups reported so far. Things still suck. Looking - * forward to 2.3 and per device request queues. Then it'll be possible to - * SMP thread this beast and improve life no end. - - * Revision 1.9 1997/7/27 Ronald van Cuijlenborg - * (ronald.van.cuijlenborg@tip.nl or nutty@dds.nl) - * (hopefully) fixed and enhanced USLEEP - * added support for DTC3181E card (for Mustek scanner) - * - - * Revision 1.8 Ingmar Baumgart - * (ingmar@gonzo.schwaben.de) - * added support for NCR53C400a card - * - - * Revision 1.7 1996/3/2 Ray Van Tassle (rayvt@comm.mot.com) - * added proc_info - * added support needed for DTC 3180/3280 - * fixed a couple of bugs - * - - * Revision 1.5 1994/01/19 09:14:57 drew - * Fixed udelay() hack that was being used on DATAOUT phases - * instead of a proper wait for the final handshake. - * - * Revision 1.4 1994/01/19 06:44:25 drew - * *** empty log message *** - * - * Revision 1.3 1994/01/19 05:24:40 drew - * Added support for TCR LAST_BYTE_SENT bit. - * - * Revision 1.2 1994/01/15 06:14:11 drew - * REAL DMA support, bug fixes. - * - * Revision 1.1 1994/01/15 06:00:54 drew - * Initial revision - * + * With contributions from Ray Van Tassle, Ingmar Baumgart, + * Ronald van Cuijlenborg, Alan Cox and others. */ /* - * Further development / testing that should be done : + * Further development / testing that should be done : * 1. Cleanup the NCR5380_transfer_dma function and DMA operation complete - * code so that everything does the same thing that's done at the - * end of a pseudo-DMA read operation. + * code so that everything does the same thing that's done at the + * end of a pseudo-DMA read operation. * * 2. 
Fix REAL_DMA (interrupt driven, polled works fine) - - * basically, transfer size needs to be reduced by one - * and the last byte read as is done with PSEUDO_DMA. - * - * 4. Test SCSI-II tagged queueing (I have no devices which support - * tagged queueing) - * - * 5. Test linked command handling code after Eric is ready with - * the high level code. + * basically, transfer size needs to be reduced by one + * and the last byte read as is done with PSEUDO_DMA. + * + * 4. Test SCSI-II tagged queueing (I have no devices which support + * tagged queueing) */ -#include <scsi/scsi_dbg.h> -#include <scsi/scsi_transport_spi.h> - -#if (NDEBUG & NDEBUG_LISTS) -#define LIST(x,y) {printk("LINE:%d Adding %p to %p\n", __LINE__, (void*)(x), (void*)(y)); if ((x)==(y)) udelay(5); } -#define REMOVE(w,x,y,z) {printk("LINE:%d Removing: %p->%p %p->%p \n", __LINE__, (void*)(w), (void*)(x), (void*)(y), (void*)(z)); if ((x)==(y)) udelay(5); } -#else -#define LIST(x,y) -#define REMOVE(w,x,y,z) -#endif #ifndef notyet -#undef LINKED #undef REAL_DMA #endif -#ifdef REAL_DMA_POLL -#undef READ_OVERRUNS -#define READ_OVERRUNS -#endif - #ifdef BOARD_REQUIRES_NO_DELAY #define io_recovery_delay(x) #else @@ -112,44 +56,28 @@ /* * Design * - * This is a generic 5380 driver. To use it on a different platform, + * This is a generic 5380 driver. To use it on a different platform, * one simply writes appropriate system specific macros (ie, data - * transfer - some PC's will use the I/O bus, 68K's must use + * transfer - some PC's will use the I/O bus, 68K's must use * memory mapped) and drops this file in their 'C' wrapper. * - * (Note from hch: unfortunately it was not enough for the different - * m68k folks and instead of improving this driver they copied it - * and hacked it up for their needs. As a consequence they lost - * most updates to this driver. Maybe someone will fix all these - * drivers to use a common core one day..) - * - * As far as command queueing, two queues are maintained for + * As far as command queueing, two queues are maintained for * each 5380 in the system - commands that haven't been issued yet, - * and commands that are currently executing. This means that an - * unlimited number of commands may be queued, letting - * more commands propagate from the higher driver levels giving higher - * throughput. Note that both I_T_L and I_T_L_Q nexuses are supported, - * allowing multiple commands to propagate all the way to a SCSI-II device + * and commands that are currently executing. This means that an + * unlimited number of commands may be queued, letting + * more commands propagate from the higher driver levels giving higher + * throughput. Note that both I_T_L and I_T_L_Q nexuses are supported, + * allowing multiple commands to propagate all the way to a SCSI-II device * while a command is already executing. * * - * Issues specific to the NCR5380 : + * Issues specific to the NCR5380 : * - * When used in a PIO or pseudo-dma mode, the NCR5380 is a braindead - * piece of hardware that requires you to sit in a loop polling for - * the REQ signal as long as you are connected. Some devices are - * brain dead (ie, many TEXEL CD ROM drives) and won't disconnect - * while doing long seek operations. - * - * The workaround for this is to keep track of devices that have - * disconnected. If the device hasn't disconnected, for commands that - * should disconnect, we do something like - * - * while (!REQ is asserted) { sleep for N usecs; poll for M usecs } - * - * Some tweaking of N and M needs to be done. 
An algorithm based - on "time to data" would give the best results as long as short time - to datas (ie, on the same track) were considered, however these + * When used in a PIO or pseudo-dma mode, the NCR5380 is a braindead + * piece of hardware that requires you to sit in a loop polling for + * the REQ signal as long as you are connected. Some devices are + * brain dead (ie, many TEXEL CD ROM drives) and won't disconnect + * while doing long seek operations. [...] These * broken devices are the exception rather than the rule and I'd rather * spend my time optimizing for the normal case. * @@ -159,23 +87,23 @@ * which is started from a workqueue for each NCR5380 host in the * system. It attempts to establish I_T_L or I_T_L_Q nexuses by * removing the commands from the issue queue and calling - * NCR5380_select() if a nexus is not established. + * NCR5380_select() if a nexus is not established. * * Once a nexus is established, the NCR5380_information_transfer() * phase goes through the various phases as instructed by the target. * If the target goes into MSG IN and sends a DISCONNECT message, * the command structure is placed into the per instance disconnected - * queue, and NCR5380_main tries to find more work. If the target is + * queue, and NCR5380_main tries to find more work. If the target is * idle for too long, the system will try to sleep. * * If a command has disconnected, eventually an interrupt will trigger, * calling NCR5380_intr() which will in turn call NCR5380_reselect * to reestablish a nexus. This will run main if necessary. * - * On command termination, the done function will be called as + * On command termination, the done function will be called as * appropriate. * - * SCSI pointers are maintained in the SCp field of SCSI command + * SCSI pointers are maintained in the SCp field of SCSI command * structures, being initialized after the command is connected * in NCR5380_select, and set as appropriate in NCR5380_information_transfer. * Note that in violation of the standard, an implicit SAVE POINTERS operation @@ -185,73 +113,48 @@ /* * Using this file : * This file is a skeleton Linux SCSI driver for the NCR 5380 series - of chips. To use it, you write an architecture specific functions + of chips. To use it, you write architecture specific functions * and macros and include this file in your driver. * - * These macros control options : - * AUTOPROBE_IRQ - if defined, the NCR5380_probe_irq() function will be - * defined. - * + * These macros control options : + * AUTOPROBE_IRQ - if defined, the NCR5380_probe_irq() function will be + * defined. + * * AUTOSENSE - if defined, REQUEST SENSE will be performed automatically - * for commands that return with a CHECK CONDITION status. + * for commands that return with a CHECK CONDITION status. * * DIFFERENTIAL - if defined, NCR53c81 chips will use external differential - * transceivers. + * transceivers. * * DONT_USE_INTR - if defined, never use interrupts, even if we probe or - * override-configure an IRQ. - * - * LIMIT_TRANSFERSIZE - if defined, limit the pseudo-dma transfers to 512 - * bytes at a time. Since interrupts are disabled by default during - * these transfers, we might need this to give reasonable interrupt - * service time if the transfer size gets too large. - * - * LINKED - if defined, linked commands are supported. + * override-configure an IRQ. * * PSEUDO_DMA - if defined, PSEUDO DMA is used during the data transfer phases. * * REAL_DMA - if defined, REAL DMA is used during the data transfer phases.
* * REAL_DMA_POLL - if defined, REAL DMA is used but the driver doesn't - * rely on phase mismatch and EOP interrupts to determine end - * of phase. - * - * UNSAFE - leave interrupts enabled during pseudo-DMA transfers. You - * only really want to use this if you're having a problem with - * dropped characters during high speed communications, and even - * then, you're going to be better off twiddling with transfersize - * in the high level code. - * - * Defaults for these will be provided although the user may want to adjust - * these to allocate CPU resources to the SCSI driver or "real" code. - * - * USLEEP_SLEEP - amount of time, in jiffies, to sleep - * - * USLEEP_POLL - amount of time, in jiffies, to poll + * rely on phase mismatch and EOP interrupts to determine end + * of phase. * * These macros MUST be defined : - * NCR5380_local_declare() - declare any local variables needed for your - * transfer routines. * - * NCR5380_setup(instance) - initialize any local variables needed from a given - * instance of the host adapter for NCR5380_{read,write,pread,pwrite} - * * NCR5380_read(register) - read from the specified register * - * NCR5380_write(register, value) - write to the specific register + * NCR5380_write(register, value) - write to the specific register * - * NCR5380_implementation_fields - additional fields needed for this - * specific implementation of the NCR5380 + * NCR5380_implementation_fields - additional fields needed for this + * specific implementation of the NCR5380 * * Either real DMA *or* pseudo DMA may be implemented - * REAL functions : + * REAL functions : * NCR5380_REAL_DMA should be defined if real DMA is to be used. - * Note that the DMA setup functions should return the number of bytes - * that they were able to program the controller for. + * Note that the DMA setup functions should return the number of bytes + * that they were able to program the controller for. * - * Also note that generic i386/PC versions of these macros are - * available as NCR5380_i386_dma_write_setup, - * NCR5380_i386_dma_read_setup, and NCR5380_i386_dma_residual. + * Also note that generic i386/PC versions of these macros are + * available as NCR5380_i386_dma_write_setup, + * NCR5380_i386_dma_read_setup, and NCR5380_i386_dma_residual. * * NCR5380_dma_write_setup(instance, src, count) - initialize * NCR5380_dma_read_setup(instance, dst, count) - initialize @@ -262,25 +165,25 @@ * NCR5380_pread(instance, dst, count); * * The generic driver is initialized by calling NCR5380_init(instance), - * after setting the appropriate host specific fields and ID. If the + * after setting the appropriate host specific fields and ID. If the * driver wishes to autoprobe for an IRQ line, the NCR5380_probe_irq(instance, * possible) function may be used. */ -static int do_abort(struct Scsi_Host *host); -static void do_reset(struct Scsi_Host *host); +static int do_abort(struct Scsi_Host *); +static void do_reset(struct Scsi_Host *); -/* - * initialize_SCp - init the scsi pointer field - * @cmd: command block to set up +/** + * initialize_SCp - init the scsi pointer field + * @cmd: command block to set up * - * Set up the internal fields in the SCSI command. + * Set up the internal fields in the SCSI command. */ static inline void initialize_SCp(struct scsi_cmnd *cmd) { - /* - * Initialize the Scsi Pointer field so that all of the commands in the + /* + * Initialize the Scsi Pointer field so that all of the commands in the * various queues are valid. 
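 * (In this driver the Scsi Pointer fields double as scatter/gather
 * bookkeeping: SCp.ptr and SCp.this_residual track the position in and
 * bytes remaining of the current segment, while SCp.buffer and
 * SCp.buffers_residual walk the segments still to come.)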
*/ @@ -295,120 +198,123 @@ static inline void initialize_SCp(struct scsi_cmnd *cmd) cmd->SCp.ptr = NULL; cmd->SCp.this_residual = 0; } + + cmd->SCp.Status = 0; + cmd->SCp.Message = 0; } /** - * NCR5380_poll_politely - wait for NCR5380 status bits - * @instance: controller to poll - * @reg: 5380 register to poll - * @bit: Bitmask to check - * @val: Value required to exit - * - * Polls the NCR5380 in a reasonably efficient manner waiting for - * an event to occur, after a short quick poll we begin giving the - * CPU back in non IRQ contexts - * - * Returns the value of the register or a negative error code. + * NCR5380_poll_politely2 - wait for two chip register values + * @instance: controller to poll + * @reg1: 5380 register to poll + * @bit1: Bitmask to check + * @val1: Expected value + * @reg2: Second 5380 register to poll + * @bit2: Second bitmask to check + * @val2: Second expected value + * @wait: Time-out in jiffies + * + * Polls the chip in a reasonably efficient manner waiting for an + * event to occur. After a short quick poll we begin to yield the CPU + * (if possible). In irq contexts the time-out is arbitrarily limited. + * Callers may hold locks as long as they are held in irq mode. + * + * Returns 0 if either or both event(s) occurred otherwise -ETIMEDOUT. */ - -static int NCR5380_poll_politely(struct Scsi_Host *instance, int reg, int bit, int val, int t) + +static int NCR5380_poll_politely2(struct Scsi_Host *instance, + int reg1, int bit1, int val1, + int reg2, int bit2, int val2, int wait) { - NCR5380_local_declare(); - int n = 500; /* At about 8uS a cycle for the cpu access */ - unsigned long end = jiffies + t; - int r; - - NCR5380_setup(instance); - - while( n-- > 0) - { - r = NCR5380_read(reg); - if((r & bit) == val) + struct NCR5380_hostdata *hostdata = shost_priv(instance); + unsigned long deadline = jiffies + wait; + unsigned long n; + + /* Busy-wait for up to 10 ms */ + n = min(10000U, jiffies_to_usecs(wait)); + n *= hostdata->accesses_per_ms; + n /= 2000; + do { + if ((NCR5380_read(reg1) & bit1) == val1) + return 0; + if ((NCR5380_read(reg2) & bit2) == val2) return 0; cpu_relax(); - } - - /* t time yet ? 
*/ - while(time_before(jiffies, end)) - { - r = NCR5380_read(reg); - if((r & bit) == val) + } while (n--); + + if (irqs_disabled() || in_interrupt()) + return -ETIMEDOUT; + + /* Repeatedly sleep for 1 ms until deadline */ + while (time_is_after_jiffies(deadline)) { + schedule_timeout_uninterruptible(1); + if ((NCR5380_read(reg1) & bit1) == val1) + return 0; + if ((NCR5380_read(reg2) & bit2) == val2) return 0; - if(!in_interrupt()) - cond_resched(); - else - cpu_relax(); } + return -ETIMEDOUT; } -static struct { - unsigned char value; - const char *name; -} phases[] __maybe_unused = { - {PHASE_DATAOUT, "DATAOUT"}, - {PHASE_DATAIN, "DATAIN"}, - {PHASE_CMDOUT, "CMDOUT"}, - {PHASE_STATIN, "STATIN"}, - {PHASE_MSGOUT, "MSGOUT"}, - {PHASE_MSGIN, "MSGIN"}, - {PHASE_UNKNOWN, "UNKNOWN"} -}; +static inline int NCR5380_poll_politely(struct Scsi_Host *instance, + int reg, int bit, int val, int wait) +{ + return NCR5380_poll_politely2(instance, reg, bit, val, + reg, bit, val, wait); +} #if NDEBUG static struct { unsigned char mask; const char *name; -} signals[] = { - {SR_DBP, "PARITY"}, - {SR_RST, "RST"}, - {SR_BSY, "BSY"}, - {SR_REQ, "REQ"}, - {SR_MSG, "MSG"}, - {SR_CD, "CD"}, - {SR_IO, "IO"}, - {SR_SEL, "SEL"}, +} signals[] = { + {SR_DBP, "PARITY"}, + {SR_RST, "RST"}, + {SR_BSY, "BSY"}, + {SR_REQ, "REQ"}, + {SR_MSG, "MSG"}, + {SR_CD, "CD"}, + {SR_IO, "IO"}, + {SR_SEL, "SEL"}, {0, NULL} -}, +}, basrs[] = { - {BASR_ATN, "ATN"}, - {BASR_ACK, "ACK"}, + {BASR_ATN, "ATN"}, + {BASR_ACK, "ACK"}, {0, NULL} -}, -icrs[] = { - {ICR_ASSERT_RST, "ASSERT RST"}, - {ICR_ASSERT_ACK, "ASSERT ACK"}, - {ICR_ASSERT_BSY, "ASSERT BSY"}, - {ICR_ASSERT_SEL, "ASSERT SEL"}, - {ICR_ASSERT_ATN, "ASSERT ATN"}, - {ICR_ASSERT_DATA, "ASSERT DATA"}, +}, +icrs[] = { + {ICR_ASSERT_RST, "ASSERT RST"}, + {ICR_ASSERT_ACK, "ASSERT ACK"}, + {ICR_ASSERT_BSY, "ASSERT BSY"}, + {ICR_ASSERT_SEL, "ASSERT SEL"}, + {ICR_ASSERT_ATN, "ASSERT ATN"}, + {ICR_ASSERT_DATA, "ASSERT DATA"}, {0, NULL} -}, -mrs[] = { - {MR_BLOCK_DMA_MODE, "MODE BLOCK DMA"}, - {MR_TARGET, "MODE TARGET"}, - {MR_ENABLE_PAR_CHECK, "MODE PARITY CHECK"}, - {MR_ENABLE_PAR_INTR, "MODE PARITY INTR"}, - {MR_MONITOR_BSY, "MODE MONITOR BSY"}, - {MR_DMA_MODE, "MODE DMA"}, - {MR_ARBITRATE, "MODE ARBITRATION"}, +}, +mrs[] = { + {MR_BLOCK_DMA_MODE, "MODE BLOCK DMA"}, + {MR_TARGET, "MODE TARGET"}, + {MR_ENABLE_PAR_CHECK, "MODE PARITY CHECK"}, + {MR_ENABLE_PAR_INTR, "MODE PARITY INTR"}, + {MR_ENABLE_EOP_INTR, "MODE EOP INTR"}, + {MR_MONITOR_BSY, "MODE MONITOR BSY"}, + {MR_DMA_MODE, "MODE DMA"}, + {MR_ARBITRATE, "MODE ARBITRATION"}, {0, NULL} }; /** - * NCR5380_print - print scsi bus signals - * @instance: adapter state to dump - * - * Print the SCSI bus signals for debugging purposes + * NCR5380_print - print scsi bus signals + * @instance: adapter state to dump * - * Locks: caller holds hostdata lock (not essential) + * Print the SCSI bus signals for debugging purposes */ static void NCR5380_print(struct Scsi_Host *instance) { - NCR5380_local_declare(); unsigned char status, data, basr, mr, icr, i; - NCR5380_setup(instance); data = NCR5380_read(CURRENT_SCSI_DATA_REG); status = NCR5380_read(STATUS_REG); @@ -435,117 +341,56 @@ static void NCR5380_print(struct Scsi_Host *instance) printk("\n"); } +static struct { + unsigned char value; + const char *name; +} phases[] = { + {PHASE_DATAOUT, "DATAOUT"}, + {PHASE_DATAIN, "DATAIN"}, + {PHASE_CMDOUT, "CMDOUT"}, + {PHASE_STATIN, "STATIN"}, + {PHASE_MSGOUT, "MSGOUT"}, + {PHASE_MSGIN, "MSGIN"}, + {PHASE_UNKNOWN, "UNKNOWN"} +}; -/* - * NCR5380_print_phase 
- show SCSI phase - * @instance: adapter to dump - * - * Print the current SCSI phase for debugging purposes +/** + * NCR5380_print_phase - show SCSI phase + * @instance: adapter to dump * - * Locks: none + * Print the current SCSI phase for debugging purposes */ static void NCR5380_print_phase(struct Scsi_Host *instance) { - NCR5380_local_declare(); unsigned char status; int i; - NCR5380_setup(instance); status = NCR5380_read(STATUS_REG); if (!(status & SR_REQ)) - printk("scsi%d : REQ not asserted, phase unknown.\n", instance->host_no); + shost_printk(KERN_DEBUG, instance, "REQ not asserted, phase unknown.\n"); else { - for (i = 0; (phases[i].value != PHASE_UNKNOWN) && (phases[i].value != (status & PHASE_MASK)); ++i); - printk("scsi%d : phase %s\n", instance->host_no, phases[i].name); + for (i = 0; (phases[i].value != PHASE_UNKNOWN) && + (phases[i].value != (status & PHASE_MASK)); ++i) + ; + shost_printk(KERN_DEBUG, instance, "phase %s\n", phases[i].name); } } #endif -/* - * These need tweaking, and would probably work best as per-device - * flags initialized differently for disk, tape, cd, etc devices. - * People with broken devices are free to experiment as to what gives - * the best results for them. - * - * USLEEP_SLEEP should be a minimum seek time. - * - * USLEEP_POLL should be a maximum rotational latency. - */ -#ifndef USLEEP_SLEEP -/* 20 ms (reasonable hard disk speed) */ -#define USLEEP_SLEEP msecs_to_jiffies(20) -#endif -/* 300 RPM (floppy speed) */ -#ifndef USLEEP_POLL -#define USLEEP_POLL msecs_to_jiffies(200) -#endif -#ifndef USLEEP_WAITLONG -/* RvC: (reasonable time to wait on select error) */ -#define USLEEP_WAITLONG USLEEP_SLEEP -#endif -/* - * Function : int should_disconnect (unsigned char cmd) - * - * Purpose : decide whether a command would normally disconnect or - * not, since if it won't disconnect we should go to sleep. - * - * Input : cmd - opcode of SCSI command - * - * Returns : DISCONNECT_LONG if we should disconnect for a really long - * time (ie always, sleep, look for REQ active, sleep), - * DISCONNECT_TIME_TO_DATA if we would only disconnect for a normal - * time-to-data delay, DISCONNECT_NONE if this command would return - * immediately. - * - * Future sleep algorithms based on time to data can exploit - * something like this so they can differentiate between "normal" - * (ie, read, write, seek) and unusual commands (ie, * format). - * - * Note : We don't deal with commands that handle an immediate disconnect, - * - */ - -static int should_disconnect(unsigned char cmd) -{ - switch (cmd) { - case READ_6: - case WRITE_6: - case SEEK_6: - case READ_10: - case WRITE_10: - case SEEK_10: - return DISCONNECT_TIME_TO_DATA; - case FORMAT_UNIT: - case SEARCH_HIGH: - case SEARCH_LOW: - case SEARCH_EQUAL: - return DISCONNECT_LONG; - default: - return DISCONNECT_NONE; - } -} - -static void NCR5380_set_timer(struct NCR5380_hostdata *hostdata, unsigned long timeout) -{ - hostdata->time_expires = jiffies + timeout; - schedule_delayed_work(&hostdata->coroutine, timeout); -} - - -static int probe_irq __initdata = 0; +static int probe_irq __initdata; /** - * probe_intr - helper for IRQ autoprobe - * @irq: interrupt number - * @dev_id: unused - * @regs: unused + * probe_intr - helper for IRQ autoprobe + * @irq: interrupt number + * @dev_id: unused + * @regs: unused * - * Set a flag to indicate the IRQ in question was received. This is - * used by the IRQ probe code. + * Set a flag to indicate the IRQ in question was received. This is + * used by the IRQ probe code. 
*/ - + static irqreturn_t __init probe_intr(int irq, void *dev_id) { probe_irq = irq; @@ -553,24 +398,20 @@ static irqreturn_t __init probe_intr(int irq, void *dev_id) } /** - * NCR5380_probe_irq - find the IRQ of an NCR5380 - * @instance: NCR5380 controller - * @possible: bitmask of ISA IRQ lines + * NCR5380_probe_irq - find the IRQ of an NCR5380 + * @instance: NCR5380 controller + * @possible: bitmask of ISA IRQ lines * - * Autoprobe for the IRQ line used by the NCR5380 by triggering an IRQ - * and then looking to see what interrupt actually turned up. - * - * Locks: none, irqs must be enabled on entry + * Autoprobe for the IRQ line used by the NCR5380 by triggering an IRQ + * and then looking to see what interrupt actually turned up. */ static int __init __maybe_unused NCR5380_probe_irq(struct Scsi_Host *instance, int possible) { - NCR5380_local_declare(); - struct NCR5380_hostdata *hostdata = (struct NCR5380_hostdata *) instance->hostdata; + struct NCR5380_hostdata *hostdata = shost_priv(instance); unsigned long timeout; int trying_irqs, i, mask; - NCR5380_setup(instance); for (trying_irqs = 0, i = 1, mask = 2; i < 16; ++i, mask <<= 1) if ((mask & possible) && (request_irq(i, &probe_intr, 0, "NCR-probe", NULL) == 0)) @@ -581,7 +422,7 @@ static int __init __maybe_unused NCR5380_probe_irq(struct Scsi_Host *instance, /* * An interrupt is triggered whenever BSY = false, SEL = true - * and a bit set in the SELECT_ENABLE_REG is asserted on the + * and a bit set in the SELECT_ENABLE_REG is asserted on the * SCSI bus. * * Note that the bus is only driven when the phase control signals @@ -596,7 +437,7 @@ static int __init __maybe_unused NCR5380_probe_irq(struct Scsi_Host *instance, while (probe_irq == NO_IRQ && time_before(jiffies, timeout)) schedule_timeout_uninterruptible(1); - + NCR5380_write(SELECT_ENABLE_REG, 0); NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); @@ -608,12 +449,10 @@ } /** - * NCR58380_info - report driver and host information - * @instance: relevant scsi host instance - * - * For use as the host template info() handler. - * - * Locks: none + * NCR5380_info - report driver and host information + * @instance: relevant scsi host instance + * + * For use as the host template info() handler. */ static const char *NCR5380_info(struct Scsi_Host *instance) @@ -633,20 +472,14 @@ static void prepare_info(struct Scsi_Host *instance) "can_queue %d, cmd_per_lun %d, " "sg_tablesize %d, this_id %d, " "flags { %s%s%s}, " -#if defined(USLEEP_POLL) && defined(USLEEP_WAITLONG) - "USLEEP_POLL %lu, USLEEP_WAITLONG %lu, " -#endif "options { %s} ", instance->hostt->name, instance->io_port, instance->n_io_port, instance->base, instance->irq, instance->can_queue, instance->cmd_per_lun, instance->sg_tablesize, instance->this_id, - hostdata->flags & FLAG_NCR53C400 ? "NCR53C400 " : "", - hostdata->flags & FLAG_DTC3181E ? "DTC3181E " : "", + hostdata->flags & FLAG_NO_DMA_FIXUP ? "NO_DMA_FIXUP " : "", hostdata->flags & FLAG_NO_PSEUDO_DMA ? "NO_PSEUDO_DMA " : "", -#if defined(USLEEP_POLL) && defined(USLEEP_WAITLONG) - USLEEP_POLL, USLEEP_WAITLONG, -#endif + hostdata->flags & FLAG_TOSHIBA_DELAY ?
"TOSHIBA_DELAY " : "", #ifdef AUTOPROBE_IRQ "AUTOPROBE_IRQ " #endif @@ -665,46 +498,10 @@ static void prepare_info(struct Scsi_Host *instance) #ifdef PSEUDO_DMA "PSEUDO_DMA " #endif -#ifdef UNSAFE - "UNSAFE " -#endif -#ifdef NCR53C400 - "NCR53C400 " -#endif ""); } -/** - * NCR5380_print_status - dump controller info - * @instance: controller to dump - * - * Print commands in the various queues, called from NCR5380_abort - * and NCR5380_debug to aid debugging. - * - * Locks: called functions disable irqs - */ - -static void NCR5380_print_status(struct Scsi_Host *instance) -{ - NCR5380_dprint(NDEBUG_ANY, instance); - NCR5380_dprint_phase(NDEBUG_ANY, instance); -} - #ifdef PSEUDO_DMA -/******************************************/ -/* - * /proc/scsi/[dtc pas16 t128 generic]/[0-ASC_NUM_BOARD_SUPPORTED] - * - * *buffer: I/O buffer - * **start: if inout == FALSE pointer into buffer where user read should start - * offset: current offset - * length: length of buffer - * hostno: Scsi_Host host_no - * inout: TRUE - user is writing; FALSE - user is reading - * - * Return the number of bytes read from or written - */ - static int __maybe_unused NCR5380_write_info(struct Scsi_Host *instance, char *buffer, int length) { @@ -714,104 +511,41 @@ static int __maybe_unused NCR5380_write_info(struct Scsi_Host *instance, hostdata->spin_max_w = 0; return 0; } -#endif - -static -void lprint_Scsi_Cmnd(struct scsi_cmnd *cmd, struct seq_file *m); -static -void lprint_command(unsigned char *cmd, struct seq_file *m); -static -void lprint_opcode(int opcode, struct seq_file *m); static int __maybe_unused NCR5380_show_info(struct seq_file *m, - struct Scsi_Host *instance) + struct Scsi_Host *instance) { - struct NCR5380_hostdata *hostdata; - struct scsi_cmnd *ptr; - - hostdata = (struct NCR5380_hostdata *) instance->hostdata; + struct NCR5380_hostdata *hostdata = shost_priv(instance); -#ifdef PSEUDO_DMA seq_printf(m, "Highwater I/O busy spin counts: write %d, read %d\n", hostdata->spin_max_w, hostdata->spin_max_r); -#endif - spin_lock_irq(instance->host_lock); - if (!hostdata->connected) - seq_printf(m, "scsi%d: no currently connected command\n", instance->host_no); - else - lprint_Scsi_Cmnd((struct scsi_cmnd *) hostdata->connected, m); - seq_printf(m, "scsi%d: issue_queue\n", instance->host_no); - for (ptr = (struct scsi_cmnd *) hostdata->issue_queue; ptr; ptr = (struct scsi_cmnd *) ptr->host_scribble) - lprint_Scsi_Cmnd(ptr, m); - - seq_printf(m, "scsi%d: disconnected_queue\n", instance->host_no); - for (ptr = (struct scsi_cmnd *) hostdata->disconnected_queue; ptr; ptr = (struct scsi_cmnd *) ptr->host_scribble) - lprint_Scsi_Cmnd(ptr, m); - spin_unlock_irq(instance->host_lock); return 0; } - -static void lprint_Scsi_Cmnd(struct scsi_cmnd *cmd, struct seq_file *m) -{ - seq_printf(m, "scsi%d : destination target %d, lun %llu\n", cmd->device->host->host_no, cmd->device->id, cmd->device->lun); - seq_puts(m, " command = "); - lprint_command(cmd->cmnd, m); -} - -static void lprint_command(unsigned char *command, struct seq_file *m) -{ - int i, s; - lprint_opcode(command[0], m); - for (i = 1, s = COMMAND_SIZE(command[0]); i < s; ++i) - seq_printf(m, "%02x ", command[i]); - seq_putc(m, '\n'); -} - -static void lprint_opcode(int opcode, struct seq_file *m) -{ - seq_printf(m, "%2d (0x%02x)", opcode, opcode); -} - +#endif /** - * NCR5380_init - initialise an NCR5380 - * @instance: adapter to configure - * @flags: control flags + * NCR5380_init - initialise an NCR5380 + * @instance: adapter to configure + * @flags: control flags * 
- * Initializes *instance and corresponding 5380 chip, - * with flags OR'd into the initial flags value. + * Initializes *instance and corresponding 5380 chip, + * with flags OR'd into the initial flags value. * - * Notes : I assume that the host, hostno, and id bits have been - * set correctly. I don't care about the irq and other fields. + * Notes : I assume that the host, hostno, and id bits have been + * set correctly. I don't care about the irq and other fields. * - * Returns 0 for success - * - * Locks: interrupts must be enabled when we are called + * Returns 0 for success */ static int NCR5380_init(struct Scsi_Host *instance, int flags) { - NCR5380_local_declare(); - int i, pass; - unsigned long timeout; - struct NCR5380_hostdata *hostdata = (struct NCR5380_hostdata *) instance->hostdata; - - if(in_interrupt()) - printk(KERN_ERR "NCR5380_init called with interrupts off!\n"); - /* - * On NCR53C400 boards, NCR5380 registers are mapped 8 past - * the base address. - */ - -#ifdef NCR53C400 - if (flags & FLAG_NCR53C400) - instance->NCR5380_instance_name += NCR53C400_address_adjust; -#endif - - NCR5380_setup(instance); + struct NCR5380_hostdata *hostdata = shost_priv(instance); + int i; + unsigned long deadline; - hostdata->aborted = 0; + hostdata->host = instance; hostdata->id_mask = 1 << instance->this_id; + hostdata->id_higher_mask = 0; for (i = hostdata->id_mask; i <= 0x80; i <<= 1) if (i > hostdata->id_mask) hostdata->id_higher_mask |= i; @@ -820,21 +554,21 @@ static int NCR5380_init(struct Scsi_Host *instance, int flags) #ifdef REAL_DMA hostdata->dmalen = 0; #endif - hostdata->targets_present = 0; + spin_lock_init(&hostdata->lock); hostdata->connected = NULL; - hostdata->issue_queue = NULL; - hostdata->disconnected_queue = NULL; - - INIT_DELAYED_WORK(&hostdata->coroutine, NCR5380_main); - - /* The CHECK code seems to break the 53C400. Will check it later maybe */ - if (flags & FLAG_NCR53C400) - hostdata->flags = FLAG_HAS_LAST_BYTE_SENT | flags; - else - hostdata->flags = FLAG_CHECK_LAST_BYTE_SENT | flags; + hostdata->sensing = NULL; + INIT_LIST_HEAD(&hostdata->autosense); + INIT_LIST_HEAD(&hostdata->unissued); + INIT_LIST_HEAD(&hostdata->disconnected); - hostdata->host = instance; - hostdata->time_expires = 0; + hostdata->flags = flags; + + INIT_WORK(&hostdata->main_task, NCR5380_main); + hostdata->work_q = alloc_workqueue("ncr5380_%d", + WQ_UNBOUND | WQ_MEM_RECLAIM, + 1, instance->host_no); + if (!hostdata->work_q) + return -ENOMEM; prepare_info(instance); @@ -843,43 +577,69 @@ static int NCR5380_init(struct Scsi_Host *instance, int flags) NCR5380_write(TARGET_COMMAND_REG, 0); NCR5380_write(SELECT_ENABLE_REG, 0); -#ifdef NCR53C400 - if (hostdata->flags & FLAG_NCR53C400) { - NCR5380_write(C400_CONTROL_STATUS_REG, CSR_BASE); - } -#endif + /* Calibrate register polling loop */ + i = 0; + deadline = jiffies + 1; + do { + cpu_relax(); + } while (time_is_after_jiffies(deadline)); + deadline += msecs_to_jiffies(256); + do { + NCR5380_read(STATUS_REG); + ++i; + cpu_relax(); + } while (time_is_after_jiffies(deadline)); + hostdata->accesses_per_ms = i / 256; - /* - * Detect and correct bus wedge problems. - * - * If the system crashed, it may have crashed in a state - * where a SCSI command was still executing, and the - * SCSI bus is not in a BUS FREE STATE. 
- * - * If this is the case, we'll try to abort the currently - * established nexus which we know nothing about, and that - * failing, do a hard reset of the SCSI bus - */ + return 0; +} + +/** + * NCR5380_maybe_reset_bus - Detect and correct bus wedge problems. + * @instance: adapter to check + * + * If the system crashed, it may have crashed with a connected target and + * the SCSI bus busy. Check for BUS FREE phase. If not, try to abort the + * currently established nexus, which we know nothing about. Failing that + * do a bus reset. + * + * Note that a bus reset will cause the chip to assert IRQ. + * + * Returns 0 if successful, otherwise -ENXIO. + */ + +static int NCR5380_maybe_reset_bus(struct Scsi_Host *instance) +{ + struct NCR5380_hostdata *hostdata = shost_priv(instance); + int pass; for (pass = 1; (NCR5380_read(STATUS_REG) & SR_BSY) && pass <= 6; ++pass) { switch (pass) { case 1: case 3: case 5: - printk(KERN_INFO "scsi%d: SCSI bus busy, waiting up to five seconds\n", instance->host_no); - timeout = jiffies + 5 * HZ; - NCR5380_poll_politely(instance, STATUS_REG, SR_BSY, 0, 5*HZ); + shost_printk(KERN_ERR, instance, "SCSI bus busy, waiting up to five seconds\n"); + NCR5380_poll_politely(instance, + STATUS_REG, SR_BSY, 0, 5 * HZ); break; case 2: - printk(KERN_WARNING "scsi%d: bus busy, attempting abort\n", instance->host_no); + shost_printk(KERN_ERR, instance, "bus busy, attempting abort\n"); do_abort(instance); break; case 4: - printk(KERN_WARNING "scsi%d: bus busy, attempting reset\n", instance->host_no); + shost_printk(KERN_ERR, instance, "bus busy, attempting reset\n"); do_reset(instance); + /* Wait after a reset; the SCSI standard calls for + * 250ms, we wait 500ms to be on the safe side. + * But some Toshiba CD-ROMs need ten times that. + */ + if (hostdata->flags & FLAG_TOSHIBA_DELAY) + msleep(2500); + else + msleep(500); break; case 6: - printk(KERN_ERR "scsi%d: bus locked solid or invalid override\n", instance->host_no); + shost_printk(KERN_ERR, instance, "bus locked solid\n"); return -ENXIO; } } @@ -887,450 +647,513 @@ static int NCR5380_init(struct Scsi_Host *instance, int flags) } /** - * NCR5380_exit - remove an NCR5380 - * @instance: adapter to remove + * NCR5380_exit - remove an NCR5380 + * @instance: adapter to remove + * + * Assumes that no more work can be queued (e.g. by NCR5380_intr). */ static void NCR5380_exit(struct Scsi_Host *instance) { - struct NCR5380_hostdata *hostdata = (struct NCR5380_hostdata *) instance->hostdata; + struct NCR5380_hostdata *hostdata = shost_priv(instance); - cancel_delayed_work_sync(&hostdata->coroutine); + cancel_work_sync(&hostdata->main_task); + destroy_workqueue(hostdata->work_q); } /** - * NCR5380_queue_command - queue a command - * @cmd: SCSI command - * @done: completion handler - * - * cmd is added to the per instance issue_queue, with minor - * twiddling done to the host specific fields of cmd. If the - * main coroutine is not running, it is restarted. 
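The queue rework below strings commands onto list_heads kept in a per-command private area instead of chaining host_scribble pointers. A minimal sketch of that container pattern, assuming the private data sits directly behind the scsi_cmnd as scsi_cmd_priv() arranges (the real NCR5380.h definitions may differ in detail):

	struct NCR5380_cmd {
		struct list_head list;
	};

	#define NCR5380_CMD_SIZE	(sizeof(struct NCR5380_cmd))

	/* Inverse of scsi_cmd_priv(), which returns the memory just
	 * past the struct scsi_cmnd itself. */
	static inline struct scsi_cmnd *NCR5380_to_scmd(struct NCR5380_cmd *ncmd)
	{
		return ((struct scsi_cmnd *)ncmd) - 1;
	}

A host template then only needs to declare cmd_size = NCR5380_CMD_SIZE and the midlayer allocates the extra space along with every command.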
+ * complete_cmd - finish processing a command and return it to the SCSI ML + * @instance: the host instance + * @cmd: command to complete + */ + +static void complete_cmd(struct Scsi_Host *instance, + struct scsi_cmnd *cmd) +{ + struct NCR5380_hostdata *hostdata = shost_priv(instance); + + dsprintk(NDEBUG_QUEUES, instance, "complete_cmd: cmd %p\n", cmd); + + if (hostdata->sensing == cmd) { + /* Autosense processing ends here */ + if ((cmd->result & 0xff) != SAM_STAT_GOOD) { + scsi_eh_restore_cmnd(cmd, &hostdata->ses); + set_host_byte(cmd, DID_ERROR); + } else + scsi_eh_restore_cmnd(cmd, &hostdata->ses); + hostdata->sensing = NULL; + } + + hostdata->busy[scmd_id(cmd)] &= ~(1 << cmd->device->lun); + + cmd->scsi_done(cmd); +} + +/** + * NCR5380_queue_command - queue a command + * @instance: the relevant SCSI adapter + * @cmd: SCSI command * - * Locks: host lock taken by caller + * cmd is added to the per-instance issue queue, with minor + * twiddling done to the host specific fields of cmd. If the + * main coroutine is not running, it is restarted. */ -static int NCR5380_queue_command_lck(struct scsi_cmnd *cmd, void (*done) (struct scsi_cmnd *)) +static int NCR5380_queue_command(struct Scsi_Host *instance, + struct scsi_cmnd *cmd) { - struct Scsi_Host *instance = cmd->device->host; - struct NCR5380_hostdata *hostdata = (struct NCR5380_hostdata *) instance->hostdata; - struct scsi_cmnd *tmp; + struct NCR5380_hostdata *hostdata = shost_priv(instance); + struct NCR5380_cmd *ncmd = scsi_cmd_priv(cmd); + unsigned long flags; #if (NDEBUG & NDEBUG_NO_WRITE) switch (cmd->cmnd[0]) { case WRITE_6: case WRITE_10: - printk("scsi%d : WRITE attempted with NO_WRITE debugging flag set\n", instance->host_no); + shost_printk(KERN_DEBUG, instance, "WRITE attempted with NDEBUG_NO_WRITE set\n"); cmd->result = (DID_ERROR << 16); - done(cmd); + cmd->scsi_done(cmd); return 0; } -#endif /* (NDEBUG & NDEBUG_NO_WRITE) */ - - /* - * We use the host_scribble field as a pointer to the next command - * in a queue - */ +#endif /* (NDEBUG & NDEBUG_NO_WRITE) */ - cmd->host_scribble = NULL; - cmd->scsi_done = done; cmd->result = 0; - /* - * Insert the cmd into the issue queue. Note that REQUEST SENSE + spin_lock_irqsave(&hostdata->lock, flags); + + /* + * Insert the cmd into the issue queue. Note that REQUEST SENSE * commands are added to the head of the queue since any command will - * clear the contingent allegiance condition that exists and the + * clear the contingent allegiance condition that exists and the * sense data is only guaranteed to be valid while the condition exists. */ - if (!(hostdata->issue_queue) || (cmd->cmnd[0] == REQUEST_SENSE)) { - LIST(cmd, hostdata->issue_queue); - cmd->host_scribble = (unsigned char *) hostdata->issue_queue; - hostdata->issue_queue = cmd; - } else { - for (tmp = (struct scsi_cmnd *) hostdata->issue_queue; tmp->host_scribble; tmp = (struct scsi_cmnd *) tmp->host_scribble); - LIST(cmd, tmp); - tmp->host_scribble = (unsigned char *) cmd; - } - dprintk(NDEBUG_QUEUES, "scsi%d : command added to %s of queue\n", instance->host_no, (cmd->cmnd[0] == REQUEST_SENSE) ? "head" : "tail"); + if (cmd->cmnd[0] == REQUEST_SENSE) + list_add(&ncmd->list, &hostdata->unissued); + else + list_add_tail(&ncmd->list, &hostdata->unissued); + + spin_unlock_irqrestore(&hostdata->lock, flags); + + dsprintk(NDEBUG_QUEUES, instance, "command %p added to %s of queue\n", + cmd, (cmd->cmnd[0] == REQUEST_SENSE) ? "head" : "tail"); - /* Run the coroutine if it isn't already running. 
*/ /* Kick off command processing */ - schedule_delayed_work(&hostdata->coroutine, 0); + queue_work(hostdata->work_q, &hostdata->main_task); return 0; } -static DEF_SCSI_QCMD(NCR5380_queue_command) +/** + * dequeue_next_cmd - dequeue a command for processing + * @instance: the scsi host instance + * + * Priority is given to commands on the autosense queue. These commands + * need autosense because of a CHECK CONDITION result. + * + * Returns a command pointer if a command is found for a target that is + * not already busy. Otherwise returns NULL. + */ + +static struct scsi_cmnd *dequeue_next_cmd(struct Scsi_Host *instance) +{ + struct NCR5380_hostdata *hostdata = shost_priv(instance); + struct NCR5380_cmd *ncmd; + struct scsi_cmnd *cmd; + + if (list_empty(&hostdata->autosense)) { + list_for_each_entry(ncmd, &hostdata->unissued, list) { + cmd = NCR5380_to_scmd(ncmd); + dsprintk(NDEBUG_QUEUES, instance, "dequeue: cmd=%p target=%d busy=0x%02x lun=%llu\n", + cmd, scmd_id(cmd), hostdata->busy[scmd_id(cmd)], cmd->device->lun); + + if (!(hostdata->busy[scmd_id(cmd)] & (1 << cmd->device->lun))) { + list_del(&ncmd->list); + dsprintk(NDEBUG_QUEUES, instance, + "dequeue: removed %p from issue queue\n", cmd); + return cmd; + } + } + } else { + /* Autosense processing begins here */ + ncmd = list_first_entry(&hostdata->autosense, + struct NCR5380_cmd, list); + list_del(&ncmd->list); + cmd = NCR5380_to_scmd(ncmd); + dsprintk(NDEBUG_QUEUES, instance, + "dequeue: removed %p from autosense queue\n", cmd); + scsi_eh_prep_cmnd(cmd, &hostdata->ses, NULL, 0, ~0); + hostdata->sensing = cmd; + return cmd; + } + return NULL; +} + +static void requeue_cmd(struct Scsi_Host *instance, struct scsi_cmnd *cmd) +{ + struct NCR5380_hostdata *hostdata = shost_priv(instance); + struct NCR5380_cmd *ncmd = scsi_cmd_priv(cmd); + + if (hostdata->sensing) { + scsi_eh_restore_cmnd(cmd, &hostdata->ses); + list_add(&ncmd->list, &hostdata->autosense); + hostdata->sensing = NULL; + } else + list_add(&ncmd->list, &hostdata->unissued); +} /** - * NCR5380_main - NCR state machines - * - * NCR5380_main is a coroutine that runs as long as more work can - * be done on the NCR5380 host adapters in a system. Both - * NCR5380_queue_command() and NCR5380_intr() will try to start it - * in case it is not running. - * - * Locks: called as its own thread with no locks held. Takes the - * host lock and called routines may take the isa dma lock. + * NCR5380_main - NCR state machines + * + * NCR5380_main is a coroutine that runs as long as more work can + * be done on the NCR5380 host adapters in a system. Both + * NCR5380_queue_command() and NCR5380_intr() will try to start it + * in case it is not running. */ static void NCR5380_main(struct work_struct *work) { struct NCR5380_hostdata *hostdata = - container_of(work, struct NCR5380_hostdata, coroutine.work); + container_of(work, struct NCR5380_hostdata, main_task); struct Scsi_Host *instance = hostdata->host; - struct scsi_cmnd *tmp, *prev; + struct scsi_cmnd *cmd; int done; - - spin_lock_irq(instance->host_lock); + do { - /* Lock held here */ done = 1; - if (!hostdata->connected && !hostdata->selecting) { - dprintk(NDEBUG_MAIN, "scsi%d : not connected\n", instance->host_no); - /* - * Search through the issue_queue for a command destined - * for a target that's not busy. 
- */ - for (tmp = (struct scsi_cmnd *) hostdata->issue_queue, prev = NULL; tmp; prev = tmp, tmp = (struct scsi_cmnd *) tmp->host_scribble) - { - if (prev != tmp) - dprintk(NDEBUG_LISTS, "MAIN tmp=%p target=%d busy=%d lun=%llu\n", tmp, tmp->device->id, hostdata->busy[tmp->device->id], tmp->device->lun); - /* When we find one, remove it from the issue queue. */ - if (!(hostdata->busy[tmp->device->id] & - (1 << (u8)(tmp->device->lun & 0xff)))) { - if (prev) { - REMOVE(prev, prev->host_scribble, tmp, tmp->host_scribble); - prev->host_scribble = tmp->host_scribble; - } else { - REMOVE(-1, hostdata->issue_queue, tmp, tmp->host_scribble); - hostdata->issue_queue = (struct scsi_cmnd *) tmp->host_scribble; - } - tmp->host_scribble = NULL; - /* - * Attempt to establish an I_T_L nexus here. - * On success, instance->hostdata->connected is set. - * On failure, we must add the command back to the - * issue queue so we can keep trying. - */ - dprintk(NDEBUG_MAIN|NDEBUG_QUEUES, "scsi%d : main() : command for target %d lun %llu removed from issue_queue\n", instance->host_no, tmp->device->id, tmp->device->lun); - - /* - * A successful selection is defined as one that - * leaves us with the command connected and - * in hostdata->connected, OR has terminated the - * command. - * - * With successful commands, we fall through - * and see if we can do an information transfer, - * with failures we will restart. - */ - hostdata->selecting = NULL; - /* RvC: have to preset this to indicate a new command is being performed */ + spin_lock_irq(&hostdata->lock); + while (!hostdata->connected && + (cmd = dequeue_next_cmd(instance))) { - /* - * REQUEST SENSE commands are issued without tagged - * queueing, even on SCSI-II devices because the - * contingent allegiance condition exists for the - * entire unit. - */ + dsprintk(NDEBUG_MAIN, instance, "main: dequeued %p\n", cmd); - if (!NCR5380_select(instance, tmp)) { - break; - } else { - LIST(tmp, hostdata->issue_queue); - tmp->host_scribble = (unsigned char *) hostdata->issue_queue; - hostdata->issue_queue = tmp; - done = 0; - dprintk(NDEBUG_MAIN|NDEBUG_QUEUES, "scsi%d : main(): select() failed, returned to issue_queue\n", instance->host_no); - } - /* lock held here still */ - } /* if target/lun is not busy */ - } /* for */ - /* exited locked */ - } /* if (!hostdata->connected) */ - if (hostdata->selecting) { - tmp = (struct scsi_cmnd *) hostdata->selecting; - /* Selection will drop and retake the lock */ - if (!NCR5380_select(instance, tmp)) { - /* Ok ?? */ + /* + * Attempt to establish an I_T_L nexus here. + * On success, instance->hostdata->connected is set. + * On failure, we must add the command back to the + * issue queue so we can keep trying. + */ + /* + * REQUEST SENSE commands are issued without tagged + * queueing, even on SCSI-II devices because the + * contingent allegiance condition exists for the + * entire unit. 
+ */ + + cmd = NCR5380_select(instance, cmd); + if (!cmd) { + dsprintk(NDEBUG_MAIN, instance, "main: select complete\n"); } else { - /* RvC: device failed, so we wait a long time - this is needed for Mustek scanners, that - do not respond to commands immediately - after a scan */ - printk(KERN_DEBUG "scsi%d: device %d did not respond in time\n", instance->host_no, tmp->device->id); - LIST(tmp, hostdata->issue_queue); - tmp->host_scribble = (unsigned char *) hostdata->issue_queue; - hostdata->issue_queue = tmp; - NCR5380_set_timer(hostdata, USLEEP_WAITLONG); + dsprintk(NDEBUG_MAIN | NDEBUG_QUEUES, instance, + "main: select failed, returning %p to queue\n", cmd); + requeue_cmd(instance, cmd); } - } /* if hostdata->selecting */ + } if (hostdata->connected #ifdef REAL_DMA && !hostdata->dmalen #endif - && (!hostdata->time_expires || time_before_eq(hostdata->time_expires, jiffies)) ) { - dprintk(NDEBUG_MAIN, "scsi%d : main() : performing information transfer\n", instance->host_no); + dsprintk(NDEBUG_MAIN, instance, "main: performing information transfer\n"); NCR5380_information_transfer(instance); - dprintk(NDEBUG_MAIN, "scsi%d : main() : done set false\n", instance->host_no); done = 0; - } else - break; + } + spin_unlock_irq(&hostdata->lock); + if (!done) + cond_resched(); } while (!done); - - spin_unlock_irq(instance->host_lock); } #ifndef DONT_USE_INTR /** - * NCR5380_intr - generic NCR5380 irq handler - * @irq: interrupt number - * @dev_id: device info - * - * Handle interrupts, reestablishing I_T_L or I_T_L_Q nexuses - * from the disconnected queue, and restarting NCR5380_main() - * as required. - * - * Locks: takes the needed instance locks + * NCR5380_intr - generic NCR5380 irq handler + * @irq: interrupt number + * @dev_id: device info + * + * Handle interrupts, reestablishing I_T_L or I_T_L_Q nexuses + * from the disconnected queue, and restarting NCR5380_main() + * as required. + * + * The chip can assert IRQ in any of six different conditions. The IRQ flag + * is then cleared by reading the Reset Parity/Interrupt Register (RPIR). + * Three of these six conditions are latched in the Bus and Status Register: + * - End of DMA (cleared by ending DMA Mode) + * - Parity error (cleared by reading RPIR) + * - Loss of BSY (cleared by reading RPIR) + * Two conditions have flag bits that are not latched: + * - Bus phase mismatch (non-maskable in DMA Mode, cleared by ending DMA Mode) + * - Bus reset (non-maskable) + * The remaining condition has no flag bit at all: + * - Selection/reselection + * + * Hence, establishing the cause(s) of any interrupt is partly guesswork. + * In "The DP8490 and DP5380 Comparison Guide", National Semiconductor + * claimed that "the design of the [DP8490] interrupt logic ensures + * interrupts will not be lost (they can be on the DP5380)." + * The L5380/53C80 datasheet from LOGIC Devices has more details. + * + * Checking for bus reset by reading RST is futile because of interrupt + * latency, but a bus reset will reset chip logic. Checking for parity error + * is unnecessary because that interrupt is never enabled. A Loss of BSY + * condition will clear DMA Mode. We can tell when this occurs because the + * Busy Monitor interrupt is enabled together with DMA Mode.
*/ -static irqreturn_t NCR5380_intr(int dummy, void *dev_id) +static irqreturn_t NCR5380_intr(int irq, void *dev_id) { - NCR5380_local_declare(); struct Scsi_Host *instance = dev_id; - struct NCR5380_hostdata *hostdata = (struct NCR5380_hostdata *) instance->hostdata; - int done; + struct NCR5380_hostdata *hostdata = shost_priv(instance); + int handled = 0; unsigned char basr; unsigned long flags; - dprintk(NDEBUG_INTR, "scsi : NCR5380 irq %d triggered\n", - instance->irq); + spin_lock_irqsave(&hostdata->lock, flags); + + basr = NCR5380_read(BUS_AND_STATUS_REG); + if (basr & BASR_IRQ) { + unsigned char mr = NCR5380_read(MODE_REG); + unsigned char sr = NCR5380_read(STATUS_REG); + + dsprintk(NDEBUG_INTR, instance, "IRQ %d, BASR 0x%02x, SR 0x%02x, MR 0x%02x\n", + irq, basr, sr, mr); - do { - done = 1; - spin_lock_irqsave(instance->host_lock, flags); - /* Look for pending interrupts */ - NCR5380_setup(instance); - basr = NCR5380_read(BUS_AND_STATUS_REG); - /* XXX dispatch to appropriate routine if found and done=0 */ - if (basr & BASR_IRQ) { - NCR5380_dprint(NDEBUG_INTR, instance); - if ((NCR5380_read(STATUS_REG) & (SR_SEL | SR_IO)) == (SR_SEL | SR_IO)) { - done = 0; - dprintk(NDEBUG_INTR, "scsi%d : SEL interrupt\n", instance->host_no); - NCR5380_reselect(instance); - (void) NCR5380_read(RESET_PARITY_INTERRUPT_REG); - } else if (basr & BASR_PARITY_ERROR) { - dprintk(NDEBUG_INTR, "scsi%d : PARITY interrupt\n", instance->host_no); - (void) NCR5380_read(RESET_PARITY_INTERRUPT_REG); - } else if ((NCR5380_read(STATUS_REG) & SR_RST) == SR_RST) { - dprintk(NDEBUG_INTR, "scsi%d : RESET interrupt\n", instance->host_no); - (void) NCR5380_read(RESET_PARITY_INTERRUPT_REG); - } else { #if defined(REAL_DMA) - /* - * We should only get PHASE MISMATCH and EOP interrupts - * if we have DMA enabled, so do a sanity check based on - * the current setting of the MODE register. - */ + if ((mr & MR_DMA_MODE) || (mr & MR_MONITOR_BSY)) { + /* Probably End of DMA, Phase Mismatch or Loss of BSY. + * We ack IRQ after clearing Mode Register. Workarounds + * for End of DMA errata need to happen in DMA Mode. + */ - if ((NCR5380_read(MODE_REG) & MR_DMA) && ((basr & BASR_END_DMA_TRANSFER) || !(basr & BASR_PHASE_MATCH))) { - int transferred; + dsprintk(NDEBUG_INTR, instance, "interrupt in DMA mode\n"); - if (!hostdata->connected) - panic("scsi%d : received end of DMA interrupt with no connected cmd\n", instance->host_no); + int transferred; - transferred = (hostdata->dmalen - NCR5380_dma_residual(instance)); - hostdata->connected->SCp.this_residual -= transferred; - hostdata->connected->SCp.ptr += transferred; - hostdata->dmalen = 0; + if (!hostdata->connected) + panic("scsi%d : DMA interrupt with no connected cmd\n", + instance->host_no); - (void) NCR5380_read(RESET_PARITY_INTERRUPT_REG); - - /* FIXME: we need to poll briefly then defer a workqueue task ! */ - NCR5380_poll_politely(instance, BUS_AND_STATUS_REG, BASR_ACK, 0, 2*HZ); + transferred = hostdata->dmalen - NCR5380_dma_residual(instance); + hostdata->connected->SCp.this_residual -= transferred; + hostdata->connected->SCp.ptr += transferred; + hostdata->dmalen = 0; - NCR5380_write(MODE_REG, MR_BASE); - NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); - } -#else - dprintk(NDEBUG_INTR, "scsi : unknown interrupt, BASR 0x%X, MR 0x%X, SR 0x%x\n", basr, NCR5380_read(MODE_REG), NCR5380_read(STATUS_REG)); - (void) NCR5380_read(RESET_PARITY_INTERRUPT_REG); -#endif + /* FIXME: we need to poll briefly then defer a workqueue task !
*/ + NCR5380_poll_politely(instance, BUS_AND_STATUS_REG, BASR_ACK, 0, 2 * HZ); + + NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); + NCR5380_write(MODE_REG, MR_BASE); + NCR5380_read(RESET_PARITY_INTERRUPT_REG); + } else +#endif /* REAL_DMA */ + if ((NCR5380_read(CURRENT_SCSI_DATA_REG) & hostdata->id_mask) && + (sr & (SR_SEL | SR_IO | SR_BSY | SR_RST)) == (SR_SEL | SR_IO)) { + /* Probably reselected */ + NCR5380_write(SELECT_ENABLE_REG, 0); + NCR5380_read(RESET_PARITY_INTERRUPT_REG); + + dsprintk(NDEBUG_INTR, instance, "interrupt with SEL and IO\n"); + + if (!hostdata->connected) { + NCR5380_reselect(instance); + queue_work(hostdata->work_q, &hostdata->main_task); } - } /* if BASR_IRQ */ - spin_unlock_irqrestore(instance->host_lock, flags); - if(!done) - schedule_delayed_work(&hostdata->coroutine, 0); - } while (!done); - return IRQ_HANDLED; + if (!hostdata->connected) + NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); + } else { + /* Probably Bus Reset */ + NCR5380_read(RESET_PARITY_INTERRUPT_REG); + + dsprintk(NDEBUG_INTR, instance, "unknown interrupt\n"); + } + handled = 1; + } else { + shost_printk(KERN_NOTICE, instance, "interrupt without IRQ bit\n"); + } + + spin_unlock_irqrestore(&hostdata->lock, flags); + + return IRQ_RETVAL(handled); } -#endif +#endif -/* +/* * Function : int NCR5380_select(struct Scsi_Host *instance, - * struct scsi_cmnd *cmd) + * struct scsi_cmnd *cmd) * * Purpose : establishes I_T_L or I_T_L_Q nexus for new or existing command, - * including ARBITRATION, SELECTION, and initial message out for - * IDENTIFY and queue messages. - * - * Inputs : instance - instantiation of the 5380 driver on which this - * target lives, cmd - SCSI command to execute. - * - * Returns : -1 if selection could not execute for some reason, - * 0 if selection succeeded or failed because the target - * did not respond. - * - * Side effects : - * If bus busy, arbitration failed, etc, NCR5380_select() will exit - * with registers as they should have been on entry - ie - * SELECT_ENABLE will be set appropriately, the NCR5380 - * will cease to drive any SCSI bus signals. - * - * If successful : I_T_L or I_T_L_Q nexus will be established, - * instance->connected will be set to cmd. - * SELECT interrupt will be disabled. - * - * If failed (no target) : cmd->scsi_done() will be called, and the - * cmd->result host byte set to DID_BAD_TARGET. - * - * Locks: caller holds hostdata lock in IRQ mode + * including ARBITRATION, SELECTION, and initial message out for + * IDENTIFY and queue messages. + * + * Inputs : instance - instantiation of the 5380 driver on which this + * target lives, cmd - SCSI command to execute. + * + * Returns cmd if selection failed but should be retried, + * NULL if selection failed and should not be retried, or + * NULL if selection succeeded (hostdata->connected == cmd). + * + * Side effects : + * If bus busy, arbitration failed, etc, NCR5380_select() will exit + * with registers as they should have been on entry - ie + * SELECT_ENABLE will be set appropriately, the NCR5380 + * will cease to drive any SCSI bus signals. + * + * If successful : I_T_L or I_T_L_Q nexus will be established, + * instance->connected will be set to cmd. + * SELECT interrupt will be disabled. + * + * If failed (no target) : cmd->scsi_done() will be called, and the + * cmd->result host byte set to DID_BAD_TARGET.
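When the interrupt handler above reestablishes a nexus it restarts the main loop with queue_work(hostdata->work_q, &hostdata->main_task), replacing the old schedule_delayed_work() coroutine. Neither field is initialized in this hunk; a sketch of what the init path presumably does (the workqueue name and flags here are guesses, and NCR5380_main() must then take a work_struct and recover its hostdata via container_of()):

        /* In NCR5380_init() or the board probe path - illustrative only. */
        INIT_WORK(&hostdata->main_task, NCR5380_main);
        hostdata->work_q = alloc_workqueue("ncr5380_%d",
                                           WQ_UNBOUND | WQ_MEM_RECLAIM,
                                           1, instance->host_no);
        if (!hostdata->work_q)
                return -ENOMEM;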
*/ - -static int NCR5380_select(struct Scsi_Host *instance, struct scsi_cmnd *cmd) + +static struct scsi_cmnd *NCR5380_select(struct Scsi_Host *instance, + struct scsi_cmnd *cmd) { - NCR5380_local_declare(); - struct NCR5380_hostdata *hostdata = (struct NCR5380_hostdata *) instance->hostdata; + struct NCR5380_hostdata *hostdata = shost_priv(instance); unsigned char tmp[3], phase; unsigned char *data; int len; - unsigned long timeout; - unsigned char value; int err; - NCR5380_setup(instance); - - if (hostdata->selecting) - goto part2; - - hostdata->restart_select = 0; NCR5380_dprint(NDEBUG_ARBITRATION, instance); - dprintk(NDEBUG_ARBITRATION, "scsi%d : starting arbitration, id = %d\n", instance->host_no, instance->this_id); + dsprintk(NDEBUG_ARBITRATION, instance, "starting arbitration, id = %d\n", + instance->this_id); + + /* + * Arbitration and selection phases are slow and involve dropping the + * lock, so we have to watch out for EH. An exception handler may + * change 'selecting' to NULL. This function will then return NULL + * so that the caller will forget about 'cmd'. (During information + * transfer phases, EH may change 'connected' to NULL.) + */ + hostdata->selecting = cmd; - /* - * Set the phase bits to 0, otherwise the NCR5380 won't drive the + /* + * Set the phase bits to 0, otherwise the NCR5380 won't drive the * data bus during SELECTION. */ NCR5380_write(TARGET_COMMAND_REG, 0); - /* + /* * Start arbitration. */ NCR5380_write(OUTPUT_DATA_REG, hostdata->id_mask); NCR5380_write(MODE_REG, MR_ARBITRATE); + /* The chip now waits for BUS FREE phase. Then after the 800 ns + * Bus Free Delay, arbitration will begin. + */ - /* We can be relaxed here, interrupts are on, we are - in workqueue context, the birds are singing in the trees */ - spin_unlock_irq(instance->host_lock); - err = NCR5380_poll_politely(instance, INITIATOR_COMMAND_REG, ICR_ARBITRATION_PROGRESS, ICR_ARBITRATION_PROGRESS, 5*HZ); - spin_lock_irq(instance->host_lock); + spin_unlock_irq(&hostdata->lock); + err = NCR5380_poll_politely2(instance, MODE_REG, MR_ARBITRATE, 0, + INITIATOR_COMMAND_REG, ICR_ARBITRATION_PROGRESS, + ICR_ARBITRATION_PROGRESS, HZ); + spin_lock_irq(&hostdata->lock); + if (!(NCR5380_read(MODE_REG) & MR_ARBITRATE)) { + /* Reselection interrupt */ + goto out; + } if (err < 0) { - printk(KERN_DEBUG "scsi: arbitration timeout at %d\n", __LINE__); NCR5380_write(MODE_REG, MR_BASE); - NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); - goto failed; + shost_printk(KERN_ERR, instance, + "select: arbitration timeout\n"); + goto out; } + spin_unlock_irq(&hostdata->lock); - dprintk(NDEBUG_ARBITRATION, "scsi%d : arbitration complete\n", instance->host_no); - - /* - * The arbitration delay is 2.2us, but this is a minimum and there is - * no maximum so we can safely sleep for ceil(2.2) usecs to accommodate - * the integral nature of udelay(). - * - */ - + /* The SCSI-2 arbitration delay is 2.4 us */ udelay(3); /* Check for lost arbitration */ - if ((NCR5380_read(INITIATOR_COMMAND_REG) & ICR_ARBITRATION_LOST) || (NCR5380_read(CURRENT_SCSI_DATA_REG) & hostdata->id_higher_mask) || (NCR5380_read(INITIATOR_COMMAND_REG) & ICR_ARBITRATION_LOST)) { - NCR5380_write(MODE_REG, MR_BASE); - dprintk(NDEBUG_ARBITRATION, "scsi%d : lost arbitration, deasserting MR_ARBITRATE\n", instance->host_no); - goto failed; - } - NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_SEL); - - if (!(hostdata->flags & FLAG_DTC3181E) && - /* RvC: DTC3181E has some trouble with this - * so we simply removed it. 
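NCR5380_poll_politely() and the two-register NCR5380_poll_politely2() used above guard every wait in the rewritten driver: 0 means the masked bits matched, a negative value means timeout. Their bodies are outside this hunk; a single-register sketch of the contract the callers rely on (the real helper can also yield the CPU on long waits, which is why callers drop the lock first):

        /* Illustrative: poll until (reg & bit) == val or the wait expires. */
        static int NCR5380_poll_politely(struct Scsi_Host *instance,
                                         int reg, int bit, int val, int wait)
        {
                unsigned long deadline = jiffies + wait;

                do {
                        if ((NCR5380_read(reg) & bit) == val)
                                return 0;
                        cpu_relax();
                } while (time_before(jiffies, deadline));

                return -ETIMEDOUT;
        }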
Seems to work with - * only Mustek scanner attached - */ + if ((NCR5380_read(INITIATOR_COMMAND_REG) & ICR_ARBITRATION_LOST) || + (NCR5380_read(CURRENT_SCSI_DATA_REG) & hostdata->id_higher_mask) || (NCR5380_read(INITIATOR_COMMAND_REG) & ICR_ARBITRATION_LOST)) { NCR5380_write(MODE_REG, MR_BASE); - NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); - dprintk(NDEBUG_ARBITRATION, "scsi%d : lost arbitration, deasserting ICR_ASSERT_SEL\n", instance->host_no); - goto failed; + dsprintk(NDEBUG_ARBITRATION, instance, "lost arbitration, deasserting MR_ARBITRATE\n"); + spin_lock_irq(&hostdata->lock); + goto out; } - /* - * Again, bus clear + bus settle time is 1.2us, however, this is + + /* After/during arbitration, BSY should be asserted. + * IBM DPES-31080 Version S31Q works now + * Tnx to Thomas_Roesch@m2.maus.de for finding this! (Roman) + */ + NCR5380_write(INITIATOR_COMMAND_REG, + ICR_BASE | ICR_ASSERT_SEL | ICR_ASSERT_BSY); + + /* + * Again, bus clear + bus settle time is 1.2us, however, this is * a minimum so we'll udelay ceil(1.2) */ - udelay(2); + if (hostdata->flags & FLAG_TOSHIBA_DELAY) + udelay(15); + else + udelay(2); + + spin_lock_irq(&hostdata->lock); + + /* NCR5380_reselect() clears MODE_REG after a reselection interrupt */ + if (!(NCR5380_read(MODE_REG) & MR_ARBITRATE)) + goto out; + + if (!hostdata->selecting) { + NCR5380_write(MODE_REG, MR_BASE); + NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); + goto out; + } - dprintk(NDEBUG_ARBITRATION, "scsi%d : won arbitration\n", instance->host_no); + dsprintk(NDEBUG_ARBITRATION, instance, "won arbitration\n"); - /* - * Now that we have won arbitration, start Selection process, asserting + /* + * Now that we have won arbitration, start Selection process, asserting * the host and target ID's on the SCSI bus. */ - NCR5380_write(OUTPUT_DATA_REG, (hostdata->id_mask | (1 << scmd_id(cmd)))); + NCR5380_write(OUTPUT_DATA_REG, hostdata->id_mask | (1 << scmd_id(cmd))); - /* + /* * Raise ATN while SEL is true before BSY goes false from arbitration, * since this is the only way to guarantee that we'll get a MESSAGE OUT * phase immediately after selection. */ - NCR5380_write(INITIATOR_COMMAND_REG, (ICR_BASE | ICR_ASSERT_BSY | ICR_ASSERT_DATA | ICR_ASSERT_ATN | ICR_ASSERT_SEL)); + NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_BSY | + ICR_ASSERT_DATA | ICR_ASSERT_ATN | ICR_ASSERT_SEL); NCR5380_write(MODE_REG, MR_BASE); - /* + /* * Reselect interrupts must be turned off prior to the dropping of BSY, * otherwise we will trigger an interrupt. */ NCR5380_write(SELECT_ENABLE_REG, 0); + spin_unlock_irq(&hostdata->lock); + /* - * The initiator shall then wait at least two deskew delays and release + * The initiator shall then wait at least two deskew delays and release * the BSY signal. */ - udelay(1); /* wingel -- wait two bus deskew delay >2*45ns */ + udelay(1); /* wingel -- wait two bus deskew delay >2*45ns */ /* Reset BSY */ - NCR5380_write(INITIATOR_COMMAND_REG, (ICR_BASE | ICR_ASSERT_DATA | ICR_ASSERT_ATN | ICR_ASSERT_SEL)); + NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_DATA | + ICR_ASSERT_ATN | ICR_ASSERT_SEL); - /* + /* * Something weird happens when we cease to drive BSY - looks - * like the board/chip is letting us do another read before the + * like the board/chip is letting us do another read before the * appropriate propagation delay has expired, and we're confusing * a BSY signal from ourselves as the target's response to SELECTION. 
* * A small delay (the 'C++' frontend breaks the pipeline with an * unnecessary jump, making it work on my 386-33/Trantor T128, the - * tighter 'C' code breaks and requires this) solves the problem - - * the 1 us delay is arbitrary, and only used because this delay will - * be the same on other platforms and since it works here, it should + * tighter 'C' code breaks and requires this) solves the problem - + * the 1 us delay is arbitrary, and only used because this delay will + * be the same on other platforms and since it works here, it should * work there. * * wingel suggests that this could be due to failing to wait @@ -1339,50 +1162,43 @@ static int NCR5380_select(struct Scsi_Host *instance, struct scsi_cmnd *cmd) udelay(1); - dprintk(NDEBUG_SELECTION, "scsi%d : selecting target %d\n", instance->host_no, scmd_id(cmd)); + dsprintk(NDEBUG_SELECTION, instance, "selecting target %d\n", scmd_id(cmd)); - /* - * The SCSI specification calls for a 250 ms timeout for the actual + /* + * The SCSI specification calls for a 250 ms timeout for the actual * selection. */ - timeout = jiffies + msecs_to_jiffies(250); - - /* - * XXX very interesting - we're seeing a bounce where the BSY we - * asserted is being reflected / still asserted (propagation delay?) - * and it's detecting as true. Sigh. - */ - - hostdata->select_time = 0; /* we count the clock ticks at which we polled */ - hostdata->selecting = cmd; + err = NCR5380_poll_politely(instance, STATUS_REG, SR_BSY, SR_BSY, + msecs_to_jiffies(250)); -part2: - /* RvC: here we enter after a sleeping period, or immediately after - execution of part 1 - we poll only once ech clock tick */ - value = NCR5380_read(STATUS_REG) & (SR_BSY | SR_IO); - - if (!value && (hostdata->select_time < HZ/4)) { - /* RvC: we still must wait for a device response */ - hostdata->select_time++; /* after 25 ticks the device has failed */ - NCR5380_set_timer(hostdata, 1); - return 0; /* RvC: we return here with hostdata->selecting set, - to go to sleep */ - } - - hostdata->selecting = NULL;/* clear this pointer, because we passed the - waiting period */ if ((NCR5380_read(STATUS_REG) & (SR_SEL | SR_IO)) == (SR_SEL | SR_IO)) { + spin_lock_irq(&hostdata->lock); NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); NCR5380_reselect(instance); - printk("scsi%d : reselection after won arbitration?\n", instance->host_no); + if (!hostdata->connected) + NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); + shost_printk(KERN_ERR, instance, "reselection after won arbitration?\n"); + goto out; + } + + if (err < 0) { + spin_lock_irq(&hostdata->lock); + NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); - return -1; + /* Can't touch cmd if it has been reclaimed by the scsi ML */ + if (hostdata->selecting) { + cmd->result = DID_BAD_TARGET << 16; + complete_cmd(instance, cmd); + dsprintk(NDEBUG_SELECTION, instance, "target did not respond within 250ms\n"); + cmd = NULL; + } + goto out; } - /* - * No less than two deskew delays after the initiator detects the - * BSY signal is true, it shall release the SEL signal and may + + /* + * No less than two deskew delays after the initiator detects the + * BSY signal is true, it shall release the SEL signal and may * change the DATA BUS. 
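complete_cmd(), first called above for the DID_BAD_TARGET case, replaces the old direct cmd->scsi_done() calls. Its body is elsewhere in the patch; judging from the call sites it must at least drop the busy-LUN bit and close out an autosense cycle using the hostdata->ses area the old code managed inline. A rough sketch under those assumptions (hostdata->sensing is presumed, not shown in this hunk):

        /* Sketch of the duties implied by the call sites; details may differ. */
        static void complete_cmd(struct Scsi_Host *instance, struct scsi_cmnd *cmd)
        {
                struct NCR5380_hostdata *hostdata = shost_priv(instance);

                if (hostdata->sensing == cmd) {
                        /* Autosense finished: restore the original command */
                        scsi_eh_restore_cmnd(cmd, &hostdata->ses);
                        hostdata->sensing = NULL;
                }
                hostdata->busy[scmd_id(cmd)] &= ~(1 << cmd->device->lun);
                cmd->scsi_done(cmd);
        }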
-wingel */ @@ -1390,53 +1206,38 @@ part2: NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_ATN); - if (!(NCR5380_read(STATUS_REG) & SR_BSY)) { - NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); - if (hostdata->targets_present & (1 << scmd_id(cmd))) { - printk(KERN_DEBUG "scsi%d : weirdness\n", instance->host_no); - if (hostdata->restart_select) - printk(KERN_DEBUG "\trestart select\n"); - NCR5380_dprint(NDEBUG_SELECTION, instance); - NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); - return -1; - } - cmd->result = DID_BAD_TARGET << 16; - cmd->scsi_done(cmd); - NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); - dprintk(NDEBUG_SELECTION, "scsi%d : target did not respond within 250ms\n", instance->host_no); - NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); - return 0; - } - hostdata->targets_present |= (1 << scmd_id(cmd)); - /* - * Since we followed the SCSI spec, and raised ATN while SEL + * Since we followed the SCSI spec, and raised ATN while SEL * was true but before BSY was false during selection, the information * transfer phase should be a MESSAGE OUT phase so that we can send the * IDENTIFY message. - * + * * If SCSI-II tagged queuing is enabled, we also send a SIMPLE_QUEUE_TAG * message (2 bytes) with a tag ID that we increment with every command * until it wraps back to 0. * * XXX - it turns out that there are some broken SCSI-II devices, - * which claim to support tagged queuing but fail when more than - * some number of commands are issued at once. + * which claim to support tagged queuing but fail when more than + * some number of commands are issued at once. */ /* Wait for start of REQ/ACK handshake */ - spin_unlock_irq(instance->host_lock); err = NCR5380_poll_politely(instance, STATUS_REG, SR_REQ, SR_REQ, HZ); - spin_lock_irq(instance->host_lock); - - if(err) { - printk(KERN_ERR "scsi%d: timeout at NCR5380.c:%d\n", instance->host_no, __LINE__); + spin_lock_irq(&hostdata->lock); + if (err < 0) { + shost_printk(KERN_ERR, instance, "select: REQ timeout\n"); + NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); - goto failed; + goto out; + } + if (!hostdata->selecting) { + do_abort(instance); + goto out; } - dprintk(NDEBUG_SELECTION, "scsi%d : target %d selected, going into MESSAGE OUT phase.\n", instance->host_no, cmd->device->id); + dsprintk(NDEBUG_SELECTION, instance, "target %d selected, going into MESSAGE OUT phase.\n", + scmd_id(cmd)); tmp[0] = IDENTIFY(((instance->irq == NO_IRQ) ? 
0 : 1), cmd->device->lun); len = 1; @@ -1446,104 +1247,82 @@ part2: data = tmp; phase = PHASE_MSGOUT; NCR5380_transfer_pio(instance, &phase, &len, &data); - dprintk(NDEBUG_SELECTION, "scsi%d : nexus established.\n", instance->host_no); + dsprintk(NDEBUG_SELECTION, instance, "nexus established.\n"); /* XXX need to handle errors here */ + hostdata->connected = cmd; - hostdata->busy[cmd->device->id] |= (1 << (cmd->device->lun & 0xFF)); + hostdata->busy[cmd->device->id] |= 1 << cmd->device->lun; initialize_SCp(cmd); - return 0; - - /* Selection failed */ -failed: - return -1; + cmd = NULL; +out: + if (!hostdata->selecting) + return NULL; + hostdata->selecting = NULL; + return cmd; } -/* - * Function : int NCR5380_transfer_pio (struct Scsi_Host *instance, - * unsigned char *phase, int *count, unsigned char **data) +/* + * Function : int NCR5380_transfer_pio (struct Scsi_Host *instance, + * unsigned char *phase, int *count, unsigned char **data) * * Purpose : transfers data in given phase using polled I/O * - * Inputs : instance - instance of driver, *phase - pointer to - * what phase is expected, *count - pointer to number of - * bytes to transfer, **data - pointer to data pointer. - * + * Inputs : instance - instance of driver, *phase - pointer to + * what phase is expected, *count - pointer to number of + * bytes to transfer, **data - pointer to data pointer. + * * Returns : -1 when different phase is entered without transferring - * maximum number of bytes, 0 if all bytes or transferred or exit - * is in same phase. + * maximum number of bytes, 0 if all bytes are transferred or exit + * is in same phase. * - * Also, *phase, *count, *data are modified in place. + * Also, *phase, *count, *data are modified in place. * * XXX Note : handling for bus free may be useful. */ /* - * Note : this code is not as quick as it could be, however it + * Note : this code is not as quick as it could be, however it * IS 100% reliable, and for the actual data transfer where speed * counts, we will always do a pseudo DMA or DMA transfer. 
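The IDENTIFY byte built just before MESSAGE OUT in NCR5380_select() above follows the SCSI-2 message format: bit 7 always set, bit 6 granting disconnect privilege (here only when the board has a working IRQ), and the LUN in the low three bits. The macro comes from the classic scsi headers, approximately:

        #define IDENTIFY_BASE   0x80
        #define IDENTIFY(can_disconnect, lun) \
                (IDENTIFY_BASE | ((can_disconnect) ? 0x40 : 0) | ((lun) & 0x07))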
*/ -static int NCR5380_transfer_pio(struct Scsi_Host *instance, unsigned char *phase, int *count, unsigned char **data) { - NCR5380_local_declare(); +static int NCR5380_transfer_pio(struct Scsi_Host *instance, + unsigned char *phase, int *count, + unsigned char **data) +{ unsigned char p = *phase, tmp; int c = *count; unsigned char *d = *data; - /* - * RvC: some administrative data to process polling time - */ - int break_allowed = 0; - struct NCR5380_hostdata *hostdata = (struct NCR5380_hostdata *) instance->hostdata; - NCR5380_setup(instance); - - if (!(p & SR_IO)) - dprintk(NDEBUG_PIO, "scsi%d : pio write %d bytes\n", instance->host_no, c); - else - dprintk(NDEBUG_PIO, "scsi%d : pio read %d bytes\n", instance->host_no, c); - /* - * The NCR5380 chip will only drive the SCSI bus when the + /* + * The NCR5380 chip will only drive the SCSI bus when the * phase specified in the appropriate bits of the TARGET COMMAND * REGISTER match the STATUS REGISTER */ - NCR5380_write(TARGET_COMMAND_REG, PHASE_SR_TO_TCR(p)); + NCR5380_write(TARGET_COMMAND_REG, PHASE_SR_TO_TCR(p)); - /* RvC: don't know if this is necessary, but other SCSI I/O is short - * so breaks are not necessary there - */ - if ((p == PHASE_DATAIN) || (p == PHASE_DATAOUT)) { - break_allowed = 1; - } do { - /* - * Wait for assertion of REQ, after which the phase bits will be - * valid - */ - - /* RvC: we simply poll once, after that we stop temporarily - * and let the device buffer fill up - * if breaking is not allowed, we keep polling as long as needed + /* + * Wait for assertion of REQ, after which the phase bits will be + * valid */ - /* FIXME */ - while (!((tmp = NCR5380_read(STATUS_REG)) & SR_REQ) && !break_allowed); - if (!(tmp & SR_REQ)) { - /* timeout condition */ - NCR5380_set_timer(hostdata, USLEEP_SLEEP); + if (NCR5380_poll_politely(instance, STATUS_REG, SR_REQ, SR_REQ, HZ) < 0) break; - } - dprintk(NDEBUG_HANDSHAKE, "scsi%d : REQ detected\n", instance->host_no); + dsprintk(NDEBUG_HANDSHAKE, instance, "REQ asserted\n"); /* Check for phase mismatch */ - if ((tmp & PHASE_MASK) != p) { - dprintk(NDEBUG_HANDSHAKE, "scsi%d : phase mismatch\n", instance->host_no); - NCR5380_dprint_phase(NDEBUG_HANDSHAKE, instance); + if ((NCR5380_read(STATUS_REG) & PHASE_MASK) != p) { + dsprintk(NDEBUG_PIO, instance, "phase mismatch\n"); + NCR5380_dprint_phase(NDEBUG_PIO, instance); break; } + /* Do actual transfer from SCSI bus to / from memory */ if (!(p & SR_IO)) NCR5380_write(OUTPUT_DATA_REG, *d); @@ -1552,7 +1331,7 @@ static int NCR5380_transfer_pio(struct Scsi_Host *instance, unsigned char *phase ++d; - /* + /* * The SCSI standard suggests that in MSGOUT phase, the initiator * should drop ATN on the last byte of the message phase * after REQ has been asserted for the handshake but before @@ -1563,29 +1342,34 @@ static int NCR5380_transfer_pio(struct Scsi_Host *instance, unsigned char *phase if (!((p & SR_MSG) && c > 1)) { NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_DATA); NCR5380_dprint(NDEBUG_PIO, instance); - NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_DATA | ICR_ASSERT_ACK); + NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | + ICR_ASSERT_DATA | ICR_ASSERT_ACK); } else { - NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_DATA | ICR_ASSERT_ATN); + NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | + ICR_ASSERT_DATA | ICR_ASSERT_ATN); NCR5380_dprint(NDEBUG_PIO, instance); - NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_DATA | ICR_ASSERT_ATN | ICR_ASSERT_ACK); + 
NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | + ICR_ASSERT_DATA | ICR_ASSERT_ATN | ICR_ASSERT_ACK); } } else { NCR5380_dprint(NDEBUG_PIO, instance); NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_ACK); } - /* FIXME - if this fails bus reset ?? */ - NCR5380_poll_politely(instance, STATUS_REG, SR_REQ, 0, 5*HZ); - dprintk(NDEBUG_HANDSHAKE, "scsi%d : req false, handshake complete\n", instance->host_no); + if (NCR5380_poll_politely(instance, + STATUS_REG, SR_REQ, 0, 5 * HZ) < 0) + break; + + dsprintk(NDEBUG_HANDSHAKE, instance, "REQ negated, handshake complete\n"); /* - * We have several special cases to consider during REQ/ACK handshaking : - * 1. We were in MSGOUT phase, and we are on the last byte of the - * message. ATN must be dropped as ACK is dropped. + * We have several special cases to consider during REQ/ACK handshaking : + * 1. We were in MSGOUT phase, and we are on the last byte of the + * message. ATN must be dropped as ACK is dropped. * - * 2. We are in a MSGIN phase, and we are on the last byte of the - * message. We must exit with ACK asserted, so that the calling - * code may raise ATN before dropping ACK to reject the message. + * 2. We are in a MSGIN phase, and we are on the last byte of the + * message. We must exit with ACK asserted, so that the calling + * code may raise ATN before dropping ACK to reject the message. * * 3. ACK and ATN are clear and the target may proceed as normal. */ @@ -1597,12 +1381,16 @@ static int NCR5380_transfer_pio(struct Scsi_Host *instance, unsigned char *phase } } while (--c); - dprintk(NDEBUG_PIO, "scsi%d : residual %d\n", instance->host_no, c); + dsprintk(NDEBUG_PIO, instance, "residual %d\n", c); *count = c; *data = d; tmp = NCR5380_read(STATUS_REG); - if (tmp & SR_REQ) + /* The phase read from the bus is valid if either REQ is (already) + * asserted or if ACK hasn't been released yet. The latter applies if + * we're in MSG IN, DATA IN or STATUS and all bytes have been received. + */ + if ((tmp & SR_REQ) || ((tmp & SR_IO) && c == 0)) *phase = tmp & PHASE_MASK; else *phase = PHASE_UNKNOWN; @@ -1614,79 +1402,80 @@ static int NCR5380_transfer_pio(struct Scsi_Host *instance, unsigned char *phase } /** - * do_reset - issue a reset command - * @host: adapter to reset + * do_reset - issue a reset command + * @instance: adapter to reset * - * Issue a reset sequence to the NCR5380 and try and get the bus - * back into sane shape. + * Issue a reset sequence to the NCR5380 and try and get the bus + * back into sane shape. * - * Locks: caller holds queue lock + * This clears the reset interrupt flag because there may be no handler for + * it. When the driver is initialized, the NCR5380_intr() handler has not yet + * been installed. And when in EH we may have released the ST DMA interrupt. */ - -static void do_reset(struct Scsi_Host *host) { - NCR5380_local_declare(); - NCR5380_setup(host); - NCR5380_write(TARGET_COMMAND_REG, PHASE_SR_TO_TCR(NCR5380_read(STATUS_REG) & PHASE_MASK)); +static void do_reset(struct Scsi_Host *instance) +{ + unsigned long flags; + + local_irq_save(flags); + NCR5380_write(TARGET_COMMAND_REG, + PHASE_SR_TO_TCR(NCR5380_read(STATUS_REG) & PHASE_MASK)); NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_RST); - udelay(25); + udelay(50); NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); + (void)NCR5380_read(RESET_PARITY_INTERRUPT_REG); + local_irq_restore(flags); } -/* - * Function : do_abort (Scsi_Host *host) - * - * Purpose : abort the currently established nexus. 
Should only be - * called from a routine which can drop into a - * - * Returns : 0 on success, -1 on failure. - * - * Locks: queue lock held by caller - * FIXME: sort this out and get new_eh running +/** + * do_abort - abort the currently established nexus by going to + * MESSAGE OUT phase and sending an ABORT message. + * @instance: relevant scsi host instance + * + * Returns 0 on success, -1 on failure. */ -static int do_abort(struct Scsi_Host *host) { - NCR5380_local_declare(); +static int do_abort(struct Scsi_Host *instance) +{ unsigned char *msgptr, phase, tmp; int len; int rc; - NCR5380_setup(host); - /* Request message out phase */ NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_ATN); - /* - * Wait for the target to indicate a valid phase by asserting - * REQ. Once this happens, we'll have either a MSGOUT phase - * and can immediately send the ABORT message, or we'll have some + /* + * Wait for the target to indicate a valid phase by asserting + * REQ. Once this happens, we'll have either a MSGOUT phase + * and can immediately send the ABORT message, or we'll have some * other phase and will have to source/sink data. - * + * * We really don't care what value was on the bus or what value * the target sees, so we just handshake. */ - rc = NCR5380_poll_politely(host, STATUS_REG, SR_REQ, SR_REQ, 60 * HZ); - - if(rc < 0) - return -1; + rc = NCR5380_poll_politely(instance, STATUS_REG, SR_REQ, SR_REQ, 10 * HZ); + if (rc < 0) + goto timeout; + + tmp = NCR5380_read(STATUS_REG) & PHASE_MASK; - tmp = (unsigned char)rc; - NCR5380_write(TARGET_COMMAND_REG, PHASE_SR_TO_TCR(tmp)); - if ((tmp & PHASE_MASK) != PHASE_MSGOUT) { - NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_ATN | ICR_ASSERT_ACK); - rc = NCR5380_poll_politely(host, STATUS_REG, SR_REQ, 0, 3*HZ); + if (tmp != PHASE_MSGOUT) { + NCR5380_write(INITIATOR_COMMAND_REG, + ICR_BASE | ICR_ASSERT_ATN | ICR_ASSERT_ACK); + rc = NCR5380_poll_politely(instance, STATUS_REG, SR_REQ, 0, 3 * HZ); + if (rc < 0) + goto timeout; NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_ATN); - if(rc == -1) - return -1; } + tmp = ABORT; msgptr = &tmp; len = 1; phase = PHASE_MSGOUT; - NCR5380_transfer_pio(host, &phase, &len, &msgptr); + NCR5380_transfer_pio(instance, &phase, &len, &msgptr); /* * If we got here, and the command completed successfully, @@ -1694,32 +1483,37 @@ static int do_abort(struct Scsi_Host *host) { */ return len ? -1 : 0; + +timeout: + NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); + return -1; } #if defined(REAL_DMA) || defined(PSEUDO_DMA) || defined (REAL_DMA_POLL) -/* - * Function : int NCR5380_transfer_dma (struct Scsi_Host *instance, - * unsigned char *phase, int *count, unsigned char **data) +/* + * Function : int NCR5380_transfer_dma (struct Scsi_Host *instance, + * unsigned char *phase, int *count, unsigned char **data) * * Purpose : transfers data in given phase using either real - * or pseudo DMA. + * or pseudo DMA. * - * Inputs : instance - instance of driver, *phase - pointer to - * what phase is expected, *count - pointer to number of - * bytes to transfer, **data - pointer to data pointer. - * - * Returns : -1 when different phase is entered without transferring - * maximum number of bytes, 0 if all bytes or transferred or exit - * is in same phase. + * Inputs : instance - instance of driver, *phase - pointer to + * what phase is expected, *count - pointer to number of + * bytes to transfer, **data - pointer to data pointer. * - * Also, *phase, *count, *data are modified in place. 
+ * Returns : -1 when different phase is entered without transferring + * maximum number of bytes, 0 if all bytes are transferred or exit + * is in same phase. * - * Locks: io_request lock held by caller + * Also, *phase, *count, *data are modified in place. */ -static int NCR5380_transfer_dma(struct Scsi_Host *instance, unsigned char *phase, int *count, unsigned char **data) { - NCR5380_local_declare(); +static int NCR5380_transfer_dma(struct Scsi_Host *instance, + unsigned char *phase, int *count, + unsigned char **data) +{ + struct NCR5380_hostdata *hostdata = shost_priv(instance); register int c = *count; register unsigned char p = *phase; register unsigned char *d = *data; @@ -1730,54 +1524,47 @@ static int NCR5380_transfer_dma(struct Scsi_Host *instance, unsigned char *phase unsigned char saved_data = 0, overrun = 0, residue; #endif - struct NCR5380_hostdata *hostdata = (struct NCR5380_hostdata *) instance->hostdata; - - NCR5380_setup(instance); - if ((tmp = (NCR5380_read(STATUS_REG) & PHASE_MASK)) != p) { *phase = tmp; return -1; } #if defined(REAL_DMA) || defined(REAL_DMA_POLL) -#ifdef READ_OVERRUNS if (p & SR_IO) { - c -= 2; + if (!(hostdata->flags & FLAG_NO_DMA_FIXUPS)) + c -= 2; } -#endif - dprintk(NDEBUG_DMA, "scsi%d : initializing DMA channel %d for %s, %d bytes %s %0x\n", instance->host_no, instance->dma_channel, (p & SR_IO) ? "reading" : "writing", c, (p & SR_IO) ? "to" : "from", (unsigned) d); hostdata->dma_len = (p & SR_IO) ? NCR5380_dma_read_setup(instance, d, c) : NCR5380_dma_write_setup(instance, d, c); + + dsprintk(NDEBUG_DMA, instance, "initializing DMA %s: length %d, address %p\n", + (p & SR_IO) ? "receive" : "send", c, *data); #endif NCR5380_write(TARGET_COMMAND_REG, PHASE_SR_TO_TCR(p)); #ifdef REAL_DMA - NCR5380_write(MODE_REG, MR_BASE | MR_DMA_MODE | MR_ENABLE_EOP_INTR | MR_MONITOR_BSY); + NCR5380_write(MODE_REG, MR_BASE | MR_DMA_MODE | MR_MONITOR_BSY | + MR_ENABLE_EOP_INTR); #elif defined(REAL_DMA_POLL) - NCR5380_write(MODE_REG, MR_BASE | MR_DMA_MODE); + NCR5380_write(MODE_REG, MR_BASE | MR_DMA_MODE | MR_MONITOR_BSY); #else /* * Note : on my sample board, watch-dog timeouts occurred when interrupts - * were not disabled for the duration of a single DMA transfer, from + * were not disabled for the duration of a single DMA transfer, from * before the setting of DMA mode to after transfer of the last byte. */ -#if defined(PSEUDO_DMA) && defined(UNSAFE) - spin_unlock_irq(instance->host_lock); -#endif - /* KLL May need eop and parity in 53c400 */ - if (hostdata->flags & FLAG_NCR53C400) - NCR5380_write(MODE_REG, MR_BASE | MR_DMA_MODE | - MR_ENABLE_PAR_CHECK | MR_ENABLE_PAR_INTR | - MR_ENABLE_EOP_INTR | MR_MONITOR_BSY); + if (hostdata->flags & FLAG_NO_DMA_FIXUP) + NCR5380_write(MODE_REG, MR_BASE | MR_DMA_MODE | MR_MONITOR_BSY | + MR_ENABLE_EOP_INTR); else - NCR5380_write(MODE_REG, MR_BASE | MR_DMA_MODE); + NCR5380_write(MODE_REG, MR_BASE | MR_DMA_MODE | MR_MONITOR_BSY); #endif /* def REAL_DMA */ dprintk(NDEBUG_DMA, "scsi%d : mode reg = 0x%X\n", instance->host_no, NCR5380_read(MODE_REG)); - /* - * On the PAS16 at least I/O recovery delays are not needed here. - * Everyone else seems to want them. + /* + * On the PAS16 at least I/O recovery delays are not needed here. + * Everyone else seems to want them.
*/ if (p & SR_IO) { @@ -1797,49 +1584,49 @@ static int NCR5380_transfer_dma(struct Scsi_Host *instance, unsigned char *phase } while ((tmp & BASR_PHASE_MATCH) && !(tmp & (BASR_BUSY_ERROR | BASR_END_DMA_TRANSFER))); /* - At this point, either we've completed DMA, or we have a phase mismatch, - or we've unexpectedly lost BUSY (which is a real error). - - For write DMAs, we want to wait until the last byte has been - transferred out over the bus before we turn off DMA mode. Alas, there - seems to be no terribly good way of doing this on a 5380 under all - conditions. For non-scatter-gather operations, we can wait until REQ - and ACK both go false, or until a phase mismatch occurs. Gather-writes - are nastier, since the device will be expecting more data than we - are prepared to send it, and REQ will remain asserted. On a 53C8[01] we - could test LAST BIT SENT to assure transfer (I imagine this is precisely - why this signal was added to the newer chips) but on the older 538[01] - this signal does not exist. The workaround for this lack is a watchdog; - we bail out of the wait-loop after a modest amount of wait-time if - the usual exit conditions are not met. Not a terribly clean or - correct solution :-% - - Reads are equally tricky due to a nasty characteristic of the NCR5380. - If the chip is in DMA mode for an READ, it will respond to a target's - REQ by latching the SCSI data into the INPUT DATA register and asserting - ACK, even if it has _already_ been notified by the DMA controller that - the current DMA transfer has completed! If the NCR5380 is then taken - out of DMA mode, this already-acknowledged byte is lost. - - This is not a problem for "one DMA transfer per command" reads, because - the situation will never arise... either all of the data is DMA'ed - properly, or the target switches to MESSAGE IN phase to signal a - disconnection (either operation bringing the DMA to a clean halt). - However, in order to handle scatter-reads, we must work around the - problem. The chosen fix is to DMA N-2 bytes, then check for the - condition before taking the NCR5380 out of DMA mode. One or two extra - bytes are transferred via PIO as necessary to fill out the original - request. + * At this point, either we've completed DMA, or we have a phase mismatch, + * or we've unexpectedly lost BUSY (which is a real error). + * + * For DMA sends, we want to wait until the last byte has been + * transferred out over the bus before we turn off DMA mode. Alas, there + * seems to be no terribly good way of doing this on a 5380 under all + * conditions. For non-scatter-gather operations, we can wait until REQ + * and ACK both go false, or until a phase mismatch occurs. Gather-sends + * are nastier, since the device will be expecting more data than we + * are prepared to send it, and REQ will remain asserted. On a 53C8[01] we + * could test Last Byte Sent to assure transfer (I imagine this is precisely + * why this signal was added to the newer chips) but on the older 538[01] + * this signal does not exist. The workaround for this lack is a watchdog; + * we bail out of the wait-loop after a modest amount of wait-time if + * the usual exit conditions are not met. Not a terribly clean or + * correct solution :-% + * + * DMA receive is equally tricky due to a nasty characteristic of the NCR5380. 
+ * If the chip is in DMA receive mode, it will respond to a target's + * REQ by latching the SCSI data into the INPUT DATA register and asserting + * ACK, even if it has _already_ been notified by the DMA controller that + * the current DMA transfer has completed! If the NCR5380 is then taken + * out of DMA mode, this already-acknowledged byte is lost. This is + * not a problem for "one DMA transfer per READ command", because + * the situation will never arise... either all of the data is DMA'ed + * properly, or the target switches to MESSAGE IN phase to signal a + * disconnection (either operation bringing the DMA to a clean halt). + * However, in order to handle scatter-receive, we must work around the + * problem. The chosen fix is to DMA N-2 bytes, then check for the + * condition before taking the NCR5380 out of DMA mode. One or two extra + * bytes are transferred via PIO as necessary to fill out the original + * request. */ if (p & SR_IO) { -#ifdef READ_OVERRUNS - udelay(10); - if (((NCR5380_read(BUS_AND_STATUS_REG) & (BASR_PHASE_MATCH | BASR_ACK)) == (BASR_PHASE_MATCH | BASR_ACK))) { - saved_data = NCR5380_read(INPUT_DATA_REGISTER); - overrun = 1; + if (!(hostdata->flags & FLAG_NO_DMA_FIXUPS)) { + udelay(10); + if ((NCR5380_read(BUS_AND_STATUS_REG) & (BASR_PHASE_MATCH | BASR_ACK)) == + (BASR_PHASE_MATCH | BASR_ACK)) { + saved_data = NCR5380_read(INPUT_DATA_REGISTER); + overrun = 1; + } } -#endif } else { int limit = 100; while (((tmp = NCR5380_read(BUS_AND_STATUS_REG)) & BASR_ACK) || (NCR5380_read(STATUS_REG) & SR_REQ)) { @@ -1850,7 +1637,8 @@ static int NCR5380_transfer_dma(struct Scsi_Host *instance, unsigned char *phase } } - dprintk(NDEBUG_DMA, "scsi%d : polled DMA transfer complete, basr 0x%X, sr 0x%X\n", instance->host_no, tmp, NCR5380_read(STATUS_REG)); + dsprintk(NDEBUG_DMA, instance, "polled DMA transfer complete, basr 0x%02x, sr 0x%02x\n", + tmp, NCR5380_read(STATUS_REG)); NCR5380_write(MODE_REG, MR_BASE); NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); @@ -1861,8 +1649,8 @@ static int NCR5380_transfer_dma(struct Scsi_Host *instance, unsigned char *phase *data += c; *phase = NCR5380_read(STATUS_REG) & PHASE_MASK; -#ifdef READ_OVERRUNS - if (*phase == p && (p & SR_IO) && residue == 0) { + if (!(hostdata->flags & FLAG_NO_DMA_FIXUPS) && + *phase == p && (p & SR_IO) && residue == 0) { if (overrun) { dprintk(NDEBUG_DMA, "Got an input overrun, using saved byte\n"); **data = saved_data; @@ -1877,7 +1665,6 @@ static int NCR5380_transfer_dma(struct Scsi_Host *instance, unsigned char *phase NCR5380_transfer_pio(instance, phase, &cnt, data); *count -= toPIO - cnt; } -#endif dprintk(NDEBUG_DMA, "Return with data ptr = 0x%X, count %d, last 0x%X, next 0x%X\n", *data, *count, *(*data + *count - 1), *(*data + *count)); return 0; @@ -1886,95 +1673,64 @@ static int NCR5380_transfer_dma(struct Scsi_Host *instance, unsigned char *phase return 0; #else /* defined(REAL_DMA_POLL) */ if (p & SR_IO) { -#ifdef DMA_WORKS_RIGHT - foo = NCR5380_pread(instance, d, c); -#else - int diff = 1; - if (hostdata->flags & FLAG_NCR53C400) { - diff = 0; - } - if (!(foo = NCR5380_pread(instance, d, c - diff))) { + foo = NCR5380_pread(instance, d, + hostdata->flags & FLAG_NO_DMA_FIXUP ?
c : c - 1); + if (!foo && !(hostdata->flags & FLAG_NO_DMA_FIXUP)) { /* - * We can't disable DMA mode after successfully transferring + * We can't disable DMA mode after successfully transferring * what we plan to be the last byte, since that would open up - * a race condition where if the target asserted REQ before + * a race condition where if the target asserted REQ before * we got the DMA mode reset, the NCR5380 would have latched * an additional byte into the INPUT DATA register and we'd * have dropped it. - * - * The workaround was to transfer one fewer bytes than we - * intended to with the pseudo-DMA read function, wait for + * + * The workaround was to transfer one fewer bytes than we + * intended to with the pseudo-DMA read function, wait for * the chip to latch the last byte, read it, and then disable * pseudo-DMA mode. - * + * * After REQ is asserted, the NCR5380 asserts DRQ and ACK. * REQ is deasserted when ACK is asserted, and not reasserted * until ACK goes false. Since the NCR5380 won't lower ACK * until DACK is asserted, which won't happen unless we twiddle - * the DMA port or we take the NCR5380 out of DMA mode, we - * can guarantee that we won't handshake another extra + * the DMA port or we take the NCR5380 out of DMA mode, we + * can guarantee that we won't handshake another extra * byte. */ - if (!(hostdata->flags & FLAG_NCR53C400)) { - while (!(NCR5380_read(BUS_AND_STATUS_REG) & BASR_DRQ)); - /* Wait for clean handshake */ - while (NCR5380_read(STATUS_REG) & SR_REQ); - d[c - 1] = NCR5380_read(INPUT_DATA_REG); + if (NCR5380_poll_politely(instance, BUS_AND_STATUS_REG, + BASR_DRQ, BASR_DRQ, HZ) < 0) { + foo = -1; + shost_printk(KERN_ERR, instance, "PDMA read: DRQ timeout\n"); + } + if (NCR5380_poll_politely(instance, STATUS_REG, + SR_REQ, 0, HZ) < 0) { + foo = -1; + shost_printk(KERN_ERR, instance, "PDMA read: !REQ timeout\n"); } + d[c - 1] = NCR5380_read(INPUT_DATA_REG); } -#endif } else { -#ifdef DMA_WORKS_RIGHT foo = NCR5380_pwrite(instance, d, c); -#else - int timeout; - dprintk(NDEBUG_C400_PWRITE, "About to pwrite %d bytes\n", c); - if (!(foo = NCR5380_pwrite(instance, d, c))) { + if (!foo && !(hostdata->flags & FLAG_NO_DMA_FIXUP)) { /* - * Wait for the last byte to be sent. If REQ is being asserted for - * the byte we're interested, we'll ACK it and it will go false. + * Wait for the last byte to be sent. If REQ is being asserted for + * the byte we're interested, we'll ACK it and it will go false. 
*/ - if (!(hostdata->flags & FLAG_HAS_LAST_BYTE_SENT)) { - timeout = 20000; - while (!(NCR5380_read(BUS_AND_STATUS_REG) & BASR_DRQ) && (NCR5380_read(BUS_AND_STATUS_REG) & BASR_PHASE_MATCH)); - - if (!timeout) - dprintk(NDEBUG_LAST_BYTE_SENT, "scsi%d : timed out on last byte\n", instance->host_no); - - if (hostdata->flags & FLAG_CHECK_LAST_BYTE_SENT) { - hostdata->flags &= ~FLAG_CHECK_LAST_BYTE_SENT; - if (NCR5380_read(TARGET_COMMAND_REG) & TCR_LAST_BYTE_SENT) { - hostdata->flags |= FLAG_HAS_LAST_BYTE_SENT; - dprintk(NDEBUG_LAST_BYTE_SENT, "scsi%d : last byte sent works\n", instance->host_no); - } - } - } else { - dprintk(NDEBUG_C400_PWRITE, "Waiting for LASTBYTE\n"); - while (!(NCR5380_read(TARGET_COMMAND_REG) & TCR_LAST_BYTE_SENT)); - dprintk(NDEBUG_C400_PWRITE, "Got LASTBYTE\n"); + if (NCR5380_poll_politely2(instance, + BUS_AND_STATUS_REG, BASR_DRQ, BASR_DRQ, + BUS_AND_STATUS_REG, BASR_PHASE_MATCH, 0, HZ) < 0) { + foo = -1; + shost_printk(KERN_ERR, instance, "PDMA write: DRQ and phase timeout\n"); } } -#endif } NCR5380_write(MODE_REG, MR_BASE); NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); - - if ((!(p & SR_IO)) && (hostdata->flags & FLAG_NCR53C400)) { - dprintk(NDEBUG_C400_PWRITE, "53C400w: Checking for IRQ\n"); - if (NCR5380_read(BUS_AND_STATUS_REG) & BASR_IRQ) { - dprintk(NDEBUG_C400_PWRITE, "53C400w: got it, reading reset interrupt reg\n"); - NCR5380_read(RESET_PARITY_INTERRUPT_REG); - } else { - printk("53C400w: IRQ NOT THERE!\n"); - } - } + NCR5380_read(RESET_PARITY_INTERRUPT_REG); *data = d + c; *count = 0; *phase = NCR5380_read(STATUS_REG) & PHASE_MASK; -#if defined(PSEUDO_DMA) && defined(UNSAFE) - spin_lock_irq(instance->host_lock); -#endif /* defined(REAL_DMA_POLL) */ return foo; #endif /* def REAL_DMA */ } @@ -1983,25 +1739,23 @@ static int NCR5380_transfer_dma(struct Scsi_Host *instance, unsigned char *phase /* * Function : NCR5380_information_transfer (struct Scsi_Host *instance) * - * Purpose : run through the various SCSI phases and do as the target - * directs us to. Operates on the currently connected command, - * instance->connected. + * Purpose : run through the various SCSI phases and do as the target + * directs us to. Operates on the currently connected command, + * instance->connected. * * Inputs : instance, instance for which we are doing commands * - * Side effects : SCSI things happen, the disconnected queue will be - * modified if a command disconnects, *instance->connected will - * change. - * - * XXX Note : we need to watch for bus free or a reset condition here - * to recover from an unexpected bus free condition. + * Side effects : SCSI things happen, the disconnected queue will be + * modified if a command disconnects, *instance->connected will + * change. * - * Locks: io_request_lock held by caller in IRQ mode + * XXX Note : we need to watch for bus free or a reset condition here + * to recover from an unexpected bus free condition. 
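The list operations used below (ncmd->list on the disconnected and autosense lists, scsi_cmd_priv() in the loop header) presuppose a small per-command struct carved out of the scsi_cmnd private data area. A sketch of the likely shape; the struct itself is defined outside this hunk:

        struct NCR5380_cmd {
                struct list_head list;  /* links into issue/disconnected/autosense lists */
        };

        /* Inverse of scsi_cmd_priv(): the private area directly follows the scsi_cmnd */
        static inline struct scsi_cmnd *NCR5380_to_scmd(struct NCR5380_cmd *ncmd_ptr)
        {
                return ((struct scsi_cmnd *)ncmd_ptr) - 1;
        }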
*/ -static void NCR5380_information_transfer(struct Scsi_Host *instance) { - NCR5380_local_declare(); - struct NCR5380_hostdata *hostdata = (struct NCR5380_hostdata *)instance->hostdata; +static void NCR5380_information_transfer(struct Scsi_Host *instance) +{ + struct NCR5380_hostdata *hostdata = shost_priv(instance); unsigned char msgout = NOP; int sink = 0; int len; @@ -2010,13 +1764,11 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) { #endif unsigned char *data; unsigned char phase, tmp, extended_msg[10], old_phase = 0xff; - struct scsi_cmnd *cmd = (struct scsi_cmnd *) hostdata->connected; - /* RvC: we need to set the end of the polling time */ - unsigned long poll_time = jiffies + USLEEP_POLL; + struct scsi_cmnd *cmd; - NCR5380_setup(instance); + while ((cmd = hostdata->connected)) { + struct NCR5380_cmd *ncmd = scsi_cmd_priv(cmd); - while (1) { tmp = NCR5380_read(STATUS_REG); /* We only have a valid SCSI phase when REQ is asserted */ if (tmp & SR_REQ) { @@ -2028,24 +1780,28 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) { if (sink && (phase != PHASE_MSGOUT)) { NCR5380_write(TARGET_COMMAND_REG, PHASE_SR_TO_TCR(tmp)); - NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_ATN | ICR_ASSERT_ACK); - while (NCR5380_read(STATUS_REG) & SR_REQ); - NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_ATN); + NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_ATN | + ICR_ASSERT_ACK); + while (NCR5380_read(STATUS_REG) & SR_REQ) + ; + NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | + ICR_ASSERT_ATN); sink = 0; continue; } + switch (phase) { - case PHASE_DATAIN: case PHASE_DATAOUT: #if (NDEBUG & NDEBUG_NO_DATAOUT) - printk("scsi%d : NDEBUG_NO_DATAOUT set, attempted DATAOUT aborted\n", instance->host_no); + shost_printk(KERN_DEBUG, instance, "NDEBUG_NO_DATAOUT set, attempted DATAOUT aborted\n"); sink = 1; do_abort(instance); cmd->result = DID_ERROR << 16; - cmd->scsi_done(cmd); + complete_cmd(instance, cmd); return; #endif - /* + case PHASE_DATAIN: + /* * If there is no room left in the current buffer in the * scatter-gather list, move onto the next one. */ @@ -2055,10 +1811,13 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) { --cmd->SCp.buffers_residual; cmd->SCp.this_residual = cmd->SCp.buffer->length; cmd->SCp.ptr = sg_virt(cmd->SCp.buffer); - dprintk(NDEBUG_INFORMATION, "scsi%d : %d bytes and %d buffers left\n", instance->host_no, cmd->SCp.this_residual, cmd->SCp.buffers_residual); + dsprintk(NDEBUG_INFORMATION, instance, "%d bytes and %d buffers left\n", + cmd->SCp.this_residual, + cmd->SCp.buffers_residual); } + /* - * The preferred transfer method is going to be + * The preferred transfer method is going to be * PSEUDO-DMA for systems that are strictly PIO, * since we can let the hardware do the handshaking. * @@ -2068,50 +1827,39 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) { */ #if defined(PSEUDO_DMA) || defined(REAL_DMA_POLL) - /* KLL - * PSEUDO_DMA is defined here. If this is the g_NCR5380 - * driver then it will always be defined, so the - * FLAG_NO_PSEUDO_DMA is used to inhibit PDMA in the base - * NCR5380 case. I think this is a fairly clean solution. - * We supplement these 2 if's with the flag. 
- */ -#ifdef NCR5380_dma_xfer_len - if (!cmd->device->borken && !(hostdata->flags & FLAG_NO_PSEUDO_DMA) && (transfersize = NCR5380_dma_xfer_len(instance, cmd)) != 0) { -#else - transfersize = cmd->transfersize; - -#ifdef LIMIT_TRANSFERSIZE /* If we have problems with interrupt service */ - if (transfersize > 512) - transfersize = 512; -#endif /* LIMIT_TRANSFERSIZE */ - - if (!cmd->device->borken && transfersize && !(hostdata->flags & FLAG_NO_PSEUDO_DMA) && cmd->SCp.this_residual && !(cmd->SCp.this_residual % transfersize)) { - /* Limit transfers to 32K, for xx400 & xx406 - * pseudoDMA that transfers in 128 bytes blocks. */ - if (transfersize > 32 * 1024) - transfersize = 32 * 1024; -#endif + transfersize = 0; + if (!cmd->device->borken && + !(hostdata->flags & FLAG_NO_PSEUDO_DMA)) + transfersize = NCR5380_dma_xfer_len(instance, cmd, phase); + + if (transfersize) { len = transfersize; - if (NCR5380_transfer_dma(instance, &phase, &len, (unsigned char **) &cmd->SCp.ptr)) { + if (NCR5380_transfer_dma(instance, &phase, + &len, (unsigned char **)&cmd->SCp.ptr)) { /* - * If the watchdog timer fires, all future accesses to this - * device will use the polled-IO. + * If the watchdog timer fires, all future + * accesses to this device will use the + * polled-IO. */ scmd_printk(KERN_INFO, cmd, - "switching to slow handshake\n"); + "switching to slow handshake\n"); cmd->device->borken = 1; - NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_ATN); sink = 1; do_abort(instance); cmd->result = DID_ERROR << 16; - cmd->scsi_done(cmd); + complete_cmd(instance, cmd); /* XXX - need to source or sink data here, as appropriate */ } else cmd->SCp.this_residual -= transfersize - len; } else #endif /* defined(PSEUDO_DMA) || defined(REAL_DMA_POLL) */ - NCR5380_transfer_pio(instance, &phase, (int *) &cmd->SCp.this_residual, (unsigned char **) - &cmd->SCp.ptr); + { + spin_unlock_irq(&hostdata->lock); + NCR5380_transfer_pio(instance, &phase, + (int *)&cmd->SCp.this_residual, + (unsigned char **)&cmd->SCp.ptr); + spin_lock_irq(&hostdata->lock); + } break; case PHASE_MSGIN: len = 1; @@ -2120,101 +1868,42 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) { cmd->SCp.Message = tmp; switch (tmp) { - /* - * Linking lets us reduce the time required to get the - * next command out to the device, hopefully this will - * mean we don't waste another revolution due to the delays - * required by ARBITRATION and another SELECTION. - * - * In the current implementation proposal, low level drivers - * merely have to start the next command, pointed to by - * next_link, done() is called as with unlinked commands. - */ -#ifdef LINKED - case LINKED_CMD_COMPLETE: - case LINKED_FLG_CMD_COMPLETE: - /* Accept message by clearing ACK */ - NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); - dprintk(NDEBUG_LINKED, "scsi%d : target %d lun %llu linked command complete.\n", instance->host_no, cmd->device->id, cmd->device->lun); - /* - * Sanity check : A linked command should only terminate with - * one of these messages if there are more linked commands - * available. 
- */ - if (!cmd->next_link) { - printk("scsi%d : target %d lun %llu linked command complete, no next_link\n" instance->host_no, cmd->device->id, cmd->device->lun); - sink = 1; - do_abort(instance); - return; - } - initialize_SCp(cmd->next_link); - /* The next command is still part of this process */ - cmd->next_link->tag = cmd->tag; - cmd->result = cmd->SCp.Status | (cmd->SCp.Message << 8); - dprintk(NDEBUG_LINKED, "scsi%d : target %d lun %llu linked request done, calling scsi_done().\n", instance->host_no, cmd->device->id, cmd->device->lun); - cmd->scsi_done(cmd); - cmd = hostdata->connected; - break; -#endif /* def LINKED */ case ABORT: case COMMAND_COMPLETE: /* Accept message by clearing ACK */ sink = 1; NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); - hostdata->connected = NULL; - dprintk(NDEBUG_QUEUES, "scsi%d : command for target %d, lun %llu completed\n", instance->host_no, cmd->device->id, cmd->device->lun); - hostdata->busy[cmd->device->id] &= ~(1 << (cmd->device->lun & 0xFF)); - - /* - * I'm not sure what the correct thing to do here is : - * - * If the command that just executed is NOT a request - * sense, the obvious thing to do is to set the result - * code to the values of the stored parameters. - * - * If it was a REQUEST SENSE command, we need some way - * to differentiate between the failure code of the original - * and the failure code of the REQUEST sense - the obvious - * case is success, where we fall through and leave the result - * code unchanged. - * - * The non-obvious place is where the REQUEST SENSE failed - */ - - if (cmd->cmnd[0] != REQUEST_SENSE) - cmd->result = cmd->SCp.Status | (cmd->SCp.Message << 8); - else if (status_byte(cmd->SCp.Status) != GOOD) - cmd->result = (cmd->result & 0x00ffff) | (DID_ERROR << 16); + dsprintk(NDEBUG_QUEUES, instance, + "COMMAND COMPLETE %p target %d lun %llu\n", + cmd, scmd_id(cmd), cmd->device->lun); - if ((cmd->cmnd[0] == REQUEST_SENSE) && - hostdata->ses.cmd_len) { - scsi_eh_restore_cmnd(cmd, &hostdata->ses); - hostdata->ses.cmd_len = 0 ; - } - - if ((cmd->cmnd[0] != REQUEST_SENSE) && (status_byte(cmd->SCp.Status) == CHECK_CONDITION)) { - scsi_eh_prep_cmnd(cmd, &hostdata->ses, NULL, 0, ~0); - - dprintk(NDEBUG_AUTOSENSE, "scsi%d : performing request sense\n", instance->host_no); + hostdata->connected = NULL; - LIST(cmd, hostdata->issue_queue); - cmd->host_scribble = (unsigned char *) - hostdata->issue_queue; - hostdata->issue_queue = (struct scsi_cmnd *) cmd; - dprintk(NDEBUG_QUEUES, "scsi%d : REQUEST SENSE added to head of issue queue\n", instance->host_no); - } else { - cmd->scsi_done(cmd); + cmd->result &= ~0xffff; + cmd->result |= cmd->SCp.Status; + cmd->result |= cmd->SCp.Message << 8; + + if (cmd->cmnd[0] == REQUEST_SENSE) + complete_cmd(instance, cmd); + else { + if (cmd->SCp.Status == SAM_STAT_CHECK_CONDITION || + cmd->SCp.Status == SAM_STAT_COMMAND_TERMINATED) { + dsprintk(NDEBUG_QUEUES, instance, "autosense: adding cmd %p to tail of autosense queue\n", + cmd); + list_add_tail(&ncmd->list, + &hostdata->autosense); + } else + complete_cmd(instance, cmd); } - NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); - /* - * Restore phase bits to 0 so an interrupted selection, + /* + * Restore phase bits to 0 so an interrupted selection, * arbitration can resume. 
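A CHECK CONDITION result now parks the command on hostdata->autosense above instead of requeueing a REQUEST SENSE by hand. The draining side is not in this hunk; presumably the dequeue path prefers that list and mocks up the sense command with the standard EH helpers, roughly (hostdata->sensing and NCR5380_to_scmd() as sketched earlier):

        /* Illustrative fragment from dequeue_next_cmd() draining the autosense list */
        if (!hostdata->sensing && !list_empty(&hostdata->autosense)) {
                struct NCR5380_cmd *ncmd = list_first_entry(&hostdata->autosense,
                                                            struct NCR5380_cmd, list);
                struct scsi_cmnd *cmd = NCR5380_to_scmd(ncmd);

                list_del(&ncmd->list);
                scsi_eh_prep_cmnd(cmd, &hostdata->ses, NULL, 0, ~0);
                hostdata->sensing = cmd;        /* next selection sends REQUEST SENSE */
                return cmd;
        }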
*/ NCR5380_write(TARGET_COMMAND_REG, 0); - while ((NCR5380_read(STATUS_REG) & SR_BSY) && !hostdata->connected) - barrier(); + /* Enable reselect interrupts */ + NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); return; case MESSAGE_REJECT: /* Accept message by clearing ACK */ @@ -2229,38 +1918,33 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) { default: break; } - case DISCONNECT:{ - /* Accept message by clearing ACK */ - NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); - cmd->device->disconnect = 1; - LIST(cmd, hostdata->disconnected_queue); - cmd->host_scribble = (unsigned char *) - hostdata->disconnected_queue; - hostdata->connected = NULL; - hostdata->disconnected_queue = cmd; - dprintk(NDEBUG_QUEUES, "scsi%d : command for target %d lun %llu was moved from connected to" " the disconnected_queue\n", instance->host_no, cmd->device->id, cmd->device->lun); - /* - * Restore phase bits to 0 so an interrupted selection, - * arbitration can resume. - */ - NCR5380_write(TARGET_COMMAND_REG, 0); - - /* Enable reselect interrupts */ - NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); - /* Wait for bus free to avoid nasty timeouts - FIXME timeout !*/ - /* NCR538_poll_politely(instance, STATUS_REG, SR_BSY, 0, 30 * HZ); */ - while ((NCR5380_read(STATUS_REG) & SR_BSY) && !hostdata->connected) - barrier(); - return; - } - /* + break; + case DISCONNECT: + /* Accept message by clearing ACK */ + NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); + hostdata->connected = NULL; + list_add(&ncmd->list, &hostdata->disconnected); + dsprintk(NDEBUG_INFORMATION | NDEBUG_QUEUES, + instance, "connected command %p for target %d lun %llu moved to disconnected queue\n", + cmd, scmd_id(cmd), cmd->device->lun); + + /* + * Restore phase bits to 0 so an interrupted selection, + * arbitration can resume. + */ + NCR5380_write(TARGET_COMMAND_REG, 0); + + /* Enable reselect interrupts */ + NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); + return; + /* * The SCSI data pointer is *IMPLICITLY* saved on a disconnect - * operation, in violation of the SCSI spec so we can safely + * operation, in violation of the SCSI spec so we can safely * ignore SAVE/RESTORE pointers calls. * - * Unfortunately, some disks violate the SCSI spec and + * Unfortunately, some disks violate the SCSI spec and * don't issue the required SAVE_POINTERS message before - * disconnecting, and we have to break spec to remain + * disconnecting, and we have to break spec to remain * compatible. */ case SAVE_POINTERS: @@ -2269,31 +1953,28 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) { NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); break; case EXTENDED_MESSAGE: -/* - * Extended messages are sent in the following format : - * Byte - * 0 EXTENDED_MESSAGE == 1 - * 1 length (includes one byte for code, doesn't - * include first two bytes) - * 2 code - * 3..length+1 arguments - * - * Start the extended message buffer with the EXTENDED_MESSAGE - * byte, since spi_print_msg() wants the whole thing. - */ + /* + * Start the message buffer with the EXTENDED_MESSAGE + * byte, since spi_print_msg() wants the whole thing. 
+ */ extended_msg[0] = EXTENDED_MESSAGE; /* Accept first byte by clearing ACK */ NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); - dprintk(NDEBUG_EXTENDED, "scsi%d : receiving extended message\n", instance->host_no); + + spin_unlock_irq(&hostdata->lock); + + dsprintk(NDEBUG_EXTENDED, instance, "receiving extended message\n"); len = 2; data = extended_msg + 1; phase = PHASE_MSGIN; NCR5380_transfer_pio(instance, &phase, &len, &data); + dsprintk(NDEBUG_EXTENDED, instance, "length %d, code 0x%02x\n", + (int)extended_msg[1], + (int)extended_msg[2]); - dprintk(NDEBUG_EXTENDED, "scsi%d : length=%d, code=0x%02x\n", instance->host_no, (int) extended_msg[1], (int) extended_msg[2]); - - if (!len && extended_msg[1] <= (sizeof(extended_msg) - 1)) { + if (!len && extended_msg[1] > 0 && + extended_msg[1] <= sizeof(extended_msg) - 2) { /* Accept third byte by clearing ACK */ NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); len = extended_msg[1] - 1; @@ -2301,7 +1982,8 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) { phase = PHASE_MSGIN; NCR5380_transfer_pio(instance, &phase, &len, &data); - dprintk(NDEBUG_EXTENDED, "scsi%d : message received, residual %d\n", instance->host_no, len); + dsprintk(NDEBUG_EXTENDED, instance, "message received, residual %d\n", + len); switch (extended_msg[2]) { case EXTENDED_SDTR: @@ -2311,34 +1993,42 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) { tmp = 0; } } else if (len) { - printk("scsi%d: error receiving extended message\n", instance->host_no); + shost_printk(KERN_ERR, instance, "error receiving extended message\n"); tmp = 0; } else { - printk("scsi%d: extended message code %02x length %d is too long\n", instance->host_no, extended_msg[2], extended_msg[1]); + shost_printk(KERN_NOTICE, instance, "extended message code %02x length %d is too long\n", + extended_msg[2], extended_msg[1]); tmp = 0; } + + spin_lock_irq(&hostdata->lock); + if (!hostdata->connected) + return; + /* Fall through to reject message */ - /* - * If we get something weird that we aren't expecting, + /* + * If we get something weird that we aren't expecting, * reject it. 
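[Annotation] The stricter length check above is easier to see against the wire format the deleted comment used to spell out. Taking a typical SDTR response as a concrete example:

    /*
     *   byte 0   0x01   EXTENDED_MESSAGE
     *   byte 1   0x03   length = code byte + arguments
     *                   (the first two bytes are not counted)
     *   byte 2   0x01   EXTENDED_SDTR
     *   byte 3   0x19   transfer period factor (25 x 4 ns = 100 ns)
     *   byte 4   0x08   REQ/ACK offset
     */

The whole message occupies length + 2 bytes of extended_msg[], so length may be at most sizeof(extended_msg) - 2; the new "> 0" guard also keeps len = extended_msg[1] - 1 from going negative when a target sends a zero length.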
*/ default: if (!tmp) { - printk("scsi%d: rejecting message ", instance->host_no); + shost_printk(KERN_ERR, instance, "rejecting message "); spi_print_msg(extended_msg); printk("\n"); } else if (tmp != EXTENDED_MESSAGE) scmd_printk(KERN_INFO, cmd, - "rejecting unknown message %02x\n",tmp); + "rejecting unknown message %02x\n", + tmp); else scmd_printk(KERN_INFO, cmd, - "rejecting unknown extended message code %02x, length %d\n", extended_msg[1], extended_msg[0]); + "rejecting unknown extended message code %02x, length %d\n", + extended_msg[1], extended_msg[0]); msgout = MESSAGE_REJECT; NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_ATN); break; - } /* switch (tmp) */ + } /* switch (tmp) */ break; case PHASE_MSGOUT: len = 1; @@ -2346,10 +2036,9 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) { hostdata->last_message = msgout; NCR5380_transfer_pio(instance, &phase, &len, &data); if (msgout == ABORT) { - hostdata->busy[cmd->device->id] &= ~(1 << (cmd->device->lun & 0xFF)); hostdata->connected = NULL; cmd->result = DID_ERROR << 16; - cmd->scsi_done(cmd); + complete_cmd(instance, cmd); NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); return; } @@ -2358,17 +2047,12 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) { case PHASE_CMDOUT: len = cmd->cmd_len; data = cmd->cmnd; - /* - * XXX for performance reasons, on machines with a - * PSEUDO-DMA architecture we should probably - * use the dma transfer function. + /* + * XXX for performance reasons, on machines with a + * PSEUDO-DMA architecture we should probably + * use the dma transfer function. */ NCR5380_transfer_pio(instance, &phase, &len, &data); - if (!cmd->device->disconnect && should_disconnect(cmd->cmnd[0])) { - NCR5380_set_timer(hostdata, USLEEP_SLEEP); - dprintk(NDEBUG_USLEEP, "scsi%d : issued command, sleeping until %lu\n", instance->host_no, hostdata->time_expires); - return; - } break; case PHASE_STATIN: len = 1; @@ -2377,46 +2061,37 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) { cmd->SCp.Status = tmp; break; default: - printk("scsi%d : unknown phase\n", instance->host_no); + shost_printk(KERN_ERR, instance, "unknown phase\n"); NCR5380_dprint(NDEBUG_ANY, instance); - } /* switch(phase) */ - } /* if (tmp * SR_REQ) */ - else { - /* RvC: go to sleep if polling time expired - */ - if (!cmd->device->disconnect && time_after_eq(jiffies, poll_time)) { - NCR5380_set_timer(hostdata, USLEEP_SLEEP); - dprintk(NDEBUG_USLEEP, "scsi%d : poll timed out, sleeping until %lu\n", instance->host_no, hostdata->time_expires); - return; - } + } /* switch(phase) */ + } else { + spin_unlock_irq(&hostdata->lock); + NCR5380_poll_politely(instance, STATUS_REG, SR_REQ, SR_REQ, HZ); + spin_lock_irq(&hostdata->lock); } - } /* while (1) */ + } } /* * Function : void NCR5380_reselect (struct Scsi_Host *instance) * - * Purpose : does reselection, initializing the instance->connected - * field to point to the scsi_cmnd for which the I_T_L or I_T_L_Q - * nexus has been reestablished, - * - * Inputs : instance - this instance of the NCR5380. + * Purpose : does reselection, initializing the instance->connected + * field to point to the scsi_cmnd for which the I_T_L or I_T_L_Q + * nexus has been reestablished, * - * Locks: io_request_lock held by caller if IRQ driven + * Inputs : instance - this instance of the NCR5380. 
*/ -static void NCR5380_reselect(struct Scsi_Host *instance) { - NCR5380_local_declare(); - struct NCR5380_hostdata *hostdata = (struct NCR5380_hostdata *) - instance->hostdata; +static void NCR5380_reselect(struct Scsi_Host *instance) +{ + struct NCR5380_hostdata *hostdata = shost_priv(instance); unsigned char target_mask; unsigned char lun, phase; int len; unsigned char msg[3]; unsigned char *data; - struct scsi_cmnd *tmp = NULL, *prev; - int abort = 0; - NCR5380_setup(instance); + struct NCR5380_cmd *ncmd; + struct scsi_cmnd *tmp; /* * Disable arbitration, etc. since the host adapter obviously @@ -2424,12 +2099,12 @@ static void NCR5380_reselect(struct Scsi_Host *instance) { */ NCR5380_write(MODE_REG, MR_BASE); - hostdata->restart_select = 1; target_mask = NCR5380_read(CURRENT_SCSI_DATA_REG) & ~(hostdata->id_mask); - dprintk(NDEBUG_SELECTION, "scsi%d : reselect\n", instance->host_no); - /* + dsprintk(NDEBUG_RESELECTION, instance, "reselect\n"); + + /* * At this point, we have detected that our SCSI ID is on the bus, * SEL is true and BSY was false for at least one bus settle delay * (400 ns). @@ -2439,103 +2114,110 @@ static void NCR5380_reselect(struct Scsi_Host *instance) { */ NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_BSY); - - /* FIXME: timeout too long, must fail to workqueue */ - if(NCR5380_poll_politely(instance, STATUS_REG, SR_SEL, 0, 2*HZ)<0) - abort = 1; - + if (NCR5380_poll_politely(instance, + STATUS_REG, SR_SEL, 0, 2 * HZ) < 0) { + NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); + return; + } NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); /* * Wait for target to go into MSGIN. - * FIXME: timeout needed and fail to work queeu */ - if(NCR5380_poll_politely(instance, STATUS_REG, SR_REQ, SR_REQ, 2*HZ)) - abort = 1; + if (NCR5380_poll_politely(instance, + STATUS_REG, SR_REQ, SR_REQ, 2 * HZ) < 0) { + do_abort(instance); + return; + } len = 1; data = msg; phase = PHASE_MSGIN; NCR5380_transfer_pio(instance, &phase, &len, &data); + if (len) { + do_abort(instance); + return; + } + if (!(msg[0] & 0x80)) { - printk(KERN_ERR "scsi%d : expecting IDENTIFY message, got ", instance->host_no); + shost_printk(KERN_ERR, instance, "expecting IDENTIFY message, got "); spi_print_msg(msg); - abort = 1; - } else { - /* Accept message by clearing ACK */ - NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); - lun = (msg[0] & 0x07); + printk("\n"); + do_abort(instance); + return; + } + lun = msg[0] & 0x07; - /* - * We need to add code for SCSI-II to track which devices have - * I_T_L_Q nexuses established, and which have simple I_T_L - * nexuses so we can chose to do additional data transfer. - */ + /* + * We need to add code for SCSI-II to track which devices have + * I_T_L_Q nexuses established, and which have simple I_T_L + * nexuses so we can chose to do additional data transfer. + */ - /* - * Find the command corresponding to the I_T_L or I_T_L_Q nexus we - * just reestablished, and remove it from the disconnected queue. - */ + /* + * Find the command corresponding to the I_T_L or I_T_L_Q nexus we + * just reestablished, and remove it from the disconnected queue. 
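[Annotation] A worked example of the target_mask computation near the top of this function, assuming the conventional initiator ID 7 (id_mask == 0x80). A reselecting target 2 drives both IDs onto the data bus:

    /*
     *   CURRENT_SCSI_DATA_REG == 0x84             (1 << 7 | 1 << 2)
     *   target_mask == 0x84 & ~0x80 == 0x04 == 1 << 2
     */

which is then matched against 1 << scmd_id(cmd) while walking the disconnected list below.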
+ */ + tmp = NULL; + list_for_each_entry(ncmd, &hostdata->disconnected, list) { + struct scsi_cmnd *cmd = NCR5380_to_scmd(ncmd); - for (tmp = (struct scsi_cmnd *) hostdata->disconnected_queue, prev = NULL; tmp; prev = tmp, tmp = (struct scsi_cmnd *) tmp->host_scribble) - if ((target_mask == (1 << tmp->device->id)) && (lun == (u8)tmp->device->lun) - ) { - if (prev) { - REMOVE(prev, prev->host_scribble, tmp, tmp->host_scribble); - prev->host_scribble = tmp->host_scribble; - } else { - REMOVE(-1, hostdata->disconnected_queue, tmp, tmp->host_scribble); - hostdata->disconnected_queue = (struct scsi_cmnd *) tmp->host_scribble; - } - tmp->host_scribble = NULL; - break; - } - if (!tmp) { - printk(KERN_ERR "scsi%d : warning : target bitmask %02x lun %d not in disconnect_queue.\n", instance->host_no, target_mask, lun); - /* - * Since we have an established nexus that we can't do anything with, - * we must abort it. - */ - abort = 1; + if (target_mask == (1 << scmd_id(cmd)) && + lun == (u8)cmd->device->lun) { + list_del(&ncmd->list); + tmp = cmd; + break; } } - if (abort) { - do_abort(instance); + if (tmp) { + dsprintk(NDEBUG_RESELECTION | NDEBUG_QUEUES, instance, + "reselect: removed %p from disconnected queue\n", tmp); } else { - hostdata->connected = tmp; - dprintk(NDEBUG_RESELECTION, "scsi%d : nexus established, target = %d, lun = %llu, tag = %d\n", instance->host_no, tmp->device->id, tmp->device->lun, tmp->tag); + shost_printk(KERN_ERR, instance, "target bitmask 0x%02x lun %d not in disconnected queue.\n", + target_mask, lun); + /* + * Since we have an established nexus that we can't do anything + * with, we must abort it. + */ + do_abort(instance); + return; } + + /* Accept message by clearing ACK */ + NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); + + hostdata->connected = tmp; + dsprintk(NDEBUG_RESELECTION, instance, "nexus established, target %d, lun %llu, tag %d\n", + scmd_id(tmp), tmp->device->lun, tmp->tag); } /* * Function : void NCR5380_dma_complete (struct Scsi_Host *instance) * * Purpose : called by interrupt handler when DMA finishes or a phase - * mismatch occurs (which would finish the DMA transfer). + * mismatch occurs (which would finish the DMA transfer). * * Inputs : instance - this instance of the NCR5380. * * Returns : pointer to the scsi_cmnd structure for which the I_T_L - * nexus has been reestablished, on failure NULL is returned. + * nexus has been reestablished, on failure NULL is returned. */ #ifdef REAL_DMA static void NCR5380_dma_complete(NCR5380_instance * instance) { - NCR5380_local_declare(); - struct NCR5380_hostdata *hostdata = (struct NCR5380_hostdata *) instance->hostdata; + struct NCR5380_hostdata *hostdata = shost_priv(instance); int transferred; - NCR5380_setup(instance); /* * XXX this might not be right. * * Wait for final byte to transfer, ie wait for ACK to go false. 
* - * We should use the Last Byte Sent bit, unfortunately this is + * We should use the Last Byte Sent bit, unfortunately this is * not available on the 5380/5381 (only the various CMOS chips) * * FIXME: timeout, and need to handle long timeout/irq case @@ -2543,7 +2225,6 @@ static void NCR5380_dma_complete(NCR5380_instance * instance) { NCR5380_poll_politely(instance, BUS_AND_STATUS_REG, BASR_ACK, 0, 5*HZ); - NCR5380_write(MODE_REG, MR_BASE); NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); /* @@ -2560,190 +2241,251 @@ static void NCR5380_dma_complete(NCR5380_instance * instance) { } #endif /* def REAL_DMA */ -/* - * Function : int NCR5380_abort (struct scsi_cmnd *cmd) - * - * Purpose : abort a command - * - * Inputs : cmd - the scsi_cmnd to abort, code - code to set the - * host byte of the result field to, if zero DID_ABORTED is - * used. - * - * Returns : SUCCESS - success, FAILED on failure. - * - * XXX - there is no way to abort the command that is currently - * connected, you have to wait for it to complete. If this is - * a problem, we could implement longjmp() / setjmp(), setjmp() - * called where the loop started in NCR5380_main(). - * - * Locks: host lock taken by caller +/** + * list_find_cmd - test for presence of a command in a linked list + * @haystack: list of commands + * @needle: command to search for */ -static int NCR5380_abort(struct scsi_cmnd *cmd) +static bool list_find_cmd(struct list_head *haystack, + struct scsi_cmnd *needle) { - NCR5380_local_declare(); - struct Scsi_Host *instance = cmd->device->host; - struct NCR5380_hostdata *hostdata = (struct NCR5380_hostdata *) instance->hostdata; - struct scsi_cmnd *tmp, **prev; + struct NCR5380_cmd *ncmd; - scmd_printk(KERN_WARNING, cmd, "aborting command\n"); + list_for_each_entry(ncmd, haystack, list) + if (NCR5380_to_scmd(ncmd) == needle) + return true; + return false; +} - NCR5380_print_status(instance); +/** + * list_remove_cmd - remove a command from linked list + * @haystack: list of commands + * @needle: command to remove + */ - NCR5380_setup(instance); +static bool list_del_cmd(struct list_head *haystack, + struct scsi_cmnd *needle) +{ + if (list_find_cmd(haystack, needle)) { + struct NCR5380_cmd *ncmd = scsi_cmd_priv(needle); - dprintk(NDEBUG_ABORT, "scsi%d : abort called\n", instance->host_no); - dprintk(NDEBUG_ABORT, " basr 0x%X, sr 0x%X\n", NCR5380_read(BUS_AND_STATUS_REG), NCR5380_read(STATUS_REG)); + list_del(&ncmd->list); + return true; + } + return false; +} -#if 0 -/* - * Case 1 : If the command is the currently executing command, - * we'll set the aborted flag and return control so that - * information transfer routine can exit cleanly. +/** + * NCR5380_abort - scsi host eh_abort_handler() method + * @cmd: the command to be aborted + * + * Try to abort a given command by removing it from queues and/or sending + * the target an abort message. This may not succeed in causing a target + * to abort the command. Nonetheless, the low-level driver must forget about + * the command because the mid-layer reclaims it and it may be re-issued. + * + * The normal path taken by a command is as follows. For EH we trace this + * same path to locate and abort the command. + * + * unissued -> selecting -> [unissued -> selecting ->]... connected -> + * [disconnected -> connected ->]... + * [autosense -> connected ->] done + * + * If cmd is unissued then just remove it. + * If cmd is disconnected, try to select the target. + * If cmd is connected, try to send an abort message. 
+ * If cmd is waiting for autosense, give it a chance to complete but check + * that it isn't left connected. + * If cmd was not found at all then presumably it has already been completed, + * in which case return SUCCESS to try to avoid further EH measures. + * If the command has not completed yet, we must not fail to find it. */ - if (hostdata->connected == cmd) { - dprintk(NDEBUG_ABORT, "scsi%d : aborting connected command\n", instance->host_no); - hostdata->aborted = 1; -/* - * We should perform BSY checking, and make sure we haven't slipped - * into BUS FREE. - */ +static int NCR5380_abort(struct scsi_cmnd *cmd) +{ + struct Scsi_Host *instance = cmd->device->host; + struct NCR5380_hostdata *hostdata = shost_priv(instance); + unsigned long flags; + int result = SUCCESS; - NCR5380_write(INITIATOR_COMMAND_REG, ICR_ASSERT_ATN); -/* - * Since we can't change phases until we've completed the current - * handshake, we have to source or sink a byte of data if the current - * phase is not MSGOUT. - */ + spin_lock_irqsave(&hostdata->lock, flags); -/* - * Return control to the executing NCR drive so we can clear the - * aborted flag and get back into our main loop. - */ +#if (NDEBUG & NDEBUG_ANY) + scmd_printk(KERN_INFO, cmd, __func__); +#endif + NCR5380_dprint(NDEBUG_ANY, instance); + NCR5380_dprint_phase(NDEBUG_ANY, instance); - return SUCCESS; + if (list_del_cmd(&hostdata->unissued, cmd)) { + dsprintk(NDEBUG_ABORT, instance, + "abort: removed %p from issue queue\n", cmd); + cmd->result = DID_ABORT << 16; + cmd->scsi_done(cmd); /* No tag or busy flag to worry about */ } -#endif -/* - * Case 2 : If the command hasn't been issued yet, we simply remove it - * from the issue queue. - */ - - dprintk(NDEBUG_ABORT, "scsi%d : abort going into loop.\n", instance->host_no); - for (prev = (struct scsi_cmnd **) &(hostdata->issue_queue), tmp = (struct scsi_cmnd *) hostdata->issue_queue; tmp; prev = (struct scsi_cmnd **) &(tmp->host_scribble), tmp = (struct scsi_cmnd *) tmp->host_scribble) - if (cmd == tmp) { - REMOVE(5, *prev, tmp, tmp->host_scribble); - (*prev) = (struct scsi_cmnd *) tmp->host_scribble; - tmp->host_scribble = NULL; - tmp->result = DID_ABORT << 16; - dprintk(NDEBUG_ABORT, "scsi%d : abort removed command from issue queue.\n", instance->host_no); - tmp->scsi_done(tmp); - return SUCCESS; + if (hostdata->selecting == cmd) { + dsprintk(NDEBUG_ABORT, instance, + "abort: cmd %p == selecting\n", cmd); + hostdata->selecting = NULL; + cmd->result = DID_ABORT << 16; + complete_cmd(instance, cmd); + goto out; + } + + if (list_del_cmd(&hostdata->disconnected, cmd)) { + dsprintk(NDEBUG_ABORT, instance, + "abort: removed %p from disconnected list\n", cmd); + cmd->result = DID_ERROR << 16; + if (!hostdata->connected) + NCR5380_select(instance, cmd); + if (hostdata->connected != cmd) { + complete_cmd(instance, cmd); + result = FAILED; + goto out; + } + } + + if (hostdata->connected == cmd) { + dsprintk(NDEBUG_ABORT, instance, "abort: cmd %p is connected\n", cmd); + hostdata->connected = NULL; + if (do_abort(instance)) { + set_host_byte(cmd, DID_ERROR); + complete_cmd(instance, cmd); + result = FAILED; + goto out; } -#if (NDEBUG & NDEBUG_ABORT) - /* KLL */ - else if (prev == tmp) - printk(KERN_ERR "scsi%d : LOOP\n", instance->host_no); + set_host_byte(cmd, DID_ABORT); +#ifdef REAL_DMA + hostdata->dma_len = 0; #endif + if (cmd->cmnd[0] == REQUEST_SENSE) + complete_cmd(instance, cmd); + else { + struct NCR5380_cmd *ncmd = scsi_cmd_priv(cmd); -/* - * Case 3 : If any commands are connected, we're going to fail 
the abort - * and let the high level SCSI driver retry at a later time or - * issue a reset. - * - * Timeouts, and therefore aborted commands, will be highly unlikely - * and handling them cleanly in this situation would make the common - * case of noresets less efficient, and would pollute our code. So, - * we fail. - */ + /* Perform autosense for this command */ + list_add(&ncmd->list, &hostdata->autosense); + } + } - if (hostdata->connected) { - dprintk(NDEBUG_ABORT, "scsi%d : abort failed, command connected.\n", instance->host_no); - return FAILED; + if (list_find_cmd(&hostdata->autosense, cmd)) { + dsprintk(NDEBUG_ABORT, instance, + "abort: found %p on sense queue\n", cmd); + spin_unlock_irqrestore(&hostdata->lock, flags); + queue_work(hostdata->work_q, &hostdata->main_task); + msleep(1000); + spin_lock_irqsave(&hostdata->lock, flags); + if (list_del_cmd(&hostdata->autosense, cmd)) { + dsprintk(NDEBUG_ABORT, instance, + "abort: removed %p from sense queue\n", cmd); + set_host_byte(cmd, DID_ABORT); + complete_cmd(instance, cmd); + goto out; + } } -/* - * Case 4: If the command is currently disconnected from the bus, and - * there are no connected commands, we reconnect the I_T_L or - * I_T_L_Q nexus associated with it, go into message out, and send - * an abort message. - * - * This case is especially ugly. In order to reestablish the nexus, we - * need to call NCR5380_select(). The easiest way to implement this - * function was to abort if the bus was busy, and let the interrupt - * handler triggered on the SEL for reselect take care of lost arbitrations - * where necessary, meaning interrupts need to be enabled. - * - * When interrupts are enabled, the queues may change - so we - * can't remove it from the disconnected queue before selecting it - * because that could cause a failure in hashing the nexus if that - * device reselected. - * - * Since the queues may change, we can't use the pointers from when we - * first locate it. - * - * So, we must first locate the command, and if NCR5380_select() - * succeeds, then issue the abort, relocate the command and remove - * it from the disconnected queue. 
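[Annotation] The hand-rolled host_scribble walks being deleted in these cases are what list_find_cmd() and list_del_cmd() (added above) now express in one step each, since deletion doubles as the membership test. For instance, the unissued case reduces to:

    if (list_del_cmd(&hostdata->unissued, cmd)) {
        /* never selected: no tag or busy bit to clean up */
        cmd->result = DID_ABORT << 16;
        cmd->scsi_done(cmd);
    }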
- */ - for (tmp = (struct scsi_cmnd *) hostdata->disconnected_queue; tmp; tmp = (struct scsi_cmnd *) tmp->host_scribble) - if (cmd == tmp) { - dprintk(NDEBUG_ABORT, "scsi%d : aborting disconnected command.\n", instance->host_no); + if (hostdata->connected == cmd) { + dsprintk(NDEBUG_ABORT, instance, "abort: cmd %p is connected\n", cmd); + hostdata->connected = NULL; + if (do_abort(instance)) { + set_host_byte(cmd, DID_ERROR); + complete_cmd(instance, cmd); + result = FAILED; + goto out; + } + set_host_byte(cmd, DID_ABORT); +#ifdef REAL_DMA + hostdata->dma_len = 0; +#endif + complete_cmd(instance, cmd); + } - if (NCR5380_select(instance, cmd)) - return FAILED; - dprintk(NDEBUG_ABORT, "scsi%d : nexus reestablished.\n", instance->host_no); +out: + if (result == FAILED) + dsprintk(NDEBUG_ABORT, instance, "abort: failed to abort %p\n", cmd); + else + dsprintk(NDEBUG_ABORT, instance, "abort: successfully aborted %p\n", cmd); - do_abort(instance); + queue_work(hostdata->work_q, &hostdata->main_task); + spin_unlock_irqrestore(&hostdata->lock, flags); - for (prev = (struct scsi_cmnd **) &(hostdata->disconnected_queue), tmp = (struct scsi_cmnd *) hostdata->disconnected_queue; tmp; prev = (struct scsi_cmnd **) &(tmp->host_scribble), tmp = (struct scsi_cmnd *) tmp->host_scribble) - if (cmd == tmp) { - REMOVE(5, *prev, tmp, tmp->host_scribble); - *prev = (struct scsi_cmnd *) tmp->host_scribble; - tmp->host_scribble = NULL; - tmp->result = DID_ABORT << 16; - tmp->scsi_done(tmp); - return SUCCESS; - } - } -/* - * Case 5 : If we reached this point, the command was not found in any of - * the queues. - * - * We probably reached this point because of an unlikely race condition - * between the command completing successfully and the abortion code, - * so we won't panic, but we will notify the user in case something really - * broke. - */ - printk(KERN_WARNING "scsi%d : warning : SCSI command probably completed successfully\n" - " before abortion\n", instance->host_no); - return FAILED; + return result; } -/* - * Function : int NCR5380_bus_reset (struct scsi_cmnd *cmd) - * - * Purpose : reset the SCSI bus. - * - * Returns : SUCCESS +/** + * NCR5380_bus_reset - reset the SCSI bus + * @cmd: SCSI command undergoing EH * - * Locks: host lock taken by caller + * Returns SUCCESS */ static int NCR5380_bus_reset(struct scsi_cmnd *cmd) { struct Scsi_Host *instance = cmd->device->host; + struct NCR5380_hostdata *hostdata = shost_priv(instance); + int i; + unsigned long flags; + struct NCR5380_cmd *ncmd; - NCR5380_local_declare(); - NCR5380_setup(instance); - NCR5380_print_status(instance); + spin_lock_irqsave(&hostdata->lock, flags); + +#if (NDEBUG & NDEBUG_ANY) + scmd_printk(KERN_INFO, cmd, __func__); +#endif + NCR5380_dprint(NDEBUG_ANY, instance); + NCR5380_dprint_phase(NDEBUG_ANY, instance); - spin_lock_irq(instance->host_lock); do_reset(instance); - spin_unlock_irq(instance->host_lock); + + /* reset NCR registers */ + NCR5380_write(MODE_REG, MR_BASE); + NCR5380_write(TARGET_COMMAND_REG, 0); + NCR5380_write(SELECT_ENABLE_REG, 0); + + /* After the reset, there are no more connected or disconnected commands + * and no busy units; so clear the low-level status here to avoid + * conflicts when the mid-level code tries to wake up the affected + * commands! 
+ */ + + hostdata->selecting = NULL; + + list_for_each_entry(ncmd, &hostdata->disconnected, list) { + struct scsi_cmnd *cmd = NCR5380_to_scmd(ncmd); + + set_host_byte(cmd, DID_RESET); + cmd->scsi_done(cmd); + } + + list_for_each_entry(ncmd, &hostdata->autosense, list) { + struct scsi_cmnd *cmd = NCR5380_to_scmd(ncmd); + + set_host_byte(cmd, DID_RESET); + cmd->scsi_done(cmd); + } + + if (hostdata->connected) { + set_host_byte(hostdata->connected, DID_RESET); + complete_cmd(instance, hostdata->connected); + hostdata->connected = NULL; + } + + if (hostdata->sensing) { + set_host_byte(hostdata->connected, DID_RESET); + complete_cmd(instance, hostdata->sensing); + hostdata->sensing = NULL; + } + + for (i = 0; i < 8; ++i) + hostdata->busy[i] = 0; +#ifdef REAL_DMA + hostdata->dma_len = 0; +#endif + + queue_work(hostdata->work_q, &hostdata->main_task); + spin_unlock_irqrestore(&hostdata->lock, flags); return SUCCESS; } diff --git a/drivers/scsi/NCR5380.h b/drivers/scsi/NCR5380.h index 162112dd1bf8..a79288682a74 100644 --- a/drivers/scsi/NCR5380.h +++ b/drivers/scsi/NCR5380.h @@ -22,8 +22,13 @@ #ifndef NCR5380_H #define NCR5380_H +#include <linux/delay.h> #include <linux/interrupt.h> +#include <linux/list.h> +#include <linux/workqueue.h> +#include <scsi/scsi_dbg.h> #include <scsi/scsi_eh.h> +#include <scsi/scsi_transport_spi.h> #define NDEBUG_ARBITRATION 0x1 #define NDEBUG_AUTOSENSE 0x2 @@ -158,8 +163,7 @@ /* Write any value to this register to start an ini mode DMA receive */ #define START_DMA_INITIATOR_RECEIVE_REG 7 /* wo */ -#define C400_CONTROL_STATUS_REG NCR53C400_register_offset-8 /* rw */ - +/* NCR 53C400(A) Control Status Register bits: */ #define CSR_RESET 0x80 /* wo Resets 53c400 */ #define CSR_53C80_REG 0x80 /* ro 5380 registers busy */ #define CSR_TRANS_DIR 0x40 /* rw Data transfer direction */ @@ -176,16 +180,6 @@ #define CSR_BASE CSR_53C80_INTR #endif -/* Number of 128-byte blocks to be transferred */ -#define C400_BLOCK_COUNTER_REG NCR53C400_register_offset-7 /* rw */ - -/* Resume transfer after disconnect */ -#define C400_RESUME_TRANSFER_REG NCR53C400_register_offset-6 /* wo */ - -/* Access to host buffer stack */ -#define C400_HOST_BUFFER NCR53C400_register_offset-4 /* rw */ - - /* Note : PHASE_* macros are based on the values of the STATUS register */ #define PHASE_MASK (SR_MSG | SR_CD | SR_IO) @@ -205,16 +199,6 @@ #define PHASE_SR_TO_TCR(phase) ((phase) >> 2) -/* - * The internal should_disconnect() function returns these based on the - * expected length of a disconnect if a device supports disconnect/ - * reconnect. - */ - -#define DISCONNECT_NONE 0 -#define DISCONNECT_TIME_TO_DATA 1 -#define DISCONNECT_LONG 2 - /* * "Special" value for the (unsigned char) command tag, to indicate * I_T_L nexus instead of I_T_L_Q. 
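[Annotation] The queue membership that makes the EH rework possible comes from the per-command struct introduced in the next hunks: the host templates pass .cmd_size = NCR5380_CMD_SIZE, so the midlayer allocates a struct NCR5380_cmd directly behind every struct scsi_cmnd, and both conversions are plain pointer arithmetic. A throwaway sketch of the round trip:

    static inline void ncr5380_priv_roundtrip(struct scsi_cmnd *cmd)
    {
        struct NCR5380_cmd *ncmd = scsi_cmd_priv(cmd);  /* just past cmd */

        /* NCR5380_to_scmd() steps back over the scsi_cmnd: */
        BUG_ON(NCR5380_to_scmd(ncmd) != cmd);
    }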
@@ -236,15 +220,11 @@ #define NO_IRQ 0 #endif -#define FLAG_HAS_LAST_BYTE_SENT 1 /* NCR53c81 or better */ -#define FLAG_CHECK_LAST_BYTE_SENT 2 /* Only test once */ -#define FLAG_NCR53C400 4 /* NCR53c400 */ +#define FLAG_NO_DMA_FIXUP 1 /* No DMA errata workarounds */ #define FLAG_NO_PSEUDO_DMA 8 /* Inhibit DMA */ -#define FLAG_DTC3181E 16 /* DTC3181E */ #define FLAG_LATE_DMA_SETUP 32 /* Setup NCR before DMA H/W */ #define FLAG_TAGGED_QUEUING 64 /* as X3T9.2 spelled it */ - -#ifndef ASM +#define FLAG_TOSHIBA_DELAY 128 /* Allow for borken CD-ROMs */ #ifdef SUPPORT_TAGS struct tag_alloc { @@ -258,33 +238,24 @@ struct NCR5380_hostdata { NCR5380_implementation_fields; /* implementation specific */ struct Scsi_Host *host; /* Host backpointer */ unsigned char id_mask, id_higher_mask; /* 1 << id, all bits greater */ - unsigned char targets_present; /* targets we have connected - to, so we can call a select - failure a retryable condition */ - volatile unsigned char busy[8]; /* index = target, bit = lun */ + unsigned char busy[8]; /* index = target, bit = lun */ #if defined(REAL_DMA) || defined(REAL_DMA_POLL) - volatile int dma_len; /* requested length of DMA */ + int dma_len; /* requested length of DMA */ #endif - volatile unsigned char last_message; /* last message OUT */ - volatile struct scsi_cmnd *connected; /* currently connected command */ - volatile struct scsi_cmnd *issue_queue; /* waiting to be issued */ - volatile struct scsi_cmnd *disconnected_queue; /* waiting for reconnect */ - volatile int restart_select; /* we have disconnected, - used to restart - NCR5380_select() */ - volatile unsigned aborted:1; /* flag, says aborted */ + unsigned char last_message; /* last message OUT */ + struct scsi_cmnd *connected; /* currently connected cmnd */ + struct scsi_cmnd *selecting; /* cmnd to be connected */ + struct list_head unissued; /* waiting to be issued */ + struct list_head autosense; /* priority issue queue */ + struct list_head disconnected; /* waiting for reconnect */ + spinlock_t lock; /* protects this struct */ int flags; - unsigned long time_expires; /* in jiffies, set prior to sleeping */ - int select_time; /* timer in select for target response */ - volatile struct scsi_cmnd *selecting; - struct delayed_work coroutine; /* our co-routine */ struct scsi_eh_save ses; + struct scsi_cmnd *sensing; char info[256]; int read_overruns; /* number of bytes to cut from a * transfer to handle chip overruns */ - int retain_dma_intr; struct work_struct main_task; - volatile int main_running; #ifdef SUPPORT_TAGS struct tag_alloc TagAlloc[8][8]; /* 8 targets and 8 LUNs */ #endif @@ -292,10 +263,23 @@ struct NCR5380_hostdata { unsigned spin_max_r; unsigned spin_max_w; #endif + struct workqueue_struct *work_q; + unsigned long accesses_per_ms; /* chip register accesses per ms */ }; #ifdef __KERNEL__ +struct NCR5380_cmd { + struct list_head list; +}; + +#define NCR5380_CMD_SIZE (sizeof(struct NCR5380_cmd)) + +static inline struct scsi_cmnd *NCR5380_to_scmd(struct NCR5380_cmd *ncmd_ptr) +{ + return ((struct scsi_cmnd *)ncmd_ptr) - 1; +} + #ifndef NDEBUG #define NDEBUG (0) #endif @@ -304,6 +288,11 @@ struct NCR5380_hostdata { do { if ((NDEBUG) & (flg)) \ printk(KERN_DEBUG fmt, ## __VA_ARGS__); } while (0) +#define dsprintk(flg, host, fmt, ...) 
\ + do { if ((NDEBUG) & (flg)) \ + shost_printk(KERN_DEBUG, host, fmt, ## __VA_ARGS__); \ + } while (0) + #if NDEBUG #define NCR5380_dprint(flg, arg) \ do { if ((NDEBUG) & (flg)) NCR5380_print(arg); } while (0) @@ -320,6 +309,7 @@ static void NCR5380_print(struct Scsi_Host *instance); static int NCR5380_probe_irq(struct Scsi_Host *instance, int possible); #endif static int NCR5380_init(struct Scsi_Host *instance, int flags); +static int NCR5380_maybe_reset_bus(struct Scsi_Host *); static void NCR5380_exit(struct Scsi_Host *instance); static void NCR5380_information_transfer(struct Scsi_Host *instance); #ifndef DONT_USE_INTR @@ -328,7 +318,7 @@ static irqreturn_t NCR5380_intr(int irq, void *dev_id); static void NCR5380_main(struct work_struct *work); static const char *NCR5380_info(struct Scsi_Host *instance); static void NCR5380_reselect(struct Scsi_Host *instance); -static int NCR5380_select(struct Scsi_Host *instance, struct scsi_cmnd *cmd); +static struct scsi_cmnd *NCR5380_select(struct Scsi_Host *, struct scsi_cmnd *); #if defined(PSEUDO_DMA) || defined(REAL_DMA) || defined(REAL_DMA_POLL) static int NCR5380_transfer_dma(struct Scsi_Host *instance, unsigned char *phase, int *count, unsigned char **data); #endif @@ -443,5 +433,4 @@ static __inline__ int NCR5380_pc_dma_residual(struct Scsi_Host *instance) #endif /* defined(i386) || defined(__alpha__) */ #endif /* defined(REAL_DMA) */ #endif /* __KERNEL__ */ -#endif /* ndef ASM */ #endif /* NCR5380_H */ diff --git a/drivers/scsi/arm/cumana_1.c b/drivers/scsi/arm/cumana_1.c index d28d6c0f18c0..221f18c5df93 100644 --- a/drivers/scsi/arm/cumana_1.c +++ b/drivers/scsi/arm/cumana_1.c @@ -4,9 +4,7 @@ * Copyright 1995-2002, Russell King */ #include <linux/module.h> -#include <linux/signal.h> #include <linux/ioport.h> -#include <linux/delay.h> #include <linux/blkdev.h> #include <linux/init.h> @@ -15,15 +13,14 @@ #include <scsi/scsi_host.h> -#include <scsi/scsicam.h> - #define PSEUDO_DMA #define priv(host) ((struct NCR5380_hostdata *)(host)->hostdata) -#define NCR5380_local_declare() struct Scsi_Host *_instance -#define NCR5380_setup(instance) _instance = instance -#define NCR5380_read(reg) cumanascsi_read(_instance, reg) -#define NCR5380_write(reg, value) cumanascsi_write(_instance, reg, value) +#define NCR5380_read(reg) cumanascsi_read(instance, reg) +#define NCR5380_write(reg, value) cumanascsi_write(instance, reg, value) + +#define NCR5380_dma_xfer_len(instance, cmd, phase) (cmd->transfersize) + #define NCR5380_intr cumanascsi_intr #define NCR5380_queue_command cumanascsi_queue_command #define NCR5380_info cumanascsi_info @@ -211,6 +208,8 @@ static struct scsi_host_template cumanascsi_template = { .cmd_per_lun = 2, .use_clustering = DISABLE_CLUSTERING, .proc_name = "CumanaSCSI-1", + .cmd_size = NCR5380_CMD_SIZE, + .max_sectors = 128, }; static int cumanascsi1_probe(struct expansion_card *ec, @@ -240,23 +239,21 @@ static int cumanascsi1_probe(struct expansion_card *ec, host->irq = ec->irq; - NCR5380_init(host, 0); + ret = NCR5380_init(host, 0); + if (ret) + goto out_unmap; + + NCR5380_maybe_reset_bus(host); priv(host)->ctrl = 0; writeb(0, priv(host)->base + CTRL); - host->n_io_port = 255; - if (!(request_region(host->io_port, host->n_io_port, "CumanaSCSI-1"))) { - ret = -EBUSY; - goto out_unmap; - } - ret = request_irq(host->irq, cumanascsi_intr, 0, "CumanaSCSI-1", host); if (ret) { printk("scsi%d: IRQ%d not free: %d\n", host->host_no, host->irq, ret); - goto out_unmap; + goto out_exit; } ret = scsi_add_host(host, &ec->dev); @@ -268,6 +265,8 @@ 
static int cumanascsi1_probe(struct expansion_card *ec, out_free_irq: free_irq(host->irq, host); + out_exit: + NCR5380_exit(host); out_unmap: iounmap(priv(host)->base); iounmap(priv(host)->dma); diff --git a/drivers/scsi/arm/oak.c b/drivers/scsi/arm/oak.c index 7c6fa1479c9c..1fab1d1896b1 100644 --- a/drivers/scsi/arm/oak.c +++ b/drivers/scsi/arm/oak.c @@ -5,9 +5,7 @@ */ #include <linux/module.h> -#include <linux/signal.h> #include <linux/ioport.h> -#include <linux/delay.h> #include <linux/blkdev.h> #include <linux/init.h> @@ -20,14 +18,16 @@ #define DONT_USE_INTR #define priv(host) ((struct NCR5380_hostdata *)(host)->hostdata) -#define NCR5380_local_declare() void __iomem *_base -#define NCR5380_setup(host) _base = priv(host)->base -#define NCR5380_read(reg) readb(_base + ((reg) << 2)) -#define NCR5380_write(reg, value) writeb(value, _base + ((reg) << 2)) +#define NCR5380_read(reg) \ + readb(priv(instance)->base + ((reg) << 2)) +#define NCR5380_write(reg, value) \ + writeb(value, priv(instance)->base + ((reg) << 2)) + +#define NCR5380_dma_xfer_len(instance, cmd, phase) (cmd->transfersize) + #define NCR5380_queue_command oakscsi_queue_command #define NCR5380_info oakscsi_info -#define NCR5380_show_info oakscsi_show_info #define NCR5380_implementation_fields \ void __iomem *base @@ -103,7 +103,6 @@ printk("reading %p len %d\n", addr, len); static struct scsi_host_template oakscsi_template = { .module = THIS_MODULE, - .show_info = oakscsi_show_info, .name = "Oak 16-bit SCSI", .info = oakscsi_info, .queuecommand = oakscsi_queue_command, @@ -115,6 +114,8 @@ static struct scsi_host_template oakscsi_template = { .cmd_per_lun = 2, .use_clustering = DISABLE_CLUSTERING, .proc_name = "oakscsi", + .cmd_size = NCR5380_CMD_SIZE, + .max_sectors = 128, }; static int oakscsi_probe(struct expansion_card *ec, const struct ecard_id *id) @@ -142,15 +143,21 @@ static int oakscsi_probe(struct expansion_card *ec, const struct ecard_id *id) host->irq = NO_IRQ; host->n_io_port = 255; - NCR5380_init(host, 0); + ret = NCR5380_init(host, 0); + if (ret) + goto out_unmap; + + NCR5380_maybe_reset_bus(host); ret = scsi_add_host(host, &ec->dev); if (ret) - goto out_unmap; + goto out_exit; scsi_scan_host(host); goto out; + out_exit: + NCR5380_exit(host); out_unmap: iounmap(priv(host)->base); unreg: diff --git a/drivers/scsi/atari_NCR5380.c b/drivers/scsi/atari_NCR5380.c index db87ece6edb2..e65478651ca9 100644 --- a/drivers/scsi/atari_NCR5380.c +++ b/drivers/scsi/atari_NCR5380.c @@ -1,15 +1,15 @@ /* * NCR 5380 generic driver routines. These should make it *trivial* - * to implement 5380 SCSI drivers under Linux with a non-trantor - * architecture. + * to implement 5380 SCSI drivers under Linux with a non-trantor + * architecture. * - * Note that these routines also work with NR53c400 family chips. + * Note that these routines also work with NR53c400 family chips. * * Copyright 1993, Drew Eckhardt - * Visionary Computing - * (Unix and Linux consulting and custom programming) - * drew@colorado.edu - * +1 (303) 666-5836 + * Visionary Computing + * (Unix and Linux consulting and custom programming) + * drew@colorado.edu + * +1 (303) 666-5836 * * For more information, please consult * @@ -24,84 +24,10 @@ * 1+ (800) 334-5454 */ -/* - * ++roman: To port the 5380 driver to the Atari, I had to do some changes in - * this file, too: - * - * - Some of the debug statements were incorrect (undefined variables and the - * like). I fixed that. - * - * - In information_transfer(), I think a #ifdef was wrong. 
Looking at the - * possible DMA transfer size should also happen for REAL_DMA. I added this - * in the #if statement. - * - * - When using real DMA, information_transfer() should return in a DATAOUT - * phase after starting the DMA. It has nothing more to do. - * - * - The interrupt service routine should run main after end of DMA, too (not - * only after RESELECTION interrupts). Additionally, it should _not_ test - * for more interrupts after running main, since a DMA process may have - * been started and interrupts are turned on now. The new int could happen - * inside the execution of NCR5380_intr(), leading to recursive - * calls. - * - * - I've added a function merge_contiguous_buffers() that tries to - * merge scatter-gather buffers that are located at contiguous - * physical addresses and can be processed with the same DMA setup. - * Since most scatter-gather operations work on a page (4K) of - * 4 buffers (1K), in more than 90% of all cases three interrupts and - * DMA setup actions are saved. - * - * - I've deleted all the stuff for AUTOPROBE_IRQ, REAL_DMA_POLL, PSEUDO_DMA - * and USLEEP, because these were messing up readability and will never be - * needed for Atari SCSI. - * - * - I've revised the NCR5380_main() calling scheme (relax the 'main_running' - * stuff), and 'main' is executed in a bottom half if awoken by an - * interrupt. - * - * - The code was quite cluttered up by "#if (NDEBUG & NDEBUG_*) printk..." - * constructs. In my eyes, this made the source rather unreadable, so I - * finally replaced that by the *_PRINTK() macros. - * - */ - -/* - * Further development / testing that should be done : - * 1. Test linked command handling code after Eric is ready with - * the high level code. - */ +/* Ported to Atari by Roman Hodek and others. */ /* Adapted for the sun3 by Sam Creasey. */ -#include <scsi/scsi_dbg.h> -#include <scsi/scsi_transport_spi.h> - -#if (NDEBUG & NDEBUG_LISTS) -#define LIST(x, y) \ - do { \ - printk("LINE:%d Adding %p to %p\n", \ - __LINE__, (void*)(x), (void*)(y)); \ - if ((x) == (y)) \ - udelay(5); \ - } while (0) -#define REMOVE(w, x, y, z) \ - do { \ - printk("LINE:%d Removing: %p->%p %p->%p \n", \ - __LINE__, (void*)(w), (void*)(x), \ - (void*)(y), (void*)(z)); \ - if ((x) == (y)) \ - udelay(5); \ - } while (0) -#else -#define LIST(x,y) -#define REMOVE(w,x,y,z) -#endif - -#ifndef notyet -#undef LINKED -#endif - /* * Design * @@ -126,17 +52,7 @@ * piece of hardware that requires you to sit in a loop polling for * the REQ signal as long as you are connected. Some devices are * brain dead (ie, many TEXEL CD ROM drives) and won't disconnect - * while doing long seek operations. - * - * The workaround for this is to keep track of devices that have - * disconnected. If the device hasn't disconnected, for commands that - * should disconnect, we do something like - * - * while (!REQ is asserted) { sleep for N usecs; poll for M usecs } - * - * Some tweaking of N and M needs to be done. An algorithm based - * on "time to data" would give the best results as long as short time - * to datas (ie, on the same track) were considered, however these + * while doing long seek operations. [...] These * broken devices are the exception rather than the rule and I'd rather * spend my time optimizing for the normal case. * @@ -177,12 +93,10 @@ * * These macros control options : * AUTOSENSE - if defined, REQUEST SENSE will be performed automatically - * for commands that return with a CHECK CONDITION status. + * for commands that return with a CHECK CONDITION status. 
* * DIFFERENTIAL - if defined, NCR53c81 chips will use external differential - * transceivers. - * - * LINKED - if defined, linked commands are supported. + * transceivers. * * REAL_DMA - if defined, REAL DMA is used during the data transfer phases. * @@ -195,17 +109,17 @@ * NCR5380_write(register, value) - write to the specific register * * NCR5380_implementation_fields - additional fields needed for this - * specific implementation of the NCR5380 + * specific implementation of the NCR5380 * * Either real DMA *or* pseudo DMA may be implemented * REAL functions : * NCR5380_REAL_DMA should be defined if real DMA is to be used. * Note that the DMA setup functions should return the number of bytes - * that they were able to program the controller for. + * that they were able to program the controller for. * * Also note that generic i386/PC versions of these macros are - * available as NCR5380_i386_dma_write_setup, - * NCR5380_i386_dma_read_setup, and NCR5380_i386_dma_residual. + * available as NCR5380_i386_dma_write_setup, + * NCR5380_i386_dma_read_setup, and NCR5380_i386_dma_residual. * * NCR5380_dma_write_setup(instance, src, count) - initialize * NCR5380_dma_read_setup(instance, dst, count) - initialize @@ -221,18 +135,8 @@ * possible) function may be used. */ -/* Macros ease life... :-) */ -#define SETUP_HOSTDATA(in) \ - struct NCR5380_hostdata *hostdata = \ - (struct NCR5380_hostdata *)(in)->hostdata -#define HOSTDATA(in) ((struct NCR5380_hostdata *)(in)->hostdata) - -#define NEXT(cmd) ((struct scsi_cmnd *)(cmd)->host_scribble) -#define SET_NEXT(cmd,next) ((cmd)->host_scribble = (void *)(next)) -#define NEXTADDR(cmd) ((struct scsi_cmnd **)&(cmd)->host_scribble) - -#define HOSTNO instance->host_no -#define H_NO(cmd) (cmd)->device->host->host_no +static int do_abort(struct Scsi_Host *); +static void do_reset(struct Scsi_Host *); #ifdef SUPPORT_TAGS @@ -251,9 +155,7 @@ * cannot know it in advance :-( We just see a QUEUE_FULL status being * returned. So, in this case, the driver internal queue size assumption is * reduced to the number of active tags if QUEUE_FULL is returned by the - * target. The command is returned to the mid-level, but with status changed - * to BUSY, since --as I've seen-- the mid-level can't handle QUEUE_FULL - * correctly. + * target. * * We're also not allowed running tagged commands as long as an untagged * command is active. 
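[Annotation] A sketch of the QUEUE_FULL adjustment the comment above describes; the handler itself is outside this excerpt, so the shape below is assumed, using only the tag_alloc fields visible in this file:

    static void example_handle_queue_full(struct NCR5380_hostdata *hostdata,
                                          struct scsi_cmnd *cmd)
    {
        u8 lun = cmd->device->lun;
        struct tag_alloc *ta = &hostdata->TagAlloc[scmd_id(cmd)][lun];

        /* clamp the per-nexus depth to what the target proved to accept */
        if (ta->queue_size > ta->nr_allocated)
            ta->queue_size = ta->nr_allocated;
    }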
And REQUEST SENSE commands after a contingent allegiance @@ -304,7 +206,8 @@ static void __init init_tags(struct NCR5380_hostdata *hostdata) static int is_lun_busy(struct scsi_cmnd *cmd, int should_be_tagged) { u8 lun = cmd->device->lun; - SETUP_HOSTDATA(cmd->device->host); + struct Scsi_Host *instance = cmd->device->host; + struct NCR5380_hostdata *hostdata = shost_priv(instance); if (hostdata->busy[cmd->device->id] & (1 << lun)) return 1; @@ -314,8 +217,8 @@ static int is_lun_busy(struct scsi_cmnd *cmd, int should_be_tagged) return 0; if (hostdata->TagAlloc[scmd_id(cmd)][lun].nr_allocated >= hostdata->TagAlloc[scmd_id(cmd)][lun].queue_size) { - dprintk(NDEBUG_TAGS, "scsi%d: target %d lun %d: no free tags\n", - H_NO(cmd), cmd->device->id, lun); + dsprintk(NDEBUG_TAGS, instance, "target %d lun %d: no free tags\n", + scmd_id(cmd), lun); return 1; } return 0; @@ -330,7 +233,8 @@ static int is_lun_busy(struct scsi_cmnd *cmd, int should_be_tagged) static void cmd_get_tag(struct scsi_cmnd *cmd, int should_be_tagged) { u8 lun = cmd->device->lun; - SETUP_HOSTDATA(cmd->device->host); + struct Scsi_Host *instance = cmd->device->host; + struct NCR5380_hostdata *hostdata = shost_priv(instance); /* If we or the target don't support tagged queuing, allocate the LUN for * an untagged command. @@ -340,18 +244,16 @@ static void cmd_get_tag(struct scsi_cmnd *cmd, int should_be_tagged) !cmd->device->tagged_supported) { cmd->tag = TAG_NONE; hostdata->busy[cmd->device->id] |= (1 << lun); - dprintk(NDEBUG_TAGS, "scsi%d: target %d lun %d now allocated by untagged " - "command\n", H_NO(cmd), cmd->device->id, lun); + dsprintk(NDEBUG_TAGS, instance, "target %d lun %d now allocated by untagged command\n", + scmd_id(cmd), lun); } else { struct tag_alloc *ta = &hostdata->TagAlloc[scmd_id(cmd)][lun]; cmd->tag = find_first_zero_bit(ta->allocated, MAX_TAGS); set_bit(cmd->tag, ta->allocated); ta->nr_allocated++; - dprintk(NDEBUG_TAGS, "scsi%d: using tag %d for target %d lun %d " - "(now %d tags in use)\n", - H_NO(cmd), cmd->tag, cmd->device->id, - lun, ta->nr_allocated); + dsprintk(NDEBUG_TAGS, instance, "using tag %d for target %d lun %d (%d tags allocated)\n", + cmd->tag, scmd_id(cmd), lun, ta->nr_allocated); } } @@ -363,21 +265,22 @@ static void cmd_get_tag(struct scsi_cmnd *cmd, int should_be_tagged) static void cmd_free_tag(struct scsi_cmnd *cmd) { u8 lun = cmd->device->lun; - SETUP_HOSTDATA(cmd->device->host); + struct Scsi_Host *instance = cmd->device->host; + struct NCR5380_hostdata *hostdata = shost_priv(instance); if (cmd->tag == TAG_NONE) { hostdata->busy[cmd->device->id] &= ~(1 << lun); - dprintk(NDEBUG_TAGS, "scsi%d: target %d lun %d untagged cmd finished\n", - H_NO(cmd), cmd->device->id, lun); + dsprintk(NDEBUG_TAGS, instance, "target %d lun %d untagged cmd freed\n", + scmd_id(cmd), lun); } else if (cmd->tag >= MAX_TAGS) { - printk(KERN_NOTICE "scsi%d: trying to free bad tag %d!\n", - H_NO(cmd), cmd->tag); + shost_printk(KERN_NOTICE, instance, + "trying to free bad tag %d!\n", cmd->tag); } else { struct tag_alloc *ta = &hostdata->TagAlloc[scmd_id(cmd)][lun]; clear_bit(cmd->tag, ta->allocated); ta->nr_allocated--; - dprintk(NDEBUG_TAGS, "scsi%d: freed tag %d for target %d lun %d\n", - H_NO(cmd), cmd->tag, cmd->device->id, lun); + dsprintk(NDEBUG_TAGS, instance, "freed tag %d for target %d lun %d\n", + cmd->tag, scmd_id(cmd), lun); } } @@ -401,17 +304,15 @@ static void free_all_tags(struct NCR5380_hostdata *hostdata) #endif /* SUPPORT_TAGS */ - -/* - * Function: void merge_contiguous_buffers( struct scsi_cmnd 
*cmd ) - * - * Purpose: Try to merge several scatter-gather requests into one DMA - * transfer. This is possible if the scatter buffers lie on - * physical contiguous addresses. - * - * Parameters: struct scsi_cmnd *cmd - * The command to work on. The first scatter buffer's data are - * assumed to be already transferred into ptr/this_residual. +/** + * merge_contiguous_buffers - coalesce scatter-gather list entries + * @cmd: command requesting IO + * + * Try to merge several scatter-gather buffers into one DMA transfer. + * This is possible if the scatter buffers lie on physically + * contiguous addresses. The first scatter-gather buffer's data are + * assumed to be already transferred into cmd->SCp.this_residual. + * Every buffer merged avoids an interrupt and a DMA setup operation. */ static void merge_contiguous_buffers(struct scsi_cmnd *cmd) @@ -463,9 +364,7 @@ static inline void initialize_SCp(struct scsi_cmnd *cmd) cmd->SCp.buffers_residual = scsi_sg_count(cmd) - 1; cmd->SCp.ptr = sg_virt(cmd->SCp.buffer); cmd->SCp.this_residual = cmd->SCp.buffer->length; - /* ++roman: Try to merge some scatter-buffers if they are at - * contiguous physical addresses. - */ + merge_contiguous_buffers(cmd); } else { cmd->SCp.buffer = NULL; @@ -473,31 +372,110 @@ static inline void initialize_SCp(struct scsi_cmnd *cmd) cmd->SCp.ptr = NULL; cmd->SCp.this_residual = 0; } + + cmd->SCp.Status = 0; + cmd->SCp.Message = 0; +} + +/** + * NCR5380_poll_politely2 - wait for two chip register values + * @instance: controller to poll + * @reg1: 5380 register to poll + * @bit1: Bitmask to check + * @val1: Expected value + * @reg2: Second 5380 register to poll + * @bit2: Second bitmask to check + * @val2: Second expected value + * @wait: Time-out in jiffies + * + * Polls the chip in a reasonably efficient manner waiting for an + * event to occur. After a short quick poll we begin to yield the CPU + * (if possible). In irq contexts the time-out is arbitrarily limited. + * Callers may hold locks as long as they are held in irq mode. + * + * Returns 0 if either or both event(s) occurred otherwise -ETIMEDOUT. 
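[Annotation] A typical call site for the helper documented above, matching the ones elsewhere in this patch: wait up to a second for the target to assert REQ, spinning only briefly before yielding the CPU:

    if (NCR5380_poll_politely(instance, STATUS_REG, SR_REQ, SR_REQ, HZ) < 0)
        do_abort(instance);     /* target wedged; give up on this nexus */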
+ */ + +static int NCR5380_poll_politely2(struct Scsi_Host *instance, + int reg1, int bit1, int val1, + int reg2, int bit2, int val2, int wait) +{ + struct NCR5380_hostdata *hostdata = shost_priv(instance); + unsigned long deadline = jiffies + wait; + unsigned long n; + + /* Busy-wait for up to 10 ms */ + n = min(10000U, jiffies_to_usecs(wait)); + n *= hostdata->accesses_per_ms; + n /= 2000; + do { + if ((NCR5380_read(reg1) & bit1) == val1) + return 0; + if ((NCR5380_read(reg2) & bit2) == val2) + return 0; + cpu_relax(); + } while (n--); + + if (irqs_disabled() || in_interrupt()) + return -ETIMEDOUT; + + /* Repeatedly sleep for 1 ms until deadline */ + while (time_is_after_jiffies(deadline)) { + schedule_timeout_uninterruptible(1); + if ((NCR5380_read(reg1) & bit1) == val1) + return 0; + if ((NCR5380_read(reg2) & bit2) == val2) + return 0; + } + + return -ETIMEDOUT; } -#include <linux/delay.h> +static inline int NCR5380_poll_politely(struct Scsi_Host *instance, + int reg, int bit, int val, int wait) +{ + return NCR5380_poll_politely2(instance, reg, bit, val, + reg, bit, val, wait); +} #if NDEBUG static struct { unsigned char mask; const char *name; } signals[] = { - { SR_DBP, "PARITY"}, { SR_RST, "RST" }, { SR_BSY, "BSY" }, - { SR_REQ, "REQ" }, { SR_MSG, "MSG" }, { SR_CD, "CD" }, { SR_IO, "IO" }, - { SR_SEL, "SEL" }, {0, NULL} -}, basrs[] = { - {BASR_ATN, "ATN"}, {BASR_ACK, "ACK"}, {0, NULL} -}, icrs[] = { - {ICR_ASSERT_RST, "ASSERT RST"},{ICR_ASSERT_ACK, "ASSERT ACK"}, - {ICR_ASSERT_BSY, "ASSERT BSY"}, {ICR_ASSERT_SEL, "ASSERT SEL"}, - {ICR_ASSERT_ATN, "ASSERT ATN"}, {ICR_ASSERT_DATA, "ASSERT DATA"}, + {SR_DBP, "PARITY"}, + {SR_RST, "RST"}, + {SR_BSY, "BSY"}, + {SR_REQ, "REQ"}, + {SR_MSG, "MSG"}, + {SR_CD, "CD"}, + {SR_IO, "IO"}, + {SR_SEL, "SEL"}, + {0, NULL} +}, +basrs[] = { + {BASR_ATN, "ATN"}, + {BASR_ACK, "ACK"}, {0, NULL} -}, mrs[] = { - {MR_BLOCK_DMA_MODE, "MODE BLOCK DMA"}, {MR_TARGET, "MODE TARGET"}, - {MR_ENABLE_PAR_CHECK, "MODE PARITY CHECK"}, {MR_ENABLE_PAR_INTR, - "MODE PARITY INTR"}, {MR_ENABLE_EOP_INTR,"MODE EOP INTR"}, +}, +icrs[] = { + {ICR_ASSERT_RST, "ASSERT RST"}, + {ICR_ASSERT_ACK, "ASSERT ACK"}, + {ICR_ASSERT_BSY, "ASSERT BSY"}, + {ICR_ASSERT_SEL, "ASSERT SEL"}, + {ICR_ASSERT_ATN, "ASSERT ATN"}, + {ICR_ASSERT_DATA, "ASSERT DATA"}, + {0, NULL} +}, +mrs[] = { + {MR_BLOCK_DMA_MODE, "MODE BLOCK DMA"}, + {MR_TARGET, "MODE TARGET"}, + {MR_ENABLE_PAR_CHECK, "MODE PARITY CHECK"}, + {MR_ENABLE_PAR_INTR, "MODE PARITY INTR"}, + {MR_ENABLE_EOP_INTR, "MODE EOP INTR"}, {MR_MONITOR_BSY, "MODE MONITOR BSY"}, - {MR_DMA_MODE, "MODE DMA"}, {MR_ARBITRATE, "MODE ARBITRATION"}, + {MR_DMA_MODE, "MODE DMA"}, + {MR_ARBITRATE, "MODE ARBITRATION"}, {0, NULL} }; @@ -511,15 +489,13 @@ static struct { static void NCR5380_print(struct Scsi_Host *instance) { unsigned char status, data, basr, mr, icr, i; - unsigned long flags; - local_irq_save(flags); data = NCR5380_read(CURRENT_SCSI_DATA_REG); status = NCR5380_read(STATUS_REG); mr = NCR5380_read(MODE_REG); icr = NCR5380_read(INITIATOR_COMMAND_REG); basr = NCR5380_read(BUS_AND_STATUS_REG); - local_irq_restore(flags); + printk("STATUS_REG: %02x ", status); for (i = 0; signals[i].mask; ++i) if (status & signals[i].mask) @@ -543,8 +519,12 @@ static struct { unsigned char value; const char *name; } phases[] = { - {PHASE_DATAOUT, "DATAOUT"}, {PHASE_DATAIN, "DATAIN"}, {PHASE_CMDOUT, "CMDOUT"}, - {PHASE_STATIN, "STATIN"}, {PHASE_MSGOUT, "MSGOUT"}, {PHASE_MSGIN, "MSGIN"}, + {PHASE_DATAOUT, "DATAOUT"}, + {PHASE_DATAIN, "DATAIN"}, + {PHASE_CMDOUT, "CMDOUT"}, 
+ {PHASE_STATIN, "STATIN"}, + {PHASE_MSGOUT, "MSGOUT"}, + {PHASE_MSGIN, "MSGIN"}, {PHASE_UNKNOWN, "UNKNOWN"} }; @@ -553,8 +533,6 @@ static struct { * @instance: adapter to dump * * Print the current SCSI phase for debugging purposes - * - * Locks: none */ static void NCR5380_print_phase(struct Scsi_Host *instance) @@ -564,54 +542,21 @@ static void NCR5380_print_phase(struct Scsi_Host *instance) status = NCR5380_read(STATUS_REG); if (!(status & SR_REQ)) - printk(KERN_DEBUG "scsi%d: REQ not asserted, phase unknown.\n", HOSTNO); + shost_printk(KERN_DEBUG, instance, "REQ not asserted, phase unknown.\n"); else { for (i = 0; (phases[i].value != PHASE_UNKNOWN) && (phases[i].value != (status & PHASE_MASK)); ++i) ; - printk(KERN_DEBUG "scsi%d: phase %s\n", HOSTNO, phases[i].name); + shost_printk(KERN_DEBUG, instance, "phase %s\n", phases[i].name); } } - #endif -/* - * ++roman: New scheme of calling NCR5380_main() - * - * If we're not in an interrupt, we can call our main directly, it cannot be - * already running. Else, we queue it on a task queue, if not 'main_running' - * tells us that a lower level is already executing it. This way, - * 'main_running' needs not be protected in a special way. - * - * queue_main() is a utility function for putting our main onto the task - * queue, if main_running is false. It should be called only from a - * interrupt or bottom half. - */ - -#include <linux/gfp.h> -#include <linux/workqueue.h> -#include <linux/interrupt.h> - -static inline void queue_main(struct NCR5380_hostdata *hostdata) -{ - if (!hostdata->main_running) { - /* If in interrupt and NCR5380_main() not already running, - queue it on the 'immediate' task queue, to be processed - immediately after the current interrupt processing has - finished. */ - schedule_work(&hostdata->main_task); - } - /* else: nothing to do: the running NCR5380_main() will pick up - any newly queued command. */ -} - /** * NCR58380_info - report driver and host information * @instance: relevant scsi host instance * * For use as the host template info() handler. - * - * Locks: none */ static const char *NCR5380_info(struct Scsi_Host *instance) @@ -630,13 +575,14 @@ static void prepare_info(struct Scsi_Host *instance) "base 0x%lx, irq %d, " "can_queue %d, cmd_per_lun %d, " "sg_tablesize %d, this_id %d, " - "flags { %s}, " + "flags { %s%s}, " "options { %s} ", instance->hostt->name, instance->io_port, instance->n_io_port, instance->base, instance->irq, instance->can_queue, instance->cmd_per_lun, instance->sg_tablesize, instance->this_id, hostdata->flags & FLAG_TAGGED_QUEUING ? "TAGGED_QUEUING " : "", + hostdata->flags & FLAG_TOSHIBA_DELAY ? "TOSHIBA_DELAY " : "", #ifdef DIFFERENTIAL "DIFFERENTIAL " #endif @@ -653,102 +599,6 @@ static void prepare_info(struct Scsi_Host *instance) } /** - * NCR5380_print_status - dump controller info - * @instance: controller to dump - * - * Print commands in the various queues, called from NCR5380_abort - * to aid debugging. 
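[Annotation] These queue dumps go away without a direct replacement; debugging output now goes through the dsprintk() macro added to NCR5380.h above, which is compiled out unless the matching NDEBUG bit is set and tags each line with the host. Typical usage from earlier in this patch:

    dsprintk(NDEBUG_QUEUES, instance,
             "COMMAND COMPLETE %p target %d lun %llu\n",
             cmd, scmd_id(cmd), cmd->device->lun);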
- */ - -static void lprint_Scsi_Cmnd(struct scsi_cmnd *cmd) -{ - int i, s; - unsigned char *command; - printk("scsi%d: destination target %d, lun %llu\n", - H_NO(cmd), cmd->device->id, cmd->device->lun); - printk(KERN_CONT " command = "); - command = cmd->cmnd; - printk(KERN_CONT "%2d (0x%02x)", command[0], command[0]); - for (i = 1, s = COMMAND_SIZE(command[0]); i < s; ++i) - printk(KERN_CONT " %02x", command[i]); - printk("\n"); -} - -static void NCR5380_print_status(struct Scsi_Host *instance) -{ - struct NCR5380_hostdata *hostdata; - struct scsi_cmnd *ptr; - unsigned long flags; - - NCR5380_dprint(NDEBUG_ANY, instance); - NCR5380_dprint_phase(NDEBUG_ANY, instance); - - hostdata = (struct NCR5380_hostdata *)instance->hostdata; - - local_irq_save(flags); - printk("NCR5380: coroutine is%s running.\n", - hostdata->main_running ? "" : "n't"); - if (!hostdata->connected) - printk("scsi%d: no currently connected command\n", HOSTNO); - else - lprint_Scsi_Cmnd((struct scsi_cmnd *) hostdata->connected); - printk("scsi%d: issue_queue\n", HOSTNO); - for (ptr = (struct scsi_cmnd *)hostdata->issue_queue; ptr; ptr = NEXT(ptr)) - lprint_Scsi_Cmnd(ptr); - - printk("scsi%d: disconnected_queue\n", HOSTNO); - for (ptr = (struct scsi_cmnd *) hostdata->disconnected_queue; ptr; - ptr = NEXT(ptr)) - lprint_Scsi_Cmnd(ptr); - - local_irq_restore(flags); - printk("\n"); -} - -static void show_Scsi_Cmnd(struct scsi_cmnd *cmd, struct seq_file *m) -{ - int i, s; - unsigned char *command; - seq_printf(m, "scsi%d: destination target %d, lun %llu\n", - H_NO(cmd), cmd->device->id, cmd->device->lun); - seq_puts(m, " command = "); - command = cmd->cmnd; - seq_printf(m, "%2d (0x%02x)", command[0], command[0]); - for (i = 1, s = COMMAND_SIZE(command[0]); i < s; ++i) - seq_printf(m, " %02x", command[i]); - seq_putc(m, '\n'); -} - -static int __maybe_unused NCR5380_show_info(struct seq_file *m, - struct Scsi_Host *instance) -{ - struct NCR5380_hostdata *hostdata; - struct scsi_cmnd *ptr; - unsigned long flags; - - hostdata = (struct NCR5380_hostdata *)instance->hostdata; - - local_irq_save(flags); - seq_printf(m, "NCR5380: coroutine is%s running.\n", - hostdata->main_running ? 
"" : "n't"); - if (!hostdata->connected) - seq_printf(m, "scsi%d: no currently connected command\n", HOSTNO); - else - show_Scsi_Cmnd((struct scsi_cmnd *) hostdata->connected, m); - seq_printf(m, "scsi%d: issue_queue\n", HOSTNO); - for (ptr = (struct scsi_cmnd *)hostdata->issue_queue; ptr; ptr = NEXT(ptr)) - show_Scsi_Cmnd(ptr, m); - - seq_printf(m, "scsi%d: disconnected_queue\n", HOSTNO); - for (ptr = (struct scsi_cmnd *) hostdata->disconnected_queue; ptr; - ptr = NEXT(ptr)) - show_Scsi_Cmnd(ptr, m); - - local_irq_restore(flags); - return 0; -} - -/** * NCR5380_init - initialise an NCR5380 * @instance: adapter to configure * @flags: control flags @@ -764,11 +614,11 @@ static int __maybe_unused NCR5380_show_info(struct seq_file *m, static int __init NCR5380_init(struct Scsi_Host *instance, int flags) { + struct NCR5380_hostdata *hostdata = shost_priv(instance); int i; - SETUP_HOSTDATA(instance); + unsigned long deadline; hostdata->host = instance; - hostdata->aborted = 0; hostdata->id_mask = 1 << instance->this_id; hostdata->id_higher_mask = 0; for (i = hostdata->id_mask; i <= 0x80; i <<= 1) @@ -782,13 +632,21 @@ static int __init NCR5380_init(struct Scsi_Host *instance, int flags) #if defined (REAL_DMA) hostdata->dma_len = 0; #endif - hostdata->targets_present = 0; + spin_lock_init(&hostdata->lock); hostdata->connected = NULL; - hostdata->issue_queue = NULL; - hostdata->disconnected_queue = NULL; + hostdata->sensing = NULL; + INIT_LIST_HEAD(&hostdata->autosense); + INIT_LIST_HEAD(&hostdata->unissued); + INIT_LIST_HEAD(&hostdata->disconnected); + hostdata->flags = flags; INIT_WORK(&hostdata->main_task, NCR5380_main); + hostdata->work_q = alloc_workqueue("ncr5380_%d", + WQ_UNBOUND | WQ_MEM_RECLAIM, + 1, instance->host_no); + if (!hostdata->work_q) + return -ENOMEM; prepare_info(instance); @@ -797,6 +655,72 @@ static int __init NCR5380_init(struct Scsi_Host *instance, int flags) NCR5380_write(TARGET_COMMAND_REG, 0); NCR5380_write(SELECT_ENABLE_REG, 0); + /* Calibrate register polling loop */ + i = 0; + deadline = jiffies + 1; + do { + cpu_relax(); + } while (time_is_after_jiffies(deadline)); + deadline += msecs_to_jiffies(256); + do { + NCR5380_read(STATUS_REG); + ++i; + cpu_relax(); + } while (time_is_after_jiffies(deadline)); + hostdata->accesses_per_ms = i / 256; + + return 0; +} + +/** + * NCR5380_maybe_reset_bus - Detect and correct bus wedge problems. + * @instance: adapter to check + * + * If the system crashed, it may have crashed with a connected target and + * the SCSI bus busy. Check for BUS FREE phase. If not, try to abort the + * currently established nexus, which we know nothing about. Failing that + * do a bus reset. + * + * Note that a bus reset will cause the chip to assert IRQ. + * + * Returns 0 if successful, otherwise -ENXIO. + */ + +static int NCR5380_maybe_reset_bus(struct Scsi_Host *instance) +{ + struct NCR5380_hostdata *hostdata = shost_priv(instance); + int pass; + + for (pass = 1; (NCR5380_read(STATUS_REG) & SR_BSY) && pass <= 6; ++pass) { + switch (pass) { + case 1: + case 3: + case 5: + shost_printk(KERN_ERR, instance, "SCSI bus busy, waiting up to five seconds\n"); + NCR5380_poll_politely(instance, + STATUS_REG, SR_BSY, 0, 5 * HZ); + break; + case 2: + shost_printk(KERN_ERR, instance, "bus busy, attempting abort\n"); + do_abort(instance); + break; + case 4: + shost_printk(KERN_ERR, instance, "bus busy, attempting reset\n"); + do_reset(instance); + /* Wait after a reset; the SCSI standard calls for + * 250ms, we wait 500ms to be on the safe side. 
+ * But some Toshiba CD-ROMs need ten times that. + */ + if (hostdata->flags & FLAG_TOSHIBA_DELAY) + msleep(2500); + else + msleep(500); + break; + case 6: + shost_printk(KERN_ERR, instance, "bus locked solid\n"); + return -ENXIO; + } + } return 0; } @@ -812,6 +736,38 @@ static void NCR5380_exit(struct Scsi_Host *instance) struct NCR5380_hostdata *hostdata = shost_priv(instance); cancel_work_sync(&hostdata->main_task); + destroy_workqueue(hostdata->work_q); +} + +/** + * complete_cmd - finish processing a command and return it to the SCSI ML + * @instance: the host instance + * @cmd: command to complete + */ + +static void complete_cmd(struct Scsi_Host *instance, + struct scsi_cmnd *cmd) +{ + struct NCR5380_hostdata *hostdata = shost_priv(instance); + + dsprintk(NDEBUG_QUEUES, instance, "complete_cmd: cmd %p\n", cmd); + + if (hostdata->sensing == cmd) { + /* Autosense processing ends here */ + if ((cmd->result & 0xff) != SAM_STAT_GOOD) { + scsi_eh_restore_cmnd(cmd, &hostdata->ses); + set_host_byte(cmd, DID_ERROR); + } else + scsi_eh_restore_cmnd(cmd, &hostdata->ses); + hostdata->sensing = NULL; + } + +#ifdef SUPPORT_TAGS + cmd_free_tag(cmd); +#else + hostdata->busy[scmd_id(cmd)] &= ~(1 << cmd->device->lun); +#endif + cmd->scsi_done(cmd); } /** @@ -819,7 +775,7 @@ static void NCR5380_exit(struct Scsi_Host *instance) * @instance: the relevant SCSI adapter * @cmd: SCSI command * - * cmd is added to the per instance issue_queue, with minor + * cmd is added to the per-instance issue queue, with minor * twiddling done to the host specific fields of cmd. If the * main coroutine is not running, it is restarted. */ @@ -828,44 +784,23 @@ static int NCR5380_queue_command(struct Scsi_Host *instance, struct scsi_cmnd *cmd) { struct NCR5380_hostdata *hostdata = shost_priv(instance); - struct scsi_cmnd *tmp; + struct NCR5380_cmd *ncmd = scsi_cmd_priv(cmd); unsigned long flags; #if (NDEBUG & NDEBUG_NO_WRITE) switch (cmd->cmnd[0]) { case WRITE_6: case WRITE_10: - printk(KERN_NOTICE "scsi%d: WRITE attempted with NO_WRITE debugging flag set\n", - H_NO(cmd)); + shost_printk(KERN_DEBUG, instance, "WRITE attempted with NDEBUG_NO_WRITE set\n"); cmd->result = (DID_ERROR << 16); cmd->scsi_done(cmd); return 0; } #endif /* (NDEBUG & NDEBUG_NO_WRITE) */ - /* - * We use the host_scribble field as a pointer to the next command - * in a queue - */ - - SET_NEXT(cmd, NULL); cmd->result = 0; /* - * Insert the cmd into the issue queue. Note that REQUEST SENSE - * commands are added to the head of the queue since any command will - * clear the contingent allegiance condition that exists and the - * sense data is only guaranteed to be valid while the condition exists. - */ - - /* ++guenther: now that the issue queue is being set up, we can lock ST-DMA. - * Otherwise a running NCR5380_main may steal the lock. - * Lock before actually inserting due to fairness reasons explained in - * atari_scsi.c. If we insert first, then it's impossible for this driver - * to release the lock. - * Stop timer for this command while waiting for the lock, or timeouts - * may happen (and they really do), and it's no good if the command doesn't - * appear in any of the queues. * ++roman: Just disabling the NCR interrupt isn't sufficient here, * because also a timer int can trigger an abort or reset, which would * alter queues and touch the lock. 
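The queueing hunks around here replace the driver's old hand-rolled singly linked issue queue (threaded through host_scribble via NEXT()/SET_NEXT() and guarded by local_irq_save()) with list_head queues protected by a per-host spinlock, and they keep the old rule that REQUEST SENSE goes to the head of the queue while everything else goes to the tail. A minimal sketch of that pattern follows; it uses simplified stand-in types (toy_hostdata, toy_cmd and the toy_* functions are illustrative only, not the driver's real structures):

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/types.h>

/* Illustrative stand-ins; the real driver uses struct NCR5380_hostdata
 * and struct NCR5380_cmd with many more fields.
 */
struct toy_hostdata {
	spinlock_t lock;		/* protects the unissued list */
	struct list_head unissued;	/* commands awaiting selection */
};

struct toy_cmd {
	struct list_head list;
	bool is_request_sense;
};

static void toy_init(struct toy_hostdata *hd)
{
	spin_lock_init(&hd->lock);
	INIT_LIST_HEAD(&hd->unissued);
}

static void toy_queue_command(struct toy_hostdata *hd, struct toy_cmd *cmd)
{
	unsigned long flags;

	/* One IRQ-safe lock replaces the old local_irq_save() regions */
	spin_lock_irqsave(&hd->lock, flags);
	if (cmd->is_request_sense)
		/* Head of queue: sense data is only valid while the
		 * contingent allegiance condition persists.
		 */
		list_add(&cmd->list, &hd->unissued);
	else
		list_add_tail(&cmd->list, &hd->unissued);
	spin_unlock_irqrestore(&hd->lock, flags);
}

Consumers (the counterpart of dequeue_next_cmd() in the patch) walk the list under the same lock, which is what lets the interrupt handler and the main coroutine share the queues safely.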
@@ -873,7 +808,7 @@ static int NCR5380_queue_command(struct Scsi_Host *instance, if (!NCR5380_acquire_dma_irq(instance)) return SCSI_MLQUEUE_HOST_BUSY; - local_irq_save(flags); + spin_lock_irqsave(&hostdata->lock, flags); /* * Insert the cmd into the issue queue. Note that REQUEST SENSE @@ -882,33 +817,18 @@ static int NCR5380_queue_command(struct Scsi_Host *instance, * sense data is only guaranteed to be valid while the condition exists. */ - if (!(hostdata->issue_queue) || (cmd->cmnd[0] == REQUEST_SENSE)) { - LIST(cmd, hostdata->issue_queue); - SET_NEXT(cmd, hostdata->issue_queue); - hostdata->issue_queue = cmd; - } else { - for (tmp = (struct scsi_cmnd *)hostdata->issue_queue; - NEXT(tmp); tmp = NEXT(tmp)) - ; - LIST(cmd, tmp); - SET_NEXT(tmp, cmd); - } - local_irq_restore(flags); + if (cmd->cmnd[0] == REQUEST_SENSE) + list_add(&ncmd->list, &hostdata->unissued); + else + list_add_tail(&ncmd->list, &hostdata->unissued); - dprintk(NDEBUG_QUEUES, "scsi%d: command added to %s of queue\n", H_NO(cmd), - (cmd->cmnd[0] == REQUEST_SENSE) ? "head" : "tail"); + spin_unlock_irqrestore(&hostdata->lock, flags); - /* If queue_command() is called from an interrupt (real one or bottom - * half), we let queue_main() do the job of taking care about main. If it - * is already running, this is a no-op, else main will be queued. - * - * If we're not in an interrupt, we can call NCR5380_main() - * unconditionally, because it cannot be already running. - */ - if (in_interrupt() || irqs_disabled()) - queue_main(hostdata); - else - NCR5380_main(&hostdata->main_task); + dsprintk(NDEBUG_QUEUES, instance, "command %p added to %s of queue\n", + cmd, (cmd->cmnd[0] == REQUEST_SENSE) ? "head" : "tail"); + + /* Kick off command processing */ + queue_work(hostdata->work_q, &hostdata->main_task); return 0; } @@ -917,22 +837,85 @@ static inline void maybe_release_dma_irq(struct Scsi_Host *instance) struct NCR5380_hostdata *hostdata = shost_priv(instance); /* Caller does the locking needed to set & test these data atomically */ - if (!hostdata->disconnected_queue && - !hostdata->issue_queue && + if (list_empty(&hostdata->disconnected) && + list_empty(&hostdata->unissued) && + list_empty(&hostdata->autosense) && !hostdata->connected && - !hostdata->retain_dma_intr) + !hostdata->selecting) NCR5380_release_dma_irq(instance); } /** + * dequeue_next_cmd - dequeue a command for processing + * @instance: the scsi host instance + * + * Priority is given to commands on the autosense queue. These commands + * need autosense because of a CHECK CONDITION result. + * + * Returns a command pointer if a command is found for a target that is + * not already busy. Otherwise returns NULL. 
+ */ + +static struct scsi_cmnd *dequeue_next_cmd(struct Scsi_Host *instance) +{ + struct NCR5380_hostdata *hostdata = shost_priv(instance); + struct NCR5380_cmd *ncmd; + struct scsi_cmnd *cmd; + + if (list_empty(&hostdata->autosense)) { + list_for_each_entry(ncmd, &hostdata->unissued, list) { + cmd = NCR5380_to_scmd(ncmd); + dsprintk(NDEBUG_QUEUES, instance, "dequeue: cmd=%p target=%d busy=0x%02x lun=%llu\n", + cmd, scmd_id(cmd), hostdata->busy[scmd_id(cmd)], cmd->device->lun); + + if ( +#ifdef SUPPORT_TAGS + !is_lun_busy(cmd, 1) +#else + !(hostdata->busy[scmd_id(cmd)] & (1 << cmd->device->lun)) +#endif + ) { + list_del(&ncmd->list); + dsprintk(NDEBUG_QUEUES, instance, + "dequeue: removed %p from issue queue\n", cmd); + return cmd; + } + } + } else { + /* Autosense processing begins here */ + ncmd = list_first_entry(&hostdata->autosense, + struct NCR5380_cmd, list); + list_del(&ncmd->list); + cmd = NCR5380_to_scmd(ncmd); + dsprintk(NDEBUG_QUEUES, instance, + "dequeue: removed %p from autosense queue\n", cmd); + scsi_eh_prep_cmnd(cmd, &hostdata->ses, NULL, 0, ~0); + hostdata->sensing = cmd; + return cmd; + } + return NULL; +} + +static void requeue_cmd(struct Scsi_Host *instance, struct scsi_cmnd *cmd) +{ + struct NCR5380_hostdata *hostdata = shost_priv(instance); + struct NCR5380_cmd *ncmd = scsi_cmd_priv(cmd); + + if (hostdata->sensing) { + scsi_eh_restore_cmnd(cmd, &hostdata->ses); + list_add(&ncmd->list, &hostdata->autosense); + hostdata->sensing = NULL; + } else + list_add(&ncmd->list, &hostdata->unissued); +} + +/** * NCR5380_main - NCR state machines * * NCR5380_main is a coroutine that runs as long as more work can * be done on the NCR5380 host adapters in a system. Both * NCR5380_queue_command() and NCR5380_intr() will try to start it * in case it is not running. - * - * Locks: called as its own thread with no locks held. */ static void NCR5380_main(struct work_struct *work) @@ -940,154 +923,69 @@ static void NCR5380_main(struct work_struct *work) struct NCR5380_hostdata *hostdata = container_of(work, struct NCR5380_hostdata, main_task); struct Scsi_Host *instance = hostdata->host; - struct scsi_cmnd *tmp, *prev; + struct scsi_cmnd *cmd; int done; - unsigned long flags; /* - * We run (with interrupts disabled) until we're sure that none of - * the host adapters have anything that can be done, at which point - * we set main_running to 0 and exit. - * - * Interrupts are enabled before doing various other internal - * instructions, after we've decided that we need to run through - * the loop again. - * - * this should prevent any race conditions. - * * ++roman: Just disabling the NCR interrupt isn't sufficient here, * because also a timer int can trigger an abort or reset, which can * alter queues and touch the Falcon lock. */ - /* Tell int handlers main() is now already executing. Note that - no races are possible here. If an int comes in before - 'main_running' is set here, and queues/executes main via the - task queue, it doesn't do any harm, just this instance of main - won't find any work left to do. */ - if (hostdata->main_running) - return; - hostdata->main_running = 1; - - local_save_flags(flags); do { - local_irq_disable(); /* Freeze request queues */ done = 1; - if (!hostdata->connected) { - dprintk(NDEBUG_MAIN, "scsi%d: not connected\n", HOSTNO); - /* - * Search through the issue_queue for a command destined - * for a target that's not busy. 
- */ -#if (NDEBUG & NDEBUG_LISTS) - for (tmp = (struct scsi_cmnd *) hostdata->issue_queue, prev = NULL; - tmp && (tmp != prev); prev = tmp, tmp = NEXT(tmp)) - ; - /*printk("%p ", tmp);*/ - if ((tmp == prev) && tmp) - printk(" LOOP\n"); - /* else printk("\n"); */ -#endif - for (tmp = (struct scsi_cmnd *) hostdata->issue_queue, - prev = NULL; tmp; prev = tmp, tmp = NEXT(tmp)) { - u8 lun = tmp->device->lun; - - dprintk(NDEBUG_LISTS, - "MAIN tmp=%p target=%d busy=%d lun=%d\n", - tmp, scmd_id(tmp), hostdata->busy[scmd_id(tmp)], - lun); - /* When we find one, remove it from the issue queue. */ - /* ++guenther: possible race with Falcon locking */ - if ( -#ifdef SUPPORT_TAGS - !is_lun_busy( tmp, tmp->cmnd[0] != REQUEST_SENSE) -#else - !(hostdata->busy[tmp->device->id] & (1 << lun)) -#endif - ) { - /* ++guenther: just to be sure, this must be atomic */ - local_irq_disable(); - if (prev) { - REMOVE(prev, NEXT(prev), tmp, NEXT(tmp)); - SET_NEXT(prev, NEXT(tmp)); - } else { - REMOVE(-1, hostdata->issue_queue, tmp, NEXT(tmp)); - hostdata->issue_queue = NEXT(tmp); - } - SET_NEXT(tmp, NULL); - hostdata->retain_dma_intr++; + spin_lock_irq(&hostdata->lock); + while (!hostdata->connected && + (cmd = dequeue_next_cmd(instance))) { - /* reenable interrupts after finding one */ - local_irq_restore(flags); + dsprintk(NDEBUG_MAIN, instance, "main: dequeued %p\n", cmd); - /* - * Attempt to establish an I_T_L nexus here. - * On success, instance->hostdata->connected is set. - * On failure, we must add the command back to the - * issue queue so we can keep trying. - */ - dprintk(NDEBUG_MAIN, "scsi%d: main(): command for target %d " - "lun %d removed from issue_queue\n", - HOSTNO, tmp->device->id, lun); - /* - * REQUEST SENSE commands are issued without tagged - * queueing, even on SCSI-II devices because the - * contingent allegiance condition exists for the - * entire unit. - */ - /* ++roman: ...and the standard also requires that - * REQUEST SENSE command are untagged. - */ + /* + * Attempt to establish an I_T_L nexus here. + * On success, instance->hostdata->connected is set. + * On failure, we must add the command back to the + * issue queue so we can keep trying. + */ + /* + * REQUEST SENSE commands are issued without tagged + * queueing, even on SCSI-II devices because the + * contingent allegiance condition exists for the + * entire unit. + */ + /* ++roman: ...and the standard also requires that + * REQUEST SENSE commands are untagged. + */ #ifdef SUPPORT_TAGS - cmd_get_tag(tmp, tmp->cmnd[0] != REQUEST_SENSE); + cmd_get_tag(cmd, cmd->cmnd[0] != REQUEST_SENSE); #endif - if (!NCR5380_select(instance, tmp)) { - local_irq_disable(); - hostdata->retain_dma_intr--; - /* release if target did not respond!
*/ - maybe_release_dma_irq(instance); - local_irq_restore(flags); - break; - } else { - local_irq_disable(); - LIST(tmp, hostdata->issue_queue); - SET_NEXT(tmp, hostdata->issue_queue); - hostdata->issue_queue = tmp; + cmd = NCR5380_select(instance, cmd); + if (!cmd) { + dsprintk(NDEBUG_MAIN, instance, "main: select complete\n"); + maybe_release_dma_irq(instance); + } else { + dsprintk(NDEBUG_MAIN | NDEBUG_QUEUES, instance, + "main: select failed, returning %p to queue\n", cmd); + requeue_cmd(instance, cmd); #ifdef SUPPORT_TAGS - cmd_free_tag(tmp); + cmd_free_tag(cmd); #endif - hostdata->retain_dma_intr--; - local_irq_restore(flags); - dprintk(NDEBUG_MAIN, "scsi%d: main(): select() failed, " - "returned to issue_queue\n", HOSTNO); - if (hostdata->connected) - break; - } - } /* if target/lun/target queue is not busy */ - } /* for issue_queue */ - } /* if (!hostdata->connected) */ - + } + } if (hostdata->connected #ifdef REAL_DMA && !hostdata->dma_len #endif ) { - local_irq_restore(flags); - dprintk(NDEBUG_MAIN, "scsi%d: main: performing information transfer\n", - HOSTNO); + dsprintk(NDEBUG_MAIN, instance, "main: performing information transfer\n"); NCR5380_information_transfer(instance); - dprintk(NDEBUG_MAIN, "scsi%d: main: done set false\n", HOSTNO); done = 0; } + spin_unlock_irq(&hostdata->lock); + if (!done) + cond_resched(); } while (!done); - - /* Better allow ints _after_ 'main_running' has been cleared, else - an interrupt could believe we'll pick up the work it left for - us, but we won't see it anymore here... */ - hostdata->main_running = 0; - local_irq_restore(flags); } @@ -1096,27 +994,20 @@ static void NCR5380_main(struct work_struct *work) * Function : void NCR5380_dma_complete (struct Scsi_Host *instance) * * Purpose : Called by interrupt handler when DMA finishes or a phase - * mismatch occurs (which would finish the DMA transfer). + * mismatch occurs (which would finish the DMA transfer). * * Inputs : instance - this instance of the NCR5380. 
- * */ static void NCR5380_dma_complete(struct Scsi_Host *instance) { - SETUP_HOSTDATA(instance); + struct NCR5380_hostdata *hostdata = shost_priv(instance); int transferred; unsigned char **data; - volatile int *count; + int *count; int saved_data = 0, overrun = 0; unsigned char p; - if (!hostdata->connected) { - printk(KERN_WARNING "scsi%d: received end of DMA interrupt with " - "no connected cmd\n", HOSTNO); - return; - } - if (hostdata->read_overruns) { p = hostdata->connected->SCp.phase; if (p & SR_IO) { @@ -1126,15 +1017,11 @@ static void NCR5380_dma_complete(struct Scsi_Host *instance) (BASR_PHASE_MATCH|BASR_ACK)) { saved_data = NCR5380_read(INPUT_DATA_REG); overrun = 1; - dprintk(NDEBUG_DMA, "scsi%d: read overrun handled\n", HOSTNO); + dsprintk(NDEBUG_DMA, instance, "read overrun handled\n"); } } } - dprintk(NDEBUG_DMA, "scsi%d: real DMA transfer complete, basr 0x%X, sr 0x%X\n", - HOSTNO, NCR5380_read(BUS_AND_STATUS_REG), - NCR5380_read(STATUS_REG)); - #if defined(CONFIG_SUN3) if ((sun3scsi_dma_finish(rq_data_dir(hostdata->connected->request)))) { pr_err("scsi%d: overrun in UDC counter -- not prepared to deal with this!\n", @@ -1153,9 +1040,9 @@ static void NCR5380_dma_complete(struct Scsi_Host *instance) } #endif - (void)NCR5380_read(RESET_PARITY_INTERRUPT_REG); NCR5380_write(MODE_REG, MR_BASE); NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); + NCR5380_read(RESET_PARITY_INTERRUPT_REG); transferred = hostdata->dma_len - NCR5380_dma_residual(instance); hostdata->dma_len = 0; @@ -1194,140 +1081,160 @@ static void NCR5380_dma_complete(struct Scsi_Host *instance) * Handle interrupts, reestablishing I_T_L or I_T_L_Q nexuses * from the disconnected queue, and restarting NCR5380_main() * as required. + * + * The chip can assert IRQ in any of six different conditions. The IRQ flag + * is then cleared by reading the Reset Parity/Interrupt Register (RPIR). + * Three of these six conditions are latched in the Bus and Status Register: + * - End of DMA (cleared by ending DMA Mode) + * - Parity error (cleared by reading RPIR) + * - Loss of BSY (cleared by reading RPIR) + * Two conditions have flag bits that are not latched: + * - Bus phase mismatch (non-maskable in DMA Mode, cleared by ending DMA Mode) + * - Bus reset (non-maskable) + * The remaining condition has no flag bit at all: + * - Selection/reselection + * + * Hence, establishing the cause(s) of any interrupt is partly guesswork. + * In "The DP8490 and DP5380 Comparison Guide", National Semiconductor + * claimed that "the design of the [DP8490] interrupt logic ensures + * interrupts will not be lost (they can be on the DP5380)." + * The L5380/53C80 datasheet from LOGIC Devices has more details. + * + * Checking for bus reset by reading RST is futile because of interrupt + * latency, but a bus reset will reset chip logic. Checking for parity error + * is unnecessary because that interrupt is never enabled. A Loss of BSY + * condition will clear DMA Mode. We can tell when this occurs because the + * Busy Monitor interrupt is enabled together with DMA Mode.
*/ static irqreturn_t NCR5380_intr(int irq, void *dev_id) { struct Scsi_Host *instance = dev_id; - int done = 1, handled = 0; + struct NCR5380_hostdata *hostdata = shost_priv(instance); + int handled = 0; unsigned char basr; + unsigned long flags; - dprintk(NDEBUG_INTR, "scsi%d: NCR5380 irq triggered\n", HOSTNO); + spin_lock_irqsave(&hostdata->lock, flags); - /* Look for pending interrupts */ basr = NCR5380_read(BUS_AND_STATUS_REG); - dprintk(NDEBUG_INTR, "scsi%d: BASR=%02x\n", HOSTNO, basr); - /* dispatch to appropriate routine if found and done=0 */ if (basr & BASR_IRQ) { - NCR5380_dprint(NDEBUG_INTR, instance); - if ((NCR5380_read(STATUS_REG) & (SR_SEL|SR_IO)) == (SR_SEL|SR_IO)) { - done = 0; - dprintk(NDEBUG_INTR, "scsi%d: SEL interrupt\n", HOSTNO); - NCR5380_reselect(instance); - (void)NCR5380_read(RESET_PARITY_INTERRUPT_REG); - } else if (basr & BASR_PARITY_ERROR) { - dprintk(NDEBUG_INTR, "scsi%d: PARITY interrupt\n", HOSTNO); - (void)NCR5380_read(RESET_PARITY_INTERRUPT_REG); - } else if ((NCR5380_read(STATUS_REG) & SR_RST) == SR_RST) { - dprintk(NDEBUG_INTR, "scsi%d: RESET interrupt\n", HOSTNO); - (void)NCR5380_read(RESET_PARITY_INTERRUPT_REG); - } else { - /* - * The rest of the interrupt conditions can occur only during a - * DMA transfer - */ + unsigned char mr = NCR5380_read(MODE_REG); + unsigned char sr = NCR5380_read(STATUS_REG); + + dsprintk(NDEBUG_INTR, instance, "IRQ %d, BASR 0x%02x, SR 0x%02x, MR 0x%02x\n", + irq, basr, sr, mr); #if defined(REAL_DMA) - /* - * We should only get PHASE MISMATCH and EOP interrupts if we have - * DMA enabled, so do a sanity check based on the current setting - * of the MODE register. + if ((mr & MR_DMA_MODE) || (mr & MR_MONITOR_BSY)) { + /* Probably End of DMA, Phase Mismatch or Loss of BSY. + * We ack IRQ after clearing Mode Register. Workarounds + * for End of DMA errata need to happen in DMA Mode. 
*/ - if ((NCR5380_read(MODE_REG) & MR_DMA_MODE) && - ((basr & BASR_END_DMA_TRANSFER) || - !(basr & BASR_PHASE_MATCH))) { + dsprintk(NDEBUG_INTR, instance, "interrupt in DMA mode\n"); - dprintk(NDEBUG_INTR, "scsi%d: PHASE MISM or EOP interrupt\n", HOSTNO); - NCR5380_dma_complete( instance ); - done = 0; - } else + if (hostdata->connected) { + NCR5380_dma_complete(instance); + queue_work(hostdata->work_q, &hostdata->main_task); + } else { + NCR5380_write(MODE_REG, MR_BASE); + NCR5380_read(RESET_PARITY_INTERRUPT_REG); + } + } else #endif /* REAL_DMA */ - { -/* MS: Ignore unknown phase mismatch interrupts (caused by EOP interrupt) */ - if (basr & BASR_PHASE_MATCH) - dprintk(NDEBUG_INTR, "scsi%d: unknown interrupt, " - "BASR 0x%x, MR 0x%x, SR 0x%x\n", - HOSTNO, basr, NCR5380_read(MODE_REG), - NCR5380_read(STATUS_REG)); - (void)NCR5380_read(RESET_PARITY_INTERRUPT_REG); + if ((NCR5380_read(CURRENT_SCSI_DATA_REG) & hostdata->id_mask) && + (sr & (SR_SEL | SR_IO | SR_BSY | SR_RST)) == (SR_SEL | SR_IO)) { + /* Probably reselected */ + NCR5380_write(SELECT_ENABLE_REG, 0); + NCR5380_read(RESET_PARITY_INTERRUPT_REG); + + dsprintk(NDEBUG_INTR, instance, "interrupt with SEL and IO\n"); + + if (!hostdata->connected) { + NCR5380_reselect(instance); + queue_work(hostdata->work_q, &hostdata->main_task); + } + if (!hostdata->connected) + NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); + } else { + /* Probably Bus Reset */ + NCR5380_read(RESET_PARITY_INTERRUPT_REG); + + dsprintk(NDEBUG_INTR, instance, "unknown interrupt\n"); #ifdef SUN3_SCSI_VME - dregs->csr |= CSR_DMA_ENABLE; + dregs->csr |= CSR_DMA_ENABLE; #endif - } - } /* if !(SELECTION || PARITY) */ + } handled = 1; - } /* BASR & IRQ */ else { - printk(KERN_NOTICE "scsi%d: interrupt without IRQ bit set in BASR, " - "BASR 0x%X, MR 0x%X, SR 0x%x\n", HOSTNO, basr, - NCR5380_read(MODE_REG), NCR5380_read(STATUS_REG)); - (void)NCR5380_read(RESET_PARITY_INTERRUPT_REG); + } else { + shost_printk(KERN_NOTICE, instance, "interrupt without IRQ bit\n"); #ifdef SUN3_SCSI_VME dregs->csr |= CSR_DMA_ENABLE; #endif } - if (!done) { - dprintk(NDEBUG_INTR, "scsi%d: in int routine, calling main\n", HOSTNO); - /* Put a call to NCR5380_main() on the queue... */ - queue_main(shost_priv(instance)); - } + spin_unlock_irqrestore(&hostdata->lock, flags); + return IRQ_RETVAL(handled); } /* * Function : int NCR5380_select(struct Scsi_Host *instance, - * struct scsi_cmnd *cmd) + * struct scsi_cmnd *cmd) * * Purpose : establishes I_T_L or I_T_L_Q nexus for new or existing command, - * including ARBITRATION, SELECTION, and initial message out for - * IDENTIFY and queue messages. + * including ARBITRATION, SELECTION, and initial message out for + * IDENTIFY and queue messages. * * Inputs : instance - instantiation of the 5380 driver on which this - * target lives, cmd - SCSI command to execute. + * target lives, cmd - SCSI command to execute. * - * Returns : -1 if selection could not execute for some reason, - * 0 if selection succeeded or failed because the target - * did not respond. + * Returns cmd if selection failed but should be retried, + * NULL if selection failed and should not be retried, or + * NULL if selection succeeded (hostdata->connected == cmd). * * Side effects : - * If bus busy, arbitration failed, etc, NCR5380_select() will exit - * with registers as they should have been on entry - ie - * SELECT_ENABLE will be set appropriately, the NCR5380 - * will cease to drive any SCSI bus signals. 
+ * If bus busy, arbitration failed, etc, NCR5380_select() will exit + * with registers as they should have been on entry - ie + * SELECT_ENABLE will be set appropriately, the NCR5380 + * will cease to drive any SCSI bus signals. * - * If successful : I_T_L or I_T_L_Q nexus will be established, - * instance->connected will be set to cmd. - * SELECT interrupt will be disabled. + * If successful : I_T_L or I_T_L_Q nexus will be established, + * instance->connected will be set to cmd. + * SELECT interrupt will be disabled. * - * If failed (no target) : cmd->scsi_done() will be called, and the - * cmd->result host byte set to DID_BAD_TARGET. + * If failed (no target) : cmd->scsi_done() will be called, and the + * cmd->result host byte set to DID_BAD_TARGET. */ -static int NCR5380_select(struct Scsi_Host *instance, struct scsi_cmnd *cmd) +static struct scsi_cmnd *NCR5380_select(struct Scsi_Host *instance, + struct scsi_cmnd *cmd) { - SETUP_HOSTDATA(instance); + struct NCR5380_hostdata *hostdata = shost_priv(instance); unsigned char tmp[3], phase; unsigned char *data; int len; - unsigned long timeout; - unsigned long flags; + int err; - hostdata->restart_select = 0; NCR5380_dprint(NDEBUG_ARBITRATION, instance); - dprintk(NDEBUG_ARBITRATION, "scsi%d: starting arbitration, id = %d\n", HOSTNO, - instance->this_id); + dsprintk(NDEBUG_ARBITRATION, instance, "starting arbitration, id = %d\n", + instance->this_id); + + /* + * Arbitration and selection phases are slow and involve dropping the + * lock, so we have to watch out for EH. An exception handler may + * change 'selecting' to NULL. This function will then return NULL + * so that the caller will forget about 'cmd'. (During information + * transfer phases, EH may change 'connected' to NULL.) + */ + hostdata->selecting = cmd; /* * Set the phase bits to 0, otherwise the NCR5380 won't drive the * data bus during SELECTION. */ - local_irq_save(flags); - if (hostdata->connected) { - local_irq_restore(flags); - return -1; - } NCR5380_write(TARGET_COMMAND_REG, 0); /* @@ -1337,96 +1244,77 @@ static int NCR5380_select(struct Scsi_Host *instance, struct scsi_cmnd *cmd) NCR5380_write(OUTPUT_DATA_REG, hostdata->id_mask); NCR5380_write(MODE_REG, MR_ARBITRATE); - local_irq_restore(flags); - - /* Wait for arbitration logic to complete */ -#if defined(NCR_TIMEOUT) - { - unsigned long timeout = jiffies + 2*NCR_TIMEOUT; + /* The chip now waits for BUS FREE phase. Then after the 800 ns + * Bus Free Delay, arbitration will begin. 
+ */ - while (!(NCR5380_read(INITIATOR_COMMAND_REG) & ICR_ARBITRATION_PROGRESS) && - time_before(jiffies, timeout) && !hostdata->connected) - ; - if (time_after_eq(jiffies, timeout)) { - printk("scsi : arbitration timeout at %d\n", __LINE__); - NCR5380_write(MODE_REG, MR_BASE); - NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); - return -1; - } + spin_unlock_irq(&hostdata->lock); + err = NCR5380_poll_politely2(instance, MODE_REG, MR_ARBITRATE, 0, + INITIATOR_COMMAND_REG, ICR_ARBITRATION_PROGRESS, + ICR_ARBITRATION_PROGRESS, HZ); + spin_lock_irq(&hostdata->lock); + if (!(NCR5380_read(MODE_REG) & MR_ARBITRATE)) { + /* Reselection interrupt */ + goto out; } -#else /* NCR_TIMEOUT */ - while (!(NCR5380_read(INITIATOR_COMMAND_REG) & ICR_ARBITRATION_PROGRESS) && - !hostdata->connected) - ; -#endif - - dprintk(NDEBUG_ARBITRATION, "scsi%d: arbitration complete\n", HOSTNO); - - if (hostdata->connected) { + if (err < 0) { NCR5380_write(MODE_REG, MR_BASE); - return -1; + shost_printk(KERN_ERR, instance, + "select: arbitration timeout\n"); + goto out; } - /* - * The arbitration delay is 2.2us, but this is a minimum and there is - * no maximum so we can safely sleep for ceil(2.2) usecs to accommodate - * the integral nature of udelay(). - * - */ + spin_unlock_irq(&hostdata->lock); + /* The SCSI-2 arbitration delay is 2.4 us */ udelay(3); /* Check for lost arbitration */ if ((NCR5380_read(INITIATOR_COMMAND_REG) & ICR_ARBITRATION_LOST) || (NCR5380_read(CURRENT_SCSI_DATA_REG) & hostdata->id_higher_mask) || - (NCR5380_read(INITIATOR_COMMAND_REG) & ICR_ARBITRATION_LOST) || - hostdata->connected) { + (NCR5380_read(INITIATOR_COMMAND_REG) & ICR_ARBITRATION_LOST)) { NCR5380_write(MODE_REG, MR_BASE); - dprintk(NDEBUG_ARBITRATION, "scsi%d: lost arbitration, deasserting MR_ARBITRATE\n", - HOSTNO); - return -1; + dsprintk(NDEBUG_ARBITRATION, instance, "lost arbitration, deasserting MR_ARBITRATE\n"); + spin_lock_irq(&hostdata->lock); + goto out; } - /* after/during arbitration, BSY should be asserted. - IBM DPES-31080 Version S31Q works now */ - /* Tnx to Thomas_Roesch@m2.maus.de for finding this! (Roman) */ + /* After/during arbitration, BSY should be asserted. + * IBM DPES-31080 Version S31Q works now + * Tnx to Thomas_Roesch@m2.maus.de for finding this! (Roman) + */ NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_SEL | ICR_ASSERT_BSY); - if ((NCR5380_read(INITIATOR_COMMAND_REG) & ICR_ARBITRATION_LOST) || - hostdata->connected) { - NCR5380_write(MODE_REG, MR_BASE); - NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); - dprintk(NDEBUG_ARBITRATION, "scsi%d: lost arbitration, deasserting ICR_ASSERT_SEL\n", - HOSTNO); - return -1; - } - /* * Again, bus clear + bus settle time is 1.2us, however, this is * a minimum so we'll udelay ceil(1.2) */ -#ifdef CONFIG_ATARI_SCSI_TOSHIBA_DELAY - /* ++roman: But some targets (see above :-) seem to need a bit more... 
*/ - udelay(15); -#else - udelay(2); -#endif + if (hostdata->flags & FLAG_TOSHIBA_DELAY) + udelay(15); + else + udelay(2); - if (hostdata->connected) { + spin_lock_irq(&hostdata->lock); + + /* NCR5380_reselect() clears MODE_REG after a reselection interrupt */ + if (!(NCR5380_read(MODE_REG) & MR_ARBITRATE)) + goto out; + + if (!hostdata->selecting) { NCR5380_write(MODE_REG, MR_BASE); NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); - return -1; + goto out; } - dprintk(NDEBUG_ARBITRATION, "scsi%d: won arbitration\n", HOSTNO); + dsprintk(NDEBUG_ARBITRATION, instance, "won arbitration\n"); /* * Now that we have won arbitration, start Selection process, asserting * the host and target ID's on the SCSI bus. */ - NCR5380_write(OUTPUT_DATA_REG, (hostdata->id_mask | (1 << cmd->device->id))); + NCR5380_write(OUTPUT_DATA_REG, hostdata->id_mask | (1 << scmd_id(cmd))); /* * Raise ATN while SEL is true before BSY goes false from arbitration, @@ -1434,22 +1322,18 @@ static int NCR5380_select(struct Scsi_Host *instance, struct scsi_cmnd *cmd) * phase immediately after selection. */ - NCR5380_write(INITIATOR_COMMAND_REG, (ICR_BASE | ICR_ASSERT_BSY | - ICR_ASSERT_DATA | ICR_ASSERT_ATN | ICR_ASSERT_SEL )); + NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_BSY | + ICR_ASSERT_DATA | ICR_ASSERT_ATN | ICR_ASSERT_SEL); NCR5380_write(MODE_REG, MR_BASE); /* * Reselect interrupts must be turned off prior to the dropping of BSY, * otherwise we will trigger an interrupt. */ - - if (hostdata->connected) { - NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); - return -1; - } - NCR5380_write(SELECT_ENABLE_REG, 0); + spin_unlock_irq(&hostdata->lock); + /* * The initiator shall then wait at least two deskew delays and release * the BSY signal. @@ -1457,8 +1341,8 @@ static int NCR5380_select(struct Scsi_Host *instance, struct scsi_cmnd *cmd) udelay(1); /* wingel -- wait two bus deskew delay >2*45ns */ /* Reset BSY */ - NCR5380_write(INITIATOR_COMMAND_REG, (ICR_BASE | ICR_ASSERT_DATA | - ICR_ASSERT_ATN | ICR_ASSERT_SEL)); + NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_DATA | + ICR_ASSERT_ATN | ICR_ASSERT_SEL); /* * Something weird happens when we cease to drive BSY - looks @@ -1479,45 +1363,39 @@ static int NCR5380_select(struct Scsi_Host *instance, struct scsi_cmnd *cmd) udelay(1); - dprintk(NDEBUG_SELECTION, "scsi%d: selecting target %d\n", HOSTNO, cmd->device->id); + dsprintk(NDEBUG_SELECTION, instance, "selecting target %d\n", scmd_id(cmd)); /* * The SCSI specification calls for a 250 ms timeout for the actual * selection. */ - timeout = jiffies + msecs_to_jiffies(250); - - /* - * XXX very interesting - we're seeing a bounce where the BSY we - * asserted is being reflected / still asserted (propagation delay?) - * and it's detecting as true. Sigh. - */ - -#if 0 - /* ++roman: If a target conformed to the SCSI standard, it wouldn't assert - * IO while SEL is true. But again, there are some disks out there in the - * world that do that nevertheless. (Somebody claimed that this announces - * reselection capability of the target.) So we better skip that test and - * only wait for BSY...
(Famous German words: Der Klügere gibt nach - the wiser one gives in :-) - */ - - while (time_before(jiffies, timeout) && - !(NCR5380_read(STATUS_REG) & (SR_BSY | SR_IO))) - ; + err = NCR5380_poll_politely(instance, STATUS_REG, SR_BSY, SR_BSY, + msecs_to_jiffies(250)); if ((NCR5380_read(STATUS_REG) & (SR_SEL | SR_IO)) == (SR_SEL | SR_IO)) { + spin_lock_irq(&hostdata->lock); NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); NCR5380_reselect(instance); - printk(KERN_ERR "scsi%d: reselection after won arbitration?\n", - HOSTNO); + if (!hostdata->connected) + NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); + shost_printk(KERN_ERR, instance, "reselection after won arbitration?\n"); + goto out; + } + + if (err < 0) { + spin_lock_irq(&hostdata->lock); + NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); - return -1; + /* Can't touch cmd if it has been reclaimed by the scsi ML */ + if (hostdata->selecting) { + cmd->result = DID_BAD_TARGET << 16; + complete_cmd(instance, cmd); + dsprintk(NDEBUG_SELECTION, instance, "target did not respond within 250ms\n"); + cmd = NULL; + } + goto out; } -#else - while (time_before(jiffies, timeout) && !(NCR5380_read(STATUS_REG) & SR_BSY)) - ; -#endif /* * No less than two deskew delays after the initiator detects the @@ -1529,29 +1407,6 @@ static int NCR5380_select(struct Scsi_Host *instance, struct scsi_cmnd *cmd) NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_ATN); - if (!(NCR5380_read(STATUS_REG) & SR_BSY)) { - NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); - if (hostdata->targets_present & (1 << cmd->device->id)) { - printk(KERN_ERR "scsi%d: weirdness\n", HOSTNO); - if (hostdata->restart_select) - printk(KERN_NOTICE "\trestart select\n"); - NCR5380_dprint(NDEBUG_ANY, instance); - NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); - return -1; - } - cmd->result = DID_BAD_TARGET << 16; -#ifdef SUPPORT_TAGS - cmd_free_tag(cmd); -#endif - cmd->scsi_done(cmd); - NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); - dprintk(NDEBUG_SELECTION, "scsi%d: target did not respond within 250ms\n", HOSTNO); - NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); - return 0; - } - - hostdata->targets_present |= (1 << cmd->device->id); - /* * Since we followed the SCSI spec, and raised ATN while SEL * was true but before BSY was false during selection, the information @@ -1563,16 +1418,27 @@ static int NCR5380_select(struct Scsi_Host *instance, struct scsi_cmnd *cmd) * until it wraps back to 0. * * XXX - it turns out that there are some broken SCSI-II devices, - * which claim to support tagged queuing but fail when more than - * some number of commands are issued at once. + * which claim to support tagged queuing but fail when more than + * some number of commands are issued at once.
*/ /* Wait for start of REQ/ACK handshake */ - while (!(NCR5380_read(STATUS_REG) & SR_REQ)) - ; - dprintk(NDEBUG_SELECTION, "scsi%d: target %d selected, going into MESSAGE OUT phase.\n", - HOSTNO, cmd->device->id); + err = NCR5380_poll_politely(instance, STATUS_REG, SR_REQ, SR_REQ, HZ); + spin_lock_irq(&hostdata->lock); + if (err < 0) { + shost_printk(KERN_ERR, instance, "select: REQ timeout\n"); + NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); + NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); + goto out; + } + if (!hostdata->selecting) { + do_abort(instance); + goto out; + } + + dsprintk(NDEBUG_SELECTION, instance, "target %d selected, going into MESSAGE OUT phase.\n", + scmd_id(cmd)); tmp[0] = IDENTIFY(1, cmd->device->lun); #ifdef SUPPORT_TAGS @@ -1591,11 +1457,12 @@ static int NCR5380_select(struct Scsi_Host *instance, struct scsi_cmnd *cmd) data = tmp; phase = PHASE_MSGOUT; NCR5380_transfer_pio(instance, &phase, &len, &data); - dprintk(NDEBUG_SELECTION, "scsi%d: nexus established.\n", HOSTNO); + dsprintk(NDEBUG_SELECTION, instance, "nexus established.\n"); /* XXX need to handle errors here */ + hostdata->connected = cmd; #ifndef SUPPORT_TAGS - hostdata->busy[cmd->device->id] |= (1 << cmd->device->lun); + hostdata->busy[cmd->device->id] |= 1 << cmd->device->lun; #endif #ifdef SUN3_SCSI_VME dregs->csr |= CSR_INTR; @@ -1603,24 +1470,30 @@ static int NCR5380_select(struct Scsi_Host *instance, struct scsi_cmnd *cmd) initialize_SCp(cmd); - return 0; + cmd = NULL; + +out: + if (!hostdata->selecting) + return NULL; + hostdata->selecting = NULL; + return cmd; } /* * Function : int NCR5380_transfer_pio (struct Scsi_Host *instance, - * unsigned char *phase, int *count, unsigned char **data) + * unsigned char *phase, int *count, unsigned char **data) * * Purpose : transfers data in given phase using polled I/O * * Inputs : instance - instance of driver, *phase - pointer to - * what phase is expected, *count - pointer to number of - * bytes to transfer, **data - pointer to data pointer. + * what phase is expected, *count - pointer to number of + * bytes to transfer, **data - pointer to data pointer. * * Returns : -1 when different phase is entered without transferring - * maximum number of bytes, 0 if all bytes are transferred or exit - * is in same phase. + * maximum number of bytes, 0 if all bytes are transferred or exit + * is in same phase. * - * Also, *phase, *count, *data are modified in place. + * Also, *phase, *count, *data are modified in place. * * XXX Note : handling for bus free may be useful. 
*/ @@ -1635,9 +1508,9 @@ static int NCR5380_transfer_pio(struct Scsi_Host *instance, unsigned char *phase, int *count, unsigned char **data) { - register unsigned char p = *phase, tmp; - register int c = *count; - register unsigned char *d = *data; + unsigned char p = *phase, tmp; + int c = *count; + unsigned char *d = *data; /* * The NCR5380 chip will only drive the SCSI bus when the @@ -1652,14 +1525,15 @@ static int NCR5380_transfer_pio(struct Scsi_Host *instance, * Wait for assertion of REQ, after which the phase bits will be * valid */ - while (!((tmp = NCR5380_read(STATUS_REG)) & SR_REQ)) - ; - dprintk(NDEBUG_HANDSHAKE, "scsi%d: REQ detected\n", HOSTNO); + if (NCR5380_poll_politely(instance, STATUS_REG, SR_REQ, SR_REQ, HZ) < 0) + break; + + dsprintk(NDEBUG_HANDSHAKE, instance, "REQ asserted\n"); /* Check for phase mismatch */ - if ((tmp & PHASE_MASK) != p) { - dprintk(NDEBUG_PIO, "scsi%d: phase mismatch\n", HOSTNO); + if ((NCR5380_read(STATUS_REG) & PHASE_MASK) != p) { + dsprintk(NDEBUG_PIO, instance, "phase mismatch\n"); NCR5380_dprint_phase(NDEBUG_PIO, instance); break; } @@ -1684,35 +1558,36 @@ static int NCR5380_transfer_pio(struct Scsi_Host *instance, NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_DATA); NCR5380_dprint(NDEBUG_PIO, instance); NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | - ICR_ASSERT_DATA | ICR_ASSERT_ACK); + ICR_ASSERT_DATA | ICR_ASSERT_ACK); } else { NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | - ICR_ASSERT_DATA | ICR_ASSERT_ATN); + ICR_ASSERT_DATA | ICR_ASSERT_ATN); NCR5380_dprint(NDEBUG_PIO, instance); NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | - ICR_ASSERT_DATA | ICR_ASSERT_ATN | ICR_ASSERT_ACK); + ICR_ASSERT_DATA | ICR_ASSERT_ATN | ICR_ASSERT_ACK); } } else { NCR5380_dprint(NDEBUG_PIO, instance); NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_ACK); } - while (NCR5380_read(STATUS_REG) & SR_REQ) - ; + if (NCR5380_poll_politely(instance, + STATUS_REG, SR_REQ, 0, 5 * HZ) < 0) + break; - dprintk(NDEBUG_HANDSHAKE, "scsi%d: req false, handshake complete\n", HOSTNO); + dsprintk(NDEBUG_HANDSHAKE, instance, "REQ negated, handshake complete\n"); - /* - * We have several special cases to consider during REQ/ACK handshaking : - * 1. We were in MSGOUT phase, and we are on the last byte of the - * message. ATN must be dropped as ACK is dropped. - * - * 2. We are in a MSGIN phase, and we are on the last byte of the - * message. We must exit with ACK asserted, so that the calling - * code may raise ATN before dropping ACK to reject the message. - * - * 3. ACK and ATN are clear and the target may proceed as normal. - */ +/* + * We have several special cases to consider during REQ/ACK handshaking : + * 1. We were in MSGOUT phase, and we are on the last byte of the + * message. ATN must be dropped as ACK is dropped. + * + * 2. We are in a MSGIN phase, and we are on the last byte of the + * message. We must exit with ACK asserted, so that the calling + * code may raise ATN before dropping ACK to reject the message. + * + * 3. ACK and ATN are clear and the target may proceed as normal. 
+ */ if (!(p == PHASE_MSGIN && c == 1)) { if (p == PHASE_MSGOUT && c > 1) NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_ATN); @@ -1721,16 +1596,16 @@ static int NCR5380_transfer_pio(struct Scsi_Host *instance, } } while (--c); - dprintk(NDEBUG_PIO, "scsi%d: residual %d\n", HOSTNO, c); + dsprintk(NDEBUG_PIO, instance, "residual %d\n", c); *count = c; *data = d; tmp = NCR5380_read(STATUS_REG); /* The phase read from the bus is valid if either REQ is (already) - * asserted or if ACK hasn't been released yet. The latter is the case if - * we're in MSGIN and all wanted bytes have been received. + * asserted or if ACK hasn't been released yet. The latter applies if + * we're in MSG IN, DATA IN or STATUS and all bytes have been received. */ - if ((tmp & SR_REQ) || (p == PHASE_MSGIN && c == 0)) + if ((tmp & SR_REQ) || ((tmp & SR_IO) && c == 0)) *phase = tmp & PHASE_MASK; else *phase = PHASE_UNKNOWN; @@ -1741,19 +1616,45 @@ static int NCR5380_transfer_pio(struct Scsi_Host *instance, return -1; } -/* - * Function : do_abort (Scsi_Host *host) +/** + * do_reset - issue a reset command + * @instance: adapter to reset * - * Purpose : abort the currently established nexus. Should only be - * called from a routine which can drop into a + * Issue a reset sequence to the NCR5380 and try and get the bus + * back into sane shape. * - * Returns : 0 on success, -1 on failure. + * This clears the reset interrupt flag because there may be no handler for + * it. When the driver is initialized, the NCR5380_intr() handler has not yet + * been installed. And when in EH we may have released the ST DMA interrupt. + */ + +static void do_reset(struct Scsi_Host *instance) +{ + unsigned long flags; + + local_irq_save(flags); + NCR5380_write(TARGET_COMMAND_REG, + PHASE_SR_TO_TCR(NCR5380_read(STATUS_REG) & PHASE_MASK)); + NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_RST); + udelay(50); + NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); + (void)NCR5380_read(RESET_PARITY_INTERRUPT_REG); + local_irq_restore(flags); +} + +/** + * do_abort - abort the currently established nexus by going to + * MESSAGE OUT phase and sending an ABORT message. + * @instance: relevant scsi host instance + * + * Returns 0 on success, -1 on failure. */ static int do_abort(struct Scsi_Host *instance) { - unsigned char tmp, *msgptr, phase; + unsigned char *msgptr, phase, tmp; int len; + int rc; /* Request message out phase */ NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_ATN); @@ -1768,16 +1669,20 @@ static int do_abort(struct Scsi_Host *instance) * the target sees, so we just handshake. */ - while (!((tmp = NCR5380_read(STATUS_REG)) & SR_REQ)) - ; + rc = NCR5380_poll_politely(instance, STATUS_REG, SR_REQ, SR_REQ, 10 * HZ); + if (rc < 0) + goto timeout; + + tmp = NCR5380_read(STATUS_REG) & PHASE_MASK; NCR5380_write(TARGET_COMMAND_REG, PHASE_SR_TO_TCR(tmp)); - if ((tmp & PHASE_MASK) != PHASE_MSGOUT) { - NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_ATN | - ICR_ASSERT_ACK); - while (NCR5380_read(STATUS_REG) & SR_REQ) - ; + if (tmp != PHASE_MSGOUT) { + NCR5380_write(INITIATOR_COMMAND_REG, + ICR_BASE | ICR_ASSERT_ATN | ICR_ASSERT_ACK); + rc = NCR5380_poll_politely(instance, STATUS_REG, SR_REQ, 0, 3 * HZ); + if (rc < 0) + goto timeout; NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_ATN); } @@ -1793,26 +1698,29 @@ static int do_abort(struct Scsi_Host *instance) */ return len ? 
-1 : 0; + +timeout: + NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); + return -1; } #if defined(REAL_DMA) /* * Function : int NCR5380_transfer_dma (struct Scsi_Host *instance, - * unsigned char *phase, int *count, unsigned char **data) + * unsigned char *phase, int *count, unsigned char **data) * * Purpose : transfers data in given phase using either real - * or pseudo DMA. + * or pseudo DMA. * * Inputs : instance - instance of driver, *phase - pointer to - * what phase is expected, *count - pointer to number of - * bytes to transfer, **data - pointer to data pointer. + * what phase is expected, *count - pointer to number of - * bytes to transfer, **data - pointer to data pointer. + * bytes to transfer, **data - pointer to data pointer. * * Returns : -1 when different phase is entered without transferring - * maximum number of bytes, 0 if all bytes or transferred or exit - * is in same phase. - * - * Also, *phase, *count, *data are modified in place. + * maximum number of bytes, 0 if all bytes are transferred or exit + * is in same phase. * + * Also, *phase, *count, *data are modified in place. */ @@ -1820,10 +1728,9 @@ static int NCR5380_transfer_dma(struct Scsi_Host *instance, unsigned char *phase, int *count, unsigned char **data) { - SETUP_HOSTDATA(instance); + struct NCR5380_hostdata *hostdata = shost_priv(instance); register int c = *count; register unsigned char p = *phase; - unsigned long flags; #if defined(CONFIG_SUN3) /* sanity check */ if (!hostdata->dma_len) { @@ -1834,29 +1741,22 @@ static int NCR5380_transfer_dma(struct Scsi_Host *instance, } hostdata->dma_len = c; - dprintk(NDEBUG_DMA, "scsi%d: initializing DMA for %s, %d bytes %s %p\n", - instance->host_no, (p & SR_IO) ? "reading" : "writing", - c, (p & SR_IO) ? "to" : "from", *data); + dsprintk(NDEBUG_DMA, instance, "initializing DMA %s: length %d, address %p\n", + (p & SR_IO) ? "receive" : "send", c, *data); /* netbsd turns off ints here, why not be safe and do it too */ - local_irq_save(flags); /* send start chain */ sun3scsi_dma_start(c, *data); + NCR5380_write(TARGET_COMMAND_REG, PHASE_SR_TO_TCR(p)); + NCR5380_write(MODE_REG, MR_BASE | MR_DMA_MODE | MR_MONITOR_BSY | + MR_ENABLE_EOP_INTR); if (p & SR_IO) { - NCR5380_write(TARGET_COMMAND_REG, 1); - NCR5380_read(RESET_PARITY_INTERRUPT_REG); NCR5380_write(INITIATOR_COMMAND_REG, 0); - NCR5380_write(MODE_REG, - (NCR5380_read(MODE_REG) | MR_DMA_MODE | MR_ENABLE_EOP_INTR)); NCR5380_write(START_DMA_INITIATOR_RECEIVE_REG, 0); } else { - NCR5380_write(TARGET_COMMAND_REG, 0); - NCR5380_read(RESET_PARITY_INTERRUPT_REG); NCR5380_write(INITIATOR_COMMAND_REG, ICR_ASSERT_DATA); - NCR5380_write(MODE_REG, - (NCR5380_read(MODE_REG) | MR_DMA_MODE | MR_ENABLE_EOP_INTR)); NCR5380_write(START_DMA_SEND_REG, 0); } @@ -1864,8 +1764,6 @@ static int NCR5380_transfer_dma(struct Scsi_Host *instance, dregs->csr |= CSR_DMA_ENABLE; #endif - local_irq_restore(flags); - sun3_dma_active = 1; #else /* !defined(CONFIG_SUN3) */ @@ -1880,25 +1778,20 @@ static int NCR5380_transfer_dma(struct Scsi_Host *instance, if (hostdata->read_overruns && (p & SR_IO)) c -= hostdata->read_overruns; - dprintk(NDEBUG_DMA, "scsi%d: initializing DMA for %s, %d bytes %s %p\n", - HOSTNO, (p & SR_IO) ? "reading" : "writing", - c, (p & SR_IO) ? "to" : "from", d); + dsprintk(NDEBUG_DMA, instance, "initializing DMA %s: length %d, address %p\n", + (p & SR_IO) ?
"receive" : "send", c, d); NCR5380_write(TARGET_COMMAND_REG, PHASE_SR_TO_TCR(p)); - -#ifdef REAL_DMA - NCR5380_write(MODE_REG, MR_BASE | MR_DMA_MODE | MR_ENABLE_EOP_INTR | MR_MONITOR_BSY); -#endif /* def REAL_DMA */ + NCR5380_write(MODE_REG, MR_BASE | MR_DMA_MODE | MR_MONITOR_BSY | + MR_ENABLE_EOP_INTR); if (!(hostdata->flags & FLAG_LATE_DMA_SETUP)) { /* On the Medusa, it is a must to initialize the DMA before * starting the NCR. This is also the cleaner way for the TT. */ - local_irq_save(flags); hostdata->dma_len = (p & SR_IO) ? NCR5380_dma_read_setup(instance, d, c) : NCR5380_dma_write_setup(instance, d, c); - local_irq_restore(flags); } if (p & SR_IO) @@ -1912,11 +1805,9 @@ static int NCR5380_transfer_dma(struct Scsi_Host *instance, /* On the Falcon, the DMA setup must be done after the last */ /* NCR access, else the DMA setup gets trashed! */ - local_irq_save(flags); hostdata->dma_len = (p & SR_IO) ? NCR5380_dma_read_setup(instance, d, c) : NCR5380_dma_write_setup(instance, d, c); - local_irq_restore(flags); } #endif /* !defined(CONFIG_SUN3) */ @@ -1928,23 +1819,22 @@ static int NCR5380_transfer_dma(struct Scsi_Host *instance, * Function : NCR5380_information_transfer (struct Scsi_Host *instance) * * Purpose : run through the various SCSI phases and do as the target - * directs us to. Operates on the currently connected command, - * instance->connected. + * directs us to. Operates on the currently connected command, + * instance->connected. * * Inputs : instance, instance for which we are doing commands * * Side effects : SCSI things happen, the disconnected queue will be - * modified if a command disconnects, *instance->connected will - * change. + * modified if a command disconnects, *instance->connected will + * change. * * XXX Note : we need to watch for bus free or a reset condition here - * to recover from an unexpected bus free condition. + * to recover from an unexpected bus free condition. */ static void NCR5380_information_transfer(struct Scsi_Host *instance) { - SETUP_HOSTDATA(instance); - unsigned long flags; + struct NCR5380_hostdata *hostdata = shost_priv(instance); unsigned char msgout = NOP; int sink = 0; int len; @@ -1953,13 +1843,15 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) #endif unsigned char *data; unsigned char phase, tmp, extended_msg[10], old_phase = 0xff; - struct scsi_cmnd *cmd = (struct scsi_cmnd *) hostdata->connected; + struct scsi_cmnd *cmd; #ifdef SUN3_SCSI_VME dregs->csr |= CSR_INTR; #endif - while (1) { + while ((cmd = hostdata->connected)) { + struct NCR5380_cmd *ncmd = scsi_cmd_priv(cmd); + tmp = NCR5380_read(STATUS_REG); /* We only have a valid SCSI phase when REQ is asserted */ if (tmp & SR_REQ) { @@ -1984,7 +1876,7 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) /* this command setup for dma yet? 
*/ if ((count >= DMA_MIN_SIZE) && (sun3_dma_setup_done != cmd)) { if (cmd->request->cmd_type == REQ_TYPE_FS) { - sun3scsi_dma_setup(d, count, + sun3scsi_dma_setup(instance, d, count, rq_data_dir(cmd->request)); sun3_dma_setup_done = cmd; } @@ -2000,11 +1892,11 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) NCR5380_write(TARGET_COMMAND_REG, PHASE_SR_TO_TCR(tmp)); NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_ATN | - ICR_ASSERT_ACK); + ICR_ASSERT_ACK); while (NCR5380_read(STATUS_REG) & SR_REQ) ; NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | - ICR_ASSERT_ATN); + ICR_ASSERT_ATN); sink = 0; continue; } @@ -2012,12 +1904,11 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) switch (phase) { case PHASE_DATAOUT: #if (NDEBUG & NDEBUG_NO_DATAOUT) - printk("scsi%d: NDEBUG_NO_DATAOUT set, attempted DATAOUT " - "aborted\n", HOSTNO); + shost_printk(KERN_DEBUG, instance, "NDEBUG_NO_DATAOUT set, attempted DATAOUT aborted\n"); sink = 1; do_abort(instance); cmd->result = DID_ERROR << 16; - cmd->scsi_done(cmd); + complete_cmd(instance, cmd); return; #endif case PHASE_DATAIN: @@ -2031,13 +1922,10 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) --cmd->SCp.buffers_residual; cmd->SCp.this_residual = cmd->SCp.buffer->length; cmd->SCp.ptr = sg_virt(cmd->SCp.buffer); - /* ++roman: Try to merge some scatter-buffers if - * they are at contiguous physical addresses. - */ merge_contiguous_buffers(cmd); - dprintk(NDEBUG_INFORMATION, "scsi%d: %d bytes and %d buffers left\n", - HOSTNO, cmd->SCp.this_residual, - cmd->SCp.buffers_residual); + dsprintk(NDEBUG_INFORMATION, instance, "%d bytes and %d buffers left\n", + cmd->SCp.this_residual, + cmd->SCp.buffers_residual); } /* @@ -2051,16 +1939,18 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) */ /* ++roman: I suggest, this should be - * #if def(REAL_DMA) + * #if def(REAL_DMA) * instead of leaving REAL_DMA out. */ #if defined(REAL_DMA) - if ( #if !defined(CONFIG_SUN3) - !cmd->device->borken && + transfersize = 0; + if (!cmd->device->borken) #endif - (transfersize = NCR5380_dma_xfer_len(instance, cmd, phase)) >= DMA_MIN_SIZE) { + transfersize = NCR5380_dma_xfer_len(instance, cmd, phase); + + if (transfersize >= DMA_MIN_SIZE) { len = transfersize; cmd->SCp.phase = phase; if (NCR5380_transfer_dma(instance, &phase, @@ -2068,16 +1958,15 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) /* * If the watchdog timer fires, all future * accesses to this device will use the - * polled-IO. */ + * polled-IO. 
+ */ scmd_printk(KERN_INFO, cmd, "switching to slow handshake\n"); cmd->device->borken = 1; - NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | - ICR_ASSERT_ATN); sink = 1; do_abort(instance); cmd->result = DID_ERROR << 16; - cmd->scsi_done(cmd); + complete_cmd(instance, cmd); /* XXX - need to source or sink data here, as appropriate */ } else { #ifdef REAL_DMA @@ -2093,9 +1982,13 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) } } else #endif /* defined(REAL_DMA) */ + { + spin_unlock_irq(&hostdata->lock); NCR5380_transfer_pio(instance, &phase, - (int *)&cmd->SCp.this_residual, - (unsigned char **)&cmd->SCp.ptr); + (int *)&cmd->SCp.this_residual, + (unsigned char **)&cmd->SCp.ptr); + spin_lock_irq(&hostdata->lock); + } #if defined(CONFIG_SUN3) && defined(REAL_DMA) /* if we had intended to dma that command clear it */ if (sun3_dma_setup_done == cmd) @@ -2105,162 +1998,64 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) case PHASE_MSGIN: len = 1; data = &tmp; - NCR5380_write(SELECT_ENABLE_REG, 0); /* disable reselects */ NCR5380_transfer_pio(instance, &phase, &len, &data); cmd->SCp.Message = tmp; switch (tmp) { - /* - * Linking lets us reduce the time required to get the - * next command out to the device, hopefully this will - * mean we don't waste another revolution due to the delays - * required by ARBITRATION and another SELECTION. - * - * In the current implementation proposal, low level drivers - * merely have to start the next command, pointed to by - * next_link, done() is called as with unlinked commands. - */ -#ifdef LINKED - case LINKED_CMD_COMPLETE: - case LINKED_FLG_CMD_COMPLETE: - /* Accept message by clearing ACK */ - NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); - - dprintk(NDEBUG_LINKED, "scsi%d: target %d lun %llu linked command " - "complete.\n", HOSTNO, cmd->device->id, cmd->device->lun); - - /* Enable reselect interrupts */ - NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); - /* - * Sanity check : A linked command should only terminate - * with one of these messages if there are more linked - * commands available. - */ - - if (!cmd->next_link) { - printk(KERN_NOTICE "scsi%d: target %d lun %llu " - "linked command complete, no next_link\n", - HOSTNO, cmd->device->id, cmd->device->lun); - sink = 1; - do_abort(instance); - return; - } - - initialize_SCp(cmd->next_link); - /* The next command is still part of this process; copy it - * and don't free it! */ - cmd->next_link->tag = cmd->tag; - cmd->result = cmd->SCp.Status | (cmd->SCp.Message << 8); - dprintk(NDEBUG_LINKED, "scsi%d: target %d lun %llu linked request " - "done, calling scsi_done().\n", - HOSTNO, cmd->device->id, cmd->device->lun); - cmd->scsi_done(cmd); - cmd = hostdata->connected; - break; -#endif /* def LINKED */ case ABORT: case COMMAND_COMPLETE: /* Accept message by clearing ACK */ NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); - dprintk(NDEBUG_QUEUES, "scsi%d: command for target %d, lun %llu " - "completed\n", HOSTNO, cmd->device->id, cmd->device->lun); + dsprintk(NDEBUG_QUEUES, instance, + "COMMAND COMPLETE %p target %d lun %llu\n", + cmd, scmd_id(cmd), cmd->device->lun); - local_irq_save(flags); - hostdata->retain_dma_intr++; hostdata->connected = NULL; #ifdef SUPPORT_TAGS cmd_free_tag(cmd); if (status_byte(cmd->SCp.Status) == QUEUE_FULL) { - /* Turn a QUEUE FULL status into BUSY, I think the - * mid level cannot handle QUEUE FULL :-( (The - * command is retried after BUSY). 
Also update our - * queue size to the number of currently issued - * commands now. - */ - /* ++Andreas: the mid level code knows about - QUEUE_FULL now. */ - struct tag_alloc *ta = &hostdata->TagAlloc[scmd_id(cmd)][cmd->device->lun]; - dprintk(NDEBUG_TAGS, "scsi%d: target %d lun %llu returned " - "QUEUE_FULL after %d commands\n", - HOSTNO, cmd->device->id, cmd->device->lun, - ta->nr_allocated); + u8 lun = cmd->device->lun; + struct tag_alloc *ta = &hostdata->TagAlloc[scmd_id(cmd)][lun]; + + dsprintk(NDEBUG_TAGS, instance, + "QUEUE_FULL %p target %d lun %d nr_allocated %d\n", + cmd, scmd_id(cmd), lun, ta->nr_allocated); if (ta->queue_size > ta->nr_allocated) - ta->nr_allocated = ta->queue_size; + ta->queue_size = ta->nr_allocated; } -#else - hostdata->busy[cmd->device->id] &= ~(1 << cmd->device->lun); #endif - /* Enable reselect interrupts */ - NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); - - /* - * I'm not sure what the correct thing to do here is : - * - * If the command that just executed is NOT a request - * sense, the obvious thing to do is to set the result - * code to the values of the stored parameters. - * - * If it was a REQUEST SENSE command, we need some way to - * differentiate between the failure code of the original - * and the failure code of the REQUEST sense - the obvious - * case is success, where we fall through and leave the - * result code unchanged. - * - * The non-obvious place is where the REQUEST SENSE failed - */ - - if (cmd->cmnd[0] != REQUEST_SENSE) - cmd->result = cmd->SCp.Status | (cmd->SCp.Message << 8); - else if (status_byte(cmd->SCp.Status) != GOOD) - cmd->result = (cmd->result & 0x00ffff) | (DID_ERROR << 16); - - if ((cmd->cmnd[0] == REQUEST_SENSE) && - hostdata->ses.cmd_len) { - scsi_eh_restore_cmnd(cmd, &hostdata->ses); - hostdata->ses.cmd_len = 0 ; - } - - if ((cmd->cmnd[0] != REQUEST_SENSE) && - (status_byte(cmd->SCp.Status) == CHECK_CONDITION)) { - scsi_eh_prep_cmnd(cmd, &hostdata->ses, NULL, 0, ~0); - - dprintk(NDEBUG_AUTOSENSE, "scsi%d: performing request sense\n", HOSTNO); - LIST(cmd,hostdata->issue_queue); - SET_NEXT(cmd, hostdata->issue_queue); - hostdata->issue_queue = (struct scsi_cmnd *) cmd; - dprintk(NDEBUG_QUEUES, "scsi%d: REQUEST SENSE added to head of " - "issue queue\n", H_NO(cmd)); - } else { - cmd->scsi_done(cmd); + cmd->result &= ~0xffff; + cmd->result |= cmd->SCp.Status; + cmd->result |= cmd->SCp.Message << 8; + + if (cmd->cmnd[0] == REQUEST_SENSE) + complete_cmd(instance, cmd); + else { + if (cmd->SCp.Status == SAM_STAT_CHECK_CONDITION || + cmd->SCp.Status == SAM_STAT_COMMAND_TERMINATED) { + dsprintk(NDEBUG_QUEUES, instance, "autosense: adding cmd %p to tail of autosense queue\n", + cmd); + list_add_tail(&ncmd->list, + &hostdata->autosense); + } else + complete_cmd(instance, cmd); } - local_irq_restore(flags); - - NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); /* * Restore phase bits to 0 so an interrupted selection, * arbitration can resume. */ NCR5380_write(TARGET_COMMAND_REG, 0); - while ((NCR5380_read(STATUS_REG) & SR_BSY) && !hostdata->connected) - barrier(); + /* Enable reselect interrupts */ + NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); - local_irq_save(flags); - hostdata->retain_dma_intr--; - /* ++roman: For Falcon SCSI, release the lock on the - * ST-DMA here if no other commands are waiting on the - * disconnected queue. 
- */ maybe_release_dma_irq(instance); - local_irq_restore(flags); return; case MESSAGE_REJECT: /* Accept message by clearing ACK */ NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); - /* Enable reselect interrupts */ - NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); switch (hostdata->last_message) { case HEAD_OF_QUEUE_TAG: case ORDERED_QUEUE_TAG: @@ -2274,27 +2069,20 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) cmd->device->tagged_supported = 0; hostdata->busy[cmd->device->id] |= (1 << cmd->device->lun); cmd->tag = TAG_NONE; - dprintk(NDEBUG_TAGS, "scsi%d: target %d lun %llu rejected " - "QUEUE_TAG message; tagged queuing " - "disabled\n", - HOSTNO, cmd->device->id, cmd->device->lun); + dsprintk(NDEBUG_TAGS, instance, "target %d lun %llu rejected QUEUE_TAG message; tagged queuing disabled\n", + scmd_id(cmd), cmd->device->lun); break; } break; case DISCONNECT: /* Accept message by clearing ACK */ NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); - local_irq_save(flags); - cmd->device->disconnect = 1; - LIST(cmd,hostdata->disconnected_queue); - SET_NEXT(cmd, hostdata->disconnected_queue); hostdata->connected = NULL; - hostdata->disconnected_queue = cmd; - local_irq_restore(flags); - dprintk(NDEBUG_QUEUES, "scsi%d: command for target %d lun %llu was " - "moved from connected to the " - "disconnected_queue\n", HOSTNO, - cmd->device->id, cmd->device->lun); + list_add(&ncmd->list, &hostdata->disconnected); + dsprintk(NDEBUG_INFORMATION | NDEBUG_QUEUES, + instance, "connected command %p for target %d lun %llu moved to disconnected queue\n", + cmd, scmd_id(cmd), cmd->device->lun); + /* * Restore phase bits to 0 so an interrupted selection, * arbitration can resume. @@ -2303,9 +2091,6 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) /* Enable reselect interrupts */ NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); - /* Wait for bus free to avoid nasty timeouts */ - while ((NCR5380_read(STATUS_REG) & SR_BSY) && !hostdata->connected) - barrier(); #ifdef SUN3_SCSI_VME dregs->csr |= CSR_DMA_ENABLE; #endif @@ -2324,37 +2109,30 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) case RESTORE_POINTERS: /* Accept message by clearing ACK */ NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); - /* Enable reselect interrupts */ - NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); break; case EXTENDED_MESSAGE: /* - * Extended messages are sent in the following format : - * Byte - * 0 EXTENDED_MESSAGE == 1 - * 1 length (includes one byte for code, doesn't - * include first two bytes) - * 2 code - * 3..length+1 arguments - * - * Start the extended message buffer with the EXTENDED_MESSAGE + * Start the message buffer with the EXTENDED_MESSAGE * byte, since spi_print_msg() wants the whole thing. 
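* (a length byte and a code byte follow on the wire; the transfers below read them into extended_msg[1] and extended_msg[2] before fetching the remaining arguments)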
*/ extended_msg[0] = EXTENDED_MESSAGE; /* Accept first byte by clearing ACK */ NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); - dprintk(NDEBUG_EXTENDED, "scsi%d: receiving extended message\n", HOSTNO); + spin_unlock_irq(&hostdata->lock); + + dsprintk(NDEBUG_EXTENDED, instance, "receiving extended message\n"); len = 2; data = extended_msg + 1; phase = PHASE_MSGIN; NCR5380_transfer_pio(instance, &phase, &len, &data); - dprintk(NDEBUG_EXTENDED, "scsi%d: length=%d, code=0x%02x\n", HOSTNO, - (int)extended_msg[1], (int)extended_msg[2]); + dsprintk(NDEBUG_EXTENDED, instance, "length %d, code 0x%02x\n", + (int)extended_msg[1], + (int)extended_msg[2]); - if (!len && extended_msg[1] <= - (sizeof(extended_msg) - 1)) { + if (!len && extended_msg[1] > 0 && + extended_msg[1] <= sizeof(extended_msg) - 2) { /* Accept third byte by clearing ACK */ NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); len = extended_msg[1] - 1; @@ -2362,8 +2140,8 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) phase = PHASE_MSGIN; NCR5380_transfer_pio(instance, &phase, &len, &data); - dprintk(NDEBUG_EXTENDED, "scsi%d: message received, residual %d\n", - HOSTNO, len); + dsprintk(NDEBUG_EXTENDED, instance, "message received, residual %d\n", + len); switch (extended_msg[2]) { case EXTENDED_SDTR: @@ -2373,15 +2151,18 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) tmp = 0; } } else if (len) { - printk(KERN_NOTICE "scsi%d: error receiving " - "extended message\n", HOSTNO); + shost_printk(KERN_ERR, instance, "error receiving extended message\n"); tmp = 0; } else { - printk(KERN_NOTICE "scsi%d: extended message " - "code %02x length %d is too long\n", - HOSTNO, extended_msg[2], extended_msg[1]); + shost_printk(KERN_NOTICE, instance, "extended message code %02x length %d is too long\n", + extended_msg[2], extended_msg[1]); tmp = 0; } + + spin_lock_irq(&hostdata->lock); + if (!hostdata->connected) + return; + /* Fall through to reject message */ /* @@ -2390,8 +2171,7 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) */ default: if (!tmp) { - printk(KERN_INFO "scsi%d: rejecting message ", - instance->host_no); + shost_printk(KERN_ERR, instance, "rejecting message "); spi_print_msg(extended_msg); printk("\n"); } else if (tmp != EXTENDED_MESSAGE) @@ -2414,18 +2194,11 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) hostdata->last_message = msgout; NCR5380_transfer_pio(instance, &phase, &len, &data); if (msgout == ABORT) { - local_irq_save(flags); -#ifdef SUPPORT_TAGS - cmd_free_tag(cmd); -#else - hostdata->busy[cmd->device->id] &= ~(1 << cmd->device->lun); -#endif hostdata->connected = NULL; cmd->result = DID_ERROR << 16; - NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); + complete_cmd(instance, cmd); maybe_release_dma_irq(instance); - local_irq_restore(flags); - cmd->scsi_done(cmd); + NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); return; } msgout = NOP; @@ -2447,22 +2220,25 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) cmd->SCp.Status = tmp; break; default: - printk("scsi%d: unknown phase\n", HOSTNO); + shost_printk(KERN_ERR, instance, "unknown phase\n"); NCR5380_dprint(NDEBUG_ANY, instance); } /* switch(phase) */ - } /* if (tmp * SR_REQ) */ - } /* while (1) */ + } else { + spin_unlock_irq(&hostdata->lock); + NCR5380_poll_politely(instance, STATUS_REG, SR_REQ, SR_REQ, HZ); + spin_lock_irq(&hostdata->lock); + } + } } /* * Function : void NCR5380_reselect (struct Scsi_Host *instance) * * Purpose : does 
reselection, initializing the instance->connected - * field to point to the scsi_cmnd for which the I_T_L or I_T_L_Q - * nexus has been reestablished, + * field to point to the scsi_cmnd for which the I_T_L or I_T_L_Q + * nexus has been reestablished, * * Inputs : instance - this instance of the NCR5380. - * */ @@ -2471,7 +2247,7 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) static void NCR5380_reselect(struct Scsi_Host *instance) { - SETUP_HOSTDATA(instance); + struct NCR5380_hostdata *hostdata = shost_priv(instance); unsigned char target_mask; unsigned char lun; #ifdef SUPPORT_TAGS @@ -2480,7 +2256,8 @@ static void NCR5380_reselect(struct Scsi_Host *instance) unsigned char msg[3]; int __maybe_unused len; unsigned char __maybe_unused *data, __maybe_unused phase; - struct scsi_cmnd *tmp = NULL, *prev; + struct NCR5380_cmd *ncmd; + struct scsi_cmnd *tmp; /* * Disable arbitration, etc. since the host adapter obviously @@ -2488,11 +2265,10 @@ static void NCR5380_reselect(struct Scsi_Host *instance) */ NCR5380_write(MODE_REG, MR_BASE); - hostdata->restart_select = 1; target_mask = NCR5380_read(CURRENT_SCSI_DATA_REG) & ~(hostdata->id_mask); - dprintk(NDEBUG_RESELECTION, "scsi%d: reselect\n", HOSTNO); + dsprintk(NDEBUG_RESELECTION, instance, "reselect\n"); /* * At this point, we have detected that our SCSI ID is on the bus, @@ -2504,17 +2280,22 @@ static void NCR5380_reselect(struct Scsi_Host *instance) */ NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_BSY); - - while (NCR5380_read(STATUS_REG) & SR_SEL) - ; + if (NCR5380_poll_politely(instance, + STATUS_REG, SR_SEL, 0, 2 * HZ) < 0) { + NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); + return; + } NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); /* * Wait for target to go into MSGIN. */ - while (!(NCR5380_read(STATUS_REG) & SR_REQ)) - ; + if (NCR5380_poll_politely(instance, + STATUS_REG, SR_REQ, SR_REQ, 2 * HZ) < 0) { + do_abort(instance); + return; + } #if defined(CONFIG_SUN3) && defined(REAL_DMA) /* acknowledge toggle to MSGIN */ @@ -2527,15 +2308,21 @@ static void NCR5380_reselect(struct Scsi_Host *instance) data = msg; phase = PHASE_MSGIN; NCR5380_transfer_pio(instance, &phase, &len, &data); + + if (len) { + do_abort(instance); + return; + } #endif if (!(msg[0] & 0x80)) { - printk(KERN_DEBUG "scsi%d: expecting IDENTIFY message, got ", HOSTNO); + shost_printk(KERN_ERR, instance, "expecting IDENTIFY message, got "); spi_print_msg(msg); + printk("\n"); do_abort(instance); return; } - lun = (msg[0] & 0x07); + lun = msg[0] & 0x07; #if defined(SUPPORT_TAGS) && !defined(CONFIG_SUN3) /* If the phase is still MSGIN, the target wants to send some more @@ -2551,8 +2338,8 @@ static void NCR5380_reselect(struct Scsi_Host *instance) if (!NCR5380_transfer_pio(instance, &phase, &len, &data) && msg[1] == SIMPLE_QUEUE_TAG) tag = msg[2]; - dprintk(NDEBUG_TAGS, "scsi%d: target mask %02x, lun %d sent tag %d at " - "reselection\n", HOSTNO, target_mask, lun, tag); + dsprintk(NDEBUG_TAGS, instance, "reselect: target mask %02x, lun %d sent tag %d\n", + target_mask, lun, tag); } #endif @@ -2561,36 +2348,34 @@ static void NCR5380_reselect(struct Scsi_Host *instance) * just reestablished, and remove it from the disconnected queue. 
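* The list search below matches on target mask and LUN, and also on tag when SUPPORT_TAGS is enabled.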
*/ - for (tmp = (struct scsi_cmnd *) hostdata->disconnected_queue, prev = NULL; - tmp; prev = tmp, tmp = NEXT(tmp)) { - if ((target_mask == (1 << tmp->device->id)) && (lun == tmp->device->lun + tmp = NULL; + list_for_each_entry(ncmd, &hostdata->disconnected, list) { + struct scsi_cmnd *cmd = NCR5380_to_scmd(ncmd); + + if (target_mask == (1 << scmd_id(cmd)) && + lun == (u8)cmd->device->lun #ifdef SUPPORT_TAGS - && (tag == cmd->tag) #endif ) { - if (prev) { - REMOVE(prev, NEXT(prev), tmp, NEXT(tmp)); - SET_NEXT(prev, NEXT(tmp)); - } else { - REMOVE(-1, hostdata->disconnected_queue, tmp, NEXT(tmp)); - hostdata->disconnected_queue = NEXT(tmp); - } - SET_NEXT(tmp, NULL); + list_del(&ncmd->list); + tmp = cmd; break; } } - if (!tmp) { - printk(KERN_WARNING "scsi%d: warning: target bitmask %02x lun %d " -#ifdef SUPPORT_TAGS - "tag %d " -#endif - "not in disconnected_queue.\n", - HOSTNO, target_mask, lun + if (tmp) { + dsprintk(NDEBUG_RESELECTION | NDEBUG_QUEUES, instance, + "reselect: removed %p from disconnected queue\n", tmp); + } else { + #ifdef SUPPORT_TAGS - , tag + shost_printk(KERN_ERR, instance, "target bitmask 0x%02x lun %d tag %d not in disconnected queue.\n", + target_mask, lun, tag); +#else + shost_printk(KERN_ERR, instance, "target bitmask 0x%02x lun %d not in disconnected queue.\n", + target_mask, lun); #endif - ); /* * Since we have an established nexus that we can't do anything * with, we must abort it. @@ -2614,7 +2399,8 @@ static void NCR5380_reselect(struct Scsi_Host *instance) } /* setup this command for dma if not already */ if ((count >= DMA_MIN_SIZE) && (sun3_dma_setup_done != tmp)) { - sun3scsi_dma_setup(d, count, rq_data_dir(tmp->request)); + sun3scsi_dma_setup(instance, d, count, + rq_data_dir(tmp->request)); sun3_dma_setup_done = tmp; } } @@ -2639,235 +2425,196 @@ static void NCR5380_reselect(struct Scsi_Host *instance) if (!NCR5380_transfer_pio(instance, &phase, &len, &data) && msg[1] == SIMPLE_QUEUE_TAG) tag = msg[2]; - dprintk(NDEBUG_TAGS, "scsi%d: target mask %02x, lun %d sent tag %d at reselection\n", HOSTNO, target_mask, lun, tag); + dsprintk(NDEBUG_TAGS, instance, "reselect: target mask %02x, lun %d sent tag %d\n", + target_mask, lun, tag); } #endif hostdata->connected = tmp; - dprintk(NDEBUG_RESELECTION, "scsi%d: nexus established, target = %d, lun = %llu, tag = %d\n", - HOSTNO, tmp->device->id, tmp->device->lun, tmp->tag); + dsprintk(NDEBUG_RESELECTION, instance, "nexus established, target %d, lun %llu, tag %d\n", + scmd_id(tmp), tmp->device->lun, tmp->tag); } -/* - * Function : int NCR5380_abort (struct scsi_cmnd *cmd) - * - * Purpose : abort a command - * - * Inputs : cmd - the scsi_cmnd to abort, code - code to set the - * host byte of the result field to, if zero DID_ABORTED is - * used. - * - * Returns : SUCCESS - success, FAILED on failure. - * - * XXX - there is no way to abort the command that is currently - * connected, you have to wait for it to complete. If this is - * a problem, we could implement longjmp() / setjmp(), setjmp() - * called where the loop started in NCR5380_main(). 
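The list helpers introduced below rely on the kernel's intrusive-list pattern: a struct list_head is embedded in the per-command private data (struct NCR5380_cmd, reserved via cmd_size in the host template), and NCR5380_to_scmd() uses container_of() to recover the enclosing scsi_cmnd. The following is a minimal stand-alone sketch of that pattern, not part of the patch; find_cmd, struct cmd_priv and the simplified struct scsi_cmnd are hypothetical stand-ins for the driver's real definitions.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Minimal stand-ins for the kernel's list_head and container_of(). */
struct list_head { struct list_head *next, *prev; };

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }

static void list_add_tail(struct list_head *n, struct list_head *h)
{
	n->prev = h->prev;
	n->next = h;
	h->prev->next = n;
	h->prev = n;
}

/* Hypothetical per-command private data with an embedded list node. */
struct cmd_priv { struct list_head list; };

/* Simplified command structure standing in for struct scsi_cmnd. */
struct scsi_cmnd { int tag; struct cmd_priv priv; };

/* Same shape as the driver's list_find_cmd(): walk the intrusive list
 * and compare each embedded node against the needle's node. */
static bool find_cmd(struct list_head *haystack, struct scsi_cmnd *needle)
{
	struct list_head *pos;

	for (pos = haystack->next; pos != haystack; pos = pos->next)
		if (container_of(pos, struct cmd_priv, list) == &needle->priv)
			return true;
	return false;
}

int main(void)
{
	struct list_head disconnected;
	struct scsi_cmnd a = { .tag = 1 }, b = { .tag = 2 };

	INIT_LIST_HEAD(&disconnected);
	list_add_tail(&a.priv.list, &disconnected);
	printf("a queued: %d, b queued: %d\n",
	       find_cmd(&disconnected, &a), find_cmd(&disconnected, &b));
	return 0;
}

Keeping the node inside the command means queue membership costs no allocation and removal is O(1) via list_del(), which is what lets this patch drop the hand-rolled NEXT()/SET_NEXT() pointer chains.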
+/** + * list_find_cmd - test for presence of a command in a linked list + * @haystack: list of commands + * @needle: command to search for */ -static -int NCR5380_abort(struct scsi_cmnd *cmd) +static bool list_find_cmd(struct list_head *haystack, + struct scsi_cmnd *needle) { - struct Scsi_Host *instance = cmd->device->host; - SETUP_HOSTDATA(instance); - struct scsi_cmnd *tmp, **prev; - unsigned long flags; + struct NCR5380_cmd *ncmd; - scmd_printk(KERN_NOTICE, cmd, "aborting command\n"); + list_for_each_entry(ncmd, haystack, list) + if (NCR5380_to_scmd(ncmd) == needle) + return true; + return false; +} - NCR5380_print_status(instance); +/** + * list_del_cmd - remove a command from linked list + * @haystack: list of commands + * @needle: command to remove + */ - local_irq_save(flags); +static bool list_del_cmd(struct list_head *haystack, + struct scsi_cmnd *needle) +{ + if (list_find_cmd(haystack, needle)) { + struct NCR5380_cmd *ncmd = scsi_cmd_priv(needle); - dprintk(NDEBUG_ABORT, "scsi%d: abort called basr 0x%02x, sr 0x%02x\n", HOSTNO, - NCR5380_read(BUS_AND_STATUS_REG), - NCR5380_read(STATUS_REG)); + list_del(&ncmd->list); + return true; + } + return false; +} -#if 1 - /* - * Case 1 : If the command is the currently executing command, - * we'll set the aborted flag and return control so that - * information transfer routine can exit cleanly. - */ +/** + * NCR5380_abort - scsi host eh_abort_handler() method + * @cmd: the command to be aborted + * + * Try to abort a given command by removing it from queues and/or sending + * the target an abort message. This may not succeed in causing a target + * to abort the command. Nonetheless, the low-level driver must forget about + * the command because the mid-layer reclaims it and it may be re-issued. + * + * The normal path taken by a command is as follows. For EH we trace this + * same path to locate and abort the command. + * + * unissued -> selecting -> [unissued -> selecting ->]... connected -> + * [disconnected -> connected ->]... + * [autosense -> connected ->] done + * + * If cmd is unissued then just remove it. + * If cmd is disconnected, try to select the target. + * If cmd is connected, try to send an abort message. + * If cmd is waiting for autosense, give it a chance to complete but check + * that it isn't left connected. + * If cmd was not found at all then presumably it has already been completed, + * in which case return SUCCESS to try to avoid further EH measures. + * If the command has not completed yet, we must not fail to find it. + */ - if (hostdata->connected == cmd) { +static int NCR5380_abort(struct scsi_cmnd *cmd) +{ + struct Scsi_Host *instance = cmd->device->host; + struct NCR5380_hostdata *hostdata = shost_priv(instance); + unsigned long flags; + int result = SUCCESS; - dprintk(NDEBUG_ABORT, "scsi%d: aborting connected command\n", HOSTNO); - /* - * We should perform BSY checking, and make sure we haven't slipped - * into BUS FREE. - */ + spin_lock_irqsave(&hostdata->lock, flags); - /* NCR5380_write(INITIATOR_COMMAND_REG, ICR_ASSERT_ATN); */ - /* - * Since we can't change phases until we've completed the current - * handshake, we have to source or sink a byte of data if the current - * phase is not MSGOUT. - */ +#if (NDEBUG & NDEBUG_ANY) + scmd_printk(KERN_INFO, cmd, __func__); +#endif + NCR5380_dprint(NDEBUG_ANY, instance); + NCR5380_dprint_phase(NDEBUG_ANY, instance); - /* - * Return control to the executing NCR drive so we can clear the - * aborted flag and get back into our main loop. 
- */ + if (list_del_cmd(&hostdata->unissued, cmd)) { + dsprintk(NDEBUG_ABORT, instance, + "abort: removed %p from issue queue\n", cmd); + cmd->result = DID_ABORT << 16; + cmd->scsi_done(cmd); /* No tag or busy flag to worry about */ + } - if (do_abort(instance) == 0) { - hostdata->aborted = 1; - hostdata->connected = NULL; - cmd->result = DID_ABORT << 16; -#ifdef SUPPORT_TAGS - cmd_free_tag(cmd); -#else - hostdata->busy[cmd->device->id] &= ~(1 << cmd->device->lun); -#endif - maybe_release_dma_irq(instance); - local_irq_restore(flags); - cmd->scsi_done(cmd); - return SUCCESS; - } else { - local_irq_restore(flags); - printk("scsi%d: abort of connected command failed!\n", HOSTNO); - return FAILED; - } + if (hostdata->selecting == cmd) { + dsprintk(NDEBUG_ABORT, instance, + "abort: cmd %p == selecting\n", cmd); + hostdata->selecting = NULL; + cmd->result = DID_ABORT << 16; + complete_cmd(instance, cmd); + goto out; } -#endif - /* - * Case 2 : If the command hasn't been issued yet, we simply remove it - * from the issue queue. - */ - for (prev = (struct scsi_cmnd **)&(hostdata->issue_queue), - tmp = (struct scsi_cmnd *)hostdata->issue_queue; - tmp; prev = NEXTADDR(tmp), tmp = NEXT(tmp)) { - if (cmd == tmp) { - REMOVE(5, *prev, tmp, NEXT(tmp)); - (*prev) = NEXT(tmp); - SET_NEXT(tmp, NULL); - tmp->result = DID_ABORT << 16; - maybe_release_dma_irq(instance); - local_irq_restore(flags); - dprintk(NDEBUG_ABORT, "scsi%d: abort removed command from issue queue.\n", - HOSTNO); - /* Tagged queuing note: no tag to free here, hasn't been assigned - * yet... */ - tmp->scsi_done(tmp); - return SUCCESS; + if (list_del_cmd(&hostdata->disconnected, cmd)) { + dsprintk(NDEBUG_ABORT, instance, + "abort: removed %p from disconnected list\n", cmd); + cmd->result = DID_ERROR << 16; + if (!hostdata->connected) + NCR5380_select(instance, cmd); + if (hostdata->connected != cmd) { + complete_cmd(instance, cmd); + result = FAILED; + goto out; } } - /* - * Case 3 : If any commands are connected, we're going to fail the abort - * and let the high level SCSI driver retry at a later time or - * issue a reset. - * - * Timeouts, and therefore aborted commands, will be highly unlikely - * and handling them cleanly in this situation would make the common - * case of noresets less efficient, and would pollute our code. So, - * we fail. - */ + if (hostdata->connected == cmd) { + dsprintk(NDEBUG_ABORT, instance, "abort: cmd %p is connected\n", cmd); + hostdata->connected = NULL; + if (do_abort(instance)) { + set_host_byte(cmd, DID_ERROR); + complete_cmd(instance, cmd); + result = FAILED; + goto out; + } + set_host_byte(cmd, DID_ABORT); +#ifdef REAL_DMA + hostdata->dma_len = 0; +#endif + if (cmd->cmnd[0] == REQUEST_SENSE) + complete_cmd(instance, cmd); + else { + struct NCR5380_cmd *ncmd = scsi_cmd_priv(cmd); - if (hostdata->connected) { - local_irq_restore(flags); - dprintk(NDEBUG_ABORT, "scsi%d: abort failed, command connected.\n", HOSTNO); - return FAILED; + /* Perform autosense for this command */ + list_add(&ncmd->list, &hostdata->autosense); + } } - /* - * Case 4: If the command is currently disconnected from the bus, and - * there are no connected commands, we reconnect the I_T_L or - * I_T_L_Q nexus associated with it, go into message out, and send - * an abort message. - * - * This case is especially ugly. In order to reestablish the nexus, we - * need to call NCR5380_select(). 
The easiest way to implement this - * function was to abort if the bus was busy, and let the interrupt - * handler triggered on the SEL for reselect take care of lost arbitrations - * where necessary, meaning interrupts need to be enabled. - * - * When interrupts are enabled, the queues may change - so we - * can't remove it from the disconnected queue before selecting it - * because that could cause a failure in hashing the nexus if that - * device reselected. - * - * Since the queues may change, we can't use the pointers from when we - * first locate it. - * - * So, we must first locate the command, and if NCR5380_select() - * succeeds, then issue the abort, relocate the command and remove - * it from the disconnected queue. - */ - - for (tmp = (struct scsi_cmnd *) hostdata->disconnected_queue; tmp; - tmp = NEXT(tmp)) { - if (cmd == tmp) { - local_irq_restore(flags); - dprintk(NDEBUG_ABORT, "scsi%d: aborting disconnected command.\n", HOSTNO); - - if (NCR5380_select(instance, cmd)) - return FAILED; - - dprintk(NDEBUG_ABORT, "scsi%d: nexus reestablished.\n", HOSTNO); - - do_abort(instance); - - local_irq_save(flags); - for (prev = (struct scsi_cmnd **)&(hostdata->disconnected_queue), - tmp = (struct scsi_cmnd *)hostdata->disconnected_queue; - tmp; prev = NEXTADDR(tmp), tmp = NEXT(tmp)) { - if (cmd == tmp) { - REMOVE(5, *prev, tmp, NEXT(tmp)); - *prev = NEXT(tmp); - SET_NEXT(tmp, NULL); - tmp->result = DID_ABORT << 16; - /* We must unlock the tag/LUN immediately here, since the - * target goes to BUS FREE and doesn't send us another - * message (COMMAND_COMPLETE or the like) - */ -#ifdef SUPPORT_TAGS - cmd_free_tag(tmp); -#else - hostdata->busy[cmd->device->id] &= ~(1 << cmd->device->lun); -#endif - maybe_release_dma_irq(instance); - local_irq_restore(flags); - tmp->scsi_done(tmp); - return SUCCESS; - } - } + if (list_find_cmd(&hostdata->autosense, cmd)) { + dsprintk(NDEBUG_ABORT, instance, + "abort: found %p on sense queue\n", cmd); + spin_unlock_irqrestore(&hostdata->lock, flags); + queue_work(hostdata->work_q, &hostdata->main_task); + msleep(1000); + spin_lock_irqsave(&hostdata->lock, flags); + if (list_del_cmd(&hostdata->autosense, cmd)) { + dsprintk(NDEBUG_ABORT, instance, + "abort: removed %p from sense queue\n", cmd); + set_host_byte(cmd, DID_ABORT); + complete_cmd(instance, cmd); + goto out; } } - /* Maybe it is sufficient just to release the ST-DMA lock... (if - * possible at all) At least, we should check if the lock could be - * released after the abort, in case it is kept due to some bug. - */ - maybe_release_dma_irq(instance); - local_irq_restore(flags); + if (hostdata->connected == cmd) { + dsprintk(NDEBUG_ABORT, instance, "abort: cmd %p is connected\n", cmd); + hostdata->connected = NULL; + if (do_abort(instance)) { + set_host_byte(cmd, DID_ERROR); + complete_cmd(instance, cmd); + result = FAILED; + goto out; + } + set_host_byte(cmd, DID_ABORT); +#ifdef REAL_DMA + hostdata->dma_len = 0; +#endif + complete_cmd(instance, cmd); + } - /* - * Case 5 : If we reached this point, the command was not found in any of - * the queues. - * - * We probably reached this point because of an unlikely race condition - * between the command completing successfully and the abortion code, - * so we won't panic, but we will notify the user in case something really - * broke. 
- */ +out: + if (result == FAILED) + dsprintk(NDEBUG_ABORT, instance, "abort: failed to abort %p\n", cmd); + else + dsprintk(NDEBUG_ABORT, instance, "abort: successfully aborted %p\n", cmd); - printk(KERN_INFO "scsi%d: warning : SCSI command probably completed successfully before abortion\n", HOSTNO); + queue_work(hostdata->work_q, &hostdata->main_task); + maybe_release_dma_irq(instance); + spin_unlock_irqrestore(&hostdata->lock, flags); - return FAILED; + return result; } -/* - * Function : int NCR5380_reset (struct scsi_cmnd *cmd) - * - * Purpose : reset the SCSI bus. - * - * Returns : SUCCESS or FAILURE +/** + * NCR5380_bus_reset - reset the SCSI bus + * @cmd: SCSI command undergoing EH * + * Returns SUCCESS */ static int NCR5380_bus_reset(struct scsi_cmnd *cmd) @@ -2876,23 +2623,22 @@ static int NCR5380_bus_reset(struct scsi_cmnd *cmd) struct NCR5380_hostdata *hostdata = shost_priv(instance); int i; unsigned long flags; + struct NCR5380_cmd *ncmd; - NCR5380_print_status(instance); + spin_lock_irqsave(&hostdata->lock, flags); + +#if (NDEBUG & NDEBUG_ANY) + scmd_printk(KERN_INFO, cmd, __func__); +#endif + NCR5380_dprint(NDEBUG_ANY, instance); + NCR5380_dprint_phase(NDEBUG_ANY, instance); + + do_reset(instance); - /* get in phase */ - NCR5380_write(TARGET_COMMAND_REG, - PHASE_SR_TO_TCR(NCR5380_read(STATUS_REG))); - /* assert RST */ - NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_RST); - udelay(40); /* reset NCR registers */ - NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); NCR5380_write(MODE_REG, MR_BASE); NCR5380_write(TARGET_COMMAND_REG, 0); NCR5380_write(SELECT_ENABLE_REG, 0); - /* ++roman: reset interrupt condition! otherwise no interrupts don't get - * through anymore ... */ - (void)NCR5380_read(RESET_PARITY_INTERRUPT_REG); /* After the reset, there are no more connected or disconnected commands * and no busy units; so clear the low-level status here to avoid @@ -2900,17 +2646,34 @@ static int NCR5380_bus_reset(struct scsi_cmnd *cmd) * commands! 
*/ - if (hostdata->issue_queue) - dprintk(NDEBUG_ABORT, "scsi%d: reset aborted issued command(s)\n", H_NO(cmd)); - if (hostdata->connected) - dprintk(NDEBUG_ABORT, "scsi%d: reset aborted a connected command\n", H_NO(cmd)); - if (hostdata->disconnected_queue) - dprintk(NDEBUG_ABORT, "scsi%d: reset aborted disconnected command(s)\n", H_NO(cmd)); + hostdata->selecting = NULL; + + list_for_each_entry(ncmd, &hostdata->disconnected, list) { + struct scsi_cmnd *cmd = NCR5380_to_scmd(ncmd); + + set_host_byte(cmd, DID_RESET); + cmd->scsi_done(cmd); + } + + list_for_each_entry(ncmd, &hostdata->autosense, list) { + struct scsi_cmnd *cmd = NCR5380_to_scmd(ncmd); + + set_host_byte(cmd, DID_RESET); + cmd->scsi_done(cmd); + } + + if (hostdata->connected) { + set_host_byte(hostdata->connected, DID_RESET); + complete_cmd(instance, hostdata->connected); + hostdata->connected = NULL; + } + + if (hostdata->sensing) { + set_host_byte(hostdata->sensing, DID_RESET); + complete_cmd(instance, hostdata->sensing); + hostdata->sensing = NULL; + } - local_irq_save(flags); - hostdata->issue_queue = NULL; - hostdata->connected = NULL; - hostdata->disconnected_queue = NULL; #ifdef SUPPORT_TAGS free_all_tags(hostdata); #endif @@ -2920,8 +2683,9 @@ static int NCR5380_bus_reset(struct scsi_cmnd *cmd) hostdata->dma_len = 0; #endif + queue_work(hostdata->work_q, &hostdata->main_task); maybe_release_dma_irq(instance); - local_irq_restore(flags); + spin_unlock_irqrestore(&hostdata->lock, flags); return SUCCESS; } diff --git a/drivers/scsi/atari_scsi.c b/drivers/scsi/atari_scsi.c index 5ede3daa93dc..78d1b2963f2c 100644 --- a/drivers/scsi/atari_scsi.c +++ b/drivers/scsi/atari_scsi.c @@ -66,7 +66,6 @@ #include <linux/module.h> #include <linux/types.h> -#include <linux/delay.h> #include <linux/blkdev.h> #include <linux/interrupt.h> #include <linux/init.h> @@ -98,7 +97,6 @@ #define NCR5380_queue_command atari_scsi_queue_command #define NCR5380_abort atari_scsi_abort -#define NCR5380_show_info atari_scsi_show_info #define NCR5380_info atari_scsi_info #define NCR5380_dma_read_setup(instance, data, count) \ @@ -161,23 +159,10 @@ static inline unsigned long SCSI_DMA_GETADR(void) return adr; } -#define HOSTDATA_DMALEN (((struct NCR5380_hostdata *) \ - (atari_scsi_host->hostdata))->dma_len) - -/* Time (in jiffies) to wait after a reset; the SCSI standard calls for 250ms, - * we usually do 0.5s to be on the safe side. But Toshiba CD-ROMs once more - * need ten times the standard value... 
*/ -#ifndef CONFIG_ATARI_SCSI_TOSHIBA_DELAY -#define AFTER_RESET_DELAY (HZ/2) -#else -#define AFTER_RESET_DELAY (5*HZ/2) -#endif - #ifdef REAL_DMA static void atari_scsi_fetch_restbytes(void); #endif -static struct Scsi_Host *atari_scsi_host; static unsigned char (*atari_scsi_reg_read)(unsigned char reg); static void (*atari_scsi_reg_write)(unsigned char reg, unsigned char value); @@ -208,12 +193,12 @@ static int setup_cmd_per_lun = -1; module_param(setup_cmd_per_lun, int, 0); static int setup_sg_tablesize = -1; module_param(setup_sg_tablesize, int, 0); -#ifdef SUPPORT_TAGS static int setup_use_tagged_queuing = -1; module_param(setup_use_tagged_queuing, int, 0); -#endif static int setup_hostid = -1; module_param(setup_hostid, int, 0); +static int setup_toshiba_delay = -1; +module_param(setup_toshiba_delay, int, 0); #if defined(REAL_DMA) @@ -273,15 +258,17 @@ static void scsi_dma_buserr(int irq, void *dummy) #endif -static irqreturn_t scsi_tt_intr(int irq, void *dummy) +static irqreturn_t scsi_tt_intr(int irq, void *dev) { #ifdef REAL_DMA + struct Scsi_Host *instance = dev; + struct NCR5380_hostdata *hostdata = shost_priv(instance); int dma_stat; dma_stat = tt_scsi_dma.dma_ctrl; - dprintk(NDEBUG_INTR, "scsi%d: NCR5380 interrupt, DMA status = %02x\n", - atari_scsi_host->host_no, dma_stat & 0xff); + dsprintk(NDEBUG_INTR, instance, "NCR5380 interrupt, DMA status = %02x\n", + dma_stat & 0xff); /* Look if it was the DMA that has interrupted: First possibility * is that a bus error occurred... @@ -304,7 +291,8 @@ static irqreturn_t scsi_tt_intr(int irq, void *dummy) * data reg! */ if ((dma_stat & 0x02) && !(dma_stat & 0x40)) { - atari_dma_residual = HOSTDATA_DMALEN - (SCSI_DMA_READ_P(dma_addr) - atari_dma_startaddr); + atari_dma_residual = hostdata->dma_len - + (SCSI_DMA_READ_P(dma_addr) - atari_dma_startaddr); dprintk(NDEBUG_DMA, "SCSI DMA: There are %ld residual bytes.\n", atari_dma_residual); @@ -356,15 +344,17 @@ static irqreturn_t scsi_tt_intr(int irq, void *dummy) #endif /* REAL_DMA */ - NCR5380_intr(irq, dummy); + NCR5380_intr(irq, dev); return IRQ_HANDLED; } -static irqreturn_t scsi_falcon_intr(int irq, void *dummy) +static irqreturn_t scsi_falcon_intr(int irq, void *dev) { #ifdef REAL_DMA + struct Scsi_Host *instance = dev; + struct NCR5380_hostdata *hostdata = shost_priv(instance); int dma_stat; /* Turn off DMA and select sector counter register before @@ -399,7 +389,7 @@ static irqreturn_t scsi_falcon_intr(int irq, void *dummy) printk(KERN_ERR "SCSI DMA error: %ld bytes lost in " "ST-DMA fifo\n", transferred & 15); - atari_dma_residual = HOSTDATA_DMALEN - transferred; + atari_dma_residual = hostdata->dma_len - transferred; dprintk(NDEBUG_DMA, "SCSI DMA: There are %ld residual bytes.\n", atari_dma_residual); } else @@ -411,13 +401,14 @@ static irqreturn_t scsi_falcon_intr(int irq, void *dummy) * data to the original destination address. */ memcpy(atari_dma_orig_addr, phys_to_virt(atari_dma_startaddr), - HOSTDATA_DMALEN - atari_dma_residual); + hostdata->dma_len - atari_dma_residual); atari_dma_orig_addr = NULL; } #endif /* REAL_DMA */ - NCR5380_intr(irq, dummy); + NCR5380_intr(irq, dev); + return IRQ_HANDLED; } @@ -488,7 +479,7 @@ static int __init atari_scsi_setup(char *str) * Defaults depend on TT or Falcon, determined at run time. * Negative values mean don't change. 
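* Option order: can_queue, cmd_per_lun, sg_tablesize, hostid, tagged queuing, use_pdma (ignored here), toshiba delay.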
*/ - int ints[6]; + int ints[8]; get_options(str, ARRAY_SIZE(ints), ints); @@ -504,10 +495,11 @@ static int __init atari_scsi_setup(char *str) setup_sg_tablesize = ints[3]; if (ints[0] >= 4) setup_hostid = ints[4]; -#ifdef SUPPORT_TAGS if (ints[0] >= 5) setup_use_tagged_queuing = ints[5]; -#endif + /* ints[6] (use_pdma) is ignored */ + if (ints[0] >= 7) + setup_toshiba_delay = ints[7]; return 1; } @@ -516,38 +508,6 @@ __setup("atascsi=", atari_scsi_setup); #endif /* !MODULE */ -#ifdef CONFIG_ATARI_SCSI_RESET_BOOT -static void __init atari_scsi_reset_boot(void) -{ - unsigned long end; - - /* - * Do a SCSI reset to clean up the bus during initialization. No messing - * with the queues, interrupts, or locks necessary here. - */ - - printk("Atari SCSI: resetting the SCSI bus..."); - - /* get in phase */ - NCR5380_write(TARGET_COMMAND_REG, - PHASE_SR_TO_TCR(NCR5380_read(STATUS_REG))); - - /* assert RST */ - NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_RST); - /* The min. reset hold time is 25us, so 40us should be enough */ - udelay(50); - /* reset RST and interrupt */ - NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); - NCR5380_read(RESET_PARITY_INTERRUPT_REG); - - end = jiffies + AFTER_RESET_DELAY; - while (time_before(jiffies, end)) - barrier(); - - printk(" done\n"); -} -#endif - #if defined(REAL_DMA) static unsigned long atari_scsi_dma_setup(struct Scsi_Host *instance, @@ -815,14 +775,14 @@ static int atari_scsi_bus_reset(struct scsi_cmnd *cmd) static struct scsi_host_template atari_scsi_template = { .module = THIS_MODULE, .proc_name = DRV_MODULE_NAME, - .show_info = atari_scsi_show_info, .name = "Atari native SCSI", .info = atari_scsi_info, .queuecommand = atari_scsi_queue_command, .eh_abort_handler = atari_scsi_abort, .eh_bus_reset_handler = atari_scsi_bus_reset, .this_id = 7, - .use_clustering = DISABLE_CLUSTERING + .use_clustering = DISABLE_CLUSTERING, + .cmd_size = NCR5380_CMD_SIZE, }; static int __init atari_scsi_probe(struct platform_device *pdev) @@ -880,7 +840,7 @@ static int __init atari_scsi_probe(struct platform_device *pdev) } else { /* Test if a host id is set in the NVRam */ if (ATARIHW_PRESENT(TT_CLK) && nvram_check_checksum()) { - unsigned char b = nvram_read_byte(14); + unsigned char b = nvram_read_byte(16); /* Arbitration enabled? (for TOS) * If yes, use configured host ID @@ -915,21 +875,18 @@ static int __init atari_scsi_probe(struct platform_device *pdev) error = -ENOMEM; goto fail_alloc; } - atari_scsi_host = instance; - -#ifdef CONFIG_ATARI_SCSI_RESET_BOOT - atari_scsi_reset_boot(); -#endif instance->irq = irq->start; host_flags |= IS_A_TT() ? 0 : FLAG_LATE_DMA_SETUP; - #ifdef SUPPORT_TAGS host_flags |= setup_use_tagged_queuing > 0 ? FLAG_TAGGED_QUEUING : 0; #endif + host_flags |= setup_toshiba_delay > 0 ? 
FLAG_TOSHIBA_DELAY : 0; - NCR5380_init(instance, host_flags); + error = NCR5380_init(instance, host_flags); + if (error) + goto fail_init; if (IS_A_TT()) { error = request_irq(instance->irq, scsi_tt_intr, 0, @@ -975,6 +932,8 @@ static int __init atari_scsi_probe(struct platform_device *pdev) #endif } + NCR5380_maybe_reset_bus(instance); + error = scsi_add_host(instance, NULL); if (error) goto fail_host; @@ -989,6 +948,7 @@ fail_host: free_irq(instance->irq, instance); fail_irq: NCR5380_exit(instance); +fail_init: scsi_host_put(instance); fail_alloc: if (atari_dma_buffer) diff --git a/drivers/scsi/be2iscsi/Kconfig b/drivers/scsi/be2iscsi/Kconfig index 4e7cad272469..bad5f32e1f67 100644 --- a/drivers/scsi/be2iscsi/Kconfig +++ b/drivers/scsi/be2iscsi/Kconfig @@ -3,6 +3,7 @@ config BE2ISCSI depends on PCI && SCSI && NET select SCSI_ISCSI_ATTRS select ISCSI_BOOT_SYSFS + select IRQ_POLL help This driver implements the iSCSI functionality for Emulex diff --git a/drivers/scsi/be2iscsi/be.h b/drivers/scsi/be2iscsi/be.h index 77f992e74726..a41c6432f444 100644 --- a/drivers/scsi/be2iscsi/be.h +++ b/drivers/scsi/be2iscsi/be.h @@ -20,7 +20,7 @@ #include <linux/pci.h> #include <linux/if_vlan.h> -#include <linux/blk-iopoll.h> +#include <linux/irq_poll.h> #define FW_VER_LEN 32 #define MCC_Q_LEN 128 #define MCC_CQ_LEN 256 @@ -101,7 +101,7 @@ struct be_eq_obj { struct beiscsi_hba *phba; struct be_queue_info *cq; struct work_struct work_cqs; /* Work Item */ - struct blk_iopoll iopoll; + struct irq_poll iopoll; }; struct be_mcc_obj { diff --git a/drivers/scsi/be2iscsi/be_iscsi.c b/drivers/scsi/be2iscsi/be_iscsi.c index b7087ba69d8d..022e87b62e40 100644 --- a/drivers/scsi/be2iscsi/be_iscsi.c +++ b/drivers/scsi/be2iscsi/be_iscsi.c @@ -1292,9 +1292,9 @@ static void beiscsi_flush_cq(struct beiscsi_hba *phba) for (i = 0; i < phba->num_cpus; i++) { pbe_eq = &phwi_context->be_eq[i]; - blk_iopoll_disable(&pbe_eq->iopoll); + irq_poll_disable(&pbe_eq->iopoll); beiscsi_process_cq(pbe_eq); - blk_iopoll_enable(&pbe_eq->iopoll); + irq_poll_enable(&pbe_eq->iopoll); } } diff --git a/drivers/scsi/be2iscsi/be_main.c b/drivers/scsi/be2iscsi/be_main.c index fe0c5143f8e6..cb9072a841be 100644 --- a/drivers/scsi/be2iscsi/be_main.c +++ b/drivers/scsi/be2iscsi/be_main.c @@ -910,8 +910,7 @@ static irqreturn_t be_isr_msix(int irq, void *dev_id) num_eq_processed = 0; while (eqe->dw[offsetof(struct amap_eq_entry, valid) / 32] & EQE_VALID_MASK) { - if (!blk_iopoll_sched_prep(&pbe_eq->iopoll)) - blk_iopoll_sched(&pbe_eq->iopoll); + irq_poll_sched(&pbe_eq->iopoll); AMAP_SET_BITS(struct amap_eq_entry, valid, eqe, 0); queue_tail_inc(eq); @@ -972,8 +971,7 @@ static irqreturn_t be_isr(int irq, void *dev_id) spin_unlock_irqrestore(&phba->isr_lock, flags); num_mcceq_processed++; } else { - if (!blk_iopoll_sched_prep(&pbe_eq->iopoll)) - blk_iopoll_sched(&pbe_eq->iopoll); + irq_poll_sched(&pbe_eq->iopoll); num_ioeq_processed++; } AMAP_SET_BITS(struct amap_eq_entry, valid, eqe, 0); @@ -2295,7 +2293,7 @@ void beiscsi_process_all_cqs(struct work_struct *work) hwi_ring_eq_db(phba, pbe_eq->q.id, 0, 0, 1, 1); } -static int be_iopoll(struct blk_iopoll *iop, int budget) +static int be_iopoll(struct irq_poll *iop, int budget) { unsigned int ret; struct beiscsi_hba *phba; @@ -2306,7 +2304,7 @@ static int be_iopoll(struct blk_iopoll *iop, int budget) pbe_eq->cq_count += ret; if (ret < budget) { phba = pbe_eq->phba; - blk_iopoll_complete(iop); + irq_poll_complete(iop); beiscsi_log(phba, KERN_INFO, BEISCSI_LOG_CONFIG | BEISCSI_LOG_IO, "BM_%d : rearm pbe_eq->q.id 
=%d\n", @@ -5293,7 +5291,7 @@ static void beiscsi_quiesce(struct beiscsi_hba *phba, for (i = 0; i < phba->num_cpus; i++) { pbe_eq = &phwi_context->be_eq[i]; - blk_iopoll_disable(&pbe_eq->iopoll); + irq_poll_disable(&pbe_eq->iopoll); } if (unload_state == BEISCSI_CLEAN_UNLOAD) { @@ -5579,9 +5577,8 @@ static void beiscsi_eeh_resume(struct pci_dev *pdev) for (i = 0; i < phba->num_cpus; i++) { pbe_eq = &phwi_context->be_eq[i]; - blk_iopoll_init(&pbe_eq->iopoll, be_iopoll_budget, + irq_poll_init(&pbe_eq->iopoll, be_iopoll_budget, be_iopoll); - blk_iopoll_enable(&pbe_eq->iopoll); } i = (phba->msix_enabled) ? i : 0; @@ -5752,9 +5749,8 @@ static int beiscsi_dev_probe(struct pci_dev *pcidev, for (i = 0; i < phba->num_cpus; i++) { pbe_eq = &phwi_context->be_eq[i]; - blk_iopoll_init(&pbe_eq->iopoll, be_iopoll_budget, + irq_poll_init(&pbe_eq->iopoll, be_iopoll_budget, be_iopoll); - blk_iopoll_enable(&pbe_eq->iopoll); } i = (phba->msix_enabled) ? i : 0; @@ -5795,7 +5791,7 @@ free_blkenbld: destroy_workqueue(phba->wq); for (i = 0; i < phba->num_cpus; i++) { pbe_eq = &phwi_context->be_eq[i]; - blk_iopoll_disable(&pbe_eq->iopoll); + irq_poll_disable(&pbe_eq->iopoll); } free_twq: beiscsi_clean_port(phba); diff --git a/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c b/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c index 0e2bee937fe8..e22a268fd311 100644 --- a/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c +++ b/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c @@ -57,7 +57,7 @@ MODULE_PARM_DESC(cxgb3i_snd_win, "TCP send window in bytes (default=128KB)"); static int cxgb3i_rx_credit_thres = 10 * 1024; module_param(cxgb3i_rx_credit_thres, int, 0644); -MODULE_PARM_DESC(rx_credit_thres, +MODULE_PARM_DESC(cxgb3i_rx_credit_thres, "RX credits return threshold in bytes (default=10KB)"); static unsigned int cxgb3i_max_connect = 8 * 1024; diff --git a/drivers/scsi/dmx3191d.c b/drivers/scsi/dmx3191d.c index 3e088125a8be..6c14e68b9e1a 100644 --- a/drivers/scsi/dmx3191d.c +++ b/drivers/scsi/dmx3191d.c @@ -36,17 +36,10 @@ #define DONT_USE_INTR -#define NCR5380_read(reg) inb(port + reg) -#define NCR5380_write(reg, value) outb(value, port + reg) +#define NCR5380_read(reg) inb(instance->io_port + reg) +#define NCR5380_write(reg, value) outb(value, instance->io_port + reg) #define NCR5380_implementation_fields /* none */ -#define NCR5380_local_declare() unsigned int port -#define NCR5380_setup(instance) port = instance->io_port - -/* - * Includes needed for NCR5380.[ch] (XXX: Move them to NCR5380.h) - */ -#include <linux/delay.h> #include "NCR5380.h" #include "NCR5380.c" @@ -56,6 +49,7 @@ static struct scsi_host_template dmx3191d_driver_template = { + .module = THIS_MODULE, .proc_name = DMX3191D_DRIVER_NAME, .name = "Domex DMX3191D", .info = NCR5380_info, @@ -67,6 +61,8 @@ static struct scsi_host_template dmx3191d_driver_template = { .sg_tablesize = SG_ALL, .cmd_per_lun = 2, .use_clustering = DISABLE_CLUSTERING, + .cmd_size = NCR5380_CMD_SIZE, + .max_sectors = 128, }; static int dmx3191d_probe_one(struct pci_dev *pdev, @@ -97,17 +93,25 @@ static int dmx3191d_probe_one(struct pci_dev *pdev, */ shost->irq = NO_IRQ; - NCR5380_init(shost, FLAG_NO_PSEUDO_DMA | FLAG_DTC3181E); + error = NCR5380_init(shost, FLAG_NO_PSEUDO_DMA); + if (error) + goto out_host_put; + + NCR5380_maybe_reset_bus(shost); pci_set_drvdata(pdev, shost); error = scsi_add_host(shost, &pdev->dev); if (error) - goto out_release_region; + goto out_exit; scsi_scan_host(shost); return 0; +out_exit: + NCR5380_exit(shost); +out_host_put: + scsi_host_put(shost); out_release_region: release_region(io, 
DMX3191D_REGION_LEN); out_disable_device: @@ -119,15 +123,14 @@ static int dmx3191d_probe_one(struct pci_dev *pdev, static void dmx3191d_remove_one(struct pci_dev *pdev) { struct Scsi_Host *shost = pci_get_drvdata(pdev); + unsigned long io = shost->io_port; scsi_remove_host(shost); NCR5380_exit(shost); - - release_region(shost->io_port, DMX3191D_REGION_LEN); - pci_disable_device(pdev); - scsi_host_put(shost); + release_region(io, DMX3191D_REGION_LEN); + pci_disable_device(pdev); } static struct pci_device_id dmx3191d_pci_tbl[] = { diff --git a/drivers/scsi/dtc.c b/drivers/scsi/dtc.c index 4c74c7ba2dff..6c736b071cf4 100644 --- a/drivers/scsi/dtc.c +++ b/drivers/scsi/dtc.c @@ -1,9 +1,5 @@ - #define PSEUDO_DMA #define DONT_USE_INTR -#define UNSAFE /* Leave interrupts enabled during pseudo-dma I/O */ -#define DMA_WORKS_RIGHT - /* * DTC 3180/3280 driver, by @@ -50,15 +46,13 @@ #include <linux/module.h> -#include <linux/signal.h> #include <linux/blkdev.h> -#include <linux/delay.h> -#include <linux/stat.h> #include <linux/string.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/io.h> #include <scsi/scsi_host.h> + #include "dtc.h" #define AUTOPROBE_IRQ #include "NCR5380.h" @@ -150,7 +144,7 @@ static const struct signature { static int __init dtc_setup(char *str) { - static int commandline_current = 0; + static int commandline_current; int i; int ints[10]; @@ -188,7 +182,7 @@ __setup("dtc=", dtc_setup); static int __init dtc_detect(struct scsi_host_template * tpnt) { - static int current_override = 0, current_base = 0; + static int current_override, current_base; struct Scsi_Host *instance; unsigned int addr; void __iomem *base; @@ -205,9 +199,8 @@ static int __init dtc_detect(struct scsi_host_template * tpnt) addr = 0; } else for (; !addr && (current_base < NO_BASES); ++current_base) { -#if (DTCDEBUG & DTCDEBUG_INIT) - printk(KERN_DEBUG "scsi-dtc : probing address %08x\n", bases[current_base].address); -#endif + dprintk(NDEBUG_INIT, "dtc: probing address 0x%08x\n", + (unsigned int)bases[current_base].address); if (bases[current_base].noauto) continue; base = ioremap(bases[current_base].address, 0x2000); @@ -216,18 +209,14 @@ static int __init dtc_detect(struct scsi_host_template * tpnt) for (sig = 0; sig < NO_SIGNATURES; ++sig) { if (check_signature(base + signatures[sig].offset, signatures[sig].string, strlen(signatures[sig].string))) { addr = bases[current_base].address; -#if (DTCDEBUG & DTCDEBUG_INIT) - printk(KERN_DEBUG "scsi-dtc : detected board.\n"); -#endif + dprintk(NDEBUG_INIT, "dtc: detected board\n"); goto found; } } iounmap(base); } -#if defined(DTCDEBUG) && (DTCDEBUG & DTCDEBUG_INIT) - printk(KERN_DEBUG "scsi-dtc : base = %08x\n", addr); -#endif + dprintk(NDEBUG_INIT, "dtc: addr = 0x%08x\n", addr); if (!addr) break; @@ -235,12 +224,15 @@ static int __init dtc_detect(struct scsi_host_template * tpnt) found: instance = scsi_register(tpnt, sizeof(struct NCR5380_hostdata)); if (instance == NULL) - break; + goto out_unmap; instance->base = addr; ((struct NCR5380_hostdata *)(instance)->hostdata)->base = base; - NCR5380_init(instance, 0); + if (NCR5380_init(instance, FLAG_NO_DMA_FIXUP)) + goto out_unregister; + + NCR5380_maybe_reset_bus(instance); NCR5380_write(DTC_CONTROL_REG, CSR_5380_INTR); /* Enable int's */ if (overrides[current_override].irq != IRQ_AUTO) @@ -271,14 +263,19 @@ found: printk(KERN_WARNING "scsi%d : interrupts not used. 
Might as well not jumper it.\n", instance->host_no); instance->irq = NO_IRQ; #endif -#if defined(DTCDEBUG) && (DTCDEBUG & DTCDEBUG_INIT) - printk("scsi%d : irq = %d\n", instance->host_no, instance->irq); -#endif + dprintk(NDEBUG_INIT, "scsi%d : irq = %d\n", + instance->host_no, instance->irq); ++current_override; ++count; } return count; + +out_unregister: + scsi_unregister(instance); +out_unmap: + iounmap(base); + return count; } /* @@ -331,12 +328,8 @@ static inline int NCR5380_pread(struct Scsi_Host *instance, unsigned char *dst, unsigned char *d = dst; int i; /* For counting time spent in the poll-loop */ struct NCR5380_hostdata *hostdata = shost_priv(instance); - NCR5380_local_declare(); - NCR5380_setup(instance); i = 0; - NCR5380_read(RESET_PARITY_INTERRUPT_REG); - NCR5380_write(MODE_REG, MR_ENABLE_EOP_INTR | MR_DMA_MODE); if (instance->irq == NO_IRQ) NCR5380_write(DTC_CONTROL_REG, CSR_DIR_READ); else @@ -348,7 +341,7 @@ static inline int NCR5380_pread(struct Scsi_Host *instance, unsigned char *dst, while (NCR5380_read(DTC_CONTROL_REG) & CSR_HOST_BUF_NOT_RDY) ++i; rtrc(3); - memcpy_fromio(d, base + DTC_DATA_BUF, 128); + memcpy_fromio(d, hostdata->base + DTC_DATA_BUF, 128); d += 128; len -= 128; rtrc(7); @@ -358,9 +351,7 @@ static inline int NCR5380_pread(struct Scsi_Host *instance, unsigned char *dst, rtrc(4); while (!(NCR5380_read(DTC_CONTROL_REG) & D_CR_ACCESS)) ++i; - NCR5380_write(MODE_REG, 0); /* Clear the operating mode */ rtrc(0); - NCR5380_read(RESET_PARITY_INTERRUPT_REG); if (i > hostdata->spin_max_r) hostdata->spin_max_r = i; return (0); @@ -383,12 +374,7 @@ static inline int NCR5380_pwrite(struct Scsi_Host *instance, unsigned char *src, { int i; struct NCR5380_hostdata *hostdata = shost_priv(instance); - NCR5380_local_declare(); - NCR5380_setup(instance); - NCR5380_read(RESET_PARITY_INTERRUPT_REG); - NCR5380_write(MODE_REG, MR_ENABLE_EOP_INTR | MR_DMA_MODE); - /* set direction (write) */ if (instance->irq == NO_IRQ) NCR5380_write(DTC_CONTROL_REG, 0); else @@ -400,7 +386,7 @@ static inline int NCR5380_pwrite(struct Scsi_Host *instance, unsigned char *src, while (NCR5380_read(DTC_CONTROL_REG) & CSR_HOST_BUF_NOT_RDY) ++i; rtrc(3); - memcpy_toio(base + DTC_DATA_BUF, src, 128); + memcpy_toio(hostdata->base + DTC_DATA_BUF, src, 128); src += 128; len -= 128; } @@ -413,47 +399,60 @@ static inline int NCR5380_pwrite(struct Scsi_Host *instance, unsigned char *src, ++i; rtrc(7); /* Check for parity error here. fixme. */ - NCR5380_write(MODE_REG, 0); /* Clear the operating mode */ rtrc(0); if (i > hostdata->spin_max_w) hostdata->spin_max_w = i; return (0); } +static int dtc_dma_xfer_len(struct scsi_cmnd *cmd) +{ + int transfersize = cmd->transfersize; + + /* Limit transfers to 32K, for xx400 & xx406 + * pseudoDMA that transfers in 128 bytes blocks. 
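+ * (the 32K cap is applied only when this_residual is a non-zero multiple of the transfer size; see the test below)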
+ */ + if (transfersize > 32 * 1024 && cmd->SCp.this_residual && + !(cmd->SCp.this_residual % transfersize)) + transfersize = 32 * 1024; + + return transfersize; +} + MODULE_LICENSE("GPL"); #include "NCR5380.c" static int dtc_release(struct Scsi_Host *shost) { - NCR5380_local_declare(); - NCR5380_setup(shost); + struct NCR5380_hostdata *hostdata = shost_priv(shost); + if (shost->irq != NO_IRQ) free_irq(shost->irq, shost); NCR5380_exit(shost); - if (shost->io_port && shost->n_io_port) - release_region(shost->io_port, shost->n_io_port); scsi_unregister(shost); - iounmap(base); + iounmap(hostdata->base); return 0; } static struct scsi_host_template driver_template = { - .name = "DTC 3180/3280 ", - .detect = dtc_detect, - .release = dtc_release, - .proc_name = "dtc3x80", - .show_info = dtc_show_info, - .write_info = dtc_write_info, - .info = dtc_info, - .queuecommand = dtc_queue_command, - .eh_abort_handler = dtc_abort, - .eh_bus_reset_handler = dtc_bus_reset, - .bios_param = dtc_biosparam, - .can_queue = CAN_QUEUE, - .this_id = 7, - .sg_tablesize = SG_ALL, - .cmd_per_lun = CMD_PER_LUN, - .use_clustering = DISABLE_CLUSTERING, + .name = "DTC 3180/3280", + .detect = dtc_detect, + .release = dtc_release, + .proc_name = "dtc3x80", + .show_info = dtc_show_info, + .write_info = dtc_write_info, + .info = dtc_info, + .queuecommand = dtc_queue_command, + .eh_abort_handler = dtc_abort, + .eh_bus_reset_handler = dtc_bus_reset, + .bios_param = dtc_biosparam, + .can_queue = 32, + .this_id = 7, + .sg_tablesize = SG_ALL, + .cmd_per_lun = 2, + .use_clustering = DISABLE_CLUSTERING, + .cmd_size = NCR5380_CMD_SIZE, + .max_sectors = 128, }; #include "scsi_module.c" diff --git a/drivers/scsi/dtc.h b/drivers/scsi/dtc.h index 78a2332e9064..56732cba8aba 100644 --- a/drivers/scsi/dtc.h +++ b/drivers/scsi/dtc.h @@ -10,54 +10,17 @@ #ifndef DTC3280_H #define DTC3280_H -#define DTCDEBUG 0 -#define DTCDEBUG_INIT 0x1 -#define DTCDEBUG_TRANSFER 0x2 - -#ifndef CMD_PER_LUN -#define CMD_PER_LUN 2 -#endif - -#ifndef CAN_QUEUE -#define CAN_QUEUE 32 -#endif - #define NCR5380_implementation_fields \ void __iomem *base -#define NCR5380_local_declare() \ - void __iomem *base - -#define NCR5380_setup(instance) \ - base = ((struct NCR5380_hostdata *)(instance)->hostdata)->base +#define DTC_address(reg) \ + (((struct NCR5380_hostdata *)shost_priv(instance))->base + DTC_5380_OFFSET + reg) -#define DTC_address(reg) (base + DTC_5380_OFFSET + reg) - -#define dbNCR5380_read(reg) \ - (rval=readb(DTC_address(reg)), \ - (((unsigned char) printk("DTC : read register %d at addr %p is: %02x\n"\ - , (reg), DTC_address(reg), rval)), rval ) ) - -#define dbNCR5380_write(reg, value) do { \ - printk("DTC : write %02x to register %d at address %p\n", \ - (value), (reg), DTC_address(reg)); \ - writeb(value, DTC_address(reg));} while(0) - - -#if !(DTCDEBUG & DTCDEBUG_TRANSFER) #define NCR5380_read(reg) (readb(DTC_address(reg))) #define NCR5380_write(reg, value) (writeb(value, DTC_address(reg))) -#else -#define NCR5380_read(reg) (readb(DTC_address(reg))) -#define xNCR5380_read(reg) \ - (((unsigned char) printk("DTC : read register %d at address %p\n"\ - , (reg), DTC_address(reg))), readb(DTC_address(reg))) -#define NCR5380_write(reg, value) do { \ - printk("DTC : write %02x to register %d at address %p\n", \ - (value), (reg), DTC_address(reg)); \ - writeb(value, DTC_address(reg));} while(0) -#endif +#define NCR5380_dma_xfer_len(instance, cmd, phase) \ + dtc_dma_xfer_len(cmd) #define NCR5380_intr dtc_intr #define NCR5380_queue_command dtc_queue_command diff 
--git a/drivers/scsi/g_NCR5380.c b/drivers/scsi/g_NCR5380.c index f8d2478b11cc..90091e693020 100644 --- a/drivers/scsi/g_NCR5380.c +++ b/drivers/scsi/g_NCR5380.c @@ -56,40 +56,31 @@ * */ -/* settings for DTC3181E card with only Mustek scanner attached */ -#define USLEEP_POLL msecs_to_jiffies(10) -#define USLEEP_SLEEP msecs_to_jiffies(200) -#define USLEEP_WAITLONG msecs_to_jiffies(5000) - #define AUTOPROBE_IRQ #ifdef CONFIG_SCSI_GENERIC_NCR53C400 -#define NCR53C400_PSEUDO_DMA 1 #define PSEUDO_DMA -#define NCR53C400 #endif #include <asm/io.h> -#include <linux/signal.h> #include <linux/blkdev.h> +#include <linux/module.h> #include <scsi/scsi_host.h> #include "g_NCR5380.h" #include "NCR5380.h" -#include <linux/stat.h> #include <linux/init.h> #include <linux/ioport.h> #include <linux/isapnp.h> -#include <linux/delay.h> #include <linux/interrupt.h> -#define NCR_NOT_SET 0 -static int ncr_irq = NCR_NOT_SET; -static int ncr_dma = NCR_NOT_SET; -static int ncr_addr = NCR_NOT_SET; -static int ncr_5380 = NCR_NOT_SET; -static int ncr_53c400 = NCR_NOT_SET; -static int ncr_53c400a = NCR_NOT_SET; -static int dtc_3181e = NCR_NOT_SET; +static int ncr_irq; +static int ncr_dma; +static int ncr_addr; +static int ncr_5380; +static int ncr_53c400; +static int ncr_53c400a; +static int dtc_3181e; +static int hp_c2502; static struct override { NCR5380_map_type NCR5380_map_name; @@ -121,7 +112,7 @@ static struct override { static void __init internal_setup(int board, char *str, int *ints) { - static int commandline_current = 0; + static int commandline_current; switch (board) { case BOARD_NCR5380: if (ints[0] != 2 && ints[0] != 3) { @@ -235,6 +226,30 @@ static int __init do_DTC3181E_setup(char *str) #endif +#ifndef SCSI_G_NCR5380_MEM +/* + * Configure I/O address of 53C400A or DTC436 by writing magic numbers + * to ports 0x779 and 0x379. + */ +static void magic_configure(int idx, u8 irq, u8 magic[]) +{ + u8 cfg = 0; + + outb(magic[0], 0x779); + outb(magic[1], 0x379); + outb(magic[2], 0x379); + outb(magic[3], 0x379); + outb(magic[4], 0x379); + + /* allowed IRQs for HP C2502 */ + if (irq != 2 && irq != 3 && irq != 4 && irq != 5 && irq != 7) + irq = 0; + if (idx >= 0 && idx <= 7) + cfg = 0x80 | idx | (irq << 4); + outb(cfg, 0x379); +} +#endif + /** * generic_NCR5380_detect - look for NCR5380 controllers * @tpnt: the scsi template @@ -243,19 +258,18 @@ static int __init do_DTC3181E_setup(char *str) * and DTC436(ISAPnP) controllers. If overrides have been set we use * them. * - * The caller supplied NCR5380_init function is invoked from here, before - * the interrupt line is taken. 
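The last byte magic_configure() writes to port 0x379 carries the whole configuration: bit 7 enables the card, bits 0-2 hold the index into the probe-port table, and bits 4-6 hold the IRQ. A minimal user-space sketch of that packing (the bit layout and the IRQ whitelist come from the hunk above; the example index and IRQ are invented):

#include <stdio.h>

typedef unsigned char u8;

static u8 pack_cfg(int idx, u8 irq)
{
	/* allowed IRQs for HP C2502 */
	if (irq != 2 && irq != 3 && irq != 4 && irq != 5 && irq != 7)
		irq = 0;
	if (idx < 0 || idx > 7)
		return 0;		/* out of range: leave card disabled */
	return 0x80 | idx | (irq << 4);	/* enable | port index | IRQ */
}

int main(void)
{
	printf("cfg = 0x%02x\n", pack_cfg(2, 5));	/* prints 0xd2 */
	return 0;
}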
- * * Locks: none */ static int __init generic_NCR5380_detect(struct scsi_host_template *tpnt) { - static int current_override = 0; + static int current_override; int count; unsigned int *ports; + u8 *magic = NULL; #ifndef SCSI_G_NCR5380_MEM int i; + int port_idx = -1; unsigned long region_size = 16; #endif static unsigned int __initdata ncr_53c400a_ports[] = { @@ -264,27 +278,36 @@ static int __init generic_NCR5380_detect(struct scsi_host_template *tpnt) static unsigned int __initdata dtc_3181e_ports[] = { 0x220, 0x240, 0x280, 0x2a0, 0x2c0, 0x300, 0x320, 0x340, 0 }; - int flags = 0; + static u8 ncr_53c400a_magic[] __initdata = { /* 53C400A & DTC436 */ + 0x59, 0xb9, 0xc5, 0xae, 0xa6 + }; + static u8 hp_c2502_magic[] __initdata = { /* HP C2502 */ + 0x0f, 0x22, 0xf0, 0x20, 0x80 + }; + int flags; struct Scsi_Host *instance; + struct NCR5380_hostdata *hostdata; #ifdef SCSI_G_NCR5380_MEM unsigned long base; void __iomem *iomem; #endif - if (ncr_irq != NCR_NOT_SET) + if (ncr_irq) overrides[0].irq = ncr_irq; - if (ncr_dma != NCR_NOT_SET) + if (ncr_dma) overrides[0].dma = ncr_dma; - if (ncr_addr != NCR_NOT_SET) + if (ncr_addr) overrides[0].NCR5380_map_name = (NCR5380_map_type) ncr_addr; - if (ncr_5380 != NCR_NOT_SET) + if (ncr_5380) overrides[0].board = BOARD_NCR5380; - else if (ncr_53c400 != NCR_NOT_SET) + else if (ncr_53c400) overrides[0].board = BOARD_NCR53C400; - else if (ncr_53c400a != NCR_NOT_SET) + else if (ncr_53c400a) overrides[0].board = BOARD_NCR53C400A; - else if (dtc_3181e != NCR_NOT_SET) + else if (dtc_3181e) overrides[0].board = BOARD_DTC3181E; + else if (hp_c2502) + overrides[0].board = BOARD_HP_C2502; #ifndef SCSI_G_NCR5380_MEM if (!current_override && isapnp_present()) { struct pnp_dev *dev = NULL; @@ -318,41 +341,45 @@ static int __init generic_NCR5380_detect(struct scsi_host_template *tpnt) } } #endif - tpnt->proc_name = "g_NCR5380"; for (count = 0; current_override < NO_OVERRIDES; ++current_override) { if (!(overrides[current_override].NCR5380_map_name)) continue; ports = NULL; + flags = 0; switch (overrides[current_override].board) { case BOARD_NCR5380: flags = FLAG_NO_PSEUDO_DMA; break; case BOARD_NCR53C400: - flags = FLAG_NCR53C400; +#ifdef PSEUDO_DMA + flags = FLAG_NO_DMA_FIXUP; +#endif break; case BOARD_NCR53C400A: - flags = FLAG_NO_PSEUDO_DMA; + flags = FLAG_NO_DMA_FIXUP; + ports = ncr_53c400a_ports; + magic = ncr_53c400a_magic; + break; + case BOARD_HP_C2502: + flags = FLAG_NO_DMA_FIXUP; ports = ncr_53c400a_ports; + magic = hp_c2502_magic; break; case BOARD_DTC3181E: - flags = FLAG_NO_PSEUDO_DMA | FLAG_DTC3181E; + flags = FLAG_NO_DMA_FIXUP; ports = dtc_3181e_ports; + magic = ncr_53c400a_magic; break; } #ifndef SCSI_G_NCR5380_MEM - if (ports) { + if (ports && magic) { /* wakeup sequence for the NCR53C400A and DTC3181E */ /* Disable the adapter and look for a free io port */ - outb(0x59, 0x779); - outb(0xb9, 0x379); - outb(0xc5, 0x379); - outb(0xae, 0x379); - outb(0xa6, 0x379); - outb(0x00, 0x379); + magic_configure(-1, 0, magic); if (overrides[current_override].NCR5380_map_name != PORT_AUTO) for (i = 0; ports[i]; i++) { @@ -371,17 +398,12 @@ static int __init generic_NCR5380_detect(struct scsi_host_template *tpnt) } if (ports[i]) { /* At this point we have our region reserved */ - outb(0x59, 0x779); - outb(0xb9, 0x379); - outb(0xc5, 0x379); - outb(0xae, 0x379); - outb(0xa6, 0x379); - outb(0x80 | i, 0x379); /* set io port to be used */ + magic_configure(i, 0, magic); /* no IRQ yet */ outb(0xc0, ports[i] + 9); if (inb(ports[i] + 9) != 0x80) continue; - else - 
overrides[current_override].NCR5380_map_name = ports[i]; + overrides[current_override].NCR5380_map_name = ports[i]; + port_idx = i; } else continue; } @@ -403,24 +425,65 @@ static int __init generic_NCR5380_detect(struct scsi_host_template *tpnt) } #endif instance = scsi_register(tpnt, sizeof(struct NCR5380_hostdata)); - if (instance == NULL) { -#ifndef SCSI_G_NCR5380_MEM - release_region(overrides[current_override].NCR5380_map_name, region_size); -#else - iounmap(iomem); - release_mem_region(base, NCR5380_region_size); -#endif - continue; - } + if (instance == NULL) + goto out_release; + hostdata = shost_priv(instance); - instance->NCR5380_instance_name = overrides[current_override].NCR5380_map_name; #ifndef SCSI_G_NCR5380_MEM + instance->io_port = overrides[current_override].NCR5380_map_name; instance->n_io_port = region_size; + hostdata->io_width = 1; /* 8-bit PDMA by default */ + + /* + * On NCR53C400 boards, NCR5380 registers are mapped 8 past + * the base address. + */ + switch (overrides[current_override].board) { + case BOARD_NCR53C400: + instance->io_port += 8; + hostdata->c400_ctl_status = 0; + hostdata->c400_blk_cnt = 1; + hostdata->c400_host_buf = 4; + break; + case BOARD_DTC3181E: + hostdata->io_width = 2; /* 16-bit PDMA */ + /* fall through */ + case BOARD_NCR53C400A: + case BOARD_HP_C2502: + hostdata->c400_ctl_status = 9; + hostdata->c400_blk_cnt = 10; + hostdata->c400_host_buf = 8; + break; + } #else - ((struct NCR5380_hostdata *)instance->hostdata)->iomem = iomem; + instance->base = overrides[current_override].NCR5380_map_name; + hostdata->iomem = iomem; + switch (overrides[current_override].board) { + case BOARD_NCR53C400: + hostdata->c400_ctl_status = 0x100; + hostdata->c400_blk_cnt = 0x101; + hostdata->c400_host_buf = 0x104; + break; + case BOARD_DTC3181E: + case BOARD_NCR53C400A: + case BOARD_HP_C2502: + pr_err(DRV_MODULE_NAME ": unknown register offsets\n"); + goto out_unregister; + } #endif - NCR5380_init(instance, flags); + if (NCR5380_init(instance, flags)) + goto out_unregister; + + switch (overrides[current_override].board) { + case BOARD_NCR53C400: + case BOARD_DTC3181E: + case BOARD_NCR53C400A: + case BOARD_HP_C2502: + NCR5380_write(hostdata->c400_ctl_status, CSR_BASE); + } + + NCR5380_maybe_reset_bus(instance); if (overrides[current_override].irq != IRQ_AUTO) instance->irq = overrides[current_override].irq; @@ -431,12 +494,18 @@ static int __init generic_NCR5380_detect(struct scsi_host_template *tpnt) if (instance->irq == 255) instance->irq = NO_IRQ; - if (instance->irq != NO_IRQ) + if (instance->irq != NO_IRQ) { +#ifndef SCSI_G_NCR5380_MEM + /* set IRQ for HP C2502 */ + if (overrides[current_override].board == BOARD_HP_C2502) + magic_configure(port_idx, instance->irq, magic); +#endif if (request_irq(instance->irq, generic_NCR5380_intr, 0, "NCR5380", instance)) { printk(KERN_WARNING "scsi%d : IRQ%d not free, interrupts disabled\n", instance->host_no, instance->irq); instance->irq = NO_IRQ; } + } if (instance->irq == NO_IRQ) { printk(KERN_INFO "scsi%d : interrupts not enabled. 
for better interactive performance,\n", instance->host_no); @@ -447,6 +516,17 @@ static int __init generic_NCR5380_detect(struct scsi_host_template *tpnt) ++count; } return count; + +out_unregister: + scsi_unregister(instance); +out_release: +#ifndef SCSI_G_NCR5380_MEM + release_region(overrides[current_override].NCR5380_map_name, region_size); +#else + iounmap(iomem); + release_mem_region(base, NCR5380_region_size); +#endif + return count; } /** @@ -460,21 +540,15 @@ static int __init generic_NCR5380_detect(struct scsi_host_template *tpnt) static int generic_NCR5380_release_resources(struct Scsi_Host *instance) { - NCR5380_local_declare(); - NCR5380_setup(instance); - if (instance->irq != NO_IRQ) free_irq(instance->irq, instance); NCR5380_exit(instance); - #ifndef SCSI_G_NCR5380_MEM - release_region(instance->NCR5380_instance_name, instance->n_io_port); + release_region(instance->io_port, instance->n_io_port); #else iounmap(((struct NCR5380_hostdata *)instance->hostdata)->iomem); - release_mem_region(instance->NCR5380_instance_name, NCR5380_region_size); + release_mem_region(instance->base, NCR5380_region_size); #endif - - return 0; } @@ -507,7 +581,7 @@ generic_NCR5380_biosparam(struct scsi_device *sdev, struct block_device *bdev, } #endif -#ifdef NCR53C400_PSEUDO_DMA +#ifdef PSEUDO_DMA /** * NCR5380_pread - pseudo DMA read @@ -521,75 +595,68 @@ generic_NCR5380_biosparam(struct scsi_device *sdev, struct block_device *bdev, static inline int NCR5380_pread(struct Scsi_Host *instance, unsigned char *dst, int len) { + struct NCR5380_hostdata *hostdata = shost_priv(instance); int blocks = len / 128; int start = 0; - int bl; - - NCR5380_local_declare(); - NCR5380_setup(instance); - NCR5380_write(C400_CONTROL_STATUS_REG, CSR_BASE | CSR_TRANS_DIR); - NCR5380_write(C400_BLOCK_COUNTER_REG, blocks); + NCR5380_write(hostdata->c400_ctl_status, CSR_BASE | CSR_TRANS_DIR); + NCR5380_write(hostdata->c400_blk_cnt, blocks); while (1) { - if ((bl = NCR5380_read(C400_BLOCK_COUNTER_REG)) == 0) { + if (NCR5380_read(hostdata->c400_blk_cnt) == 0) break; - } - if (NCR5380_read(C400_CONTROL_STATUS_REG) & CSR_GATED_53C80_IRQ) { + if (NCR5380_read(hostdata->c400_ctl_status) & CSR_GATED_53C80_IRQ) { printk(KERN_ERR "53C400r: Got 53C80_IRQ start=%d, blocks=%d\n", start, blocks); return -1; } - while (NCR5380_read(C400_CONTROL_STATUS_REG) & CSR_HOST_BUF_NOT_RDY); + while (NCR5380_read(hostdata->c400_ctl_status) & CSR_HOST_BUF_NOT_RDY) + ; /* FIXME - no timeout */ #ifndef SCSI_G_NCR5380_MEM - { - int i; - for (i = 0; i < 128; i++) - dst[start + i] = NCR5380_read(C400_HOST_BUFFER); - } + if (hostdata->io_width == 2) + insw(instance->io_port + hostdata->c400_host_buf, + dst + start, 64); + else + insb(instance->io_port + hostdata->c400_host_buf, + dst + start, 128); #else /* implies SCSI_G_NCR5380_MEM */ - memcpy_fromio(dst + start, iomem + NCR53C400_host_buffer, 128); + memcpy_fromio(dst + start, + hostdata->iomem + NCR53C400_host_buffer, 128); #endif start += 128; blocks--; } if (blocks) { - while (NCR5380_read(C400_CONTROL_STATUS_REG) & CSR_HOST_BUF_NOT_RDY) - { - // FIXME - no timeout - } + while (NCR5380_read(hostdata->c400_ctl_status) & CSR_HOST_BUF_NOT_RDY) + ; /* FIXME - no timeout */ #ifndef SCSI_G_NCR5380_MEM - { - int i; - for (i = 0; i < 128; i++) - dst[start + i] = NCR5380_read(C400_HOST_BUFFER); - } + if (hostdata->io_width == 2) + insw(instance->io_port + hostdata->c400_host_buf, + dst + start, 64); + else + insb(instance->io_port + hostdata->c400_host_buf, + dst + start, 128); #else /* implies 
SCSI_G_NCR5380_MEM */ - memcpy_fromio(dst + start, iomem + NCR53C400_host_buffer, 128); + memcpy_fromio(dst + start, + hostdata->iomem + NCR53C400_host_buffer, 128); #endif start += 128; blocks--; } - if (!(NCR5380_read(C400_CONTROL_STATUS_REG) & CSR_GATED_53C80_IRQ)) + if (!(NCR5380_read(hostdata->c400_ctl_status) & CSR_GATED_53C80_IRQ)) printk("53C400r: no 53C80 gated irq after transfer"); -#if 0 - /* - * DON'T DO THIS - THEY NEVER ARRIVE! - */ - printk("53C400r: Waiting for 53C80 registers\n"); - while (NCR5380_read(C400_CONTROL_STATUS_REG) & CSR_53C80_REG) + /* wait for 53C80 registers to be available */ + while (!(NCR5380_read(hostdata->c400_ctl_status) & CSR_53C80_REG)) ; -#endif + if (!(NCR5380_read(BUS_AND_STATUS_REG) & BASR_END_DMA_TRANSFER)) printk(KERN_ERR "53C400r: no end dma signal\n"); - NCR5380_write(MODE_REG, MR_BASE); - NCR5380_read(RESET_PARITY_INTERRUPT_REG); return 0; } @@ -605,89 +672,91 @@ static inline int NCR5380_pread(struct Scsi_Host *instance, unsigned char *dst, static inline int NCR5380_pwrite(struct Scsi_Host *instance, unsigned char *src, int len) { + struct NCR5380_hostdata *hostdata = shost_priv(instance); int blocks = len / 128; int start = 0; - int bl; - int i; - NCR5380_local_declare(); - NCR5380_setup(instance); - - NCR5380_write(C400_CONTROL_STATUS_REG, CSR_BASE); - NCR5380_write(C400_BLOCK_COUNTER_REG, blocks); + NCR5380_write(hostdata->c400_ctl_status, CSR_BASE); + NCR5380_write(hostdata->c400_blk_cnt, blocks); while (1) { - if (NCR5380_read(C400_CONTROL_STATUS_REG) & CSR_GATED_53C80_IRQ) { + if (NCR5380_read(hostdata->c400_ctl_status) & CSR_GATED_53C80_IRQ) { printk(KERN_ERR "53C400w: Got 53C80_IRQ start=%d, blocks=%d\n", start, blocks); return -1; } - if ((bl = NCR5380_read(C400_BLOCK_COUNTER_REG)) == 0) { + if (NCR5380_read(hostdata->c400_blk_cnt) == 0) break; - } - while (NCR5380_read(C400_CONTROL_STATUS_REG) & CSR_HOST_BUF_NOT_RDY) + while (NCR5380_read(hostdata->c400_ctl_status) & CSR_HOST_BUF_NOT_RDY) ; // FIXME - timeout #ifndef SCSI_G_NCR5380_MEM - { - for (i = 0; i < 128; i++) - NCR5380_write(C400_HOST_BUFFER, src[start + i]); - } + if (hostdata->io_width == 2) + outsw(instance->io_port + hostdata->c400_host_buf, + src + start, 64); + else + outsb(instance->io_port + hostdata->c400_host_buf, + src + start, 128); #else /* implies SCSI_G_NCR5380_MEM */ - memcpy_toio(iomem + NCR53C400_host_buffer, src + start, 128); + memcpy_toio(hostdata->iomem + NCR53C400_host_buffer, + src + start, 128); #endif start += 128; blocks--; } if (blocks) { - while (NCR5380_read(C400_CONTROL_STATUS_REG) & CSR_HOST_BUF_NOT_RDY) + while (NCR5380_read(hostdata->c400_ctl_status) & CSR_HOST_BUF_NOT_RDY) ; // FIXME - no timeout #ifndef SCSI_G_NCR5380_MEM - { - for (i = 0; i < 128; i++) - NCR5380_write(C400_HOST_BUFFER, src[start + i]); - } + if (hostdata->io_width == 2) + outsw(instance->io_port + hostdata->c400_host_buf, + src + start, 64); + else + outsb(instance->io_port + hostdata->c400_host_buf, + src + start, 128); #else /* implies SCSI_G_NCR5380_MEM */ - memcpy_toio(iomem + NCR53C400_host_buffer, src + start, 128); + memcpy_toio(hostdata->iomem + NCR53C400_host_buffer, + src + start, 128); #endif start += 128; blocks--; } -#if 0 - printk("53C400w: waiting for registers to be available\n"); - THEY NEVER DO ! while (NCR5380_read(C400_CONTROL_STATUS_REG) & CSR_53C80_REG); - printk("53C400w: Got em\n"); -#endif - - /* Let's wait for this instead - could be ugly */ - /* All documentation says to check for this. Maybe my hardware is too - * fast. 
Waiting for it seems to work fine! KLL - */ - while (!(i = NCR5380_read(C400_CONTROL_STATUS_REG) & CSR_GATED_53C80_IRQ)) - ; // FIXME - no timeout - - /* - * I know. i is certainly != 0 here but the loop is new. See previous - * comment. - */ - if (i) { - if (!((i = NCR5380_read(BUS_AND_STATUS_REG)) & BASR_END_DMA_TRANSFER)) - printk(KERN_ERR "53C400w: No END OF DMA bit - WHOOPS! BASR=%0x\n", i); - } else - printk(KERN_ERR "53C400w: no 53C80 gated irq after transfer (last block)\n"); + /* wait for 53C80 registers to be available */ + while (!(NCR5380_read(hostdata->c400_ctl_status) & CSR_53C80_REG)) { + udelay(4); /* DTC436 chip hangs without this */ + /* FIXME - no timeout */ + } -#if 0 if (!(NCR5380_read(BUS_AND_STATUS_REG) & BASR_END_DMA_TRANSFER)) { printk(KERN_ERR "53C400w: no end dma signal\n"); } -#endif + while (!(NCR5380_read(TARGET_COMMAND_REG) & TCR_LAST_BYTE_SENT)) ; // TIMEOUT return 0; } -#endif /* PSEUDO_DMA */ + +static int generic_NCR5380_dma_xfer_len(struct scsi_cmnd *cmd) +{ + int transfersize = cmd->transfersize; + + /* Limit transfers to 32K, for xx400 & xx406 + * pseudoDMA that transfers in 128 bytes blocks. + */ + if (transfersize > 32 * 1024 && cmd->SCp.this_residual && + !(cmd->SCp.this_residual % transfersize)) + transfersize = 32 * 1024; + + /* 53C400 datasheet: non-modulo-128-byte transfers should use PIO */ + if (transfersize % 128) + transfersize = 0; + + return transfersize; +} + +#endif /* PSEUDO_DMA */ /* * Include the NCR5380 core code that we build our driver around @@ -696,22 +765,24 @@ static inline int NCR5380_pwrite(struct Scsi_Host *instance, unsigned char *src, #include "NCR5380.c" static struct scsi_host_template driver_template = { - .show_info = generic_NCR5380_show_info, - .name = "Generic NCR5380/NCR53C400 SCSI", - .detect = generic_NCR5380_detect, - .release = generic_NCR5380_release_resources, - .info = generic_NCR5380_info, - .queuecommand = generic_NCR5380_queue_command, + .proc_name = DRV_MODULE_NAME, + .name = "Generic NCR5380/NCR53C400 SCSI", + .detect = generic_NCR5380_detect, + .release = generic_NCR5380_release_resources, + .info = generic_NCR5380_info, + .queuecommand = generic_NCR5380_queue_command, .eh_abort_handler = generic_NCR5380_abort, .eh_bus_reset_handler = generic_NCR5380_bus_reset, - .bios_param = NCR5380_BIOSPARAM, - .can_queue = CAN_QUEUE, - .this_id = 7, - .sg_tablesize = SG_ALL, - .cmd_per_lun = CMD_PER_LUN, - .use_clustering = DISABLE_CLUSTERING, + .bios_param = NCR5380_BIOSPARAM, + .can_queue = 16, + .this_id = 7, + .sg_tablesize = SG_ALL, + .cmd_per_lun = 2, + .use_clustering = DISABLE_CLUSTERING, + .cmd_size = NCR5380_CMD_SIZE, + .max_sectors = 128, }; -#include <linux/module.h> + #include "scsi_module.c" module_param(ncr_irq, int, 0); @@ -721,6 +792,7 @@ module_param(ncr_5380, int, 0); module_param(ncr_53c400, int, 0); module_param(ncr_53c400a, int, 0); module_param(dtc_3181e, int, 0); +module_param(hp_c2502, int, 0); MODULE_LICENSE("GPL"); #if !defined(SCSI_G_NCR5380_MEM) && defined(MODULE) diff --git a/drivers/scsi/g_NCR5380.h b/drivers/scsi/g_NCR5380.h index bea1a3b9b862..6f3d2ac4f185 100644 --- a/drivers/scsi/g_NCR5380.h +++ b/drivers/scsi/g_NCR5380.h @@ -14,81 +14,67 @@ #ifndef GENERIC_NCR5380_H #define GENERIC_NCR5380_H -#ifdef NCR53C400 +#ifdef CONFIG_SCSI_GENERIC_NCR53C400 #define BIOSPARAM #define NCR5380_BIOSPARAM generic_NCR5380_biosparam #else #define NCR5380_BIOSPARAM NULL #endif -#ifndef ASM - -#ifndef CMD_PER_LUN -#define CMD_PER_LUN 2 -#endif - -#ifndef CAN_QUEUE -#define CAN_QUEUE 16 -#endif - 
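The transfer-length policy in generic_NCR5380_dma_xfer_len() above is easiest to check with concrete numbers. A small user-space sketch (the two tests mirror the function; the three sample requests are invented):

#include <stdio.h>

static int dma_xfer_len(int transfersize, int this_residual)
{
	/* clamp big requests to 32K, matching the 128-byte block engine */
	if (transfersize > 32 * 1024 && this_residual &&
	    !(this_residual % transfersize))
		transfersize = 32 * 1024;

	/* non-modulo-128 lengths fall back to PIO (0 means no PDMA) */
	if (transfersize % 128)
		transfersize = 0;

	return transfersize;
}

int main(void)
{
	printf("%d\n", dma_xfer_len(512, 4096));	/* 512: PDMA as-is */
	printf("%d\n", dma_xfer_len(36864, 73728));	/* 32768: clamped */
	printf("%d\n", dma_xfer_len(510, 1020));	/* 0: PIO fallback */
	return 0;
}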
#define __STRVAL(x) #x #define STRVAL(x) __STRVAL(x) #ifndef SCSI_G_NCR5380_MEM +#define DRV_MODULE_NAME "g_NCR5380" -#define NCR5380_map_config port #define NCR5380_map_type int #define NCR5380_map_name port -#define NCR5380_instance_name io_port -#define NCR53C400_register_offset 0 -#define NCR53C400_address_adjust 8 -#ifdef NCR53C400 +#ifdef CONFIG_SCSI_GENERIC_NCR53C400 #define NCR5380_region_size 16 #else #define NCR5380_region_size 8 #endif -#define NCR5380_read(reg) (inb(NCR5380_map_name + (reg))) -#define NCR5380_write(reg, value) (outb((value), (NCR5380_map_name + (reg)))) +#define NCR5380_read(reg) \ + inb(instance->io_port + (reg)) +#define NCR5380_write(reg, value) \ + outb(value, instance->io_port + (reg)) #define NCR5380_implementation_fields \ - NCR5380_map_type NCR5380_map_name - -#define NCR5380_local_declare() \ - register NCR5380_implementation_fields - -#define NCR5380_setup(instance) \ - NCR5380_map_name = (NCR5380_map_type)((instance)->NCR5380_instance_name) + int c400_ctl_status; \ + int c400_blk_cnt; \ + int c400_host_buf; \ + int io_width; #else /* therefore SCSI_G_NCR5380_MEM */ +#define DRV_MODULE_NAME "g_NCR5380_mmio" -#define NCR5380_map_config memory #define NCR5380_map_type unsigned long #define NCR5380_map_name base -#define NCR5380_instance_name base -#define NCR53C400_register_offset 0x108 -#define NCR53C400_address_adjust 0 #define NCR53C400_mem_base 0x3880 #define NCR53C400_host_buffer 0x3900 #define NCR5380_region_size 0x3a00 -#define NCR5380_read(reg) readb(iomem + NCR53C400_mem_base + (reg)) -#define NCR5380_write(reg, value) writeb(value, iomem + NCR53C400_mem_base + (reg)) +#define NCR5380_read(reg) \ + readb(((struct NCR5380_hostdata *)shost_priv(instance))->iomem + \ + NCR53C400_mem_base + (reg)) +#define NCR5380_write(reg, value) \ + writeb(value, ((struct NCR5380_hostdata *)shost_priv(instance))->iomem + \ + NCR53C400_mem_base + (reg)) #define NCR5380_implementation_fields \ - NCR5380_map_type NCR5380_map_name; \ - void __iomem *iomem; - -#define NCR5380_local_declare() \ - register void __iomem *iomem - -#define NCR5380_setup(instance) \ - iomem = (((struct NCR5380_hostdata *)(instance)->hostdata)->iomem) + void __iomem *iomem; \ + int c400_ctl_status; \ + int c400_blk_cnt; \ + int c400_host_buf; #endif +#define NCR5380_dma_xfer_len(instance, cmd, phase) \ + generic_NCR5380_dma_xfer_len(cmd) + #define NCR5380_intr generic_NCR5380_intr #define NCR5380_queue_command generic_NCR5380_queue_command #define NCR5380_abort generic_NCR5380_abort @@ -102,7 +88,7 @@ #define BOARD_NCR53C400 1 #define BOARD_NCR53C400A 2 #define BOARD_DTC3181E 3 +#define BOARD_HP_C2502 4 -#endif /* ndef ASM */ #endif /* GENERIC_NCR5380_H */ diff --git a/drivers/scsi/hisi_sas/hisi_sas_v1_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v1_hw.c index d54381149c0d..057fdeb720ac 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_v1_hw.c +++ b/drivers/scsi/hisi_sas/hisi_sas_v1_hw.c @@ -247,41 +247,36 @@ /* ITCT header */ /* qw0 */ #define ITCT_HDR_DEV_TYPE_OFF 0 -#define ITCT_HDR_DEV_TYPE_MSK (0x3 << ITCT_HDR_DEV_TYPE_OFF) +#define ITCT_HDR_DEV_TYPE_MSK (0x3ULL << ITCT_HDR_DEV_TYPE_OFF) #define ITCT_HDR_VALID_OFF 2 -#define ITCT_HDR_VALID_MSK (0x1 << ITCT_HDR_VALID_OFF) -#define ITCT_HDR_BREAK_REPLY_ENA_OFF 3 -#define ITCT_HDR_BREAK_REPLY_ENA_MSK (0x1 << ITCT_HDR_BREAK_REPLY_ENA_OFF) +#define ITCT_HDR_VALID_MSK (0x1ULL << ITCT_HDR_VALID_OFF) #define ITCT_HDR_AWT_CONTROL_OFF 4 -#define ITCT_HDR_AWT_CONTROL_MSK (0x1 << ITCT_HDR_AWT_CONTROL_OFF) +#define ITCT_HDR_AWT_CONTROL_MSK (0x1ULL << 
ITCT_HDR_AWT_CONTROL_OFF) #define ITCT_HDR_MAX_CONN_RATE_OFF 5 -#define ITCT_HDR_MAX_CONN_RATE_MSK (0xf << ITCT_HDR_MAX_CONN_RATE_OFF) +#define ITCT_HDR_MAX_CONN_RATE_MSK (0xfULL << ITCT_HDR_MAX_CONN_RATE_OFF) #define ITCT_HDR_VALID_LINK_NUM_OFF 9 -#define ITCT_HDR_VALID_LINK_NUM_MSK (0xf << ITCT_HDR_VALID_LINK_NUM_OFF) +#define ITCT_HDR_VALID_LINK_NUM_MSK (0xfULL << ITCT_HDR_VALID_LINK_NUM_OFF) #define ITCT_HDR_PORT_ID_OFF 13 -#define ITCT_HDR_PORT_ID_MSK (0x7 << ITCT_HDR_PORT_ID_OFF) +#define ITCT_HDR_PORT_ID_MSK (0x7ULL << ITCT_HDR_PORT_ID_OFF) #define ITCT_HDR_SMP_TIMEOUT_OFF 16 -#define ITCT_HDR_SMP_TIMEOUT_MSK (0xffff << ITCT_HDR_SMP_TIMEOUT_OFF) -#define ITCT_HDR_MAX_BURST_BYTES_OFF 16 -#define ITCT_HDR_MAX_BURST_BYTES_MSK (0xffffffff << \ - ITCT_MAX_BURST_BYTES_OFF) +#define ITCT_HDR_SMP_TIMEOUT_MSK (0xffffULL << ITCT_HDR_SMP_TIMEOUT_OFF) /* qw1 */ #define ITCT_HDR_MAX_SAS_ADDR_OFF 0 #define ITCT_HDR_MAX_SAS_ADDR_MSK (0xffffffffffffffff << \ ITCT_HDR_MAX_SAS_ADDR_OFF) /* qw2 */ #define ITCT_HDR_IT_NEXUS_LOSS_TL_OFF 0 -#define ITCT_HDR_IT_NEXUS_LOSS_TL_MSK (0xffff << \ +#define ITCT_HDR_IT_NEXUS_LOSS_TL_MSK (0xffffULL << \ ITCT_HDR_IT_NEXUS_LOSS_TL_OFF) #define ITCT_HDR_BUS_INACTIVE_TL_OFF 16 -#define ITCT_HDR_BUS_INACTIVE_TL_MSK (0xffff << \ +#define ITCT_HDR_BUS_INACTIVE_TL_MSK (0xffffULL << \ ITCT_HDR_BUS_INACTIVE_TL_OFF) #define ITCT_HDR_MAX_CONN_TL_OFF 32 -#define ITCT_HDR_MAX_CONN_TL_MSK (0xffff << \ +#define ITCT_HDR_MAX_CONN_TL_MSK (0xffffULL << \ ITCT_HDR_MAX_CONN_TL_OFF) #define ITCT_HDR_REJ_OPEN_TL_OFF 48 -#define ITCT_HDR_REJ_OPEN_TL_MSK (0xffff << \ - ITCT_REJ_OPEN_TL_OFF) +#define ITCT_HDR_REJ_OPEN_TL_MSK (0xffffULL << \ + ITCT_HDR_REJ_OPEN_TL_OFF) /* Err record header */ #define ERR_HDR_DMA_TX_ERR_TYPE_OFF 0 @@ -533,10 +528,10 @@ static void setup_itct_v1_hw(struct hisi_hba *hisi_hba, itct->sas_addr = __swab64(itct->sas_addr); /* qw2 */ - itct->qw2 = cpu_to_le64((500 < ITCT_HDR_IT_NEXUS_LOSS_TL_OFF) | - (0xff00 < ITCT_HDR_BUS_INACTIVE_TL_OFF) | - (0xff00 < ITCT_HDR_MAX_CONN_TL_OFF) | - (0xff00 < ITCT_HDR_REJ_OPEN_TL_OFF)); + itct->qw2 = cpu_to_le64((500ULL << ITCT_HDR_IT_NEXUS_LOSS_TL_OFF) | + (0xff00ULL << ITCT_HDR_BUS_INACTIVE_TL_OFF) | + (0xff00ULL << ITCT_HDR_MAX_CONN_TL_OFF) | + (0xff00ULL << ITCT_HDR_REJ_OPEN_TL_OFF)); } static void free_device_v1_hw(struct hisi_hba *hisi_hba, @@ -544,7 +539,8 @@ static void free_device_v1_hw(struct hisi_hba *hisi_hba, { u64 dev_id = sas_dev->device_id; struct hisi_sas_itct *itct = &hisi_hba->itct[dev_id]; - u32 qw0, reg_val = hisi_sas_read32(hisi_hba, CFG_AGING_TIME); + u64 qw0; + u32 reg_val = hisi_sas_read32(hisi_hba, CFG_AGING_TIME); reg_val |= CFG_AGING_TIME_ITCT_REL_MSK; hisi_sas_write32(hisi_hba, CFG_AGING_TIME, reg_val); diff --git a/drivers/scsi/imm.c b/drivers/scsi/imm.c index 4e1a632ccf16..f8b88fa78e62 100644 --- a/drivers/scsi/imm.c +++ b/drivers/scsi/imm.c @@ -43,6 +43,7 @@ typedef struct { unsigned dp:1; /* Data phase present */ unsigned rd:1; /* Read data in data phase */ unsigned wanted:1; /* Parport sharing busy flag */ + unsigned int dev_no; /* Device number */ wait_queue_head_t *waiting; struct Scsi_Host *host; struct list_head list; @@ -1120,15 +1121,40 @@ static struct scsi_host_template imm_template = { static LIST_HEAD(imm_hosts); +/* + * Finds the first available device number that can be allotted to the + * new imm device and returns the address of the previous node so that + * we can add to the tail and have a list in ascending order.
+ */ + +static inline imm_struct *find_parent(void) +{ + imm_struct *dev, *par = NULL; + unsigned int cnt = 0; + + if (list_empty(&imm_hosts)) + return NULL; + + list_for_each_entry(dev, &imm_hosts, list) { + if (dev->dev_no != cnt) + return par; + cnt++; + par = dev; + } + + return par; +} + static int __imm_attach(struct parport *pb) { struct Scsi_Host *host; - imm_struct *dev; + imm_struct *dev, *temp; DECLARE_WAIT_QUEUE_HEAD_ONSTACK(waiting); DEFINE_WAIT(wait); int ports; int modes, ppb; int err = -ENOMEM; + struct pardev_cb imm_cb; init_waitqueue_head(&waiting); @@ -1141,9 +1167,15 @@ static int __imm_attach(struct parport *pb) dev->mode = IMM_AUTODETECT; INIT_LIST_HEAD(&dev->list); - dev->dev = parport_register_device(pb, "imm", NULL, imm_wakeup, - NULL, 0, dev); + temp = find_parent(); + if (temp) + dev->dev_no = temp->dev_no + 1; + + memset(&imm_cb, 0, sizeof(imm_cb)); + imm_cb.private = dev; + imm_cb.wakeup = imm_wakeup; + dev->dev = parport_register_dev_model(pb, "imm", &imm_cb, dev->dev_no); if (!dev->dev) goto out; @@ -1207,7 +1239,10 @@ static int __imm_attach(struct parport *pb) host->unique_id = pb->number; *(imm_struct **)&host->hostdata = dev; dev->host = host; - list_add_tail(&dev->list, &imm_hosts); + if (!temp) + list_add_tail(&dev->list, &imm_hosts); + else + list_add_tail(&dev->list, &temp->list); err = scsi_add_host(host, NULL); if (err) goto out2; @@ -1245,9 +1280,10 @@ static void imm_detach(struct parport *pb) } static struct parport_driver imm_driver = { - .name = "imm", - .attach = imm_attach, - .detach = imm_detach, + .name = "imm", + .match_port = imm_attach, + .detach = imm_detach, + .devmodel = true, }; static int __init imm_driver_init(void) diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c index 536cd5a80422..3b3e0998fa6e 100644 --- a/drivers/scsi/ipr.c +++ b/drivers/scsi/ipr.c @@ -3638,7 +3638,7 @@ static struct device_attribute ipr_ioa_reset_attr = { .store = ipr_store_reset_adapter }; -static int ipr_iopoll(struct blk_iopoll *iop, int budget); +static int ipr_iopoll(struct irq_poll *iop, int budget); /** * ipr_show_iopoll_weight - Show ipr polling mode * @dev: class device struct @@ -3681,34 +3681,33 @@ static ssize_t ipr_store_iopoll_weight(struct device *dev, int i; if (!ioa_cfg->sis64) { - dev_info(&ioa_cfg->pdev->dev, "blk-iopoll not supported on this adapter\n"); + dev_info(&ioa_cfg->pdev->dev, "irq_poll not supported on this adapter\n"); return -EINVAL; } if (kstrtoul(buf, 10, &user_iopoll_weight)) return -EINVAL; if (user_iopoll_weight > 256) { - dev_info(&ioa_cfg->pdev->dev, "Invalid blk-iopoll weight. It must be less than 256\n"); + dev_info(&ioa_cfg->pdev->dev, "Invalid irq_poll weight. 
It must be less than 256\n"); return -EINVAL; } if (user_iopoll_weight == ioa_cfg->iopoll_weight) { - dev_info(&ioa_cfg->pdev->dev, "Current blk-iopoll weight has the same weight\n"); + dev_info(&ioa_cfg->pdev->dev, "Current irq_poll weight has the same weight\n"); return strlen(buf); } if (ioa_cfg->iopoll_weight && ioa_cfg->sis64 && ioa_cfg->nvectors > 1) { for (i = 1; i < ioa_cfg->hrrq_num; i++) - blk_iopoll_disable(&ioa_cfg->hrrq[i].iopoll); + irq_poll_disable(&ioa_cfg->hrrq[i].iopoll); } spin_lock_irqsave(shost->host_lock, lock_flags); ioa_cfg->iopoll_weight = user_iopoll_weight; if (ioa_cfg->iopoll_weight && ioa_cfg->sis64 && ioa_cfg->nvectors > 1) { for (i = 1; i < ioa_cfg->hrrq_num; i++) { - blk_iopoll_init(&ioa_cfg->hrrq[i].iopoll, + irq_poll_init(&ioa_cfg->hrrq[i].iopoll, ioa_cfg->iopoll_weight, ipr_iopoll); - blk_iopoll_enable(&ioa_cfg->hrrq[i].iopoll); } } spin_unlock_irqrestore(shost->host_lock, lock_flags); @@ -4003,13 +4002,12 @@ static ssize_t ipr_store_update_fw(struct device *dev, struct ipr_sglist *sglist; char fname[100]; char *src; - int len, result, dnld_size; + int result, dnld_size; if (!capable(CAP_SYS_ADMIN)) return -EACCES; - len = snprintf(fname, 99, "%s", buf); - fname[len-1] = '\0'; + snprintf(fname, sizeof(fname), "%s", buf); if (request_firmware(&fw_entry, fname, &ioa_cfg->pdev->dev)) { dev_err(&ioa_cfg->pdev->dev, "Firmware file %s not found\n", fname); @@ -5569,7 +5567,7 @@ static int ipr_process_hrrq(struct ipr_hrr_queue *hrr_queue, int budget, return num_hrrq; } -static int ipr_iopoll(struct blk_iopoll *iop, int budget) +static int ipr_iopoll(struct irq_poll *iop, int budget) { struct ipr_ioa_cfg *ioa_cfg; struct ipr_hrr_queue *hrrq; @@ -5585,7 +5583,7 @@ static int ipr_iopoll(struct blk_iopoll *iop, int budget) completed_ops = ipr_process_hrrq(hrrq, budget, &doneq); if (completed_ops < budget) - blk_iopoll_complete(iop); + irq_poll_complete(iop); spin_unlock_irqrestore(hrrq->lock, hrrq_flags); list_for_each_entry_safe(ipr_cmd, temp, &doneq, queue) { @@ -5693,8 +5691,7 @@ static irqreturn_t ipr_isr_mhrrq(int irq, void *devp) if (ioa_cfg->iopoll_weight && ioa_cfg->sis64 && ioa_cfg->nvectors > 1) { if ((be32_to_cpu(*hrrq->hrrq_curr) & IPR_HRRQ_TOGGLE_BIT) == hrrq->toggle_bit) { - if (!blk_iopoll_sched_prep(&hrrq->iopoll)) - blk_iopoll_sched(&hrrq->iopoll); + irq_poll_sched(&hrrq->iopoll); spin_unlock_irqrestore(hrrq->lock, hrrq_flags); return IRQ_HANDLED; } @@ -10405,9 +10402,8 @@ static int ipr_probe(struct pci_dev *pdev, const struct pci_device_id *dev_id) if (ioa_cfg->iopoll_weight && ioa_cfg->sis64 && ioa_cfg->nvectors > 1) { for (i = 1; i < ioa_cfg->hrrq_num; i++) { - blk_iopoll_init(&ioa_cfg->hrrq[i].iopoll, + irq_poll_init(&ioa_cfg->hrrq[i].iopoll, ioa_cfg->iopoll_weight, ipr_iopoll); - blk_iopoll_enable(&ioa_cfg->hrrq[i].iopoll); } } @@ -10436,7 +10432,7 @@ static void ipr_shutdown(struct pci_dev *pdev) if (ioa_cfg->iopoll_weight && ioa_cfg->sis64 && ioa_cfg->nvectors > 1) { ioa_cfg->iopoll_weight = 0; for (i = 1; i < ioa_cfg->hrrq_num; i++) - blk_iopoll_disable(&ioa_cfg->hrrq[i].iopoll); + irq_poll_disable(&ioa_cfg->hrrq[i].iopoll); } while (ioa_cfg->in_reset_reload) { diff --git a/drivers/scsi/ipr.h b/drivers/scsi/ipr.h index a34c7a5a995e..56c57068300a 100644 --- a/drivers/scsi/ipr.h +++ b/drivers/scsi/ipr.h @@ -32,7 +32,7 @@ #include <linux/libata.h> #include <linux/list.h> #include <linux/kref.h> -#include <linux/blk-iopoll.h> +#include <linux/irq_poll.h> #include <scsi/scsi.h> #include <scsi/scsi_cmnd.h> @@ -517,7 +517,7 @@ struct ipr_hrr_queue 
{ u8 allow_cmds:1; u8 removing_ioa:1; - struct blk_iopoll iopoll; + struct irq_poll iopoll; }; /* Command packet structure */ diff --git a/drivers/scsi/mac_scsi.c b/drivers/scsi/mac_scsi.c index d64a769b8155..bb2381314a2b 100644 --- a/drivers/scsi/mac_scsi.c +++ b/drivers/scsi/mac_scsi.c @@ -12,7 +12,6 @@ */ #include <linux/types.h> -#include <linux/delay.h> #include <linux/module.h> #include <linux/ioport.h> #include <linux/init.h> @@ -32,14 +31,13 @@ #define PSEUDO_DMA #define NCR5380_implementation_fields unsigned char *pdma_base -#define NCR5380_local_declare() struct Scsi_Host *_instance -#define NCR5380_setup(instance) _instance = instance -#define NCR5380_read(reg) macscsi_read(_instance, reg) -#define NCR5380_write(reg, value) macscsi_write(_instance, reg, value) +#define NCR5380_read(reg) macscsi_read(instance, reg) +#define NCR5380_write(reg, value) macscsi_write(instance, reg, value) #define NCR5380_pread macscsi_pread #define NCR5380_pwrite macscsi_pwrite +#define NCR5380_dma_xfer_len(instance, cmd, phase) (cmd->transfersize) #define NCR5380_intr macscsi_intr #define NCR5380_queue_command macscsi_queue_command @@ -51,8 +49,6 @@ #include "NCR5380.h" -#define RESET_BOOT - static int setup_can_queue = -1; module_param(setup_can_queue, int, 0); static int setup_cmd_per_lun = -1; @@ -65,17 +61,8 @@ static int setup_use_tagged_queuing = -1; module_param(setup_use_tagged_queuing, int, 0); static int setup_hostid = -1; module_param(setup_hostid, int, 0); - -/* Time (in jiffies) to wait after a reset; the SCSI standard calls for 250ms, - * we usually do 0.5s to be on the safe side. But Toshiba CD-ROMs once more - * need ten times the standard value... */ -#define TOSHIBA_DELAY - -#ifdef TOSHIBA_DELAY -#define AFTER_RESET_DELAY (5*HZ/2) -#else -#define AFTER_RESET_DELAY (HZ/2) -#endif +static int setup_toshiba_delay = -1; +module_param(setup_toshiba_delay, int, 0); /* * NCR 5380 register access functions @@ -94,12 +81,12 @@ static inline void macscsi_write(struct Scsi_Host *instance, int reg, int value) #ifndef MODULE static int __init mac_scsi_setup(char *str) { - int ints[7]; + int ints[8]; (void)get_options(str, ARRAY_SIZE(ints), ints); - if (ints[0] < 1 || ints[0] > 6) { - pr_err("Usage: mac5380=<can_queue>[,<cmd_per_lun>[,<sg_tablesize>[,<hostid>[,<use_tags>[,<use_pdma>]]]]]\n"); + if (ints[0] < 1) { + pr_err("Usage: mac5380=<can_queue>[,<cmd_per_lun>[,<sg_tablesize>[,<hostid>[,<use_tags>[,<use_pdma>[,<toshiba_delay>]]]]]]\n"); return 0; } if (ints[0] >= 1) @@ -114,50 +101,14 @@ static int __init mac_scsi_setup(char *str) setup_use_tagged_queuing = ints[5]; if (ints[0] >= 6) setup_use_pdma = ints[6]; + if (ints[0] >= 7) + setup_toshiba_delay = ints[7]; return 1; } __setup("mac5380=", mac_scsi_setup); #endif /* !MODULE */ -#ifdef RESET_BOOT -/* - * Our 'bus reset on boot' function - */ - -static void mac_scsi_reset_boot(struct Scsi_Host *instance) -{ - unsigned long end; - - NCR5380_local_declare(); - NCR5380_setup(instance); - - /* - * Do a SCSI reset to clean up the bus during initialization. No messing - * with the queues, interrupts, or locks necessary here. - */ - - printk(KERN_INFO "Macintosh SCSI: resetting the SCSI bus..." ); - - /* get in phase */ - NCR5380_write( TARGET_COMMAND_REG, - PHASE_SR_TO_TCR( NCR5380_read(STATUS_REG) )); - - /* assert RST */ - NCR5380_write( INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_RST ); - /* The min. 
reset hold time is 25us, so 40us should be enough */ - udelay( 50 ); - /* reset RST and interrupt */ - NCR5380_write( INITIATOR_COMMAND_REG, ICR_BASE ); - NCR5380_read( RESET_PARITY_INTERRUPT_REG ); - - for( end = jiffies + AFTER_RESET_DELAY; time_before(jiffies, end); ) - barrier(); - - printk(KERN_INFO " done\n" ); -} -#endif - #ifdef PSEUDO_DMA /* Pseudo-DMA: (Ove Edlund) @@ -235,9 +186,6 @@ static int macscsi_pread(struct Scsi_Host *instance, unsigned char *d; unsigned char *s; - NCR5380_local_declare(); - NCR5380_setup(instance); - s = hostdata->pdma_base + (INPUT_DATA_REG << 4); d = dst; @@ -329,9 +277,6 @@ static int macscsi_pwrite(struct Scsi_Host *instance, unsigned char *s; unsigned char *d; - NCR5380_local_declare(); - NCR5380_setup(instance); - s = src; d = hostdata->pdma_base + (OUTPUT_DATA_REG << 4); @@ -364,20 +309,22 @@ static int macscsi_pwrite(struct Scsi_Host *instance, #define PFX DRV_MODULE_NAME ": " static struct scsi_host_template mac_scsi_template = { - .module = THIS_MODULE, - .proc_name = DRV_MODULE_NAME, - .show_info = macscsi_show_info, - .write_info = macscsi_write_info, - .name = "Macintosh NCR5380 SCSI", - .info = macscsi_info, - .queuecommand = macscsi_queue_command, - .eh_abort_handler = macscsi_abort, - .eh_bus_reset_handler = macscsi_bus_reset, - .can_queue = 16, - .this_id = 7, - .sg_tablesize = SG_ALL, - .cmd_per_lun = 2, - .use_clustering = DISABLE_CLUSTERING + .module = THIS_MODULE, + .proc_name = DRV_MODULE_NAME, + .show_info = macscsi_show_info, + .write_info = macscsi_write_info, + .name = "Macintosh NCR5380 SCSI", + .info = macscsi_info, + .queuecommand = macscsi_queue_command, + .eh_abort_handler = macscsi_abort, + .eh_bus_reset_handler = macscsi_bus_reset, + .can_queue = 16, + .this_id = 7, + .sg_tablesize = SG_ALL, + .cmd_per_lun = 2, + .use_clustering = DISABLE_CLUSTERING, + .cmd_size = NCR5380_CMD_SIZE, + .max_sectors = 128, }; static int __init mac_scsi_probe(struct platform_device *pdev) @@ -432,15 +379,14 @@ static int __init mac_scsi_probe(struct platform_device *pdev) } else host_flags |= FLAG_NO_PSEUDO_DMA; -#ifdef RESET_BOOT - mac_scsi_reset_boot(instance); -#endif - #ifdef SUPPORT_TAGS host_flags |= setup_use_tagged_queuing > 0 ? FLAG_TAGGED_QUEUING : 0; #endif + host_flags |= setup_toshiba_delay > 0 ? FLAG_TOSHIBA_DELAY : 0; - NCR5380_init(instance, host_flags); + error = NCR5380_init(instance, host_flags); + if (error) + goto fail_init; if (instance->irq != NO_IRQ) { error = request_irq(instance->irq, macscsi_intr, IRQF_SHARED, @@ -449,6 +395,8 @@ static int __init mac_scsi_probe(struct platform_device *pdev) goto fail_irq; } + NCR5380_maybe_reset_bus(instance); + error = scsi_add_host(instance, NULL); if (error) goto fail_host; @@ -463,6 +411,7 @@ fail_host: free_irq(instance->irq, instance); fail_irq: NCR5380_exit(instance); +fail_init: scsi_host_put(instance); return error; } diff --git a/drivers/scsi/megaraid/megaraid_mm.c b/drivers/scsi/megaraid/megaraid_mm.c index a70692779a16..4cf9ed96414f 100644 --- a/drivers/scsi/megaraid/megaraid_mm.c +++ b/drivers/scsi/megaraid/megaraid_mm.c @@ -179,8 +179,12 @@ mraid_mm_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) /* * The following call will block till a kioc is available + * or return NULL if the list head is empty for the pointer + * of type mraid_mmapt passed to mraid_mm_alloc_kioc */ kioc = mraid_mm_alloc_kioc(adp); + if (!kioc) + return -ENXIO; /* * User sent the old mimd_t ioctl packet. Convert it to uioc_t. 
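The megaraid_mm hunk just above exists because mraid_mm_alloc_kioc() normally blocks until a kioc frees up but can still return NULL when the adapter's free list head is empty. A reduced sketch of the fixed pattern (the names and the pool stand-in are invented):

#include <errno.h>
#include <stddef.h>
#include <stdio.h>

struct kioc { int opcode; };

/* stand-in for mraid_mm_alloc_kioc(): may return NULL */
static struct kioc *alloc_kioc(int list_empty)
{
	static struct kioc pool;

	return list_empty ? NULL : &pool;
}

static long example_ioctl(int list_empty)
{
	struct kioc *kioc = alloc_kioc(list_empty);

	if (!kioc)		/* the fix: report -ENXIO, don't dereference */
		return -ENXIO;
	kioc->opcode = 0;
	return 0;
}

int main(void)
{
	printf("%ld %ld\n", example_ioctl(0), example_ioctl(1));	/* 0 -6 */
	return 0;
}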
diff --git a/drivers/scsi/pas16.c b/drivers/scsi/pas16.c index e81eadd08afc..512037e27783 100644 --- a/drivers/scsi/pas16.c +++ b/drivers/scsi/pas16.c @@ -1,6 +1,4 @@ #define PSEUDO_DMA -#define UNSAFE /* Not unsafe for PAS16 -- use it */ -#define PDEBUG 0 /* * This driver adapted from Drew Eckhardt's Trantor T128 driver @@ -71,14 +69,10 @@ #include <linux/module.h> -#include <linux/signal.h> -#include <linux/proc_fs.h> #include <asm/io.h> #include <asm/dma.h> #include <linux/blkdev.h> -#include <linux/delay.h> #include <linux/interrupt.h> -#include <linux/stat.h> #include <linux/init.h> #include <scsi/scsi_host.h> @@ -87,8 +81,8 @@ #include "NCR5380.h" -static unsigned short pas16_addr = 0; -static int pas16_irq = 0; +static unsigned short pas16_addr; +static int pas16_irq; static const int scsi_irq_translate[] = @@ -146,22 +140,6 @@ static const unsigned short pas16_offset[ 8 ] = * START_DMA_INITIATOR_RECEIVE_REG wo */ }; -/*----------------------------------------------------------------*/ -/* the following will set the monitor border color (useful to find - where something crashed or gets stuck at */ -/* 1 = blue - 2 = green - 3 = cyan - 4 = red - 5 = magenta - 6 = yellow - 7 = white -*/ -#if 1 -#define rtrc(i) {inb(0x3da); outb(0x31, 0x3c0); outb((i), 0x3c0);} -#else -#define rtrc(i) {} -#endif /* @@ -205,7 +183,7 @@ static void __init outb( 0x01, io_port + P_TIMEOUT_STATUS_REG_OFFSET ); /* Reset TC */ outb( 0x01, io_port + WAIT_STATE ); /* 1 Wait state */ - NCR5380_read( RESET_PARITY_INTERRUPT_REG ); + inb(io_port + pas16_offset[RESET_PARITY_INTERRUPT_REG]); /* Set the SCSI interrupt pointer without mucking up the sound * interrupt pointer in the same byte. @@ -280,13 +258,13 @@ static int __init * put in an additional test to try to weed them out. */ - outb( 0x01, io_port + WAIT_STATE ); /* 1 Wait state */ - NCR5380_write( MODE_REG, 0x20 ); /* Is it really SCSI? */ - if( NCR5380_read( MODE_REG ) != 0x20 ) /* Write to a reg. */ - return 0; /* and try to read */ - NCR5380_write( MODE_REG, 0x00 ); /* it back. */ - if( NCR5380_read( MODE_REG ) != 0x00 ) - return 0; + outb(0x01, io_port + WAIT_STATE); /* 1 Wait state */ + outb(0x20, io_port + pas16_offset[MODE_REG]); /* Is it really SCSI? */ + if (inb(io_port + pas16_offset[MODE_REG]) != 0x20) /* Write to a reg. */ + return 0; /* and try to read */ + outb(0x00, io_port + pas16_offset[MODE_REG]); /* it back. 
*/ + if (inb(io_port + pas16_offset[MODE_REG]) != 0x00) + return 0; return 1; } @@ -305,7 +283,7 @@ static int __init static int __init pas16_setup(char *str) { - static int commandline_current = 0; + static int commandline_current; int i; int ints[10]; @@ -344,8 +322,8 @@ __setup("pas16=", pas16_setup); static int __init pas16_detect(struct scsi_host_template *tpnt) { - static int current_override = 0; - static unsigned short current_base = 0; + static int current_override; + static unsigned short current_base; struct Scsi_Host *instance; unsigned short io_port; int count; @@ -377,34 +355,32 @@ static int __init pas16_detect(struct scsi_host_template *tpnt) } else for (; !io_port && (current_base < NO_BASES); ++current_base) { -#if (PDEBUG & PDEBUG_INIT) - printk("scsi-pas16 : probing io_port %04x\n", (unsigned int) bases[current_base].io_port); -#endif + dprintk(NDEBUG_INIT, "pas16: probing io_port 0x%04x\n", + (unsigned int)bases[current_base].io_port); if ( !bases[current_base].noauto && pas16_hw_detect( current_base ) ){ io_port = bases[current_base].io_port; init_board( io_port, default_irqs[ current_base ], 0 ); -#if (PDEBUG & PDEBUG_INIT) - printk("scsi-pas16 : detected board.\n"); -#endif + dprintk(NDEBUG_INIT, "pas16: detected board\n"); } } - -#if defined(PDEBUG) && (PDEBUG & PDEBUG_INIT) - printk("scsi-pas16 : io_port = %04x\n", (unsigned int) io_port); -#endif + dprintk(NDEBUG_INIT, "pas16: io_port = 0x%04x\n", + (unsigned int)io_port); if (!io_port) break; instance = scsi_register (tpnt, sizeof(struct NCR5380_hostdata)); if(instance == NULL) - break; + goto out; instance->io_port = io_port; - NCR5380_init(instance, 0); + if (NCR5380_init(instance, 0)) + goto out_unregister; + + NCR5380_maybe_reset_bus(instance); if (overrides[current_override].irq != IRQ_AUTO) instance->irq = overrides[current_override].irq; @@ -431,14 +407,18 @@ static int __init pas16_detect(struct scsi_host_template *tpnt) outb( (inb(io_port + IO_CONFIG_3) & 0x0f), io_port + IO_CONFIG_3 ); } -#if defined(PDEBUG) && (PDEBUG & PDEBUG_INIT) - printk("scsi%d : irq = %d\n", instance->host_no, instance->irq); -#endif + dprintk(NDEBUG_INIT, "scsi%d : irq = %d\n", + instance->host_no, instance->irq); ++current_override; ++count; } return count; + +out_unregister: + scsi_unregister(instance); +out: + return count; } /* @@ -561,29 +541,29 @@ static int pas16_release(struct Scsi_Host *shost) if (shost->irq != NO_IRQ) free_irq(shost->irq, shost); NCR5380_exit(shost); - if (shost->io_port && shost->n_io_port) - release_region(shost->io_port, shost->n_io_port); scsi_unregister(shost); return 0; } static struct scsi_host_template driver_template = { - .name = "Pro Audio Spectrum-16 SCSI", - .detect = pas16_detect, - .release = pas16_release, - .proc_name = "pas16", - .show_info = pas16_show_info, - .write_info = pas16_write_info, - .info = pas16_info, - .queuecommand = pas16_queue_command, - .eh_abort_handler = pas16_abort, - .eh_bus_reset_handler = pas16_bus_reset, - .bios_param = pas16_biosparam, - .can_queue = CAN_QUEUE, - .this_id = 7, - .sg_tablesize = SG_ALL, - .cmd_per_lun = CMD_PER_LUN, - .use_clustering = DISABLE_CLUSTERING, + .name = "Pro Audio Spectrum-16 SCSI", + .detect = pas16_detect, + .release = pas16_release, + .proc_name = "pas16", + .show_info = pas16_show_info, + .write_info = pas16_write_info, + .info = pas16_info, + .queuecommand = pas16_queue_command, + .eh_abort_handler = pas16_abort, + .eh_bus_reset_handler = pas16_bus_reset, + .bios_param = pas16_biosparam, + .can_queue = 32, + .this_id = 7, + 
.sg_tablesize = SG_ALL, + .cmd_per_lun = 2, + .use_clustering = DISABLE_CLUSTERING, + .cmd_size = NCR5380_CMD_SIZE, + .max_sectors = 128, }; #include "scsi_module.c" diff --git a/drivers/scsi/pas16.h b/drivers/scsi/pas16.h index c6109c80050b..d37527717225 100644 --- a/drivers/scsi/pas16.h +++ b/drivers/scsi/pas16.h @@ -24,9 +24,6 @@ #ifndef PAS16_H #define PAS16_H -#define PDEBUG_INIT 0x1 -#define PDEBUG_TRANSFER 0x2 - #define PAS16_DEFAULT_BASE_1 0x388 #define PAS16_DEFAULT_BASE_2 0x384 #define PAS16_DEFAULT_BASE_3 0x38c @@ -98,46 +95,16 @@ #define OPERATION_MODE_1 0xec03 #define IO_CONFIG_3 0xf002 +#define NCR5380_implementation_fields /* none */ -#ifndef ASM - -#ifndef CMD_PER_LUN -#define CMD_PER_LUN 2 -#endif - -#ifndef CAN_QUEUE -#define CAN_QUEUE 32 -#endif - -#define NCR5380_implementation_fields \ - volatile unsigned short io_port - -#define NCR5380_local_declare() \ - volatile unsigned short io_port +#define PAS16_io_port(reg) (instance->io_port + pas16_offset[(reg)]) -#define NCR5380_setup(instance) \ - io_port = (instance)->io_port - -#define PAS16_io_port(reg) ( io_port + pas16_offset[(reg)] ) - -#if !(PDEBUG & PDEBUG_TRANSFER) #define NCR5380_read(reg) ( inb(PAS16_io_port(reg)) ) #define NCR5380_write(reg, value) ( outb((value),PAS16_io_port(reg)) ) -#else -#define NCR5380_read(reg) \ - (((unsigned char) printk("scsi%d : read register %d at io_port %04x\n"\ - , instance->hostno, (reg), PAS16_io_port(reg))), inb( PAS16_io_port(reg)) ) - -#define NCR5380_write(reg, value) \ - (printk("scsi%d : write %02x to register %d at io_port %04x\n", \ - instance->hostno, (value), (reg), PAS16_io_port(reg)), \ - outb( (value),PAS16_io_port(reg) ) ) - -#endif +#define NCR5380_dma_xfer_len(instance, cmd, phase) (cmd->transfersize) #define NCR5380_intr pas16_intr -#define do_NCR5380_intr do_pas16_intr #define NCR5380_queue_command pas16_queue_command #define NCR5380_abort pas16_abort #define NCR5380_bus_reset pas16_bus_reset @@ -150,5 +117,4 @@ #define PAS16_IRQS 0xd4a8 -#endif /* ndef ASM */ #endif /* PAS16_H */ diff --git a/drivers/scsi/scsi_devinfo.c b/drivers/scsi/scsi_devinfo.c index 2c1160c7ec92..47b9d13f97b8 100644 --- a/drivers/scsi/scsi_devinfo.c +++ b/drivers/scsi/scsi_devinfo.c @@ -227,6 +227,7 @@ static struct { {"Promise", "VTrak E610f", NULL, BLIST_SPARSELUN | BLIST_NO_RSOC}, {"Promise", "", NULL, BLIST_SPARSELUN}, {"QNAP", "iSCSI Storage", NULL, BLIST_MAX_1024}, + {"SYNOLOGY", "iSCSI Storage", NULL, BLIST_MAX_1024}, {"QUANTUM", "XP34301", "1071", BLIST_NOTQ}, {"REGAL", "CDC-4X", NULL, BLIST_MAX5LUN | BLIST_SINGLELUN}, {"SanDisk", "ImageMate CF-SD1", NULL, BLIST_FORCELUN}, diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c index 41c115c230d9..55627d097873 100644 --- a/drivers/scsi/storvsc_drv.c +++ b/drivers/scsi/storvsc_drv.c @@ -390,7 +390,7 @@ module_param(storvsc_ringbuffer_size, int, S_IRUGO); MODULE_PARM_DESC(storvsc_ringbuffer_size, "Ring buffer size (bytes)"); module_param(storvsc_vcpus_per_sub_channel, int, S_IRUGO); -MODULE_PARM_DESC(vcpus_per_sub_channel, "Ratio of VCPUs to subchannels"); +MODULE_PARM_DESC(storvsc_vcpus_per_sub_channel, "Ratio of VCPUs to subchannels"); /* * Timeout in seconds for all devices managed by this driver. 
*/ diff --git a/drivers/scsi/sun3_scsi.c b/drivers/scsi/sun3_scsi.c index 22a42836d193..b9de487bbd31 100644 --- a/drivers/scsi/sun3_scsi.c +++ b/drivers/scsi/sun3_scsi.c @@ -53,13 +53,12 @@ #define NCR5380_queue_command sun3scsi_queue_command #define NCR5380_bus_reset sun3scsi_bus_reset #define NCR5380_abort sun3scsi_abort -#define NCR5380_show_info sun3scsi_show_info #define NCR5380_info sun3scsi_info #define NCR5380_dma_read_setup(instance, data, count) \ - sun3scsi_dma_setup(data, count, 0) + sun3scsi_dma_setup(instance, data, count, 0) #define NCR5380_dma_write_setup(instance, data, count) \ - sun3scsi_dma_setup(data, count, 1) + sun3scsi_dma_setup(instance, data, count, 1) #define NCR5380_dma_residual(instance) \ sun3scsi_dma_residual(instance) #define NCR5380_dma_xfer_len(instance, cmd, phase) \ @@ -86,10 +85,6 @@ module_param(setup_use_tagged_queuing, int, 0); static int setup_hostid = -1; module_param(setup_hostid, int, 0); -/* #define RESET_BOOT */ - -#define AFTER_RESET_DELAY (HZ/2) - /* ms to wait after hitting dma regs */ #define SUN3_DMA_DELAY 10 @@ -100,11 +95,10 @@ static struct scsi_cmnd *sun3_dma_setup_done; static unsigned char *sun3_scsi_regp; static volatile struct sun3_dma_regs *dregs; static struct sun3_udc_regs *udc_regs; -static unsigned char *sun3_dma_orig_addr = NULL; -static unsigned long sun3_dma_orig_count = 0; -static int sun3_dma_active = 0; -static unsigned long last_residual = 0; -static struct Scsi_Host *default_instance; +static unsigned char *sun3_dma_orig_addr; +static unsigned long sun3_dma_orig_count; +static int sun3_dma_active; +static unsigned long last_residual; /* * NCR 5380 register access functions @@ -144,50 +138,12 @@ static inline void sun3_udc_write(unsigned short val, unsigned char reg) } #endif -#ifdef RESET_BOOT -static void sun3_scsi_reset_boot(struct Scsi_Host *instance) -{ - unsigned long end; - - /* - * Do a SCSI reset to clean up the bus during initialization. No - * messing with the queues, interrupts, or locks necessary here. - */ - - printk( "Sun3 SCSI: resetting the SCSI bus..." ); - - /* switch off SCSI IRQ - catch an interrupt without IRQ bit set else */ -// sun3_disable_irq( IRQ_SUN3_SCSI ); - - /* get in phase */ - NCR5380_write( TARGET_COMMAND_REG, - PHASE_SR_TO_TCR( NCR5380_read(STATUS_REG) )); - - /* assert RST */ - NCR5380_write( INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_RST ); - - /* The min. 
reset hold time is 25us, so 40us should be enough */ - udelay( 50 ); - - /* reset RST and interrupt */ - NCR5380_write( INITIATOR_COMMAND_REG, ICR_BASE ); - NCR5380_read( RESET_PARITY_INTERRUPT_REG ); - - for( end = jiffies + AFTER_RESET_DELAY; time_before(jiffies, end); ) - barrier(); - - /* switch on SCSI IRQ again */ -// sun3_enable_irq( IRQ_SUN3_SCSI ); - - printk( " done\n" ); -} -#endif - // safe bits for the CSR #define CSR_GOOD 0x060f -static irqreturn_t scsi_sun3_intr(int irq, void *dummy) +static irqreturn_t scsi_sun3_intr(int irq, void *dev) { + struct Scsi_Host *instance = dev; unsigned short csr = dregs->csr; int handled = 0; @@ -196,46 +152,24 @@ static irqreturn_t scsi_sun3_intr(int irq, void *dummy) #endif if(csr & ~CSR_GOOD) { - if(csr & CSR_DMA_BUSERR) { - printk("scsi%d: bus error in dma\n", default_instance->host_no); - } - - if(csr & CSR_DMA_CONFLICT) { - printk("scsi%d: dma conflict\n", default_instance->host_no); - } + if (csr & CSR_DMA_BUSERR) + shost_printk(KERN_ERR, instance, "bus error in DMA\n"); + if (csr & CSR_DMA_CONFLICT) + shost_printk(KERN_ERR, instance, "DMA conflict\n"); handled = 1; } if(csr & (CSR_SDB_INT | CSR_DMA_INT)) { - NCR5380_intr(irq, dummy); + NCR5380_intr(irq, dev); handled = 1; } return IRQ_RETVAL(handled); } -/* - * Debug stuff - to be called on NMI, or sysrq key. Use at your own risk; - * reentering NCR5380_print_status seems to have ugly side effects - */ - -/* this doesn't seem to get used at all -- sam */ -#if 0 -void sun3_sun3_debug (void) -{ - unsigned long flags; - - if (default_instance) { - local_irq_save(flags); - NCR5380_print_status(default_instance); - local_irq_restore(flags); - } -} -#endif - - /* sun3scsi_dma_setup() -- initialize the dma controller for a read/write */ -static unsigned long sun3scsi_dma_setup(void *data, unsigned long count, int write_flag) +static unsigned long sun3scsi_dma_setup(struct Scsi_Host *instance, + void *data, unsigned long count, int write_flag) { void *addr; @@ -287,10 +221,9 @@ static unsigned long sun3scsi_dma_setup(void *data, unsigned long count, int wri dregs->csr |= CSR_FIFO; if(dregs->fifo_count != count) { - printk("scsi%d: fifo_mismatch %04x not %04x\n", - default_instance->host_no, dregs->fifo_count, - (unsigned int) count); - NCR5380_dprint(NDEBUG_DMA, default_instance); + shost_printk(KERN_ERR, instance, "FIFO mismatch %04x not %04x\n", + dregs->fifo_count, (unsigned int) count); + NCR5380_dprint(NDEBUG_DMA, instance); } /* setup udc */ @@ -325,21 +258,6 @@ static unsigned long sun3scsi_dma_setup(void *data, unsigned long count, int wri } -#ifndef SUN3_SCSI_VME -static inline unsigned long sun3scsi_dma_count(struct Scsi_Host *instance) -{ - unsigned short resid; - - dregs->udc_addr = 0x32; - udelay(SUN3_DMA_DELAY); - resid = dregs->udc_data; - udelay(SUN3_DMA_DELAY); - resid *= 2; - - return (unsigned long) resid; -} -#endif - static inline unsigned long sun3scsi_dma_residual(struct Scsi_Host *instance) { return last_residual; @@ -437,7 +355,10 @@ static int sun3scsi_dma_finish(int write_flag) } } - count = sun3scsi_dma_count(default_instance); + dregs->udc_addr = 0x32; + udelay(SUN3_DMA_DELAY); + count = 2 * dregs->udc_data; + udelay(SUN3_DMA_DELAY); fifo = dregs->fifo_count; last_residual = fifo; @@ -502,17 +423,17 @@ static int sun3scsi_dma_finish(int write_flag) static struct scsi_host_template sun3_scsi_template = { .module = THIS_MODULE, .proc_name = DRV_MODULE_NAME, - .show_info = sun3scsi_show_info, .name = SUN3_SCSI_NAME, .info = sun3scsi_info, .queuecommand = 
sun3scsi_queue_command, - .eh_abort_handler = sun3scsi_abort, - .eh_bus_reset_handler = sun3scsi_bus_reset, + .eh_abort_handler = sun3scsi_abort, + .eh_bus_reset_handler = sun3scsi_bus_reset, .can_queue = 16, .this_id = 7, .sg_tablesize = SG_NONE, .cmd_per_lun = 2, - .use_clustering = DISABLE_CLUSTERING + .use_clustering = DISABLE_CLUSTERING, + .cmd_size = NCR5380_CMD_SIZE, }; static int __init sun3_scsi_probe(struct platform_device *pdev) @@ -591,7 +512,6 @@ static int __init sun3_scsi_probe(struct platform_device *pdev) error = -ENOMEM; goto fail_alloc; } - default_instance = instance; instance->io_port = (unsigned long)ioaddr; instance->irq = irq->start; @@ -600,7 +520,9 @@ static int __init sun3_scsi_probe(struct platform_device *pdev) host_flags |= setup_use_tagged_queuing > 0 ? FLAG_TAGGED_QUEUING : 0; #endif - NCR5380_init(instance, host_flags); + error = NCR5380_init(instance, host_flags); + if (error) + goto fail_init; error = request_irq(instance->irq, scsi_sun3_intr, 0, "NCR5380", instance); @@ -631,9 +553,7 @@ static int __init sun3_scsi_probe(struct platform_device *pdev) dregs->ivect = VME_DATA24 | (instance->irq & 0xff); #endif -#ifdef RESET_BOOT - sun3_scsi_reset_boot(instance); -#endif + NCR5380_maybe_reset_bus(instance); error = scsi_add_host(instance, NULL); if (error) @@ -649,6 +569,7 @@ fail_host: free_irq(instance->irq, instance); fail_irq: NCR5380_exit(instance); +fail_init: scsi_host_put(instance); fail_alloc: if (udc_regs) diff --git a/drivers/scsi/t128.c b/drivers/scsi/t128.c index 87828acbf7c6..4615fda60dbd 100644 --- a/drivers/scsi/t128.c +++ b/drivers/scsi/t128.c @@ -68,14 +68,11 @@ * 15 9-11 */ -#include <linux/signal.h> #include <linux/io.h> #include <linux/blkdev.h> #include <linux/interrupt.h> -#include <linux/stat.h> #include <linux/init.h> #include <linux/module.h> -#include <linux/delay.h> #include <scsi/scsi_host.h> #include "t128.h" @@ -126,7 +123,7 @@ static struct signature { static int __init t128_setup(char *str) { - static int commandline_current = 0; + static int commandline_current; int i; int ints[10]; @@ -165,7 +162,7 @@ __setup("t128=", t128_setup); static int __init t128_detect(struct scsi_host_template *tpnt) { - static int current_override = 0, current_base = 0; + static int current_override, current_base; struct Scsi_Host *instance; unsigned long base; void __iomem *p; @@ -182,9 +179,8 @@ static int __init t128_detect(struct scsi_host_template *tpnt) base = 0; } else for (; !base && (current_base < NO_BASES); ++current_base) { -#if (TDEBUG & TDEBUG_INIT) - printk("scsi-t128 : probing address %08x\n", bases[current_base].address); -#endif + dprintk(NDEBUG_INIT, "t128: probing address 0x%08x\n", + bases[current_base].address); if (bases[current_base].noauto) continue; p = ioremap(bases[current_base].address, 0x2000); @@ -195,17 +191,13 @@ static int __init t128_detect(struct scsi_host_template *tpnt) signatures[sig].string, strlen(signatures[sig].string))) { base = bases[current_base].address; -#if (TDEBUG & TDEBUG_INIT) - printk("scsi-t128 : detected board.\n"); -#endif + dprintk(NDEBUG_INIT, "t128: detected board\n"); goto found; } iounmap(p); } -#if defined(TDEBUG) && (TDEBUG & TDEBUG_INIT) - printk("scsi-t128 : base = %08x\n", (unsigned int) base); -#endif + dprintk(NDEBUG_INIT, "t128: base = 0x%08x\n", (unsigned int)base); if (!base) break; @@ -213,12 +205,15 @@ static int __init t128_detect(struct scsi_host_template *tpnt) found: instance = scsi_register (tpnt, sizeof(struct NCR5380_hostdata)); if(instance == NULL) - break; - + goto 
out_unmap; + instance->base = base; ((struct NCR5380_hostdata *)instance->hostdata)->base = p; - NCR5380_init(instance, 0); + if (NCR5380_init(instance, 0)) + goto out_unregister; + + NCR5380_maybe_reset_bus(instance); if (overrides[current_override].irq != IRQ_AUTO) instance->irq = overrides[current_override].irq; @@ -242,27 +237,30 @@ found: printk("scsi%d : please jumper the board for a free IRQ.\n", instance->host_no); } -#if defined(TDEBUG) && (TDEBUG & TDEBUG_INIT) - printk("scsi%d : irq = %d\n", instance->host_no, instance->irq); -#endif + dprintk(NDEBUG_INIT, "scsi%d: irq = %d\n", + instance->host_no, instance->irq); ++current_override; ++count; } return count; + +out_unregister: + scsi_unregister(instance); +out_unmap: + iounmap(p); + return count; } static int t128_release(struct Scsi_Host *shost) { - NCR5380_local_declare(); - NCR5380_setup(shost); + struct NCR5380_hostdata *hostdata = shost_priv(shost); + if (shost->irq != NO_IRQ) free_irq(shost->irq, shost); NCR5380_exit(shost); - if (shost->io_port && shost->n_io_port) - release_region(shost->io_port, shost->n_io_port); scsi_unregister(shost); - iounmap(base); + iounmap(hostdata->base); return 0; } @@ -308,14 +306,14 @@ static int t128_biosparam(struct scsi_device *sdev, struct block_device *bdev, * timeout. */ -static inline int NCR5380_pread (struct Scsi_Host *instance, unsigned char *dst, - int len) { - NCR5380_local_declare(); - void __iomem *reg; +static inline int +NCR5380_pread(struct Scsi_Host *instance, unsigned char *dst, int len) +{ + struct NCR5380_hostdata *hostdata = shost_priv(instance); + void __iomem *reg, *base = hostdata->base; unsigned char *d = dst; register int i = len; - NCR5380_setup(instance); reg = base + T_DATA_REG_OFFSET; #if 0 @@ -354,14 +352,14 @@ static inline int NCR5380_pread (struct Scsi_Host *instance, unsigned char *dst, * timeout. 
*/ -static inline int NCR5380_pwrite (struct Scsi_Host *instance, unsigned char *src, - int len) { - NCR5380_local_declare(); - void __iomem *reg; +static inline int +NCR5380_pwrite(struct Scsi_Host *instance, unsigned char *src, int len) +{ + struct NCR5380_hostdata *hostdata = shost_priv(instance); + void __iomem *reg, *base = hostdata->base; unsigned char *s = src; register int i = len; - NCR5380_setup(instance); reg = base + T_DATA_REG_OFFSET; #if 0 @@ -392,21 +390,23 @@ MODULE_LICENSE("GPL"); #include "NCR5380.c" static struct scsi_host_template driver_template = { - .name = "Trantor T128/T128F/T228", - .detect = t128_detect, - .release = t128_release, - .proc_name = "t128", - .show_info = t128_show_info, - .write_info = t128_write_info, - .info = t128_info, - .queuecommand = t128_queue_command, - .eh_abort_handler = t128_abort, - .eh_bus_reset_handler = t128_bus_reset, - .bios_param = t128_biosparam, - .can_queue = CAN_QUEUE, - .this_id = 7, - .sg_tablesize = SG_ALL, - .cmd_per_lun = CMD_PER_LUN, - .use_clustering = DISABLE_CLUSTERING, + .name = "Trantor T128/T128F/T228", + .detect = t128_detect, + .release = t128_release, + .proc_name = "t128", + .show_info = t128_show_info, + .write_info = t128_write_info, + .info = t128_info, + .queuecommand = t128_queue_command, + .eh_abort_handler = t128_abort, + .eh_bus_reset_handler = t128_bus_reset, + .bios_param = t128_biosparam, + .can_queue = 32, + .this_id = 7, + .sg_tablesize = SG_ALL, + .cmd_per_lun = 2, + .use_clustering = DISABLE_CLUSTERING, + .cmd_size = NCR5380_CMD_SIZE, + .max_sectors = 128, }; #include "scsi_module.c" diff --git a/drivers/scsi/t128.h b/drivers/scsi/t128.h index 2c7371454dfd..dd16d85497e1 100644 --- a/drivers/scsi/t128.h +++ b/drivers/scsi/t128.h @@ -23,10 +23,6 @@ #ifndef T128_H #define T128_H -#define TDEBUG 0 -#define TDEBUG_INIT 0x1 -#define TDEBUG_TRANSFER 0x2 - /* * The trantor boards are memory mapped. They use an NCR5380 or * equivalent (my sample board had part second sourced from ZILOG). 
@@ -71,44 +67,18 @@ #define T_DATA_REG_OFFSET 0x1e00 /* rw 512 bytes long */ -#ifndef ASM - -#ifndef CMD_PER_LUN -#define CMD_PER_LUN 2 -#endif - -#ifndef CAN_QUEUE -#define CAN_QUEUE 32 -#endif - #define NCR5380_implementation_fields \ void __iomem *base -#define NCR5380_local_declare() \ - void __iomem *base - -#define NCR5380_setup(instance) \ - base = ((struct NCR5380_hostdata *)(instance->hostdata))->base +#define T128_address(reg) \ + (((struct NCR5380_hostdata *)shost_priv(instance))->base + T_5380_OFFSET + ((reg) * 0x20)) -#define T128_address(reg) (base + T_5380_OFFSET + ((reg) * 0x20)) - -#if !(TDEBUG & TDEBUG_TRANSFER) #define NCR5380_read(reg) readb(T128_address(reg)) #define NCR5380_write(reg, value) writeb((value),(T128_address(reg))) -#else -#define NCR5380_read(reg) \ - (((unsigned char) printk("scsi%d : read register %d at address %08x\n"\ - , instance->hostno, (reg), T128_address(reg))), readb(T128_address(reg))) - -#define NCR5380_write(reg, value) { \ - printk("scsi%d : write %02x to register %d at address %08x\n", \ - instance->hostno, (value), (reg), T128_address(reg)); \ - writeb((value), (T128_address(reg))); \ -} -#endif + +#define NCR5380_dma_xfer_len(instance, cmd, phase) (cmd->transfersize) #define NCR5380_intr t128_intr -#define do_NCR5380_intr do_t128_intr #define NCR5380_queue_command t128_queue_command #define NCR5380_abort t128_abort #define NCR5380_bus_reset t128_bus_reset @@ -121,5 +91,4 @@ #define T128_IRQS 0xc4a8 -#endif /* ndef ASM */ #endif /* T128_H */ diff --git a/drivers/soc/Kconfig b/drivers/soc/Kconfig index fb2b3935b658..88260205a261 100644 --- a/drivers/soc/Kconfig +++ b/drivers/soc/Kconfig @@ -7,6 +7,7 @@ source "drivers/soc/mediatek/Kconfig" source "drivers/soc/qcom/Kconfig" source "drivers/soc/rockchip/Kconfig" source "drivers/soc/sunxi/Kconfig" +source "drivers/soc/tegra/Kconfig" source "drivers/soc/ti/Kconfig" source "drivers/soc/versatile/Kconfig" diff --git a/drivers/soc/qcom/spm.c b/drivers/soc/qcom/spm.c index 0ad66fa9bb1a..5548a31e1a39 100644 --- a/drivers/soc/qcom/spm.c +++ b/drivers/soc/qcom/spm.c @@ -288,7 +288,7 @@ static struct spm_driver_data *spm_get_drv(struct platform_device *pdev, struct spm_driver_data *drv = NULL; struct device_node *cpu_node, *saw_node; int cpu; - bool found; + bool found = 0; for_each_possible_cpu(cpu) { cpu_node = of_cpu_device_node_get(cpu); diff --git a/drivers/soc/tegra/Kconfig b/drivers/soc/tegra/Kconfig new file mode 100644 index 000000000000..d0c3c3e085e3 --- /dev/null +++ b/drivers/soc/tegra/Kconfig @@ -0,0 +1,83 @@ +if ARCH_TEGRA + +# 32-bit ARM SoCs +if ARM + +config ARCH_TEGRA_2x_SOC + bool "Enable support for Tegra20 family" + select ARCH_NEEDS_CPU_IDLE_COUPLED if SMP + select ARM_ERRATA_720789 + select ARM_ERRATA_754327 if SMP + select ARM_ERRATA_764369 if SMP + select PINCTRL_TEGRA20 + select PL310_ERRATA_727915 if CACHE_L2X0 + select PL310_ERRATA_769419 if CACHE_L2X0 + select TEGRA_TIMER + help + Support for NVIDIA Tegra AP20 and T20 processors, based on the + ARM CortexA9MP CPU and the ARM PL310 L2 cache controller + +config ARCH_TEGRA_3x_SOC + bool "Enable support for Tegra30 family" + select ARM_ERRATA_754322 + select ARM_ERRATA_764369 if SMP + select PINCTRL_TEGRA30 + select PL310_ERRATA_769419 if CACHE_L2X0 + select TEGRA_TIMER + help + Support for NVIDIA Tegra T30 processor family, based on the + ARM CortexA9MP CPU and the ARM PL310 L2 cache controller + +config ARCH_TEGRA_114_SOC + bool "Enable support for Tegra114 family" + select ARM_ERRATA_798181 if SMP + select 
ARM_L1_CACHE_SHIFT_6 + select HAVE_ARM_ARCH_TIMER + select PINCTRL_TEGRA114 + select TEGRA_TIMER + help + Support for NVIDIA Tegra T114 processor family, based on the + ARM CortexA15MP CPU + +config ARCH_TEGRA_124_SOC + bool "Enable support for Tegra124 family" + select ARM_L1_CACHE_SHIFT_6 + select HAVE_ARM_ARCH_TIMER + select PINCTRL_TEGRA124 + select TEGRA_TIMER + help + Support for NVIDIA Tegra T124 processor family, based on the + ARM CortexA15MP CPU + +endif + +# 64-bit ARM SoCs +if ARM64 + +config ARCH_TEGRA_132_SOC + bool "NVIDIA Tegra132 SoC" + select PINCTRL_TEGRA124 + help + Enable support for NVIDIA Tegra132 SoC, based on the Denver + ARMv8 CPU. The Tegra132 SoC is similar to the Tegra124 SoC, + but contains an NVIDIA Denver CPU complex in place of + Tegra124's "4+1" Cortex-A15 CPU complex. + +config ARCH_TEGRA_210_SOC + bool "NVIDIA Tegra210 SoC" + select PINCTRL_TEGRA210 + help + Enable support for the NVIDIA Tegra210 SoC. Also known as Tegra X1, + the Tegra210 has four Cortex-A57 cores paired with four Cortex-A53 + cores in a switched configuration. It features a GPU of the Maxwell + architecture with support for DX11, SM4, OpenGL 4.5, OpenGL ES 3.1 + and providing 256 CUDA cores. It supports hardware-accelerated en- + and decoding of various video standards including H.265, H.264 and + VP8 at 4K resolution and up to 60 fps. + + Besides the multimedia features it also comes with a variety of I/O + controllers, such as GPIO, I2C, SPI, SDHCI, PCIe, SATA and XHCI, to + name only a few. + +endif +endif diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h index d6273e143324..a80d993b882e 100644 --- a/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h +++ b/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h @@ -151,16 +151,12 @@ do { \ #define LIBCFS_FREE(ptr, size) \ do { \ - int s = (size); \ if (unlikely((ptr) == NULL)) { \ CERROR("LIBCFS: free NULL '" #ptr "' (%d bytes) at " \ - "%s:%d\n", s, __FILE__, __LINE__); \ + "%s:%d\n", (int)(size), __FILE__, __LINE__); \ break; \ } \ - if (unlikely(s > LIBCFS_VMALLOC_SIZE)) \ - vfree(ptr); \ - else \ - kfree(ptr); \ + kvfree(ptr); \ } while (0) /******************************************************************************/ diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c index 72af486b65df..cb74ae731b95 100644 --- a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c +++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c @@ -2070,32 +2070,13 @@ static int kiblnd_net_init_pools(kib_net_t *net, __u32 *cpts, int ncpts) static int kiblnd_hdev_get_attr(kib_hca_dev_t *hdev) { - struct ib_device_attr *attr; - int rc; - /* It's safe to assume a HCA can handle a page size * matching that of the native system */ hdev->ibh_page_shift = PAGE_SHIFT; hdev->ibh_page_size = 1 << PAGE_SHIFT; hdev->ibh_page_mask = ~((__u64)hdev->ibh_page_size - 1); - LIBCFS_ALLOC(attr, sizeof(*attr)); - if (attr == NULL) { - CERROR("Out of memory\n"); - return -ENOMEM; - } - - rc = ib_query_device(hdev->ibh_ibdev, attr); - if (rc == 0) - hdev->ibh_mr_size = attr->max_mr_size; - - LIBCFS_FREE(attr, sizeof(*attr)); - - if (rc != 0) { - CERROR("Failed to query IB device: %d\n", rc); - return rc; - } - + hdev->ibh_mr_size = hdev->ibh_ibdev->attrs.max_mr_size; if (hdev->ibh_mr_size == ~0ULL) { hdev->ibh_mr_shift = 64; return 0; diff --git a/drivers/staging/lustre/lustre/llite/dir.c 
b/drivers/staging/lustre/lustre/llite/dir.c index 7b355319079c..8982f7d1b374 100644 --- a/drivers/staging/lustre/lustre/llite/dir.c +++ b/drivers/staging/lustre/lustre/llite/dir.c @@ -1858,7 +1858,7 @@ static loff_t ll_dir_seek(struct file *file, loff_t offset, int origin) int api32 = ll_need_32bit_api(sbi); loff_t ret = -EINVAL; - mutex_lock(&inode->i_mutex); + inode_lock(inode); switch (origin) { case SEEK_SET: break; @@ -1896,7 +1896,7 @@ static loff_t ll_dir_seek(struct file *file, loff_t offset, int origin) goto out; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/drivers/staging/lustre/lustre/llite/file.c b/drivers/staging/lustre/lustre/llite/file.c index c92d58b770ec..39e2ffd5f97f 100644 --- a/drivers/staging/lustre/lustre/llite/file.c +++ b/drivers/staging/lustre/lustre/llite/file.c @@ -2082,17 +2082,17 @@ putgl: /* update time if requested */ rc = 0; if (llss->ia2.ia_valid != 0) { - mutex_lock(&llss->inode1->i_mutex); + inode_lock(llss->inode1); rc = ll_setattr(file1->f_path.dentry, &llss->ia2); - mutex_unlock(&llss->inode1->i_mutex); + inode_unlock(llss->inode1); } if (llss->ia1.ia_valid != 0) { int rc1; - mutex_lock(&llss->inode2->i_mutex); + inode_lock(llss->inode2); rc1 = ll_setattr(file2->f_path.dentry, &llss->ia1); - mutex_unlock(&llss->inode2->i_mutex); + inode_unlock(llss->inode2); if (rc == 0) rc = rc1; } @@ -2179,13 +2179,13 @@ static int ll_hsm_import(struct inode *inode, struct file *file, ATTR_MTIME | ATTR_MTIME_SET | ATTR_ATIME | ATTR_ATIME_SET; - mutex_lock(&inode->i_mutex); + inode_lock(inode); rc = ll_setattr_raw(file->f_path.dentry, attr, true); if (rc == -ENODATA) rc = 0; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); kfree(attr); free_hss: @@ -2609,7 +2609,7 @@ int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync) ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1); rc = filemap_write_and_wait_range(inode->i_mapping, start, end); - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* catch async errors that were recorded back when async writeback * failed for pages in this mapping. 
*/ @@ -2641,7 +2641,7 @@ int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync) fd->fd_write_failed = false; } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return rc; } diff --git a/drivers/staging/lustre/lustre/llite/llite_internal.h b/drivers/staging/lustre/lustre/llite/llite_internal.h index ee8a1d67d191..845e992ca5fc 100644 --- a/drivers/staging/lustre/lustre/llite/llite_internal.h +++ b/drivers/staging/lustre/lustre/llite/llite_internal.h @@ -631,8 +631,6 @@ struct ll_file_data { struct lov_stripe_md; -extern spinlock_t inode_lock; - extern struct dentry *llite_root; extern struct kset *llite_kset; diff --git a/drivers/staging/lustre/lustre/llite/llite_lib.c b/drivers/staging/lustre/lustre/llite/llite_lib.c index 1db93af62bad..b2fc5b3786ee 100644 --- a/drivers/staging/lustre/lustre/llite/llite_lib.c +++ b/drivers/staging/lustre/lustre/llite/llite_lib.c @@ -1277,7 +1277,7 @@ int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, bool hsm_import) return -ENOMEM; if (!S_ISDIR(inode->i_mode)) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); memcpy(&op_data->op_attr, attr, sizeof(*attr)); @@ -1358,7 +1358,7 @@ out: ll_finish_md_op_data(op_data); if (!S_ISDIR(inode->i_mode)) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); if ((attr->ia_valid & ATTR_SIZE) && !hsm_import) inode_dio_wait(inode); } diff --git a/drivers/staging/lustre/lustre/llite/llite_nfs.c b/drivers/staging/lustre/lustre/llite/llite_nfs.c index e578a1130ad1..18aab25f9cd9 100644 --- a/drivers/staging/lustre/lustre/llite/llite_nfs.c +++ b/drivers/staging/lustre/lustre/llite/llite_nfs.c @@ -245,9 +245,9 @@ static int ll_get_name(struct dentry *dentry, char *name, goto out; } - mutex_lock(&dir->i_mutex); + inode_lock(dir); rc = ll_dir_read(dir, &lgd.ctx); - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); if (!rc && !lgd.lgd_found) rc = -ENOENT; out: diff --git a/drivers/staging/lustre/lustre/llite/lloop.c b/drivers/staging/lustre/lustre/llite/lloop.c index 420d39123877..871924b3f2e7 100644 --- a/drivers/staging/lustre/lustre/llite/lloop.c +++ b/drivers/staging/lustre/lustre/llite/lloop.c @@ -257,9 +257,9 @@ static int do_bio_lustrebacked(struct lloop_device *lo, struct bio *head) * be asked to write less pages once, this purely depends on * implementation. Anyway, we should be careful to avoid deadlocking. */ - mutex_lock(&inode->i_mutex); + inode_lock(inode); bytes = ll_direct_rw_pages(env, io, rw, inode, pvec); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); cl_io_fini(env, io); return (bytes == pvec->ldp_size) ? 0 : (int)bytes; } diff --git a/drivers/staging/lustre/lustre/llite/rw.c b/drivers/staging/lustre/lustre/llite/rw.c index 95cdb0c58b04..f355474967d6 100644 --- a/drivers/staging/lustre/lustre/llite/rw.c +++ b/drivers/staging/lustre/lustre/llite/rw.c @@ -115,8 +115,8 @@ static struct ll_cl_context *ll_cl_init(struct file *file, struct inode *inode = vmpage->mapping->host; loff_t pos; - if (mutex_trylock(&inode->i_mutex)) { - mutex_unlock(&(inode)->i_mutex); + if (inode_trylock(inode)) { + inode_unlock((inode)); /* this is too bad. Someone is trying to write the * page w/o holding inode mutex. This means we can diff --git a/drivers/staging/lustre/lustre/llite/rw26.c b/drivers/staging/lustre/lustre/llite/rw26.c index 39fa13b74cbd..711fda93a58d 100644 --- a/drivers/staging/lustre/lustre/llite/rw26.c +++ b/drivers/staging/lustre/lustre/llite/rw26.c @@ -403,7 +403,7 @@ static ssize_t ll_direct_IO_26(struct kiocb *iocb, struct iov_iter *iter, * 1. 
Need inode mutex to operate transient pages. */ if (iov_iter_rw(iter) == READ) - mutex_lock(&inode->i_mutex); + inode_lock(inode); LASSERT(obj->cob_transient_pages == 0); while (iov_iter_count(iter)) { @@ -454,7 +454,7 @@ static ssize_t ll_direct_IO_26(struct kiocb *iocb, struct iov_iter *iter, out: LASSERT(obj->cob_transient_pages == 0); if (iov_iter_rw(iter) == READ) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (tot_bytes > 0) { if (iov_iter_rw(iter) == WRITE) { diff --git a/drivers/staging/lustre/lustre/llite/vvp_io.c b/drivers/staging/lustre/lustre/llite/vvp_io.c index f68e972886ca..0920ac6b3003 100644 --- a/drivers/staging/lustre/lustre/llite/vvp_io.c +++ b/drivers/staging/lustre/lustre/llite/vvp_io.c @@ -439,7 +439,7 @@ static int vvp_io_setattr_start(const struct lu_env *env, struct inode *inode = ccc_object_inode(io->ci_obj); int result = 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (cl_io_is_trunc(io)) result = vvp_io_setattr_trunc(env, ios, inode, io->u.ci_setattr.sa_attr.lvb_size); @@ -459,7 +459,7 @@ static void vvp_io_setattr_end(const struct lu_env *env, * because osc has already notified to destroy osc_extents. */ vvp_do_vmtruncate(inode, io->u.ci_setattr.sa_attr.lvb_size); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } static void vvp_io_setattr_fini(const struct lu_env *env, diff --git a/drivers/staging/lustre/lustre/llite/vvp_page.c b/drivers/staging/lustre/lustre/llite/vvp_page.c index 99c0d7aee921..a133475a7c74 100644 --- a/drivers/staging/lustre/lustre/llite/vvp_page.c +++ b/drivers/staging/lustre/lustre/llite/vvp_page.c @@ -428,7 +428,7 @@ static void vvp_transient_page_verify(const struct cl_page *page) { struct inode *inode = ccc_object_inode(page->cp_obj); - LASSERT(!mutex_trylock(&inode->i_mutex)); + LASSERT(!inode_trylock(inode)); } static int vvp_transient_page_own(const struct lu_env *env, @@ -480,9 +480,9 @@ static int vvp_transient_page_is_vmlocked(const struct lu_env *env, struct inode *inode = ccc_object_inode(slice->cpl_obj); int locked; - locked = !mutex_trylock(&inode->i_mutex); + locked = !inode_trylock(inode); if (!locked) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return locked ? 
-EBUSY : -ENODATA; } @@ -502,7 +502,7 @@ static void vvp_transient_page_fini(const struct lu_env *env, struct ccc_object *clobj = cl2ccc(clp->cp_obj); vvp_page_fini_common(cp); - LASSERT(!mutex_trylock(&clobj->cob_inode->i_mutex)); + LASSERT(!inode_trylock(clobj->cob_inode)); clobj->cob_transient_pages--; } @@ -548,7 +548,7 @@ int vvp_page_init(const struct lu_env *env, struct cl_object *obj, } else { struct ccc_object *clobj = cl2ccc(obj); - LASSERT(!mutex_trylock(&clobj->cob_inode->i_mutex)); + LASSERT(!inode_trylock(clobj->cob_inode)); cl_page_slice_add(page, &cpg->cpg_cl, obj, &vvp_transient_page_ops); clobj->cob_transient_pages++; diff --git a/drivers/staging/rdma/amso1100/c2_cq.c b/drivers/staging/rdma/amso1100/c2_cq.c index 3ef881f2da0f..7ad0c082485a 100644 --- a/drivers/staging/rdma/amso1100/c2_cq.c +++ b/drivers/staging/rdma/amso1100/c2_cq.c @@ -173,9 +173,6 @@ static inline int c2_poll_one(struct c2_dev *c2dev, case C2_WR_TYPE_RDMA_READ: entry->opcode = IB_WC_RDMA_READ; break; - case C2_WR_TYPE_BIND_MW: - entry->opcode = IB_WC_BIND_MW; - break; case C2_WR_TYPE_RECV: entry->byte_len = be32_to_cpu(ce->bytes_rcvd); entry->opcode = IB_WC_RECV; diff --git a/drivers/staging/rdma/amso1100/c2_provider.c b/drivers/staging/rdma/amso1100/c2_provider.c index a092ac743c72..de8d10e1bde3 100644 --- a/drivers/staging/rdma/amso1100/c2_provider.c +++ b/drivers/staging/rdma/amso1100/c2_provider.c @@ -337,43 +337,21 @@ static inline u32 c2_convert_access(int acc) C2_ACF_LOCAL_READ | C2_ACF_WINDOW_BIND; } -static struct ib_mr *c2_reg_phys_mr(struct ib_pd *ib_pd, - struct ib_phys_buf *buffer_list, - int num_phys_buf, int acc, u64 * iova_start) +static struct ib_mr *c2_get_dma_mr(struct ib_pd *pd, int acc) { struct c2_mr *mr; u64 *page_list; - u32 total_len; - int err, i, j, k, page_shift, pbl_depth; + const u32 total_len = 0xffffffff; /* AMSO1100 limit */ + int err, page_shift, pbl_depth, i; + u64 kva = 0; - pbl_depth = 0; - total_len = 0; + pr_debug("%s:%u\n", __func__, __LINE__); - page_shift = PAGE_SHIFT; /* - * If there is only 1 buffer we assume this could - * be a map of all phy mem...use a 32k page_shift. + * This is a map of all phy mem...use a 32k page_shift. 
*/ - if (num_phys_buf == 1) - page_shift += 3; - - for (i = 0; i < num_phys_buf; i++) { - - if (offset_in_page(buffer_list[i].addr)) { - pr_debug("Unaligned Memory Buffer: 0x%x\n", - (unsigned int) buffer_list[i].addr); - return ERR_PTR(-EINVAL); - } - - if (!buffer_list[i].size) { - pr_debug("Invalid Buffer Size\n"); - return ERR_PTR(-EINVAL); - } - - total_len += buffer_list[i].size; - pbl_depth += ALIGN(buffer_list[i].size, - BIT(page_shift)) >> page_shift; - } + page_shift = PAGE_SHIFT + 3; + pbl_depth = ALIGN(total_len, BIT(page_shift)) >> page_shift; page_list = vmalloc(sizeof(u64) * pbl_depth); if (!page_list) { @@ -382,16 +360,8 @@ static struct ib_mr *c2_reg_phys_mr(struct ib_pd *ib_pd, return ERR_PTR(-ENOMEM); } - for (i = 0, j = 0; i < num_phys_buf; i++) { - - int naddrs; - - naddrs = ALIGN(buffer_list[i].size, - BIT(page_shift)) >> page_shift; - for (k = 0; k < naddrs; k++) - page_list[j++] = (buffer_list[i].addr + - (k << page_shift)); - } + for (i = 0; i < pbl_depth; i++) + page_list[i] = (i << page_shift); mr = kmalloc(sizeof(*mr), GFP_KERNEL); if (!mr) { @@ -399,17 +369,17 @@ static struct ib_mr *c2_reg_phys_mr(struct ib_pd *ib_pd, return ERR_PTR(-ENOMEM); } - mr->pd = to_c2pd(ib_pd); + mr->pd = to_c2pd(pd); mr->umem = NULL; pr_debug("%s - page shift %d, pbl_depth %d, total_len %u, " "*iova_start %llx, first pa %llx, last pa %llx\n", __func__, page_shift, pbl_depth, total_len, - (unsigned long long) *iova_start, + (unsigned long long) kva, (unsigned long long) page_list[0], (unsigned long long) page_list[pbl_depth-1]); - err = c2_nsmr_register_phys_kern(to_c2dev(ib_pd->device), page_list, + err = c2_nsmr_register_phys_kern(to_c2dev(pd->device), page_list, BIT(page_shift), pbl_depth, - total_len, 0, iova_start, + total_len, 0, &kva, c2_convert_access(acc), mr); vfree(page_list); if (err) { @@ -420,19 +390,6 @@ static struct ib_mr *c2_reg_phys_mr(struct ib_pd *ib_pd, return &mr->ibmr; } -static struct ib_mr *c2_get_dma_mr(struct ib_pd *pd, int acc) -{ - struct ib_phys_buf bl; - u64 kva = 0; - - pr_debug("%s:%u\n", __func__, __LINE__); - - /* AMSO1100 limit */ - bl.size = 0xffffffff; - bl.addr = 0; - return c2_reg_phys_mr(pd, &bl, 1, acc, &kva); -} - static struct ib_mr *c2_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt, int acc, struct ib_udata *udata) { @@ -840,7 +797,6 @@ int c2_register_device(struct c2_dev *dev) dev->ibdev.destroy_cq = c2_destroy_cq; dev->ibdev.poll_cq = c2_poll_cq; dev->ibdev.get_dma_mr = c2_get_dma_mr; - dev->ibdev.reg_phys_mr = c2_reg_phys_mr; dev->ibdev.reg_user_mr = c2_reg_user_mr; dev->ibdev.dereg_mr = c2_dereg_mr; dev->ibdev.get_port_immutable = c2_port_immutable; diff --git a/drivers/staging/rdma/ehca/ehca_classes.h b/drivers/staging/rdma/ehca/ehca_classes.h index bd45e0f3923f..e8c3387d7aaa 100644 --- a/drivers/staging/rdma/ehca/ehca_classes.h +++ b/drivers/staging/rdma/ehca/ehca_classes.h @@ -316,9 +316,8 @@ struct ehca_mr_pginfo { union { struct { /* type EHCA_MR_PGI_PHYS section */ - int num_phys_buf; - struct ib_phys_buf *phys_buf_array; - u64 next_buf; + u64 addr; + u16 size; } phy; struct { /* type EHCA_MR_PGI_USER section */ struct ib_umem *region; diff --git a/drivers/staging/rdma/ehca/ehca_iverbs.h b/drivers/staging/rdma/ehca/ehca_iverbs.h index 80e6a3d5df3e..cca5933fcda6 100644 --- a/drivers/staging/rdma/ehca/ehca_iverbs.h +++ b/drivers/staging/rdma/ehca/ehca_iverbs.h @@ -80,30 +80,14 @@ int ehca_destroy_ah(struct ib_ah *ah); struct ib_mr *ehca_get_dma_mr(struct ib_pd *pd, int mr_access_flags); -struct ib_mr 
*ehca_reg_phys_mr(struct ib_pd *pd, - struct ib_phys_buf *phys_buf_array, - int num_phys_buf, - int mr_access_flags, u64 *iova_start); - struct ib_mr *ehca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt, int mr_access_flags, struct ib_udata *udata); -int ehca_rereg_phys_mr(struct ib_mr *mr, - int mr_rereg_mask, - struct ib_pd *pd, - struct ib_phys_buf *phys_buf_array, - int num_phys_buf, int mr_access_flags, u64 *iova_start); - -int ehca_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr); - int ehca_dereg_mr(struct ib_mr *mr); struct ib_mw *ehca_alloc_mw(struct ib_pd *pd, enum ib_mw_type type); -int ehca_bind_mw(struct ib_qp *qp, struct ib_mw *mw, - struct ib_mw_bind *mw_bind); - int ehca_dealloc_mw(struct ib_mw *mw); struct ib_fmr *ehca_alloc_fmr(struct ib_pd *pd, diff --git a/drivers/staging/rdma/ehca/ehca_main.c b/drivers/staging/rdma/ehca/ehca_main.c index 860b974e9faa..832f22f40862 100644 --- a/drivers/staging/rdma/ehca/ehca_main.c +++ b/drivers/staging/rdma/ehca/ehca_main.c @@ -511,13 +511,9 @@ static int ehca_init_device(struct ehca_shca *shca) shca->ib_device.req_notify_cq = ehca_req_notify_cq; /* shca->ib_device.req_ncomp_notif = ehca_req_ncomp_notif; */ shca->ib_device.get_dma_mr = ehca_get_dma_mr; - shca->ib_device.reg_phys_mr = ehca_reg_phys_mr; shca->ib_device.reg_user_mr = ehca_reg_user_mr; - shca->ib_device.query_mr = ehca_query_mr; shca->ib_device.dereg_mr = ehca_dereg_mr; - shca->ib_device.rereg_phys_mr = ehca_rereg_phys_mr; shca->ib_device.alloc_mw = ehca_alloc_mw; - shca->ib_device.bind_mw = ehca_bind_mw; shca->ib_device.dealloc_mw = ehca_dealloc_mw; shca->ib_device.alloc_fmr = ehca_alloc_fmr; shca->ib_device.map_phys_fmr = ehca_map_phys_fmr; diff --git a/drivers/staging/rdma/ehca/ehca_mrmw.c b/drivers/staging/rdma/ehca/ehca_mrmw.c index 553e883a5718..3367205e3160 100644 --- a/drivers/staging/rdma/ehca/ehca_mrmw.c +++ b/drivers/staging/rdma/ehca/ehca_mrmw.c @@ -196,120 +196,6 @@ get_dma_mr_exit0: /*----------------------------------------------------------------------*/ -struct ib_mr *ehca_reg_phys_mr(struct ib_pd *pd, - struct ib_phys_buf *phys_buf_array, - int num_phys_buf, - int mr_access_flags, - u64 *iova_start) -{ - struct ib_mr *ib_mr; - int ret; - struct ehca_mr *e_mr; - struct ehca_shca *shca = - container_of(pd->device, struct ehca_shca, ib_device); - struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd); - - u64 size; - - if ((num_phys_buf <= 0) || !phys_buf_array) { - ehca_err(pd->device, "bad input values: num_phys_buf=%x " - "phys_buf_array=%p", num_phys_buf, phys_buf_array); - ib_mr = ERR_PTR(-EINVAL); - goto reg_phys_mr_exit0; - } - if (((mr_access_flags & IB_ACCESS_REMOTE_WRITE) && - !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)) || - ((mr_access_flags & IB_ACCESS_REMOTE_ATOMIC) && - !(mr_access_flags & IB_ACCESS_LOCAL_WRITE))) { - /* - * Remote Write Access requires Local Write Access - * Remote Atomic Access requires Local Write Access - */ - ehca_err(pd->device, "bad input values: mr_access_flags=%x", - mr_access_flags); - ib_mr = ERR_PTR(-EINVAL); - goto reg_phys_mr_exit0; - } - - /* check physical buffer list and calculate size */ - ret = ehca_mr_chk_buf_and_calc_size(phys_buf_array, num_phys_buf, - iova_start, &size); - if (ret) { - ib_mr = ERR_PTR(ret); - goto reg_phys_mr_exit0; - } - if ((size == 0) || - (((u64)iova_start + size) < (u64)iova_start)) { - ehca_err(pd->device, "bad input values: size=%llx iova_start=%p", - size, iova_start); - ib_mr = ERR_PTR(-EINVAL); - goto reg_phys_mr_exit0; - } - - e_mr = ehca_mr_new(); 
- if (!e_mr) { - ehca_err(pd->device, "out of memory"); - ib_mr = ERR_PTR(-ENOMEM); - goto reg_phys_mr_exit0; - } - - /* register MR on HCA */ - if (ehca_mr_is_maxmr(size, iova_start)) { - e_mr->flags |= EHCA_MR_FLAG_MAXMR; - ret = ehca_reg_maxmr(shca, e_mr, iova_start, mr_access_flags, - e_pd, &e_mr->ib.ib_mr.lkey, - &e_mr->ib.ib_mr.rkey); - if (ret) { - ib_mr = ERR_PTR(ret); - goto reg_phys_mr_exit1; - } - } else { - struct ehca_mr_pginfo pginfo; - u32 num_kpages; - u32 num_hwpages; - u64 hw_pgsize; - - num_kpages = NUM_CHUNKS(((u64)iova_start % PAGE_SIZE) + size, - PAGE_SIZE); - /* for kernel space we try most possible pgsize */ - hw_pgsize = ehca_get_max_hwpage_size(shca); - num_hwpages = NUM_CHUNKS(((u64)iova_start % hw_pgsize) + size, - hw_pgsize); - memset(&pginfo, 0, sizeof(pginfo)); - pginfo.type = EHCA_MR_PGI_PHYS; - pginfo.num_kpages = num_kpages; - pginfo.hwpage_size = hw_pgsize; - pginfo.num_hwpages = num_hwpages; - pginfo.u.phy.num_phys_buf = num_phys_buf; - pginfo.u.phy.phys_buf_array = phys_buf_array; - pginfo.next_hwpage = - ((u64)iova_start & ~PAGE_MASK) / hw_pgsize; - - ret = ehca_reg_mr(shca, e_mr, iova_start, size, mr_access_flags, - e_pd, &pginfo, &e_mr->ib.ib_mr.lkey, - &e_mr->ib.ib_mr.rkey, EHCA_REG_MR); - if (ret) { - ib_mr = ERR_PTR(ret); - goto reg_phys_mr_exit1; - } - } - - /* successful registration of all pages */ - return &e_mr->ib.ib_mr; - -reg_phys_mr_exit1: - ehca_mr_delete(e_mr); -reg_phys_mr_exit0: - if (IS_ERR(ib_mr)) - ehca_err(pd->device, "h_ret=%li pd=%p phys_buf_array=%p " - "num_phys_buf=%x mr_access_flags=%x iova_start=%p", - PTR_ERR(ib_mr), pd, phys_buf_array, - num_phys_buf, mr_access_flags, iova_start); - return ib_mr; -} /* end ehca_reg_phys_mr() */ - -/*----------------------------------------------------------------------*/ - struct ib_mr *ehca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt, int mr_access_flags, struct ib_udata *udata) @@ -437,207 +323,6 @@ reg_user_mr_exit0: /*----------------------------------------------------------------------*/ -int ehca_rereg_phys_mr(struct ib_mr *mr, - int mr_rereg_mask, - struct ib_pd *pd, - struct ib_phys_buf *phys_buf_array, - int num_phys_buf, - int mr_access_flags, - u64 *iova_start) -{ - int ret; - - struct ehca_shca *shca = - container_of(mr->device, struct ehca_shca, ib_device); - struct ehca_mr *e_mr = container_of(mr, struct ehca_mr, ib.ib_mr); - u64 new_size; - u64 *new_start; - u32 new_acl; - struct ehca_pd *new_pd; - u32 tmp_lkey, tmp_rkey; - unsigned long sl_flags; - u32 num_kpages = 0; - u32 num_hwpages = 0; - struct ehca_mr_pginfo pginfo; - - if (!(mr_rereg_mask & IB_MR_REREG_TRANS)) { - /* TODO not supported, because PHYP rereg hCall needs pages */ - ehca_err(mr->device, "rereg without IB_MR_REREG_TRANS not " - "supported yet, mr_rereg_mask=%x", mr_rereg_mask); - ret = -EINVAL; - goto rereg_phys_mr_exit0; - } - - if (mr_rereg_mask & IB_MR_REREG_PD) { - if (!pd) { - ehca_err(mr->device, "rereg with bad pd, pd=%p " - "mr_rereg_mask=%x", pd, mr_rereg_mask); - ret = -EINVAL; - goto rereg_phys_mr_exit0; - } - } - - if ((mr_rereg_mask & - ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS)) || - (mr_rereg_mask == 0)) { - ret = -EINVAL; - goto rereg_phys_mr_exit0; - } - - /* check other parameters */ - if (e_mr == shca->maxmr) { - /* should be impossible, however reject to be sure */ - ehca_err(mr->device, "rereg internal max-MR impossible, mr=%p " - "shca->maxmr=%p mr->lkey=%x", - mr, shca->maxmr, mr->lkey); - ret = -EINVAL; - goto rereg_phys_mr_exit0; - } - if 
(mr_rereg_mask & IB_MR_REREG_TRANS) { /* transl., i.e. addr/size */ - if (e_mr->flags & EHCA_MR_FLAG_FMR) { - ehca_err(mr->device, "not supported for FMR, mr=%p " - "flags=%x", mr, e_mr->flags); - ret = -EINVAL; - goto rereg_phys_mr_exit0; - } - if (!phys_buf_array || num_phys_buf <= 0) { - ehca_err(mr->device, "bad input values mr_rereg_mask=%x" - " phys_buf_array=%p num_phys_buf=%x", - mr_rereg_mask, phys_buf_array, num_phys_buf); - ret = -EINVAL; - goto rereg_phys_mr_exit0; - } - } - if ((mr_rereg_mask & IB_MR_REREG_ACCESS) && /* change ACL */ - (((mr_access_flags & IB_ACCESS_REMOTE_WRITE) && - !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)) || - ((mr_access_flags & IB_ACCESS_REMOTE_ATOMIC) && - !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)))) { - /* - * Remote Write Access requires Local Write Access - * Remote Atomic Access requires Local Write Access - */ - ehca_err(mr->device, "bad input values: mr_rereg_mask=%x " - "mr_access_flags=%x", mr_rereg_mask, mr_access_flags); - ret = -EINVAL; - goto rereg_phys_mr_exit0; - } - - /* set requested values dependent on rereg request */ - spin_lock_irqsave(&e_mr->mrlock, sl_flags); - new_start = e_mr->start; - new_size = e_mr->size; - new_acl = e_mr->acl; - new_pd = container_of(mr->pd, struct ehca_pd, ib_pd); - - if (mr_rereg_mask & IB_MR_REREG_TRANS) { - u64 hw_pgsize = ehca_get_max_hwpage_size(shca); - - new_start = iova_start; /* change address */ - /* check physical buffer list and calculate size */ - ret = ehca_mr_chk_buf_and_calc_size(phys_buf_array, - num_phys_buf, iova_start, - &new_size); - if (ret) - goto rereg_phys_mr_exit1; - if ((new_size == 0) || - (((u64)iova_start + new_size) < (u64)iova_start)) { - ehca_err(mr->device, "bad input values: new_size=%llx " - "iova_start=%p", new_size, iova_start); - ret = -EINVAL; - goto rereg_phys_mr_exit1; - } - num_kpages = NUM_CHUNKS(((u64)new_start % PAGE_SIZE) + - new_size, PAGE_SIZE); - num_hwpages = NUM_CHUNKS(((u64)new_start % hw_pgsize) + - new_size, hw_pgsize); - memset(&pginfo, 0, sizeof(pginfo)); - pginfo.type = EHCA_MR_PGI_PHYS; - pginfo.num_kpages = num_kpages; - pginfo.hwpage_size = hw_pgsize; - pginfo.num_hwpages = num_hwpages; - pginfo.u.phy.num_phys_buf = num_phys_buf; - pginfo.u.phy.phys_buf_array = phys_buf_array; - pginfo.next_hwpage = - ((u64)iova_start & ~PAGE_MASK) / hw_pgsize; - } - if (mr_rereg_mask & IB_MR_REREG_ACCESS) - new_acl = mr_access_flags; - if (mr_rereg_mask & IB_MR_REREG_PD) - new_pd = container_of(pd, struct ehca_pd, ib_pd); - - ret = ehca_rereg_mr(shca, e_mr, new_start, new_size, new_acl, - new_pd, &pginfo, &tmp_lkey, &tmp_rkey); - if (ret) - goto rereg_phys_mr_exit1; - - /* successful reregistration */ - if (mr_rereg_mask & IB_MR_REREG_PD) - mr->pd = pd; - mr->lkey = tmp_lkey; - mr->rkey = tmp_rkey; - -rereg_phys_mr_exit1: - spin_unlock_irqrestore(&e_mr->mrlock, sl_flags); -rereg_phys_mr_exit0: - if (ret) - ehca_err(mr->device, "ret=%i mr=%p mr_rereg_mask=%x pd=%p " - "phys_buf_array=%p num_phys_buf=%x mr_access_flags=%x " - "iova_start=%p", - ret, mr, mr_rereg_mask, pd, phys_buf_array, - num_phys_buf, mr_access_flags, iova_start); - return ret; -} /* end ehca_rereg_phys_mr() */ - -/*----------------------------------------------------------------------*/ - -int ehca_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr) -{ - int ret = 0; - u64 h_ret; - struct ehca_shca *shca = - container_of(mr->device, struct ehca_shca, ib_device); - struct ehca_mr *e_mr = container_of(mr, struct ehca_mr, ib.ib_mr); - unsigned long sl_flags; - struct ehca_mr_hipzout_parms 
hipzout; - - if ((e_mr->flags & EHCA_MR_FLAG_FMR)) { - ehca_err(mr->device, "not supported for FMR, mr=%p e_mr=%p " - "e_mr->flags=%x", mr, e_mr, e_mr->flags); - ret = -EINVAL; - goto query_mr_exit0; - } - - memset(mr_attr, 0, sizeof(struct ib_mr_attr)); - spin_lock_irqsave(&e_mr->mrlock, sl_flags); - - h_ret = hipz_h_query_mr(shca->ipz_hca_handle, e_mr, &hipzout); - if (h_ret != H_SUCCESS) { - ehca_err(mr->device, "hipz_mr_query failed, h_ret=%lli mr=%p " - "hca_hndl=%llx mr_hndl=%llx lkey=%x", - h_ret, mr, shca->ipz_hca_handle.handle, - e_mr->ipz_mr_handle.handle, mr->lkey); - ret = ehca2ib_return_code(h_ret); - goto query_mr_exit1; - } - mr_attr->pd = mr->pd; - mr_attr->device_virt_addr = hipzout.vaddr; - mr_attr->size = hipzout.len; - mr_attr->lkey = hipzout.lkey; - mr_attr->rkey = hipzout.rkey; - ehca_mrmw_reverse_map_acl(&hipzout.acl, &mr_attr->mr_access_flags); - -query_mr_exit1: - spin_unlock_irqrestore(&e_mr->mrlock, sl_flags); -query_mr_exit0: - if (ret) - ehca_err(mr->device, "ret=%i mr=%p mr_attr=%p", - ret, mr, mr_attr); - return ret; -} /* end ehca_query_mr() */ - -/*----------------------------------------------------------------------*/ - int ehca_dereg_mr(struct ib_mr *mr) { int ret = 0; @@ -728,18 +413,6 @@ alloc_mw_exit0: /*----------------------------------------------------------------------*/ -int ehca_bind_mw(struct ib_qp *qp, - struct ib_mw *mw, - struct ib_mw_bind *mw_bind) -{ - /* TODO: not supported up to now */ - ehca_gen_err("bind MW currently not supported by HCAD"); - - return -EPERM; -} /* end ehca_bind_mw() */ - -/*----------------------------------------------------------------------*/ - int ehca_dealloc_mw(struct ib_mw *mw) { u64 h_ret; @@ -1616,7 +1289,6 @@ int ehca_reg_internal_maxmr( u64 *iova_start; u64 size_maxmr; struct ehca_mr_pginfo pginfo; - struct ib_phys_buf ib_pbuf; u32 num_kpages; u32 num_hwpages; u64 hw_pgsize; @@ -1637,8 +1309,6 @@ int ehca_reg_internal_maxmr( /* register internal max-MR on HCA */ size_maxmr = ehca_mr_len; iova_start = (u64 *)ehca_map_vaddr((void *)(KERNELBASE + PHYSICAL_START)); - ib_pbuf.addr = 0; - ib_pbuf.size = size_maxmr; num_kpages = NUM_CHUNKS(((u64)iova_start % PAGE_SIZE) + size_maxmr, PAGE_SIZE); hw_pgsize = ehca_get_max_hwpage_size(shca); @@ -1650,8 +1320,8 @@ int ehca_reg_internal_maxmr( pginfo.num_kpages = num_kpages; pginfo.num_hwpages = num_hwpages; pginfo.hwpage_size = hw_pgsize; - pginfo.u.phy.num_phys_buf = 1; - pginfo.u.phy.phys_buf_array = &ib_pbuf; + pginfo.u.phy.addr = 0; + pginfo.u.phy.size = size_maxmr; ret = ehca_reg_mr(shca, e_mr, iova_start, size_maxmr, 0, e_pd, &pginfo, &e_mr->ib.ib_mr.lkey, @@ -1669,7 +1339,6 @@ int ehca_reg_internal_maxmr( e_mr->ib.ib_mr.pd = &e_pd->ib_pd; e_mr->ib.ib_mr.uobject = NULL; atomic_inc(&(e_pd->ib_pd.usecnt)); - atomic_set(&(e_mr->ib.ib_mr.usecnt), 0); *e_maxmr = e_mr; return 0; @@ -1762,61 +1431,6 @@ ehca_dereg_internal_maxmr_exit0: /*----------------------------------------------------------------------*/ -/* - * check physical buffer array of MR verbs for validness and - * calculates MR size - */ -int ehca_mr_chk_buf_and_calc_size(struct ib_phys_buf *phys_buf_array, - int num_phys_buf, - u64 *iova_start, - u64 *size) -{ - struct ib_phys_buf *pbuf = phys_buf_array; - u64 size_count = 0; - u32 i; - - if (num_phys_buf == 0) { - ehca_gen_err("bad phys buf array len, num_phys_buf=0"); - return -EINVAL; - } - /* check first buffer */ - if (((u64)iova_start & ~PAGE_MASK) != (pbuf->addr & ~PAGE_MASK)) { - ehca_gen_err("iova_start/addr mismatch, iova_start=%p " - 
"pbuf->addr=%llx pbuf->size=%llx", - iova_start, pbuf->addr, pbuf->size); - return -EINVAL; - } - if (((pbuf->addr + pbuf->size) % PAGE_SIZE) && - (num_phys_buf > 1)) { - ehca_gen_err("addr/size mismatch in 1st buf, pbuf->addr=%llx " - "pbuf->size=%llx", pbuf->addr, pbuf->size); - return -EINVAL; - } - - for (i = 0; i < num_phys_buf; i++) { - if ((i > 0) && (pbuf->addr % PAGE_SIZE)) { - ehca_gen_err("bad address, i=%x pbuf->addr=%llx " - "pbuf->size=%llx", - i, pbuf->addr, pbuf->size); - return -EINVAL; - } - if (((i > 0) && /* not 1st */ - (i < (num_phys_buf - 1)) && /* not last */ - (pbuf->size % PAGE_SIZE)) || (pbuf->size == 0)) { - ehca_gen_err("bad size, i=%x pbuf->size=%llx", - i, pbuf->size); - return -EINVAL; - } - size_count += pbuf->size; - pbuf++; - } - - *size = size_count; - return 0; -} /* end ehca_mr_chk_buf_and_calc_size() */ - -/*----------------------------------------------------------------------*/ - /* check page list of map FMR verb for validness */ int ehca_fmr_check_page_list(struct ehca_mr *e_fmr, u64 *page_list, @@ -2002,57 +1616,54 @@ static int ehca_set_pagebuf_phys(struct ehca_mr_pginfo *pginfo, u32 number, u64 *kpage) { int ret = 0; - struct ib_phys_buf *pbuf; + u64 addr = pginfo->u.phy.addr; + u64 size = pginfo->u.phy.size; u64 num_hw, offs_hw; u32 i = 0; - /* loop over desired phys_buf_array entries */ - while (i < number) { - pbuf = pginfo->u.phy.phys_buf_array + pginfo->u.phy.next_buf; - num_hw = NUM_CHUNKS((pbuf->addr % pginfo->hwpage_size) + - pbuf->size, pginfo->hwpage_size); - offs_hw = (pbuf->addr & ~(pginfo->hwpage_size - 1)) / - pginfo->hwpage_size; - while (pginfo->next_hwpage < offs_hw + num_hw) { - /* sanity check */ - if ((pginfo->kpage_cnt >= pginfo->num_kpages) || - (pginfo->hwpage_cnt >= pginfo->num_hwpages)) { - ehca_gen_err("kpage_cnt >= num_kpages, " - "kpage_cnt=%llx num_kpages=%llx " - "hwpage_cnt=%llx " - "num_hwpages=%llx i=%x", - pginfo->kpage_cnt, - pginfo->num_kpages, - pginfo->hwpage_cnt, - pginfo->num_hwpages, i); - return -EFAULT; - } - *kpage = (pbuf->addr & ~(pginfo->hwpage_size - 1)) + - (pginfo->next_hwpage * pginfo->hwpage_size); - if ( !(*kpage) && pbuf->addr ) { - ehca_gen_err("pbuf->addr=%llx pbuf->size=%llx " - "next_hwpage=%llx", pbuf->addr, - pbuf->size, pginfo->next_hwpage); - return -EFAULT; - } - (pginfo->hwpage_cnt)++; - (pginfo->next_hwpage)++; - if (PAGE_SIZE >= pginfo->hwpage_size) { - if (pginfo->next_hwpage % - (PAGE_SIZE / pginfo->hwpage_size) == 0) - (pginfo->kpage_cnt)++; - } else - pginfo->kpage_cnt += pginfo->hwpage_size / - PAGE_SIZE; - kpage++; - i++; - if (i >= number) break; + num_hw = NUM_CHUNKS((addr % pginfo->hwpage_size) + size, + pginfo->hwpage_size); + offs_hw = (addr & ~(pginfo->hwpage_size - 1)) / pginfo->hwpage_size; + + while (pginfo->next_hwpage < offs_hw + num_hw) { + /* sanity check */ + if ((pginfo->kpage_cnt >= pginfo->num_kpages) || + (pginfo->hwpage_cnt >= pginfo->num_hwpages)) { + ehca_gen_err("kpage_cnt >= num_kpages, " + "kpage_cnt=%llx num_kpages=%llx " + "hwpage_cnt=%llx " + "num_hwpages=%llx i=%x", + pginfo->kpage_cnt, + pginfo->num_kpages, + pginfo->hwpage_cnt, + pginfo->num_hwpages, i); + return -EFAULT; } - if (pginfo->next_hwpage >= offs_hw + num_hw) { - (pginfo->u.phy.next_buf)++; - pginfo->next_hwpage = 0; + *kpage = (addr & ~(pginfo->hwpage_size - 1)) + + (pginfo->next_hwpage * pginfo->hwpage_size); + if ( !(*kpage) && addr ) { + ehca_gen_err("addr=%llx size=%llx " + "next_hwpage=%llx", addr, + size, pginfo->next_hwpage); + return -EFAULT; } + (pginfo->hwpage_cnt)++; + 
(pginfo->next_hwpage)++; + if (PAGE_SIZE >= pginfo->hwpage_size) { + if (pginfo->next_hwpage % + (PAGE_SIZE / pginfo->hwpage_size) == 0) + (pginfo->kpage_cnt)++; + } else + pginfo->kpage_cnt += pginfo->hwpage_size / + PAGE_SIZE; + kpage++; + i++; + if (i >= number) break; } + if (pginfo->next_hwpage >= offs_hw + num_hw) { + pginfo->next_hwpage = 0; + } + return ret; } diff --git a/drivers/staging/rdma/ehca/ehca_mrmw.h b/drivers/staging/rdma/ehca/ehca_mrmw.h index 50d8b51306dd..52bfa95697f7 100644 --- a/drivers/staging/rdma/ehca/ehca_mrmw.h +++ b/drivers/staging/rdma/ehca/ehca_mrmw.h @@ -98,11 +98,6 @@ int ehca_reg_maxmr(struct ehca_shca *shca, int ehca_dereg_internal_maxmr(struct ehca_shca *shca); -int ehca_mr_chk_buf_and_calc_size(struct ib_phys_buf *phys_buf_array, - int num_phys_buf, - u64 *iova_start, - u64 *size); - int ehca_fmr_check_page_list(struct ehca_mr *e_fmr, u64 *page_list, int list_len); diff --git a/drivers/staging/rdma/ehca/ehca_reqs.c b/drivers/staging/rdma/ehca/ehca_reqs.c index 10e2074384f5..11813b880e16 100644 --- a/drivers/staging/rdma/ehca/ehca_reqs.c +++ b/drivers/staging/rdma/ehca/ehca_reqs.c @@ -614,7 +614,6 @@ int ehca_post_srq_recv(struct ib_srq *srq, static const u8 ib_wc_opcode[255] = { [0x01] = IB_WC_RECV+1, [0x02] = IB_WC_RECV_RDMA_WITH_IMM+1, - [0x04] = IB_WC_BIND_MW+1, [0x08] = IB_WC_FETCH_ADD+1, [0x10] = IB_WC_COMP_SWAP+1, [0x20] = IB_WC_RDMA_WRITE+1, diff --git a/drivers/staging/rdma/hfi1/mr.c b/drivers/staging/rdma/hfi1/mr.c index 568f185a022d..a3f8b884fdd6 100644 --- a/drivers/staging/rdma/hfi1/mr.c +++ b/drivers/staging/rdma/hfi1/mr.c @@ -167,10 +167,7 @@ static struct hfi1_mr *alloc_mr(int count, struct ib_pd *pd) rval = init_mregion(&mr->mr, pd, count); if (rval) goto bail; - /* - * ib_reg_phys_mr() will initialize mr->ibmr except for - * lkey and rkey. - */ + rval = hfi1_alloc_lkey(&mr->mr, 0); if (rval) goto bail_mregion; @@ -188,52 +185,6 @@ bail: } /** - * hfi1_reg_phys_mr - register a physical memory region - * @pd: protection domain for this memory region - * @buffer_list: pointer to the list of physical buffers to register - * @num_phys_buf: the number of physical buffers to register - * @iova_start: the starting address passed over IB which maps to this MR - * - * Returns the memory region on success, otherwise returns an errno. 
- */ -struct ib_mr *hfi1_reg_phys_mr(struct ib_pd *pd, - struct ib_phys_buf *buffer_list, - int num_phys_buf, int acc, u64 *iova_start) -{ - struct hfi1_mr *mr; - int n, m, i; - struct ib_mr *ret; - - mr = alloc_mr(num_phys_buf, pd); - if (IS_ERR(mr)) { - ret = (struct ib_mr *)mr; - goto bail; - } - - mr->mr.user_base = *iova_start; - mr->mr.iova = *iova_start; - mr->mr.access_flags = acc; - - m = 0; - n = 0; - for (i = 0; i < num_phys_buf; i++) { - mr->mr.map[m]->segs[n].vaddr = (void *) buffer_list[i].addr; - mr->mr.map[m]->segs[n].length = buffer_list[i].size; - mr->mr.length += buffer_list[i].size; - n++; - if (n == HFI1_SEGSZ) { - m++; - n = 0; - } - } - - ret = &mr->ibmr; - -bail: - return ret; -} - -/** * hfi1_reg_user_mr - register a userspace memory region * @pd: protection domain for this memory region * @start: starting userspace address diff --git a/drivers/staging/rdma/hfi1/verbs.c b/drivers/staging/rdma/hfi1/verbs.c index ef0feaa684a4..09b8d412ee90 100644 --- a/drivers/staging/rdma/hfi1/verbs.c +++ b/drivers/staging/rdma/hfi1/verbs.c @@ -2052,7 +2052,6 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd) ibdev->poll_cq = hfi1_poll_cq; ibdev->req_notify_cq = hfi1_req_notify_cq; ibdev->get_dma_mr = hfi1_get_dma_mr; - ibdev->reg_phys_mr = hfi1_reg_phys_mr; ibdev->reg_user_mr = hfi1_reg_user_mr; ibdev->dereg_mr = hfi1_dereg_mr; ibdev->alloc_mr = hfi1_alloc_mr; diff --git a/drivers/staging/rdma/hfi1/verbs.h b/drivers/staging/rdma/hfi1/verbs.h index 72106e5362b9..286e468b0479 100644 --- a/drivers/staging/rdma/hfi1/verbs.h +++ b/drivers/staging/rdma/hfi1/verbs.h @@ -1024,10 +1024,6 @@ int hfi1_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata); struct ib_mr *hfi1_get_dma_mr(struct ib_pd *pd, int acc); -struct ib_mr *hfi1_reg_phys_mr(struct ib_pd *pd, - struct ib_phys_buf *buffer_list, - int num_phys_buf, int acc, u64 *iova_start); - struct ib_mr *hfi1_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int mr_access_flags, struct ib_udata *udata); diff --git a/drivers/staging/rdma/ipath/ipath_fs.c b/drivers/staging/rdma/ipath/ipath_fs.c index 796af6867007..476fcdf05acb 100644 --- a/drivers/staging/rdma/ipath/ipath_fs.c +++ b/drivers/staging/rdma/ipath/ipath_fs.c @@ -82,14 +82,14 @@ static int create_file(const char *name, umode_t mode, { int error; - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); *dentry = lookup_one_len(name, parent, strlen(name)); if (!IS_ERR(*dentry)) error = ipathfs_mknod(d_inode(parent), *dentry, mode, fops, data); else error = PTR_ERR(*dentry); - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); return error; } @@ -295,7 +295,7 @@ static int remove_device_files(struct super_block *sb, int ret; root = dget(sb->s_root); - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); snprintf(unit, sizeof unit, "%02d", dd->ipath_unit); dir = lookup_one_len(unit, root, strlen(unit)); @@ -311,7 +311,7 @@ static int remove_device_files(struct super_block *sb, ret = simple_rmdir(d_inode(root), dir); bail: - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); dput(root); return ret; } diff --git a/drivers/staging/rdma/ipath/ipath_mr.c b/drivers/staging/rdma/ipath/ipath_mr.c index c7278f6a8217..b76b0ce66709 100644 --- a/drivers/staging/rdma/ipath/ipath_mr.c +++ b/drivers/staging/rdma/ipath/ipath_mr.c @@ -98,10 +98,6 @@ static struct ipath_mr *alloc_mr(int count, } mr->mr.mapsz = m; - /* - * ib_reg_phys_mr() will initialize mr->ibmr except for - * lkey and rkey. 
- */ if (!ipath_alloc_lkey(lk_table, &mr->mr)) goto bail; mr->ibmr.rkey = mr->ibmr.lkey = mr->mr.lkey; @@ -121,57 +117,6 @@ done: } /** - * ipath_reg_phys_mr - register a physical memory region - * @pd: protection domain for this memory region - * @buffer_list: pointer to the list of physical buffers to register - * @num_phys_buf: the number of physical buffers to register - * @iova_start: the starting address passed over IB which maps to this MR - * - * Returns the memory region on success, otherwise returns an errno. - */ -struct ib_mr *ipath_reg_phys_mr(struct ib_pd *pd, - struct ib_phys_buf *buffer_list, - int num_phys_buf, int acc, u64 *iova_start) -{ - struct ipath_mr *mr; - int n, m, i; - struct ib_mr *ret; - - mr = alloc_mr(num_phys_buf, &to_idev(pd->device)->lk_table); - if (mr == NULL) { - ret = ERR_PTR(-ENOMEM); - goto bail; - } - - mr->mr.pd = pd; - mr->mr.user_base = *iova_start; - mr->mr.iova = *iova_start; - mr->mr.length = 0; - mr->mr.offset = 0; - mr->mr.access_flags = acc; - mr->mr.max_segs = num_phys_buf; - mr->umem = NULL; - - m = 0; - n = 0; - for (i = 0; i < num_phys_buf; i++) { - mr->mr.map[m]->segs[n].vaddr = (void *) buffer_list[i].addr; - mr->mr.map[m]->segs[n].length = buffer_list[i].size; - mr->mr.length += buffer_list[i].size; - n++; - if (n == IPATH_SEGSZ) { - m++; - n = 0; - } - } - - ret = &mr->ibmr; - -bail: - return ret; -} - -/** * ipath_reg_user_mr - register a userspace memory region * @pd: protection domain for this memory region * @start: starting userspace address diff --git a/drivers/staging/rdma/ipath/ipath_verbs.c b/drivers/staging/rdma/ipath/ipath_verbs.c index 1778dee13f99..53f9dcab180d 100644 --- a/drivers/staging/rdma/ipath/ipath_verbs.c +++ b/drivers/staging/rdma/ipath/ipath_verbs.c @@ -2201,7 +2201,6 @@ int ipath_register_ib_device(struct ipath_devdata *dd) dev->poll_cq = ipath_poll_cq; dev->req_notify_cq = ipath_req_notify_cq; dev->get_dma_mr = ipath_get_dma_mr; - dev->reg_phys_mr = ipath_reg_phys_mr; dev->reg_user_mr = ipath_reg_user_mr; dev->dereg_mr = ipath_dereg_mr; dev->alloc_fmr = ipath_alloc_fmr; diff --git a/drivers/staging/rdma/ipath/ipath_verbs.h b/drivers/staging/rdma/ipath/ipath_verbs.h index 0a90a56870ab..6c70a89667a9 100644 --- a/drivers/staging/rdma/ipath/ipath_verbs.h +++ b/drivers/staging/rdma/ipath/ipath_verbs.h @@ -828,10 +828,6 @@ int ipath_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata); struct ib_mr *ipath_get_dma_mr(struct ib_pd *pd, int acc); -struct ib_mr *ipath_reg_phys_mr(struct ib_pd *pd, - struct ib_phys_buf *buffer_list, - int num_phys_buf, int acc, u64 *iova_start); - struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int mr_access_flags, struct ib_udata *udata); diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index e77d15000caa..5a2899f9f50b 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -615,9 +615,9 @@ iblock_alloc_bip(struct se_cmd *cmd, struct bio *bio) } bip = bio_integrity_alloc(bio, GFP_NOIO, cmd->t_prot_nents); - if (!bip) { + if (IS_ERR(bip)) { pr_err("Unable to allocate bio_integrity_payload\n"); - return -ENOMEM; + return PTR_ERR(bip); } bip->bip_iter.bi_size = (cmd->data_length / dev->dev_attrib.block_size) * diff --git a/drivers/thermal/int340x_thermal/processor_thermal_device.c b/drivers/thermal/int340x_thermal/processor_thermal_device.c index ccc0ad02d066..36fa724a36c8 100644 --- a/drivers/thermal/int340x_thermal/processor_thermal_device.c +++ 
b/drivers/thermal/int340x_thermal/processor_thermal_device.c @@ -33,6 +33,12 @@ /* Braswell thermal reporting device */ #define PCI_DEVICE_ID_PROC_BSW_THERMAL 0x22DC +/* Broxton thermal reporting device */ +#define PCI_DEVICE_ID_PROC_BXT0_THERMAL 0x0A8C +#define PCI_DEVICE_ID_PROC_BXT1_THERMAL 0x1A8C +#define PCI_DEVICE_ID_PROC_BXTX_THERMAL 0x4A8C +#define PCI_DEVICE_ID_PROC_BXTP_THERMAL 0x5A8C + struct power_config { u32 index; u32 min_uw; @@ -404,6 +410,10 @@ static const struct pci_device_id proc_thermal_pci_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_PROC_HSB_THERMAL)}, { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_PROC_SKL_THERMAL)}, { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_PROC_BSW_THERMAL)}, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_PROC_BXT0_THERMAL)}, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_PROC_BXT1_THERMAL)}, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_PROC_BXTX_THERMAL)}, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_PROC_BXTP_THERMAL)}, { 0, }, }; diff --git a/drivers/thermal/intel_pch_thermal.c b/drivers/thermal/intel_pch_thermal.c index 50c7da79be83..00d81af648b8 100644 --- a/drivers/thermal/intel_pch_thermal.c +++ b/drivers/thermal/intel_pch_thermal.c @@ -136,7 +136,7 @@ struct pch_dev_ops { /* dev ops for Wildcat Point */ -static struct pch_dev_ops pch_dev_ops_wpt = { +static const struct pch_dev_ops pch_dev_ops_wpt = { .hw_init = pch_wpt_init, .get_temp = pch_wpt_get_temp, }; diff --git a/drivers/thermal/rcar_thermal.c b/drivers/thermal/rcar_thermal.c index 13d01edc7a04..44b9c485157d 100644 --- a/drivers/thermal/rcar_thermal.c +++ b/drivers/thermal/rcar_thermal.c @@ -75,11 +75,11 @@ struct rcar_thermal_priv { #define rcar_has_irq_support(priv) ((priv)->common->base) #define rcar_id_to_shift(priv) ((priv)->id * 8) -#ifdef DEBUG -# define rcar_force_update_temp(priv) 1 -#else -# define rcar_force_update_temp(priv) 0 -#endif +static const struct of_device_id rcar_thermal_dt_ids[] = { + { .compatible = "renesas,rcar-thermal", }, + {}, +}; +MODULE_DEVICE_TABLE(of, rcar_thermal_dt_ids); /* * basic functions @@ -203,14 +203,26 @@ err_out_unlock: static int rcar_thermal_get_temp(struct thermal_zone_device *zone, int *temp) { struct rcar_thermal_priv *priv = rcar_zone_to_priv(zone); + int tmp; + int ret; - if (!rcar_has_irq_support(priv) || rcar_force_update_temp(priv)) - rcar_thermal_update_temp(priv); + ret = rcar_thermal_update_temp(priv); + if (ret < 0) + return ret; mutex_lock(&priv->lock); - *temp = MCELSIUS((priv->ctemp * 5) - 65); + tmp = MCELSIUS((priv->ctemp * 5) - 65); mutex_unlock(&priv->lock); + if ((tmp < MCELSIUS(-45)) || (tmp > MCELSIUS(125))) { + struct device *dev = rcar_priv_to_dev(priv); + + dev_err(dev, "it couldn't measure temperature correctly\n"); + return -EIO; + } + + *temp = tmp; + return 0; } @@ -288,6 +300,9 @@ static void _rcar_thermal_irq_ctrl(struct rcar_thermal_priv *priv, int enable) unsigned long flags; u32 mask = 0x3 << rcar_id_to_shift(priv); /* enable Rising/Falling */ + if (!rcar_has_irq_support(priv)) + return; + spin_lock_irqsave(&common->lock, flags); rcar_thermal_common_bset(common, INTMSK, mask, enable ? 
0 : mask); @@ -299,11 +314,15 @@ static void rcar_thermal_work(struct work_struct *work) { struct rcar_thermal_priv *priv; int cctemp, nctemp; + int ret; priv = container_of(work, struct rcar_thermal_priv, work.work); rcar_thermal_get_temp(priv->zone, &cctemp); - rcar_thermal_update_temp(priv); + ret = rcar_thermal_update_temp(priv); + if (ret < 0) + return; + rcar_thermal_irq_enable(priv); rcar_thermal_get_temp(priv->zone, &nctemp); @@ -368,8 +387,7 @@ static int rcar_thermal_remove(struct platform_device *pdev) struct rcar_thermal_priv *priv; rcar_thermal_for_each_priv(priv, common) { - if (rcar_has_irq_support(priv)) - rcar_thermal_irq_disable(priv); + rcar_thermal_irq_disable(priv); thermal_zone_device_unregister(priv->zone); } @@ -441,7 +459,9 @@ static int rcar_thermal_probe(struct platform_device *pdev) mutex_init(&priv->lock); INIT_LIST_HEAD(&priv->list); INIT_DELAYED_WORK(&priv->work, rcar_thermal_work); - rcar_thermal_update_temp(priv); + ret = rcar_thermal_update_temp(priv); + if (ret < 0) + goto error_unregister; priv->zone = thermal_zone_device_register("rcar_thermal", 1, 0, priv, @@ -453,8 +473,7 @@ static int rcar_thermal_probe(struct platform_device *pdev) goto error_unregister; } - if (rcar_has_irq_support(priv)) - rcar_thermal_irq_enable(priv); + rcar_thermal_irq_enable(priv); list_move_tail(&priv->list, &common->head); @@ -484,12 +503,6 @@ error_unregister: return ret; } -static const struct of_device_id rcar_thermal_dt_ids[] = { - { .compatible = "renesas,rcar-thermal", }, - {}, -}; -MODULE_DEVICE_TABLE(of, rcar_thermal_dt_ids); - static struct platform_driver rcar_thermal_driver = { .driver = { .name = "rcar_thermal", diff --git a/drivers/thermal/rockchip_thermal.c b/drivers/thermal/rockchip_thermal.c index e845841ab036..b58e3fb9b311 100644 --- a/drivers/thermal/rockchip_thermal.c +++ b/drivers/thermal/rockchip_thermal.c @@ -38,7 +38,7 @@ enum tshut_mode { }; /** - * the system Temperature Sensors tshut(tshut) polarity + * The system Temperature Sensors tshut (tshut) polarity * the bit 8 is tshut polarity. * 0: low active, 1: high active */ @@ -57,10 +57,10 @@ enum sensor_id { }; /** -* The conversion table has the adc value and temperature. -* ADC_DECREMENT is the adc value decremnet.(e.g. v2_code_table) -* ADC_INCREMNET is the adc value incremnet.(e.g. v3_code_table) -*/ +/** + * The conversion table has the adc value and temperature. + * ADC_DECREMENT: the adc value decreases as the temperature rises (e.g. v2_code_table) + * ADC_INCREMENT: the adc value increases as the temperature rises (e.g.
@@ -72,16 +72,17 @@ enum adc_sort_mode { */ #define SOC_MAX_SENSORS 2 +/** + * struct chip_tsadc_table: holds information about chip-specific differences + * @id: conversion table + * @length: size of conversion table + * @data_mask: mask to apply on data inputs + * @mode: sort mode of this adc variant (incrementing or decrementing) + */ struct chip_tsadc_table { const struct tsadc_table *id; - - /* the array table size*/ unsigned int length; - - /* that analogic mask data */ u32 data_mask; - - /* the sort mode is adc value that increment or decrement in table */ enum adc_sort_mode mode; }; @@ -153,6 +154,7 @@ struct rockchip_thermal_data { #define TSADCV2_SHUT_2GPIO_SRC_EN(chn) BIT(4 + (chn)) #define TSADCV2_SHUT_2CRU_SRC_EN(chn) BIT(8 + (chn)) +#define TSADCV1_INT_PD_CLEAR_MASK ~BIT(16) #define TSADCV2_INT_PD_CLEAR_MASK ~BIT(8) #define TSADCV2_DATA_MASK 0xfff @@ -168,6 +170,51 @@ struct tsadc_table { int temp; }; +/** + * Note: + * Code to Temperature mapping of the temperature sensor is a piecewise linear + * curve. Any temperature/code falling between two given entries can be + * linearly interpolated. + * Code to Temperature mapping should be updated based on silicon results. + */ +static const struct tsadc_table v1_code_table[] = { + {TSADCV3_DATA_MASK, -40000}, + {436, -40000}, + {431, -35000}, + {426, -30000}, + {421, -25000}, + {416, -20000}, + {411, -15000}, + {406, -10000}, + {401, -5000}, + {395, 0}, + {390, 5000}, + {385, 10000}, + {380, 15000}, + {375, 20000}, + {370, 25000}, + {364, 30000}, + {359, 35000}, + {354, 40000}, + {349, 45000}, + {343, 50000}, + {338, 55000}, + {333, 60000}, + {328, 65000}, + {322, 70000}, + {317, 75000}, + {312, 80000}, + {307, 85000}, + {301, 90000}, + {296, 95000}, + {291, 100000}, + {286, 105000}, + {280, 110000}, + {275, 115000}, + {270, 120000}, + {264, 125000}, +}; + static const struct tsadc_table v2_code_table[] = { {TSADCV2_DATA_MASK, -40000}, {3800, -40000}, @@ -245,6 +292,44 @@ static const struct tsadc_table v3_code_table[] = { {TSADCV3_DATA_MASK, 125000}, }; +static const struct tsadc_table v4_code_table[] = { + {TSADCV3_DATA_MASK, -40000}, + {431, -40000}, + {426, -35000}, + {421, -30000}, + {415, -25000}, + {410, -20000}, + {405, -15000}, + {399, -10000}, + {394, -5000}, + {389, 0}, + {383, 5000}, + {378, 10000}, + {373, 15000}, + {367, 20000}, + {362, 25000}, + {357, 30000}, + {351, 35000}, + {346, 40000}, + {340, 45000}, + {335, 50000}, + {330, 55000}, + {324, 60000}, + {319, 65000}, + {313, 70000}, + {308, 75000}, + {302, 80000}, + {297, 85000}, + {291, 90000}, + {286, 95000}, + {281, 100000}, + {275, 105000}, + {270, 110000}, + {264, 115000}, + {259, 120000}, + {253, 125000}, +}; + static u32 rk_tsadcv2_temp_to_code(struct chip_tsadc_table table, int temp) { @@ -368,6 +453,14 @@ static void rk_tsadcv2_initialize(void __iomem *regs, regs + TSADCV2_HIGHT_TSHUT_DEBOUNCE); } +static void rk_tsadcv1_irq_ack(void __iomem *regs) +{ + u32 val; + + val = readl_relaxed(regs + TSADCV2_INT_PD); + writel_relaxed(val & TSADCV1_INT_PD_CLEAR_MASK, regs + TSADCV2_INT_PD); +} + static void rk_tsadcv2_irq_ack(void __iomem *regs) { u32 val;
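The piecewise-linear note above can be made concrete with two adjacent v1_code_table rows. A stand-alone worked example (not the driver's helper; v1 codes fall as temperature rises):

#include <stdio.h>

int main(void)
{
	/* adjacent v1_code_table rows: {395, 0} and {390, 5000} */
	unsigned int hi_code = 395, lo_code = 390;
	int hi_temp = 0, lo_temp = 5000;	/* in m°C */
	unsigned int code = 393;		/* sampled ADC code */

	int temp = hi_temp + (int)(hi_code - code) * (lo_temp - hi_temp) /
			     (int)(hi_code - lo_code);

	printf("%d\n", temp);			/* 2000, i.e. 2 degC */
	return 0;
}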
@@ -429,6 +522,29 @@ static void rk_tsadcv2_tshut_mode(int chn, void __iomem *regs, writel_relaxed(val, regs + TSADCV2_INT_EN); } +static const struct rockchip_tsadc_chip rk3228_tsadc_data = { + .chn_id[SENSOR_CPU] = 0, /* cpu sensor is channel 0 */ + .chn_num = 1, /* one channel for tsadc */ + + .tshut_mode = TSHUT_MODE_GPIO, /* default TSHUT via GPIO to PMIC */ + .tshut_polarity = TSHUT_LOW_ACTIVE, /* default TSHUT LOW ACTIVE */ + .tshut_temp = 95000, + + .initialize = rk_tsadcv2_initialize, + .irq_ack = rk_tsadcv1_irq_ack, + .control = rk_tsadcv2_control, + .get_temp = rk_tsadcv2_get_temp, + .set_tshut_temp = rk_tsadcv2_tshut_temp, + .set_tshut_mode = rk_tsadcv2_tshut_mode, + + .table = { + .id = v1_code_table, + .length = ARRAY_SIZE(v1_code_table), + .data_mask = TSADCV3_DATA_MASK, + .mode = ADC_DECREMENT, + }, +}; + static const struct rockchip_tsadc_chip rk3288_tsadc_data = { .chn_id[SENSOR_CPU] = 1, /* cpu sensor is channel 1 */ .chn_id[SENSOR_GPU] = 2, /* gpu sensor is channel 2 */ @@ -477,8 +593,36 @@ static const struct rockchip_tsadc_chip rk3368_tsadc_data = { }, }; +static const struct rockchip_tsadc_chip rk3399_tsadc_data = { + .chn_id[SENSOR_CPU] = 0, /* cpu sensor is channel 0 */ + .chn_id[SENSOR_GPU] = 1, /* gpu sensor is channel 1 */ + .chn_num = 2, /* two channels for tsadc */ + + .tshut_mode = TSHUT_MODE_GPIO, /* default TSHUT via GPIO to PMIC */ + .tshut_polarity = TSHUT_LOW_ACTIVE, /* default TSHUT LOW ACTIVE */ + .tshut_temp = 95000, + + .initialize = rk_tsadcv2_initialize, + .irq_ack = rk_tsadcv1_irq_ack, + .control = rk_tsadcv2_control, + .get_temp = rk_tsadcv2_get_temp, + .set_tshut_temp = rk_tsadcv2_tshut_temp, + .set_tshut_mode = rk_tsadcv2_tshut_mode, + + .table = { + .id = v4_code_table, + .length = ARRAY_SIZE(v4_code_table), + .data_mask = TSADCV3_DATA_MASK, + .mode = ADC_DECREMENT, + }, +}; + static const struct of_device_id of_rockchip_thermal_match[] = { { + .compatible = "rockchip,rk3228-tsadc", + .data = (void *)&rk3228_tsadc_data, + }, + { .compatible = "rockchip,rk3288-tsadc", .data = (void *)&rk3288_tsadc_data, }, @@ -486,6 +630,10 @@ static const struct of_device_id of_rockchip_thermal_match[] = { .compatible = "rockchip,rk3368-tsadc", .data = (void *)&rk3368_tsadc_data, }, + { + .compatible = "rockchip,rk3399-tsadc", + .data = (void *)&rk3399_tsadc_data, + }, { /* end */ }, }; MODULE_DEVICE_TABLE(of, of_rockchip_thermal_match); @@ -617,7 +765,7 @@ rockchip_thermal_register_sensor(struct platform_device *pdev, return 0; } -/* +/** * Reset TSADC Controller, reset all tsadc registers. */ static void rockchip_thermal_reset_controller(struct reset_control *reset) diff --git a/drivers/thermal/step_wise.c b/drivers/thermal/step_wise.c index 2f9f7086ac3d..ea9366ad3e6b 100644 --- a/drivers/thermal/step_wise.c +++ b/drivers/thermal/step_wise.c @@ -63,6 +63,19 @@ static unsigned long get_target_state(struct thermal_instance *instance, next_target = instance->target; dev_dbg(&cdev->device, "cur_state=%ld\n", cur_state); + if (!instance->initialized) { + if (throttle) { + next_target = (cur_state + 1) >= instance->upper ? + instance->upper : + ((cur_state + 1) < instance->lower ? + instance->lower : (cur_state + 1)); + } else { + next_target = THERMAL_NO_TARGET; + } + + return next_target; + } + switch (trend) { case THERMAL_TREND_RAISING: if (throttle) {
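The nested conditional in the new !instance->initialized branch is just cur_state + 1 clamped to the instance's [lower, upper] window. An equivalent stand-alone form (hypothetical function name; THERMAL_NO_TARGET represented here by -1):

static unsigned long first_target(unsigned long cur_state,
				  unsigned long lower, unsigned long upper,
				  int throttle)
{
	unsigned long next = cur_state + 1;

	if (!throttle)
		return (unsigned long)-1;	/* THERMAL_NO_TARGET */

	if (next > upper)	/* same result as the ">= upper" arm above */
		next = upper;
	if (next < lower)
		next = lower;
	return next;
}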
@@ -149,7 +162,7 @@ static void thermal_zone_trip_update(struct thermal_zone_device *tz, int trip) dev_dbg(&instance->cdev->device, "old_target=%d, target=%d\n", old_target, (int)instance->target); - if (old_target == instance->target) + if (instance->initialized && old_target == instance->target) continue; /* Activate a passive thermal instance */ @@ -161,7 +174,7 @@ static void thermal_zone_trip_update(struct thermal_zone_device *tz, int trip) instance->target == THERMAL_NO_TARGET) update_passive_instance(tz, trip_type, -1); - + instance->initialized = true; instance->cdev->updated = false; /* cdev needs update */ } diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index d9e525cc9c1c..a0a8fd1235e2 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -37,6 +37,7 @@ #include <linux/of.h> #include <net/netlink.h> #include <net/genetlink.h> +#include <linux/suspend.h> #define CREATE_TRACE_POINTS #include <trace/events/thermal.h> @@ -59,6 +60,8 @@ static LIST_HEAD(thermal_governor_list); static DEFINE_MUTEX(thermal_list_lock); static DEFINE_MUTEX(thermal_governor_lock); +static atomic_t in_suspend; + static struct thermal_governor *def_governor; static struct thermal_governor *__find_governor(const char *name) @@ -532,14 +535,31 @@ static void update_temperature(struct thermal_zone_device *tz) mutex_unlock(&tz->lock); trace_thermal_temperature(tz); - dev_dbg(&tz->device, "last_temperature=%d, current_temperature=%d\n", - tz->last_temperature, tz->temperature); + if (tz->last_temperature == THERMAL_TEMP_INVALID) + dev_dbg(&tz->device, "last_temperature N/A, current_temperature=%d\n", + tz->temperature); + else + dev_dbg(&tz->device, "last_temperature=%d, current_temperature=%d\n", + tz->last_temperature, tz->temperature); +} + +static void thermal_zone_device_reset(struct thermal_zone_device *tz) +{ + struct thermal_instance *pos; + + tz->temperature = THERMAL_TEMP_INVALID; + tz->passive = 0; + list_for_each_entry(pos, &tz->thermal_instances, tz_node) + pos->initialized = false; } void thermal_zone_device_update(struct thermal_zone_device *tz) { int count; + if (atomic_read(&in_suspend)) + return; + if (!tz->ops->get_temp) return; @@ -676,8 +696,12 @@ trip_point_temp_store(struct device *dev, struct device_attribute *attr, return -EINVAL; ret = tz->ops->set_trip_temp(tz, trip, temperature); + if (ret) + return ret; - return ret ? 
ret : count; + thermal_zone_device_update(tz); + + return count; } static ssize_t @@ -1321,6 +1345,7 @@ int thermal_zone_bind_cooling_device(struct thermal_zone_device *tz, if (!result) { list_add_tail(&dev->tz_node, &tz->thermal_instances); list_add_tail(&dev->cdev_node, &cdev->thermal_instances); + atomic_set(&tz->need_update, 1); } mutex_unlock(&cdev->lock); mutex_unlock(&tz->lock); @@ -1430,6 +1455,7 @@ __thermal_cooling_device_register(struct device_node *np, const struct thermal_cooling_device_ops *ops) { struct thermal_cooling_device *cdev; + struct thermal_zone_device *pos = NULL; int result; if (type && strlen(type) >= THERMAL_NAME_LENGTH) @@ -1474,6 +1500,12 @@ __thermal_cooling_device_register(struct device_node *np, /* Update binding information for 'this' new cdev */ bind_cdev(cdev); + mutex_lock(&thermal_list_lock); + list_for_each_entry(pos, &thermal_tz_list, node) + if (atomic_cmpxchg(&pos->need_update, 1, 0)) + thermal_zone_device_update(pos); + mutex_unlock(&thermal_list_lock); + return cdev; } @@ -1806,6 +1838,8 @@ struct thermal_zone_device *thermal_zone_device_register(const char *type, tz->trips = trips; tz->passive_delay = passive_delay; tz->polling_delay = polling_delay; + /* A new thermal zone needs to be updated anyway. */ + atomic_set(&tz->need_update, 1); dev_set_name(&tz->device, "thermal_zone%d", tz->id); result = device_register(&tz->device); @@ -1900,7 +1934,10 @@ struct thermal_zone_device *thermal_zone_device_register(const char *type, INIT_DELAYED_WORK(&(tz->poll_queue), thermal_zone_device_check); - thermal_zone_device_update(tz); + thermal_zone_device_reset(tz); + /* Update the new thermal zone and mark it as already updated. */ + if (atomic_cmpxchg(&tz->need_update, 1, 0)) + thermal_zone_device_update(tz); return tz; @@ -2140,6 +2177,36 @@ static void thermal_unregister_governors(void) thermal_gov_power_allocator_unregister(); } +static int thermal_pm_notify(struct notifier_block *nb, + unsigned long mode, void *_unused) +{ + struct thermal_zone_device *tz; + + switch (mode) { + case PM_HIBERNATION_PREPARE: + case PM_RESTORE_PREPARE: + case PM_SUSPEND_PREPARE: + atomic_set(&in_suspend, 1); + break; + case PM_POST_HIBERNATION: + case PM_POST_RESTORE: + case PM_POST_SUSPEND: + atomic_set(&in_suspend, 0); + list_for_each_entry(tz, &thermal_tz_list, node) { + thermal_zone_device_reset(tz); + thermal_zone_device_update(tz); + } + break; + default: + break; + } + return 0; +} + +static struct notifier_block thermal_pm_nb = { + .notifier_call = thermal_pm_notify, +}; + static int __init thermal_init(void) { int result; @@ -2160,6 +2227,11 @@ static int __init thermal_init(void) if (result) goto exit_netlink; + result = register_pm_notifier(&thermal_pm_nb); + if (result) + pr_warn("Thermal: Can not register suspend notifier, return %d\n", + result); + return 0; exit_netlink: @@ -2179,6 +2251,7 @@ error: static void __exit thermal_exit(void) { + unregister_pm_notifier(&thermal_pm_nb); of_thermal_destroy_zones(); genetlink_exit(); class_unregister(&thermal_class); diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h index d7ac1fccd659..749d41abfbab 100644 --- a/drivers/thermal/thermal_core.h +++ b/drivers/thermal/thermal_core.h @@ -41,6 +41,7 @@ struct thermal_instance { struct thermal_zone_device *tz; struct thermal_cooling_device *cdev; int trip; + bool initialized; unsigned long upper; /* Highest cooling state for this trip point */ unsigned long lower; /* Lowest cooling state for this trip point */ unsigned long target; /* expected 
cooling state */ diff --git a/drivers/usb/gadget/function/f_printer.c b/drivers/usb/gadget/function/f_printer.c index 0fbfb2b2aa08..26ccad5d8680 100644 --- a/drivers/usb/gadget/function/f_printer.c +++ b/drivers/usb/gadget/function/f_printer.c @@ -673,7 +673,7 @@ printer_fsync(struct file *fd, loff_t start, loff_t end, int datasync) unsigned long flags; int tx_list_empty; - mutex_lock(&inode->i_mutex); + inode_lock(inode); spin_lock_irqsave(&dev->lock, flags); tx_list_empty = (likely(list_empty(&dev->tx_reqs))); spin_unlock_irqrestore(&dev->lock, flags); @@ -683,7 +683,7 @@ printer_fsync(struct file *fd, loff_t start, loff_t end, int datasync) wait_event_interruptible(dev->tx_flush_wait, (likely(list_empty(&dev->tx_reqs_active)))); } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return 0; } diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c index 365afd7e14f8..7e179f81d05c 100644 --- a/drivers/usb/gadget/legacy/inode.c +++ b/drivers/usb/gadget/legacy/inode.c @@ -1521,10 +1521,10 @@ static void destroy_ep_files (struct dev_data *dev) spin_unlock_irq (&dev->lock); /* break link to dcache */ - mutex_lock (&parent->i_mutex); + inode_lock(parent); d_delete (dentry); dput (dentry); - mutex_unlock (&parent->i_mutex); + inode_unlock(parent); spin_lock_irq (&dev->lock); } diff --git a/drivers/usb/gadget/udc/atmel_usba_udc.c b/drivers/usb/gadget/udc/atmel_usba_udc.c index f92f5aff0dd5..8755b2c2aada 100644 --- a/drivers/usb/gadget/udc/atmel_usba_udc.c +++ b/drivers/usb/gadget/udc/atmel_usba_udc.c @@ -91,7 +91,7 @@ static ssize_t queue_dbg_read(struct file *file, char __user *buf, if (!access_ok(VERIFY_WRITE, buf, nbytes)) return -EFAULT; - mutex_lock(&file_inode(file)->i_mutex); + inode_lock(file_inode(file)); list_for_each_entry_safe(req, tmp_req, queue, queue) { len = snprintf(tmpbuf, sizeof(tmpbuf), "%8p %08x %c%c%c %5d %c%c%c\n", @@ -118,7 +118,7 @@ static ssize_t queue_dbg_read(struct file *file, char __user *buf, nbytes -= len; buf += len; } - mutex_unlock(&file_inode(file)->i_mutex); + inode_unlock(file_inode(file)); return actual; } @@ -143,7 +143,7 @@ static int regs_dbg_open(struct inode *inode, struct file *file) u32 *data; int ret = -ENOMEM; - mutex_lock(&inode->i_mutex); + inode_lock(inode); udc = inode->i_private; data = kmalloc(inode->i_size, GFP_KERNEL); if (!data) @@ -158,7 +158,7 @@ static int regs_dbg_open(struct inode *inode, struct file *file) ret = 0; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } @@ -169,11 +169,11 @@ static ssize_t regs_dbg_read(struct file *file, char __user *buf, struct inode *inode = file_inode(file); int ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = simple_read_from_buffer(buf, nbytes, ppos, file->private_data, file_inode(file)->i_size); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/drivers/usb/host/Kconfig b/drivers/usb/host/Kconfig index daa563ff1fa0..1f117c360ebb 100644 --- a/drivers/usb/host/Kconfig +++ b/drivers/usb/host/Kconfig @@ -229,6 +229,8 @@ config USB_EHCI_TEGRA depends on ARCH_TEGRA select USB_EHCI_ROOT_HUB_TT select USB_PHY + select USB_ULPI + select USB_ULPI_VIEWPORT help This driver enables support for the internal USB Host Controllers found in NVIDIA Tegra SoCs. The controllers are EHCI compliant. 
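The mutex_lock(&inode->i_mutex) conversions in the hunks below (and in f_printer.c above) rely on the inode locking wrappers added to include/linux/fs.h in this cycle, presumably along these lines at this point, so that callers no longer depend on the lock's concrete type:

static inline void inode_lock(struct inode *inode)
{
	mutex_lock(&inode->i_mutex);
}

static inline void inode_unlock(struct inode *inode)
{
	mutex_unlock(&inode->i_mutex);
}

static inline void inode_lock_nested(struct inode *inode, unsigned subclass)
{
	mutex_lock_nested(&inode->i_mutex, subclass);
}

static inline int inode_is_locked(struct inode *inode)
{
	return mutex_is_locked(&inode->i_mutex);
}

Decoupling call sites this way is what later allowed the underlying i_mutex to be swapped for a different locking primitive without touching every filesystem again.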
diff --git a/drivers/video/fbdev/core/fb_defio.c b/drivers/video/fbdev/core/fb_defio.c index 3fc63c208d08..57721c73177f 100644 --- a/drivers/video/fbdev/core/fb_defio.c +++ b/drivers/video/fbdev/core/fb_defio.c @@ -78,13 +78,13 @@ int fb_deferred_io_fsync(struct file *file, loff_t start, loff_t end, int datasy if (!info->fbdefio) return 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* Kill off the delayed work */ cancel_delayed_work_sync(&info->deferred_work); /* Run it immediately */ schedule_delayed_work(&info->deferred_work, 0); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return 0; } diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 7bf835f85bc8..eadc894faea2 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -449,14 +449,14 @@ static int v9fs_file_fsync(struct file *filp, loff_t start, loff_t end, if (retval) return retval; - mutex_lock(&inode->i_mutex); + inode_lock(inode); p9_debug(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync); fid = filp->private_data; v9fs_blank_wstat(&wstat); retval = p9_client_wstat(fid, &wstat); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return retval; } @@ -472,13 +472,13 @@ int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end, if (retval) return retval; - mutex_lock(&inode->i_mutex); + inode_lock(inode); p9_debug(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync); fid = filp->private_data; retval = p9_client_fsync(fid, datasync); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return retval; } diff --git a/fs/affs/file.c b/fs/affs/file.c index 659c579c4588..0548c53f41d5 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -33,11 +33,11 @@ affs_file_release(struct inode *inode, struct file *filp) inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt)); if (atomic_dec_and_test(&AFFS_I(inode)->i_opencnt)) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (inode->i_size != AFFS_I(inode)->mmu_private) affs_truncate(inode); affs_free_prealloc(inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } return 0; @@ -958,12 +958,12 @@ int affs_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync) if (err) return err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = write_inode_now(inode, 0); err = sync_blockdev(inode->i_sb->s_bdev); if (!ret) ret = err; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } const struct file_operations affs_file_operations = { diff --git a/fs/afs/flock.c b/fs/afs/flock.c index 4baf1d2b39e4..d91a9c9cfbd0 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -483,7 +483,7 @@ static int afs_do_getlk(struct file *file, struct file_lock *fl) fl->fl_type = F_UNLCK; - mutex_lock(&vnode->vfs_inode.i_mutex); + inode_lock(&vnode->vfs_inode); /* check local lock records first */ ret = 0; @@ -505,7 +505,7 @@ static int afs_do_getlk(struct file *file, struct file_lock *fl) } error: - mutex_unlock(&vnode->vfs_inode.i_mutex); + inode_unlock(&vnode->vfs_inode); _leave(" = %d [%hd]", ret, fl->fl_type); return ret; } diff --git a/fs/afs/write.c b/fs/afs/write.c index 0714abcd7f32..dfef94f70667 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -693,7 +693,7 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync) ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* use a writeback record as a marker in the queue - when this reaches * the front of the queue, all the outstanding writes are either @@ -735,7 +735,7 @@ int 
afs_fsync(struct file *file, loff_t start, loff_t end, int datasync) afs_put_writeback(wb); _leave(" = %d", ret); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/attr.c b/fs/attr.c index 6530ced19697..25b24d0f6c88 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -195,7 +195,7 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de struct timespec now; unsigned int ia_valid = attr->ia_valid; - WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex)); + WARN_ON_ONCE(!inode_is_locked(inode)); if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_TIMES_SET)) { if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 78f005f37847..3a3ced779fc7 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -638,11 +638,11 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer, case 3: /* Delete this handler. */ root = dget(file->f_path.dentry->d_sb->s_root); - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); kill_node(e); - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); dput(root); break; default: @@ -675,7 +675,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, return PTR_ERR(e); root = dget(sb->s_root); - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); dentry = lookup_one_len(e->name, root, strlen(e->name)); err = PTR_ERR(dentry); if (IS_ERR(dentry)) @@ -711,7 +711,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, out2: dput(dentry); out: - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); dput(root); if (err) { @@ -754,12 +754,12 @@ static ssize_t bm_status_write(struct file *file, const char __user *buffer, case 3: /* Delete all handlers. 
*/ root = dget(file->f_path.dentry->d_sb->s_root); - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); while (!list_empty(&entries)) kill_node(list_entry(entries.next, Node, list)); - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); dput(root); break; default: diff --git a/fs/block_dev.c b/fs/block_dev.c index ba762ea07f67..7b9cd49622b1 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -75,7 +75,7 @@ void kill_bdev(struct block_device *bdev) { struct address_space *mapping = bdev->bd_inode->i_mapping; - if (mapping->nrpages == 0 && mapping->nrshadows == 0) + if (mapping->nrpages == 0 && mapping->nrexceptional == 0) return; invalidate_bh_lrus(); @@ -346,9 +346,9 @@ static loff_t block_llseek(struct file *file, loff_t offset, int whence) struct inode *bd_inode = bdev_file_inode(file); loff_t retval; - mutex_lock(&bd_inode->i_mutex); + inode_lock(bd_inode); retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode)); - mutex_unlock(&bd_inode->i_mutex); + inode_unlock(bd_inode); return retval; } @@ -1142,9 +1142,9 @@ void bd_set_size(struct block_device *bdev, loff_t size) { unsigned bsize = bdev_logical_block_size(bdev); - mutex_lock(&bdev->bd_inode->i_mutex); + inode_lock(bdev->bd_inode); i_size_write(bdev->bd_inode, size); - mutex_unlock(&bdev->bd_inode->i_mutex); + inode_unlock(bdev->bd_inode); while (bsize < PAGE_CACHE_SIZE) { if (size & bsize) break; @@ -1741,9 +1741,9 @@ static void blkdev_vm_open(struct vm_area_struct *vma) struct inode *bd_inode = bdev_file_inode(vma->vm_file); struct block_device *bdev = I_BDEV(bd_inode); - mutex_lock(&bd_inode->i_mutex); + inode_lock(bd_inode); bdev->bd_map_count++; - mutex_unlock(&bd_inode->i_mutex); + inode_unlock(bd_inode); } static void blkdev_vm_close(struct vm_area_struct *vma) @@ -1751,9 +1751,9 @@ static void blkdev_vm_close(struct vm_area_struct *vma) struct inode *bd_inode = bdev_file_inode(vma->vm_file); struct block_device *bdev = I_BDEV(bd_inode); - mutex_lock(&bd_inode->i_mutex); + inode_lock(bd_inode); bdev->bd_map_count--; - mutex_unlock(&bd_inode->i_mutex); + inode_unlock(bd_inode); } static const struct vm_operations_struct blkdev_dax_vm_ops = { @@ -1777,7 +1777,7 @@ static int blkdev_mmap(struct file *file, struct vm_area_struct *vma) struct block_device *bdev = I_BDEV(bd_inode); file_accessed(file); - mutex_lock(&bd_inode->i_mutex); + inode_lock(bd_inode); bdev->bd_map_count++; if (IS_DAX(bd_inode)) { vma->vm_ops = &blkdev_dax_vm_ops; @@ -1785,7 +1785,7 @@ static int blkdev_mmap(struct file *file, struct vm_area_struct *vma) } else { vma->vm_ops = &blkdev_default_vm_ops; } - mutex_unlock(&bd_inode->i_mutex); + inode_unlock(bd_inode); return 0; } diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 08405a3da6b1..b90cd3776f8e 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -560,13 +560,13 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info, */ static void __merge_refs(struct list_head *head, int mode) { - struct __prelim_ref *ref1; + struct __prelim_ref *pos1; - list_for_each_entry(ref1, head, list) { - struct __prelim_ref *ref2 = ref1, *tmp; + list_for_each_entry(pos1, head, list) { + struct __prelim_ref *pos2 = pos1, *tmp; - list_for_each_entry_safe_continue(ref2, tmp, head, list) { - struct __prelim_ref *xchg; + list_for_each_entry_safe_continue(pos2, tmp, head, list) { + struct __prelim_ref *xchg, *ref1 = pos1, *ref2 = pos2; struct extent_inode_elem *eie; if (!ref_for_same_block(ref1, ref2)) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 
97ad9bbeb35d..bfe4a337fb4d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1614,7 +1614,7 @@ struct btrfs_fs_info { spinlock_t delayed_iput_lock; struct list_head delayed_iputs; - struct rw_semaphore delayed_iput_sem; + struct mutex cleaner_delayed_iput_mutex; /* this protects tree_mod_seq_list */ spinlock_t tree_mod_seq_lock; @@ -3641,6 +3641,7 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, int __get_raid_index(u64 flags); int btrfs_start_write_no_snapshoting(struct btrfs_root *root); void btrfs_end_write_no_snapshoting(struct btrfs_root *root); +void btrfs_wait_for_snapshot_creation(struct btrfs_root *root); void check_system_chunk(struct btrfs_trans_handle *trans, struct btrfs_root *root, const u64 type); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 1e668fb7dd4c..cbb7dbfb3fff 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -614,7 +614,7 @@ static void btrfs_dev_replace_update_device_in_mapping_tree( em = lookup_extent_mapping(em_tree, start, (u64)-1); if (!em) break; - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; for (i = 0; i < map->num_stripes; i++) if (srcdev == map->stripes[i].dev) map->stripes[i].dev = tgtdev; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e99ccd6ffb2c..dd08e29f5117 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -55,6 +55,12 @@ #include <asm/cpufeature.h> #endif +#define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ + BTRFS_HEADER_FLAG_RELOC |\ + BTRFS_SUPER_FLAG_ERROR |\ + BTRFS_SUPER_FLAG_SEEDING |\ + BTRFS_SUPER_FLAG_METADUMP) + static const struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); static void free_fs_root(struct btrfs_root *root); @@ -1583,8 +1589,23 @@ int btrfs_init_fs_root(struct btrfs_root *root) ret = get_anon_bdev(&root->anon_dev); if (ret) goto free_writers; + + mutex_lock(&root->objectid_mutex); + ret = btrfs_find_highest_objectid(root, + &root->highest_objectid); + if (ret) { + mutex_unlock(&root->objectid_mutex); + goto free_root_dev; + } + + ASSERT(root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID); + + mutex_unlock(&root->objectid_mutex); + return 0; +free_root_dev: + free_anon_bdev(root->anon_dev); free_writers: btrfs_free_subvolume_writers(root->subv_writers); fail: @@ -1786,7 +1807,10 @@ static int cleaner_kthread(void *arg) goto sleep; } + mutex_lock(&root->fs_info->cleaner_delayed_iput_mutex); btrfs_run_delayed_iputs(root); + mutex_unlock(&root->fs_info->cleaner_delayed_iput_mutex); + again = btrfs_clean_one_deleted_snapshot(root); mutex_unlock(&root->fs_info->cleaner_mutex); @@ -2556,8 +2580,8 @@ int open_ctree(struct super_block *sb, mutex_init(&fs_info->delete_unused_bgs_mutex); mutex_init(&fs_info->reloc_mutex); mutex_init(&fs_info->delalloc_root_mutex); + mutex_init(&fs_info->cleaner_delayed_iput_mutex); seqlock_init(&fs_info->profiles_lock); - init_rwsem(&fs_info->delayed_iput_sem); INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); INIT_LIST_HEAD(&fs_info->space_info); @@ -2742,26 +2766,6 @@ int open_ctree(struct super_block *sb, goto fail_alloc; } - /* - * Leafsize and nodesize were always equal, this is only a sanity check. - */ - if (le32_to_cpu(disk_super->__unused_leafsize) != - btrfs_super_nodesize(disk_super)) { - printk(KERN_ERR "BTRFS: couldn't mount because metadata " - "blocksizes don't match. 
node %d leaf %d\n", - btrfs_super_nodesize(disk_super), - le32_to_cpu(disk_super->__unused_leafsize)); - err = -EINVAL; - goto fail_alloc; - } - if (btrfs_super_nodesize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) { - printk(KERN_ERR "BTRFS: couldn't mount because metadata " - "blocksize (%d) was too large\n", - btrfs_super_nodesize(disk_super)); - err = -EINVAL; - goto fail_alloc; - } - features = btrfs_super_incompat_flags(disk_super); features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; if (tree_root->fs_info->compress_type == BTRFS_COMPRESS_LZO) @@ -2833,17 +2837,6 @@ int open_ctree(struct super_block *sb, sb->s_blocksize = sectorsize; sb->s_blocksize_bits = blksize_bits(sectorsize); - if (btrfs_super_magic(disk_super) != BTRFS_MAGIC) { - printk(KERN_ERR "BTRFS: valid FS not found on %s\n", sb->s_id); - goto fail_sb_buffer; - } - - if (sectorsize != PAGE_SIZE) { - printk(KERN_ERR "BTRFS: incompatible sector size (%lu) " - "found on %s\n", (unsigned long)sectorsize, sb->s_id); - goto fail_sb_buffer; - } - mutex_lock(&fs_info->chunk_mutex); ret = btrfs_read_sys_array(tree_root); mutex_unlock(&fs_info->chunk_mutex); @@ -2915,6 +2908,18 @@ retry_root_backup: tree_root->commit_root = btrfs_root_node(tree_root); btrfs_set_root_refs(&tree_root->root_item, 1); + mutex_lock(&tree_root->objectid_mutex); + ret = btrfs_find_highest_objectid(tree_root, + &tree_root->highest_objectid); + if (ret) { + mutex_unlock(&tree_root->objectid_mutex); + goto recovery_tree_root; + } + + ASSERT(tree_root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID); + + mutex_unlock(&tree_root->objectid_mutex); + ret = btrfs_read_roots(fs_info, tree_root); if (ret) goto recovery_tree_root; @@ -4018,8 +4023,17 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, int read_only) { struct btrfs_super_block *sb = fs_info->super_copy; + u64 nodesize = btrfs_super_nodesize(sb); + u64 sectorsize = btrfs_super_sectorsize(sb); int ret = 0; + if (btrfs_super_magic(sb) != BTRFS_MAGIC) { + printk(KERN_ERR "BTRFS: no valid FS found\n"); + ret = -EINVAL; + } + if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) + printk(KERN_WARNING "BTRFS: unrecognized super flag: %llu\n", + btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP); if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) { printk(KERN_ERR "BTRFS: tree_root level too big: %d >= %d\n", btrfs_super_root_level(sb), BTRFS_MAX_LEVEL);
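The sectorsize rule enforced in the next hunk (a power of two between 4K and BTRFS_MAX_METADATA_BLOCKSIZE, i.e. 64K) reduces to a check like the following stand-alone sketch (not the kernel's is_power_of_2() helper itself):

#include <stdbool.h>
#include <stdint.h>

#define MAX_METADATA_BLOCKSIZE 65536	/* BTRFS_MAX_METADATA_BLOCKSIZE */

static bool sectorsize_valid(uint64_t s)
{
	bool pow2 = s != 0 && (s & (s - 1)) == 0;	/* is_power_of_2() */

	return pow2 && s >= 4096 && s <= MAX_METADATA_BLOCKSIZE;
}

/* sectorsize_valid(4096) == true; sectorsize_valid(12288) == false */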
@@ -4037,31 +4051,46 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, } /* - * The common minimum, we don't know if we can trust the nodesize/sectorsize - * items yet, they'll be verified later. Issue just a warning. + * Check sectorsize and nodesize first; other checks will need them. + * Check all possible sectorsizes (4K, 8K, 16K, 32K, 64K) here. */ - if (!IS_ALIGNED(btrfs_super_root(sb), 4096)) + if (!is_power_of_2(sectorsize) || sectorsize < 4096 || + sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) { + printk(KERN_ERR "BTRFS: invalid sectorsize %llu\n", sectorsize); + ret = -EINVAL; + } + /* Only PAGE_SIZE is supported for now */ + if (sectorsize != PAGE_CACHE_SIZE) { + printk(KERN_ERR "BTRFS: sectorsize %llu not supported yet, only %lu is supported\n", + sectorsize, PAGE_CACHE_SIZE); + ret = -EINVAL; + } + if (!is_power_of_2(nodesize) || nodesize < sectorsize || + nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) { + printk(KERN_ERR "BTRFS: invalid nodesize %llu\n", nodesize); + ret = -EINVAL; + } + if (nodesize != le32_to_cpu(sb->__unused_leafsize)) { + printk(KERN_ERR "BTRFS: invalid leafsize %u, should be %llu\n", + le32_to_cpu(sb->__unused_leafsize), + nodesize); + ret = -EINVAL; + } + + /* Root alignment check */ + if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) { printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n", btrfs_super_root(sb)); - if (!IS_ALIGNED(btrfs_super_chunk_root(sb), 4096)) + ret = -EINVAL; + } + if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) { printk(KERN_WARNING "BTRFS: chunk_root block unaligned: %llu\n", btrfs_super_chunk_root(sb)); - if (!IS_ALIGNED(btrfs_super_log_root(sb), 4096)) - printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n", - btrfs_super_log_root(sb)); - - /* - * Check the lower bound, the alignment and other constraints are - * checked later. - */ - if (btrfs_super_nodesize(sb) < 4096) { - printk(KERN_ERR "BTRFS: nodesize too small: %u < 4096\n", - btrfs_super_nodesize(sb)); ret = -EINVAL; } - if (btrfs_super_sectorsize(sb) < 4096) { - printk(KERN_ERR "BTRFS: sectorsize too small: %u < 4096\n", - btrfs_super_sectorsize(sb)); + if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) { + printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n", + btrfs_super_log_root(sb)); ret = -EINVAL; } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 60cc1399c64f..e2287c7c10be 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4139,8 +4139,10 @@ commit_trans: !atomic_read(&root->fs_info->open_ioctl_trans)) { need_commit--; - if (need_commit > 0) + if (need_commit > 0) { + btrfs_start_delalloc_roots(fs_info, 0, -1); btrfs_wait_ordered_roots(fs_info, -1); + } trans = btrfs_join_transaction(root); if (IS_ERR(trans)) @@ -4153,11 +4155,12 @@ commit_trans: if (ret) return ret; /* - * make sure that all running delayed iput are - * done + * The cleaner kthread might still be doing iput + * operations. Wait for it to finish so that + * more space is released. */ - down_write(&root->fs_info->delayed_iput_sem); - up_write(&root->fs_info->delayed_iput_sem); + mutex_lock(&root->fs_info->cleaner_delayed_iput_mutex); + mutex_unlock(&root->fs_info->cleaner_delayed_iput_mutex); goto again; } else { btrfs_end_transaction(trans, root); @@ -10399,7 +10402,7 @@ btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info, * more device items and remove one chunk item), but this is done at * btrfs_remove_chunk() through a call to check_system_chunk(). 
*/ - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; num_items = 3 + map->num_stripes; free_extent_map(em); @@ -10586,7 +10589,7 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info) disk_super = fs_info->super_copy; if (!btrfs_super_root(disk_super)) - return 1; + return -EINVAL; features = btrfs_super_incompat_flags(disk_super); if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) @@ -10816,3 +10819,23 @@ int btrfs_start_write_no_snapshoting(struct btrfs_root *root) } return 1; } + +static int wait_snapshoting_atomic_t(atomic_t *a) +{ + schedule(); + return 0; +} + +void btrfs_wait_for_snapshot_creation(struct btrfs_root *root) +{ + while (true) { + int ret; + + ret = btrfs_start_write_no_snapshoting(root); + if (ret) + break; + wait_on_atomic_t(&root->will_be_snapshoted, + wait_snapshoting_atomic_t, + TASK_UNINTERRUPTIBLE); + } +} diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 6a98bddd8f33..84fb56d5c018 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -76,7 +76,7 @@ void free_extent_map(struct extent_map *em) WARN_ON(extent_map_in_tree(em)); WARN_ON(!list_empty(&em->list)); if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags)) - kfree(em->bdev); + kfree(em->map_lookup); kmem_cache_free(extent_map_cache, em); } } diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index b2991fd8583e..eb8b8fae036b 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -32,7 +32,15 @@ struct extent_map { u64 block_len; u64 generation; unsigned long flags; - struct block_device *bdev; + union { + struct block_device *bdev; + + /* + * used for chunk mappings + * flags & EXTENT_FLAG_FS_MAPPING must be set + */ + struct map_lookup *map_lookup; + }; atomic_t refs; unsigned int compress_type; struct list_head list; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 83d7859d7619..098bb8f690c9 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -406,8 +406,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) /* simple helper to fault in pages and copy. This should go away * and be replaced with calls into generic code. */ -static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, - size_t write_bytes, +static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes, struct page **prepared_pages, struct iov_iter *i) { @@ -1588,8 +1587,7 @@ again: ret = 0; } - copied = btrfs_copy_from_user(pos, num_pages, - write_bytes, pages, i); + copied = btrfs_copy_from_user(pos, write_bytes, pages, i); /* * if we have trouble faulting in the pages, fall @@ -1764,17 +1762,17 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, loff_t pos; size_t count; - mutex_lock(&inode->i_mutex); + inode_lock(inode); err = generic_write_checks(iocb, from); if (err <= 0) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } current->backing_dev_info = inode_to_bdi(inode); err = file_remove_privs(file); if (err) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); goto out; } @@ -1785,7 +1783,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, * to stop this write operation to ensure FS consistency. 
*/ if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); err = -EROFS; goto out; } @@ -1806,7 +1804,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, end_pos = round_up(pos + count, root->sectorsize); err = btrfs_cont_expand(inode, i_size_read(inode), end_pos); if (err) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); goto out; } } @@ -1822,7 +1820,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, iocb->ki_pos = pos + num_written; } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); /* * We also have to set last_sub_trans to the current log transid, @@ -1911,7 +1909,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); atomic_inc(&root->log_batch); full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); @@ -1963,7 +1961,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) ret = start_ordered_ops(inode, start, end); } if (ret) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); goto out; } atomic_inc(&root->log_batch); @@ -2009,7 +2007,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); goto out; } @@ -2033,7 +2031,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); goto out; } trans->sync = true; @@ -2056,7 +2054,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * file again, but that will end up using the synchronization * inside btrfs_sync_log to keep things safe. */ - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); /* * If any of the ordered extents had an error, just return it to user @@ -2305,7 +2303,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE); ret = find_first_non_hole(inode, &offset, &len); if (ret < 0) @@ -2345,7 +2343,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) truncated_page = true; ret = btrfs_truncate_page(inode, offset, 0, 0); if (ret) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } } @@ -2421,7 +2419,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) ret = btrfs_wait_ordered_range(inode, lockstart, lockend - lockstart + 1); if (ret) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } } @@ -2576,7 +2574,7 @@ out_only_mutex: ret = btrfs_end_transaction(trans, root); } } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (ret && !err) err = ret; return err; @@ -2660,7 +2658,7 @@ static long btrfs_fallocate(struct file *file, int mode, if (ret < 0) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = inode_newsize_ok(inode, alloc_end); if (ret) goto out; @@ -2818,7 +2816,7 @@ out: * So this is completely used as cleanup. */ btrfs_qgroup_free_data(inode, alloc_start, alloc_end - alloc_start); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); /* Let go of our reservation. 
*/ btrfs_free_reserved_data_space(inode, alloc_start, alloc_end - alloc_start); @@ -2894,7 +2892,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) struct inode *inode = file->f_mapping->host; int ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); switch (whence) { case SEEK_END: case SEEK_CUR: @@ -2903,20 +2901,20 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) case SEEK_DATA: case SEEK_HOLE: if (offset >= i_size_read(inode)) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return -ENXIO; } ret = find_desired_extent(inode, &offset, whence); if (ret) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } } offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return offset; } diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 8b57c17b3fb3..e50316c4af15 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -515,7 +515,7 @@ out: return ret; } -static int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid) +int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid) { struct btrfs_path *path; int ret; @@ -555,13 +555,6 @@ int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid) int ret; mutex_lock(&root->objectid_mutex); - if (unlikely(root->highest_objectid < BTRFS_FIRST_FREE_OBJECTID)) { - ret = btrfs_find_highest_objectid(root, - &root->highest_objectid); - if (ret) - goto out; - } - if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) { ret = -ENOSPC; goto out; diff --git a/fs/btrfs/inode-map.h b/fs/btrfs/inode-map.h index ddb347bfee23..c8e864b2d530 100644 --- a/fs/btrfs/inode-map.h +++ b/fs/btrfs/inode-map.h @@ -9,5 +9,6 @@ int btrfs_save_ino_cache(struct btrfs_root *root, struct btrfs_trans_handle *trans); int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid); +int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid); #endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 247830107686..e28f3d4691af 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3134,7 +3134,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; - down_read(&fs_info->delayed_iput_sem); spin_lock(&fs_info->delayed_iput_lock); while (!list_empty(&fs_info->delayed_iputs)) { struct btrfs_inode *inode; @@ -3153,7 +3152,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root) spin_lock(&fs_info->delayed_iput_lock); } spin_unlock(&fs_info->delayed_iput_lock); - up_read(&root->fs_info->delayed_iput_sem); } /* @@ -4874,26 +4872,6 @@ next: return err; } -static int wait_snapshoting_atomic_t(atomic_t *a) -{ - schedule(); - return 0; -} - -static void wait_for_snapshot_creation(struct btrfs_root *root) -{ - while (true) { - int ret; - - ret = btrfs_start_write_no_snapshoting(root); - if (ret) - break; - wait_on_atomic_t(&root->will_be_snapshoted, - wait_snapshoting_atomic_t, - TASK_UNINTERRUPTIBLE); - } -} - static int btrfs_setsize(struct inode *inode, struct iattr *attr) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -4925,7 +4903,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) * truncation, it must capture all writes that happened before * this truncation. 
*/ - wait_for_snapshot_creation(root); + btrfs_wait_for_snapshot_creation(root); ret = btrfs_cont_expand(inode, oldsize, newsize); if (ret) { btrfs_end_write_no_snapshoting(root); @@ -8469,7 +8447,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, * not unlock the i_mutex at this case. */ if (offset + count <= inode->i_size) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); relock = true; } ret = btrfs_delalloc_reserve_space(inode, offset, count); @@ -8526,7 +8504,7 @@ out: if (wakeup) inode_dio_end(inode); if (relock) - mutex_lock(&inode->i_mutex); + inode_lock(inode); return ret; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 2a47a3148ec8..952172ca7e45 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -240,7 +240,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ip_oldflags = ip->flags; i_oldflags = inode->i_flags; @@ -358,7 +358,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) } out_unlock: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); mnt_drop_write_file(file); return ret; } @@ -568,6 +568,10 @@ static noinline int create_subvol(struct inode *dir, goto fail; } + mutex_lock(&new_root->objectid_mutex); + new_root->highest_objectid = new_dirid; + mutex_unlock(&new_root->objectid_mutex); + /* * insert the directory item */ @@ -877,7 +881,7 @@ out_up_read: out_dput: dput(dentry); out_unlock: - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); return error; } @@ -1389,18 +1393,18 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, ra_index += cluster; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) BTRFS_I(inode)->force_compress = compress_type; ret = cluster_pages_for_defrag(inode, pages, i, cluster); if (ret < 0) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); goto out_ra; } defrag_count += ret; balance_dirty_pages_ratelimited(inode->i_mapping); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (newer_than) { if (newer_off == (u64)-1) @@ -1461,9 +1465,9 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, out_ra: if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } if (!file) kfree(ra); @@ -2426,7 +2430,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, goto out_dput; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* * Don't allow to delete a subvolume with send in progress. 
This is @@ -2539,7 +2543,7 @@ out_up_write: spin_unlock(&dest->root_item_lock); } out_unlock_inode: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (!err) { d_invalidate(dentry); btrfs_invalidate_inodes(dest); @@ -2555,7 +2559,7 @@ out_unlock_inode: out_dput: dput(dentry); out_unlock_dir: - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); out_drop_write: mnt_drop_write_file(file); out: @@ -2853,8 +2857,8 @@ static inline void lock_extent_range(struct inode *inode, u64 off, u64 len) static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2) { - mutex_unlock(&inode1->i_mutex); - mutex_unlock(&inode2->i_mutex); + inode_unlock(inode1); + inode_unlock(inode2); } static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2) @@ -2862,8 +2866,8 @@ static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2) if (inode1 < inode2) swap(inode1, inode2); - mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(inode1, I_MUTEX_PARENT); + inode_lock_nested(inode2, I_MUTEX_CHILD); } static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1, @@ -3022,7 +3026,7 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, return 0; if (same_inode) { - mutex_lock(&src->i_mutex); + inode_lock(src); ret = extent_same_check_offsets(src, loff, &len, olen); if (ret) @@ -3097,7 +3101,7 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, btrfs_cmp_data_free(&cmp); out_unlock: if (same_inode) - mutex_unlock(&src->i_mutex); + inode_unlock(src); else btrfs_double_inode_unlock(src, dst); @@ -3745,7 +3749,7 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, if (!same_inode) { btrfs_double_inode_lock(src, inode); } else { - mutex_lock(&src->i_mutex); + inode_lock(src); } /* determine range to clone */ @@ -3816,7 +3820,7 @@ out_unlock: if (!same_inode) btrfs_double_inode_unlock(src, inode); else - mutex_unlock(&src->i_mutex); + inode_unlock(src); return ret; } diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 6d707545f775..55161369fab1 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -609,13 +609,28 @@ static int rbio_can_merge(struct btrfs_raid_bio *last, return 1; } +static int rbio_stripe_page_index(struct btrfs_raid_bio *rbio, int stripe, + int index) +{ + return stripe * rbio->stripe_npages + index; +} + +/* + * these are just the pages from the rbio array, not from anything + * the FS sent down to us + */ +static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, + int index) +{ + return rbio->stripe_pages[rbio_stripe_page_index(rbio, stripe, index)]; +} + /* * helper to index into the pstripe */ static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index) { - index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT; - return rbio->stripe_pages[index]; + return rbio_stripe_page(rbio, rbio->nr_data, index); } /* @@ -626,10 +641,7 @@ static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index) { if (rbio->nr_data + 1 == rbio->real_stripes) return NULL; - - index += ((rbio->nr_data + 1) * rbio->stripe_len) >> - PAGE_CACHE_SHIFT; - return rbio->stripe_pages[index]; + return rbio_stripe_page(rbio, rbio->nr_data + 1, index); } /* @@ -889,6 +901,7 @@ static void raid_write_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; int err = bio->bi_error; + int max_errors; if (err) fail_bio_stripe(rbio, bio); @@ -901,7 
+914,9 @@ static void raid_write_end_io(struct bio *bio) err = 0; /* OK, we have read all the stripes we need to. */ - if (atomic_read(&rbio->error) > rbio->bbio->max_errors) + max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ? + 0 : rbio->bbio->max_errors; + if (atomic_read(&rbio->error) > max_errors) err = -EIO; rbio_orig_end_io(rbio, err); @@ -947,8 +962,7 @@ static struct page *page_in_rbio(struct btrfs_raid_bio *rbio, */ static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes) { - unsigned long nr = stripe_len * nr_stripes; - return DIV_ROUND_UP(nr, PAGE_CACHE_SIZE); + return DIV_ROUND_UP(stripe_len, PAGE_CACHE_SIZE) * nr_stripes; } /* @@ -966,8 +980,8 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root, void *p; rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2 + - DIV_ROUND_UP(stripe_npages, BITS_PER_LONG / 8), - GFP_NOFS); + DIV_ROUND_UP(stripe_npages, BITS_PER_LONG) * + sizeof(long), GFP_NOFS); if (!rbio) return ERR_PTR(-ENOMEM); @@ -1021,18 +1035,17 @@ static int alloc_rbio_pages(struct btrfs_raid_bio *rbio) if (!page) return -ENOMEM; rbio->stripe_pages[i] = page; - ClearPageUptodate(page); } return 0; } -/* allocate pages for just the p/q stripes */ +/* only allocate pages for p/q stripes */ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) { int i; struct page *page; - i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT; + i = rbio_stripe_page_index(rbio, rbio->nr_data, 0); for (; i < rbio->nr_pages; i++) { if (rbio->stripe_pages[i]) @@ -1121,18 +1134,6 @@ static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) } /* - * these are just the pages from the rbio array, not from anything - * the FS sent down to us - */ -static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page) -{ - int index; - index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT); - index += page; - return rbio->stripe_pages[index]; -} - -/* * helper function to walk our bio list and populate the bio_pages array with * the result. This seems expensive, but it is faster than constantly * searching through the bio list as we setup the IO in finish_rmw or stripe @@ -1175,7 +1176,6 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) { struct btrfs_bio *bbio = rbio->bbio; void *pointers[rbio->real_stripes]; - int stripe_len = rbio->stripe_len; int nr_data = rbio->nr_data; int stripe; int pagenr; @@ -1183,7 +1183,6 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) int q_stripe = -1; struct bio_list bio_list; struct bio *bio; - int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT; int ret; bio_list_init(&bio_list); @@ -1226,7 +1225,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) else clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); - for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { + for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { struct page *p; /* first collect one page from each data stripe */ for (stripe = 0; stripe < nr_data; stripe++) { @@ -1268,7 +1267,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) * everything else. 
*/ for (stripe = 0; stripe < rbio->real_stripes; stripe++) { - for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { + for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { struct page *page; if (stripe < rbio->nr_data) { page = page_in_rbio(rbio, stripe, pagenr, 1); @@ -1292,7 +1291,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) if (!bbio->tgtdev_map[stripe]) continue; - for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { + for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { struct page *page; if (stripe < rbio->nr_data) { page = page_in_rbio(rbio, stripe, pagenr, 1); @@ -1506,7 +1505,6 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) int bios_to_read = 0; struct bio_list bio_list; int ret; - int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); int pagenr; int stripe; struct bio *bio; @@ -1525,7 +1523,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) * stripe */ for (stripe = 0; stripe < rbio->nr_data; stripe++) { - for (pagenr = 0; pagenr < nr_pages; pagenr++) { + for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { struct page *page; /* * we want to find all the pages missing from @@ -1801,7 +1799,6 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) int pagenr, stripe; void **pointers; int faila = -1, failb = -1; - int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); struct page *page; int err; int i; @@ -1824,7 +1821,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) index_rbio_pages(rbio); - for (pagenr = 0; pagenr < nr_pages; pagenr++) { + for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { /* * Now we just use bitmap to mark the horizontal stripes in * which we have data when doing parity scrub. @@ -1935,7 +1932,7 @@ pstripe: * other endio functions will fiddle the uptodate bits */ if (rbio->operation == BTRFS_RBIO_WRITE) { - for (i = 0; i < nr_pages; i++) { + for (i = 0; i < rbio->stripe_npages; i++) { if (faila != -1) { page = rbio_stripe_page(rbio, faila, i); SetPageUptodate(page); @@ -2031,7 +2028,6 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) int bios_to_read = 0; struct bio_list bio_list; int ret; - int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); int pagenr; int stripe; struct bio *bio; @@ -2055,7 +2051,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) continue; } - for (pagenr = 0; pagenr < nr_pages; pagenr++) { + for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { struct page *p; /* @@ -2279,37 +2275,11 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) if (!page) return -ENOMEM; rbio->stripe_pages[index] = page; - ClearPageUptodate(page); } } return 0; } -/* - * end io function used by finish_rmw. 
When we finally - * get here, we've written a full stripe - */ -static void raid_write_parity_end_io(struct bio *bio) -{ - struct btrfs_raid_bio *rbio = bio->bi_private; - int err = bio->bi_error; - - if (bio->bi_error) - fail_bio_stripe(rbio, bio); - - bio_put(bio); - - if (!atomic_dec_and_test(&rbio->stripes_pending)) - return; - - err = 0; - - if (atomic_read(&rbio->error)) - err = -EIO; - - rbio_orig_end_io(rbio, err); -} - static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) { @@ -2462,7 +2432,7 @@ submit_write: break; bio->bi_private = rbio; - bio->bi_end_io = raid_write_parity_end_io; + bio->bi_end_io = raid_write_end_io; submit_bio(WRITE, bio); } return; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index ef6d8fc85853..fd1c4d982463 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3030,7 +3030,7 @@ int prealloc_file_extent_cluster(struct inode *inode, int ret = 0; BUG_ON(cluster->start != cluster->boundary[0]); - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = btrfs_check_data_free_space(inode, cluster->start, cluster->end + 1 - cluster->start); @@ -3057,7 +3057,7 @@ int prealloc_file_extent_cluster(struct inode *inode, btrfs_free_reserved_data_space(inode, cluster->start, cluster->end + 1 - cluster->start); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 0c981ebe2acb..92bf5ee732fb 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2813,7 +2813,7 @@ out: static inline int scrub_calc_parity_bitmap_len(int nsectors) { - return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * (BITS_PER_LONG / 8); + return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * sizeof(long); } static void scrub_parity_get(struct scrub_parity *sparity) @@ -3458,7 +3458,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, return ret; } - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; if (em->start != chunk_offset) goto out; @@ -4279,7 +4279,7 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, return PTR_ERR(inode); /* Avoid truncate/dio/punch hole.. */ - mutex_lock(&inode->i_mutex); + inode_lock(inode); inode_dio_wait(inode); physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; @@ -4358,7 +4358,7 @@ next_page: } ret = COPY_COMPLETE; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); iput(inode); return ret; } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 9b9eab6d048e..d41e09fe8e38 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -383,6 +383,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) int ret = 0; char *compress_type; bool compress_force = false; + enum btrfs_compression_type saved_compress_type; + bool saved_compress_force; + int no_compress = 0; cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); if (btrfs_fs_compat_ro(root->fs_info, FREE_SPACE_TREE)) @@ -462,6 +465,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) /* Fallthrough */ case Opt_compress: case Opt_compress_type: + saved_compress_type = btrfs_test_opt(root, COMPRESS) ? 
+ info->compress_type : BTRFS_COMPRESS_NONE; + saved_compress_force = + btrfs_test_opt(root, FORCE_COMPRESS); if (token == Opt_compress || token == Opt_compress_force || strcmp(args[0].from, "zlib") == 0) { @@ -470,6 +477,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_set_opt(info->mount_opt, COMPRESS); btrfs_clear_opt(info->mount_opt, NODATACOW); btrfs_clear_opt(info->mount_opt, NODATASUM); + no_compress = 0; } else if (strcmp(args[0].from, "lzo") == 0) { compress_type = "lzo"; info->compress_type = BTRFS_COMPRESS_LZO; @@ -477,25 +485,21 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_clear_opt(info->mount_opt, NODATACOW); btrfs_clear_opt(info->mount_opt, NODATASUM); btrfs_set_fs_incompat(info, COMPRESS_LZO); + no_compress = 0; } else if (strncmp(args[0].from, "no", 2) == 0) { compress_type = "no"; btrfs_clear_opt(info->mount_opt, COMPRESS); btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); compress_force = false; + no_compress++; } else { ret = -EINVAL; goto out; } if (compress_force) { - btrfs_set_and_info(root, FORCE_COMPRESS, - "force %s compression", - compress_type); + btrfs_set_opt(info->mount_opt, FORCE_COMPRESS); } else { - if (!btrfs_test_opt(root, COMPRESS)) - btrfs_info(root->fs_info, - "btrfs: use %s compression", - compress_type); /* * If we remount from compress-force=xxx to * compress=xxx, we need clear FORCE_COMPRESS @@ -504,6 +508,17 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) */ btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); } + if ((btrfs_test_opt(root, COMPRESS) && + (info->compress_type != saved_compress_type || + compress_force != saved_compress_force)) || + (!btrfs_test_opt(root, COMPRESS) && + no_compress == 1)) { + btrfs_info(root->fs_info, + "%s %s compression", + (compress_force) ? 
"force" : "use", + compress_type); + } + compress_force = false; break; case Opt_ssd: btrfs_set_and_info(root, SSD, diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c32abbca9d77..366b335946fa 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -108,7 +108,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { }, }; -const u64 const btrfs_raid_group[BTRFS_NR_RAID_TYPES] = { +const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10, [BTRFS_RAID_RAID1] = BTRFS_BLOCK_GROUP_RAID1, [BTRFS_RAID_DUP] = BTRFS_BLOCK_GROUP_DUP, @@ -233,6 +233,7 @@ static struct btrfs_device *__alloc_device(void) spin_lock_init(&dev->reada_lock); atomic_set(&dev->reada_in_flight, 0); atomic_set(&dev->dev_stats_ccnt, 0); + btrfs_device_data_ordered_init(dev); INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); @@ -1183,7 +1184,7 @@ again: struct map_lookup *map; int i; - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; for (i = 0; i < map->num_stripes; i++) { u64 end; @@ -2755,7 +2756,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, free_extent_map(em); return -EINVAL; } - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; lock_chunks(root->fs_info->chunk_root); check_system_chunk(trans, extent_root, map->type); unlock_chunks(root->fs_info->chunk_root); @@ -3751,7 +3752,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl, if (btrfs_get_num_tolerated_disk_barrier_failures(bctl->meta.target) < btrfs_get_num_tolerated_disk_barrier_failures(bctl->data.target)) { btrfs_warn(fs_info, - "metatdata profile 0x%llx has lower redundancy than data profile 0x%llx", + "metadata profile 0x%llx has lower redundancy than data profile 0x%llx", bctl->meta.target, bctl->data.target); } @@ -4718,7 +4719,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, goto error; } set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); - em->bdev = (struct block_device *)map; + em->map_lookup = map; em->start = start; em->len = num_bytes; em->block_start = 0; @@ -4813,7 +4814,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, return -EINVAL; } - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; item_size = btrfs_chunk_item_size(map->num_stripes); stripe_size = em->orig_block_len; @@ -4968,7 +4969,7 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) if (!em) return 1; - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; for (i = 0; i < map->num_stripes; i++) { if (map->stripes[i].dev->missing) { miss_ndevs++; @@ -5048,7 +5049,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) return 1; } - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) ret = map->num_stripes; else if (map->type & BTRFS_BLOCK_GROUP_RAID10) @@ -5084,7 +5085,7 @@ unsigned long btrfs_full_stripe_len(struct btrfs_root *root, BUG_ON(!em); BUG_ON(em->start > logical || em->start + em->len < logical); - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) len = map->stripe_len * nr_data_stripes(map); free_extent_map(em); @@ -5105,7 +5106,7 @@ int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, BUG_ON(!em); BUG_ON(em->start > logical || em->start + em->len < logical); - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; if (map->type & 
BTRFS_BLOCK_GROUP_RAID56_MASK) ret = 1; free_extent_map(em); @@ -5264,7 +5265,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, return -EINVAL; } - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; offset = logical - em->start; stripe_len = map->stripe_len; @@ -5378,35 +5379,33 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, * target drive. */ for (i = 0; i < tmp_num_stripes; i++) { - if (tmp_bbio->stripes[i].dev->devid == srcdev_devid) { - /* - * In case of DUP, in order to keep it - * simple, only add the mirror with the - * lowest physical address - */ - if (found && - physical_of_found <= - tmp_bbio->stripes[i].physical) - continue; - index_srcdev = i; - found = 1; - physical_of_found = - tmp_bbio->stripes[i].physical; - } + if (tmp_bbio->stripes[i].dev->devid != srcdev_devid) + continue; + + /* + * In case of DUP, in order to keep it simple, only add + * the mirror with the lowest physical address + */ + if (found && + physical_of_found <= tmp_bbio->stripes[i].physical) + continue; + + index_srcdev = i; + found = 1; + physical_of_found = tmp_bbio->stripes[i].physical; } - if (found) { - mirror_num = index_srcdev + 1; - patch_the_first_stripe_for_dev_replace = 1; - physical_to_patch_in_first_stripe = physical_of_found; - } else { + btrfs_put_bbio(tmp_bbio); + + if (!found) { WARN_ON(1); ret = -EIO; - btrfs_put_bbio(tmp_bbio); goto out; } - btrfs_put_bbio(tmp_bbio); + mirror_num = index_srcdev + 1; + patch_the_first_stripe_for_dev_replace = 1; + physical_to_patch_in_first_stripe = physical_of_found; } else if (mirror_num > map->num_stripes) { mirror_num = 0; } @@ -5806,7 +5805,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, free_extent_map(em); return -EIO; } - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; length = em->len; rmap_len = map->stripe_len; @@ -6069,7 +6068,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, bbio->fs_info = root->fs_info; atomic_set(&bbio->stripes_pending, bbio->num_stripes); - if (bbio->raid_map) { + if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) && + ((rw & WRITE) || (mirror_num > 1))) { /* In this case, map_length has been set to the length of a single stripe; not the whole write */ if (rw & WRITE) { @@ -6210,6 +6210,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, struct extent_map *em; u64 logical; u64 length; + u64 stripe_len; u64 devid; u8 uuid[BTRFS_UUID_SIZE]; int num_stripes; @@ -6218,6 +6219,37 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, logical = key->offset; length = btrfs_chunk_length(leaf, chunk); + stripe_len = btrfs_chunk_stripe_len(leaf, chunk); + num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + /* Validation check */ + if (!num_stripes) { + btrfs_err(root->fs_info, "invalid chunk num_stripes: %u", + num_stripes); + return -EIO; + } + if (!IS_ALIGNED(logical, root->sectorsize)) { + btrfs_err(root->fs_info, + "invalid chunk logical %llu", logical); + return -EIO; + } + if (!length || !IS_ALIGNED(length, root->sectorsize)) { + btrfs_err(root->fs_info, + "invalid chunk length %llu", length); + return -EIO; + } + if (!is_power_of_2(stripe_len)) { + btrfs_err(root->fs_info, "invalid chunk stripe length: %llu", + stripe_len); + return -EIO; + } + if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) & + btrfs_chunk_type(leaf, chunk)) { + btrfs_err(root->fs_info, "unrecognized chunk type: %llu", + ~(BTRFS_BLOCK_GROUP_TYPE_MASK | + 
BTRFS_BLOCK_GROUP_PROFILE_MASK) & + btrfs_chunk_type(leaf, chunk)); + return -EIO; + } read_lock(&map_tree->map_tree.lock); em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); @@ -6234,7 +6266,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, em = alloc_extent_map(); if (!em) return -ENOMEM; - num_stripes = btrfs_chunk_num_stripes(leaf, chunk); map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); if (!map) { free_extent_map(em); @@ -6242,7 +6273,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, } set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); - em->bdev = (struct block_device *)map; + em->map_lookup = map; em->start = logical; em->len = length; em->orig_start = 0; @@ -6944,7 +6975,7 @@ void btrfs_update_commit_device_bytes_used(struct btrfs_root *root, /* In order to kick the device replace finish process */ lock_chunks(root); list_for_each_entry(em, &transaction->pending_chunks, list) { - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; for (i = 0; i < map->num_stripes; i++) { dev = map->stripes[i].dev; diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index fd953c361a43..6c68d6356197 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -126,7 +126,7 @@ static int do_setxattr(struct btrfs_trans_handle *trans, * locks the inode's i_mutex before calling setxattr or removexattr. */ if (flags & XATTR_REPLACE) { - ASSERT(mutex_is_locked(&inode->i_mutex)); + ASSERT(inode_is_locked(inode)); di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode), name, name_len, 0); if (!di) diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index afa023dded5b..675a3332d72f 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -446,7 +446,7 @@ static int cachefiles_attr_changed(struct fscache_object *_object) return 0; cachefiles_begin_secure(cache, &saved_cred); - mutex_lock(&d_inode(object->backer)->i_mutex); + inode_lock(d_inode(object->backer)); /* if there's an extension to a partial page at the end of the backing * file, we need to discard the partial page so that we pick up new @@ -465,7 +465,7 @@ static int cachefiles_attr_changed(struct fscache_object *_object) ret = notify_change(object->backer, &newattrs, NULL); truncate_failed: - mutex_unlock(&d_inode(object->backer)->i_mutex); + inode_unlock(d_inode(object->backer)); cachefiles_end_secure(cache, saved_cred); if (ret == -EIO) { diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index c4b893453e0e..1c2334c163dd 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -295,7 +295,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache, cachefiles_mark_object_buried(cache, rep, why); } - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); if (ret == -EIO) cachefiles_io_error(cache, "Unlink failed"); @@ -306,7 +306,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache, /* directories have to be moved to the graveyard */ _debug("move stale object to graveyard"); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); try_again: /* first step is to make up a grave dentry in the graveyard */ @@ -423,13 +423,13 @@ int cachefiles_delete_object(struct cachefiles_cache *cache, dir = dget_parent(object->dentry); - mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); if (test_bit(FSCACHE_OBJECT_KILLED_BY_CACHE, &object->fscache.flags)) { /* object allocation for the same key preemptively deleted this * object's 
file so that it could create its own file */ _debug("object preemptively buried"); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); ret = 0; } else { /* we need to check that our parent is _still_ our parent - it @@ -442,7 +442,7 @@ int cachefiles_delete_object(struct cachefiles_cache *cache, /* it got moved, presumably by cachefilesd culling it, * so it's no longer in the key path and we can ignore * it */ - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); ret = 0; } } @@ -501,7 +501,7 @@ lookup_again: /* search the current directory for the element name */ _debug("lookup '%s'", name); - mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); start = jiffies; next = lookup_one_len(name, dir, nlen); @@ -585,7 +585,7 @@ lookup_again: /* process the next component */ if (key) { _debug("advance"); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); dput(dir); dir = next; next = NULL; @@ -623,7 +623,7 @@ lookup_again: /* note that we're now using this object */ ret = cachefiles_mark_object_active(cache, object); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); dput(dir); dir = NULL; @@ -705,7 +705,7 @@ lookup_error: cachefiles_io_error(cache, "Lookup failed"); next = NULL; error: - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); dput(next); error_out2: dput(dir); @@ -729,7 +729,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, _enter(",,%s", dirname); /* search the current directory for the element name */ - mutex_lock(&d_inode(dir)->i_mutex); + inode_lock(d_inode(dir)); start = jiffies; subdir = lookup_one_len(dirname, dir, strlen(dirname)); @@ -768,7 +768,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, d_backing_inode(subdir)->i_ino); } - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); /* we need to make sure the subdir is a directory */ ASSERT(d_backing_inode(subdir)); @@ -800,19 +800,19 @@ check_error: return ERR_PTR(ret); mkdir_error: - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); dput(subdir); pr_err("mkdir %s failed with error %d\n", dirname, ret); return ERR_PTR(ret); lookup_error: - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); ret = PTR_ERR(subdir); pr_err("Lookup %s failed with error %d\n", dirname, ret); return ERR_PTR(ret); nomem_d_alloc: - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); _leave(" = -ENOMEM"); return ERR_PTR(-ENOMEM); } @@ -837,7 +837,7 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache, // dir, filename); /* look up the victim */ - mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); start = jiffies; victim = lookup_one_len(filename, dir, strlen(filename)); @@ -852,7 +852,7 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache, * at the netfs's request whilst the cull was in progress */ if (d_is_negative(victim)) { - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); dput(victim); _leave(" = -ENOENT [absent]"); return ERR_PTR(-ENOENT); @@ -881,13 +881,13 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache, object_in_use: read_unlock(&cache->active_lock); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); dput(victim); //_leave(" = -EBUSY [in use]"); return ERR_PTR(-EBUSY); lookup_error: - 
mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); ret = PTR_ERR(victim); if (ret == -ENOENT) { /* file or dir now absent - probably retired by netfs */ @@ -947,7 +947,7 @@ int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir, return 0; error_unlock: - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); error: dput(victim); if (ret == -ENOENT) { @@ -982,7 +982,7 @@ int cachefiles_check_in_use(struct cachefiles_cache *cache, struct dentry *dir, if (IS_ERR(victim)) return PTR_ERR(victim); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); dput(victim); //_leave(" = 0"); return 0; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index b7d218a168fb..c22213789090 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1108,7 +1108,7 @@ retry_locked: return 0; /* past end of file? */ - i_size = inode->i_size; /* caller holds i_mutex */ + i_size = i_size_read(inode); if (page_off >= i_size || (pos_in_page == 0 && (pos+len) >= i_size && @@ -1149,7 +1149,6 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, page = grab_cache_page_write_begin(mapping, index, 0); if (!page) return -ENOMEM; - *pagep = page; dout("write_begin file %p inode %p page %p %d~%d\n", file, inode, page, (int)pos, (int)len); @@ -1184,8 +1183,7 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, zero_user_segment(page, from+copied, len); /* did file size increase? */ - /* (no need for i_size_read(); we caller holds i_mutex */ - if (pos+copied > inode->i_size) + if (pos+copied > i_size_read(inode)) check_cap = ceph_inode_set_size(inode, pos+copied); if (!PageUptodate(page)) @@ -1378,11 +1376,13 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) ret = VM_FAULT_NOPAGE; if ((off > size) || - (page->mapping != inode->i_mapping)) + (page->mapping != inode->i_mapping)) { + unlock_page(page); goto out; + } ret = ceph_update_writeable_page(vma->vm_file, off, len, page); - if (ret == 0) { + if (ret >= 0) { /* success. we'll keep the page locked. 
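The mutex_lock(&inode->i_mutex) to inode_lock(inode) conversions running through these cachefiles and ceph hunks (and the cifs ones further down) all target the then-new VFS wrapper API. At introduction the wrappers were thin shims over i_mutex, roughly as sketched below; later kernels could swap the backing lock without touching any of these call sites, which is the point of the abstraction:

static inline void inode_lock(struct inode *inode)
{
	mutex_lock(&inode->i_mutex);
}

static inline void inode_unlock(struct inode *inode)
{
	mutex_unlock(&inode->i_mutex);
}

static inline void inode_lock_nested(struct inode *inode, unsigned subclass)
{
	mutex_lock_nested(&inode->i_mutex, subclass);
}

static inline int inode_is_locked(struct inode *inode)
{
	return mutex_is_locked(&inode->i_mutex);
}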
*/ set_page_dirty(page); ret = VM_FAULT_LOCKED; @@ -1393,8 +1393,6 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) ret = VM_FAULT_SIGBUS; } out: - if (ret != VM_FAULT_LOCKED) - unlock_page(page); if (ret == VM_FAULT_LOCKED || ci->i_inline_version != CEPH_INLINE_NONE) { int dirty; diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index a4766ded1ba7..a351480dbabc 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -106,7 +106,7 @@ static uint16_t ceph_fscache_inode_get_aux(const void *cookie_netfs_data, memset(&aux, 0, sizeof(aux)); aux.mtime = inode->i_mtime; - aux.size = inode->i_size; + aux.size = i_size_read(inode); memcpy(buffer, &aux, sizeof(aux)); @@ -117,9 +117,7 @@ static void ceph_fscache_inode_get_attr(const void *cookie_netfs_data, uint64_t *size) { const struct ceph_inode_info* ci = cookie_netfs_data; - const struct inode* inode = &ci->vfs_inode; - - *size = inode->i_size; + *size = i_size_read(&ci->vfs_inode); } static enum fscache_checkaux ceph_fscache_inode_check_aux( @@ -134,7 +132,7 @@ static enum fscache_checkaux ceph_fscache_inode_check_aux( memset(&aux, 0, sizeof(aux)); aux.mtime = inode->i_mtime; - aux.size = inode->i_size; + aux.size = i_size_read(inode); if (memcmp(data, &aux, sizeof(aux)) != 0) return FSCACHE_CHECKAUX_OBSOLETE; @@ -197,7 +195,7 @@ void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc, return; /* Avoid multiple racing open requests */ - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (ci->fscache) goto done; @@ -207,7 +205,7 @@ void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc, ci, true); fscache_check_consistency(ci->fscache); done: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index c69e1253b47b..cdbf8cf3d52c 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2030,7 +2030,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) if (datasync) goto out; - mutex_lock(&inode->i_mutex); + inode_lock(inode); dirty = try_flush_caps(inode, &flush_tid); dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); @@ -2046,7 +2046,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) ret = wait_event_interruptible(ci->i_cap_wq, caps_are_flushed(inode, flush_tid)); } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); out: dout("fsync %p%s result=%d\n", inode, datasync ? 
" datasync" : "", ret); return ret; diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 9314b4ea2375..fd11fb231a2e 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -507,7 +507,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) loff_t old_offset = ceph_make_fpos(fi->frag, fi->next_offset); loff_t retval; - mutex_lock(&inode->i_mutex); + inode_lock(inode); retval = -EINVAL; switch (whence) { case SEEK_CUR: @@ -542,7 +542,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) } } out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return retval; } diff --git a/fs/ceph/export.c b/fs/ceph/export.c index fe02ae7f056a..3b3172357326 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -215,7 +215,7 @@ static int ceph_get_name(struct dentry *parent, char *name, if (IS_ERR(req)) return PTR_ERR(req); - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); req->r_inode = d_inode(child); ihold(d_inode(child)); @@ -224,7 +224,7 @@ static int ceph_get_name(struct dentry *parent, char *name, req->r_num_caps = 2; err = ceph_mdsc_do_request(mdsc, NULL, req); - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); if (!err) { struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 3c68e6aee2f0..86a9c383955e 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -397,8 +397,9 @@ int ceph_release(struct inode *inode, struct file *file) } enum { - CHECK_EOF = 1, - READ_INLINE = 2, + HAVE_RETRIED = 1, + CHECK_EOF = 2, + READ_INLINE = 3, }; /* @@ -411,17 +412,15 @@ enum { static int striped_read(struct inode *inode, u64 off, u64 len, struct page **pages, int num_pages, - int *checkeof, bool o_direct, - unsigned long buf_align) + int *checkeof) { struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); u64 pos, this_len, left; - int io_align, page_align; - int pages_left; - int read; + loff_t i_size; + int page_align, pages_left; + int read, ret; struct page **page_pos; - int ret; bool hit_stripe, was_short; /* @@ -432,13 +431,9 @@ static int striped_read(struct inode *inode, page_pos = pages; pages_left = num_pages; read = 0; - io_align = off & ~PAGE_MASK; more: - if (o_direct) - page_align = (pos - io_align + buf_align) & ~PAGE_MASK; - else - page_align = pos & ~PAGE_MASK; + page_align = pos & ~PAGE_MASK; this_len = left; ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode), &ci->i_layout, pos, &this_len, @@ -452,13 +447,12 @@ more: dout("striped_read %llu~%llu (read %u) got %d%s%s\n", pos, left, read, ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : ""); + i_size = i_size_read(inode); if (ret >= 0) { int didpages; - if (was_short && (pos + ret < inode->i_size)) { - int zlen = min(this_len - ret, - inode->i_size - pos - ret); - int zoff = (o_direct ? buf_align : io_align) + - read + ret; + if (was_short && (pos + ret < i_size)) { + int zlen = min(this_len - ret, i_size - pos - ret); + int zoff = (off & ~PAGE_MASK) + read + ret; dout(" zero gap %llu to %llu\n", pos + ret, pos + ret + zlen); ceph_zero_page_vector_range(zoff, zlen, pages); @@ -473,14 +467,14 @@ more: pages_left -= didpages; /* hit stripe and need continue*/ - if (left && hit_stripe && pos < inode->i_size) + if (left && hit_stripe && pos < i_size) goto more; } if (read > 0) { ret = read; /* did we bounce off eof? 
*/ - if (pos + left > inode->i_size) + if (pos + left > i_size) *checkeof = CHECK_EOF; } @@ -521,54 +515,28 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i, if (ret < 0) return ret; - if (iocb->ki_flags & IOCB_DIRECT) { - while (iov_iter_count(i)) { - size_t start; - ssize_t n; - - n = dio_get_pagev_size(i); - pages = dio_get_pages_alloc(i, n, &start, &num_pages); - if (IS_ERR(pages)) - return PTR_ERR(pages); - - ret = striped_read(inode, off, n, - pages, num_pages, checkeof, - 1, start); - - ceph_put_page_vector(pages, num_pages, true); - - if (ret <= 0) - break; - off += ret; - iov_iter_advance(i, ret); - if (ret < n) + num_pages = calc_pages_for(off, len); + pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); + if (IS_ERR(pages)) + return PTR_ERR(pages); + ret = striped_read(inode, off, len, pages, + num_pages, checkeof); + if (ret > 0) { + int l, k = 0; + size_t left = ret; + + while (left) { + size_t page_off = off & ~PAGE_MASK; + size_t copy = min_t(size_t, left, + PAGE_SIZE - page_off); + l = copy_page_to_iter(pages[k++], page_off, copy, i); + off += l; + left -= l; + if (l < copy) break; } - } else { - num_pages = calc_pages_for(off, len); - pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); - if (IS_ERR(pages)) - return PTR_ERR(pages); - ret = striped_read(inode, off, len, pages, - num_pages, checkeof, 0, 0); - if (ret > 0) { - int l, k = 0; - size_t left = ret; - - while (left) { - size_t page_off = off & ~PAGE_MASK; - size_t copy = min_t(size_t, - PAGE_SIZE - page_off, left); - l = copy_page_to_iter(pages[k++], page_off, - copy, i); - off += l; - left -= l; - if (l < copy) - break; - } - } - ceph_release_page_vector(pages, num_pages); } + ceph_release_page_vector(pages, num_pages); if (off > iocb->ki_pos) { ret = off - iocb->ki_pos; @@ -579,6 +547,193 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i, return ret; } +struct ceph_aio_request { + struct kiocb *iocb; + size_t total_len; + int write; + int error; + struct list_head osd_reqs; + unsigned num_reqs; + atomic_t pending_reqs; + struct timespec mtime; + struct ceph_cap_flush *prealloc_cf; +}; + +struct ceph_aio_work { + struct work_struct work; + struct ceph_osd_request *req; +}; + +static void ceph_aio_retry_work(struct work_struct *work); + +static void ceph_aio_complete(struct inode *inode, + struct ceph_aio_request *aio_req) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int ret; + + if (!atomic_dec_and_test(&aio_req->pending_reqs)) + return; + + ret = aio_req->error; + if (!ret) + ret = aio_req->total_len; + + dout("ceph_aio_complete %p rc %d\n", inode, ret); + + if (ret >= 0 && aio_req->write) { + int dirty; + + loff_t endoff = aio_req->iocb->ki_pos + aio_req->total_len; + if (endoff > i_size_read(inode)) { + if (ceph_inode_set_size(inode, endoff)) + ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); + } + + spin_lock(&ci->i_ceph_lock); + ci->i_inline_version = CEPH_INLINE_NONE; + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, + &aio_req->prealloc_cf); + spin_unlock(&ci->i_ceph_lock); + if (dirty) + __mark_inode_dirty(inode, dirty); + + } + + ceph_put_cap_refs(ci, (aio_req->write ? 
CEPH_CAP_FILE_WR : + CEPH_CAP_FILE_RD)); + + aio_req->iocb->ki_complete(aio_req->iocb, ret, 0); + + ceph_free_cap_flush(aio_req->prealloc_cf); + kfree(aio_req); +} + +static void ceph_aio_complete_req(struct ceph_osd_request *req, + struct ceph_msg *msg) +{ + int rc = req->r_result; + struct inode *inode = req->r_inode; + struct ceph_aio_request *aio_req = req->r_priv; + struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0); + int num_pages = calc_pages_for((u64)osd_data->alignment, + osd_data->length); + + dout("ceph_aio_complete_req %p rc %d bytes %llu\n", + inode, rc, osd_data->length); + + if (rc == -EOLDSNAPC) { + struct ceph_aio_work *aio_work; + BUG_ON(!aio_req->write); + + aio_work = kmalloc(sizeof(*aio_work), GFP_NOFS); + if (aio_work) { + INIT_WORK(&aio_work->work, ceph_aio_retry_work); + aio_work->req = req; + queue_work(ceph_inode_to_client(inode)->wb_wq, + &aio_work->work); + return; + } + rc = -ENOMEM; + } else if (!aio_req->write) { + if (rc == -ENOENT) + rc = 0; + if (rc >= 0 && osd_data->length > rc) { + int zoff = osd_data->alignment + rc; + int zlen = osd_data->length - rc; + /* + * If read is satisfied by single OSD request, + * it can pass EOF. Otherwise read is within + * i_size. + */ + if (aio_req->num_reqs == 1) { + loff_t i_size = i_size_read(inode); + loff_t endoff = aio_req->iocb->ki_pos + rc; + if (endoff < i_size) + zlen = min_t(size_t, zlen, + i_size - endoff); + aio_req->total_len = rc + zlen; + } + + if (zlen > 0) + ceph_zero_page_vector_range(zoff, zlen, + osd_data->pages); + } + } + + ceph_put_page_vector(osd_data->pages, num_pages, false); + ceph_osdc_put_request(req); + + if (rc < 0) + cmpxchg(&aio_req->error, 0, rc); + + ceph_aio_complete(inode, aio_req); + return; +} + +static void ceph_aio_retry_work(struct work_struct *work) +{ + struct ceph_aio_work *aio_work = + container_of(work, struct ceph_aio_work, work); + struct ceph_osd_request *orig_req = aio_work->req; + struct ceph_aio_request *aio_req = orig_req->r_priv; + struct inode *inode = orig_req->r_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_snap_context *snapc; + struct ceph_osd_request *req; + int ret; + + spin_lock(&ci->i_ceph_lock); + if (__ceph_have_pending_cap_snap(ci)) { + struct ceph_cap_snap *capsnap = + list_last_entry(&ci->i_cap_snaps, + struct ceph_cap_snap, + ci_item); + snapc = ceph_get_snap_context(capsnap->context); + } else { + BUG_ON(!ci->i_head_snapc); + snapc = ceph_get_snap_context(ci->i_head_snapc); + } + spin_unlock(&ci->i_ceph_lock); + + req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 2, + false, GFP_NOFS); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + req = orig_req; + goto out; + } + + req->r_flags = CEPH_OSD_FLAG_ORDERSNAP | + CEPH_OSD_FLAG_ONDISK | + CEPH_OSD_FLAG_WRITE; + req->r_base_oloc = orig_req->r_base_oloc; + req->r_base_oid = orig_req->r_base_oid; + + req->r_ops[0] = orig_req->r_ops[0]; + osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); + + ceph_osdc_build_request(req, req->r_ops[0].extent.offset, + snapc, CEPH_NOSNAP, &aio_req->mtime); + + ceph_put_snap_context(snapc); + ceph_osdc_put_request(orig_req); + + req->r_callback = ceph_aio_complete_req; + req->r_inode = inode; + req->r_priv = aio_req; + + ret = ceph_osdc_start_request(req->r_osdc, req, false); +out: + if (ret < 0) { + BUG_ON(ret == -EOLDSNAPC); + req->r_result = ret; + ceph_aio_complete_req(req, NULL); + } + + kfree(aio_work); +} + /* * Write commit request unsafe callback, called to tell us when a * request is unsafe (that is, in flight--has been 
handed to the @@ -612,16 +767,10 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe) } -/* - * Synchronous write, straight from __user pointer or user pages. - * - * If write spans object boundary, just do multiple writes. (For a - * correct atomic write, we should e.g. take write locks on all - * objects, rollback on failure, etc.) - */ static ssize_t -ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, - struct ceph_snap_context *snapc) +ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, + struct ceph_snap_context *snapc, + struct ceph_cap_flush **pcf) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); @@ -630,44 +779,52 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, struct ceph_vino vino; struct ceph_osd_request *req; struct page **pages; - int num_pages; - int written = 0; + struct ceph_aio_request *aio_req = NULL; + int num_pages = 0; int flags; - int check_caps = 0; int ret; struct timespec mtime = CURRENT_TIME; - size_t count = iov_iter_count(from); + size_t count = iov_iter_count(iter); + loff_t pos = iocb->ki_pos; + bool write = iov_iter_rw(iter) == WRITE; - if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) + if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP) return -EROFS; - dout("sync_direct_write on file %p %lld~%u\n", file, pos, - (unsigned)count); + dout("sync_direct_read_write (%s) on file %p %lld~%u\n", + (write ? "write" : "read"), file, pos, (unsigned)count); ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count); if (ret < 0) return ret; - ret = invalidate_inode_pages2_range(inode->i_mapping, - pos >> PAGE_CACHE_SHIFT, - (pos + count) >> PAGE_CACHE_SHIFT); - if (ret < 0) - dout("invalidate_inode_pages2_range returned %d\n", ret); + if (write) { + ret = invalidate_inode_pages2_range(inode->i_mapping, + pos >> PAGE_CACHE_SHIFT, + (pos + count) >> PAGE_CACHE_SHIFT); + if (ret < 0) + dout("invalidate_inode_pages2_range returned %d\n", ret); - flags = CEPH_OSD_FLAG_ORDERSNAP | - CEPH_OSD_FLAG_ONDISK | - CEPH_OSD_FLAG_WRITE; + flags = CEPH_OSD_FLAG_ORDERSNAP | + CEPH_OSD_FLAG_ONDISK | + CEPH_OSD_FLAG_WRITE; + } else { + flags = CEPH_OSD_FLAG_READ; + } - while (iov_iter_count(from) > 0) { - u64 len = dio_get_pagev_size(from); - size_t start; - ssize_t n; + while (iov_iter_count(iter) > 0) { + u64 size = dio_get_pagev_size(iter); + size_t start = 0; + ssize_t len; vino = ceph_vino(inode); req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, - vino, pos, &len, 0, - 2,/*include a 'startsync' command*/ - CEPH_OSD_OP_WRITE, flags, snapc, + vino, pos, &size, 0, + /*include a 'startsync' command*/ + write ? 2 : 1, + write ? CEPH_OSD_OP_WRITE : + CEPH_OSD_OP_READ, + flags, snapc, ci->i_truncate_seq, ci->i_truncate_size, false); @@ -676,10 +833,8 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, break; } - osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); - - n = len; - pages = dio_get_pages_alloc(from, len, &start, &num_pages); + len = size; + pages = dio_get_pages_alloc(iter, len, &start, &num_pages); if (IS_ERR(pages)) { ceph_osdc_put_request(req); ret = PTR_ERR(pages); @@ -687,47 +842,128 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, } /* - * throw out any page cache pages in this range. this - * may block. + * To simplify error handling, allow AIO when IO within i_size + * or IO can be satisfied by single OSD request. 
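The condition this comment describes is tested a few lines further down; restated as a standalone predicate with simplified types (the helper name is illustrative):

/* AIO is attempted only for the first OSD request of a genuinely
 * asynchronous kiocb, and only if one request covers the whole iter
 * or the whole range lies below i_size; anything else falls back to
 * the synchronous path */
static int aio_eligible(long long pos, long long ki_pos, int is_sync,
			long long len, long long count, long long i_size)
{
	return pos == ki_pos && !is_sync &&
	       (len == count || pos + count <= i_size);
}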
*/ - truncate_inode_pages_range(inode->i_mapping, pos, - (pos+n) | (PAGE_CACHE_SIZE-1)); - osd_req_op_extent_osd_data_pages(req, 0, pages, n, start, - false, false); + if (pos == iocb->ki_pos && !is_sync_kiocb(iocb) && + (len == count || pos + count <= i_size_read(inode))) { + aio_req = kzalloc(sizeof(*aio_req), GFP_KERNEL); + if (aio_req) { + aio_req->iocb = iocb; + aio_req->write = write; + INIT_LIST_HEAD(&aio_req->osd_reqs); + if (write) { + aio_req->mtime = mtime; + swap(aio_req->prealloc_cf, *pcf); + } + } + /* ignore error */ + } + + if (write) { + /* + * throw out any page cache pages in this range. this + * may block. + */ + truncate_inode_pages_range(inode->i_mapping, pos, + (pos+len) | (PAGE_CACHE_SIZE - 1)); + + osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); + } + + + osd_req_op_extent_osd_data_pages(req, 0, pages, len, start, + false, false); - /* BUG_ON(vino.snap != CEPH_NOSNAP); */ ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); - ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); + if (aio_req) { + aio_req->total_len += len; + aio_req->num_reqs++; + atomic_inc(&aio_req->pending_reqs); + + req->r_callback = ceph_aio_complete_req; + req->r_inode = inode; + req->r_priv = aio_req; + list_add_tail(&req->r_unsafe_item, &aio_req->osd_reqs); + + pos += len; + iov_iter_advance(iter, len); + continue; + } + + ret = ceph_osdc_start_request(req->r_osdc, req, false); if (!ret) ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + size = i_size_read(inode); + if (!write) { + if (ret == -ENOENT) + ret = 0; + if (ret >= 0 && ret < len && pos + ret < size) { + int zlen = min_t(size_t, len - ret, + size - pos - ret); + ceph_zero_page_vector_range(start + ret, zlen, + pages); + ret += zlen; + } + if (ret >= 0) + len = ret; + } + ceph_put_page_vector(pages, num_pages, false); ceph_osdc_put_request(req); - if (ret) + if (ret < 0) break; - pos += n; - written += n; - iov_iter_advance(from, n); - if (pos > i_size_read(inode)) { - check_caps = ceph_inode_set_size(inode, pos); - if (check_caps) + pos += len; + iov_iter_advance(iter, len); + + if (!write && pos >= size) + break; + + if (write && pos > size) { + if (ceph_inode_set_size(inode, pos)) ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL); } } - if (ret != -EOLDSNAPC && written > 0) { + if (aio_req) { + if (aio_req->num_reqs == 0) { + kfree(aio_req); + return ret; + } + + ceph_get_cap_refs(ci, write ? CEPH_CAP_FILE_WR : + CEPH_CAP_FILE_RD); + + while (!list_empty(&aio_req->osd_reqs)) { + req = list_first_entry(&aio_req->osd_reqs, + struct ceph_osd_request, + r_unsafe_item); + list_del_init(&req->r_unsafe_item); + if (ret >= 0) + ret = ceph_osdc_start_request(req->r_osdc, + req, false); + if (ret < 0) { + BUG_ON(ret == -EOLDSNAPC); + req->r_result = ret; + ceph_aio_complete_req(req, NULL); + } + } + return -EIOCBQUEUED; + } + + if (ret != -EOLDSNAPC && pos > iocb->ki_pos) { + ret = pos - iocb->ki_pos; iocb->ki_pos = pos; - ret = written; } return ret; } - /* * Synchronous write, straight from __user pointer or user pages. * @@ -897,8 +1133,14 @@ again: ceph_cap_string(got)); if (ci->i_inline_version == CEPH_INLINE_NONE) { - /* hmm, this isn't really async... 
*/ - ret = ceph_sync_read(iocb, to, &retry_op); + if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) { + ret = ceph_direct_read_write(iocb, to, + NULL, NULL); + if (ret >= 0 && ret < len) + retry_op = CHECK_EOF; + } else { + ret = ceph_sync_read(iocb, to, &retry_op); + } } else { retry_op = READ_INLINE; } @@ -916,7 +1158,7 @@ again: pinned_page = NULL; } ceph_put_cap_refs(ci, got); - if (retry_op && ret >= 0) { + if (retry_op > HAVE_RETRIED && ret >= 0) { int statret; struct page *page = NULL; loff_t i_size; @@ -968,12 +1210,11 @@ again: if (retry_op == CHECK_EOF && iocb->ki_pos < i_size && ret < len) { dout("sync_read hit hole, ppos %lld < size %lld" - ", reading more\n", iocb->ki_pos, - inode->i_size); + ", reading more\n", iocb->ki_pos, i_size); read += ret; len -= ret; - retry_op = 0; + retry_op = HAVE_RETRIED; goto again; } } @@ -1014,7 +1255,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) if (!prealloc_cf) return -ENOMEM; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* We can write back this queue in page reclaim */ current->backing_dev_info = inode_to_bdi(inode); @@ -1052,7 +1293,7 @@ retry_snap: } dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n", - inode, ceph_vinop(inode), pos, count, inode->i_size); + inode, ceph_vinop(inode), pos, count, i_size_read(inode)); if (fi->fmode & CEPH_FILE_MODE_LAZY) want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; else @@ -1070,7 +1311,7 @@ retry_snap: (iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC)) { struct ceph_snap_context *snapc; struct iov_iter data; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); spin_lock(&ci->i_ceph_lock); if (__ceph_have_pending_cap_snap(ci)) { @@ -1088,8 +1329,8 @@ retry_snap: /* we might need to revert back to that point */ data = *from; if (iocb->ki_flags & IOCB_DIRECT) - written = ceph_sync_direct_write(iocb, &data, pos, - snapc); + written = ceph_direct_read_write(iocb, &data, snapc, + &prealloc_cf); else written = ceph_sync_write(iocb, &data, pos, snapc); if (written == -EOLDSNAPC) { @@ -1097,14 +1338,14 @@ retry_snap: "got EOLDSNAPC, retrying\n", inode, ceph_vinop(inode), pos, (unsigned)count); - mutex_lock(&inode->i_mutex); + inode_lock(inode); goto retry_snap; } if (written > 0) iov_iter_advance(from, written); ceph_put_snap_context(snapc); } else { - loff_t old_size = inode->i_size; + loff_t old_size = i_size_read(inode); /* * No need to acquire the i_truncate_mutex. 
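The parallel inode->i_size to i_size_read()/i_size_write() substitutions in these ceph hunks are not cosmetic: on 32-bit SMP a 64-bit loff_t load can tear, so i_size_read() retries under a sequence counter. A simplified single-writer model of that retry loop (a sketch only, not the fs.h implementation):

struct isize_model {
	unsigned int seq;	/* odd while a writer is mid-update */
	long long size;
};

static long long model_i_size_read(const volatile struct isize_model *s)
{
	unsigned int seq;
	long long v;

	do {
		seq = s->seq;	/* like read_seqcount_begin() */
		v = s->size;
	} while ((seq & 1) || seq != s->seq);	/* retry torn reads */
	return v;
}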
Because * the MDS revokes Fwb caps before sending truncate @@ -1115,9 +1356,9 @@ retry_snap: written = generic_perform_write(file, from, pos); if (likely(written >= 0)) iocb->ki_pos = pos + written; - if (inode->i_size > old_size) + if (i_size_read(inode) > old_size) ceph_fscache_update_objectsize(inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } if (written >= 0) { @@ -1147,7 +1388,7 @@ retry_snap: goto out_unlocked; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); out_unlocked: ceph_free_cap_flush(prealloc_cf); current->backing_dev_info = NULL; @@ -1160,9 +1401,10 @@ out_unlocked: static loff_t ceph_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; + loff_t i_size; int ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) { ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false); @@ -1172,9 +1414,10 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence) } } + i_size = i_size_read(inode); switch (whence) { case SEEK_END: - offset += inode->i_size; + offset += i_size; break; case SEEK_CUR: /* @@ -1190,24 +1433,24 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence) offset += file->f_pos; break; case SEEK_DATA: - if (offset >= inode->i_size) { + if (offset >= i_size) { ret = -ENXIO; goto out; } break; case SEEK_HOLE: - if (offset >= inode->i_size) { + if (offset >= i_size) { ret = -ENXIO; goto out; } - offset = inode->i_size; + offset = i_size; break; } offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return offset; } @@ -1363,7 +1606,7 @@ static long ceph_fallocate(struct file *file, int mode, if (!prealloc_cf) return -ENOMEM; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (ceph_snap(inode) != CEPH_NOSNAP) { ret = -EROFS; @@ -1418,7 +1661,7 @@ static long ceph_fallocate(struct file *file, int mode, ceph_put_cap_refs(ci, got); unlock: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); ceph_free_cap_flush(prealloc_cf); return ret; } diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index da55eb8bcffa..fb4ba2e4e2a5 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -548,7 +548,7 @@ int ceph_fill_file_size(struct inode *inode, int issued, if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 || (truncate_seq == ci->i_truncate_seq && size > inode->i_size)) { dout("size %lld -> %llu\n", inode->i_size, size); - inode->i_size = size; + i_size_write(inode, size); inode->i_blocks = (size + (1<<9) - 1) >> 9; ci->i_reported_size = size; if (truncate_seq != ci->i_truncate_seq) { @@ -808,7 +808,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page, spin_unlock(&ci->i_ceph_lock); err = -EINVAL; - if (WARN_ON(symlen != inode->i_size)) + if (WARN_ON(symlen != i_size_read(inode))) goto out; err = -ENOMEM; @@ -1549,7 +1549,7 @@ int ceph_inode_set_size(struct inode *inode, loff_t size) spin_lock(&ci->i_ceph_lock); dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size); - inode->i_size = size; + i_size_write(inode, size); inode->i_blocks = (size + (1 << 9) - 1) >> 9; /* tell the MDS if we are approaching max_size */ @@ -1911,7 +1911,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) inode->i_size, attr->ia_size); if ((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size > inode->i_size) { - inode->i_size = attr->ia_size; + i_size_write(inode, attr->ia_size); inode->i_blocks = (attr->ia_size + (1 << 9) - 
1) >> 9; inode->i_ctime = attr->ia_ctime; diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 7febcf2475c5..50b268483302 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -50,7 +50,7 @@ void cifs_vfs_err(const char *fmt, ...) vaf.fmt = fmt; vaf.va = &args; - pr_err("CIFS VFS: %pV", &vaf); + pr_err_ratelimited("CIFS VFS: %pV", &vaf); va_end(args); } diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h index f40fbaca1b2a..66cf0f9fff89 100644 --- a/fs/cifs/cifs_debug.h +++ b/fs/cifs/cifs_debug.h @@ -51,14 +51,13 @@ __printf(1, 2) void cifs_vfs_err(const char *fmt, ...); /* information message: e.g., configuration, major event */ #define cifs_dbg(type, fmt, ...) \ do { \ - if (type == FYI) { \ - if (cifsFYI & CIFS_INFO) { \ - pr_debug("%s: " fmt, __FILE__, ##__VA_ARGS__); \ - } \ + if (type == FYI && cifsFYI & CIFS_INFO) { \ + pr_debug_ratelimited("%s: " \ + fmt, __FILE__, ##__VA_ARGS__); \ } else if (type == VFS) { \ cifs_vfs_err(fmt, ##__VA_ARGS__); \ } else if (type == NOISY && type != 0) { \ - pr_debug(fmt, ##__VA_ARGS__); \ + pr_debug_ratelimited(fmt, ##__VA_ARGS__); \ } \ } while (0) diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index c4c1169814b2..c48ca13673e3 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -507,6 +507,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_printf(s, ",rsize=%u", cifs_sb->rsize); seq_printf(s, ",wsize=%u", cifs_sb->wsize); + seq_printf(s, ",echo_interval=%lu", + tcon->ses->server->echo_interval / HZ); /* convert actimeo and display it in seconds */ seq_printf(s, ",actimeo=%lu", cifs_sb->actimeo / HZ); @@ -640,9 +642,9 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) while (*s && *s != sep) s++; - mutex_lock(&dir->i_mutex); + inode_lock(dir); child = lookup_one_len(p, dentry, s - p); - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); dput(dentry); dentry = child; } while (!IS_ERR(dentry)); @@ -752,6 +754,9 @@ cifs_loose_read_iter(struct kiocb *iocb, struct iov_iter *iter) ssize_t rc; struct inode *inode = file_inode(iocb->ki_filp); + if (iocb->ki_filp->f_flags & O_DIRECT) + return cifs_user_readv(iocb, iter); + rc = cifs_revalidate_mapping(inode); if (rc) return rc; @@ -766,6 +771,18 @@ static ssize_t cifs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) ssize_t written; int rc; + if (iocb->ki_filp->f_flags & O_DIRECT) { + written = cifs_user_writev(iocb, from); + if (written > 0 && CIFS_CACHE_READ(cinode)) { + cifs_zap_mapping(inode); + cifs_dbg(FYI, + "Set no oplock for inode=%p after a write operation\n", + inode); + cinode->oplock = 0; + } + return written; + } + written = cifs_get_writer(cinode); if (written) return written; diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 2b510c537a0d..a25b2513f146 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -70,8 +70,10 @@ #define SERVER_NAME_LENGTH 40 #define SERVER_NAME_LEN_WITH_NULL (SERVER_NAME_LENGTH + 1) -/* SMB echo "timeout" -- FIXME: tunable? 
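That FIXME is what the next hunks resolve: the echo period becomes a per-mount echo_interval option, bounded to 1..600 seconds with a 60 second default. The clamp applied when the TCP session is created, condensed into one helper (a sketch; the real code inlines this in cifs_get_tcp_session):

#define SMB_ECHO_INTERVAL_MIN		1
#define SMB_ECHO_INTERVAL_MAX		600
#define SMB_ECHO_INTERVAL_DEFAULT	60

static unsigned long echo_interval_jiffies(unsigned int secs, unsigned long hz)
{
	/* out-of-range values, including an unset 0, fall back to the
	 * 60 second default */
	if (secs >= SMB_ECHO_INTERVAL_MIN && secs <= SMB_ECHO_INTERVAL_MAX)
		return secs * hz;
	return SMB_ECHO_INTERVAL_DEFAULT * hz;
}

Note that match_server() below also compares the interval, so mounts requesting different echo_interval values will not share a TCP session.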
*/ -#define SMB_ECHO_INTERVAL (60 * HZ) +/* echo interval in seconds */ +#define SMB_ECHO_INTERVAL_MIN 1 +#define SMB_ECHO_INTERVAL_MAX 600 +#define SMB_ECHO_INTERVAL_DEFAULT 60 #include "cifspdu.h" @@ -225,7 +227,7 @@ struct smb_version_operations { void (*print_stats)(struct seq_file *m, struct cifs_tcon *); void (*dump_share_caps)(struct seq_file *, struct cifs_tcon *); /* verify the message */ - int (*check_message)(char *, unsigned int); + int (*check_message)(char *, unsigned int, struct TCP_Server_Info *); bool (*is_oplock_break)(char *, struct TCP_Server_Info *); void (*downgrade_oplock)(struct TCP_Server_Info *, struct cifsInodeInfo *, bool); @@ -507,6 +509,7 @@ struct smb_vol { struct sockaddr_storage dstaddr; /* destination address */ struct sockaddr_storage srcaddr; /* allow binding to a local IP */ struct nls_table *local_nls; + unsigned int echo_interval; /* echo interval in secs */ }; #define CIFS_MOUNT_MASK (CIFS_MOUNT_NO_PERM | CIFS_MOUNT_SET_UID | \ @@ -627,7 +630,9 @@ struct TCP_Server_Info { #ifdef CONFIG_CIFS_SMB2 unsigned int max_read; unsigned int max_write; + __u8 preauth_hash[512]; #endif /* CONFIG_CIFS_SMB2 */ + unsigned long echo_interval; }; static inline unsigned int @@ -809,7 +814,10 @@ struct cifs_ses { bool need_reconnect:1; /* connection reset, uid now invalid */ #ifdef CONFIG_CIFS_SMB2 __u16 session_flags; - char smb3signingkey[SMB3_SIGN_KEY_SIZE]; /* for signing smb3 packets */ + __u8 smb3signingkey[SMB3_SIGN_KEY_SIZE]; + __u8 smb3encryptionkey[SMB3_SIGN_KEY_SIZE]; + __u8 smb3decryptionkey[SMB3_SIGN_KEY_SIZE]; + __u8 preauth_hash[512]; #endif /* CONFIG_CIFS_SMB2 */ }; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index c63fd1dde25b..eed7ff50faf0 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -102,7 +102,7 @@ extern int SendReceiveBlockingLock(const unsigned int xid, struct smb_hdr *out_buf, int *bytes_returned); extern int cifs_reconnect(struct TCP_Server_Info *server); -extern int checkSMB(char *buf, unsigned int length); +extern int checkSMB(char *buf, unsigned int len, struct TCP_Server_Info *srvr); extern bool is_valid_oplock_break(char *, struct TCP_Server_Info *); extern bool backup_cred(struct cifs_sb_info *); extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof); @@ -439,7 +439,8 @@ extern int setup_ntlm_response(struct cifs_ses *, const struct nls_table *); extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *); extern void cifs_crypto_shash_release(struct TCP_Server_Info *); extern int calc_seckey(struct cifs_ses *); -extern int generate_smb3signingkey(struct cifs_ses *); +extern int generate_smb30signingkey(struct cifs_ses *); +extern int generate_smb311signingkey(struct cifs_ses *); #ifdef CONFIG_CIFS_WEAK_PW_HASH extern int calc_lanman_hash(const char *password, const char *cryptkey, diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index ecb0803bdb0e..4fbd92d2e113 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -95,6 +95,7 @@ enum { Opt_cruid, Opt_gid, Opt_file_mode, Opt_dirmode, Opt_port, Opt_rsize, Opt_wsize, Opt_actimeo, + Opt_echo_interval, /* Mount options which take string value */ Opt_user, Opt_pass, Opt_ip, @@ -188,6 +189,7 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_rsize, "rsize=%s" }, { Opt_wsize, "wsize=%s" }, { Opt_actimeo, "actimeo=%s" }, + { Opt_echo_interval, "echo_interval=%s" }, { Opt_blank_user, "user=" }, { Opt_blank_user, "username=" }, @@ -368,7 +370,6 @@ cifs_reconnect(struct TCP_Server_Info *server) server->session_key.response = 
NULL; server->session_key.len = 0; server->lstrp = jiffies; - mutex_unlock(&server->srv_mutex); /* mark submitted MIDs for retry and issue callback */ INIT_LIST_HEAD(&retry_list); @@ -381,6 +382,7 @@ cifs_reconnect(struct TCP_Server_Info *server) list_move(&mid_entry->qhead, &retry_list); } spin_unlock(&GlobalMid_Lock); + mutex_unlock(&server->srv_mutex); cifs_dbg(FYI, "%s: issuing mid callbacks\n", __func__); list_for_each_safe(tmp, tmp2, &retry_list) { @@ -418,6 +420,7 @@ cifs_echo_request(struct work_struct *work) int rc; struct TCP_Server_Info *server = container_of(work, struct TCP_Server_Info, echo.work); + unsigned long echo_interval = server->echo_interval; /* * We cannot send an echo if it is disabled or until the @@ -427,7 +430,7 @@ cifs_echo_request(struct work_struct *work) */ if (!server->ops->need_neg || server->ops->need_neg(server) || (server->ops->can_echo && !server->ops->can_echo(server)) || - time_before(jiffies, server->lstrp + SMB_ECHO_INTERVAL - HZ)) + time_before(jiffies, server->lstrp + echo_interval - HZ)) goto requeue_echo; rc = server->ops->echo ? server->ops->echo(server) : -ENOSYS; @@ -436,7 +439,7 @@ cifs_echo_request(struct work_struct *work) server->hostname); requeue_echo: - queue_delayed_work(cifsiod_wq, &server->echo, SMB_ECHO_INTERVAL); + queue_delayed_work(cifsiod_wq, &server->echo, echo_interval); } static bool @@ -487,9 +490,9 @@ server_unresponsive(struct TCP_Server_Info *server) * a response in >60s. */ if (server->tcpStatus == CifsGood && - time_after(jiffies, server->lstrp + 2 * SMB_ECHO_INTERVAL)) { - cifs_dbg(VFS, "Server %s has not responded in %d seconds. Reconnecting...\n", - server->hostname, (2 * SMB_ECHO_INTERVAL) / HZ); + time_after(jiffies, server->lstrp + 2 * server->echo_interval)) { + cifs_dbg(VFS, "Server %s has not responded in %lu seconds. Reconnecting...\n", + server->hostname, (2 * server->echo_interval) / HZ); cifs_reconnect(server); wake_up(&server->response_q); return true; @@ -828,7 +831,7 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid) * 48 bytes is enough to display the header and a little bit * into the payload for debugging purposes. */ - length = server->ops->check_message(buf, server->total_read); + length = server->ops->check_message(buf, server->total_read, server); if (length != 0) cifs_dump_mem("Bad SMB: ", buf, min_t(unsigned int, server->total_read, 48)); @@ -1624,6 +1627,14 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, goto cifs_parse_mount_err; } break; + case Opt_echo_interval: + if (get_option_ul(args, &option)) { + cifs_dbg(VFS, "%s: Invalid echo interval value\n", + __func__); + goto cifs_parse_mount_err; + } + vol->echo_interval = option; + break; /* String Arguments */ @@ -2089,6 +2100,9 @@ static int match_server(struct TCP_Server_Info *server, struct smb_vol *vol) if (!match_security(server, vol)) return 0; + if (server->echo_interval != vol->echo_interval) + return 0; + return 1; } @@ -2208,6 +2222,12 @@ cifs_get_tcp_session(struct smb_vol *volume_info) tcp_ses->tcpStatus = CifsNew; ++tcp_ses->srv_count; + if (volume_info->echo_interval >= SMB_ECHO_INTERVAL_MIN && + volume_info->echo_interval <= SMB_ECHO_INTERVAL_MAX) + tcp_ses->echo_interval = volume_info->echo_interval * HZ; + else + tcp_ses->echo_interval = SMB_ECHO_INTERVAL_DEFAULT * HZ; + rc = ip_connect(tcp_ses); if (rc < 0) { cifs_dbg(VFS, "Error connecting to socket. 
Aborting operation.\n"); @@ -2237,7 +2257,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info) cifs_fscache_get_client_cookie(tcp_ses); /* queue echo request delayed work */ - queue_delayed_work(cifsiod_wq, &tcp_ses->echo, SMB_ECHO_INTERVAL); + queue_delayed_work(cifsiod_wq, &tcp_ses->echo, tcp_ses->echo_interval); return tcp_ses; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 0a2752b79e72..ff882aeaccc6 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2267,7 +2267,7 @@ int cifs_strict_fsync(struct file *file, loff_t start, loff_t end, rc = filemap_write_and_wait_range(inode->i_mapping, start, end); if (rc) return rc; - mutex_lock(&inode->i_mutex); + inode_lock(inode); xid = get_xid(); @@ -2292,7 +2292,7 @@ int cifs_strict_fsync(struct file *file, loff_t start, loff_t end, } free_xid(xid); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return rc; } @@ -2309,7 +2309,7 @@ int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) rc = filemap_write_and_wait_range(inode->i_mapping, start, end); if (rc) return rc; - mutex_lock(&inode->i_mutex); + inode_lock(inode); xid = get_xid(); @@ -2326,7 +2326,7 @@ int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) } free_xid(xid); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return rc; } @@ -2672,7 +2672,7 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from) * with a brlock that prevents writing. */ down_read(&cinode->lock_sem); - mutex_lock(&inode->i_mutex); + inode_lock(inode); rc = generic_write_checks(iocb, from); if (rc <= 0) @@ -2685,7 +2685,7 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from) else rc = -EACCES; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (rc > 0) { ssize_t err = generic_write_sync(file, iocb->ki_pos - rc, rc); diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index a329f5ba35aa..aeb26dbfa1bf 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -814,8 +814,21 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, } } else fattr.cf_uniqueid = iunique(sb, ROOT_I); - } else - fattr.cf_uniqueid = CIFS_I(*inode)->uniqueid; + } else { + if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) && + validinum == false && server->ops->get_srv_inum) { + /* + * Pass a NULL tcon to ensure we don't make a round + * trip to the server. This only works for SMB2+. 
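Two related hardening changes land in cifs_get_inode_info here: with server inode numbers enabled, the unique id is re-queried from the server, and the already-have-inode path just below treats a mismatch as a stale inode. The mismatch test, reduced to a toy predicate (names illustrative):

#include <errno.h>

static int inode_still_valid(int server_inum_enabled,
			     unsigned long long cached_uniqueid,
			     unsigned long long fresh_uniqueid)
{
	/* a changed server-side unique id means the cached inode no
	 * longer describes the same on-server object */
	if (server_inum_enabled && cached_uniqueid != fresh_uniqueid)
		return -ESTALE;
	return 0;
}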
+ */ + tmprc = server->ops->get_srv_inum(xid, + NULL, cifs_sb, full_path, + &fattr.cf_uniqueid, data); + if (tmprc) + fattr.cf_uniqueid = CIFS_I(*inode)->uniqueid; + } else + fattr.cf_uniqueid = CIFS_I(*inode)->uniqueid; + } /* query for SFU type info if supported and needed */ if (fattr.cf_cifsattrs & ATTR_SYSTEM && @@ -856,6 +869,13 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, } else { /* we already have inode, update it */ + /* if uniqueid is different, return error */ + if (unlikely(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM && + CIFS_I(*inode)->uniqueid != fattr.cf_uniqueid)) { + rc = -ESTALE; + goto cgii_exit; + } + /* if filetype is different, return error */ if (unlikely(((*inode)->i_mode & S_IFMT) != (fattr.cf_mode & S_IFMT))) { diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 8442b8b8e0be..813fe13c2ae1 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -310,7 +310,7 @@ check_smb_hdr(struct smb_hdr *smb) } int -checkSMB(char *buf, unsigned int total_read) +checkSMB(char *buf, unsigned int total_read, struct TCP_Server_Info *server) { struct smb_hdr *smb = (struct smb_hdr *)buf; __u32 rfclen = be32_to_cpu(smb->smb_buf_length); diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 0557c45e9c33..b30a4a6d98a0 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -847,6 +847,7 @@ int cifs_readdir(struct file *file, struct dir_context *ctx) * if buggy server returns . and .. late do we want to * check for that here? */ + *tmp_buf = 0; rc = cifs_filldir(current_entry, file, ctx, tmp_buf, max_len); if (rc) { diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 1c5907019045..389fb9f8c84e 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -38,7 +38,7 @@ check_smb2_hdr(struct smb2_hdr *hdr, __u64 mid) * Make sure that this really is an SMB, that it is a response, * and that the message ids match. 
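The ProtocolId rework below replaces the four-byte array with a single __le32: SMB2_PROTO_NUMBER is cpu_to_le32(0x424d53fe), which lays down exactly the old 0xFE 'S' 'M' 'B' sequence on the wire. A quick user-space check (assumes a little-endian host, as the comment notes):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
	uint32_t proto = 0x424d53feU;	/* raw value behind SMB2_PROTO_NUMBER */
	unsigned char wire[4];

	memcpy(wire, &proto, sizeof(wire));	/* little-endian host assumed */
	printf("%02x '%c' '%c' '%c'\n", wire[0], wire[1], wire[2], wire[3]);
	/* prints: fe 'S' 'M' 'B' */
	return 0;
}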
*/ - if ((*(__le32 *)hdr->ProtocolId == SMB2_PROTO_NUMBER) && + if ((hdr->ProtocolId == SMB2_PROTO_NUMBER) && (mid == wire_mid)) { if (hdr->Flags & SMB2_FLAGS_SERVER_TO_REDIR) return 0; @@ -50,9 +50,9 @@ check_smb2_hdr(struct smb2_hdr *hdr, __u64 mid) cifs_dbg(VFS, "Received Request not response\n"); } } else { /* bad signature or mid */ - if (*(__le32 *)hdr->ProtocolId != SMB2_PROTO_NUMBER) + if (hdr->ProtocolId != SMB2_PROTO_NUMBER) cifs_dbg(VFS, "Bad protocol string signature header %x\n", - *(unsigned int *) hdr->ProtocolId); + le32_to_cpu(hdr->ProtocolId)); if (mid != wire_mid) cifs_dbg(VFS, "Mids do not match: %llu and %llu\n", mid, wire_mid); @@ -93,11 +93,11 @@ static const __le16 smb2_rsp_struct_sizes[NUMBER_OF_SMB2_COMMANDS] = { }; int -smb2_check_message(char *buf, unsigned int length) +smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *srvr) { struct smb2_hdr *hdr = (struct smb2_hdr *)buf; struct smb2_pdu *pdu = (struct smb2_pdu *)hdr; - __u64 mid = le64_to_cpu(hdr->MessageId); + __u64 mid; __u32 len = get_rfc1002_length(buf); __u32 clc_len; /* calculated length */ int command; @@ -111,6 +111,30 @@ smb2_check_message(char *buf, unsigned int length) * ie Validate the wct via smb2_struct_sizes table above */ + if (hdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM) { + struct smb2_transform_hdr *thdr = + (struct smb2_transform_hdr *)buf; + struct cifs_ses *ses = NULL; + struct list_head *tmp; + + /* decrypt frame now that it is completely read in */ + spin_lock(&cifs_tcp_ses_lock); + list_for_each(tmp, &srvr->smb_ses_list) { + ses = list_entry(tmp, struct cifs_ses, smb_ses_list); + if (ses->Suid == thdr->SessionId) + break; + + ses = NULL; + } + spin_unlock(&cifs_tcp_ses_lock); + if (ses == NULL) { + cifs_dbg(VFS, "no decryption - session id not found\n"); + return 1; + } + } + + + mid = le64_to_cpu(hdr->MessageId); if (length < sizeof(struct smb2_pdu)) { if ((length >= sizeof(struct smb2_hdr)) && (hdr->Status != 0)) { pdu->StructureSize2 = 0; @@ -322,7 +346,7 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr) /* return pointer to beginning of data area, ie offset from SMB start */ if ((*off != 0) && (*len != 0)) - return (char *)(&hdr->ProtocolId[0]) + *off; + return (char *)(&hdr->ProtocolId) + *off; else return NULL; } diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 53ccdde6ff18..3525ed756173 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -182,6 +182,11 @@ smb2_find_mid(struct TCP_Server_Info *server, char *buf) struct smb2_hdr *hdr = (struct smb2_hdr *)buf; __u64 wire_mid = le64_to_cpu(hdr->MessageId); + if (hdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM) { + cifs_dbg(VFS, "encrypted frame parsing not supported yet"); + return NULL; + } + spin_lock(&GlobalMid_Lock); list_for_each_entry(mid, &server->pending_mid_q, qhead) { if ((mid->mid == wire_mid) && @@ -1692,7 +1697,7 @@ struct smb_version_operations smb30_operations = { .get_lease_key = smb2_get_lease_key, .set_lease_key = smb2_set_lease_key, .new_lease_key = smb2_new_lease_key, - .generate_signingkey = generate_smb3signingkey, + .generate_signingkey = generate_smb30signingkey, .calc_signature = smb3_calc_signature, .set_integrity = smb3_set_integrity, .is_read_op = smb21_is_read_op, @@ -1779,7 +1784,7 @@ struct smb_version_operations smb311_operations = { .get_lease_key = smb2_get_lease_key, .set_lease_key = smb2_set_lease_key, .new_lease_key = smb2_new_lease_key, - .generate_signingkey = generate_smb3signingkey, + .generate_signingkey = generate_smb311signingkey, 
.calc_signature = smb3_calc_signature, .set_integrity = smb3_set_integrity, .is_read_op = smb21_is_read_op, @@ -1838,7 +1843,7 @@ struct smb_version_values smb21_values = { struct smb_version_values smb30_values = { .version_string = SMB30_VERSION_STRING, .protocol_id = SMB30_PROT_ID, - .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES, + .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION, .large_lock_type = 0, .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, @@ -1858,7 +1863,7 @@ struct smb_version_values smb30_values = { struct smb_version_values smb302_values = { .version_string = SMB302_VERSION_STRING, .protocol_id = SMB302_PROT_ID, - .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES, + .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION, .large_lock_type = 0, .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 767555518d40..10f8d5cf5681 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -97,10 +97,7 @@ smb2_hdr_assemble(struct smb2_hdr *hdr, __le16 smb2_cmd /* command */ , hdr->smb2_buf_length = cpu_to_be32(parmsize + sizeof(struct smb2_hdr) - 4 /* RFC 1001 length field itself not counted */); - hdr->ProtocolId[0] = 0xFE; - hdr->ProtocolId[1] = 'S'; - hdr->ProtocolId[2] = 'M'; - hdr->ProtocolId[3] = 'B'; + hdr->ProtocolId = SMB2_PROTO_NUMBER; hdr->StructureSize = cpu_to_le16(64); hdr->Command = smb2_cmd; hdr->CreditRequest = cpu_to_le16(2); /* BB make this dynamic */ @@ -1573,7 +1570,8 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, goto ioctl_exit; } - memcpy(*out_data, rsp->hdr.ProtocolId + le32_to_cpu(rsp->OutputOffset), + memcpy(*out_data, + (char *)&rsp->hdr.ProtocolId + le32_to_cpu(rsp->OutputOffset), *plen); ioctl_exit: free_rsp_buf(resp_buftype, rsp); @@ -2093,7 +2091,7 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, } if (*buf) { - memcpy(*buf, (char *)rsp->hdr.ProtocolId + rsp->DataOffset, + memcpy(*buf, (char *)&rsp->hdr.ProtocolId + rsp->DataOffset, *nbytes); free_rsp_buf(resp_buftype, iov[0].iov_base); } else if (resp_buftype != CIFS_NO_BUFFER) { diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 4af52780ec35..ff88d9feb01e 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -86,6 +86,7 @@ #define MAX_SMB2_HDR_SIZE 0x78 /* 4 len + 64 hdr + (2*24 wct) + 2 bct + 2 pad */ #define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe) +#define SMB2_TRANSFORM_PROTO_NUM cpu_to_le32(0x424d53fd) /* * SMB2 Header Definition @@ -102,7 +103,7 @@ struct smb2_hdr { __be32 smb2_buf_length; /* big endian on wire */ /* length is only two or three bytes - with one or two byte type preceding it that MBZ */ - __u8 ProtocolId[4]; /* 0xFE 'S' 'M' 'B' */ + __le32 ProtocolId; /* 0xFE 'S' 'M' 'B' */ __le16 StructureSize; /* 64 */ __le16 CreditCharge; /* MBZ */ __le32 Status; /* Error from server */ @@ -128,11 +129,10 @@ struct smb2_transform_hdr { one or two byte type preceding it that MBZ */ __u8 ProtocolId[4]; /* 0xFD 'S' 'M' 'B' */ __u8 Signature[16]; - __u8 Nonce[11]; - __u8 Reserved[5]; + __u8 Nonce[16]; 
__le32 OriginalMessageSize; __u16 Reserved1; - __le16 EncryptionAlgorithm; + __le16 Flags; /* EncryptionAlgorithm */ __u64 SessionId; } __packed; diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 79dc650c18b2..4f07dc93608d 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -34,7 +34,8 @@ struct smb_rqst; ***************************************************************** */ extern int map_smb2_to_linux_error(char *buf, bool log_err); -extern int smb2_check_message(char *buf, unsigned int length); +extern int smb2_check_message(char *buf, unsigned int length, + struct TCP_Server_Info *server); extern unsigned int smb2_calc_size(void *buf); extern char *smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr); extern __le16 *cifs_convert_path_to_utf16(const char *from, diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index d4c5b6f109a7..8732a43b1008 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -222,8 +222,8 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) return rc; } -int -generate_smb3signingkey(struct cifs_ses *ses) +static int generate_key(struct cifs_ses *ses, struct kvec label, + struct kvec context, __u8 *key, unsigned int key_size) { unsigned char zero = 0x0; __u8 i[4] = {0, 0, 0, 1}; @@ -233,7 +233,7 @@ generate_smb3signingkey(struct cifs_ses *ses) unsigned char *hashptr = prfhash; memset(prfhash, 0x0, SMB2_HMACSHA256_SIZE); - memset(ses->smb3signingkey, 0x0, SMB3_SIGNKEY_SIZE); + memset(key, 0x0, key_size); rc = smb3_crypto_shash_allocate(ses->server); if (rc) { @@ -262,7 +262,7 @@ generate_smb3signingkey(struct cifs_ses *ses) } rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash, - "SMB2AESCMAC", 12); + label.iov_base, label.iov_len); if (rc) { cifs_dbg(VFS, "%s: Could not update with label\n", __func__); goto smb3signkey_ret; @@ -276,7 +276,7 @@ generate_smb3signingkey(struct cifs_ses *ses) } rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash, - "SmbSign", 8); + context.iov_base, context.iov_len); if (rc) { cifs_dbg(VFS, "%s: Could not update with context\n", __func__); goto smb3signkey_ret; @@ -296,12 +296,102 @@ generate_smb3signingkey(struct cifs_ses *ses) goto smb3signkey_ret; } - memcpy(ses->smb3signingkey, hashptr, SMB3_SIGNKEY_SIZE); + memcpy(key, hashptr, key_size); smb3signkey_ret: return rc; } +struct derivation { + struct kvec label; + struct kvec context; +}; + +struct derivation_triplet { + struct derivation signing; + struct derivation encryption; + struct derivation decryption; +}; + +static int +generate_smb3signingkey(struct cifs_ses *ses, + const struct derivation_triplet *ptriplet) +{ + int rc; + + rc = generate_key(ses, ptriplet->signing.label, + ptriplet->signing.context, ses->smb3signingkey, + SMB3_SIGN_KEY_SIZE); + if (rc) + return rc; + + rc = generate_key(ses, ptriplet->encryption.label, + ptriplet->encryption.context, ses->smb3encryptionkey, + SMB3_SIGN_KEY_SIZE); + if (rc) + return rc; + + return generate_key(ses, ptriplet->decryption.label, + ptriplet->decryption.context, + ses->smb3decryptionkey, SMB3_SIGN_KEY_SIZE); +} + +int +generate_smb30signingkey(struct cifs_ses *ses) + +{ + struct derivation_triplet triplet; + struct derivation *d; + + d = &triplet.signing; + d->label.iov_base = "SMB2AESCMAC"; + d->label.iov_len = 12; + d->context.iov_base = "SmbSign"; + d->context.iov_len = 8; + + d = &triplet.encryption; + d->label.iov_base = "SMB2AESCCM"; + d->label.iov_len = 11; + d->context.iov_base = "ServerIn "; + 
d->context.iov_len = 10; + + d = &triplet.decryption; + d->label.iov_base = "SMB2AESCCM"; + d->label.iov_len = 11; + d->context.iov_base = "ServerOut"; + d->context.iov_len = 10; + + return generate_smb3signingkey(ses, &triplet); +} + +int +generate_smb311signingkey(struct cifs_ses *ses) + +{ + struct derivation_triplet triplet; + struct derivation *d; + + d = &triplet.signing; + d->label.iov_base = "SMB2AESCMAC"; + d->label.iov_len = 12; + d->context.iov_base = "SmbSign"; + d->context.iov_len = 8; + + d = &triplet.encryption; + d->label.iov_base = "SMB2AESCCM"; + d->label.iov_len = 11; + d->context.iov_base = "ServerIn "; + d->context.iov_len = 10; + + d = &triplet.decryption; + d->label.iov_base = "SMB2AESCCM"; + d->label.iov_len = 11; + d->context.iov_base = "ServerOut"; + d->context.iov_len = 10; + + return generate_smb3signingkey(ses, &triplet); +} + int smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) { diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 2a24c524fb9a..87abe8ed074c 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -576,14 +576,16 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst, cifs_in_send_dec(server); cifs_save_when_sent(mid); - if (rc < 0) + if (rc < 0) { server->sequence_number -= 2; + cifs_delete_mid(mid); + } + mutex_unlock(&server->srv_mutex); if (rc == 0) return 0; - cifs_delete_mid(mid); add_credits_and_wake_if(server, credits, optype); return rc; } diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h index f829fe963f5b..5104d84c4f64 100644 --- a/fs/coda/coda_linux.h +++ b/fs/coda/coda_linux.h @@ -72,8 +72,7 @@ void coda_sysctl_clean(void); } while (0) -#define CODA_FREE(ptr,size) \ - do { if (size < PAGE_SIZE) kfree((ptr)); else vfree((ptr)); } while (0) +#define CODA_FREE(ptr, size) kvfree((ptr)) /* inode to cnode access functions */ diff --git a/fs/coda/dir.c b/fs/coda/dir.c index fda9f4311212..42e731b8c80a 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -427,13 +427,13 @@ static int coda_readdir(struct file *coda_file, struct dir_context *ctx) if (host_file->f_op->iterate) { struct inode *host_inode = file_inode(host_file); - mutex_lock(&host_inode->i_mutex); + inode_lock(host_inode); ret = -ENOENT; if (!IS_DEADDIR(host_inode)) { ret = host_file->f_op->iterate(host_file, ctx); file_accessed(host_file); } - mutex_unlock(&host_inode->i_mutex); + inode_unlock(host_inode); return ret; } /* Venus: we must read Venus dirents from a file */ diff --git a/fs/coda/file.c b/fs/coda/file.c index 1da3805f3ddc..f47c7483863b 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -71,12 +71,12 @@ coda_file_write_iter(struct kiocb *iocb, struct iov_iter *to) host_file = cfi->cfi_container; file_start_write(host_file); - mutex_lock(&coda_inode->i_mutex); + inode_lock(coda_inode); ret = vfs_iter_write(cfi->cfi_container, to, &iocb->ki_pos); coda_inode->i_size = file_inode(host_file)->i_size; coda_inode->i_blocks = (coda_inode->i_size + 511) >> 9; coda_inode->i_mtime = coda_inode->i_ctime = CURRENT_TIME_SEC; - mutex_unlock(&coda_inode->i_mutex); + inode_unlock(coda_inode); file_end_write(host_file); return ret; } @@ -203,7 +203,7 @@ int coda_fsync(struct file *coda_file, loff_t start, loff_t end, int datasync) err = filemap_write_and_wait_range(coda_inode->i_mapping, start, end); if (err) return err; - mutex_lock(&coda_inode->i_mutex); + inode_lock(coda_inode); cfi = CODA_FTOC(coda_file); BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); @@ -212,7 +212,7 @@ int coda_fsync(struct file *coda_file, loff_t 
start, loff_t end, int datasync) err = vfs_fsync(host_file, datasync); if (!err && !datasync) err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode)); - mutex_unlock(&coda_inode->i_mutex); + inode_unlock(coda_inode); return err; } diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index cab612b2ae76..f419519ec41f 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -640,13 +640,13 @@ static void detach_groups(struct config_group *group) child = sd->s_dentry; - mutex_lock(&d_inode(child)->i_mutex); + inode_lock(d_inode(child)); configfs_detach_group(sd->s_element); d_inode(child)->i_flags |= S_DEAD; dont_mount(child); - mutex_unlock(&d_inode(child)->i_mutex); + inode_unlock(d_inode(child)); d_delete(child); dput(child); @@ -834,11 +834,11 @@ static int configfs_attach_item(struct config_item *parent_item, * the VFS may already have hit and used them. Thus, * we must lock them as rmdir() would. */ - mutex_lock(&d_inode(dentry)->i_mutex); + inode_lock(d_inode(dentry)); configfs_remove_dir(item); d_inode(dentry)->i_flags |= S_DEAD; dont_mount(dentry); - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); d_delete(dentry); } } @@ -874,7 +874,7 @@ static int configfs_attach_group(struct config_item *parent_item, * We must also lock the inode to remove it safely in case of * error, as rmdir() would. */ - mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD); configfs_adjust_dir_dirent_depth_before_populate(sd); ret = populate_groups(to_config_group(item)); if (ret) { @@ -883,7 +883,7 @@ static int configfs_attach_group(struct config_item *parent_item, dont_mount(dentry); } configfs_adjust_dir_dirent_depth_after_populate(sd); - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); if (ret) d_delete(dentry); } @@ -1135,7 +1135,7 @@ int configfs_depend_item(struct configfs_subsystem *subsys, * subsystem is really registered, and so we need to lock out * configfs_[un]register_subsystem(). */ - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); subsys_sd = configfs_find_subsys_dentry(root->d_fsdata, s_item); if (!subsys_sd) { @@ -1147,7 +1147,7 @@ int configfs_depend_item(struct configfs_subsystem *subsys, ret = configfs_do_depend_item(subsys_sd->s_dentry, target); out_unlock_fs: - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); /* * If we succeeded, the fs is pinned via other methods. 
If not, @@ -1230,7 +1230,7 @@ int configfs_depend_item_unlocked(struct configfs_subsystem *caller_subsys, * additional locking to prevent other subsystem from being * unregistered */ - mutex_lock(&d_inode(root->cg_item.ci_dentry)->i_mutex); + inode_lock(d_inode(root->cg_item.ci_dentry)); /* * As we are trying to depend item from other subsystem @@ -1254,7 +1254,7 @@ out_root_unlock: * We were called from subsystem other than our target so we * took some locks so now it's time to release them */ - mutex_unlock(&d_inode(root->cg_item.ci_dentry)->i_mutex); + inode_unlock(d_inode(root->cg_item.ci_dentry)); return ret; } @@ -1561,7 +1561,7 @@ int configfs_rename_dir(struct config_item * item, const char *new_name) down_write(&configfs_rename_sem); parent = item->parent->dentry; - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); new_dentry = lookup_one_len(new_name, parent, strlen(new_name)); if (!IS_ERR(new_dentry)) { @@ -1577,7 +1577,7 @@ int configfs_rename_dir(struct config_item * item, const char *new_name) error = -EEXIST; dput(new_dentry); } - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); up_write(&configfs_rename_sem); return error; @@ -1590,7 +1590,7 @@ static int configfs_dir_open(struct inode *inode, struct file *file) struct configfs_dirent * parent_sd = dentry->d_fsdata; int err; - mutex_lock(&d_inode(dentry)->i_mutex); + inode_lock(d_inode(dentry)); /* * Fake invisibility if dir belongs to a group/default groups hierarchy * being attached @@ -1603,7 +1603,7 @@ static int configfs_dir_open(struct inode *inode, struct file *file) else err = 0; } - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); return err; } @@ -1613,11 +1613,11 @@ static int configfs_dir_close(struct inode *inode, struct file *file) struct dentry * dentry = file->f_path.dentry; struct configfs_dirent * cursor = file->private_data; - mutex_lock(&d_inode(dentry)->i_mutex); + inode_lock(d_inode(dentry)); spin_lock(&configfs_dirent_lock); list_del_init(&cursor->s_sibling); spin_unlock(&configfs_dirent_lock); - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); release_configfs_dirent(cursor); @@ -1698,7 +1698,7 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence) { struct dentry * dentry = file->f_path.dentry; - mutex_lock(&d_inode(dentry)->i_mutex); + inode_lock(d_inode(dentry)); switch (whence) { case 1: offset += file->f_pos; @@ -1706,7 +1706,7 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence) if (offset >= 0) break; default: - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); return -EINVAL; } if (offset != file->f_pos) { @@ -1732,7 +1732,7 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence) spin_unlock(&configfs_dirent_lock); } } - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); return offset; } @@ -1767,14 +1767,14 @@ int configfs_register_group(struct config_group *parent_group, parent = parent_group->cg_item.ci_dentry; - mutex_lock_nested(&d_inode(parent)->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(d_inode(parent), I_MUTEX_PARENT); ret = create_default_group(parent_group, group); if (!ret) { spin_lock(&configfs_dirent_lock); configfs_dir_set_ready(group->cg_item.ci_dentry->d_fsdata); spin_unlock(&configfs_dirent_lock); } - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); return ret; } EXPORT_SYMBOL(configfs_register_group); @@ -1791,7 +1791,7 @@ void 
configfs_unregister_group(struct config_group *group) struct dentry *dentry = group->cg_item.ci_dentry; struct dentry *parent = group->cg_item.ci_parent->ci_dentry; - mutex_lock_nested(&d_inode(parent)->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(d_inode(parent), I_MUTEX_PARENT); spin_lock(&configfs_dirent_lock); configfs_detach_prep(dentry, NULL); spin_unlock(&configfs_dirent_lock); @@ -1800,7 +1800,7 @@ void configfs_unregister_group(struct config_group *group) d_inode(dentry)->i_flags |= S_DEAD; dont_mount(dentry); d_delete(dentry); - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); dput(dentry); @@ -1872,7 +1872,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys) sd = root->d_fsdata; link_group(to_config_group(sd->s_element), group); - mutex_lock_nested(&d_inode(root)->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(d_inode(root), I_MUTEX_PARENT); err = -ENOMEM; dentry = d_alloc_name(root, group->cg_item.ci_name); @@ -1892,7 +1892,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys) } } - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); if (err) { unlink_group(group); @@ -1913,9 +1913,9 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys) return; } - mutex_lock_nested(&d_inode(root)->i_mutex, + inode_lock_nested(d_inode(root), I_MUTEX_PARENT); - mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD); mutex_lock(&configfs_symlink_mutex); spin_lock(&configfs_dirent_lock); if (configfs_detach_prep(dentry, NULL)) { @@ -1926,11 +1926,11 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys) configfs_detach_group(&group->cg_item); d_inode(dentry)->i_flags |= S_DEAD; dont_mount(dentry); - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); d_delete(dentry); - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); dput(dentry); diff --git a/fs/configfs/file.c b/fs/configfs/file.c index 3687187c8ea5..33b7ee34eda5 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -540,10 +540,10 @@ int configfs_create_file(struct config_item * item, const struct configfs_attrib umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG; int error = 0; - mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_NORMAL); + inode_lock_nested(d_inode(dir), I_MUTEX_NORMAL); error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode, CONFIGFS_ITEM_ATTR); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); return error; } @@ -562,10 +562,10 @@ int configfs_create_bin_file(struct config_item *item, umode_t mode = (bin_attr->cb_attr.ca_mode & S_IALLUGO) | S_IFREG; int error = 0; - mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_NORMAL); + inode_lock_nested(dir->d_inode, I_MUTEX_NORMAL); error = configfs_make_dirent(parent_sd, NULL, (void *) bin_attr, mode, CONFIGFS_ITEM_BIN_ATTR); - mutex_unlock(&dir->d_inode->i_mutex); + inode_unlock(dir->d_inode); return error; } diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 0cc810e9dccc..cee087d8f7e0 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -255,7 +255,7 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name) /* no inode means this hasn't been made visible yet */ return; - mutex_lock(&d_inode(dir)->i_mutex); + inode_lock(d_inode(dir)); list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { if (!sd->s_element) continue; @@ -268,5 +268,5 @@ void configfs_hash_and_remove(struct dentry * dir, 
const char * name) break; } } - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); } diff --git a/fs/dax.c b/fs/dax.c --- a/fs/dax.c +++ b/fs/dax.c @@ -24,6 +24,7 @@ #include <linux/memcontrol.h> #include <linux/mm.h> #include <linux/mutex.h> +#include <linux/pagevec.h> #include <linux/pmem.h> #include <linux/sched.h> #include <linux/uio.h> @@ -245,13 +246,14 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode, loff_t end = pos + iov_iter_count(iter); memset(&bh, 0, sizeof(bh)); + bh.b_bdev = inode->i_sb->s_bdev; if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) { struct address_space *mapping = inode->i_mapping; - mutex_lock(&inode->i_mutex); + inode_lock(inode); retval = filemap_write_and_wait_range(mapping, pos, end - 1); if (retval) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); goto out; } } @@ -263,7 +265,7 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode, retval = dax_io(inode, iter, pos, end, get_block, &bh); if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if ((retval > 0) && end_io) end_io(iocb, pos, retval, bh.b_private); @@ -324,6 +326,199 @@ static int copy_user_bh(struct page *to, struct inode *inode, return 0; } +#define NO_SECTOR -1 +#define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_CACHE_SHIFT)) + +static int dax_radix_entry(struct address_space *mapping, pgoff_t index, + sector_t sector, bool pmd_entry, bool dirty) +{ + struct radix_tree_root *page_tree = &mapping->page_tree; + pgoff_t pmd_index = DAX_PMD_INDEX(index); + int type, error = 0; + void *entry; + + WARN_ON_ONCE(pmd_entry && !dirty); + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + + spin_lock_irq(&mapping->tree_lock); + + entry = radix_tree_lookup(page_tree, pmd_index); + if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) { + index = pmd_index; + goto dirty; + } + + entry = radix_tree_lookup(page_tree, index); + if (entry) { + type = RADIX_DAX_TYPE(entry); + if (WARN_ON_ONCE(type != RADIX_DAX_PTE && + type != RADIX_DAX_PMD)) { + error = -EIO; + goto unlock; + } + + if (!pmd_entry || type == RADIX_DAX_PMD) + goto dirty; + + /* + * We only insert dirty PMD entries into the radix tree. This + * means we don't need to worry about removing a dirty PTE + * entry and inserting a clean PMD entry, thus reducing the + * range we would flush with a follow-up fsync/msync call. + */ + radix_tree_delete(&mapping->page_tree, index); + mapping->nrexceptional--; + } + + if (sector == NO_SECTOR) { + /* + * This can happen during correct operation if our pfn_mkwrite + * fault raced against a hole punch operation. If this + * happens the pte that was hole punched will have been + * unmapped and the radix tree entry will have been removed by + * the time we are called, but the call will still happen. We + * will return all the way up to wp_pfn_shared(), where the + * pte_same() check will fail, eventually causing page fault + * to be retried by the CPU.
+ */ + goto unlock; + } + + error = radix_tree_insert(page_tree, index, + RADIX_DAX_ENTRY(sector, pmd_entry)); + if (error) + goto unlock; + + mapping->nrexceptional++; + dirty: + if (dirty) + radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); + unlock: + spin_unlock_irq(&mapping->tree_lock); + return error; +} + +static int dax_writeback_one(struct block_device *bdev, + struct address_space *mapping, pgoff_t index, void *entry) +{ + struct radix_tree_root *page_tree = &mapping->page_tree; + int type = RADIX_DAX_TYPE(entry); + struct radix_tree_node *node; + struct blk_dax_ctl dax; + void **slot; + int ret = 0; + + spin_lock_irq(&mapping->tree_lock); + /* + * Regular page slots are stabilized by the page lock even + * without the tree itself locked. These unlocked entries + * need verification under the tree lock. + */ + if (!__radix_tree_lookup(page_tree, index, &node, &slot)) + goto unlock; + if (*slot != entry) + goto unlock; + + /* another fsync thread may have already written back this entry */ + if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)) + goto unlock; + + if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) { + ret = -EIO; + goto unlock; + } + + dax.sector = RADIX_DAX_SECTOR(entry); + dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE); + spin_unlock_irq(&mapping->tree_lock); + + /* + * We cannot hold tree_lock while calling dax_map_atomic() because it + * eventually calls cond_resched(). + */ + ret = dax_map_atomic(bdev, &dax); + if (ret < 0) + return ret; + + if (WARN_ON_ONCE(ret < dax.size)) { + ret = -EIO; + goto unmap; + } + + wb_cache_pmem(dax.addr, dax.size); + + spin_lock_irq(&mapping->tree_lock); + radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE); + spin_unlock_irq(&mapping->tree_lock); + unmap: + dax_unmap_atomic(bdev, &dax); + return ret; + + unlock: + spin_unlock_irq(&mapping->tree_lock); + return ret; +} + +/* + * Flush the mapping to the persistent domain within the byte range of [start, + * end]. This is required by data integrity operations to ensure file data is + * on persistent storage prior to completion of the operation. 
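In userspace terms, this range flush is what lets an ordinary msync()/fsync() provide durability for stores made through a DAX mapping. A rough sketch of that contract (illustrative only; the path is hypothetical and error handling is trimmed):

#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/mnt/pmem/data", O_RDWR);  /* hypothetical DAX-mounted file */
        char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                       MAP_SHARED, fd, 0);

        memcpy(p, "hello", 5);          /* store through the mapping */
        msync(p, 4096, MS_SYNC);        /* ends up walking the dirtied range above */
        munmap(p, 4096);
        close(fd);
        return 0;
}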
+ */ +int dax_writeback_mapping_range(struct address_space *mapping, loff_t start, + loff_t end) +{ + struct inode *inode = mapping->host; + struct block_device *bdev = inode->i_sb->s_bdev; + pgoff_t start_index, end_index, pmd_index; + pgoff_t indices[PAGEVEC_SIZE]; + struct pagevec pvec; + bool done = false; + int i, ret = 0; + void *entry; + + if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT)) + return -EIO; + + start_index = start >> PAGE_CACHE_SHIFT; + end_index = end >> PAGE_CACHE_SHIFT; + pmd_index = DAX_PMD_INDEX(start_index); + + rcu_read_lock(); + entry = radix_tree_lookup(&mapping->page_tree, pmd_index); + rcu_read_unlock(); + + /* see if the start of our range is covered by a PMD entry */ + if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) + start_index = pmd_index; + + tag_pages_for_writeback(mapping, start_index, end_index); + + pagevec_init(&pvec, 0); + while (!done) { + pvec.nr = find_get_entries_tag(mapping, start_index, + PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE, + pvec.pages, indices); + + if (pvec.nr == 0) + break; + + for (i = 0; i < pvec.nr; i++) { + if (indices[i] > end_index) { + done = true; + break; + } + + ret = dax_writeback_one(bdev, mapping, indices[i], + pvec.pages[i]); + if (ret < 0) + return ret; + } + } + wmb_pmem(); + return 0; +} +EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); + static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, struct vm_area_struct *vma, struct vm_fault *vmf) { @@ -363,6 +558,11 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, } dax_unmap_atomic(bdev, &dax); + error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false, + vmf->flags & FAULT_FLAG_WRITE); + if (error) + goto out; + error = vm_insert_mixed(vma, vaddr, dax.pfn); out: @@ -408,6 +608,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, memset(&bh, 0, sizeof(bh)); block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits); + bh.b_bdev = inode->i_sb->s_bdev; bh.b_size = PAGE_SIZE; repeat: @@ -487,6 +688,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, delete_from_page_cache(page); unlock_page(page); page_cache_release(page); + page = NULL; } /* @@ -590,7 +792,8 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, struct block_device *bdev; pgoff_t size, pgoff; sector_t block; - int result = 0; + int error, result = 0; + bool alloc = false; /* dax pmd mappings require pfn_t_devmap() */ if (!IS_ENABLED(CONFIG_FS_DAX_PMD)) @@ -624,13 +827,21 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, } memset(&bh, 0, sizeof(bh)); + bh.b_bdev = inode->i_sb->s_bdev; block = (sector_t)pgoff << (PAGE_SHIFT - blkbits); bh.b_size = PMD_SIZE; - if (get_block(inode, block, &bh, write) != 0) + + if (get_block(inode, block, &bh, 0) != 0) return VM_FAULT_SIGBUS; + + if (!buffer_mapped(&bh) && write) { + if (get_block(inode, block, &bh, 1) != 0) + return VM_FAULT_SIGBUS; + alloc = true; + } + bdev = bh.b_bdev; - i_mmap_lock_read(mapping); /* * If the filesystem isn't willing to tell us the length of a hole, @@ -639,19 +850,22 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, */ if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) { dax_pmd_dbg(&bh, address, "allocated block too small"); - goto fallback; + return VM_FAULT_FALLBACK; } /* * If we allocated new storage, make sure no process has any * zero pages covering this hole */ - if (buffer_new(&bh)) { - i_mmap_unlock_read(mapping); - unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 
0); - i_mmap_lock_read(mapping); + if (alloc) { + loff_t lstart = pgoff << PAGE_SHIFT; + loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */ + + truncate_pagecache_range(inode, lstart, lend); } + i_mmap_lock_read(mapping); + /* * If a truncate happened while we were allocating blocks, we may * leave blocks allocated to the file that are beyond EOF. We can't @@ -664,7 +878,8 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, goto out; } if ((pgoff | PG_PMD_COLOUR) >= size) { - dax_pmd_dbg(&bh, address, "pgoff unaligned"); + dax_pmd_dbg(&bh, address, + "offset + huge page size > file size"); goto fallback; } @@ -732,6 +947,31 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, } dax_unmap_atomic(bdev, &dax); + /* + * For PTE faults we insert a radix tree entry for reads, and + * leave it clean. Then on the first write we dirty the radix + * tree entry via the dax_pfn_mkwrite() path. This sequence + * allows the dax_pfn_mkwrite() call to be simpler and avoid a + * call into get_block() to translate the pgoff to a sector in + * order to be able to create a new radix tree entry. + * + * The PMD path doesn't have an equivalent to + * dax_pfn_mkwrite(), though, so for a read followed by a + * write we traverse all the way through __dax_pmd_fault() + * twice. This means we can just skip inserting a radix tree + * entry completely on the initial read and just wait until + * the write to insert a dirty entry. + */ + if (write) { + error = dax_radix_entry(mapping, pgoff, dax.sector, + true, true); + if (error) { + dax_pmd_dbg(&bh, address, + "PMD radix insertion failed"); + goto fallback; + } + } + dev_dbg(part_to_dev(bdev->bd_part), "%s: %s addr: %lx pfn: %lx sect: %llx\n", __func__, current->comm, address, @@ -790,15 +1030,20 @@ EXPORT_SYMBOL_GPL(dax_pmd_fault); * dax_pfn_mkwrite - handle first write to DAX page * @vma: The virtual memory area where the fault occurred * @vmf: The description of the fault - * */ int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { - struct super_block *sb = file_inode(vma->vm_file)->i_sb; + struct file *file = vma->vm_file; - sb_start_pagefault(sb); - file_update_time(vma->vm_file); - sb_end_pagefault(sb); + /* + * We pass NO_SECTOR to dax_radix_entry() because we expect that a + * RADIX_DAX_PTE entry already exists in the radix tree from a + * previous call to __dax_fault(). We just want to look up that PTE + * entry using vmf->pgoff and make sure the dirty tag is set. This + * saves us from having to make a call to get_block() here to look + * up the sector. 
+ */ + dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false, true); return VM_FAULT_NOPAGE; } EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); @@ -835,6 +1080,7 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length, BUG_ON((offset + length) > PAGE_CACHE_SIZE); memset(&bh, 0, sizeof(bh)); + bh.b_bdev = inode->i_sb->s_bdev; bh.b_size = PAGE_CACHE_SIZE; err = get_block(inode, index, &bh, 0); if (err < 0) diff --git a/fs/dcache.c b/fs/dcache.c index b4539e84e577..92d5140de851 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2462,7 +2462,7 @@ EXPORT_SYMBOL(d_rehash); */ void dentry_update_name_case(struct dentry *dentry, struct qstr *name) { - BUG_ON(!mutex_is_locked(&dentry->d_parent->d_inode->i_mutex)); + BUG_ON(!inode_is_locked(dentry->d_parent->d_inode)); BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */ spin_lock(&dentry->d_lock); @@ -2738,7 +2738,7 @@ static int __d_unalias(struct inode *inode, if (!mutex_trylock(&dentry->d_sb->s_vfs_rename_mutex)) goto out_err; m1 = &dentry->d_sb->s_vfs_rename_mutex; - if (!mutex_trylock(&alias->d_parent->d_inode->i_mutex)) + if (!inode_trylock(alias->d_parent->d_inode)) goto out_err; m2 = &alias->d_parent->d_inode->i_mutex; out_unalias: diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index b7fcc0de0b2f..bece948b363d 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -265,7 +265,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) if (!parent) parent = debugfs_mount->mnt_root; - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); dentry = lookup_one_len(name, parent, strlen(name)); if (!IS_ERR(dentry) && d_really_is_positive(dentry)) { dput(dentry); @@ -273,7 +273,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) } if (IS_ERR(dentry)) { - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); simple_release_fs(&debugfs_mount, &debugfs_mount_count); } @@ -282,7 +282,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) static struct dentry *failed_creating(struct dentry *dentry) { - mutex_unlock(&d_inode(dentry->d_parent)->i_mutex); + inode_unlock(d_inode(dentry->d_parent)); dput(dentry); simple_release_fs(&debugfs_mount, &debugfs_mount_count); return NULL; @@ -290,7 +290,7 @@ static struct dentry *failed_creating(struct dentry *dentry) static struct dentry *end_creating(struct dentry *dentry) { - mutex_unlock(&d_inode(dentry->d_parent)->i_mutex); + inode_unlock(d_inode(dentry->d_parent)); return dentry; } @@ -560,9 +560,9 @@ void debugfs_remove(struct dentry *dentry) if (!parent || d_really_is_negative(parent)) return; - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); ret = __debugfs_remove(dentry, parent); - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); if (!ret) simple_release_fs(&debugfs_mount, &debugfs_mount_count); } @@ -594,7 +594,7 @@ void debugfs_remove_recursive(struct dentry *dentry) parent = dentry; down: - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); loop: /* * The parent->d_subdirs is protected by the d_lock. 
Outside that @@ -609,7 +609,7 @@ void debugfs_remove_recursive(struct dentry *dentry) /* perhaps simple_empty(child) makes more sense */ if (!list_empty(&child->d_subdirs)) { spin_unlock(&parent->d_lock); - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); parent = child; goto down; } @@ -630,10 +630,10 @@ void debugfs_remove_recursive(struct dentry *dentry) } spin_unlock(&parent->d_lock); - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); child = parent; parent = parent->d_parent; - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); if (child != dentry) /* go up */ @@ -641,7 +641,7 @@ void debugfs_remove_recursive(struct dentry *dentry) if (!__debugfs_remove(child, parent)) simple_release_fs(&debugfs_mount, &debugfs_mount_count); - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); } EXPORT_SYMBOL_GPL(debugfs_remove_recursive); diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index c35ffdc12bba..1f107fd51328 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -255,7 +255,7 @@ static int mknod_ptmx(struct super_block *sb) if (!uid_valid(root_uid) || !gid_valid(root_gid)) return -EINVAL; - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); /* If we have already created ptmx node, return */ if (fsi->ptmx_dentry) { @@ -292,7 +292,7 @@ static int mknod_ptmx(struct super_block *sb) fsi->ptmx_dentry = dentry; rc = 0; out: - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); return rc; } @@ -615,7 +615,7 @@ struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index, sprintf(s, "%d", index); - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); dentry = d_alloc_name(root, s); if (dentry) { @@ -626,7 +626,7 @@ struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index, inode = ERR_PTR(-ENOMEM); } - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); return inode; } @@ -671,7 +671,7 @@ void devpts_pty_kill(struct inode *inode) BUG_ON(inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR)); - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); dentry = d_find_alias(inode); @@ -680,7 +680,7 @@ void devpts_pty_kill(struct inode *inode) dput(dentry); /* d_alloc_name() in devpts_pty_new() */ dput(dentry); /* d_find_alias above */ - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); } static int __init init_devpts_fs(void) diff --git a/fs/direct-io.c b/fs/direct-io.c index 602e8441bc0f..1b2f7ffc8b84 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1157,12 +1157,12 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, iocb->ki_filp->f_mapping; /* will be released by direct_io_worker */ - mutex_lock(&inode->i_mutex); + inode_lock(inode); retval = filemap_write_and_wait_range(mapping, offset, end - 1); if (retval) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); kmem_cache_free(dio_cache, dio); goto out; } @@ -1173,7 +1173,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, dio->i_size = i_size_read(inode); if (iov_iter_rw(iter) == READ && offset >= dio->i_size) { if (dio->flags & DIO_LOCKING) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); kmem_cache_free(dio_cache, dio); retval = 0; goto out; @@ -1295,7 +1295,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, * of protecting us from looking up uninitialized blocks. 
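Most of the churn in this diff is the mechanical conversion from mutex_lock(&inode->i_mutex) to inode_lock(inode) and friends. At this stage the wrappers are thin aliases over the same mutex, roughly as below (a sketch for reference, mirroring their introduction in include/linux/fs.h):

static inline void inode_lock(struct inode *inode)
{
        mutex_lock(&inode->i_mutex);
}

static inline void inode_unlock(struct inode *inode)
{
        mutex_unlock(&inode->i_mutex);
}

static inline int inode_trylock(struct inode *inode)
{
        return mutex_trylock(&inode->i_mutex);
}

static inline int inode_is_locked(struct inode *inode)
{
        return mutex_is_locked(&inode->i_mutex);
}

static inline void inode_lock_nested(struct inode *inode, unsigned subclass)
{
        mutex_lock_nested(&inode->i_mutex, subclass);
}

The indirection buys the ability to change the lock's implementation later without touching every filesystem again.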
*/ if (iov_iter_rw(iter) == READ && (dio->flags & DIO_LOCKING)) - mutex_unlock(&dio->inode->i_mutex); + inode_unlock(dio->inode); /* * The only time we want to leave bios in flight is when a successful diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 1925d6d222b8..58c2f4a21b7f 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -516,7 +516,7 @@ static ssize_t device_write(struct file *file, const char __user *buf, return -EINVAL; kbuf = memdup_user_nul(buf, count); - if (!IS_ERR(kbuf)) + if (IS_ERR(kbuf)) return PTR_ERR(kbuf); if (check_version(kbuf)) { diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 040aa879d634..4e685ac1024d 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -41,13 +41,13 @@ static struct dentry *lock_parent(struct dentry *dentry) struct dentry *dir; dir = dget_parent(dentry); - mutex_lock_nested(&(d_inode(dir)->i_mutex), I_MUTEX_PARENT); + inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); return dir; } static void unlock_dir(struct dentry *dir) { - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); dput(dir); } @@ -397,11 +397,11 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, int rc = 0; lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent); - mutex_lock(&d_inode(lower_dir_dentry)->i_mutex); + inode_lock(d_inode(lower_dir_dentry)); lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name, lower_dir_dentry, ecryptfs_dentry->d_name.len); - mutex_unlock(&d_inode(lower_dir_dentry)->i_mutex); + inode_unlock(d_inode(lower_dir_dentry)); if (IS_ERR(lower_dentry)) { rc = PTR_ERR(lower_dentry); ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " @@ -426,11 +426,11 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, "filename; rc = [%d]\n", __func__, rc); goto out; } - mutex_lock(&d_inode(lower_dir_dentry)->i_mutex); + inode_lock(d_inode(lower_dir_dentry)); lower_dentry = lookup_one_len(encrypted_and_encoded_name, lower_dir_dentry, encrypted_and_encoded_name_size); - mutex_unlock(&d_inode(lower_dir_dentry)->i_mutex); + inode_unlock(d_inode(lower_dir_dentry)); if (IS_ERR(lower_dentry)) { rc = PTR_ERR(lower_dentry); ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " @@ -869,9 +869,9 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) if (!rc && lower_ia.ia_valid & ATTR_SIZE) { struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); - mutex_lock(&d_inode(lower_dentry)->i_mutex); + inode_lock(d_inode(lower_dentry)); rc = notify_change(lower_dentry, &lower_ia, NULL); - mutex_unlock(&d_inode(lower_dentry)->i_mutex); + inode_unlock(d_inode(lower_dentry)); } return rc; } @@ -970,9 +970,9 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia) if (lower_ia.ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID)) lower_ia.ia_valid &= ~ATTR_MODE; - mutex_lock(&d_inode(lower_dentry)->i_mutex); + inode_lock(d_inode(lower_dentry)); rc = notify_change(lower_dentry, &lower_ia, NULL); - mutex_unlock(&d_inode(lower_dentry)->i_mutex); + inode_unlock(d_inode(lower_dentry)); out: fsstack_copy_attr_all(inode, lower_inode); return rc; @@ -1048,10 +1048,10 @@ ecryptfs_getxattr_lower(struct dentry *lower_dentry, const char *name, rc = -EOPNOTSUPP; goto out; } - mutex_lock(&d_inode(lower_dentry)->i_mutex); + inode_lock(d_inode(lower_dentry)); rc = d_inode(lower_dentry)->i_op->getxattr(lower_dentry, name, value, size); - mutex_unlock(&d_inode(lower_dentry)->i_mutex); + inode_unlock(d_inode(lower_dentry)); out: return rc; } @@ -1075,9 +1075,9 @@ 
ecryptfs_listxattr(struct dentry *dentry, char *list, size_t size) rc = -EOPNOTSUPP; goto out; } - mutex_lock(&d_inode(lower_dentry)->i_mutex); + inode_lock(d_inode(lower_dentry)); rc = d_inode(lower_dentry)->i_op->listxattr(lower_dentry, list, size); - mutex_unlock(&d_inode(lower_dentry)->i_mutex); + inode_unlock(d_inode(lower_dentry)); out: return rc; } @@ -1092,9 +1092,9 @@ static int ecryptfs_removexattr(struct dentry *dentry, const char *name) rc = -EOPNOTSUPP; goto out; } - mutex_lock(&d_inode(lower_dentry)->i_mutex); + inode_lock(d_inode(lower_dentry)); rc = d_inode(lower_dentry)->i_op->removexattr(lower_dentry, name); - mutex_unlock(&d_inode(lower_dentry)->i_mutex); + inode_unlock(d_inode(lower_dentry)); out: return rc; } diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index caba848ac763..c6ced4cbf0cf 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -436,7 +436,7 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode) rc = -ENOMEM; goto out; } - mutex_lock(&lower_inode->i_mutex); + inode_lock(lower_inode); size = lower_inode->i_op->getxattr(lower_dentry, ECRYPTFS_XATTR_NAME, xattr_virt, PAGE_CACHE_SIZE); if (size < 0) @@ -444,7 +444,7 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode) put_unaligned_be64(i_size_read(ecryptfs_inode), xattr_virt); rc = lower_inode->i_op->setxattr(lower_dentry, ECRYPTFS_XATTR_NAME, xattr_virt, size, 0); - mutex_unlock(&lower_inode->i_mutex); + inode_unlock(lower_inode); if (rc) printk(KERN_ERR "Error whilst attempting to write inode size " "to lower file xattr; rc = [%d]\n", rc); diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c index 90001da9abfd..c424e4813ec8 100644 --- a/fs/efivarfs/file.c +++ b/fs/efivarfs/file.c @@ -50,9 +50,9 @@ static ssize_t efivarfs_file_write(struct file *file, d_delete(file->f_path.dentry); dput(file->f_path.dentry); } else { - mutex_lock(&inode->i_mutex); + inode_lock(inode); i_size_write(inode, datasize + sizeof(attributes)); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } bytes = count; diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 86a2121828c3..b8a564f29107 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -160,10 +160,10 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, efivar_entry_size(entry, &size); efivar_entry_add(entry, &efivarfs_list); - mutex_lock(&inode->i_mutex); + inode_lock(inode); inode->i_private = entry; i_size_write(inode, size + sizeof(entry->var.Attributes)); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); d_add(dentry, inode); return 0; diff --git a/fs/exec.c b/fs/exec.c index 828ec5f07de0..dcd4ac7d3f1e 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1307,13 +1307,13 @@ static void bprm_fill_uid(struct linux_binprm *bprm) return; /* Be careful if suid/sgid is set */ - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* reload atomically mode/uid/gid now that lock held */ mode = inode->i_mode; uid = inode->i_uid; gid = inode->i_gid; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); /* We ignore suid/sgid if there are no mappings for them in the ns */ if (!kuid_has_mapping(bprm->cred->user_ns, uid) || diff --git a/fs/exofs/file.c b/fs/exofs/file.c index 906de66e8e7e..28645f0640f7 100644 --- a/fs/exofs/file.c +++ b/fs/exofs/file.c @@ -52,9 +52,9 @@ static int exofs_file_fsync(struct file *filp, loff_t start, loff_t end, if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = sync_inode_metadata(filp->f_mapping->host, 1); - 
mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 714cd37a6ba3..c46f1a190b8d 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -124,10 +124,10 @@ static struct dentry *reconnect_one(struct vfsmount *mnt, int err; parent = ERR_PTR(-EACCES); - mutex_lock(&dentry->d_inode->i_mutex); + inode_lock(dentry->d_inode); if (mnt->mnt_sb->s_export_op->get_parent) parent = mnt->mnt_sb->s_export_op->get_parent(dentry); - mutex_unlock(&dentry->d_inode->i_mutex); + inode_unlock(dentry->d_inode); if (IS_ERR(parent)) { dprintk("%s: get_parent of %ld failed, err %d\n", @@ -143,9 +143,9 @@ static struct dentry *reconnect_one(struct vfsmount *mnt, if (err) goto out_err; dprintk("%s: found name: %s\n", __func__, nbuf); - mutex_lock(&parent->d_inode->i_mutex); + inode_lock(parent->d_inode); tmp = lookup_one_len(nbuf, parent, strlen(nbuf)); - mutex_unlock(&parent->d_inode->i_mutex); + inode_unlock(parent->d_inode); if (IS_ERR(tmp)) { dprintk("%s: lookup failed: %d\n", __func__, PTR_ERR(tmp)); goto out_err; @@ -503,10 +503,10 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, */ err = exportfs_get_name(mnt, target_dir, nbuf, result); if (!err) { - mutex_lock(&target_dir->d_inode->i_mutex); + inode_lock(target_dir->d_inode); nresult = lookup_one_len(nbuf, target_dir, strlen(nbuf)); - mutex_unlock(&target_dir->d_inode->i_mutex); + inode_unlock(target_dir->d_inode); if (!IS_ERR(nresult)) { if (nresult->d_inode) { dput(result); diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 11a42c5a09ae..2c88d683cd91 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -102,8 +102,8 @@ static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma, { struct inode *inode = file_inode(vma->vm_file); struct ext2_inode_info *ei = EXT2_I(inode); - int ret = VM_FAULT_NOPAGE; loff_t size; + int ret; sb_start_pagefault(inode->i_sb); file_update_time(vma->vm_file); @@ -113,6 +113,8 @@ static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma, size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (vmf->pgoff >= size) ret = VM_FAULT_SIGBUS; + else + ret = dax_pfn_mkwrite(vma, vmf); up_read(&ei->dax_sem); sb_end_pagefault(inode->i_sb); diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c index 5d46c09863f0..b386af2e45f4 100644 --- a/fs/ext2/ioctl.c +++ b/fs/ext2/ioctl.c @@ -51,10 +51,10 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) flags = ext2_mask_flags(inode->i_mode, flags); - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* Is it quota file? 
Do not allow user to mess with it */ if (IS_NOQUOTA(inode)) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); ret = -EPERM; goto setflags_out; } @@ -68,7 +68,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) */ if ((flags ^ oldflags) & (EXT2_APPEND_FL | EXT2_IMMUTABLE_FL)) { if (!capable(CAP_LINUX_IMMUTABLE)) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); ret = -EPERM; goto setflags_out; } @@ -80,7 +80,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) ext2_set_inode_flags(inode); inode->i_ctime = CURRENT_TIME_SEC; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); mark_inode_dirty(inode); setflags_out: @@ -102,10 +102,10 @@ setflags_out: goto setversion_out; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); inode->i_ctime = CURRENT_TIME_SEC; inode->i_generation = generation; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); mark_inode_dirty(inode); setversion_out: diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index 1a0835073663..c8021208a7eb 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -384,14 +384,12 @@ int ext4_decrypt(struct page *page) EXT4_DECRYPT, page->index, page, page); } -int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex) +int ext4_encrypted_zeroout(struct inode *inode, ext4_lblk_t lblk, + ext4_fsblk_t pblk, ext4_lblk_t len) { struct ext4_crypto_ctx *ctx; struct page *ciphertext_page = NULL; struct bio *bio; - ext4_lblk_t lblk = le32_to_cpu(ex->ee_block); - ext4_fsblk_t pblk = ext4_ext_pblock(ex); - unsigned int len = ext4_ext_get_actual_len(ex); int ret, err = 0; #if 0 diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c index c5882b36e558..9a16d1e75a49 100644 --- a/fs/ext4/crypto_key.c +++ b/fs/ext4/crypto_key.c @@ -213,9 +213,11 @@ retry: res = -ENOKEY; goto out; } + down_read(&keyring_key->sem); ukp = user_key_payload(keyring_key); if (ukp->datalen != sizeof(struct ext4_encryption_key)) { res = -EINVAL; + up_read(&keyring_key->sem); goto out; } master_key = (struct ext4_encryption_key *)ukp->data; @@ -226,10 +228,12 @@ retry: "ext4: key size incorrect: %d\n", master_key->size); res = -ENOKEY; + up_read(&keyring_key->sem); goto out; } res = ext4_derive_key_aes(ctx.nonce, master_key->raw, raw_key); + up_read(&keyring_key->sem); if (res) goto out; got_key: diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index cc7ca4e87144..0662b285dc8a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -378,14 +378,22 @@ struct flex_groups { #define EXT4_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ -#define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */ -#define EXT4_FL_USER_MODIFIABLE 0x004380FF /* User modifiable flags */ +#define EXT4_FL_USER_VISIBLE 0x304BDFFF /* User visible flags */ +#define EXT4_FL_USER_MODIFIABLE 0x204380FF /* User modifiable flags */ + +#define EXT4_FL_XFLAG_VISIBLE (EXT4_SYNC_FL | \ + EXT4_IMMUTABLE_FL | \ + EXT4_APPEND_FL | \ + EXT4_NODUMP_FL | \ + EXT4_NOATIME_FL | \ + EXT4_PROJINHERIT_FL) /* Flags that should be inherited by new inodes from their parent. */ #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\ - EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL) + EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL |\ + EXT4_PROJINHERIT_FL) /* Flags that are appropriate for regular files (all but dir-specific ones). 
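The fsxattr get/set pair defined above mirrors the long-standing XFS interface, so a userspace caller reads a file's xflags and project ID as below (a sketch; on kernels predating the uapi merge the struct and ioctl numbers have to be supplied locally, exactly as the #ifndef above anticipates, and argument checking is trimmed):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/fs.h>   /* FS_IOC_FSGETXATTR, struct fsxattr on newer kernels */

int main(int argc, char **argv)
{
        struct fsxattr fsx;
        int fd = open(argv[1], O_RDONLY);

        if (fd < 0 || ioctl(fd, FS_IOC_FSGETXATTR, &fsx) != 0) {
                perror("FS_IOC_FSGETXATTR");
                return 1;
        }
        printf("xflags 0x%x projid %u\n", fsx.fsx_xflags, fsx.fsx_projid);
        return 0;
}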
*/ #define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL)) @@ -555,10 +563,12 @@ enum { #define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040 /* Request will not result in inode size update (user for fallocate) */ #define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080 - /* Do not take i_data_sem locking in ext4_map_blocks */ -#define EXT4_GET_BLOCKS_NO_LOCK 0x0100 /* Convert written extents to unwritten */ -#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0200 +#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0100 + /* Write zeros to newly created written extents */ +#define EXT4_GET_BLOCKS_ZERO 0x0200 +#define EXT4_GET_BLOCKS_CREATE_ZERO (EXT4_GET_BLOCKS_CREATE |\ + EXT4_GET_BLOCKS_ZERO) /* * The bit position of these flags must not overlap with any of the @@ -616,6 +626,46 @@ enum { #define EXT4_IOC_GET_ENCRYPTION_PWSALT _IOW('f', 20, __u8[16]) #define EXT4_IOC_GET_ENCRYPTION_POLICY _IOW('f', 21, struct ext4_encryption_policy) +#ifndef FS_IOC_FSGETXATTR +/* Until the uapi changes get merged for project quota... */ + +#define FS_IOC_FSGETXATTR _IOR('X', 31, struct fsxattr) +#define FS_IOC_FSSETXATTR _IOW('X', 32, struct fsxattr) + +/* + * Structure for FS_IOC_FSGETXATTR and FS_IOC_FSSETXATTR. + */ +struct fsxattr { + __u32 fsx_xflags; /* xflags field value (get/set) */ + __u32 fsx_extsize; /* extsize field value (get/set)*/ + __u32 fsx_nextents; /* nextents field value (get) */ + __u32 fsx_projid; /* project identifier (get/set) */ + unsigned char fsx_pad[12]; +}; + +/* + * Flags for the fsx_xflags field + */ +#define FS_XFLAG_REALTIME 0x00000001 /* data in realtime volume */ +#define FS_XFLAG_PREALLOC 0x00000002 /* preallocated file extents */ +#define FS_XFLAG_IMMUTABLE 0x00000008 /* file cannot be modified */ +#define FS_XFLAG_APPEND 0x00000010 /* all writes append */ +#define FS_XFLAG_SYNC 0x00000020 /* all writes synchronous */ +#define FS_XFLAG_NOATIME 0x00000040 /* do not update access time */ +#define FS_XFLAG_NODUMP 0x00000080 /* do not include in backups */ +#define FS_XFLAG_RTINHERIT 0x00000100 /* create with rt bit set */ +#define FS_XFLAG_PROJINHERIT 0x00000200 /* create with parents projid */ +#define FS_XFLAG_NOSYMLINKS 0x00000400 /* disallow symlink creation */ +#define FS_XFLAG_EXTSIZE 0x00000800 /* extent size allocator hint */ +#define FS_XFLAG_EXTSZINHERIT 0x00001000 /* inherit inode extent size */ +#define FS_XFLAG_NODEFRAG 0x00002000 /* do not defragment */ +#define FS_XFLAG_FILESTREAM 0x00004000 /* use filestream allocator */ +#define FS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */ +#endif /* !defined(FS_IOC_FSGETXATTR) */ + +#define EXT4_IOC_FSGETXATTR FS_IOC_FSGETXATTR +#define EXT4_IOC_FSSETXATTR FS_IOC_FSSETXATTR + #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* * ioctl commands in 32 bit emulation @@ -910,6 +960,15 @@ struct ext4_inode_info { * by other means, so we have i_data_sem. */ struct rw_semaphore i_data_sem; + /* + * i_mmap_sem is for serializing page faults with truncate / punch hole + * operations. We have to make sure that new page cannot be faulted in + * a section of the inode that is being punched. We cannot easily use + * i_data_sem for this since we need protection for the whole punch + * operation and i_data_sem ranks below transaction start so we have + * to occasionally drop it. 
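The comment describes a classic shared/exclusive split: page faults take i_mmap_sem for read, truncate and punch-hole take it for write before tearing down pages. Schematically (a sketch of the pattern only, not the exact ext4 code; the function names are illustrative):

/* fault side: many faults may proceed concurrently */
static int fault_path_sketch(struct vm_area_struct *vma, struct vm_fault *vmf)
{
        struct inode *inode = file_inode(vma->vm_file);
        int ret;

        down_read(&EXT4_I(inode)->i_mmap_sem);
        ret = filemap_fault(vma, vmf);
        up_read(&EXT4_I(inode)->i_mmap_sem);
        return ret;
}

/* punch-hole side: exclude all new faults while pages go away */
static void punch_hole_sketch(struct inode *inode, loff_t start, loff_t len)
{
        down_write(&EXT4_I(inode)->i_mmap_sem);
        truncate_pagecache_range(inode, start, start + len - 1);
        /* ... drop the blocks under i_data_sem, then ... */
        up_write(&EXT4_I(inode)->i_mmap_sem);
}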
+ */ + struct rw_semaphore i_mmap_sem; struct inode vfs_inode; struct jbd2_inode *jinode; @@ -993,6 +1052,7 @@ struct ext4_inode_info { /* Encryption params */ struct ext4_crypt_info *i_crypt_info; #endif + kprojid_t i_projid; }; /* @@ -1248,7 +1308,7 @@ struct ext4_super_block { #endif /* Number of quota types we support */ -#define EXT4_MAXQUOTAS 2 +#define EXT4_MAXQUOTAS 3 /* * fourth extended-fs super-block data in memory @@ -1754,7 +1814,8 @@ EXT4_FEATURE_INCOMPAT_FUNCS(encrypt, ENCRYPT) EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\ EXT4_FEATURE_RO_COMPAT_BIGALLOC |\ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\ - EXT4_FEATURE_RO_COMPAT_QUOTA) + EXT4_FEATURE_RO_COMPAT_QUOTA |\ + EXT4_FEATURE_RO_COMPAT_PROJECT) #define EXTN_FEATURE_FUNCS(ver) \ static inline bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \ @@ -1796,6 +1857,11 @@ static inline bool ext4_has_incompat_features(struct super_block *sb) #define EXT4_DEF_RESUID 0 #define EXT4_DEF_RESGID 0 +/* + * Default project ID + */ +#define EXT4_DEF_PROJID 0 + #define EXT4_DEF_INODE_READAHEAD_BLKS 32 /* @@ -2234,7 +2300,8 @@ void ext4_restore_control_page(struct page *data_page); struct page *ext4_encrypt(struct inode *inode, struct page *plaintext_page); int ext4_decrypt(struct page *page); -int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex); +int ext4_encrypted_zeroout(struct inode *inode, ext4_lblk_t lblk, + ext4_fsblk_t pblk, ext4_lblk_t len); #ifdef CONFIG_EXT4_FS_ENCRYPTION int ext4_init_crypto(void); @@ -2440,8 +2507,8 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); int ext4_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); -int ext4_get_block_dax(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create); +int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, @@ -2484,9 +2551,13 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, loff_t lstart, loff_t lend); extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); +extern int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); +extern int ext4_get_projid(struct inode *inode, kprojid_t *projid); extern void ext4_da_update_reserve_space(struct inode *inode, int used, int quota_claim); +extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, + ext4_fsblk_t pblk, ext4_lblk_t len); /* indirect.c */ extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, @@ -2825,7 +2896,7 @@ do { \ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) { WARN_ON_ONCE(S_ISREG(inode->i_mode) && - !mutex_is_locked(&inode->i_mutex)); + !inode_is_locked(inode)); down_write(&EXT4_I(inode)->i_data_sem); if (newsize > EXT4_I(inode)->i_disksize) EXT4_I(inode)->i_disksize = newsize; @@ -2848,6 +2919,9 @@ static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize) return changed; } +int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, + loff_t len); + struct ext4_group_info { unsigned long bb_state; 
struct rb_root bb_free_root; @@ -2986,8 +3060,7 @@ extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, struct page *page); extern int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, - struct dentry *dentry, - struct inode *inode); + struct inode *dir, struct inode *inode); extern int ext4_try_create_inline_dir(handle_t *handle, struct inode *parent, struct inode *inode); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 551353b1b17a..0ffabaf90aa5 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3119,19 +3119,11 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) { ext4_fsblk_t ee_pblock; unsigned int ee_len; - int ret; ee_len = ext4_ext_get_actual_len(ex); ee_pblock = ext4_ext_pblock(ex); - - if (ext4_encrypted_inode(inode)) - return ext4_encrypted_zeroout(inode, ex); - - ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS); - if (ret > 0) - ret = 0; - - return ret; + return ext4_issue_zeroout(inode, le32_to_cpu(ex->ee_block), ee_pblock, + ee_len); } /* @@ -4052,6 +4044,14 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, } /* IO end_io complete, convert the filled extent to written */ if (flags & EXT4_GET_BLOCKS_CONVERT) { + if (flags & EXT4_GET_BLOCKS_ZERO) { + if (allocated > map->m_len) + allocated = map->m_len; + err = ext4_issue_zeroout(inode, map->m_lblk, newblock, + allocated); + if (err < 0) + goto out2; + } ret = ext4_convert_unwritten_extents_endio(handle, inode, map, ppath); if (ret >= 0) { @@ -4685,10 +4685,6 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, if (len <= EXT_UNWRITTEN_MAX_LEN) flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; - /* Wait all existing dio workers, newcomers will block on i_mutex */ - ext4_inode_block_unlocked_dio(inode); - inode_dio_wait(inode); - /* * credits to insert 1 extent into extent tree */ @@ -4752,8 +4748,6 @@ retry: goto retry; } - ext4_inode_resume_unlocked_dio(inode); - return ret > 0 ? ret2 : ret; } @@ -4770,7 +4764,6 @@ static long ext4_zero_range(struct file *file, loff_t offset, int partial_begin, partial_end; loff_t start, end; ext4_lblk_t lblk; - struct address_space *mapping = inode->i_mapping; unsigned int blkbits = inode->i_blkbits; trace_ext4_zero_range(inode, offset, len, mode); @@ -4786,17 +4779,6 @@ static long ext4_zero_range(struct file *file, loff_t offset, } /* - * Write out all dirty pages to avoid race conditions - * Then release them. - */ - if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { - ret = filemap_write_and_wait_range(mapping, offset, - offset + len - 1); - if (ret) - return ret; - } - - /* * Round up offset. 
This is not fallocate, we need to zero out * blocks, so convert interior block aligned part of the range to * unwritten and possibly manually zero out unaligned parts of the @@ -4817,7 +4799,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, else max_blocks -= lblk; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* * Indirect files do not support unwritten extents @@ -4839,6 +4821,10 @@ static long ext4_zero_range(struct file *file, loff_t offset, if (mode & FALLOC_FL_KEEP_SIZE) flags |= EXT4_GET_BLOCKS_KEEP_SIZE; + /* Wait all existing dio workers, newcomers will block on i_mutex */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + /* Preallocate the range including the unaligned edges */ if (partial_begin || partial_end) { ret = ext4_alloc_file_blocks(file, @@ -4847,7 +4833,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, round_down(offset, 1 << blkbits)) >> blkbits, new_size, flags, mode); if (ret) - goto out_mutex; + goto out_dio; } @@ -4856,16 +4842,23 @@ static long ext4_zero_range(struct file *file, loff_t offset, flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | EXT4_EX_NOCACHE); - /* Now release the pages and zero block aligned part of pages*/ + /* + * Prevent page faults from reinstantiating pages we have + * released from page cache. + */ + down_write(&EXT4_I(inode)->i_mmap_sem); + ret = ext4_update_disksize_before_punch(inode, offset, len); + if (ret) { + up_write(&EXT4_I(inode)->i_mmap_sem); + goto out_dio; + } + /* Now release the pages and zero block aligned part of pages */ truncate_pagecache_range(inode, start, end - 1); inode->i_mtime = inode->i_ctime = ext4_current_time(inode); - /* Wait all existing dio workers, newcomers will block on i_mutex */ - ext4_inode_block_unlocked_dio(inode); - inode_dio_wait(inode); - ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags, mode); + up_write(&EXT4_I(inode)->i_mmap_sem); if (ret) goto out_dio; } @@ -4909,7 +4902,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, out_dio: ext4_inode_resume_unlocked_dio(inode); out_mutex: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } @@ -4980,7 +4973,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (mode & FALLOC_FL_KEEP_SIZE) flags |= EXT4_GET_BLOCKS_KEEP_SIZE; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* * We only support preallocation for extent-based files only @@ -4998,8 +4991,13 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) goto out; } + /* Wait all existing dio workers, newcomers will block on i_mutex */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags, mode); + ext4_inode_resume_unlocked_dio(inode); if (ret) goto out; @@ -5008,7 +5006,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) EXT4_I(inode)->i_sync_tid); } out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); return ret; } @@ -5494,21 +5492,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) return ret; } - /* - * Need to round down offset to be aligned with page size boundary - * for page size > block size.
- */ - ioffset = round_down(offset, PAGE_SIZE); - - /* Write out all dirty pages */ - ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, - LLONG_MAX); - if (ret) - return ret; - - /* Take mutex lock */ - mutex_lock(&inode->i_mutex); - + inode_lock(inode); /* * There is no need to overlap collapse range with EOF, in which case * it is effectively a truncate operation @@ -5524,17 +5508,43 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) goto out_mutex; } - truncate_pagecache(inode, ioffset); - /* Wait for existing dio to complete */ ext4_inode_block_unlocked_dio(inode); inode_dio_wait(inode); + /* + * Prevent page faults from reinstantiating pages we have released from + * page cache. + */ + down_write(&EXT4_I(inode)->i_mmap_sem); + /* + * Need to round down offset to be aligned with page size boundary + * for page size > block size. + */ + ioffset = round_down(offset, PAGE_SIZE); + /* + * Write tail of the last page before removed range since it will get + * removed from the page cache below. + */ + ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, offset); + if (ret) + goto out_mmap; + /* + * Write data that will be shifted to preserve them when discarding + * page cache below. We are also protected from pages becoming dirty + * by i_mmap_sem. + */ + ret = filemap_write_and_wait_range(inode->i_mapping, offset + len, + LLONG_MAX); + if (ret) + goto out_mmap; + truncate_pagecache(inode, ioffset); + credits = ext4_writepage_trans_blocks(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); - goto out_dio; + goto out_mmap; } down_write(&EXT4_I(inode)->i_data_sem); @@ -5573,10 +5583,11 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) out_stop: ext4_journal_stop(handle); -out_dio: +out_mmap: + up_write(&EXT4_I(inode)->i_mmap_sem); ext4_inode_resume_unlocked_dio(inode); out_mutex: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } @@ -5627,21 +5638,7 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) return ret; } - /* - * Need to round down to align start offset to page size boundary - * for page size > block size. - */ - ioffset = round_down(offset, PAGE_SIZE); - - /* Write out all dirty pages */ - ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, - LLONG_MAX); - if (ret) - return ret; - - /* Take mutex lock */ - mutex_lock(&inode->i_mutex); - + inode_lock(inode); /* Currently just for extent based files */ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { ret = -EOPNOTSUPP; @@ -5660,17 +5657,32 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) goto out_mutex; } - truncate_pagecache(inode, ioffset); - /* Wait for existing dio to complete */ ext4_inode_block_unlocked_dio(inode); inode_dio_wait(inode); + /* + * Prevent page faults from reinstantiating pages we have released from + * page cache. + */ + down_write(&EXT4_I(inode)->i_mmap_sem); + /* + * Need to round down to align start offset to page size boundary + * for page size > block size. 
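Collapse range, insert range, zero range, and (further down) punch hole all converge on one ordering discipline after this series. A condensed sketch of that shared skeleton, with error handling and per-operation differences (such as exactly which ranges get written back) elided:

	inode_lock(inode);			/* serialize against writers */
	ext4_inode_block_unlocked_dio(inode);	/* stop new unlocked DIO */
	inode_dio_wait(inode);			/* drain in-flight DIO */
	down_write(&EXT4_I(inode)->i_mmap_sem);	/* block page faults */
	/* write back the affected ranges, then drop the page cache */
	truncate_pagecache_range(inode, start, end - 1);
	/* start a handle and shift/zero extents under i_data_sem */
	up_write(&EXT4_I(inode)->i_mmap_sem);
	ext4_inode_resume_unlocked_dio(inode);
	inode_unlock(inode);

Holding i_mmap_sem for writing across the page-cache truncation is what closes the race these hunks describe: a fault can no longer reinstantiate a page between the truncation and the extent manipulation.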
+ */ + ioffset = round_down(offset, PAGE_SIZE); + /* Write out all dirty pages */ + ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, + LLONG_MAX); + if (ret) + goto out_mmap; + truncate_pagecache(inode, ioffset); + credits = ext4_writepage_trans_blocks(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); - goto out_dio; + goto out_mmap; } /* Expand file to avoid data loss if there is error while shifting */ @@ -5741,10 +5753,11 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) out_stop: ext4_journal_stop(handle); -out_dio: +out_mmap: + up_write(&EXT4_I(inode)->i_mmap_sem); ext4_inode_resume_unlocked_dio(inode); out_mutex: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } @@ -5779,8 +5792,8 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem)); BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem)); - BUG_ON(!mutex_is_locked(&inode1->i_mutex)); - BUG_ON(!mutex_is_locked(&inode2->i_mutex)); + BUG_ON(!inode_is_locked(inode1)); + BUG_ON(!inode_is_locked(inode2)); *erp = ext4_es_remove_extent(inode1, lblk1, count); if (unlikely(*erp)) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 113837e7ba98..1126436dada1 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -113,7 +113,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) ext4_unwritten_wait(inode); } - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret <= 0) goto out; @@ -169,7 +169,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) } ret = __generic_file_write_iter(iocb, from); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (ret > 0) { ssize_t err; @@ -186,50 +186,42 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) return ret; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (aio_mutex) mutex_unlock(aio_mutex); return ret; } #ifdef CONFIG_FS_DAX -static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate) -{ - struct inode *inode = bh->b_assoc_map->host; - /* XXX: breaks on 32-bit > 16TB. Is that even supported? 
*/ - loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits; - int err; - if (!uptodate) - return; - WARN_ON(!buffer_unwritten(bh)); - err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size); -} - static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { int result; handle_t *handle = NULL; - struct super_block *sb = file_inode(vma->vm_file)->i_sb; + struct inode *inode = file_inode(vma->vm_file); + struct super_block *sb = inode->i_sb; bool write = vmf->flags & FAULT_FLAG_WRITE; if (write) { sb_start_pagefault(sb); file_update_time(vma->vm_file); + down_read(&EXT4_I(inode)->i_mmap_sem); handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, EXT4_DATA_TRANS_BLOCKS(sb)); - } + } else + down_read(&EXT4_I(inode)->i_mmap_sem); if (IS_ERR(handle)) result = VM_FAULT_SIGBUS; else - result = __dax_fault(vma, vmf, ext4_get_block_dax, - ext4_end_io_unwritten); + result = __dax_fault(vma, vmf, ext4_dax_mmap_get_block, NULL); if (write) { if (!IS_ERR(handle)) ext4_journal_stop(handle); + up_read(&EXT4_I(inode)->i_mmap_sem); sb_end_pagefault(sb); - } + } else + up_read(&EXT4_I(inode)->i_mmap_sem); return result; } @@ -246,44 +238,88 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, if (write) { sb_start_pagefault(sb); file_update_time(vma->vm_file); + down_read(&EXT4_I(inode)->i_mmap_sem); handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, ext4_chunk_trans_blocks(inode, PMD_SIZE / PAGE_SIZE)); - } + } else + down_read(&EXT4_I(inode)->i_mmap_sem); if (IS_ERR(handle)) result = VM_FAULT_SIGBUS; else result = __dax_pmd_fault(vma, addr, pmd, flags, - ext4_get_block_dax, ext4_end_io_unwritten); + ext4_dax_mmap_get_block, NULL); if (write) { if (!IS_ERR(handle)) ext4_journal_stop(handle); + up_read(&EXT4_I(inode)->i_mmap_sem); sb_end_pagefault(sb); - } + } else + up_read(&EXT4_I(inode)->i_mmap_sem); return result; } static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { - return dax_mkwrite(vma, vmf, ext4_get_block_dax, - ext4_end_io_unwritten); + int err; + struct inode *inode = file_inode(vma->vm_file); + + sb_start_pagefault(inode->i_sb); + file_update_time(vma->vm_file); + down_read(&EXT4_I(inode)->i_mmap_sem); + err = __dax_mkwrite(vma, vmf, ext4_dax_mmap_get_block, NULL); + up_read(&EXT4_I(inode)->i_mmap_sem); + sb_end_pagefault(inode->i_sb); + + return err; +} + +/* + * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_mkwrite() + * handler we check for races against truncate. Note that since we cycle through + * i_mmap_sem, we are sure that also any hole punching that began before we + * were called is finished by now and so if it included part of the file we + * are working on, our pte will get unmapped and the check for pte_same() in + * wp_pfn_shared() fails. Thus fault gets retried and things work out as + * desired.
+ */ +static int ext4_dax_pfn_mkwrite(struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vma->vm_file); + struct super_block *sb = inode->i_sb; + loff_t size; + int ret; + + sb_start_pagefault(sb); + file_update_time(vma->vm_file); + down_read(&EXT4_I(inode)->i_mmap_sem); + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (vmf->pgoff >= size) + ret = VM_FAULT_SIGBUS; + else + ret = dax_pfn_mkwrite(vma, vmf); + up_read(&EXT4_I(inode)->i_mmap_sem); + sb_end_pagefault(sb); + + return ret; } static const struct vm_operations_struct ext4_dax_vm_ops = { .fault = ext4_dax_fault, .pmd_fault = ext4_dax_pmd_fault, .page_mkwrite = ext4_dax_mkwrite, - .pfn_mkwrite = dax_pfn_mkwrite, + .pfn_mkwrite = ext4_dax_pfn_mkwrite, }; #else #define ext4_dax_vm_ops ext4_file_vm_ops #endif static const struct vm_operations_struct ext4_file_vm_ops = { - .fault = filemap_fault, + .fault = ext4_filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = ext4_page_mkwrite, }; @@ -527,11 +563,11 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) int blkbits; int ret = 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); isize = i_size_read(inode); if (offset >= isize) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return -ENXIO; } @@ -579,7 +615,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) dataoff = (loff_t)last << blkbits; } while (last <= end); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (dataoff > isize) return -ENXIO; @@ -600,11 +636,11 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) int blkbits; int ret = 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); isize = i_size_read(inode); if (offset >= isize) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return -ENXIO; } @@ -655,7 +691,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) break; } while (last <= end); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (holeoff > isize) holeoff = isize; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 1b8024d26f65..3fcfd50a2e8a 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -799,6 +799,13 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, inode->i_gid = dir->i_gid; } else inode_init_owner(inode, dir, mode); + + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_PROJECT) && + ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) + ei->i_projid = EXT4_I(dir)->i_projid; + else + ei->i_projid = make_kprojid(&init_user_ns, EXT4_DEF_PROJID); + err = dquot_initialize(inode); if (err) goto out; diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index d884989cc83d..dfe3b9bafc0d 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -995,12 +995,11 @@ void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh, */ static int ext4_add_dirent_to_inline(handle_t *handle, struct ext4_filename *fname, - struct dentry *dentry, + struct inode *dir, struct inode *inode, struct ext4_iloc *iloc, void *inline_start, int inline_size) { - struct inode *dir = d_inode(dentry->d_parent); int err; struct ext4_dir_entry_2 *de; @@ -1245,12 +1244,11 @@ out: * the new created block. 
*/ int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, - struct dentry *dentry, struct inode *inode) + struct inode *dir, struct inode *inode) { int ret, inline_size; void *inline_start; struct ext4_iloc iloc; - struct inode *dir = d_inode(dentry->d_parent); ret = ext4_get_inode_loc(dir, &iloc); if (ret) @@ -1264,7 +1262,7 @@ int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, EXT4_INLINE_DOTDOT_SIZE; inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; - ret = ext4_add_dirent_to_inline(handle, fname, dentry, inode, &iloc, + ret = ext4_add_dirent_to_inline(handle, fname, dir, inode, &iloc, inline_start, inline_size); if (ret != -ENOSPC) goto out; @@ -1285,7 +1283,7 @@ int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, if (inline_size) { inline_start = ext4_get_inline_xattr_pos(dir, &iloc); - ret = ext4_add_dirent_to_inline(handle, fname, dentry, + ret = ext4_add_dirent_to_inline(handle, fname, dir, inode, &iloc, inline_start, inline_size); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b3bd912df6bf..83bc8bfb3bea 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -383,6 +383,21 @@ static int __check_block_validity(struct inode *inode, const char *func, return 0; } +int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, + ext4_lblk_t len) +{ + int ret; + + if (ext4_encrypted_inode(inode)) + return ext4_encrypted_zeroout(inode, lblk, pblk, len); + + ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS); + if (ret > 0) + ret = 0; + + return ret; +} + #define check_block_validity(inode, map) \ __check_block_validity((inode), __func__, __LINE__, (map)) @@ -403,8 +418,7 @@ static void ext4_map_blocks_es_recheck(handle_t *handle, * out taking i_data_sem. So at the time the unwritten extent * could be converted. */ - if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) - down_read(&EXT4_I(inode)->i_data_sem); + down_read(&EXT4_I(inode)->i_data_sem); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { retval = ext4_ext_map_blocks(handle, inode, map, flags & EXT4_GET_BLOCKS_KEEP_SIZE); @@ -412,8 +426,7 @@ static void ext4_map_blocks_es_recheck(handle_t *handle, retval = ext4_ind_map_blocks(handle, inode, map, flags & EXT4_GET_BLOCKS_KEEP_SIZE); } - if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) - up_read((&EXT4_I(inode)->i_data_sem)); + up_read((&EXT4_I(inode)->i_data_sem)); /* * We don't check m_len because extent will be collapsed in status @@ -509,8 +522,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, * Try to see if we can get the block without requesting a new * file system block. */ - if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) - down_read(&EXT4_I(inode)->i_data_sem); + down_read(&EXT4_I(inode)->i_data_sem); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { retval = ext4_ext_map_blocks(handle, inode, map, flags & EXT4_GET_BLOCKS_KEEP_SIZE); @@ -541,8 +553,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, if (ret < 0) retval = ret; } - if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) - up_read((&EXT4_I(inode)->i_data_sem)); + up_read((&EXT4_I(inode)->i_data_sem)); found: if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { @@ -626,13 +637,29 @@ found: } /* + * We have to zeroout blocks before inserting them into extent + * status tree. Otherwise someone could look them up there and + * use them before they are really zeroed.
+ */ + if (flags & EXT4_GET_BLOCKS_ZERO && + map->m_flags & EXT4_MAP_MAPPED && + map->m_flags & EXT4_MAP_NEW) { + ret = ext4_issue_zeroout(inode, map->m_lblk, + map->m_pblk, map->m_len); + if (ret) { + retval = ret; + goto out_sem; + } + } + + /* * If the extent has been zeroed out, we don't need to update * extent status tree. */ if ((flags & EXT4_GET_BLOCKS_PRE_IO) && ext4_es_lookup_extent(inode, map->m_lblk, &es)) { if (ext4_es_is_written(&es)) - goto has_zeroout; + goto out_sem; } status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; @@ -643,11 +670,13 @@ found: status |= EXTENT_STATUS_DELAYED; ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, status); - if (ret < 0) + if (ret < 0) { retval = ret; + goto out_sem; + } } -has_zeroout: +out_sem: up_write((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { ret = check_block_validity(inode, map); @@ -674,7 +703,7 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock, map.m_lblk = iblock; map.m_len = bh->b_size >> inode->i_blkbits; - if (flags && !(flags & EXT4_GET_BLOCKS_NO_LOCK) && !handle) { + if (flags && !handle) { /* Direct IO write... */ if (map.m_len > DIO_MAX_BLOCKS) map.m_len = DIO_MAX_BLOCKS; @@ -694,16 +723,6 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock, map_bh(bh, inode->i_sb, map.m_pblk); bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; - if (IS_DAX(inode) && buffer_unwritten(bh)) { - /* - * dgc: I suspect unwritten conversion on ext4+DAX is - * fundamentally broken here when there are concurrent - * read/write in progress on this inode. - */ - WARN_ON_ONCE(io_end); - bh->b_assoc_map = inode->i_mapping; - bh->b_private = (void *)(unsigned long)iblock; - } if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN) set_buffer_defer_completion(bh); bh->b_size = inode->i_sb->s_blocksize * map.m_len; @@ -879,9 +898,6 @@ int do_journal_get_write_access(handle_t *handle, return ret; } -static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create); - #ifdef CONFIG_EXT4_FS_ENCRYPTION static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, get_block_t *get_block) @@ -3054,25 +3070,96 @@ int ext4_get_block_write(struct inode *inode, sector_t iblock, EXT4_GET_BLOCKS_IO_CREATE_EXT); } -static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock, +static int ext4_get_block_overwrite(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { - ext4_debug("ext4_get_block_write_nolock: inode %lu, create flag %d\n", + int ret; + + ext4_debug("ext4_get_block_overwrite: inode %lu, create flag %d\n", inode->i_ino, create); - return _ext4_get_block(inode, iblock, bh_result, - EXT4_GET_BLOCKS_NO_LOCK); + ret = _ext4_get_block(inode, iblock, bh_result, 0); + /* + * Blocks should have been preallocated! ext4_file_write_iter() checks + * that. 
+ */ + WARN_ON_ONCE(!buffer_mapped(bh_result)); + + return ret; } -int ext4_get_block_dax(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) +#ifdef CONFIG_FS_DAX +int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) { - int flags = EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_UNWRIT_EXT; - if (create) - flags |= EXT4_GET_BLOCKS_CREATE; - ext4_debug("ext4_get_block_dax: inode %lu, create flag %d\n", + int ret, err; + int credits; + struct ext4_map_blocks map; + handle_t *handle = NULL; + int flags = 0; + + ext4_debug("ext4_dax_mmap_get_block: inode %lu, create flag %d\n", inode->i_ino, create); - return _ext4_get_block(inode, iblock, bh_result, flags); + map.m_lblk = iblock; + map.m_len = bh_result->b_size >> inode->i_blkbits; + credits = ext4_chunk_trans_blocks(inode, map.m_len); + if (create) { + flags |= EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_CREATE_ZERO; + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + return ret; + } + } + + ret = ext4_map_blocks(handle, inode, &map, flags); + if (create) { + err = ext4_journal_stop(handle); + if (ret >= 0 && err < 0) + ret = err; + } + if (ret <= 0) + goto out; + if (map.m_flags & EXT4_MAP_UNWRITTEN) { + int err2; + + /* + * We are protected by i_mmap_sem so we know block cannot go + * away from under us even though we dropped i_data_sem. + * Convert extent to written and write zeros there. + * + * Note: We may get here even when create == 0. + */ + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + + err = ext4_map_blocks(handle, inode, &map, + EXT4_GET_BLOCKS_CONVERT | EXT4_GET_BLOCKS_CREATE_ZERO); + if (err < 0) + ret = err; + err2 = ext4_journal_stop(handle); + if (err2 < 0 && ret > 0) + ret = err2; + } +out: + WARN_ON_ONCE(ret == 0 && create); + if (ret > 0) { + map_bh(bh_result, inode->i_sb, map.m_pblk); + bh_result->b_state = (bh_result->b_state & ~EXT4_MAP_FLAGS) | + map.m_flags; + /* + * At least for now we have to clear BH_New so that DAX code + * doesn't attempt to zero blocks again in a racy way. + */ + bh_result->b_state &= ~(1 << BH_New); + bh_result->b_size = map.m_len << inode->i_blkbits; + ret = 0; + } + return ret; } +#endif static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, ssize_t size, void *private) @@ -3143,10 +3230,8 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, /* If we do an overwrite dio, i_mutex locking can be released */ overwrite = *((int *)iocb->private); - if (overwrite) { - down_read(&EXT4_I(inode)->i_data_sem); - mutex_unlock(&inode->i_mutex); - } + if (overwrite) + inode_unlock(inode); /* * We could direct write to holes and fallocate.
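ext4_dax_mmap_get_block() above is written against the generic get_block_t contract that __dax_fault() and __dax_pmd_fault() consume; for reference, the typedef from include/linux/fs.h:

typedef int (*get_block_t)(struct inode *inode, sector_t iblock,
			   struct buffer_head *bh_result, int create);

Because the helper now hands back only written, pre-zeroed blocks, the DAX callers can pass NULL where the old ext4_end_io_unwritten completion callback used to go.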
@@ -3189,7 +3274,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, } if (overwrite) { - get_block_func = ext4_get_block_write_nolock; + get_block_func = ext4_get_block_overwrite; } else { get_block_func = ext4_get_block_write; dio_flags = DIO_LOCKING; @@ -3245,10 +3330,8 @@ retake_lock: if (iov_iter_rw(iter) == WRITE) inode_dio_end(inode); /* take i_mutex locking again if we do an overwrite dio */ - if (overwrite) { - up_read(&EXT4_I(inode)->i_data_sem); - mutex_lock(&inode->i_mutex); - } + if (overwrite) + inode_lock(inode); return ret; } @@ -3559,6 +3642,35 @@ int ext4_can_truncate(struct inode *inode) } /* + * We have to make sure i_disksize gets properly updated before we truncate + * page cache due to hole punching or zero range. Otherwise i_disksize update + * can get lost as it may have been postponed to submission of writeback but + * that will never happen after we truncate page cache. + */ +int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, + loff_t len) +{ + handle_t *handle; + loff_t size = i_size_read(inode); + + WARN_ON(!inode_is_locked(inode)); + if (offset > size || offset + len < size) + return 0; + + if (EXT4_I(inode)->i_disksize >= size) + return 0; + + handle = ext4_journal_start(inode, EXT4_HT_MISC, 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ext4_update_i_disksize(inode, size); + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); + + return 0; +} + +/* * ext4_punch_hole: punches a hole in a file by releasing the blocks * associated with the given offset and length * @@ -3595,7 +3707,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) return ret; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* No need to punch hole beyond i_size */ if (offset >= inode->i_size) @@ -3623,17 +3735,26 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) } + /* Wait all existing dio workers, newcomers will block on i_mutex */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + + /* + * Prevent page faults from reinstantiating pages we have released from + * page cache.
+ */ + down_write(&EXT4_I(inode)->i_mmap_sem); first_block_offset = round_up(offset, sb->s_blocksize); last_block_offset = round_down((offset + length), sb->s_blocksize) - 1; /* Now release the pages and zero block aligned part of pages*/ - if (last_block_offset > first_block_offset) + if (last_block_offset > first_block_offset) { + ret = ext4_update_disksize_before_punch(inode, offset, length); + if (ret) + goto out_dio; truncate_pagecache_range(inode, first_block_offset, last_block_offset); - - /* Wait all existing dio workers, newcomers will block on i_mutex */ - ext4_inode_block_unlocked_dio(inode); - inode_dio_wait(inode); + } if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) credits = ext4_writepage_trans_blocks(inode); @@ -3680,19 +3801,15 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) if (IS_SYNC(inode)) ext4_handle_sync(handle); - /* Now release the pages again to reduce race window */ - if (last_block_offset > first_block_offset) - truncate_pagecache_range(inode, first_block_offset, - last_block_offset); - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); out_stop: ext4_journal_stop(handle); out_dio: + up_write(&EXT4_I(inode)->i_mmap_sem); ext4_inode_resume_unlocked_dio(inode); out_mutex: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } @@ -3762,7 +3879,7 @@ void ext4_truncate(struct inode *inode) * have i_mutex locked because it's not necessary. */ if (!(inode->i_state & (I_NEW|I_FREEING))) - WARN_ON(!mutex_is_locked(&inode->i_mutex)); + WARN_ON(!inode_is_locked(inode)); trace_ext4_truncate_enter(inode); if (!ext4_can_truncate(inode)) @@ -4076,6 +4193,14 @@ static inline void ext4_iget_extra_inode(struct inode *inode, EXT4_I(inode)->i_inline_off = 0; } +int ext4_get_projid(struct inode *inode, kprojid_t *projid) +{ + if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, EXT4_FEATURE_RO_COMPAT_PROJECT)) + return -EOPNOTSUPP; + *projid = EXT4_I(inode)->i_projid; + return 0; +} + struct inode *ext4_iget(struct super_block *sb, unsigned long ino) { struct ext4_iloc iloc; @@ -4087,6 +4212,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) int block; uid_t i_uid; gid_t i_gid; + projid_t i_projid; inode = iget_locked(sb, ino); if (!inode) @@ -4136,12 +4262,20 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) inode->i_mode = le16_to_cpu(raw_inode->i_mode); i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_PROJECT) && + EXT4_INODE_SIZE(sb) > EXT4_GOOD_OLD_INODE_SIZE && + EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) + i_projid = (projid_t)le32_to_cpu(raw_inode->i_projid); + else + i_projid = EXT4_DEF_PROJID; + if (!(test_opt(inode->i_sb, NO_UID32))) { i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; } i_uid_write(inode, i_uid); i_gid_write(inode, i_gid); + ei->i_projid = make_kprojid(&init_user_ns, i_projid); set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ @@ -4440,6 +4574,7 @@ static int ext4_do_update_inode(handle_t *handle, int need_datasync = 0, set_large_file = 0; uid_t i_uid; gid_t i_gid; + projid_t i_projid; spin_lock(&ei->i_raw_lock); @@ -4452,6 +4587,7 @@ static int ext4_do_update_inode(handle_t *handle, raw_inode->i_mode = cpu_to_le16(inode->i_mode); i_uid = i_uid_read(inode); i_gid = i_gid_read(inode); + i_projid = 
from_kprojid(&init_user_ns, ei->i_projid); if (!(test_opt(inode->i_sb, NO_UID32))) { raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid)); raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid)); @@ -4529,6 +4665,15 @@ static int ext4_do_update_inode(handle_t *handle, cpu_to_le16(ei->i_extra_isize); } } + + BUG_ON(!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_PROJECT) && + i_projid != EXT4_DEF_PROJID); + + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && + EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) + raw_inode->i_projid = cpu_to_le32(i_projid); + ext4_inode_csum_set(inode, raw_inode, ei); spin_unlock(&ei->i_raw_lock); if (inode->i_sb->s_flags & MS_LAZYTIME) @@ -4824,6 +4969,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) } else ext4_wait_for_tail_page_commit(inode); } + down_write(&EXT4_I(inode)->i_mmap_sem); /* * Truncate pagecache after we've waited for commit * in data=journal mode to make pages freeable. @@ -4831,6 +4977,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) truncate_pagecache(inode, inode->i_size); if (shrink) ext4_truncate(inode); + up_write(&EXT4_I(inode)->i_mmap_sem); } if (!rc) { @@ -5279,6 +5426,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) sb_start_pagefault(inode->i_sb); file_update_time(vma->vm_file); + + down_read(&EXT4_I(inode)->i_mmap_sem); /* Delalloc case is easy... */ if (test_opt(inode->i_sb, DELALLOC) && !ext4_should_journal_data(inode) && @@ -5348,6 +5497,19 @@ retry_alloc: out_ret: ret = block_page_mkwrite_return(ret); out: + up_read(&EXT4_I(inode)->i_mmap_sem); sb_end_pagefault(inode->i_sb); return ret; } + +int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vma->vm_file); + int err; + + down_read(&EXT4_I(inode)->i_mmap_sem); + err = filemap_fault(vma, vmf); + up_read(&EXT4_I(inode)->i_mmap_sem); + + return err; +} diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 5e872fd40e5e..0f6c36922c24 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -14,6 +14,7 @@ #include <linux/mount.h> #include <linux/file.h> #include <linux/random.h> +#include <linux/quotaops.h> #include <asm/uaccess.h> #include "ext4_jbd2.h" #include "ext4.h" @@ -202,6 +203,238 @@ static int uuid_is_zero(__u8 u[16]) return 1; } +static int ext4_ioctl_setflags(struct inode *inode, + unsigned int flags) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + handle_t *handle = NULL; + int err = EPERM, migrate = 0; + struct ext4_iloc iloc; + unsigned int oldflags, mask, i; + unsigned int jflag; + + /* Is it quota file? Do not allow user to mess with it */ + if (IS_NOQUOTA(inode)) + goto flags_out; + + oldflags = ei->i_flags; + + /* The JOURNAL_DATA flag is modifiable only by root */ + jflag = flags & EXT4_JOURNAL_DATA_FL; + + /* + * The IMMUTABLE and APPEND_ONLY flags can only be changed by + * the relevant capability. + * + * This test looks nicer. Thanks to Pauline Middelink + */ + if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) { + if (!capable(CAP_LINUX_IMMUTABLE)) + goto flags_out; + } + + /* + * The JOURNAL_DATA flag can only be changed by + * the relevant capability. 
+ */ + if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { + if (!capable(CAP_SYS_RESOURCE)) + goto flags_out; + } + if ((flags ^ oldflags) & EXT4_EXTENTS_FL) + migrate = 1; + + if (flags & EXT4_EOFBLOCKS_FL) { + /* we don't support adding EOFBLOCKS flag */ + if (!(oldflags & EXT4_EOFBLOCKS_FL)) { + err = -EOPNOTSUPP; + goto flags_out; + } + } else if (oldflags & EXT4_EOFBLOCKS_FL) + ext4_truncate(inode); + + handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto flags_out; + } + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (err) + goto flags_err; + + for (i = 0, mask = 1; i < 32; i++, mask <<= 1) { + if (!(mask & EXT4_FL_USER_MODIFIABLE)) + continue; + if (mask & flags) + ext4_set_inode_flag(inode, i); + else + ext4_clear_inode_flag(inode, i); + } + + ext4_set_inode_flags(inode); + inode->i_ctime = ext4_current_time(inode); + + err = ext4_mark_iloc_dirty(handle, inode, &iloc); +flags_err: + ext4_journal_stop(handle); + if (err) + goto flags_out; + + if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) + err = ext4_change_inode_journal_flag(inode, jflag); + if (err) + goto flags_out; + if (migrate) { + if (flags & EXT4_EXTENTS_FL) + err = ext4_ext_migrate(inode); + else + err = ext4_ind_migrate(inode); + } + +flags_out: + return err; +} + +#ifdef CONFIG_QUOTA +static int ext4_ioctl_setproject(struct file *filp, __u32 projid) +{ + struct inode *inode = file_inode(filp); + struct super_block *sb = inode->i_sb; + struct ext4_inode_info *ei = EXT4_I(inode); + int err, rc; + handle_t *handle; + kprojid_t kprojid; + struct ext4_iloc iloc; + struct ext4_inode *raw_inode; + + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_PROJECT)) { + if (projid != EXT4_DEF_PROJID) + return -EOPNOTSUPP; + else + return 0; + } + + if (EXT4_INODE_SIZE(sb) <= EXT4_GOOD_OLD_INODE_SIZE) + return -EOPNOTSUPP; + + kprojid = make_kprojid(&init_user_ns, (projid_t)projid); + + if (projid_eq(kprojid, EXT4_I(inode)->i_projid)) + return 0; + + err = mnt_want_write_file(filp); + if (err) + return err; + + err = -EPERM; + inode_lock(inode); + /* Is it quota file? 
Do not allow user to mess with it */ + if (IS_NOQUOTA(inode)) + goto out_unlock; + + err = ext4_get_inode_loc(inode, &iloc); + if (err) + goto out_unlock; + + raw_inode = ext4_raw_inode(&iloc); + if (!EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) { + err = -EOVERFLOW; + brelse(iloc.bh); + goto out_unlock; + } + brelse(iloc.bh); + + dquot_initialize(inode); + + handle = ext4_journal_start(inode, EXT4_HT_QUOTA, + EXT4_QUOTA_INIT_BLOCKS(sb) + + EXT4_QUOTA_DEL_BLOCKS(sb) + 3); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto out_unlock; + } + + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (err) + goto out_stop; + + if (sb_has_quota_limits_enabled(sb, PRJQUOTA)) { + struct dquot *transfer_to[MAXQUOTAS] = { }; + + transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid)); + if (transfer_to[PRJQUOTA]) { + err = __dquot_transfer(inode, transfer_to); + dqput(transfer_to[PRJQUOTA]); + if (err) + goto out_dirty; + } + } + EXT4_I(inode)->i_projid = kprojid; + inode->i_ctime = ext4_current_time(inode); +out_dirty: + rc = ext4_mark_iloc_dirty(handle, inode, &iloc); + if (!err) + err = rc; +out_stop: + ext4_journal_stop(handle); +out_unlock: + inode_unlock(inode); + mnt_drop_write_file(filp); + return err; +} +#else +static int ext4_ioctl_setproject(struct file *filp, __u32 projid) +{ + if (projid != EXT4_DEF_PROJID) + return -EOPNOTSUPP; + return 0; +} +#endif + +/* Transfer internal flags to xflags */ +static inline __u32 ext4_iflags_to_xflags(unsigned long iflags) +{ + __u32 xflags = 0; + + if (iflags & EXT4_SYNC_FL) + xflags |= FS_XFLAG_SYNC; + if (iflags & EXT4_IMMUTABLE_FL) + xflags |= FS_XFLAG_IMMUTABLE; + if (iflags & EXT4_APPEND_FL) + xflags |= FS_XFLAG_APPEND; + if (iflags & EXT4_NODUMP_FL) + xflags |= FS_XFLAG_NODUMP; + if (iflags & EXT4_NOATIME_FL) + xflags |= FS_XFLAG_NOATIME; + if (iflags & EXT4_PROJINHERIT_FL) + xflags |= FS_XFLAG_PROJINHERIT; + return xflags; +} + +/* Transfer xflags flags to internal */ +static inline unsigned long ext4_xflags_to_iflags(__u32 xflags) +{ + unsigned long iflags = 0; + + if (xflags & FS_XFLAG_SYNC) + iflags |= EXT4_SYNC_FL; + if (xflags & FS_XFLAG_IMMUTABLE) + iflags |= EXT4_IMMUTABLE_FL; + if (xflags & FS_XFLAG_APPEND) + iflags |= EXT4_APPEND_FL; + if (xflags & FS_XFLAG_NODUMP) + iflags |= EXT4_NODUMP_FL; + if (xflags & FS_XFLAG_NOATIME) + iflags |= EXT4_NOATIME_FL; + if (xflags & FS_XFLAG_PROJINHERIT) + iflags |= EXT4_PROJINHERIT_FL; + + return iflags; +} + long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -217,11 +450,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) flags = ei->i_flags & EXT4_FL_USER_VISIBLE; return put_user(flags, (int __user *) arg); case EXT4_IOC_SETFLAGS: { - handle_t *handle = NULL; - int err, migrate = 0; - struct ext4_iloc iloc; - unsigned int oldflags, mask, i; - unsigned int jflag; + int err; if (!inode_owner_or_capable(inode)) return -EACCES; @@ -235,90 +464,9 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) flags = ext4_mask_flags(inode->i_mode, flags); - err = -EPERM; - mutex_lock(&inode->i_mutex); - /* Is it quota file? Do not allow user to mess with it */ - if (IS_NOQUOTA(inode)) - goto flags_out; - - oldflags = ei->i_flags; - - /* The JOURNAL_DATA flag is modifiable only by root */ - jflag = flags & EXT4_JOURNAL_DATA_FL; - - /* - * The IMMUTABLE and APPEND_ONLY flags can only be changed by - * the relevant capability. - * - * This test looks nicer. 
Thanks to Pauline Middelink - */ - if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) { - if (!capable(CAP_LINUX_IMMUTABLE)) - goto flags_out; - } - - /* - * The JOURNAL_DATA flag can only be changed by - * the relevant capability. - */ - if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { - if (!capable(CAP_SYS_RESOURCE)) - goto flags_out; - } - if ((flags ^ oldflags) & EXT4_EXTENTS_FL) - migrate = 1; - - if (flags & EXT4_EOFBLOCKS_FL) { - /* we don't support adding EOFBLOCKS flag */ - if (!(oldflags & EXT4_EOFBLOCKS_FL)) { - err = -EOPNOTSUPP; - goto flags_out; - } - } else if (oldflags & EXT4_EOFBLOCKS_FL) - ext4_truncate(inode); - - handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); - if (IS_ERR(handle)) { - err = PTR_ERR(handle); - goto flags_out; - } - if (IS_SYNC(inode)) - ext4_handle_sync(handle); - err = ext4_reserve_inode_write(handle, inode, &iloc); - if (err) - goto flags_err; - - for (i = 0, mask = 1; i < 32; i++, mask <<= 1) { - if (!(mask & EXT4_FL_USER_MODIFIABLE)) - continue; - if (mask & flags) - ext4_set_inode_flag(inode, i); - else - ext4_clear_inode_flag(inode, i); - } - - ext4_set_inode_flags(inode); - inode->i_ctime = ext4_current_time(inode); - - err = ext4_mark_iloc_dirty(handle, inode, &iloc); -flags_err: - ext4_journal_stop(handle); - if (err) - goto flags_out; - - if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) - err = ext4_change_inode_journal_flag(inode, jflag); - if (err) - goto flags_out; - if (migrate) { - if (flags & EXT4_EXTENTS_FL) - err = ext4_ext_migrate(inode); - else - err = ext4_ind_migrate(inode); - } - -flags_out: - mutex_unlock(&inode->i_mutex); + inode_lock(inode); + err = ext4_ioctl_setflags(inode, flags); + inode_unlock(inode); mnt_drop_write_file(filp); return err; } @@ -349,7 +497,7 @@ flags_out: goto setversion_out; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { err = PTR_ERR(handle); @@ -364,7 +512,7 @@ flags_out: ext4_journal_stop(handle); unlock_out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); setversion_out: mnt_drop_write_file(filp); return err; @@ -510,9 +658,9 @@ group_add_out: * ext4_ext_swap_inode_data before we switch the * inode format to prevent read. 
*/ - mutex_lock(&(inode->i_mutex)); + inode_lock((inode)); err = ext4_ext_migrate(inode); - mutex_unlock(&(inode->i_mutex)); + inode_unlock((inode)); mnt_drop_write_file(filp); return err; } @@ -689,6 +837,60 @@ encryption_policy_out: return -EOPNOTSUPP; #endif } + case EXT4_IOC_FSGETXATTR: + { + struct fsxattr fa; + + memset(&fa, 0, sizeof(struct fsxattr)); + ext4_get_inode_flags(ei); + fa.fsx_xflags = ext4_iflags_to_xflags(ei->i_flags & EXT4_FL_USER_VISIBLE); + + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_PROJECT)) { + fa.fsx_projid = (__u32)from_kprojid(&init_user_ns, + EXT4_I(inode)->i_projid); + } + + if (copy_to_user((struct fsxattr __user *)arg, + &fa, sizeof(fa))) + return -EFAULT; + return 0; + } + case EXT4_IOC_FSSETXATTR: + { + struct fsxattr fa; + int err; + + if (copy_from_user(&fa, (struct fsxattr __user *)arg, + sizeof(fa))) + return -EFAULT; + + /* Make sure caller has proper permission */ + if (!inode_owner_or_capable(inode)) + return -EACCES; + + err = mnt_want_write_file(filp); + if (err) + return err; + + flags = ext4_xflags_to_iflags(fa.fsx_xflags); + flags = ext4_mask_flags(inode->i_mode, flags); + + inode_lock(inode); + flags = (ei->i_flags & ~EXT4_FL_XFLAG_VISIBLE) | + (flags & EXT4_FL_XFLAG_VISIBLE); + err = ext4_ioctl_setflags(inode, flags); + inode_unlock(inode); + mnt_drop_write_file(filp); + if (err) + return err; + + err = ext4_ioctl_setproject(filp, fa.fsx_projid); + if (err) + return err; + + return 0; + } default: return -ENOTTY; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index f27e0c2598c5..06574dd77614 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -273,7 +273,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, struct ext4_filename *fname, struct ext4_dir_entry_2 **res_dir); static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, - struct dentry *dentry, struct inode *inode); + struct inode *dir, struct inode *inode); /* checksumming functions */ void initialize_dirent_tail(struct ext4_dir_entry_tail *t, @@ -1928,10 +1928,9 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, * directory, and adds the dentry to the indexed directory. 
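As a usage note for the EXT4_IOC_FSSETXATTR handler above: the expected userspace pattern is read-modify-write, so unrelated xflags and the extsize fields survive the round trip. A sketch, reusing the struct and GET definition from the earlier fsxattr example, with the SET number and flag value taken from the hunks above (the project ID 42 is purely illustrative):

#define FS_IOC_FSSETXATTR _IOW('X', 32, struct fsxattr)
#define FS_XFLAG_PROJINHERIT 0x00000200	/* create with parents projid */

	struct fsxattr fa;

	if (ioctl(fd, FS_IOC_FSGETXATTR, &fa) == 0) {
		fa.fsx_projid = 42;			/* illustrative project ID */
		fa.fsx_xflags |= FS_XFLAG_PROJINHERIT;	/* new children inherit it */
		if (ioctl(fd, FS_IOC_FSSETXATTR, &fa) != 0)
			perror("FS_IOC_FSSETXATTR");
	}

Note that the kernel side applies the flag change and the project ID change as two separate steps (ext4_ioctl_setflags() followed by ext4_ioctl_setproject()), so a failure in the second step can leave the first already applied.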
*/ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, - struct dentry *dentry, + struct inode *dir, struct inode *inode, struct buffer_head *bh) { - struct inode *dir = d_inode(dentry->d_parent); struct buffer_head *bh2; struct dx_root *root; struct dx_frame frames[2], *frame; @@ -2086,8 +2085,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, return retval; if (ext4_has_inline_data(dir)) { - retval = ext4_try_add_inline_entry(handle, &fname, - dentry, inode); + retval = ext4_try_add_inline_entry(handle, &fname, dir, inode); if (retval < 0) goto out; if (retval == 1) { @@ -2097,7 +2095,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, } if (is_dx(dir)) { - retval = ext4_dx_add_entry(handle, &fname, dentry, inode); + retval = ext4_dx_add_entry(handle, &fname, dir, inode); if (!retval || (retval != ERR_BAD_DX_DIR)) goto out; ext4_clear_inode_flag(dir, EXT4_INODE_INDEX); @@ -2119,7 +2117,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, if (blocks == 1 && !dx_fallback && ext4_has_feature_dir_index(sb)) { - retval = make_indexed_dir(handle, &fname, dentry, + retval = make_indexed_dir(handle, &fname, dir, inode, bh); bh = NULL; /* make_indexed_dir releases bh */ goto out; @@ -2154,12 +2152,11 @@ out: * Returns 0 for success, or a negative error value */ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, - struct dentry *dentry, struct inode *inode) + struct inode *dir, struct inode *inode) { struct dx_frame frames[2], *frame; struct dx_entry *entries, *at; struct buffer_head *bh; - struct inode *dir = d_inode(dentry->d_parent); struct super_block *sb = dir->i_sb; struct ext4_dir_entry_2 *de; int err; @@ -2756,7 +2753,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) return 0; WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && - !mutex_is_locked(&inode->i_mutex)); + !inode_is_locked(inode)); /* * Exit early if inode already is on orphan list. This is a big speedup * since we don't have to contend on the global s_orphan_lock. @@ -2838,7 +2835,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) return 0; WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && - !mutex_is_locked(&inode->i_mutex)); + !inode_is_locked(inode)); /* Do this quick check before taking global s_orphan_lock. 
*/ if (list_empty(&ei->i_orphan)) return 0; @@ -3212,6 +3209,12 @@ static int ext4_link(struct dentry *old_dentry, if (ext4_encrypted_inode(dir) && !ext4_is_child_context_consistent_with_parent(dir, inode)) return -EPERM; + + if ((ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) && + (!projid_eq(EXT4_I(dir)->i_projid, + EXT4_I(old_dentry->d_inode)->i_projid))) + return -EXDEV; + err = dquot_initialize(dir); if (err) return err; @@ -3492,6 +3495,11 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, int credits; u8 old_file_type; + if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT)) && + (!projid_eq(EXT4_I(new_dir)->i_projid, + EXT4_I(old_dentry->d_inode)->i_projid))) + return -EXDEV; + retval = dquot_initialize(old.dir); if (retval) return retval; @@ -3701,6 +3709,14 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, new.inode))) return -EPERM; + if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT) && + !projid_eq(EXT4_I(new_dir)->i_projid, + EXT4_I(old_dentry->d_inode)->i_projid)) || + (ext4_test_inode_flag(old_dir, EXT4_INODE_PROJINHERIT) && + !projid_eq(EXT4_I(old_dir)->i_projid, + EXT4_I(new_dentry->d_inode)->i_projid))) + return -EXDEV; + retval = dquot_initialize(old.dir); if (retval) return retval; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f1b56ff01208..3ed01ec011d7 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -80,6 +80,36 @@ static void ext4_destroy_lazyinit_thread(void); static void ext4_unregister_li_request(struct super_block *sb); static void ext4_clear_request_list(void); +/* + * Lock ordering + * + * Note the difference between i_mmap_sem (EXT4_I(inode)->i_mmap_sem) and + * i_mmap_rwsem (inode->i_mmap_rwsem)! + * + * page fault path: + * mmap_sem -> sb_start_pagefault -> i_mmap_sem (r) -> transaction start -> + * page lock -> i_data_sem (rw) + * + * buffered write path: + * sb_start_write -> i_mutex -> mmap_sem + * sb_start_write -> i_mutex -> transaction start -> page lock -> + * i_data_sem (rw) + * + * truncate: + * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (w) -> i_mmap_sem (w) -> + * i_mmap_rwsem (w) -> page lock + * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (w) -> i_mmap_sem (w) -> + * transaction start -> i_data_sem (rw) + * + * direct IO: + * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (r) -> mmap_sem + * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (r) -> + * transaction start -> i_data_sem (rw) + * + * writepages: + * transaction start -> page lock(s) -> i_data_sem (rw) + */ + #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2) static struct file_system_type ext2_fs_type = { .owner = THIS_MODULE, @@ -958,6 +988,7 @@ static void init_once(void *foo) INIT_LIST_HEAD(&ei->i_orphan); init_rwsem(&ei->xattr_sem); init_rwsem(&ei->i_data_sem); + init_rwsem(&ei->i_mmap_sem); inode_init_once(&ei->vfs_inode); } @@ -1066,8 +1097,8 @@ static int bdev_try_to_free_page(struct super_block *sb, struct page *page, } #ifdef CONFIG_QUOTA -#define QTYPE2NAME(t) ((t) == USRQUOTA ? 
"user" : "group") -#define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) +static char *quotatypes[] = INITQFNAMES; +#define QTYPE2NAME(t) (quotatypes[t]) static int ext4_write_dquot(struct dquot *dquot); static int ext4_acquire_dquot(struct dquot *dquot); @@ -1100,6 +1131,7 @@ static const struct dquot_operations ext4_quota_operations = { .write_info = ext4_write_info, .alloc_dquot = dquot_alloc, .destroy_dquot = dquot_destroy, + .get_projid = ext4_get_projid, }; static const struct quotactl_ops ext4_qctl_operations = { @@ -2254,10 +2286,10 @@ static void ext4_orphan_cleanup(struct super_block *sb, __func__, inode->i_ino, inode->i_size); jbd_debug(2, "truncating inode %lu to %lld bytes\n", inode->i_ino, inode->i_size); - mutex_lock(&inode->i_mutex); + inode_lock(inode); truncate_inode_pages(inode->i_mapping, inode->i_size); ext4_truncate(inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); nr_truncates++; } else { if (test_opt(sb, DEBUG)) @@ -2526,6 +2558,12 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly) "without CONFIG_QUOTA"); return 0; } + if (ext4_has_feature_project(sb) && !readonly) { + ext4_msg(sb, KERN_ERR, + "Filesystem with project quota feature cannot be mounted RDWR " + "without CONFIG_QUOTA"); + return 0; + } #endif /* CONFIG_QUOTA */ return 1; } @@ -3654,7 +3692,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->s_qcop = &dquot_quotactl_sysfile_ops; else sb->s_qcop = &ext4_qctl_operations; - sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; + sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; #endif memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); @@ -4790,6 +4828,48 @@ restore_opts: return err; } +#ifdef CONFIG_QUOTA +static int ext4_statfs_project(struct super_block *sb, + kprojid_t projid, struct kstatfs *buf) +{ + struct kqid qid; + struct dquot *dquot; + u64 limit; + u64 curblock; + + qid = make_kqid_projid(projid); + dquot = dqget(sb, qid); + if (IS_ERR(dquot)) + return PTR_ERR(dquot); + spin_lock(&dq_data_lock); + + limit = (dquot->dq_dqb.dqb_bsoftlimit ? + dquot->dq_dqb.dqb_bsoftlimit : + dquot->dq_dqb.dqb_bhardlimit) >> sb->s_blocksize_bits; + if (limit && buf->f_blocks > limit) { + curblock = dquot->dq_dqb.dqb_curspace >> sb->s_blocksize_bits; + buf->f_blocks = limit; + buf->f_bfree = buf->f_bavail = + (buf->f_blocks > curblock) ? + (buf->f_blocks - curblock) : 0; + } + + limit = dquot->dq_dqb.dqb_isoftlimit ? + dquot->dq_dqb.dqb_isoftlimit : + dquot->dq_dqb.dqb_ihardlimit; + if (limit && buf->f_files > limit) { + buf->f_files = limit; + buf->f_ffree = + (buf->f_files > dquot->dq_dqb.dqb_curinodes) ? 
+ (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0; + } + + spin_unlock(&dq_data_lock); + dqput(dquot); + return 0; +} +#endif + static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; @@ -4822,6 +4902,11 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; +#ifdef CONFIG_QUOTA + if (ext4_test_inode_flag(dentry->d_inode, EXT4_INODE_PROJINHERIT) && + sb_has_quota_limits_enabled(sb, PRJQUOTA)) + ext4_statfs_project(sb, EXT4_I(dentry->d_inode)->i_projid, buf); +#endif return 0; } @@ -4986,7 +5071,8 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id, struct inode *qf_inode; unsigned long qf_inums[EXT4_MAXQUOTAS] = { le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), - le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum) + le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum), + le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum) }; BUG_ON(!ext4_has_feature_quota(sb)); @@ -5014,7 +5100,8 @@ static int ext4_enable_quotas(struct super_block *sb) int type, err = 0; unsigned long qf_inums[EXT4_MAXQUOTAS] = { le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), - le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum) + le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum), + le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum) }; sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h index 011ba6670d99..c70d06a383e2 100644 --- a/fs/ext4/truncate.h +++ b/fs/ext4/truncate.h @@ -10,8 +10,10 @@ */ static inline void ext4_truncate_failed_write(struct inode *inode) { + down_write(&EXT4_I(inode)->i_mmap_sem); truncate_inode_pages(inode->i_mapping, inode->i_size); ext4_truncate(inode); + up_write(&EXT4_I(inode)->i_mmap_sem); } /* diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ac9e7c6aac74..5c06db17e41f 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -794,7 +794,7 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return ret; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); isize = i_size_read(inode); if (start >= isize) @@ -860,7 +860,7 @@ out: if (ret == 1) ret = 0; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 18ddb1e5182a..ea272be62677 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -333,7 +333,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) loff_t isize; int err = 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); isize = i_size_read(inode); if (offset >= isize) @@ -388,10 +388,10 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) found: if (whence == SEEK_HOLE && data_ofs > isize) data_ofs = isize; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return vfs_setpos(file, data_ofs, maxbytes); fail: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return -ENXIO; } @@ -1219,7 +1219,7 @@ static long f2fs_fallocate(struct file *file, int mode, FALLOC_FL_INSERT_RANGE)) return -EOPNOTSUPP; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (mode & FALLOC_FL_PUNCH_HOLE) { if (offset >= inode->i_size) @@ -1243,7 +1243,7 @@ static long f2fs_fallocate(struct file *file, int mode, } out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); trace_f2fs_fallocate(inode, mode, offset, len, ret); return ret; @@ -1307,13 +1307,13 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) flags = 
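
To make ext4_statfs_project() above easier to follow: for a PROJINHERIT directory with project quota limits enabled, statfs() results are clamped so that df run inside the subtree reports the quota limit as the filesystem size. With 4 KiB blocks, a block soft limit of 262144 blocks (1 GiB), and 300000 blocks already charged, the function yields f_blocks = 262144 and f_bfree = f_bavail = 0, because usage already exceeds the limit. The clamping rule, reduced to one illustrative helper (not in the kernel) that covers both the block and the inode case:

	/* Clamp a (total, free) pair to a quota limit, mirroring the
	 * logic in ext4_statfs_project() above. 'limit' is the soft
	 * limit if set, else the hard limit; 0 means unlimited. */
	static void clamp_to_quota(u64 *total, u64 *free, u64 limit, u64 used)
	{
		if (limit && *total > limit) {
			*total = limit;
			*free = (*total > used) ? *total - used : 0;
		}
	}
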
f2fs_mask_flags(inode->i_mode, flags); - mutex_lock(&inode->i_mutex); + inode_lock(inode); oldflags = fi->i_flags; if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { if (!capable(CAP_LINUX_IMMUTABLE)) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); ret = -EPERM; goto out; } @@ -1322,7 +1322,7 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) flags = flags & FS_FL_USER_MODIFIABLE; flags |= oldflags & ~FS_FL_USER_MODIFIABLE; fi->i_flags = flags; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); f2fs_set_inode_flags(inode); inode->i_ctime = CURRENT_TIME; @@ -1667,7 +1667,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, f2fs_balance_fs(sbi, true); - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* writeback all dirty pages in the range */ err = filemap_write_and_wait_range(inode->i_mapping, range->start, @@ -1778,7 +1778,7 @@ do_map: clear_out: clear_inode_flag(F2FS_I(inode), FI_DO_DEFRAG); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (!err) range->len = (u64)total << PAGE_CACHE_SHIFT; return err; diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 7def96caec5f..d0b95c95079b 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -769,7 +769,7 @@ static int fat_ioctl_readdir(struct inode *inode, struct file *file, buf.dirent = dirent; buf.result = 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); buf.ctx.pos = file->f_pos; ret = -ENOENT; if (!IS_DEADDIR(inode)) { @@ -777,7 +777,7 @@ static int fat_ioctl_readdir(struct inode *inode, struct file *file, short_only, both ? &buf : NULL); file->f_pos = buf.ctx.pos; } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (ret >= 0) ret = buf.result; return ret; diff --git a/fs/fat/file.c b/fs/fat/file.c index 43d3475da83a..f70185668832 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -24,9 +24,9 @@ static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr) { u32 attr; - mutex_lock(&inode->i_mutex); + inode_lock(inode); attr = fat_make_attrs(inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return put_user(attr, user_attr); } @@ -47,7 +47,7 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr) err = mnt_want_write_file(file); if (err) goto out; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* * ATTR_VOLUME and ATTR_DIR cannot be changed; this also @@ -109,7 +109,7 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr) fat_save_attrs(inode, attr); mark_inode_dirty(inode); out_unlock_inode: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); mnt_drop_write_file(file); out: return err; @@ -246,7 +246,7 @@ static long fat_fallocate(struct file *file, int mode, if (!S_ISREG(inode->i_mode)) return -EOPNOTSUPP; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (mode & FALLOC_FL_KEEP_SIZE) { ondisksize = inode->i_blocks << 9; if ((offset + len) <= ondisksize) @@ -272,7 +272,7 @@ static long fat_fallocate(struct file *file, int mode, } error: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } diff --git a/fs/filesystems.c b/fs/filesystems.c index 5797d45a78cb..c5618db110be 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -46,9 +46,9 @@ void put_filesystem(struct file_system_type *fs) static struct file_system_type **find_filesystem(const char *name, unsigned len) { struct file_system_type **p; - for (p=&file_systems; *p; p=&(*p)->next) - if (strlen((*p)->name) == len && - strncmp((*p)->name, name, len) == 0) + for (p = &file_systems; *p; p = 
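
Most of the churn in this section -- here in f2fs and fs/fat, and in nearly every filesystem below -- is the mechanical conversion of mutex_lock(&inode->i_mutex) to inode_lock(inode) and friends. The wrappers themselves are trivial; as of the 4.5-era include/linux/fs.h they are essentially the following, the point being that callers stop naming i_mutex directly so the lock's type can later change in one place:

	static inline void inode_lock(struct inode *inode)
	{
		mutex_lock(&inode->i_mutex);
	}

	static inline void inode_unlock(struct inode *inode)
	{
		mutex_unlock(&inode->i_mutex);
	}

	static inline int inode_trylock(struct inode *inode)
	{
		return mutex_trylock(&inode->i_mutex);
	}

	static inline int inode_is_locked(struct inode *inode)
	{
		return mutex_is_locked(&inode->i_mutex);
	}

	static inline void inode_lock_nested(struct inode *inode,
					     unsigned subclass)
	{
		mutex_lock_nested(&inode->i_mutex, subclass);
	}
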
&(*p)->next) + if (strncmp((*p)->name, name, len) == 0 && + !(*p)->name[len]) break; return p; } diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 712601f299b8..4b855b65d457 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -944,7 +944,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, if (!parent) return -ENOENT; - mutex_lock(&parent->i_mutex); + inode_lock(parent); if (!S_ISDIR(parent->i_mode)) goto unlock; @@ -962,7 +962,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, fuse_invalidate_entry(entry); if (child_nodeid != 0 && d_really_is_positive(entry)) { - mutex_lock(&d_inode(entry)->i_mutex); + inode_lock(d_inode(entry)); if (get_node_id(d_inode(entry)) != child_nodeid) { err = -ENOENT; goto badentry; @@ -983,7 +983,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, clear_nlink(d_inode(entry)); err = 0; badentry: - mutex_unlock(&d_inode(entry)->i_mutex); + inode_unlock(d_inode(entry)); if (!err) d_delete(entry); } else { @@ -992,7 +992,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, dput(entry); unlock: - mutex_unlock(&parent->i_mutex); + inode_unlock(parent); iput(parent); return err; } @@ -1504,7 +1504,7 @@ void fuse_set_nowrite(struct inode *inode) struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); - BUG_ON(!mutex_is_locked(&inode->i_mutex)); + BUG_ON(!inode_is_locked(inode)); spin_lock(&fc->lock); BUG_ON(fi->writectr < 0); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index aa03aab6a24f..b03d253ece15 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -207,7 +207,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) return err; if (lock_inode) - mutex_lock(&inode->i_mutex); + inode_lock(inode); err = fuse_do_open(fc, get_node_id(inode), file, isdir); @@ -215,7 +215,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) fuse_finish_open(inode, file); if (lock_inode) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } @@ -413,9 +413,9 @@ static int fuse_flush(struct file *file, fl_owner_t id) if (err) return err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); fuse_sync_writes(inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); req = fuse_get_req_nofail_nopages(fc, file); memset(&inarg, 0, sizeof(inarg)); @@ -450,7 +450,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, if (is_bad_inode(inode)) return -EIO; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* * Start writeback against all dirty pages of the inode, then @@ -486,7 +486,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, err = 0; } out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } @@ -1160,7 +1160,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) return generic_file_write_iter(iocb, from); } - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* We can write back this queue in page reclaim */ current->backing_dev_info = inode_to_bdi(inode); @@ -1210,7 +1210,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) } out: current->backing_dev_info = NULL; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return written ? 
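
The find_filesystem() rewrite just above drops the strlen() call: instead of comparing lengths and then prefixes, it prefix-compares len bytes and then checks that the registered name ends exactly there. The two forms accept the same names, since a registered name shorter than len makes strncmp() stop at its NUL and return non-zero. A standalone rendering of the new predicate (helper name ours):

	/* True iff 'candidate' (NUL-terminated) equals exactly the first
	 * 'len' bytes of 'name' -- the test used by find_filesystem(). */
	static bool fs_name_matches(const char *candidate,
				    const char *name, unsigned len)
	{
		return strncmp(candidate, name, len) == 0 &&
		       candidate[len] == '\0';
	}
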
written : err; } @@ -1322,10 +1322,10 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) { if (!write) - mutex_lock(&inode->i_mutex); + inode_lock(inode); fuse_sync_writes(inode); if (!write) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } while (count) { @@ -1413,14 +1413,14 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) return -EIO; /* Don't allow parallel writes to the same file */ - mutex_lock(&inode->i_mutex); + inode_lock(inode); res = generic_write_checks(iocb, from); if (res > 0) res = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE); fuse_invalidate_attr(inode); if (res > 0) fuse_write_update_size(inode, iocb->ki_pos); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return res; } @@ -2287,17 +2287,17 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence) retval = generic_file_llseek(file, offset, whence); break; case SEEK_END: - mutex_lock(&inode->i_mutex); + inode_lock(inode); retval = fuse_update_attributes(inode, NULL, file, NULL); if (!retval) retval = generic_file_llseek(file, offset, whence); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); break; case SEEK_HOLE: case SEEK_DATA: - mutex_lock(&inode->i_mutex); + inode_lock(inode); retval = fuse_lseek(file, offset, whence); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); break; default: retval = -EINVAL; @@ -2944,7 +2944,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, return -EOPNOTSUPP; if (lock_inode) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (mode & FALLOC_FL_PUNCH_HOLE) { loff_t endbyte = offset + length - 1; err = filemap_write_and_wait_range(inode->i_mapping, @@ -2990,7 +2990,7 @@ out: clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); if (lock_inode) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 7412863cda1e..c9384f932975 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -914,7 +914,7 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t le if ((mode & ~FALLOC_FL_KEEP_SIZE) || gfs2_is_jdata(ip)) return -EOPNOTSUPP; - mutex_lock(&inode->i_mutex); + inode_lock(inode); gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); ret = gfs2_glock_nq(&gh); @@ -946,7 +946,7 @@ out_unlock: gfs2_glock_dq(&gh); out_uninit: gfs2_holder_uninit(&gh); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 3e94400d587c..352f958769e1 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -2067,7 +2067,7 @@ static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh); if (ret) @@ -2094,7 +2094,7 @@ static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, gfs2_glock_dq_uninit(&gh); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index be6d9c450b22..a39891344259 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -888,7 +888,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) return -ENOMEM; sort(qda, num_qd, sizeof(struct gfs2_quota_data *), sort_qd, NULL); - mutex_lock(&ip->i_inode.i_mutex); + inode_lock(&ip->i_inode); for (qx = 0; qx < num_qd; qx++) { error = 
gfs2_glock_nq_init(qda[qx]->qd_gl, LM_ST_EXCLUSIVE, GL_NOCACHE, &ghs[qx]); @@ -953,7 +953,7 @@ out_alloc: out: while (qx--) gfs2_glock_dq_uninit(&ghs[qx]); - mutex_unlock(&ip->i_inode.i_mutex); + inode_unlock(&ip->i_inode); kfree(ghs); gfs2_log_flush(ip->i_gl->gl_name.ln_sbd, ip->i_gl, NORMAL_FLUSH); return error; @@ -1674,7 +1674,7 @@ static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid, if (error) goto out_put; - mutex_lock(&ip->i_inode.i_mutex); + inode_lock(&ip->i_inode); error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_EXCLUSIVE, 0, &q_gh); if (error) goto out_unlockput; @@ -1739,7 +1739,7 @@ out_i: out_q: gfs2_glock_dq_uninit(&q_gh); out_unlockput: - mutex_unlock(&ip->i_inode.i_mutex); + inode_unlock(&ip->i_inode); out_put: qd_put(qd); return error; diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c index 70788e03820a..e9f2b855f831 100644 --- a/fs/hfs/dir.c +++ b/fs/hfs/dir.c @@ -173,9 +173,9 @@ static int hfs_dir_release(struct inode *inode, struct file *file) { struct hfs_readdir_data *rd = file->private_data; if (rd) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); list_del(&rd->list); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); kfree(rd); } return 0; diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index b99ebddb10cb..6686bf39a5b5 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -570,13 +570,13 @@ static int hfs_file_release(struct inode *inode, struct file *file) if (HFS_IS_RSRC(inode)) inode = HFS_I(inode)->rsrc_inode; if (atomic_dec_and_test(&HFS_I(inode)->opencnt)) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); hfs_file_truncate(inode); //if (inode->i_flags & S_DEAD) { // hfs_delete_cat(inode->i_ino, HFSPLUS_SB(sb).hidden_dir, NULL); // hfs_delete_inode(inode); //} - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } return 0; } @@ -656,7 +656,7 @@ static int hfs_file_fsync(struct file *filp, loff_t start, loff_t end, ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* sync the inode to buffers */ ret = write_inode_now(inode, 0); @@ -668,7 +668,7 @@ static int hfs_file_fsync(struct file *filp, loff_t start, loff_t end, err = sync_blockdev(sb->s_bdev); if (!ret) ret = err; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index d0f39dcbb58e..a4e867e08947 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -284,9 +284,9 @@ static int hfsplus_dir_release(struct inode *inode, struct file *file) { struct hfsplus_readdir_data *rd = file->private_data; if (rd) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); list_del(&rd->list); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); kfree(rd); } return 0; diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 19b33f8151f1..1a6394cdb54e 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -229,14 +229,14 @@ static int hfsplus_file_release(struct inode *inode, struct file *file) if (HFSPLUS_IS_RSRC(inode)) inode = HFSPLUS_I(inode)->rsrc_inode; if (atomic_dec_and_test(&HFSPLUS_I(inode)->opencnt)) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); hfsplus_file_truncate(inode); if (inode->i_flags & S_DEAD) { hfsplus_delete_cat(inode->i_ino, HFSPLUS_SB(sb)->hidden_dir, NULL); hfsplus_delete_inode(inode); } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } return 0; } @@ -286,7 +286,7 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, error = filemap_write_and_wait_range(inode->i_mapping, start, end); if 
(error) return error; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* * Sync inode metadata into the catalog and extent trees. @@ -327,7 +327,7 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags)) blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return error; } diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c index 0624ce4e0702..32a49e292b6a 100644 --- a/fs/hfsplus/ioctl.c +++ b/fs/hfsplus/ioctl.c @@ -93,7 +93,7 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags) goto out_drop_write; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); if ((flags & (FS_IMMUTABLE_FL|FS_APPEND_FL)) || inode->i_flags & (S_IMMUTABLE|S_APPEND)) { @@ -126,7 +126,7 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags) mark_inode_dirty(inode); out_unlock_inode: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); out_drop_write: mnt_drop_write_file(file); out: diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index cfaa18c7a337..d1abbee281d1 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -378,9 +378,9 @@ static int hostfs_fsync(struct file *file, loff_t start, loff_t end, if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = fsync_file(HOSTFS_I(inode)->fd, datasync); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index dc540bfcee1d..e57a53c13d86 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -33,7 +33,7 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence) if (whence == SEEK_DATA || whence == SEEK_HOLE) return -EINVAL; - mutex_lock(&i->i_mutex); + inode_lock(i); hpfs_lock(s); /*pr_info("dir lseek\n");*/ @@ -48,12 +48,12 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence) ok: filp->f_pos = new_off; hpfs_unlock(s); - mutex_unlock(&i->i_mutex); + inode_unlock(i); return new_off; fail: /*pr_warn("illegal lseek: %016llx\n", new_off);*/ hpfs_unlock(s); - mutex_unlock(&i->i_mutex); + inode_unlock(i); return -ESPIPE; } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 8bbf7f3e2a27..e1f465a389d5 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -141,7 +141,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) vma_len = (loff_t)(vma->vm_end - vma->vm_start); - mutex_lock(&inode->i_mutex); + inode_lock(inode); file_accessed(file); ret = -ENOMEM; @@ -157,7 +157,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) if (vma->vm_flags & VM_WRITE && inode->i_size < len) inode->i_size = len; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } @@ -530,7 +530,7 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) if (hole_end > hole_start) { struct address_space *mapping = inode->i_mapping; - mutex_lock(&inode->i_mutex); + inode_lock(inode); i_mmap_lock_write(mapping); if (!RB_EMPTY_ROOT(&mapping->i_mmap)) hugetlb_vmdelete_list(&mapping->i_mmap, @@ -538,7 +538,7 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) hole_end >> PAGE_SHIFT); i_mmap_unlock_write(mapping); remove_inode_hugepages(inode, hole_start, hole_end); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } return 0; @@ -572,7 +572,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, start = offset 
>> hpage_shift; end = (offset + len + hpage_size - 1) >> hpage_shift; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ error = inode_newsize_ok(inode, offset + len); @@ -659,7 +659,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, i_size_write(inode, offset + len); inode->i_ctime = CURRENT_TIME; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return error; } diff --git a/fs/inode.c b/fs/inode.c index e491e54d2430..9f62db3bcc3e 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -495,7 +495,7 @@ void clear_inode(struct inode *inode) */ spin_lock_irq(&inode->i_data.tree_lock); BUG_ON(inode->i_data.nrpages); - BUG_ON(inode->i_data.nrshadows); + BUG_ON(inode->i_data.nrexceptional); spin_unlock_irq(&inode->i_data.tree_lock); BUG_ON(!list_empty(&inode->i_data.private_list)); BUG_ON(!(inode->i_state & I_FREEING)); @@ -966,9 +966,9 @@ void lock_two_nondirectories(struct inode *inode1, struct inode *inode2) swap(inode1, inode2); if (inode1 && !S_ISDIR(inode1->i_mode)) - mutex_lock(&inode1->i_mutex); + inode_lock(inode1); if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1) - mutex_lock_nested(&inode2->i_mutex, I_MUTEX_NONDIR2); + inode_lock_nested(inode2, I_MUTEX_NONDIR2); } EXPORT_SYMBOL(lock_two_nondirectories); @@ -980,9 +980,9 @@ EXPORT_SYMBOL(lock_two_nondirectories); void unlock_two_nondirectories(struct inode *inode1, struct inode *inode2) { if (inode1 && !S_ISDIR(inode1->i_mode)) - mutex_unlock(&inode1->i_mutex); + inode_unlock(inode1); if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1) - mutex_unlock(&inode2->i_mutex); + inode_unlock(inode2); } EXPORT_SYMBOL(unlock_two_nondirectories); diff --git a/fs/ioctl.c b/fs/ioctl.c index 29466c380958..116a333e9c77 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -434,9 +434,9 @@ int generic_block_fiemap(struct inode *inode, u64 len, get_block_t *get_block) { int ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = __generic_block_fiemap(inode, fieinfo, start, len, get_block); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } EXPORT_SYMBOL(generic_block_fiemap); diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c index a3750f902adc..0ae91ad6df2d 100644 --- a/fs/jffs2/build.c +++ b/fs/jffs2/build.c @@ -17,6 +17,7 @@ #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/mtd/mtd.h> +#include <linux/mm.h> /* kvfree() */ #include "nodelist.h" static void jffs2_build_remove_unlinked_inode(struct jffs2_sb_info *, @@ -383,12 +384,7 @@ int jffs2_do_mount_fs(struct jffs2_sb_info *c) return 0; out_free: -#ifndef __ECOS - if (jffs2_blocks_use_vmalloc(c)) - vfree(c->blocks); - else -#endif - kfree(c->blocks); + kvfree(c->blocks); return ret; } diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index f509f62e12f6..c5ac5944bc1b 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -39,10 +39,10 @@ int jffs2_fsync(struct file *filp, loff_t start, loff_t end, int datasync) if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* Trigger GC to flush any pending writes for this inode */ jffs2_flush_wbuf_gc(c, inode->i_ino); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return 0; } diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 2caf1682036d..bead25ae8fe4 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -596,10 +596,7 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent) out_root: jffs2_free_ino_caches(c); jffs2_free_raw_node_refs(c); - if (jffs2_blocks_use_vmalloc(c)) - 
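
Worth noting in the fs/inode.c hunk above: lock_two_nondirectories() makes two-inode locking deadlock-free by ordering on the inodes' addresses -- the swap() locks the lower-addressed inode first -- and by tagging the second acquisition I_MUTEX_NONDIR2 so lockdep accepts the nesting. Stripped of the S_ISDIR checks, the pattern is (a sketch, not kernel code):

	/* Lock two regular-file inodes in a globally consistent order;
	 * 'a' and 'b' may arrive in either order or be the same inode. */
	static void lock_pair(struct inode *a, struct inode *b)
	{
		if (a > b)
			swap(a, b);		/* lower address first */
		inode_lock(a);
		if (b != a)
			inode_lock_nested(b, I_MUTEX_NONDIR2);
	}
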
vfree(c->blocks); - else - kfree(c->blocks); + kvfree(c->blocks); out_inohash: jffs2_clear_xattr_subsystem(c); kfree(c->inocache_list); diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index bb080c272149..0a9a114bb9d1 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -331,10 +331,7 @@ static void jffs2_put_super (struct super_block *sb) jffs2_free_ino_caches(c); jffs2_free_raw_node_refs(c); - if (jffs2_blocks_use_vmalloc(c)) - vfree(c->blocks); - else - kfree(c->blocks); + kvfree(c->blocks); jffs2_flash_cleanup(c); kfree(c->inocache_list); jffs2_clear_xattr_subsystem(c); diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 0e026a7bdcd4..4ce7735dd042 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -38,17 +38,17 @@ int jfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) if (rc) return rc; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (!(inode->i_state & I_DIRTY_ALL) || (datasync && !(inode->i_state & I_DIRTY_DATASYNC))) { /* Make sure committed changes hit the disk */ jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return rc; } rc |= jfs_commit_inode(inode, 1); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return rc ? -EIO : 0; } diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c index 8db8b7d61e40..8653cac7e12e 100644 --- a/fs/jfs/ioctl.c +++ b/fs/jfs/ioctl.c @@ -96,7 +96,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } /* Lock against other parallel changes of flags */ - mutex_lock(&inode->i_mutex); + inode_lock(inode); jfs_get_inode_flags(jfs_inode); oldflags = jfs_inode->mode2; @@ -109,7 +109,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) ((flags ^ oldflags) & (JFS_APPEND_FL | JFS_IMMUTABLE_FL))) { if (!capable(CAP_LINUX_IMMUTABLE)) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); err = -EPERM; goto setflags_out; } @@ -120,7 +120,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) jfs_inode->mode2 = flags; jfs_set_inode_flags(inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); setflags_out: diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 900925b5eb8c..4f5d85ba8e23 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -792,7 +792,7 @@ static ssize_t jfs_quota_write(struct super_block *sb, int type, struct buffer_head tmp_bh; struct buffer_head *bh; - mutex_lock(&inode->i_mutex); + inode_lock(inode); while (towrite > 0) { tocopy = sb->s_blocksize - offset < towrite ? 
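
The jffs2 hunks above (build.c, fs.c and super.c) are one cleanup repeated three times: c->blocks may come from either vmalloc() or kmalloc() depending on the flash size, and kvfree() frees either kind, so the jffs2_blocks_use_vmalloc() branch at every free site disappears. kvfree() itself (mm/util.c) is essentially:

	void kvfree(const void *addr)
	{
		if (is_vmalloc_addr(addr))
			vfree(addr);
		else
			kfree(addr);
	}
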
sb->s_blocksize - offset : towrite; @@ -824,7 +824,7 @@ static ssize_t jfs_quota_write(struct super_block *sb, int type, } out: if (len == towrite) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } if (inode->i_size < off+len-towrite) @@ -832,7 +832,7 @@ out: inode->i_version++; inode->i_mtime = inode->i_ctime = CURRENT_TIME; mark_inode_dirty(inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return len - towrite; } diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 821973853340..996b7742c90b 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -1511,9 +1511,9 @@ static loff_t kernfs_dir_fop_llseek(struct file *file, loff_t offset, struct inode *inode = file_inode(file); loff_t ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = generic_file_llseek(file, offset, whence); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/libfs.c b/fs/libfs.c index 01491299f348..0ca80b2af420 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -89,7 +89,7 @@ EXPORT_SYMBOL(dcache_dir_close); loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) { struct dentry *dentry = file->f_path.dentry; - mutex_lock(&d_inode(dentry)->i_mutex); + inode_lock(d_inode(dentry)); switch (whence) { case 1: offset += file->f_pos; @@ -97,7 +97,7 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) if (offset >= 0) break; default: - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); return -EINVAL; } if (offset != file->f_pos) { @@ -124,7 +124,7 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) spin_unlock(&dentry->d_lock); } } - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); return offset; } EXPORT_SYMBOL(dcache_dir_lseek); @@ -941,7 +941,7 @@ int __generic_file_fsync(struct file *file, loff_t start, loff_t end, if (err) return err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = sync_mapping_buffers(inode->i_mapping); if (!(inode->i_state & I_DIRTY_ALL)) goto out; @@ -953,7 +953,7 @@ int __generic_file_fsync(struct file *file, loff_t start, loff_t end, ret = err; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } EXPORT_SYMBOL(__generic_file_fsync); diff --git a/fs/locks.c b/fs/locks.c index af1ed74a657f..7c5f91be9b65 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1650,12 +1650,12 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr * bother, maybe that's a sign this just isn't a good file to * hand out a delegation on. 
*/ - if (is_deleg && !mutex_trylock(&inode->i_mutex)) + if (is_deleg && !inode_trylock(inode)) return -EAGAIN; if (is_deleg && arg == F_WRLCK) { /* Write delegations are not currently supported: */ - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); WARN_ON_ONCE(1); return -EINVAL; } @@ -1732,7 +1732,7 @@ out: spin_unlock(&ctx->flc_lock); locks_dispose_list(&dispose); if (is_deleg) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (!error && !my_fl) *flp = NULL; return error; diff --git a/fs/logfs/file.c b/fs/logfs/file.c index 1a6f0167b16a..61eaeb1b6cac 100644 --- a/fs/logfs/file.c +++ b/fs/logfs/file.c @@ -204,12 +204,12 @@ long logfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) if (err) return err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); oldflags = li->li_flags; flags &= LOGFS_FL_USER_MODIFIABLE; flags |= oldflags & ~LOGFS_FL_USER_MODIFIABLE; li->li_flags = flags; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); inode->i_ctime = CURRENT_TIME; mark_inode_dirty_sync(inode); @@ -230,11 +230,11 @@ int logfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); logfs_get_wblocks(sb, NULL, WF_LOCK); logfs_write_anchor(sb); logfs_put_wblocks(sb, NULL, WF_LOCK); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return 0; } diff --git a/fs/namei.c b/fs/namei.c index bceefd5588a2..f624d132e01e 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1629,9 +1629,9 @@ static int lookup_slow(struct nameidata *nd, struct path *path) parent = nd->path.dentry; BUG_ON(nd->inode != parent->d_inode); - mutex_lock(&parent->d_inode->i_mutex); + inode_lock(parent->d_inode); dentry = __lookup_hash(&nd->last, parent, nd->flags); - mutex_unlock(&parent->d_inode->i_mutex); + inode_unlock(parent->d_inode); if (IS_ERR(dentry)) return PTR_ERR(dentry); path->mnt = nd->path.mnt; @@ -2229,10 +2229,10 @@ struct dentry *kern_path_locked(const char *name, struct path *path) putname(filename); return ERR_PTR(-EINVAL); } - mutex_lock_nested(&path->dentry->d_inode->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); d = __lookup_hash(&last, path->dentry, 0); if (IS_ERR(d)) { - mutex_unlock(&path->dentry->d_inode->i_mutex); + inode_unlock(path->dentry->d_inode); path_put(path); } putname(filename); @@ -2282,7 +2282,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) unsigned int c; int err; - WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex)); + WARN_ON_ONCE(!inode_is_locked(base->d_inode)); this.name = name; this.len = len; @@ -2380,9 +2380,9 @@ struct dentry *lookup_one_len_unlocked(const char *name, if (ret) return ret; - mutex_lock(&base->d_inode->i_mutex); + inode_lock(base->d_inode); ret = __lookup_hash(&this, base, 0); - mutex_unlock(&base->d_inode->i_mutex); + inode_unlock(base->d_inode); return ret; } EXPORT_SYMBOL(lookup_one_len_unlocked); @@ -2463,7 +2463,7 @@ mountpoint_last(struct nameidata *nd, struct path *path) goto done; } - mutex_lock(&dir->d_inode->i_mutex); + inode_lock(dir->d_inode); dentry = d_lookup(dir, &nd->last); if (!dentry) { /* @@ -2473,16 +2473,16 @@ mountpoint_last(struct nameidata *nd, struct path *path) */ dentry = d_alloc(dir, &nd->last); if (!dentry) { - mutex_unlock(&dir->d_inode->i_mutex); + inode_unlock(dir->d_inode); return -ENOMEM; } dentry = lookup_real(dir->d_inode, dentry, nd->flags); if (IS_ERR(dentry)) { - mutex_unlock(&dir->d_inode->i_mutex); + inode_unlock(dir->d_inode); return 
PTR_ERR(dentry); } } - mutex_unlock(&dir->d_inode->i_mutex); + inode_unlock(dir->d_inode); done: if (d_is_negative(dentry)) { @@ -2672,7 +2672,7 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) struct dentry *p; if (p1 == p2) { - mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); return NULL; } @@ -2680,29 +2680,29 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) p = d_ancestor(p2, p1); if (p) { - mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(p2->d_inode, I_MUTEX_PARENT); + inode_lock_nested(p1->d_inode, I_MUTEX_CHILD); return p; } p = d_ancestor(p1, p2); if (p) { - mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); + inode_lock_nested(p2->d_inode, I_MUTEX_CHILD); return p; } - mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT2); + inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); + inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2); return NULL; } EXPORT_SYMBOL(lock_rename); void unlock_rename(struct dentry *p1, struct dentry *p2) { - mutex_unlock(&p1->d_inode->i_mutex); + inode_unlock(p1->d_inode); if (p1 != p2) { - mutex_unlock(&p2->d_inode->i_mutex); + inode_unlock(p2->d_inode); mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex); } } @@ -3141,9 +3141,9 @@ retry_lookup: * dropping this one anyway. */ } - mutex_lock(&dir->d_inode->i_mutex); + inode_lock(dir->d_inode); error = lookup_open(nd, &path, file, op, got_write, opened); - mutex_unlock(&dir->d_inode->i_mutex); + inode_unlock(dir->d_inode); if (error <= 0) { if (error) @@ -3489,7 +3489,7 @@ static struct dentry *filename_create(int dfd, struct filename *name, * Do the final lookup. 
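
The lock_rename() conversion above preserves the helper's ordering contract: if one parent directory is an ancestor of the other, the ancestor is locked first (I_MUTEX_PARENT) and the descendant second (I_MUTEX_CHILD); unrelated parents are both locked as parents under s_vfs_rename_mutex. The dentry it returns is the ancestor, which callers use as a trap: a rename whose source or target resolves to the trap would move a directory into itself. A sketch of the caller pattern (not a verbatim kernel excerpt):

	struct dentry *trap;

	trap = lock_rename(old_parent, new_parent);
	/* ... look up old_dentry and new_dentry under the locks ... */
	if (old_dentry == trap || new_dentry == trap)
		error = -EINVAL;	/* directory into itself */
	else
		error = vfs_rename(d_inode(old_parent), old_dentry,
				   d_inode(new_parent), new_dentry,
				   NULL, 0);
	unlock_rename(old_parent, new_parent);
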
*/ lookup_flags |= LOOKUP_CREATE | LOOKUP_EXCL; - mutex_lock_nested(&path->dentry->d_inode->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); dentry = __lookup_hash(&last, path->dentry, lookup_flags); if (IS_ERR(dentry)) goto unlock; @@ -3518,7 +3518,7 @@ fail: dput(dentry); dentry = ERR_PTR(error); unlock: - mutex_unlock(&path->dentry->d_inode->i_mutex); + inode_unlock(path->dentry->d_inode); if (!err2) mnt_drop_write(path->mnt); out: @@ -3538,7 +3538,7 @@ EXPORT_SYMBOL(kern_path_create); void done_path_create(struct path *path, struct dentry *dentry) { dput(dentry); - mutex_unlock(&path->dentry->d_inode->i_mutex); + inode_unlock(path->dentry->d_inode); mnt_drop_write(path->mnt); path_put(path); } @@ -3735,7 +3735,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry) return -EPERM; dget(dentry); - mutex_lock(&dentry->d_inode->i_mutex); + inode_lock(dentry->d_inode); error = -EBUSY; if (is_local_mountpoint(dentry)) @@ -3755,7 +3755,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry) detach_mounts(dentry); out: - mutex_unlock(&dentry->d_inode->i_mutex); + inode_unlock(dentry->d_inode); dput(dentry); if (!error) d_delete(dentry); @@ -3794,7 +3794,7 @@ retry: if (error) goto exit1; - mutex_lock_nested(&path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT); dentry = __lookup_hash(&last, path.dentry, lookup_flags); error = PTR_ERR(dentry); if (IS_ERR(dentry)) @@ -3810,7 +3810,7 @@ retry: exit3: dput(dentry); exit2: - mutex_unlock(&path.dentry->d_inode->i_mutex); + inode_unlock(path.dentry->d_inode); mnt_drop_write(path.mnt); exit1: path_put(&path); @@ -3856,7 +3856,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegate if (!dir->i_op->unlink) return -EPERM; - mutex_lock(&target->i_mutex); + inode_lock(target); if (is_local_mountpoint(dentry)) error = -EBUSY; else { @@ -3873,7 +3873,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegate } } out: - mutex_unlock(&target->i_mutex); + inode_unlock(target); /* We don't d_delete() NFS sillyrenamed files--they still exist. 
*/ if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) { @@ -3916,7 +3916,7 @@ retry: if (error) goto exit1; retry_deleg: - mutex_lock_nested(&path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT); dentry = __lookup_hash(&last, path.dentry, lookup_flags); error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { @@ -3934,7 +3934,7 @@ retry_deleg: exit2: dput(dentry); } - mutex_unlock(&path.dentry->d_inode->i_mutex); + inode_unlock(path.dentry->d_inode); if (inode) iput(inode); /* truncate the inode here */ inode = NULL; @@ -4086,7 +4086,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de if (error) return error; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* Make sure we don't allow creating hardlink to an unlinked file */ if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE)) error = -ENOENT; @@ -4103,7 +4103,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de inode->i_state &= ~I_LINKABLE; spin_unlock(&inode->i_lock); } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (!error) fsnotify_link(dir, inode, new_dentry); return error; @@ -4303,7 +4303,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (!is_dir || (flags & RENAME_EXCHANGE)) lock_two_nondirectories(source, target); else if (target) - mutex_lock(&target->i_mutex); + inode_lock(target); error = -EBUSY; if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry)) @@ -4356,7 +4356,7 @@ out: if (!is_dir || (flags & RENAME_EXCHANGE)) unlock_two_nondirectories(source, target); else if (target) - mutex_unlock(&target->i_mutex); + inode_unlock(target); dput(new_dentry); if (!error) { fsnotify_move(old_dir, new_dir, old_name, is_dir, diff --git a/fs/namespace.c b/fs/namespace.c index a830e1463704..4fb1691b4355 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1961,9 +1961,9 @@ static struct mountpoint *lock_mount(struct path *path) struct vfsmount *mnt; struct dentry *dentry = path->dentry; retry: - mutex_lock(&dentry->d_inode->i_mutex); + inode_lock(dentry->d_inode); if (unlikely(cant_mount(dentry))) { - mutex_unlock(&dentry->d_inode->i_mutex); + inode_unlock(dentry->d_inode); return ERR_PTR(-ENOENT); } namespace_lock(); @@ -1974,13 +1974,13 @@ retry: mp = new_mountpoint(dentry); if (IS_ERR(mp)) { namespace_unlock(); - mutex_unlock(&dentry->d_inode->i_mutex); + inode_unlock(dentry->d_inode); return mp; } return mp; } namespace_unlock(); - mutex_unlock(&path->dentry->d_inode->i_mutex); + inode_unlock(path->dentry->d_inode); path_put(path); path->mnt = mnt; dentry = path->dentry = dget(mnt->mnt_root); @@ -1992,7 +1992,7 @@ static void unlock_mount(struct mountpoint *where) struct dentry *dentry = where->m_dentry; put_mountpoint(where); namespace_unlock(); - mutex_unlock(&dentry->d_inode->i_mutex); + inode_unlock(dentry->d_inode); } static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index f0e3e9e747dd..26c2de2de13f 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -369,7 +369,7 @@ ncp_lookup_validate(struct dentry *dentry, unsigned int flags) if (!res) { struct inode *inode = d_inode(dentry); - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (finfo.i.dirEntNum == NCP_FINFO(inode)->dirEntNum) { ncp_new_dentry(dentry); val=1; @@ -377,7 +377,7 @@ ncp_lookup_validate(struct dentry *dentry, unsigned int flags) ncp_dbg(2, "found, but dirEntNum changed\n"); ncp_update_inode2(inode, &finfo); - 
mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } finished: @@ -639,9 +639,9 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx, } else { struct inode *inode = d_inode(newdent); - mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(inode, I_MUTEX_CHILD); ncp_update_inode2(inode, entry); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } if (ctl.idx >= NCP_DIRCACHE_SIZE) { diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c index 011324ce9df2..dd38ca1f2ecb 100644 --- a/fs/ncpfs/file.c +++ b/fs/ncpfs/file.c @@ -224,10 +224,10 @@ ncp_file_write_iter(struct kiocb *iocb, struct iov_iter *from) iocb->ki_pos = pos; if (pos > i_size_read(inode)) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (pos > i_size_read(inode)) i_size_write(inode, pos); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } ncp_dbg(1, "exit %pD2\n", file); outrel: diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index c82a21228a34..9cce67043f92 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -940,7 +940,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence) dfprintk(FILE, "NFS: llseek dir(%pD2, %lld, %d)\n", filp, offset, whence); - mutex_lock(&inode->i_mutex); + inode_lock(inode); switch (whence) { case 1: offset += filp->f_pos; @@ -957,7 +957,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence) dir_ctx->duped = 0; } out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return offset; } @@ -972,9 +972,9 @@ static int nfs_fsync_dir(struct file *filp, loff_t start, loff_t end, dfprintk(FILE, "NFS: fsync dir(%pD2) datasync %d\n", filp, datasync); - mutex_lock(&inode->i_mutex); + inode_lock(inode); nfs_inc_stats(inode, NFSIOS_VFSFSYNC); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return 0; } diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 7ab7ec9f4eed..7a0cfd3266e5 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -580,7 +580,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, if (!count) goto out; - mutex_lock(&inode->i_mutex); + inode_lock(inode); result = nfs_sync_mapping(mapping); if (result) goto out_unlock; @@ -608,7 +608,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, NFS_I(inode)->read_io += count; result = nfs_direct_read_schedule_iovec(dreq, iter, pos); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (!result) { result = nfs_direct_wait(dreq); @@ -622,7 +622,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, out_release: nfs_direct_req_release(dreq); out_unlock: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); out: return result; } @@ -1005,7 +1005,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) pos = iocb->ki_pos; end = (pos + iov_iter_count(iter) - 1) >> PAGE_CACHE_SHIFT; - mutex_lock(&inode->i_mutex); + inode_lock(inode); result = nfs_sync_mapping(mapping); if (result) @@ -1045,7 +1045,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) pos >> PAGE_CACHE_SHIFT, end); } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (!result) { result = nfs_direct_wait(dreq); @@ -1066,7 +1066,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) out_release: nfs_direct_req_release(dreq); out_unlock: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return result; } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 4ef8f5addcad..748bb813b8ec 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -278,9 +278,9 @@ 
nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret != 0) break; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = nfs_file_fsync_commit(file, start, end, datasync); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); /* * If nfs_file_fsync_commit detected a server reboot, then * resend all dirty pages that might have been covered by diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index bb1f4e7a3270..3384dc8e6683 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -971,7 +971,7 @@ filelayout_mark_request_commit(struct nfs_page *req, u32 i, j; if (fl->commit_through_mds) { - nfs_request_add_commit_list(req, &cinfo->mds->list, cinfo); + nfs_request_add_commit_list(req, cinfo); } else { /* Note that we are calling nfs4_fl_calc_j_index on each page * that ends up being committed to a data server. An attractive diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 6594e9f903a0..5bcd92d50e82 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1948,11 +1948,9 @@ ff_layout_encode_layoutreturn(struct pnfs_layout_hdr *lo, start = xdr_reserve_space(xdr, 4); BUG_ON(!start); - if (ff_layout_encode_ioerr(flo, xdr, args)) - goto out; - + ff_layout_encode_ioerr(flo, xdr, args); ff_layout_encode_iostats(flo, xdr, args); -out: + *start = cpu_to_be32((xdr->p - start - 1) * 4); dprintk("%s: Return\n", __func__); } diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index bd0327541366..29898a9550fa 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -218,63 +218,55 @@ static void extend_ds_error(struct nfs4_ff_layout_ds_err *err, err->length = end - err->offset; } -static bool ds_error_can_merge(struct nfs4_ff_layout_ds_err *err, u64 offset, - u64 length, int status, enum nfs_opnum4 opnum, - nfs4_stateid *stateid, - struct nfs4_deviceid *deviceid) +static int +ff_ds_error_match(const struct nfs4_ff_layout_ds_err *e1, + const struct nfs4_ff_layout_ds_err *e2) { - return err->status == status && err->opnum == opnum && - nfs4_stateid_match(&err->stateid, stateid) && - !memcmp(&err->deviceid, deviceid, sizeof(*deviceid)) && - end_offset(err->offset, err->length) >= offset && - err->offset <= end_offset(offset, length); -} - -static bool merge_ds_error(struct nfs4_ff_layout_ds_err *old, - struct nfs4_ff_layout_ds_err *new) -{ - if (!ds_error_can_merge(old, new->offset, new->length, new->status, - new->opnum, &new->stateid, &new->deviceid)) - return false; - - extend_ds_error(old, new->offset, new->length); - return true; + int ret; + + if (e1->opnum != e2->opnum) + return e1->opnum < e2->opnum ? -1 : 1; + if (e1->status != e2->status) + return e1->status < e2->status ? 
-1 : 1; + ret = memcmp(&e1->stateid, &e2->stateid, sizeof(e1->stateid)); + if (ret != 0) + return ret; + ret = memcmp(&e1->deviceid, &e2->deviceid, sizeof(e1->deviceid)); + if (ret != 0) + return ret; + if (end_offset(e1->offset, e1->length) < e2->offset) + return -1; + if (e1->offset > end_offset(e2->offset, e2->length)) + return 1; + /* If ranges overlap or are contiguous, they are the same */ + return 0; } -static bool +static void ff_layout_add_ds_error_locked(struct nfs4_flexfile_layout *flo, struct nfs4_ff_layout_ds_err *dserr) { - struct nfs4_ff_layout_ds_err *err; - - list_for_each_entry(err, &flo->error_list, list) { - if (merge_ds_error(err, dserr)) { - return true; - } - } - - list_add(&dserr->list, &flo->error_list); - return false; -} - -static bool -ff_layout_update_ds_error(struct nfs4_flexfile_layout *flo, u64 offset, - u64 length, int status, enum nfs_opnum4 opnum, - nfs4_stateid *stateid, struct nfs4_deviceid *deviceid) -{ - bool found = false; - struct nfs4_ff_layout_ds_err *err; - - list_for_each_entry(err, &flo->error_list, list) { - if (ds_error_can_merge(err, offset, length, status, opnum, - stateid, deviceid)) { - found = true; - extend_ds_error(err, offset, length); + struct nfs4_ff_layout_ds_err *err, *tmp; + struct list_head *head = &flo->error_list; + int match; + + /* Do insertion sort w/ merges */ + list_for_each_entry_safe(err, tmp, &flo->error_list, list) { + match = ff_ds_error_match(err, dserr); + if (match < 0) + continue; + if (match > 0) { + /* Add entry "dserr" _before_ entry "err" */ + head = &err->list; break; } + /* Entries match, so merge "err" into "dserr" */ + extend_ds_error(dserr, err->offset, err->length); + list_del(&err->list); + kfree(err); } - return found; + list_add_tail(&dserr->list, head); } int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, @@ -283,7 +275,6 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, gfp_t gfp_flags) { struct nfs4_ff_layout_ds_err *dserr; - bool needfree; if (status == 0) return 0; @@ -291,14 +282,6 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, if (mirror->mirror_ds == NULL) return -EINVAL; - spin_lock(&flo->generic_hdr.plh_inode->i_lock); - if (ff_layout_update_ds_error(flo, offset, length, status, opnum, - &mirror->stateid, - &mirror->mirror_ds->id_node.deviceid)) { - spin_unlock(&flo->generic_hdr.plh_inode->i_lock); - return 0; - } - spin_unlock(&flo->generic_hdr.plh_inode->i_lock); dserr = kmalloc(sizeof(*dserr), gfp_flags); if (!dserr) return -ENOMEM; @@ -313,10 +296,8 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, NFS4_DEVICEID4_SIZE); spin_lock(&flo->generic_hdr.plh_inode->i_lock); - needfree = ff_layout_add_ds_error_locked(flo, dserr); + ff_layout_add_ds_error_locked(flo, dserr); spin_unlock(&flo->generic_hdr.plh_inode->i_lock); - if (needfree) - kfree(dserr); return 0; } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 8e24d886d2c5..86faecf8f328 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -661,9 +661,9 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) trace_nfs_getattr_enter(inode); /* Flush out writes to the server in order to update c/mtime. 
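
The flexfilelayoutdev rewrite above replaces repeated merge scans with an insertion sort: ff_ds_error_match() defines a total order on (opnum, status, stateid, deviceid) and treats overlapping or contiguous byte ranges as equal, so ff_layout_add_ds_error_locked() walks the sorted list once, absorbs every "equal" entry into the new one, and inserts before the first strictly greater entry. A simplified sketch of that list discipline (types reduced to the range fields; kernel list API assumed):

	struct range_err {
		struct list_head list;
		u64 offset, length;	/* plus the keys the comparator orders on */
	};

	static void add_sorted_merged(struct list_head *sorted,
				      struct range_err *dserr,
				      int (*cmp)(const struct range_err *,
						 const struct range_err *))
	{
		struct range_err *err, *tmp;
		struct list_head *pos = sorted;	/* default: append at tail */
		u64 start, end;

		list_for_each_entry_safe(err, tmp, sorted, list) {
			int match = cmp(err, dserr);

			if (match < 0)
				continue;
			if (match > 0) {
				pos = &err->list; /* insert before 'err' */
				break;
			}
			/* "Equal" means mergeable: extend dserr, drop err. */
			start = min(dserr->offset, err->offset);
			end = max(dserr->offset + dserr->length,
				  err->offset + err->length);
			dserr->offset = start;
			dserr->length = end - start;
			list_del(&err->list);
			kfree(err);
		}
		list_add_tail(&dserr->list, pos);
	}
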
*/ if (S_ISREG(inode->i_mode)) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); err = nfs_sync_inode(inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (err) goto out; } @@ -1178,9 +1178,9 @@ static int __nfs_revalidate_mapping(struct inode *inode, spin_unlock(&inode->i_lock); trace_nfs_invalidate_mapping_enter(inode); if (may_lock) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = nfs_invalidate_mapping(inode, mapping); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } else ret = nfs_invalidate_mapping(inode, mapping); trace_nfs_invalidate_mapping_exit(inode, ret); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 4e8cc942336c..9a547aa3ec8e 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -484,7 +484,7 @@ void nfs_retry_commit(struct list_head *page_list, struct nfs_commit_info *cinfo, u32 ds_commit_idx); void nfs_commitdata_release(struct nfs_commit_data *data); -void nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst, +void nfs_request_add_commit_list(struct nfs_page *req, struct nfs_commit_info *cinfo); void nfs_request_add_commit_list_locked(struct nfs_page *req, struct list_head *dst, diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 6e8174930a48..bd25dc7077f7 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -101,13 +101,13 @@ int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len) if (!nfs_server_capable(inode, NFS_CAP_ALLOCATE)) return -EOPNOTSUPP; - mutex_lock(&inode->i_mutex); + inode_lock(inode); err = nfs42_proc_fallocate(&msg, filep, offset, len); if (err == -EOPNOTSUPP) NFS_SERVER(inode)->caps &= ~NFS_CAP_ALLOCATE; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } @@ -123,7 +123,7 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len) return -EOPNOTSUPP; nfs_wb_all(inode); - mutex_lock(&inode->i_mutex); + inode_lock(inode); err = nfs42_proc_fallocate(&msg, filep, offset, len); if (err == 0) @@ -131,7 +131,7 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len) if (err == -EOPNOTSUPP) NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 26f9a23e2b25..57ca1c8039c1 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -141,11 +141,11 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret != 0) break; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = nfs_file_fsync_commit(file, start, end, datasync); if (!ret) ret = pnfs_sync_inode(inode, !!datasync); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); /* * If nfs_file_fsync_commit detected a server reboot, then * resend all dirty pages that might have been covered by @@ -219,13 +219,13 @@ static int nfs42_clone_file_range(struct file *src_file, loff_t src_off, /* XXX: do we lock at all? what if server needs CB_RECALL_LAYOUT? 
*/ if (same_inode) { - mutex_lock(&src_inode->i_mutex); + inode_lock(src_inode); } else if (dst_inode < src_inode) { - mutex_lock_nested(&dst_inode->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(dst_inode, I_MUTEX_PARENT); + inode_lock_nested(src_inode, I_MUTEX_CHILD); } else { - mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&dst_inode->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(src_inode, I_MUTEX_PARENT); + inode_lock_nested(dst_inode, I_MUTEX_CHILD); } /* flush all pending writes on both src and dst so that server @@ -246,13 +246,13 @@ static int nfs42_clone_file_range(struct file *src_file, loff_t src_off, out_unlock: if (same_inode) { - mutex_unlock(&src_inode->i_mutex); + inode_unlock(src_inode); } else if (dst_inode < src_inode) { - mutex_unlock(&src_inode->i_mutex); - mutex_unlock(&dst_inode->i_mutex); + inode_unlock(src_inode); + inode_unlock(dst_inode); } else { - mutex_unlock(&dst_inode->i_mutex); - mutex_unlock(&src_inode->i_mutex); + inode_unlock(dst_inode); + inode_unlock(src_inode); } out: return ret; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index ce43cd6d88c6..5754835a2886 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -830,11 +830,10 @@ EXPORT_SYMBOL_GPL(nfs_request_add_commit_list_locked); * holding the nfs_page lock. */ void -nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst, - struct nfs_commit_info *cinfo) +nfs_request_add_commit_list(struct nfs_page *req, struct nfs_commit_info *cinfo) { spin_lock(cinfo->lock); - nfs_request_add_commit_list_locked(req, dst, cinfo); + nfs_request_add_commit_list_locked(req, &cinfo->mds->list, cinfo); spin_unlock(cinfo->lock); nfs_mark_page_unstable(req->wb_page, cinfo); } @@ -892,7 +891,7 @@ nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, { if (pnfs_mark_request_commit(req, lseg, cinfo, ds_commit_idx)) return; - nfs_request_add_commit_list(req, &cinfo->mds->list, cinfo); + nfs_request_add_commit_list(req, cinfo); } static void diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 819ad812c71b..4cba7865f496 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -55,10 +55,10 @@ nfsd4_security_inode_setsecctx(struct svc_fh *resfh, struct xdr_netobj *label, u struct inode *inode = d_inode(resfh->fh_dentry); int status; - mutex_lock(&inode->i_mutex); + inode_lock(inode); status = security_inode_setsecctx(resfh->fh_dentry, label->data, label->len); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (status) /* diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 79f0307a5ec8..dc8ebecf5618 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -192,7 +192,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) dir = nn->rec_file->f_path.dentry; /* lock the parent */ - mutex_lock(&d_inode(dir)->i_mutex); + inode_lock(d_inode(dir)); dentry = lookup_one_len(dname, dir, HEXDIR_LEN-1); if (IS_ERR(dentry)) { @@ -213,7 +213,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) out_put: dput(dentry); out_unlock: - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); if (status == 0) { if (nn->in_grace) { crp = nfs4_client_to_reclaim(dname, nn); @@ -286,7 +286,7 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn) } status = iterate_dir(nn->rec_file, &ctx.ctx); - mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); list_for_each_entry_safe(entry, tmp, &ctx.names, list) { if (!status) { @@ 
-302,7 +302,7 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn) list_del(&entry->list); kfree(entry); } - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); nfs4_reset_creds(original_cred); list_for_each_entry_safe(entry, tmp, &ctx.names, list) { @@ -322,7 +322,7 @@ nfsd4_unlink_clid_dir(char *name, int namlen, struct nfsd_net *nn) dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name); dir = nn->rec_file->f_path.dentry; - mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); dentry = lookup_one_len(name, dir, namlen); if (IS_ERR(dentry)) { status = PTR_ERR(dentry); @@ -335,7 +335,7 @@ nfsd4_unlink_clid_dir(char *name, int namlen, struct nfsd_net *nn) out: dput(dentry); out_unlock: - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); return status; } diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index 0770bcb543c8..f84fe6bf9aee 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -288,7 +288,7 @@ fh_lock_nested(struct svc_fh *fhp, unsigned int subclass) } inode = d_inode(dentry); - mutex_lock_nested(&inode->i_mutex, subclass); + inode_lock_nested(inode, subclass); fill_pre_wcc(fhp); fhp->fh_locked = true; } @@ -307,7 +307,7 @@ fh_unlock(struct svc_fh *fhp) { if (fhp->fh_locked) { fill_post_wcc(fhp); - mutex_unlock(&d_inode(fhp->fh_dentry)->i_mutex); + inode_unlock(d_inode(fhp->fh_dentry)); fhp->fh_locked = false; } } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 6739077f17fe..5d2a57e4c03a 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -493,9 +493,9 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp, dentry = fhp->fh_dentry; - mutex_lock(&d_inode(dentry)->i_mutex); + inode_lock(d_inode(dentry)); host_error = security_inode_setsecctx(dentry, label->data, label->len); - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); return nfserrno(host_error); } #else diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 10b22527a617..21a1e2e0d92f 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -1003,7 +1003,7 @@ int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); isize = i_size_read(inode); @@ -1113,6 +1113,6 @@ int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (ret == 1) ret = 0; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index aba43811d6ef..e8fe24882b5b 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -158,7 +158,7 @@ static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp, flags = nilfs_mask_flags(inode->i_mode, flags); - mutex_lock(&inode->i_mutex); + inode_lock(inode); oldflags = NILFS_I(inode)->i_flags; @@ -186,7 +186,7 @@ static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp, nilfs_mark_inode_dirty(inode); ret = nilfs_transaction_commit(inode->i_sb); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); mnt_drop_write_file(filp); return ret; } diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c index 9e38dafa3bc7..b2eff5816adc 100644 --- a/fs/ntfs/dir.c +++ b/fs/ntfs/dir.c @@ -1509,7 +1509,7 @@ static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end, err = filemap_write_and_wait_range(vi->i_mapping, start, end); if (err) return err; - mutex_lock(&vi->i_mutex); + inode_lock(vi); BUG_ON(!S_ISDIR(vi->i_mode)); /* If the bitmap attribute inode is in memory sync it, too. 
*/ @@ -1532,7 +1532,7 @@ static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end, else ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx. Error " "%u.", datasync ? "data" : "", vi->i_ino, -ret); - mutex_unlock(&vi->i_mutex); + inode_unlock(vi); return ret; } diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 9d383e5eff0e..bed4d427dfae 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -1944,14 +1944,14 @@ static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) ssize_t written = 0; ssize_t err; - mutex_lock(&vi->i_mutex); + inode_lock(vi); /* We can write back this queue in page reclaim. */ current->backing_dev_info = inode_to_bdi(vi); err = ntfs_prepare_file_for_write(iocb, from); if (iov_iter_count(from) && !err) written = ntfs_perform_write(file, from, iocb->ki_pos); current->backing_dev_info = NULL; - mutex_unlock(&vi->i_mutex); + inode_unlock(vi); if (likely(written > 0)) { err = generic_write_sync(file, iocb->ki_pos, written); if (err < 0) @@ -1996,7 +1996,7 @@ static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end, err = filemap_write_and_wait_range(vi->i_mapping, start, end); if (err) return err; - mutex_lock(&vi->i_mutex); + inode_lock(vi); BUG_ON(S_ISDIR(vi->i_mode)); if (!datasync || !NInoNonResident(NTFS_I(vi))) @@ -2015,7 +2015,7 @@ static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end, else ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx. Error " "%u.", datasync ? "data" : "", vi->i_ino, -ret); - mutex_unlock(&vi->i_mutex); + inode_unlock(vi); return ret; } diff --git a/fs/ntfs/quota.c b/fs/ntfs/quota.c index d80e3315cab0..9793e68ba1dd 100644 --- a/fs/ntfs/quota.c +++ b/fs/ntfs/quota.c @@ -48,7 +48,7 @@ bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol) ntfs_error(vol->sb, "Quota inodes are not open."); return false; } - mutex_lock(&vol->quota_q_ino->i_mutex); + inode_lock(vol->quota_q_ino); ictx = ntfs_index_ctx_get(NTFS_I(vol->quota_q_ino)); if (!ictx) { ntfs_error(vol->sb, "Failed to get index context."); @@ -98,7 +98,7 @@ bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol) ntfs_index_entry_mark_dirty(ictx); set_done: ntfs_index_ctx_put(ictx); - mutex_unlock(&vol->quota_q_ino->i_mutex); + inode_unlock(vol->quota_q_ino); /* * We set the flag so we do not try to mark the quotas out of date * again on remount. @@ -110,7 +110,7 @@ done: err_out: if (ictx) ntfs_index_ctx_put(ictx); - mutex_unlock(&vol->quota_q_ino->i_mutex); + inode_unlock(vol->quota_q_ino); return false; } diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 2f77f8dfb861..1b38abdaa3ed 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -1284,10 +1284,10 @@ static int check_windows_hibernation_status(ntfs_volume *vol) * Find the inode number for the hibernation file by looking up the * filename hiberfil.sys in the root directory. */ - mutex_lock(&vol->root_ino->i_mutex); + inode_lock(vol->root_ino); mref = ntfs_lookup_inode_by_name(NTFS_I(vol->root_ino), hiberfil, 12, &name); - mutex_unlock(&vol->root_ino->i_mutex); + inode_unlock(vol->root_ino); if (IS_ERR_MREF(mref)) { ret = MREF_ERR(mref); /* If the file does not exist, Windows is not hibernated. */ @@ -1377,10 +1377,10 @@ static bool load_and_init_quota(ntfs_volume *vol) * Find the inode number for the quota file by looking up the filename * $Quota in the extended system files directory $Extend. 
*/ - mutex_lock(&vol->extend_ino->i_mutex); + inode_lock(vol->extend_ino); mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), Quota, 6, &name); - mutex_unlock(&vol->extend_ino->i_mutex); + inode_unlock(vol->extend_ino); if (IS_ERR_MREF(mref)) { /* * If the file does not exist, quotas are disabled and have @@ -1460,10 +1460,10 @@ static bool load_and_init_usnjrnl(ntfs_volume *vol) * Find the inode number for the transaction log file by looking up the * filename $UsnJrnl in the extended system files directory $Extend. */ - mutex_lock(&vol->extend_ino->i_mutex); + inode_lock(vol->extend_ino); mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), UsnJrnl, 8, &name); - mutex_unlock(&vol->extend_ino->i_mutex); + inode_unlock(vol->extend_ino); if (IS_ERR_MREF(mref)) { /* * If the file does not exist, transaction logging is disabled, diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index a3ded88718c9..d002579c6f2b 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5719,7 +5719,7 @@ int ocfs2_remove_btree_range(struct inode *inode, goto bail; } - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); if (ocfs2_truncate_log_needs_flush(osb)) { ret = __ocfs2_flush_truncate_log(osb); @@ -5776,7 +5776,7 @@ int ocfs2_remove_btree_range(struct inode *inode, out_commit: ocfs2_commit_trans(osb, handle); out: - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); bail: if (meta_ac) ocfs2_free_alloc_context(meta_ac); @@ -5832,7 +5832,7 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb, struct ocfs2_dinode *di; struct ocfs2_truncate_log *tl; - BUG_ON(mutex_trylock(&tl_inode->i_mutex)); + BUG_ON(inode_trylock(tl_inode)); start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk); @@ -5980,7 +5980,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) struct ocfs2_dinode *di; struct ocfs2_truncate_log *tl; - BUG_ON(mutex_trylock(&tl_inode->i_mutex)); + BUG_ON(inode_trylock(tl_inode)); di = (struct ocfs2_dinode *) tl_bh->b_data; @@ -6008,7 +6008,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) goto out; } - mutex_lock(&data_alloc_inode->i_mutex); + inode_lock(data_alloc_inode); status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1); if (status < 0) { @@ -6035,7 +6035,7 @@ out_unlock: ocfs2_inode_unlock(data_alloc_inode, 1); out_mutex: - mutex_unlock(&data_alloc_inode->i_mutex); + inode_unlock(data_alloc_inode); iput(data_alloc_inode); out: @@ -6047,9 +6047,9 @@ int ocfs2_flush_truncate_log(struct ocfs2_super *osb) int status; struct inode *tl_inode = osb->osb_tl_inode; - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); status = __ocfs2_flush_truncate_log(osb); - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); return status; } @@ -6208,7 +6208,7 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, (unsigned long long)le64_to_cpu(tl_copy->i_blkno), num_recs); - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); for(i = 0; i < num_recs; i++) { if (ocfs2_truncate_log_needs_flush(osb)) { status = __ocfs2_flush_truncate_log(osb); @@ -6239,7 +6239,7 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, } bail_up: - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); return status; } @@ -6346,7 +6346,7 @@ static int ocfs2_free_cached_blocks(struct ocfs2_super *osb, goto out; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = ocfs2_inode_lock(inode, &di_bh, 1); if (ret) { @@ -6395,7 +6395,7 @@ out_unlock: ocfs2_inode_unlock(inode, 1); brelse(di_bh); out_mutex: - 
mutex_unlock(&inode->i_mutex); + inode_unlock(inode); iput(inode); out: while(head) { @@ -6439,7 +6439,7 @@ static int ocfs2_free_cached_clusters(struct ocfs2_super *osb, handle_t *handle; int ret = 0; - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); while (head) { if (ocfs2_truncate_log_needs_flush(osb)) { @@ -6471,7 +6471,7 @@ static int ocfs2_free_cached_clusters(struct ocfs2_super *osb, } } - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); while (head) { /* Premature exit may have left some dangling items. */ @@ -7355,7 +7355,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) goto out; } - mutex_lock(&main_bm_inode->i_mutex); + inode_lock(main_bm_inode); ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 0); if (ret < 0) { @@ -7422,7 +7422,7 @@ out_unlock: ocfs2_inode_unlock(main_bm_inode, 0); brelse(main_bm_bh); out_mutex: - mutex_unlock(&main_bm_inode->i_mutex); + inode_unlock(main_bm_inode); iput(main_bm_inode); out: return ret; diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 7f604727f487..794fd1587f34 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -2046,9 +2046,9 @@ static int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb, int ret = 0; unsigned int truncated_clusters; - mutex_lock(&osb->osb_tl_inode->i_mutex); + inode_lock(osb->osb_tl_inode); truncated_clusters = osb->truncated_clusters; - mutex_unlock(&osb->osb_tl_inode->i_mutex); + inode_unlock(osb->osb_tl_inode); /* * Check whether we can succeed in allocating if we free diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index ffecf89c8c1c..e1adf285fc31 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -4361,7 +4361,7 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir, mlog_errno(ret); goto out; } - mutex_lock(&dx_alloc_inode->i_mutex); + inode_lock(dx_alloc_inode); ret = ocfs2_inode_lock(dx_alloc_inode, &dx_alloc_bh, 1); if (ret) { @@ -4410,7 +4410,7 @@ out_unlock: ocfs2_inode_unlock(dx_alloc_inode, 1); out_mutex: - mutex_unlock(&dx_alloc_inode->i_mutex); + inode_unlock(dx_alloc_inode); brelse(dx_alloc_bh); out: iput(dx_alloc_inode); diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index f92612e4b9d6..474e57f834e6 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -1390,6 +1390,7 @@ static int __ocfs2_cluster_lock(struct ocfs2_super *osb, unsigned int gen; int noqueue_attempted = 0; int dlm_locked = 0; + int kick_dc = 0; if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED)) { mlog_errno(-EINVAL); @@ -1524,7 +1525,12 @@ update_holders: unlock: lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING); + /* ocfs2_unblock_lock requeues on seeing OCFS2_LOCK_UPCONVERT_FINISHING */ + kick_dc = (lockres->l_flags & OCFS2_LOCK_BLOCKED); + spin_unlock_irqrestore(&lockres->l_lock, flags); + if (kick_dc) + ocfs2_wake_downconvert_thread(osb); out: /* * This is helping work around a lock inversion between the page lock diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index d63127932509..7cb38fdca229 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1872,7 +1872,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) return -EROFS; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* * This prevents concurrent writes on other nodes @@ -1991,7 +1991,7 @@ out_rw_unlock: ocfs2_rw_unlock(inode, 1); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } @@ -2299,7 +2299,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, appending = iocb->ki_flags
& IOCB_APPEND ? 1 : 0; direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); relock: /* @@ -2435,7 +2435,7 @@ out: ocfs2_rw_unlock(inode, rw_level); out_mutex: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (written) ret = written; @@ -2547,7 +2547,7 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence) struct inode *inode = file->f_mapping->host; int ret = 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); switch (whence) { case SEEK_SET: @@ -2585,7 +2585,7 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence) offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (ret) return ret; return offset; diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 97a563bab9a8..36294446d960 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -630,10 +630,10 @@ static int ocfs2_remove_inode(struct inode *inode, goto bail; } - mutex_lock(&inode_alloc_inode->i_mutex); + inode_lock(inode_alloc_inode); status = ocfs2_inode_lock(inode_alloc_inode, &inode_alloc_bh, 1); if (status < 0) { - mutex_unlock(&inode_alloc_inode->i_mutex); + inode_unlock(inode_alloc_inode); mlog_errno(status); goto bail; @@ -680,7 +680,7 @@ bail_commit: ocfs2_commit_trans(osb, handle); bail_unlock: ocfs2_inode_unlock(inode_alloc_inode, 1); - mutex_unlock(&inode_alloc_inode->i_mutex); + inode_unlock(inode_alloc_inode); brelse(inode_alloc_bh); bail: iput(inode_alloc_inode); @@ -751,10 +751,10 @@ static int ocfs2_wipe_inode(struct inode *inode, /* Lock the orphan dir. The lock will be held for the entire * delete_inode operation. We do this now to avoid races with * recovery completion on other nodes. */ - mutex_lock(&orphan_dir_inode->i_mutex); + inode_lock(orphan_dir_inode); status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); if (status < 0) { - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); mlog_errno(status); goto bail; @@ -803,7 +803,7 @@ bail_unlock_dir: return status; ocfs2_inode_unlock(orphan_dir_inode, 1); - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); brelse(orphan_dir_bh); bail: iput(orphan_dir_inode); diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 16b0bb482ea7..4506ec5ec2ea 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -86,7 +86,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, unsigned oldflags; int status; - mutex_lock(&inode->i_mutex); + inode_lock(inode); status = ocfs2_inode_lock(inode, &bh, 1); if (status < 0) { @@ -135,7 +135,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, bail_unlock: ocfs2_inode_unlock(inode, 1); bail: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); brelse(bh); @@ -287,7 +287,7 @@ static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb, struct ocfs2_dinode *dinode_alloc = NULL; if (inode_alloc) - mutex_lock(&inode_alloc->i_mutex); + inode_lock(inode_alloc); if (o2info_coherent(&fi->ifi_req)) { status = ocfs2_inode_lock(inode_alloc, &bh, 0); @@ -317,7 +317,7 @@ bail: ocfs2_inode_unlock(inode_alloc, 0); if (inode_alloc) - mutex_unlock(&inode_alloc->i_mutex); + inode_unlock(inode_alloc); brelse(bh); @@ -547,7 +547,7 @@ static int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb, struct ocfs2_dinode *gb_dinode = NULL; if (gb_inode) - mutex_lock(&gb_inode->i_mutex); + inode_lock(gb_inode); if (o2info_coherent(&ffg->iff_req)) { status = 
ocfs2_inode_lock(gb_inode, &bh, 0); @@ -604,7 +604,7 @@ bail: ocfs2_inode_unlock(gb_inode, 0); if (gb_inode) - mutex_unlock(&gb_inode->i_mutex); + inode_unlock(gb_inode); iput(gb_inode); brelse(bh); diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 3772a2dbb980..61b833b721d8 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -2088,7 +2088,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb, return status; } - mutex_lock(&orphan_dir_inode->i_mutex); + inode_lock(orphan_dir_inode); status = ocfs2_inode_lock(orphan_dir_inode, NULL, 0); if (status < 0) { mlog_errno(status); @@ -2106,7 +2106,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb, out_cluster: ocfs2_inode_unlock(orphan_dir_inode, 0); out: - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); return status; } @@ -2196,7 +2196,7 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, oi->ip_next_orphan = NULL; if (oi->ip_flags & OCFS2_INODE_DIO_ORPHAN_ENTRY) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = ocfs2_rw_lock(inode, 1); if (ret < 0) { mlog_errno(ret); @@ -2235,7 +2235,7 @@ unlock_inode: unlock_rw: ocfs2_rw_unlock(inode, 1); unlock_mutex: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); /* clear dio flag in ocfs2_inode_info */ oi->ip_flags &= ~OCFS2_INODE_DIO_ORPHAN_ENTRY; diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index e9c99e35f5ea..7d62c43a2c3e 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -414,7 +414,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb) goto out; } - mutex_lock(&main_bm_inode->i_mutex); + inode_lock(main_bm_inode); status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); if (status < 0) { @@ -468,7 +468,7 @@ out_unlock: ocfs2_inode_unlock(main_bm_inode, 1); out_mutex: - mutex_unlock(&main_bm_inode->i_mutex); + inode_unlock(main_bm_inode); iput(main_bm_inode); out: @@ -506,7 +506,7 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb, goto bail; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); status = ocfs2_read_inode_block_full(inode, &alloc_bh, OCFS2_BH_IGNORE_CACHE); @@ -539,7 +539,7 @@ bail: brelse(alloc_bh); if (inode) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); iput(inode); } @@ -571,7 +571,7 @@ int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb, goto out; } - mutex_lock(&main_bm_inode->i_mutex); + inode_lock(main_bm_inode); status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); if (status < 0) { @@ -601,7 +601,7 @@ out_unlock: ocfs2_inode_unlock(main_bm_inode, 1); out_mutex: - mutex_unlock(&main_bm_inode->i_mutex); + inode_unlock(main_bm_inode); brelse(main_bm_bh); @@ -643,7 +643,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, goto bail; } - mutex_lock(&local_alloc_inode->i_mutex); + inode_lock(local_alloc_inode); /* * We must double check state and allocator bits because @@ -709,7 +709,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, status = 0; bail: if (status < 0 && local_alloc_inode) { - mutex_unlock(&local_alloc_inode->i_mutex); + inode_unlock(local_alloc_inode); iput(local_alloc_inode); } diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 124471d26a73..e3d05d9901a3 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -276,7 +276,7 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context, * context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv; */ - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); 
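Every hunk in this series performs the same mechanical swap: open-coded operations on inode->i_mutex become calls into the new accessor family. For reference, the wrappers themselves are trivial; the following is a minimal sketch of how they are defined over the existing i_mutex field in include/linux/fs.h (paraphrased for illustration, not quoted verbatim from the tree):

	/* Direct accessors for inode->i_mutex, so that filesystems stop
	 * touching the field itself. */
	static inline void inode_lock(struct inode *inode)
	{
		mutex_lock(&inode->i_mutex);
	}

	static inline void inode_unlock(struct inode *inode)
	{
		mutex_unlock(&inode->i_mutex);
	}

	/* Nested variant preserves the lockdep subclass callers passed before. */
	static inline void inode_lock_nested(struct inode *inode, unsigned subclass)
	{
		mutex_lock_nested(&inode->i_mutex, subclass);
	}

	static inline int inode_trylock(struct inode *inode)
	{
		return mutex_trylock(&inode->i_mutex);
	}

	static inline int inode_is_locked(struct inode *inode)
	{
		return mutex_is_locked(&inode->i_mutex);
	}

With callers funneled through these accessors, the backing primitive can later be changed (for example, to a read-write semaphore) without another tree-wide sweep like the one above.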
if (ocfs2_truncate_log_needs_flush(osb)) { ret = __ocfs2_flush_truncate_log(osb); @@ -338,7 +338,7 @@ out_commit: ocfs2_commit_trans(osb, handle); out_unlock_mutex: - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); if (context->data_ac) { ocfs2_free_alloc_context(context->data_ac); @@ -632,7 +632,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, goto out; } - mutex_lock(&gb_inode->i_mutex); + inode_lock(gb_inode); ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1); if (ret) { @@ -640,7 +640,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, goto out_unlock_gb_mutex; } - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { @@ -708,11 +708,11 @@ out_commit: brelse(gd_bh); out_unlock_tl_inode: - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); ocfs2_inode_unlock(gb_inode, 1); out_unlock_gb_mutex: - mutex_unlock(&gb_inode->i_mutex); + inode_unlock(gb_inode); brelse(gb_bh); iput(gb_inode); @@ -905,7 +905,7 @@ static int ocfs2_move_extents(struct ocfs2_move_extents_context *context) if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) return -EROFS; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* * This prevents concurrent writes from other nodes @@ -969,7 +969,7 @@ out_inode_unlock: out_rw_unlock: ocfs2_rw_unlock(inode, 1); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return status; } diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index ab42c38031b1..6b3e87189a64 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -1045,7 +1045,7 @@ leave: if (orphan_dir) { /* This was locked for us in ocfs2_prepare_orphan_dir() */ ocfs2_inode_unlock(orphan_dir, 1); - mutex_unlock(&orphan_dir->i_mutex); + inode_unlock(orphan_dir); iput(orphan_dir); } @@ -1664,7 +1664,7 @@ bail: if (orphan_dir) { /* This was locked for us in ocfs2_prepare_orphan_dir() */ ocfs2_inode_unlock(orphan_dir, 1); - mutex_unlock(&orphan_dir->i_mutex); + inode_unlock(orphan_dir); iput(orphan_dir); } @@ -2121,11 +2121,11 @@ static int ocfs2_lookup_lock_orphan_dir(struct ocfs2_super *osb, return ret; } - mutex_lock(&orphan_dir_inode->i_mutex); + inode_lock(orphan_dir_inode); ret = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); if (ret < 0) { - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); mlog_errno(ret); @@ -2226,7 +2226,7 @@ out: if (ret) { ocfs2_inode_unlock(orphan_dir_inode, 1); - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); } @@ -2495,7 +2495,7 @@ out: ocfs2_free_alloc_context(inode_ac); /* Unroll orphan dir locking */ - mutex_unlock(&orphan_dir->i_mutex); + inode_unlock(orphan_dir); ocfs2_inode_unlock(orphan_dir, 1); iput(orphan_dir); } @@ -2602,7 +2602,7 @@ leave: if (orphan_dir) { /* This was locked for us in ocfs2_prepare_orphan_dir() */ ocfs2_inode_unlock(orphan_dir, 1); - mutex_unlock(&orphan_dir->i_mutex); + inode_unlock(orphan_dir); iput(orphan_dir); } @@ -2689,7 +2689,7 @@ int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, bail_unlock_orphan: ocfs2_inode_unlock(orphan_dir_inode, 1); - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); ocfs2_free_dir_lookup_result(&orphan_insert); @@ -2721,10 +2721,10 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, goto bail; } - mutex_lock(&orphan_dir_inode->i_mutex); + inode_lock(orphan_dir_inode); status = 
ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); if (status < 0) { - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); mlog_errno(status); goto bail; @@ -2770,7 +2770,7 @@ bail_commit: bail_unlock_orphan: ocfs2_inode_unlock(orphan_dir_inode, 1); - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); brelse(orphan_dir_bh); iput(orphan_dir_inode); @@ -2834,12 +2834,12 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, goto leave; } - mutex_lock(&orphan_dir_inode->i_mutex); + inode_lock(orphan_dir_inode); status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); if (status < 0) { mlog_errno(status); - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); goto leave; } @@ -2901,7 +2901,7 @@ out_commit: ocfs2_commit_trans(osb, handle); orphan_unlock: ocfs2_inode_unlock(orphan_dir_inode, 1); - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); leave: diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index fde9ef18cff3..9c9dd30bc945 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -308,7 +308,7 @@ int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex) WARN_ON(bh != oinfo->dqi_gqi_bh); spin_unlock(&dq_data_lock); if (ex) { - mutex_lock(&oinfo->dqi_gqinode->i_mutex); + inode_lock(oinfo->dqi_gqinode); down_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem); } else { down_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem); @@ -320,7 +320,7 @@ void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex) { if (ex) { up_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem); - mutex_unlock(&oinfo->dqi_gqinode->i_mutex); + inode_unlock(oinfo->dqi_gqinode); } else { up_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem); } diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 252119860e6c..3eff031aaf26 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -807,7 +807,7 @@ int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh) mlog_errno(ret); goto out; } - mutex_lock(&alloc_inode->i_mutex); + inode_lock(alloc_inode); ret = ocfs2_inode_lock(alloc_inode, &alloc_bh, 1); if (ret) { @@ -867,7 +867,7 @@ out_unlock: } out_mutex: if (alloc_inode) { - mutex_unlock(&alloc_inode->i_mutex); + inode_unlock(alloc_inode); iput(alloc_inode); } out: @@ -4197,7 +4197,7 @@ static int __ocfs2_reflink(struct dentry *old_dentry, goto out; } - mutex_lock_nested(&new_inode->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(new_inode, I_MUTEX_CHILD); ret = ocfs2_inode_lock_nested(new_inode, &new_bh, 1, OI_LS_REFLINK_TARGET); if (ret) { @@ -4231,7 +4231,7 @@ inode_unlock: ocfs2_inode_unlock(new_inode, 1); brelse(new_bh); out_unlock: - mutex_unlock(&new_inode->i_mutex); + inode_unlock(new_inode); out: if (!ret) { ret = filemap_fdatawait(inode->i_mapping); @@ -4402,11 +4402,11 @@ static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir, return error; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); error = dquot_initialize(dir); if (!error) error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (!error) fsnotify_create(dir, new_dentry); return error; diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c index 79b8021302b3..576b9a04873f 100644 --- a/fs/ocfs2/resize.c +++ b/fs/ocfs2/resize.c @@ -301,7 +301,7 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters) goto 
out; } - mutex_lock(&main_bm_inode->i_mutex); + inode_lock(main_bm_inode); ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); if (ret < 0) { @@ -375,7 +375,7 @@ out_unlock: ocfs2_inode_unlock(main_bm_inode, 1); out_mutex: - mutex_unlock(&main_bm_inode->i_mutex); + inode_unlock(main_bm_inode); iput(main_bm_inode); out: @@ -486,7 +486,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) goto out; } - mutex_lock(&main_bm_inode->i_mutex); + inode_lock(main_bm_inode); ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); if (ret < 0) { @@ -590,7 +590,7 @@ out_unlock: ocfs2_inode_unlock(main_bm_inode, 1); out_mutex: - mutex_unlock(&main_bm_inode->i_mutex); + inode_unlock(main_bm_inode); iput(main_bm_inode); out: diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index fc6d25f6d444..2f19aeec5482 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -141,7 +141,7 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) if (ac->ac_which != OCFS2_AC_USE_LOCAL) ocfs2_inode_unlock(inode, 1); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); iput(inode); ac->ac_inode = NULL; @@ -797,11 +797,11 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, return -EINVAL; } - mutex_lock(&alloc_inode->i_mutex); + inode_lock(alloc_inode); status = ocfs2_inode_lock(alloc_inode, &bh, 1); if (status < 0) { - mutex_unlock(&alloc_inode->i_mutex); + inode_unlock(alloc_inode); iput(alloc_inode); mlog_errno(status); @@ -2875,10 +2875,10 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) goto bail; } - mutex_lock(&inode_alloc_inode->i_mutex); + inode_lock(inode_alloc_inode); status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0); if (status < 0) { - mutex_unlock(&inode_alloc_inode->i_mutex); + inode_unlock(inode_alloc_inode); iput(inode_alloc_inode); mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n", (u32)suballoc_slot, status); @@ -2891,7 +2891,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) mlog(ML_ERROR, "test suballoc bit failed %d\n", status); ocfs2_inode_unlock(inode_alloc_inode, 0); - mutex_unlock(&inode_alloc_inode->i_mutex); + inode_unlock(inode_alloc_inode); iput(inode_alloc_inode); brelse(alloc_bh); diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index f0e241ffd94f..7d3d979f57d9 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -2524,7 +2524,7 @@ static int ocfs2_xattr_free_block(struct inode *inode, mlog_errno(ret); goto out; } - mutex_lock(&xb_alloc_inode->i_mutex); + inode_lock(xb_alloc_inode); ret = ocfs2_inode_lock(xb_alloc_inode, &xb_alloc_bh, 1); if (ret < 0) { @@ -2549,7 +2549,7 @@ out_unlock: ocfs2_inode_unlock(xb_alloc_inode, 1); brelse(xb_alloc_bh); out_mutex: - mutex_unlock(&xb_alloc_inode->i_mutex); + inode_unlock(xb_alloc_inode); iput(xb_alloc_inode); out: brelse(blk_bh); @@ -3619,17 +3619,17 @@ int ocfs2_xattr_set(struct inode *inode, } } - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); if (ocfs2_truncate_log_needs_flush(osb)) { ret = __ocfs2_flush_truncate_log(osb); if (ret < 0) { - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); mlog_errno(ret); goto cleanup; } } - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); ret = ocfs2_init_xattr_set_ctxt(inode, di, &xi, &xis, &xbs, &ctxt, ref_meta, &credits); @@ -5460,7 +5460,7 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode, return ret; } - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); if (ocfs2_truncate_log_needs_flush(osb)) { ret = 
__ocfs2_flush_truncate_log(osb); @@ -5504,7 +5504,7 @@ out_commit: out: ocfs2_schedule_truncate_log_flush(osb, 1); - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); if (meta_ac) ocfs2_free_alloc_context(meta_ac); diff --git a/fs/open.c b/fs/open.c index b25b1542c530..55bdc75e2172 100644 --- a/fs/open.c +++ b/fs/open.c @@ -58,10 +58,10 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, if (ret) newattrs.ia_valid |= ret | ATTR_FORCE; - mutex_lock(&dentry->d_inode->i_mutex); + inode_lock(dentry->d_inode); /* Note any delegations or leases have already been broken: */ ret = notify_change(dentry, &newattrs, NULL); - mutex_unlock(&dentry->d_inode->i_mutex); + inode_unlock(dentry->d_inode); return ret; } @@ -510,7 +510,7 @@ static int chmod_common(struct path *path, umode_t mode) if (error) return error; retry_deleg: - mutex_lock(&inode->i_mutex); + inode_lock(inode); error = security_path_chmod(path, mode); if (error) goto out_unlock; @@ -518,7 +518,7 @@ retry_deleg: newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; error = notify_change(path->dentry, &newattrs, &delegated_inode); out_unlock: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) @@ -593,11 +593,11 @@ retry_deleg: if (!S_ISDIR(inode->i_mode)) newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV; - mutex_lock(&inode->i_mutex); + inode_lock(inode); error = security_path_chown(path, uid, gid); if (!error) error = notify_change(path->dentry, &newattrs, &delegated_inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index eff6319d5037..d894e7cd9a86 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -248,9 +248,9 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir, if (err) goto out_cleanup; - mutex_lock(&newdentry->d_inode->i_mutex); + inode_lock(newdentry->d_inode); err = ovl_set_attr(newdentry, stat); - mutex_unlock(&newdentry->d_inode->i_mutex); + inode_unlock(newdentry->d_inode); if (err) goto out_cleanup; diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 692ceda3bc21..ed95272d57a6 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -167,7 +167,7 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode, struct dentry *newdentry; int err; - mutex_lock_nested(&udir->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(udir, I_MUTEX_PARENT); newdentry = lookup_one_len(dentry->d_name.name, upperdir, dentry->d_name.len); err = PTR_ERR(newdentry); @@ -185,7 +185,7 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode, out_dput: dput(newdentry); out_unlock: - mutex_unlock(&udir->i_mutex); + inode_unlock(udir); return err; } @@ -258,9 +258,9 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry, if (err) goto out_cleanup; - mutex_lock(&opaquedir->d_inode->i_mutex); + inode_lock(opaquedir->d_inode); err = ovl_set_attr(opaquedir, &stat); - mutex_unlock(&opaquedir->d_inode->i_mutex); + inode_unlock(opaquedir->d_inode); if (err) goto out_cleanup; @@ -599,7 +599,7 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir) struct dentry *upper = ovl_dentry_upper(dentry); int err; - mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(dir, I_MUTEX_PARENT); err = -ESTALE; if (upper->d_parent == upperdir) { /* Don't let d_delete() think it 
can reset d_inode */ @@ -619,7 +619,7 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir) * now. */ d_drop(dentry); - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); return err; } diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index bf996e574f3d..49e204560655 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -63,9 +63,9 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr) if (!err) { upperdentry = ovl_dentry_upper(dentry); - mutex_lock(&upperdentry->d_inode->i_mutex); + inode_lock(upperdentry->d_inode); err = notify_change(upperdentry, attr, NULL); - mutex_unlock(&upperdentry->d_inode->i_mutex); + inode_unlock(upperdentry->d_inode); } ovl_drop_write(dentry); out: diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index adcb1398c481..fdaf28f75e12 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -228,7 +228,7 @@ static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd) dput(dentry); } } - mutex_unlock(&dir->d_inode->i_mutex); + inode_unlock(dir->d_inode); } revert_creds(old_cred); put_cred(override_cred); @@ -399,7 +399,7 @@ static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin) loff_t res; struct ovl_dir_file *od = file->private_data; - mutex_lock(&file_inode(file)->i_mutex); + inode_lock(file_inode(file)); if (!file->f_pos) ovl_dir_reset(file); @@ -429,7 +429,7 @@ static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin) res = offset; } out_unlock: - mutex_unlock(&file_inode(file)->i_mutex); + inode_unlock(file_inode(file)); return res; } @@ -454,10 +454,10 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, ovl_path_upper(dentry, &upperpath); realfile = ovl_path_open(&upperpath, O_RDONLY); smp_mb__before_spinlock(); - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (!od->upperfile) { if (IS_ERR(realfile)) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return PTR_ERR(realfile); } od->upperfile = realfile; @@ -467,7 +467,7 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, fput(realfile); realfile = od->upperfile; } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } } @@ -479,9 +479,9 @@ static int ovl_dir_release(struct inode *inode, struct file *file) struct ovl_dir_file *od = file->private_data; if (od->cache) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); ovl_cache_put(od, file->f_path.dentry); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } fput(od->realfile); if (od->upperfile) @@ -557,7 +557,7 @@ void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list) { struct ovl_cache_entry *p; - mutex_lock_nested(&upper->d_inode->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(upper->d_inode, I_MUTEX_CHILD); list_for_each_entry(p, list, l_node) { struct dentry *dentry; @@ -575,5 +575,5 @@ void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list) ovl_cleanup(upper->d_inode, dentry); dput(dentry); } - mutex_unlock(&upper->d_inode->i_mutex); + inode_unlock(upper->d_inode); } diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index d250604f985a..8d826bd56b26 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -229,7 +229,7 @@ void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry) { struct ovl_entry *oe = dentry->d_fsdata; - WARN_ON(!mutex_is_locked(&upperdentry->d_parent->d_inode->i_mutex)); + WARN_ON(!inode_is_locked(upperdentry->d_parent->d_inode)); WARN_ON(oe->__upperdentry); BUG_ON(!upperdentry->d_inode); /* @@ 
-244,7 +244,7 @@ void ovl_dentry_version_inc(struct dentry *dentry) { struct ovl_entry *oe = dentry->d_fsdata; - WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); + WARN_ON(!inode_is_locked(dentry->d_inode)); oe->version++; } @@ -252,7 +252,7 @@ u64 ovl_dentry_version_get(struct dentry *dentry) { struct ovl_entry *oe = dentry->d_fsdata; - WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); + WARN_ON(!inode_is_locked(dentry->d_inode)); return oe->version; } @@ -375,9 +375,9 @@ static inline struct dentry *ovl_lookup_real(struct dentry *dir, { struct dentry *dentry; - mutex_lock(&dir->d_inode->i_mutex); + inode_lock(dir->d_inode); dentry = lookup_one_len(name->name, dir, name->len); - mutex_unlock(&dir->d_inode->i_mutex); + inode_unlock(dir->d_inode); if (IS_ERR(dentry)) { if (PTR_ERR(dentry) == -ENOENT) @@ -744,7 +744,7 @@ static struct dentry *ovl_workdir_create(struct vfsmount *mnt, if (err) return ERR_PTR(err); - mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(dir, I_MUTEX_PARENT); retry: work = lookup_one_len(OVL_WORKDIR_NAME, dentry, strlen(OVL_WORKDIR_NAME)); @@ -770,7 +770,7 @@ retry: goto out_dput; } out_unlock: - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); mnt_drop_write(mnt); return work; diff --git a/fs/pipe.c b/fs/pipe.c index 42cf8ddf0e55..ab8dad3ccb6a 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -38,6 +38,12 @@ unsigned int pipe_max_size = 1048576; */ unsigned int pipe_min_size = PAGE_SIZE; +/* Maximum allocatable pages per user. Hard limit is unset by default, soft + * matches default values. + */ +unsigned long pipe_user_pages_hard; +unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR; + /* * We use a start+len construction, which provides full use of the * allocated memory. @@ -583,20 +589,49 @@ pipe_fasync(int fd, struct file *filp, int on) return retval; } +static void account_pipe_buffers(struct pipe_inode_info *pipe, + unsigned long old, unsigned long new) +{ + atomic_long_add(new - old, &pipe->user->pipe_bufs); +} + +static bool too_many_pipe_buffers_soft(struct user_struct *user) +{ + return pipe_user_pages_soft && + atomic_long_read(&user->pipe_bufs) >= pipe_user_pages_soft; +} + +static bool too_many_pipe_buffers_hard(struct user_struct *user) +{ + return pipe_user_pages_hard && + atomic_long_read(&user->pipe_bufs) >= pipe_user_pages_hard; +} + struct pipe_inode_info *alloc_pipe_info(void) { struct pipe_inode_info *pipe; pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); if (pipe) { - pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL); + unsigned long pipe_bufs = PIPE_DEF_BUFFERS; + struct user_struct *user = get_current_user(); + + if (!too_many_pipe_buffers_hard(user)) { + if (too_many_pipe_buffers_soft(user)) + pipe_bufs = 1; + pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * pipe_bufs, GFP_KERNEL); + } + if (pipe->bufs) { init_waitqueue_head(&pipe->wait); pipe->r_counter = pipe->w_counter = 1; - pipe->buffers = PIPE_DEF_BUFFERS; + pipe->buffers = pipe_bufs; + pipe->user = user; + account_pipe_buffers(pipe, 0, pipe_bufs); mutex_init(&pipe->mutex); return pipe; } + free_uid(user); kfree(pipe); } @@ -607,6 +642,8 @@ void free_pipe_info(struct pipe_inode_info *pipe) { int i; + account_pipe_buffers(pipe, pipe->buffers, 0); + free_uid(pipe->user); for (i = 0; i < pipe->buffers; i++) { struct pipe_buffer *buf = pipe->bufs + i; if (buf->ops) @@ -998,6 +1035,7 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages) memcpy(bufs + head, pipe->bufs, tail * 
sizeof(struct pipe_buffer)); } + account_pipe_buffers(pipe, pipe->buffers, nr_pages); pipe->curbuf = 0; kfree(pipe->bufs); pipe->bufs = bufs; @@ -1069,6 +1107,11 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) { ret = -EPERM; goto out; + } else if ((too_many_pipe_buffers_hard(pipe->user) || + too_many_pipe_buffers_soft(pipe->user)) && + !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out; } ret = pipe_set_size(pipe, nr_pages); break; diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 92e6726f6e37..a939f5ed7f89 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -552,9 +552,9 @@ static int open_kcore(struct inode *inode, struct file *filp) if (kcore_need_update) kcore_update_ram(); if (i_size_read(inode) != proc_root_kcore->size) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); i_size_write(inode, proc_root_kcore->size); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } return 0; } diff --git a/fs/proc/self.c b/fs/proc/self.c index 67e8db442cf0..b6a8d3529fea 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -50,7 +50,7 @@ int proc_setup_self(struct super_block *s) struct pid_namespace *ns = s->s_fs_info; struct dentry *self; - mutex_lock(&root_inode->i_mutex); + inode_lock(root_inode); self = d_alloc_name(s->s_root, "self"); if (self) { struct inode *inode = new_inode_pseudo(s); @@ -69,7 +69,7 @@ int proc_setup_self(struct super_block *s) } else { self = ERR_PTR(-ENOMEM); } - mutex_unlock(&root_inode->i_mutex); + inode_unlock(root_inode); if (IS_ERR(self)) { pr_err("proc_fill_super: can't allocate /proc/self\n"); return PTR_ERR(self); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 71ffc91060f6..85d16c67c33e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -602,7 +602,8 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pte_t *pte; spinlock_t *ptl; - if (pmd_trans_huge_lock(pmd, vma, &ptl)) { + ptl = pmd_trans_huge_lock(pmd, vma); + if (ptl) { smaps_pmd_entry(pmd, addr, walk); spin_unlock(ptl); return 0; @@ -913,7 +914,8 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, spinlock_t *ptl; struct page *page; - if (pmd_trans_huge_lock(pmd, vma, &ptl)) { + ptl = pmd_trans_huge_lock(pmd, vma); + if (ptl) { if (cp->type == CLEAR_REFS_SOFT_DIRTY) { clear_soft_dirty_pmd(vma, addr, pmd); goto out; @@ -1187,7 +1189,8 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, int err = 0; #ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (pmd_trans_huge_lock(pmdp, vma, &ptl)) { + ptl = pmd_trans_huge_lock(pmdp, vma); + if (ptl) { u64 flags = 0, frame = 0; pmd_t pmd = *pmdp; @@ -1519,7 +1522,8 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, pte_t *orig_pte; pte_t *pte; - if (pmd_trans_huge_lock(pmd, vma, &ptl)) { + ptl = pmd_trans_huge_lock(pmd, vma); + if (ptl) { pte_t huge_pte = *(pte_t *)pmd; struct page *page; diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c index 9eacd59e0360..e58a31e8fb2a 100644 --- a/fs/proc/thread_self.c +++ b/fs/proc/thread_self.c @@ -52,7 +52,7 @@ int proc_setup_thread_self(struct super_block *s) struct pid_namespace *ns = s->s_fs_info; struct dentry *thread_self; - mutex_lock(&root_inode->i_mutex); + inode_lock(root_inode); thread_self = d_alloc_name(s->s_root, "thread-self"); if (thread_self) { struct inode *inode = new_inode_pseudo(s); @@ -71,7 +71,7 @@ int proc_setup_thread_self(struct super_block *s) } else { thread_self = 
ERR_PTR(-ENOMEM); } - mutex_unlock(&root_inode->i_mutex); + inode_unlock(root_inode); if (IS_ERR(thread_self)) { pr_err("proc_fill_super: can't allocate /proc/thread_self\n"); return PTR_ERR(thread_self); diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index d8c439d813ce..dc645b66cd79 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -377,7 +377,7 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, break; } - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); dentry = d_alloc_name(root, name); if (!dentry) @@ -397,12 +397,12 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, list_add(&private->list, &allpstore); spin_unlock_irqrestore(&allpstore_lock, flags); - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); return 0; fail_lockedalloc: - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); kfree(private); fail_alloc: iput(inode); diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index fbd70af98820..3c3b81bb6dfe 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -682,9 +682,9 @@ int dquot_quota_sync(struct super_block *sb, int type) continue; if (!sb_has_quota_active(sb, cnt)) continue; - mutex_lock(&dqopt->files[cnt]->i_mutex); + inode_lock(dqopt->files[cnt]); truncate_inode_pages(&dqopt->files[cnt]->i_data, 0); - mutex_unlock(&dqopt->files[cnt]->i_mutex); + inode_unlock(dqopt->files[cnt]); } mutex_unlock(&dqopt->dqonoff_mutex); @@ -2162,12 +2162,12 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags) /* If quota was reenabled in the meantime, we have * nothing to do */ if (!sb_has_quota_loaded(sb, cnt)) { - mutex_lock(&toputinode[cnt]->i_mutex); + inode_lock(toputinode[cnt]); toputinode[cnt]->i_flags &= ~(S_IMMUTABLE | S_NOATIME | S_NOQUOTA); truncate_inode_pages(&toputinode[cnt]->i_data, 0); - mutex_unlock(&toputinode[cnt]->i_mutex); + inode_unlock(toputinode[cnt]); mark_inode_dirty_sync(toputinode[cnt]); } mutex_unlock(&dqopt->dqonoff_mutex); @@ -2258,11 +2258,11 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, /* We don't want quota and atime on quota files (deadlocks * possible) Also nobody should write to the file - we use * special IO operations which ignore the immutable bit. 
*/ - mutex_lock(&inode->i_mutex); + inode_lock(inode); oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | S_NOQUOTA); inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); /* * When S_NOQUOTA is set, remove dquot references as no more * references can be added @@ -2305,12 +2305,12 @@ out_file_init: iput(inode); out_lock: if (oldflags != -1) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* Set the flags back (in the case of accidental quotaon() * on a wrong file we don't want to mess up the flags) */ inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE); inode->i_flags |= oldflags; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } mutex_unlock(&dqopt->dqonoff_mutex); out_fmt: @@ -2430,9 +2430,9 @@ int dquot_quota_on_mount(struct super_block *sb, char *qf_name, struct dentry *dentry; int error; - mutex_lock(&d_inode(sb->s_root)->i_mutex); + inode_lock(d_inode(sb->s_root)); dentry = lookup_one_len(qf_name, sb->s_root, strlen(qf_name)); - mutex_unlock(&d_inode(sb->s_root)->i_mutex); + inode_unlock(d_inode(sb->s_root)); if (IS_ERR(dentry)) return PTR_ERR(dentry); diff --git a/fs/read_write.c b/fs/read_write.c index 06b07d5a08fe..324ec271cc4e 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -238,7 +238,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int whence) struct inode *inode = file_inode(file); loff_t retval; - mutex_lock(&inode->i_mutex); + inode_lock(inode); switch (whence) { case SEEK_END: offset += i_size_read(inode); @@ -283,7 +283,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int whence) retval = offset; } out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return retval; } EXPORT_SYMBOL(default_llseek); @@ -1656,6 +1656,9 @@ next_file: mnt_drop_write_file(dst_file); next_loop: fdput(dst_fd); + + if (fatal_signal_pending(current)) + goto out; } out: diff --git a/fs/readdir.c b/fs/readdir.c index ced679179cac..e69ef3b79787 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -44,7 +44,7 @@ int iterate_dir(struct file *file, struct dir_context *ctx) fsnotify_access(file); file_accessed(file); } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); out: return res; } diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c index 4a024e2ceb9f..3abd4004184b 100644 --- a/fs/reiserfs/dir.c +++ b/fs/reiserfs/dir.c @@ -38,11 +38,11 @@ static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end, if (err) return err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); reiserfs_write_lock(inode->i_sb); err = reiserfs_commit_for_inode(inode); reiserfs_write_unlock(inode->i_sb); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (err < 0) return err; return 0; diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 96a1bcf33db4..9424a4ba93a9 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -158,7 +158,7 @@ static int reiserfs_sync_file(struct file *filp, loff_t start, loff_t end, if (err) return err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); BUG_ON(!S_ISREG(inode->i_mode)); err = sync_mapping_buffers(inode->i_mapping); reiserfs_write_lock(inode->i_sb); @@ -166,7 +166,7 @@ static int reiserfs_sync_file(struct file *filp, loff_t start, loff_t end, reiserfs_write_unlock(inode->i_sb); if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb)) blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (barrier_done < 0) return barrier_done; return (err < 0) ? 
-EIO : 0; diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index 6ec8a30a0911..036a1fc0a8c3 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -224,7 +224,7 @@ out_unlock: page_cache_release(page); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); reiserfs_write_unlock(inode->i_sb); return retval; } diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 05db7473bcb5..c0306ec8ed7b 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -288,7 +288,7 @@ static int finish_unfinished(struct super_block *s) pathrelse(&path); inode = reiserfs_iget(s, &obj_key); - if (!inode) { + if (IS_ERR_OR_NULL(inode)) { /* * the unlink almost completed, it just did not * manage to remove "save" link and release objectid diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index e5ddb4e5ea94..57e0b2310532 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -64,14 +64,14 @@ #ifdef CONFIG_REISERFS_FS_XATTR static int xattr_create(struct inode *dir, struct dentry *dentry, int mode) { - BUG_ON(!mutex_is_locked(&dir->i_mutex)); + BUG_ON(!inode_is_locked(dir)); return dir->i_op->create(dir, dentry, mode, true); } #endif static int xattr_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { - BUG_ON(!mutex_is_locked(&dir->i_mutex)); + BUG_ON(!inode_is_locked(dir)); return dir->i_op->mkdir(dir, dentry, mode); } @@ -85,11 +85,11 @@ static int xattr_unlink(struct inode *dir, struct dentry *dentry) { int error; - BUG_ON(!mutex_is_locked(&dir->i_mutex)); + BUG_ON(!inode_is_locked(dir)); - mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD); error = dir->i_op->unlink(dir, dentry); - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); if (!error) d_delete(dentry); @@ -100,13 +100,13 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry) { int error; - BUG_ON(!mutex_is_locked(&dir->i_mutex)); + BUG_ON(!inode_is_locked(dir)); - mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD); error = dir->i_op->rmdir(dir, dentry); if (!error) d_inode(dentry)->i_flags |= S_DEAD; - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); if (!error) d_delete(dentry); @@ -123,7 +123,7 @@ static struct dentry *open_xa_root(struct super_block *sb, int flags) if (d_really_is_negative(privroot)) return ERR_PTR(-ENODATA); - mutex_lock_nested(&d_inode(privroot)->i_mutex, I_MUTEX_XATTR); + inode_lock_nested(d_inode(privroot), I_MUTEX_XATTR); xaroot = dget(REISERFS_SB(sb)->xattr_root); if (!xaroot) @@ -139,7 +139,7 @@ static struct dentry *open_xa_root(struct super_block *sb, int flags) } } - mutex_unlock(&d_inode(privroot)->i_mutex); + inode_unlock(d_inode(privroot)); return xaroot; } @@ -156,7 +156,7 @@ static struct dentry *open_xa_dir(const struct inode *inode, int flags) le32_to_cpu(INODE_PKEY(inode)->k_objectid), inode->i_generation); - mutex_lock_nested(&d_inode(xaroot)->i_mutex, I_MUTEX_XATTR); + inode_lock_nested(d_inode(xaroot), I_MUTEX_XATTR); xadir = lookup_one_len(namebuf, xaroot, strlen(namebuf)); if (!IS_ERR(xadir) && d_really_is_negative(xadir)) { @@ -170,7 +170,7 @@ static struct dentry *open_xa_dir(const struct inode *inode, int flags) } } - mutex_unlock(&d_inode(xaroot)->i_mutex); + inode_unlock(d_inode(xaroot)); dput(xaroot); return xadir; } @@ -195,7 +195,7 @@ fill_with_dentries(struct dir_context *ctx, const char *name, int namelen, container_of(ctx, struct reiserfs_dentry_buf, ctx); struct 
dentry *dentry; - WARN_ON_ONCE(!mutex_is_locked(&d_inode(dbuf->xadir)->i_mutex)); + WARN_ON_ONCE(!inode_is_locked(d_inode(dbuf->xadir))); if (dbuf->count == ARRAY_SIZE(dbuf->dentries)) return -ENOSPC; @@ -254,7 +254,7 @@ static int reiserfs_for_each_xattr(struct inode *inode, goto out_dir; } - mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_XATTR); + inode_lock_nested(d_inode(dir), I_MUTEX_XATTR); buf.xadir = dir; while (1) { @@ -276,7 +276,7 @@ static int reiserfs_for_each_xattr(struct inode *inode, break; buf.count = 0; } - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); cleanup_dentry_buf(&buf); @@ -298,13 +298,13 @@ static int reiserfs_for_each_xattr(struct inode *inode, if (!err) { int jerror; - mutex_lock_nested(&d_inode(dir->d_parent)->i_mutex, + inode_lock_nested(d_inode(dir->d_parent), I_MUTEX_XATTR); err = action(dir, data); reiserfs_write_lock(inode->i_sb); jerror = journal_end(&th); reiserfs_write_unlock(inode->i_sb); - mutex_unlock(&d_inode(dir->d_parent)->i_mutex); + inode_unlock(d_inode(dir->d_parent)); err = jerror ?: err; } } @@ -384,7 +384,7 @@ static struct dentry *xattr_lookup(struct inode *inode, const char *name, if (IS_ERR(xadir)) return ERR_CAST(xadir); - mutex_lock_nested(&d_inode(xadir)->i_mutex, I_MUTEX_XATTR); + inode_lock_nested(d_inode(xadir), I_MUTEX_XATTR); xafile = lookup_one_len(name, xadir, strlen(name)); if (IS_ERR(xafile)) { err = PTR_ERR(xafile); @@ -404,7 +404,7 @@ static struct dentry *xattr_lookup(struct inode *inode, const char *name, if (err) dput(xafile); out: - mutex_unlock(&d_inode(xadir)->i_mutex); + inode_unlock(d_inode(xadir)); dput(xadir); if (err) return ERR_PTR(err); @@ -469,7 +469,7 @@ static int lookup_and_delete_xattr(struct inode *inode, const char *name) if (IS_ERR(xadir)) return PTR_ERR(xadir); - mutex_lock_nested(&d_inode(xadir)->i_mutex, I_MUTEX_XATTR); + inode_lock_nested(d_inode(xadir), I_MUTEX_XATTR); dentry = lookup_one_len(name, xadir, strlen(name)); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); @@ -483,7 +483,7 @@ static int lookup_and_delete_xattr(struct inode *inode, const char *name) dput(dentry); out_dput: - mutex_unlock(&d_inode(xadir)->i_mutex); + inode_unlock(d_inode(xadir)); dput(xadir); return err; } @@ -580,11 +580,11 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, .ia_valid = ATTR_SIZE | ATTR_CTIME, }; - mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_XATTR); + inode_lock_nested(d_inode(dentry), I_MUTEX_XATTR); inode_dio_wait(d_inode(dentry)); err = reiserfs_setattr(dentry, &newattrs); - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); } else update_ctime(inode); out_unlock: @@ -888,9 +888,9 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size) goto out; } - mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_XATTR); + inode_lock_nested(d_inode(dir), I_MUTEX_XATTR); err = reiserfs_readdir_inode(d_inode(dir), &buf.ctx); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); if (!err) err = buf.pos; @@ -905,7 +905,7 @@ static int create_privroot(struct dentry *dentry) int err; struct inode *inode = d_inode(dentry->d_parent); - WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex)); + WARN_ON_ONCE(!inode_is_locked(inode)); err = xattr_mkdir(inode, dentry, 0700); if (err || d_really_is_negative(dentry)) { @@ -995,7 +995,7 @@ int reiserfs_lookup_privroot(struct super_block *s) int err = 0; /* If we don't have the privroot located yet - go find it */ - mutex_lock(&d_inode(s->s_root)->i_mutex); + 
inode_lock(d_inode(s->s_root)); dentry = lookup_one_len(PRIVROOT_NAME, s->s_root, strlen(PRIVROOT_NAME)); if (!IS_ERR(dentry)) { @@ -1005,7 +1005,7 @@ int reiserfs_lookup_privroot(struct super_block *s) d_inode(dentry)->i_flags |= S_PRIVATE; } else err = PTR_ERR(dentry); - mutex_unlock(&d_inode(s->s_root)->i_mutex); + inode_unlock(d_inode(s->s_root)); return err; } @@ -1025,14 +1025,14 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags) goto error; if (d_really_is_negative(privroot) && !(mount_flags & MS_RDONLY)) { - mutex_lock(&d_inode(s->s_root)->i_mutex); + inode_lock(d_inode(s->s_root)); err = create_privroot(REISERFS_SB(s)->priv_root); - mutex_unlock(&d_inode(s->s_root)->i_mutex); + inode_unlock(d_inode(s->s_root)); } if (d_really_is_positive(privroot)) { s->s_xattr = reiserfs_xattr_handlers; - mutex_lock(&d_inode(privroot)->i_mutex); + inode_lock(d_inode(privroot)); if (!REISERFS_SB(s)->xattr_root) { struct dentry *dentry; @@ -1043,7 +1043,7 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags) else err = PTR_ERR(dentry); } - mutex_unlock(&d_inode(privroot)->i_mutex); + inode_unlock(d_inode(privroot)); } error: diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index c66f2423e1f5..4a0e48f92104 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -84,9 +84,9 @@ static int tracefs_syscall_mkdir(struct inode *inode, struct dentry *dentry, umo * the files within the tracefs system. It is up to the individual * mkdir routine to handle races. */ - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); ret = tracefs_ops.mkdir(name); - mutex_lock(&inode->i_mutex); + inode_lock(inode); kfree(name); @@ -109,13 +109,13 @@ static int tracefs_syscall_rmdir(struct inode *inode, struct dentry *dentry) * This time we need to unlock not only the parent (inode) but * also the directory that is being deleted. 
*/ - mutex_unlock(&inode->i_mutex); - mutex_unlock(&dentry->d_inode->i_mutex); + inode_unlock(inode); + inode_unlock(dentry->d_inode); ret = tracefs_ops.rmdir(name); - mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); - mutex_lock(&dentry->d_inode->i_mutex); + inode_lock_nested(inode, I_MUTEX_PARENT); + inode_lock(dentry->d_inode); kfree(name); @@ -334,7 +334,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) if (!parent) parent = tracefs_mount->mnt_root; - mutex_lock(&parent->d_inode->i_mutex); + inode_lock(parent->d_inode); dentry = lookup_one_len(name, parent, strlen(name)); if (!IS_ERR(dentry) && dentry->d_inode) { dput(dentry); @@ -342,7 +342,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) } if (IS_ERR(dentry)) { - mutex_unlock(&parent->d_inode->i_mutex); + inode_unlock(parent->d_inode); simple_release_fs(&tracefs_mount, &tracefs_mount_count); } @@ -351,7 +351,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) static struct dentry *failed_creating(struct dentry *dentry) { - mutex_unlock(&dentry->d_parent->d_inode->i_mutex); + inode_unlock(dentry->d_parent->d_inode); dput(dentry); simple_release_fs(&tracefs_mount, &tracefs_mount_count); return NULL; @@ -359,7 +359,7 @@ static struct dentry *failed_creating(struct dentry *dentry) static struct dentry *end_creating(struct dentry *dentry) { - mutex_unlock(&dentry->d_parent->d_inode->i_mutex); + inode_unlock(dentry->d_parent->d_inode); return dentry; } @@ -544,9 +544,9 @@ void tracefs_remove(struct dentry *dentry) if (!parent || !parent->d_inode) return; - mutex_lock(&parent->d_inode->i_mutex); + inode_lock(parent->d_inode); ret = __tracefs_remove(dentry, parent); - mutex_unlock(&parent->d_inode->i_mutex); + inode_unlock(parent->d_inode); if (!ret) simple_release_fs(&tracefs_mount, &tracefs_mount_count); } @@ -572,7 +572,7 @@ void tracefs_remove_recursive(struct dentry *dentry) parent = dentry; down: - mutex_lock(&parent->d_inode->i_mutex); + inode_lock(parent->d_inode); loop: /* * The parent->d_subdirs is protected by the d_lock. 
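The tracefs_syscall_rmdir() hunk above is a good illustration of the lock-ordering annotations the new helpers carry. A minimal sketch of the drop-and-relock pattern, assuming a hypothetical instance_teardown() callback rather than the real tracefs_ops wiring:

static int example_rmdir(struct inode *parent, struct dentry *dentry)
{
	int ret;

	/* Both locks are held on entry; the callback may take inode
	 * locks itself, so drop them first. */
	inode_unlock(parent);
	inode_unlock(d_inode(dentry));

	ret = instance_teardown(dentry);	/* hypothetical */

	/* Re-take in parent-then-child order, telling lockdep that the
	 * parent is the outer lock. */
	inode_lock_nested(parent, I_MUTEX_PARENT);
	inode_lock(d_inode(dentry));
	return ret;
}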
Outside that @@ -587,7 +587,7 @@ void tracefs_remove_recursive(struct dentry *dentry) /* perhaps simple_empty(child) makes more sense */ if (!list_empty(&child->d_subdirs)) { spin_unlock(&parent->d_lock); - mutex_unlock(&parent->d_inode->i_mutex); + inode_unlock(parent->d_inode); parent = child; goto down; } @@ -608,10 +608,10 @@ void tracefs_remove_recursive(struct dentry *dentry) } spin_unlock(&parent->d_lock); - mutex_unlock(&parent->d_inode->i_mutex); + inode_unlock(parent->d_inode); child = parent; parent = parent->d_parent; - mutex_lock(&parent->d_inode->i_mutex); + inode_lock(parent->d_inode); if (child != dentry) /* go up */ @@ -619,7 +619,7 @@ void tracefs_remove_recursive(struct dentry *dentry) if (!__tracefs_remove(child, parent)) simple_release_fs(&tracefs_mount, &tracefs_mount_count); - mutex_unlock(&parent->d_inode->i_mutex); + inode_unlock(parent->d_inode); } /** diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index e49bd2808bf3..795992a8321e 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -515,8 +515,8 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, dbg_gen("dent '%pd' to ino %lu (nlink %d) in dir ino %lu", dentry, inode->i_ino, inode->i_nlink, dir->i_ino); - ubifs_assert(mutex_is_locked(&dir->i_mutex)); - ubifs_assert(mutex_is_locked(&inode->i_mutex)); + ubifs_assert(inode_is_locked(dir)); + ubifs_assert(inode_is_locked(inode)); err = dbg_check_synced_i_size(c, inode); if (err) @@ -572,8 +572,8 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry) dbg_gen("dent '%pd' from ino %lu (nlink %d) in dir ino %lu", dentry, inode->i_ino, inode->i_nlink, dir->i_ino); - ubifs_assert(mutex_is_locked(&dir->i_mutex)); - ubifs_assert(mutex_is_locked(&inode->i_mutex)); + ubifs_assert(inode_is_locked(dir)); + ubifs_assert(inode_is_locked(inode)); err = dbg_check_synced_i_size(c, inode); if (err) return err; @@ -661,8 +661,8 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry) dbg_gen("directory '%pd', ino %lu in dir ino %lu", dentry, inode->i_ino, dir->i_ino); - ubifs_assert(mutex_is_locked(&dir->i_mutex)); - ubifs_assert(mutex_is_locked(&inode->i_mutex)); + ubifs_assert(inode_is_locked(dir)); + ubifs_assert(inode_is_locked(inode)); err = check_dir_empty(c, d_inode(dentry)); if (err) return err; @@ -996,10 +996,10 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, dbg_gen("dent '%pd' ino %lu in dir ino %lu to dent '%pd' in dir ino %lu", old_dentry, old_inode->i_ino, old_dir->i_ino, new_dentry, new_dir->i_ino); - ubifs_assert(mutex_is_locked(&old_dir->i_mutex)); - ubifs_assert(mutex_is_locked(&new_dir->i_mutex)); + ubifs_assert(inode_is_locked(old_dir)); + ubifs_assert(inode_is_locked(new_dir)); if (unlink) - ubifs_assert(mutex_is_locked(&new_inode->i_mutex)); + ubifs_assert(inode_is_locked(new_inode)); if (unlink && is_dir) { diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index eff62801acbf..065c88f8e4b8 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1317,7 +1317,7 @@ int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) err = filemap_write_and_wait_range(inode->i_mapping, start, end); if (err) return err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* Synchronize the inode unless this is a 'datasync()' call. 
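As the ubifs assertions above show, inode_is_locked() is the read-only counterpart of inode_lock(), intended for sanity checks. A hedged sketch of the idiom in a hypothetical directory operation:

static int example_unlink(struct inode *dir, struct dentry *dentry)
{
	/* The VFS must hold both inode locks when calling ->unlink(). */
	WARN_ON_ONCE(!inode_is_locked(dir));
	WARN_ON_ONCE(!inode_is_locked(d_inode(dentry)));
	/* ... actual unlink work ... */
	return 0;
}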
*/ if (!datasync || (inode->i_state & I_DIRTY_DATASYNC)) { @@ -1332,7 +1332,7 @@ int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) */ err = ubifs_sync_wbufs_by_inode(c, inode); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index e53292d0c21b..c7f4d434d098 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -313,7 +313,7 @@ static int setxattr(struct inode *host, const char *name, const void *value, union ubifs_key key; int err, type; - ubifs_assert(mutex_is_locked(&host->i_mutex)); + ubifs_assert(inode_is_locked(host)); if (size > UBIFS_MAX_INO_DATA) return -ERANGE; @@ -550,7 +550,7 @@ int ubifs_removexattr(struct dentry *dentry, const char *name) dbg_gen("xattr '%s', ino %lu ('%pd')", name, host->i_ino, dentry); - ubifs_assert(mutex_is_locked(&host->i_mutex)); + ubifs_assert(inode_is_locked(host)); err = check_namespace(&nm); if (err < 0) diff --git a/fs/udf/file.c b/fs/udf/file.c index bddf3d071dae..1af98963d860 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -122,7 +122,7 @@ static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct udf_inode_info *iinfo = UDF_I(inode); int err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); retval = generic_write_checks(iocb, from); if (retval <= 0) @@ -136,7 +136,7 @@ static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from) (udf_file_entry_alloc_offset(inode) + end)) { err = udf_expand_file_adinicb(inode); if (err) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); udf_debug("udf_expand_adinicb: err=%d\n", err); return err; } @@ -149,7 +149,7 @@ static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from) retval = __generic_file_write_iter(iocb, from); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (retval > 0) { mark_inode_dirty(inode); @@ -223,12 +223,12 @@ static int udf_release_file(struct inode *inode, struct file *filp) * Grab i_mutex to avoid races with writes changing i_size * while we are running. 
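The ubifs_fsync() conversion completed just above follows the common shape of an ->fsync() implementation under the new helpers. A minimal sketch, assuming a hypothetical example_write_inode() and simplified error handling:

static int example_fsync(struct file *file, loff_t start, loff_t end,
			 int datasync)
{
	struct inode *inode = file->f_mapping->host;
	int err;

	err = filemap_write_and_wait_range(inode->i_mapping, start, end);
	if (err)
		return err;

	inode_lock(inode);
	if (!datasync || (inode->i_state & I_DIRTY_DATASYNC))
		err = example_write_inode(inode);	/* hypothetical */
	inode_unlock(inode);
	return err;
}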
*/ - mutex_lock(&inode->i_mutex); + inode_lock(inode); down_write(&UDF_I(inode)->i_data_sem); udf_discard_prealloc(inode); udf_truncate_tail_extent(inode); up_write(&UDF_I(inode)->i_data_sem); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } return 0; } diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 87dc16d15572..166d3ed32c39 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -262,7 +262,7 @@ int udf_expand_file_adinicb(struct inode *inode) .nr_to_write = 1, }; - WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex)); + WARN_ON_ONCE(!inode_is_locked(inode)); if (!iinfo->i_lenAlloc) { if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; diff --git a/fs/udf/super.c b/fs/udf/super.c index 0fbb4c7c72e8..a522c15a0bfd 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -279,17 +279,12 @@ static void udf_sb_free_bitmap(struct udf_bitmap *bitmap) { int i; int nr_groups = bitmap->s_nr_groups; - int size = sizeof(struct udf_bitmap) + (sizeof(struct buffer_head *) * - nr_groups); for (i = 0; i < nr_groups; i++) if (bitmap->s_block_bitmap[i]) brelse(bitmap->s_block_bitmap[i]); - if (size <= PAGE_SIZE) - kfree(bitmap); - else - vfree(bitmap); + kvfree(bitmap); } static void udf_free_partition(struct udf_part_map *map) diff --git a/fs/utimes.c b/fs/utimes.c index aa138d64560a..85c40f4f373d 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -103,9 +103,9 @@ static int utimes_common(struct path *path, struct timespec *times) } } retry_deleg: - mutex_lock(&inode->i_mutex); + inode_lock(inode); error = notify_change(path->dentry, &newattrs, &delegated_inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) diff --git a/fs/xattr.c b/fs/xattr.c index d5dd6c8b82a7..07d0e47f6a7f 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -129,7 +129,7 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value, if (error) return error; - mutex_lock(&inode->i_mutex); + inode_lock(inode); error = security_inode_setxattr(dentry, name, value, size, flags); if (error) goto out; @@ -137,7 +137,7 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value, error = __vfs_setxattr_noperm(dentry, name, value, size, flags); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return error; } EXPORT_SYMBOL_GPL(vfs_setxattr); @@ -277,7 +277,7 @@ vfs_removexattr(struct dentry *dentry, const char *name) if (error) return error; - mutex_lock(&inode->i_mutex); + inode_lock(inode); error = security_inode_removexattr(dentry, name); if (error) goto out; @@ -290,7 +290,7 @@ vfs_removexattr(struct dentry *dentry, const char *name) } out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return error; } EXPORT_SYMBOL_GPL(vfs_removexattr); diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index e2536bb1c760..dc97eb21af07 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -984,8 +984,6 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) /* * Values for di_flags - * There should be a one-to-one correspondence between these flags and the - * XFS_XFLAG_s. 
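The udf_sb_free_bitmap() change above works because kvfree() accepts memory from either allocator, so the free path no longer has to recompute how the buffer was obtained. A sketch of the pairing, assuming the usual kmalloc-with-vmalloc-fallback allocation:

	if (size <= PAGE_SIZE)
		bitmap = kmalloc(size, GFP_KERNEL);
	else
		bitmap = vmalloc(size);
	/* ... use bitmap ... */
	kvfree(bitmap);		/* correct for both allocators */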
*/ #define XFS_DIFLAG_REALTIME_BIT 0 /* file's blocks come from rt area */ #define XFS_DIFLAG_PREALLOC_BIT 1 /* file space has been preallocated */ @@ -1026,6 +1024,15 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG | XFS_DIFLAG_FILESTREAM) /* + * Values for di_flags2 These start by being exposed to userspace in the upper + * 16 bits of the XFS_XFLAG_s range. + */ +#define XFS_DIFLAG2_DAX_BIT 0 /* use DAX for this inode */ +#define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT) + +#define XFS_DIFLAG2_ANY (XFS_DIFLAG2_DAX) + +/* * Inode number format: * low inopblog bits - offset in block * next agblklog bits - block number in ag diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index b2b73a998d42..fffe3d01bd9f 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -36,40 +36,6 @@ struct dioattr { #endif /* - * Structure for XFS_IOC_FSGETXATTR[A] and XFS_IOC_FSSETXATTR. - */ -#ifndef HAVE_FSXATTR -struct fsxattr { - __u32 fsx_xflags; /* xflags field value (get/set) */ - __u32 fsx_extsize; /* extsize field value (get/set)*/ - __u32 fsx_nextents; /* nextents field value (get) */ - __u32 fsx_projid; /* project identifier (get/set) */ - unsigned char fsx_pad[12]; -}; -#endif - -/* - * Flags for the bs_xflags/fsx_xflags field - * There should be a one-to-one correspondence between these flags and the - * XFS_DIFLAG_s. - */ -#define XFS_XFLAG_REALTIME 0x00000001 /* data in realtime volume */ -#define XFS_XFLAG_PREALLOC 0x00000002 /* preallocated file extents */ -#define XFS_XFLAG_IMMUTABLE 0x00000008 /* file cannot be modified */ -#define XFS_XFLAG_APPEND 0x00000010 /* all writes append */ -#define XFS_XFLAG_SYNC 0x00000020 /* all writes synchronous */ -#define XFS_XFLAG_NOATIME 0x00000040 /* do not update access time */ -#define XFS_XFLAG_NODUMP 0x00000080 /* do not include in backups */ -#define XFS_XFLAG_RTINHERIT 0x00000100 /* create with rt bit set */ -#define XFS_XFLAG_PROJINHERIT 0x00000200 /* create with parents projid */ -#define XFS_XFLAG_NOSYMLINKS 0x00000400 /* disallow symlink creation */ -#define XFS_XFLAG_EXTSIZE 0x00000800 /* extent size allocator hint */ -#define XFS_XFLAG_EXTSZINHERIT 0x00001000 /* inherit inode extent size */ -#define XFS_XFLAG_NODEFRAG 0x00002000 /* do not defragment */ -#define XFS_XFLAG_FILESTREAM 0x00004000 /* use filestream allocator */ -#define XFS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */ - -/* * Structure for XFS_IOC_GETBMAP. 
* On input, fill in bmv_offset and bmv_length of the first structure * to indicate the area of interest in the file, and bmv_entries with @@ -514,8 +480,8 @@ typedef struct xfs_swapext #define XFS_IOC_ALLOCSP _IOW ('X', 10, struct xfs_flock64) #define XFS_IOC_FREESP _IOW ('X', 11, struct xfs_flock64) #define XFS_IOC_DIOINFO _IOR ('X', 30, struct dioattr) -#define XFS_IOC_FSGETXATTR _IOR ('X', 31, struct fsxattr) -#define XFS_IOC_FSSETXATTR _IOW ('X', 32, struct fsxattr) +#define XFS_IOC_FSGETXATTR FS_IOC_FSGETXATTR +#define XFS_IOC_FSSETXATTR FS_IOC_FSSETXATTR #define XFS_IOC_ALLOCSP64 _IOW ('X', 36, struct xfs_flock64) #define XFS_IOC_FREESP64 _IOW ('X', 37, struct xfs_flock64) #define XFS_IOC_GETBMAP _IOWR('X', 38, struct getbmap) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index daed4bfb85b2..435c7de42e5f 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1527,6 +1527,16 @@ xfs_wait_buftarg( LIST_HEAD(dispose); int loop = 0; + /* + * We need to flush the buffer workqueue to ensure that all IO + * completion processing is 100% done. Just waiting on buffer locks is + * not sufficient for async IO as the reference count held over IO is + * not released until after the buffer lock is dropped. Hence we need to + * ensure here that all reference counts have been dropped before we + * start walking the LRU list. + */ + drain_workqueue(btp->bt_mount->m_buf_workqueue); + /* loop until there is nothing left on the lru list. */ while (list_lru_count(&btp->bt_lru)) { list_lru_walk(&btp->bt_lru, xfs_buftarg_wait_rele, diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index ebe9b8290a70..52883ac3cf84 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -55,7 +55,7 @@ xfs_rw_ilock( int type) { if (type & XFS_IOLOCK_EXCL) - mutex_lock(&VFS_I(ip)->i_mutex); + inode_lock(VFS_I(ip)); xfs_ilock(ip, type); } @@ -66,7 +66,7 @@ xfs_rw_iunlock( { xfs_iunlock(ip, type); if (type & XFS_IOLOCK_EXCL) - mutex_unlock(&VFS_I(ip)->i_mutex); + inode_unlock(VFS_I(ip)); } static inline void @@ -76,7 +76,7 @@ xfs_rw_ilock_demote( { xfs_ilock_demote(ip, type); if (type & XFS_IOLOCK_EXCL) - mutex_unlock(&VFS_I(ip)->i_mutex); + inode_unlock(VFS_I(ip)); } /* @@ -1610,9 +1610,8 @@ xfs_filemap_pmd_fault( /* * pfn_mkwrite was originally intended to ensure we capture time stamp * updates on write faults. In reality, it's needed to serialise against - * truncate similar to page_mkwrite. Hence we open-code dax_pfn_mkwrite() - * here and cycle the XFS_MMAPLOCK_SHARED to ensure we serialise the fault - * barrier in place. + * truncate similar to page_mkwrite. Hence we cycle the XFS_MMAPLOCK_SHARED + * to ensure we serialise the fault barrier in place.
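With XFS_IOC_FSGETXATTR/XFS_IOC_FSSETXATTR aliased to the generic FS_IOC_* numbers above, user space can toggle the new per-inode DAX flag without XFS-specific headers. A hedged user-space sketch, assuming struct fsxattr, the FS_IOC_FS[GS]ETXATTR ioctls and FS_XFLAG_DAX are all exported via <linux/fs.h> as this series arranges:

#include <sys/ioctl.h>
#include <linux/fs.h>

static int set_dax(int fd)
{
	struct fsxattr fsx;

	if (ioctl(fd, FS_IOC_FSGETXATTR, &fsx) < 0)
		return -1;
	fsx.fsx_xflags |= FS_XFLAG_DAX;	/* lands in XFS_DIFLAG2_DAX on disk */
	return ioctl(fd, FS_IOC_FSSETXATTR, &fsx);
}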
*/ static int xfs_filemap_pfn_mkwrite( @@ -1635,6 +1634,8 @@ xfs_filemap_pfn_mkwrite( size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (vmf->pgoff >= size) ret = VM_FAULT_SIGBUS; + else if (IS_DAX(inode)) + ret = dax_pfn_mkwrite(vma, vmf); xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); sb_end_pagefault(inode->i_sb); return ret; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index ae3758a90ed6..ceba1a83cacc 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -610,60 +610,69 @@ __xfs_iflock( STATIC uint _xfs_dic2xflags( - __uint16_t di_flags) + __uint16_t di_flags, + uint64_t di_flags2, + bool has_attr) { uint flags = 0; if (di_flags & XFS_DIFLAG_ANY) { if (di_flags & XFS_DIFLAG_REALTIME) - flags |= XFS_XFLAG_REALTIME; + flags |= FS_XFLAG_REALTIME; if (di_flags & XFS_DIFLAG_PREALLOC) - flags |= XFS_XFLAG_PREALLOC; + flags |= FS_XFLAG_PREALLOC; if (di_flags & XFS_DIFLAG_IMMUTABLE) - flags |= XFS_XFLAG_IMMUTABLE; + flags |= FS_XFLAG_IMMUTABLE; if (di_flags & XFS_DIFLAG_APPEND) - flags |= XFS_XFLAG_APPEND; + flags |= FS_XFLAG_APPEND; if (di_flags & XFS_DIFLAG_SYNC) - flags |= XFS_XFLAG_SYNC; + flags |= FS_XFLAG_SYNC; if (di_flags & XFS_DIFLAG_NOATIME) - flags |= XFS_XFLAG_NOATIME; + flags |= FS_XFLAG_NOATIME; if (di_flags & XFS_DIFLAG_NODUMP) - flags |= XFS_XFLAG_NODUMP; + flags |= FS_XFLAG_NODUMP; if (di_flags & XFS_DIFLAG_RTINHERIT) - flags |= XFS_XFLAG_RTINHERIT; + flags |= FS_XFLAG_RTINHERIT; if (di_flags & XFS_DIFLAG_PROJINHERIT) - flags |= XFS_XFLAG_PROJINHERIT; + flags |= FS_XFLAG_PROJINHERIT; if (di_flags & XFS_DIFLAG_NOSYMLINKS) - flags |= XFS_XFLAG_NOSYMLINKS; + flags |= FS_XFLAG_NOSYMLINKS; if (di_flags & XFS_DIFLAG_EXTSIZE) - flags |= XFS_XFLAG_EXTSIZE; + flags |= FS_XFLAG_EXTSIZE; if (di_flags & XFS_DIFLAG_EXTSZINHERIT) - flags |= XFS_XFLAG_EXTSZINHERIT; + flags |= FS_XFLAG_EXTSZINHERIT; if (di_flags & XFS_DIFLAG_NODEFRAG) - flags |= XFS_XFLAG_NODEFRAG; + flags |= FS_XFLAG_NODEFRAG; if (di_flags & XFS_DIFLAG_FILESTREAM) - flags |= XFS_XFLAG_FILESTREAM; + flags |= FS_XFLAG_FILESTREAM; } + if (di_flags2 & XFS_DIFLAG2_ANY) { + if (di_flags2 & XFS_DIFLAG2_DAX) + flags |= FS_XFLAG_DAX; + } + + if (has_attr) + flags |= FS_XFLAG_HASATTR; + return flags; } uint xfs_ip2xflags( - xfs_inode_t *ip) + struct xfs_inode *ip) { - xfs_icdinode_t *dic = &ip->i_d; + struct xfs_icdinode *dic = &ip->i_d; - return _xfs_dic2xflags(dic->di_flags) | - (XFS_IFORK_Q(ip) ? XFS_XFLAG_HASATTR : 0); + return _xfs_dic2xflags(dic->di_flags, dic->di_flags2, XFS_IFORK_Q(ip)); } uint xfs_dic2xflags( - xfs_dinode_t *dip) + struct xfs_dinode *dip) { - return _xfs_dic2xflags(be16_to_cpu(dip->di_flags)) | - (XFS_DFORK_Q(dip) ? 
XFS_XFLAG_HASATTR : 0); + return _xfs_dic2xflags(be16_to_cpu(dip->di_flags), + be64_to_cpu(dip->di_flags2), XFS_DFORK_Q(dip)); } /* @@ -862,7 +871,8 @@ xfs_ialloc( case S_IFREG: case S_IFDIR: if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) { - uint di_flags = 0; + uint64_t di_flags2 = 0; + uint di_flags = 0; if (S_ISDIR(mode)) { if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) @@ -898,7 +908,11 @@ xfs_ialloc( di_flags |= XFS_DIFLAG_NODEFRAG; if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM) di_flags |= XFS_DIFLAG_FILESTREAM; + if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX) + di_flags2 |= XFS_DIFLAG2_DAX; + ip->i_d.di_flags |= di_flags; + ip->i_d.di_flags2 |= di_flags2; } /* FALLTHROUGH */ case S_IFLNK: diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index d42738deec6d..478d04e07f95 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -859,25 +859,25 @@ xfs_merge_ioc_xflags( unsigned int xflags = start; if (flags & FS_IMMUTABLE_FL) - xflags |= XFS_XFLAG_IMMUTABLE; + xflags |= FS_XFLAG_IMMUTABLE; else - xflags &= ~XFS_XFLAG_IMMUTABLE; + xflags &= ~FS_XFLAG_IMMUTABLE; if (flags & FS_APPEND_FL) - xflags |= XFS_XFLAG_APPEND; + xflags |= FS_XFLAG_APPEND; else - xflags &= ~XFS_XFLAG_APPEND; + xflags &= ~FS_XFLAG_APPEND; if (flags & FS_SYNC_FL) - xflags |= XFS_XFLAG_SYNC; + xflags |= FS_XFLAG_SYNC; else - xflags &= ~XFS_XFLAG_SYNC; + xflags &= ~FS_XFLAG_SYNC; if (flags & FS_NOATIME_FL) - xflags |= XFS_XFLAG_NOATIME; + xflags |= FS_XFLAG_NOATIME; else - xflags &= ~XFS_XFLAG_NOATIME; + xflags &= ~FS_XFLAG_NOATIME; if (flags & FS_NODUMP_FL) - xflags |= XFS_XFLAG_NODUMP; + xflags |= FS_XFLAG_NODUMP; else - xflags &= ~XFS_XFLAG_NODUMP; + xflags &= ~FS_XFLAG_NODUMP; return xflags; } @@ -945,40 +945,51 @@ xfs_set_diflags( unsigned int xflags) { unsigned int di_flags; + uint64_t di_flags2; /* can't set PREALLOC this way, just preserve it */ di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC); - if (xflags & XFS_XFLAG_IMMUTABLE) + if (xflags & FS_XFLAG_IMMUTABLE) di_flags |= XFS_DIFLAG_IMMUTABLE; - if (xflags & XFS_XFLAG_APPEND) + if (xflags & FS_XFLAG_APPEND) di_flags |= XFS_DIFLAG_APPEND; - if (xflags & XFS_XFLAG_SYNC) + if (xflags & FS_XFLAG_SYNC) di_flags |= XFS_DIFLAG_SYNC; - if (xflags & XFS_XFLAG_NOATIME) + if (xflags & FS_XFLAG_NOATIME) di_flags |= XFS_DIFLAG_NOATIME; - if (xflags & XFS_XFLAG_NODUMP) + if (xflags & FS_XFLAG_NODUMP) di_flags |= XFS_DIFLAG_NODUMP; - if (xflags & XFS_XFLAG_NODEFRAG) + if (xflags & FS_XFLAG_NODEFRAG) di_flags |= XFS_DIFLAG_NODEFRAG; - if (xflags & XFS_XFLAG_FILESTREAM) + if (xflags & FS_XFLAG_FILESTREAM) di_flags |= XFS_DIFLAG_FILESTREAM; if (S_ISDIR(ip->i_d.di_mode)) { - if (xflags & XFS_XFLAG_RTINHERIT) + if (xflags & FS_XFLAG_RTINHERIT) di_flags |= XFS_DIFLAG_RTINHERIT; - if (xflags & XFS_XFLAG_NOSYMLINKS) + if (xflags & FS_XFLAG_NOSYMLINKS) di_flags |= XFS_DIFLAG_NOSYMLINKS; - if (xflags & XFS_XFLAG_EXTSZINHERIT) + if (xflags & FS_XFLAG_EXTSZINHERIT) di_flags |= XFS_DIFLAG_EXTSZINHERIT; - if (xflags & XFS_XFLAG_PROJINHERIT) + if (xflags & FS_XFLAG_PROJINHERIT) di_flags |= XFS_DIFLAG_PROJINHERIT; } else if (S_ISREG(ip->i_d.di_mode)) { - if (xflags & XFS_XFLAG_REALTIME) + if (xflags & FS_XFLAG_REALTIME) di_flags |= XFS_DIFLAG_REALTIME; - if (xflags & XFS_XFLAG_EXTSIZE) + if (xflags & FS_XFLAG_EXTSIZE) di_flags |= XFS_DIFLAG_EXTSIZE; } - ip->i_d.di_flags = di_flags; + + /* diflags2 only valid for v3 inodes. 
*/ + if (ip->i_d.di_version < 3) + return; + + di_flags2 = 0; + if (xflags & FS_XFLAG_DAX) + di_flags2 |= XFS_DIFLAG2_DAX; + + ip->i_d.di_flags2 = di_flags2; + } STATIC void @@ -988,22 +999,27 @@ xfs_diflags_to_linux( struct inode *inode = VFS_I(ip); unsigned int xflags = xfs_ip2xflags(ip); - if (xflags & XFS_XFLAG_IMMUTABLE) + if (xflags & FS_XFLAG_IMMUTABLE) inode->i_flags |= S_IMMUTABLE; else inode->i_flags &= ~S_IMMUTABLE; - if (xflags & XFS_XFLAG_APPEND) + if (xflags & FS_XFLAG_APPEND) inode->i_flags |= S_APPEND; else inode->i_flags &= ~S_APPEND; - if (xflags & XFS_XFLAG_SYNC) + if (xflags & FS_XFLAG_SYNC) inode->i_flags |= S_SYNC; else inode->i_flags &= ~S_SYNC; - if (xflags & XFS_XFLAG_NOATIME) + if (xflags & FS_XFLAG_NOATIME) inode->i_flags |= S_NOATIME; else inode->i_flags &= ~S_NOATIME; + if (xflags & FS_XFLAG_DAX) + inode->i_flags |= S_DAX; + else + inode->i_flags &= ~S_DAX; + } static int @@ -1016,11 +1032,11 @@ xfs_ioctl_setattr_xflags( /* Can't change realtime flag if any extents are allocated. */ if ((ip->i_d.di_nextents || ip->i_delayed_blks) && - XFS_IS_REALTIME_INODE(ip) != (fa->fsx_xflags & XFS_XFLAG_REALTIME)) + XFS_IS_REALTIME_INODE(ip) != (fa->fsx_xflags & FS_XFLAG_REALTIME)) return -EINVAL; /* If realtime flag is set then must have realtime device */ - if (fa->fsx_xflags & XFS_XFLAG_REALTIME) { + if (fa->fsx_xflags & FS_XFLAG_REALTIME) { if (mp->m_sb.sb_rblocks == 0 || mp->m_sb.sb_rextsize == 0 || (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) return -EINVAL; @@ -1031,7 +1047,7 @@ xfs_ioctl_setattr_xflags( * we have appropriate permission. */ if (((ip->i_d.di_flags & (XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND)) || - (fa->fsx_xflags & (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) && + (fa->fsx_xflags & (FS_XFLAG_IMMUTABLE | FS_XFLAG_APPEND))) && !capable(CAP_LINUX_IMMUTABLE)) return -EPERM; @@ -1095,8 +1111,8 @@ out_cancel: * extent size hint validation is somewhat cumbersome. Rules are: * * 1. extent size hint is only valid for directories and regular files - * 2. XFS_XFLAG_EXTSIZE is only valid for regular files - * 3. XFS_XFLAG_EXTSZINHERIT is only valid for directories. + * 2. FS_XFLAG_EXTSIZE is only valid for regular files + * 3. FS_XFLAG_EXTSZINHERIT is only valid for directories. * 4. can only be changed on regular files if no extents are allocated * 5. can be changed on directories at any time * 6. extsize hint of 0 turns off hints, clears inode flags. 
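The extent size hint rules above are easiest to read from the caller's side; a hedged user-space sketch that sets a 1 MiB hint on a regular file (rules 1, 2 and 4 apply; error handling elided):

	struct fsxattr fsx;

	ioctl(fd, FS_IOC_FSGETXATTR, &fsx);
	fsx.fsx_extsize = 1024 * 1024;		/* bytes; must satisfy the
						 * alignment checks below */
	fsx.fsx_xflags |= FS_XFLAG_EXTSIZE;
	ioctl(fd, FS_IOC_FSSETXATTR, &fsx);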
@@ -1112,10 +1128,10 @@ xfs_ioctl_setattr_check_extsize( { struct xfs_mount *mp = ip->i_mount; - if ((fa->fsx_xflags & XFS_XFLAG_EXTSIZE) && !S_ISREG(ip->i_d.di_mode)) + if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && !S_ISREG(ip->i_d.di_mode)) return -EINVAL; - if ((fa->fsx_xflags & XFS_XFLAG_EXTSZINHERIT) && + if ((fa->fsx_xflags & FS_XFLAG_EXTSZINHERIT) && !S_ISDIR(ip->i_d.di_mode)) return -EINVAL; @@ -1132,7 +1148,7 @@ xfs_ioctl_setattr_check_extsize( return -EINVAL; if (XFS_IS_REALTIME_INODE(ip) || - (fa->fsx_xflags & XFS_XFLAG_REALTIME)) { + (fa->fsx_xflags & FS_XFLAG_REALTIME)) { size = mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog; } else { size = mp->m_sb.sb_blocksize; @@ -1143,7 +1159,7 @@ xfs_ioctl_setattr_check_extsize( if (fa->fsx_extsize % size) return -EINVAL; } else - fa->fsx_xflags &= ~(XFS_XFLAG_EXTSIZE | XFS_XFLAG_EXTSZINHERIT); + fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE | FS_XFLAG_EXTSZINHERIT); return 0; } @@ -1168,7 +1184,7 @@ xfs_ioctl_setattr_check_projid( if (xfs_get_projid(ip) != fa->fsx_projid) return -EINVAL; - if ((fa->fsx_xflags & XFS_XFLAG_PROJINHERIT) != + if ((fa->fsx_xflags & FS_XFLAG_PROJINHERIT) != (ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)) return -EINVAL; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 06eafafe636e..76b71a1c6c32 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -1205,8 +1205,8 @@ xfs_diflags_to_iflags( inode->i_flags |= S_SYNC; if (flags & XFS_DIFLAG_NOATIME) inode->i_flags |= S_NOATIME; - /* XXX: Also needs an on-disk per inode flag! */ - if (ip->i_mount->m_flags & XFS_MOUNT_DAX) + if (ip->i_mount->m_flags & XFS_MOUNT_DAX || + ip->i_d.di_flags2 & XFS_DIFLAG2_DAX) inode->i_flags |= S_DAX; } diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index dc6221942b85..ade236e90bb3 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -42,11 +42,11 @@ xfs_break_layouts( while ((error = break_layout(inode, false) == -EWOULDBLOCK)) { xfs_iunlock(ip, *iolock); if (with_imutex && (*iolock & XFS_IOLOCK_EXCL)) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); error = break_layout(inode, true); *iolock = XFS_IOLOCK_EXCL; if (with_imutex) - mutex_lock(&inode->i_mutex); + inode_lock(inode); xfs_ilock(ip, *iolock); } diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index aa67339b9537..4f18fd92ca13 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -497,7 +497,6 @@ xfsaild( long tout = 0; /* milliseconds */ current->flags |= PF_MEMALLOC; - set_freezable(); while (!kthread_should_stop()) { if (tout && tout <= 20) diff --git a/include/crypto/hash.h b/include/crypto/hash.h index 3d69c93d50e8..6361892ea737 100644 --- a/include/crypto/hash.h +++ b/include/crypto/hash.h @@ -204,6 +204,7 @@ struct crypto_ahash { unsigned int keylen); unsigned int reqsize; + bool has_setkey; struct crypto_tfm base; }; @@ -375,6 +376,11 @@ static inline void *ahash_request_ctx(struct ahash_request *req) int crypto_ahash_setkey(struct crypto_ahash *tfm, const u8 *key, unsigned int keylen); +static inline bool crypto_ahash_has_setkey(struct crypto_ahash *tfm) +{ + return tfm->has_setkey; +} + /** * crypto_ahash_finup() - update and finalize message digest * @req: reference to the ahash_request handle that holds all information diff --git a/include/crypto/if_alg.h b/include/crypto/if_alg.h index 018afb264ac2..a2bfd7843f18 100644 --- a/include/crypto/if_alg.h +++ b/include/crypto/if_alg.h @@ -30,6 +30,9 @@ struct alg_sock { struct sock *parent; + unsigned int refcnt; + unsigned int nokey_refcnt; + const struct af_alg_type *type; void 
*private; }; @@ -50,9 +53,11 @@ struct af_alg_type { void (*release)(void *private); int (*setkey)(void *private, const u8 *key, unsigned int keylen); int (*accept)(void *private, struct sock *sk); + int (*accept_nokey)(void *private, struct sock *sk); int (*setauthsize)(void *private, unsigned int authsize); struct proto_ops *ops; + struct proto_ops *ops_nokey; struct module *owner; char name[14]; }; @@ -67,6 +72,7 @@ int af_alg_register_type(const struct af_alg_type *type); int af_alg_unregister_type(const struct af_alg_type *type); int af_alg_release(struct socket *sock); +void af_alg_release_parent(struct sock *sk); int af_alg_accept(struct sock *sk, struct socket *newsock); int af_alg_make_sg(struct af_alg_sgl *sgl, struct iov_iter *iter, int len); @@ -83,11 +89,6 @@ static inline struct alg_sock *alg_sk(struct sock *sk) return (struct alg_sock *)sk; } -static inline void af_alg_release_parent(struct sock *sk) -{ - sock_put(alg_sk(sk)->parent); -} - static inline void af_alg_init_completion(struct af_alg_completion *completion) { init_completion(&completion->completion); diff --git a/include/crypto/skcipher.h b/include/crypto/skcipher.h index d8dd41fb034f..fd8742a40ff3 100644 --- a/include/crypto/skcipher.h +++ b/include/crypto/skcipher.h @@ -61,6 +61,8 @@ struct crypto_skcipher { unsigned int ivsize; unsigned int reqsize; + bool has_setkey; + struct crypto_tfm base; }; @@ -305,6 +307,11 @@ static inline int crypto_skcipher_setkey(struct crypto_skcipher *tfm, return tfm->setkey(tfm, key, keylen); } +static inline bool crypto_skcipher_has_setkey(struct crypto_skcipher *tfm) +{ + return tfm->has_setkey; +} + /** * crypto_skcipher_reqtfm() - obtain cipher handle from request * @req: skcipher_request out of which the cipher handle is to be obtained diff --git a/include/linux/aer.h b/include/linux/aer.h index 744b997d6a94..164049357e5c 100644 --- a/include/linux/aer.h +++ b/include/linux/aer.h @@ -7,6 +7,7 @@ #ifndef _AER_H_ #define _AER_H_ +#include <linux/errno.h> #include <linux/types.h> #define AER_NONFATAL 0 diff --git a/include/linux/bio.h b/include/linux/bio.h index b9b6e046b52e..5349e6816cbb 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -318,16 +318,6 @@ enum bip_flags { BIP_IP_CHECKSUM = 1 << 4, /* IP checksum */ }; -#if defined(CONFIG_BLK_DEV_INTEGRITY) - -static inline struct bio_integrity_payload *bio_integrity(struct bio *bio) -{ - if (bio->bi_rw & REQ_INTEGRITY) - return bio->bi_integrity; - - return NULL; -} - /* * bio integrity payload */ @@ -349,6 +339,16 @@ struct bio_integrity_payload { struct bio_vec bip_inline_vecs[0];/* embedded bvec array */ }; +#if defined(CONFIG_BLK_DEV_INTEGRITY) + +static inline struct bio_integrity_payload *bio_integrity(struct bio *bio) +{ + if (bio->bi_rw & REQ_INTEGRITY) + return bio->bi_integrity; + + return NULL; +} + static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag) { struct bio_integrity_payload *bip = bio_integrity(bio); @@ -795,6 +795,18 @@ static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag) return false; } +static inline void *bio_integrity_alloc(struct bio * bio, gfp_t gfp, + unsigned int nr) +{ + return ERR_PTR(-EINVAL); +} + +static inline int bio_integrity_add_page(struct bio *bio, struct page *page, + unsigned int len, unsigned int offset) +{ + return 0; +} + #endif /* CONFIG_BLK_DEV_INTEGRITY */ #endif /* CONFIG_BLOCK */ diff --git a/include/linux/blk-iopoll.h b/include/linux/blk-iopoll.h deleted file mode 100644 index 77ae77c0b704..000000000000 --- 
a/include/linux/blk-iopoll.h +++ /dev/null @@ -1,46 +0,0 @@ -#ifndef BLK_IOPOLL_H -#define BLK_IOPOLL_H - -struct blk_iopoll; -typedef int (blk_iopoll_fn)(struct blk_iopoll *, int); - -struct blk_iopoll { - struct list_head list; - unsigned long state; - unsigned long data; - int weight; - int max; - blk_iopoll_fn *poll; -}; - -enum { - IOPOLL_F_SCHED = 0, - IOPOLL_F_DISABLE = 1, -}; - -/* - * Returns 0 if we successfully set the IOPOLL_F_SCHED bit, indicating - * that we were the first to acquire this iop for scheduling. If this iop - * is currently disabled, return "failure". - */ -static inline int blk_iopoll_sched_prep(struct blk_iopoll *iop) -{ - if (!test_bit(IOPOLL_F_DISABLE, &iop->state)) - return test_and_set_bit(IOPOLL_F_SCHED, &iop->state); - - return 1; -} - -static inline int blk_iopoll_disable_pending(struct blk_iopoll *iop) -{ - return test_bit(IOPOLL_F_DISABLE, &iop->state); -} - -extern void blk_iopoll_sched(struct blk_iopoll *); -extern void blk_iopoll_init(struct blk_iopoll *, int, blk_iopoll_fn *); -extern void blk_iopoll_complete(struct blk_iopoll *); -extern void __blk_iopoll_complete(struct blk_iopoll *); -extern void blk_iopoll_enable(struct blk_iopoll *); -extern void blk_iopoll_disable(struct blk_iopoll *); - -#endif diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 0fb65843ec1e..86a38ea1823f 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -188,7 +188,6 @@ enum rq_flag_bits { __REQ_PM, /* runtime pm request */ __REQ_HASHED, /* on IO scheduler merge hash */ __REQ_MQ_INFLIGHT, /* track inflight for MQ */ - __REQ_NO_TIMEOUT, /* requests may never expire */ __REQ_NR_BITS, /* stops here */ }; @@ -242,7 +241,6 @@ enum rq_flag_bits { #define REQ_PM (1ULL << __REQ_PM) #define REQ_HASHED (1ULL << __REQ_HASHED) #define REQ_MQ_INFLIGHT (1ULL << __REQ_MQ_INFLIGHT) -#define REQ_NO_TIMEOUT (1ULL << __REQ_NO_TIMEOUT) typedef unsigned int blk_qc_t; #define BLK_QC_T_NONE -1U diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d372ea87ead5..29189aeace19 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -409,6 +409,7 @@ struct request_queue { unsigned int rq_timeout; struct timer_list timeout; + struct work_struct timeout_work; struct list_head timeout_list; struct list_head icq_list; diff --git a/include/linux/ceph/ceph_frag.h b/include/linux/ceph/ceph_frag.h index 5babb8e95352..b827e066e55a 100644 --- a/include/linux/ceph/ceph_frag.h +++ b/include/linux/ceph/ceph_frag.h @@ -40,46 +40,11 @@ static inline __u32 ceph_frag_mask_shift(__u32 f) return 24 - ceph_frag_bits(f); } -static inline int ceph_frag_contains_value(__u32 f, __u32 v) +static inline bool ceph_frag_contains_value(__u32 f, __u32 v) { return (v & ceph_frag_mask(f)) == ceph_frag_value(f); } -static inline int ceph_frag_contains_frag(__u32 f, __u32 sub) -{ - /* is sub as specific as us, and contained by us? 
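blk-iopoll is not dropped outright: it returns later in this series as the generic irq_poll API (include/linux/irq_poll.h). A hedged driver-side sketch of the replacement interface; example_irq() and process_one_completion() are hypothetical:

#include <linux/interrupt.h>
#include <linux/irq_poll.h>

static int example_poll(struct irq_poll *iop, int budget)
{
	int done = 0;

	while (done < budget && process_one_completion())	/* hypothetical */
		done++;

	if (done < budget)
		irq_poll_complete(iop);	/* drained: leave polled mode */
	return done;
}

static irqreturn_t example_irq(int irq, void *data)
{
	struct irq_poll *iop = data;

	irq_poll_sched(iop);	/* defer processing to IRQ_POLL_SOFTIRQ */
	return IRQ_HANDLED;
}

/* one-time setup, e.g. at probe time: irq_poll_init(&iop, 64, example_poll); */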
*/ - return ceph_frag_bits(sub) >= ceph_frag_bits(f) && - (ceph_frag_value(sub) & ceph_frag_mask(f)) == ceph_frag_value(f); -} -static inline __u32 ceph_frag_parent(__u32 f) -{ - return ceph_frag_make(ceph_frag_bits(f) - 1, - ceph_frag_value(f) & (ceph_frag_mask(f) << 1)); -} -static inline int ceph_frag_is_left_child(__u32 f) -{ - return ceph_frag_bits(f) > 0 && - (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 0; -} -static inline int ceph_frag_is_right_child(__u32 f) -{ - return ceph_frag_bits(f) > 0 && - (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 1; -} -static inline __u32 ceph_frag_sibling(__u32 f) -{ - return ceph_frag_make(ceph_frag_bits(f), - ceph_frag_value(f) ^ (0x1000000 >> ceph_frag_bits(f))); -} -static inline __u32 ceph_frag_left_child(__u32 f) -{ - return ceph_frag_make(ceph_frag_bits(f)+1, ceph_frag_value(f)); -} -static inline __u32 ceph_frag_right_child(__u32 f) -{ - return ceph_frag_make(ceph_frag_bits(f)+1, - ceph_frag_value(f) | (0x1000000 >> (1+ceph_frag_bits(f)))); -} static inline __u32 ceph_frag_make_child(__u32 f, int by, int i) { int newbits = ceph_frag_bits(f) + by; diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 71b1d6cdcb5d..8dbd7879fdc6 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -220,6 +220,7 @@ struct ceph_connection { struct ceph_entity_addr actual_peer_addr; /* message out temps */ + struct ceph_msg_header out_hdr; struct ceph_msg *out_msg; /* sending message (== tail of out_sent) */ bool out_msg_done; @@ -229,7 +230,6 @@ struct ceph_connection { int out_kvec_left; /* kvec's left in out_kvec */ int out_skip; /* skip this many bytes */ int out_kvec_bytes; /* total bytes left */ - bool out_kvec_is_msg; /* kvec refers to out_msg */ int out_more; /* there is more data after the kvecs */ __le64 out_temp_ack; /* for writing an ack */ struct ceph_timespec out_temp_keepalive2; /* for writing keepalive2 diff --git a/include/linux/dax.h b/include/linux/dax.h index b415e521528d..8204c3dc3800 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -36,4 +36,11 @@ static inline bool vma_is_dax(struct vm_area_struct *vma) { return vma->vm_file && IS_DAX(vma->vm_file->f_mapping->host); } + +static inline bool dax_mapping(struct address_space *mapping) +{ + return mapping->host && IS_DAX(mapping->host); +} +int dax_writeback_mapping_range(struct address_space *mapping, loff_t start, + loff_t end); #endif diff --git a/include/linux/drbd.h b/include/linux/drbd.h index 8723f2a99e15..d6b3c9943a2c 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -25,7 +25,6 @@ */ #ifndef DRBD_H #define DRBD_H -#include <linux/connector.h> #include <asm/types.h> #ifdef __KERNEL__ @@ -52,7 +51,7 @@ #endif extern const char *drbd_buildtag(void); -#define REL_VERSION "8.4.5" +#define REL_VERSION "8.4.6" #define API_VERSION 1 #define PRO_VERSION_MIN 86 #define PRO_VERSION_MAX 101 @@ -339,6 +338,8 @@ enum drbd_state_rv { #define MDF_AL_CLEAN (1 << 7) #define MDF_AL_DISABLED (1 << 8) +#define MAX_PEERS 32 + enum drbd_uuid_index { UI_CURRENT, UI_BITMAP, @@ -349,14 +350,35 @@ enum drbd_uuid_index { UI_EXTENDED_SIZE /* Everything. 
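The new dax_mapping() test pairs with dax_writeback_mapping_range() declared alongside it; a sketch of the intended call shape in a writeback path, hedged and simplified from how a filesystem would actually wire it up:

	if (dax_mapping(mapping)) {
		ret = dax_writeback_mapping_range(mapping, start, end);
		if (ret)
			return ret;
	}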
*/ }; +#define HISTORY_UUIDS MAX_PEERS + enum drbd_timeout_flag { UT_DEFAULT = 0, UT_DEGRADED = 1, UT_PEER_OUTDATED = 2, }; +enum drbd_notification_type { + NOTIFY_EXISTS, + NOTIFY_CREATE, + NOTIFY_CHANGE, + NOTIFY_DESTROY, + NOTIFY_CALL, + NOTIFY_RESPONSE, + + NOTIFY_CONTINUES = 0x8000, + NOTIFY_FLAGS = NOTIFY_CONTINUES, +}; + #define UUID_JUST_CREATED ((__u64)4) +enum write_ordering_e { + WO_NONE, + WO_DRAIN_IO, + WO_BDEV_FLUSH, + WO_BIO_BARRIER +}; + /* magic numbers used in meta data and network packets */ #define DRBD_MAGIC 0x83740267 #define DRBD_MAGIC_BIG 0x835a diff --git a/include/linux/drbd_genl.h b/include/linux/drbd_genl.h index 7b131ed8f9c6..2d0e5ad5de9d 100644 --- a/include/linux/drbd_genl.h +++ b/include/linux/drbd_genl.h @@ -250,6 +250,76 @@ GENL_struct(DRBD_NLA_DETACH_PARMS, 13, detach_parms, __flg_field(1, DRBD_GENLA_F_MANDATORY, force_detach) ) +GENL_struct(DRBD_NLA_RESOURCE_INFO, 15, resource_info, + __u32_field(1, 0, res_role) + __flg_field(2, 0, res_susp) + __flg_field(3, 0, res_susp_nod) + __flg_field(4, 0, res_susp_fen) + /* __flg_field(5, 0, res_weak) */ +) + +GENL_struct(DRBD_NLA_DEVICE_INFO, 16, device_info, + __u32_field(1, 0, dev_disk_state) +) + +GENL_struct(DRBD_NLA_CONNECTION_INFO, 17, connection_info, + __u32_field(1, 0, conn_connection_state) + __u32_field(2, 0, conn_role) +) + +GENL_struct(DRBD_NLA_PEER_DEVICE_INFO, 18, peer_device_info, + __u32_field(1, 0, peer_repl_state) + __u32_field(2, 0, peer_disk_state) + __u32_field(3, 0, peer_resync_susp_user) + __u32_field(4, 0, peer_resync_susp_peer) + __u32_field(5, 0, peer_resync_susp_dependency) +) + +GENL_struct(DRBD_NLA_RESOURCE_STATISTICS, 19, resource_statistics, + __u32_field(1, 0, res_stat_write_ordering) +) + +GENL_struct(DRBD_NLA_DEVICE_STATISTICS, 20, device_statistics, + __u64_field(1, 0, dev_size) /* (sectors) */ + __u64_field(2, 0, dev_read) /* (sectors) */ + __u64_field(3, 0, dev_write) /* (sectors) */ + __u64_field(4, 0, dev_al_writes) /* activity log writes (count) */ + __u64_field(5, 0, dev_bm_writes) /* bitmap writes (count) */ + __u32_field(6, 0, dev_upper_pending) /* application requests in progress */ + __u32_field(7, 0, dev_lower_pending) /* backing device requests in progress */ + __flg_field(8, 0, dev_upper_blocked) + __flg_field(9, 0, dev_lower_blocked) + __flg_field(10, 0, dev_al_suspended) /* activity log suspended */ + __u64_field(11, 0, dev_exposed_data_uuid) + __u64_field(12, 0, dev_current_uuid) + __u32_field(13, 0, dev_disk_flags) + __bin_field(14, 0, history_uuids, HISTORY_UUIDS * sizeof(__u64)) +) + +GENL_struct(DRBD_NLA_CONNECTION_STATISTICS, 21, connection_statistics, + __flg_field(1, 0, conn_congested) +) + +GENL_struct(DRBD_NLA_PEER_DEVICE_STATISTICS, 22, peer_device_statistics, + __u64_field(1, 0, peer_dev_received) /* sectors */ + __u64_field(2, 0, peer_dev_sent) /* sectors */ + __u32_field(3, 0, peer_dev_pending) /* number of requests */ + __u32_field(4, 0, peer_dev_unacked) /* number of requests */ + __u64_field(5, 0, peer_dev_out_of_sync) /* sectors */ + __u64_field(6, 0, peer_dev_resync_failed) /* sectors */ + __u64_field(7, 0, peer_dev_bitmap_uuid) + __u32_field(9, 0, peer_dev_flags) +) + +GENL_struct(DRBD_NLA_NOTIFICATION_HEADER, 23, drbd_notification_header, + __u32_field(1, DRBD_GENLA_F_MANDATORY, nh_type) +) + +GENL_struct(DRBD_NLA_HELPER, 24, drbd_helper_info, + __str_field(1, DRBD_GENLA_F_MANDATORY, helper_name, 32) + __u32_field(2, DRBD_GENLA_F_MANDATORY, helper_status) +) + /* * Notifications and commands (genlmsghdr->cmd) */ @@ -382,3 +452,82 @@ 
GENL_op(DRBD_ADM_GET_TIMEOUT_TYPE, 26, GENL_doit(drbd_adm_get_timeout_type), GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) GENL_op(DRBD_ADM_DOWN, 27, GENL_doit(drbd_adm_down), GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) + +GENL_op(DRBD_ADM_GET_RESOURCES, 30, + GENL_op_init( + .dumpit = drbd_adm_dump_resources, + ), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_RESOURCE_INFO, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_RESOURCE_STATISTICS, DRBD_GENLA_F_MANDATORY)) + +GENL_op(DRBD_ADM_GET_DEVICES, 31, + GENL_op_init( + .dumpit = drbd_adm_dump_devices, + .done = drbd_adm_dump_devices_done, + ), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_DEVICE_INFO, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_DEVICE_STATISTICS, DRBD_GENLA_F_MANDATORY)) + +GENL_op(DRBD_ADM_GET_CONNECTIONS, 32, + GENL_op_init( + .dumpit = drbd_adm_dump_connections, + .done = drbd_adm_dump_connections_done, + ), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_CONNECTION_INFO, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_CONNECTION_STATISTICS, DRBD_GENLA_F_MANDATORY)) + +GENL_op(DRBD_ADM_GET_PEER_DEVICES, 33, + GENL_op_init( + .dumpit = drbd_adm_dump_peer_devices, + .done = drbd_adm_dump_peer_devices_done, + ), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_PEER_DEVICE_INFO, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_PEER_DEVICE_STATISTICS, DRBD_GENLA_F_MANDATORY)) + +GENL_notification( + DRBD_RESOURCE_STATE, 34, events, + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_NOTIFICATION_HEADER, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_RESOURCE_INFO, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_RESOURCE_STATISTICS, DRBD_F_REQUIRED)) + +GENL_notification( + DRBD_DEVICE_STATE, 35, events, + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_NOTIFICATION_HEADER, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_DEVICE_INFO, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_DEVICE_STATISTICS, DRBD_F_REQUIRED)) + +GENL_notification( + DRBD_CONNECTION_STATE, 36, events, + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_NOTIFICATION_HEADER, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_CONNECTION_INFO, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_CONNECTION_STATISTICS, DRBD_F_REQUIRED)) + +GENL_notification( + DRBD_PEER_DEVICE_STATE, 37, events, + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_NOTIFICATION_HEADER, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_PEER_DEVICE_INFO, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_PEER_DEVICE_STATISTICS, DRBD_F_REQUIRED)) + +GENL_op( + DRBD_ADM_GET_INITIAL_STATE, 38, + GENL_op_init( + .dumpit = drbd_adm_get_initial_state, + ), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY)) + +GENL_notification( + DRBD_HELPER, 40, events, + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_HELPER, DRBD_F_REQUIRED)) + +GENL_notification( + DRBD_INITIAL_STATE_DONE, 41, events, + GENL_tla_expected(DRBD_NLA_NOTIFICATION_HEADER, DRBD_F_REQUIRED)) diff --git a/include/linux/fs.h b/include/linux/fs.h index eb73d74ed992..1a2046275cdf 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -433,7 +433,8 @@ struct address_space { struct rw_semaphore i_mmap_rwsem; /* 
protect tree, count, list */ /* Protected by tree_lock together with the radix tree */ unsigned long nrpages; /* number of total pages */ - unsigned long nrshadows; /* number of shadow entries */ + /* number of shadow or DAX exceptional entries */ + unsigned long nrexceptional; pgoff_t writeback_index;/* writeback starts here */ const struct address_space_operations *a_ops; /* methods */ unsigned long flags; /* error bits/gfp mask */ @@ -714,6 +715,31 @@ enum inode_i_mutex_lock_class I_MUTEX_PARENT2, }; +static inline void inode_lock(struct inode *inode) +{ + mutex_lock(&inode->i_mutex); +} + +static inline void inode_unlock(struct inode *inode) +{ + mutex_unlock(&inode->i_mutex); +} + +static inline int inode_trylock(struct inode *inode) +{ + return mutex_trylock(&inode->i_mutex); +} + +static inline int inode_is_locked(struct inode *inode) +{ + return mutex_is_locked(&inode->i_mutex); +} + +static inline void inode_lock_nested(struct inode *inode, unsigned subclass) +{ + mutex_lock_nested(&inode->i_mutex, subclass); +} + void lock_two_nondirectories(struct inode *, struct inode*); void unlock_two_nondirectories(struct inode *, struct inode*); @@ -3047,8 +3073,8 @@ static inline bool dir_emit_dots(struct file *file, struct dir_context *ctx) } static inline bool dir_relax(struct inode *inode) { - mutex_unlock(&inode->i_mutex); - mutex_lock(&inode->i_mutex); + inode_unlock(inode); + inode_lock(inode); return !IS_DEADDIR(inode); } diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index cfe81e10bd54..459fd25b378e 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -120,15 +120,15 @@ extern void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, long adjust_next); -extern bool __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, - spinlock_t **ptl); +extern spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, + struct vm_area_struct *vma); /* mmap_sem must be held on entry */ -static inline bool pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, - spinlock_t **ptl) +static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, + struct vm_area_struct *vma) { VM_BUG_ON_VMA(!rwsem_is_locked(&vma->vm_mm->mmap_sem), vma); if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) - return __pmd_trans_huge_lock(pmd, vma, ptl); + return __pmd_trans_huge_lock(pmd, vma); else return false; } @@ -190,10 +190,10 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, long adjust_next) { } -static inline bool pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, - spinlock_t **ptl) +static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, + struct vm_area_struct *vma) { - return false; + return NULL; } static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, diff --git a/include/linux/idr.h b/include/linux/idr.h index 013fd9bc4cb6..083d61e92706 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -135,6 +135,20 @@ static inline void *idr_find(struct idr *idr, int id) #define idr_for_each_entry(idp, entry, id) \ for (id = 0; ((entry) = idr_get_next(idp, &(id))) != NULL; ++id) +/** + * idr_for_each_entry_continue - continue iteration over an idr's elements of a given type + * @idp: idr handle + * @entry: the type * to use as cursor + * @id: id entry's key + * + * Continue to iterate over list of given type, continuing after + * the current position.
+ */ +#define idr_for_each_entry_continue(idp, entry, id) \ + for ((entry) = idr_get_next((idp), &(id)); \ + entry; \ + ++id, (entry) = idr_get_next((idp), &(id))) + /* * IDA - IDR based id allocator, use when translation from id to * pointer isn't necessary. diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index cb30edbfe9fc..0e95fcc75b2a 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -413,7 +413,7 @@ enum NET_TX_SOFTIRQ, NET_RX_SOFTIRQ, BLOCK_SOFTIRQ, - BLOCK_IOPOLL_SOFTIRQ, + IRQ_POLL_SOFTIRQ, TASKLET_SOFTIRQ, SCHED_SOFTIRQ, HRTIMER_SOFTIRQ, /* Unused, but kept as tools rely on the diff --git a/include/linux/irq_poll.h b/include/linux/irq_poll.h new file mode 100644 index 000000000000..3e8c1b8fb9be --- /dev/null +++ b/include/linux/irq_poll.h @@ -0,0 +1,25 @@ +#ifndef IRQ_POLL_H +#define IRQ_POLL_H + +struct irq_poll; +typedef int (irq_poll_fn)(struct irq_poll *, int); + +struct irq_poll { + struct list_head list; + unsigned long state; + int weight; + irq_poll_fn *poll; +}; + +enum { + IRQ_POLL_F_SCHED = 0, + IRQ_POLL_F_DISABLE = 1, +}; + +extern void irq_poll_sched(struct irq_poll *); +extern void irq_poll_init(struct irq_poll *, int, irq_poll_fn *); +extern void irq_poll_complete(struct irq_poll *); +extern void irq_poll_enable(struct irq_poll *); +extern void irq_poll_disable(struct irq_poll *); + +#endif diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 034117b3be5f..d6750111e48e 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -1,6 +1,8 @@ #ifndef NVM_H #define NVM_H +#include <linux/types.h> + enum { NVM_IO_OK = 0, NVM_IO_REQUEUE = 1, @@ -11,12 +13,74 @@ enum { NVM_IOTYPE_GC = 1, }; +#define NVM_BLK_BITS (16) +#define NVM_PG_BITS (16) +#define NVM_SEC_BITS (8) +#define NVM_PL_BITS (8) +#define NVM_LUN_BITS (8) +#define NVM_CH_BITS (8) + +struct ppa_addr { + /* Generic structure for all addresses */ + union { + struct { + u64 blk : NVM_BLK_BITS; + u64 pg : NVM_PG_BITS; + u64 sec : NVM_SEC_BITS; + u64 pl : NVM_PL_BITS; + u64 lun : NVM_LUN_BITS; + u64 ch : NVM_CH_BITS; + } g; + + u64 ppa; + }; +}; + +struct nvm_rq; +struct nvm_id; +struct nvm_dev; + +typedef int (nvm_l2p_update_fn)(u64, u32, __le64 *, void *); +typedef int (nvm_bb_update_fn)(struct ppa_addr, int, u8 *, void *); +typedef int (nvm_id_fn)(struct nvm_dev *, struct nvm_id *); +typedef int (nvm_get_l2p_tbl_fn)(struct nvm_dev *, u64, u32, + nvm_l2p_update_fn *, void *); +typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, int, + nvm_bb_update_fn *, void *); +typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct nvm_rq *, int); +typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); +typedef int (nvm_erase_blk_fn)(struct nvm_dev *, struct nvm_rq *); +typedef void *(nvm_create_dma_pool_fn)(struct nvm_dev *, char *); +typedef void (nvm_destroy_dma_pool_fn)(void *); +typedef void *(nvm_dev_dma_alloc_fn)(struct nvm_dev *, void *, gfp_t, + dma_addr_t *); +typedef void (nvm_dev_dma_free_fn)(void *, void*, dma_addr_t); + +struct nvm_dev_ops { + nvm_id_fn *identity; + nvm_get_l2p_tbl_fn *get_l2p_tbl; + nvm_op_bb_tbl_fn *get_bb_tbl; + nvm_op_set_bb_fn *set_bb_tbl; + + nvm_submit_io_fn *submit_io; + nvm_erase_blk_fn *erase_block; + + nvm_create_dma_pool_fn *create_dma_pool; + nvm_destroy_dma_pool_fn *destroy_dma_pool; + nvm_dev_dma_alloc_fn *dev_dma_alloc; + nvm_dev_dma_free_fn *dev_dma_free; + + unsigned int max_phys_sect; +}; + + + #ifdef CONFIG_NVM #include <linux/blkdev.h> -#include <linux/types.h> #include 
<linux/file.h> #include <linux/dmapool.h> +#include <uapi/linux/lightnvm.h> enum { /* HW Responsibilities */ @@ -58,8 +122,29 @@ enum { /* Block Types */ NVM_BLK_T_FREE = 0x0, NVM_BLK_T_BAD = 0x1, - NVM_BLK_T_DEV = 0x2, - NVM_BLK_T_HOST = 0x4, + NVM_BLK_T_GRWN_BAD = 0x2, + NVM_BLK_T_DEV = 0x4, + NVM_BLK_T_HOST = 0x8, + + /* Memory capabilities */ + NVM_ID_CAP_SLC = 0x1, + NVM_ID_CAP_CMD_SUSPEND = 0x2, + NVM_ID_CAP_SCRAMBLE = 0x4, + NVM_ID_CAP_ENCRYPT = 0x8, + + /* Memory types */ + NVM_ID_FMTYPE_SLC = 0, + NVM_ID_FMTYPE_MLC = 1, +}; + +struct nvm_id_lp_mlc { + u16 num_pairs; + u8 pairs[886]; +}; + +struct nvm_id_lp_tbl { + __u8 id[8]; + struct nvm_id_lp_mlc mlc; }; struct nvm_id_group { @@ -82,6 +167,8 @@ struct nvm_id_group { u32 mpos; u32 mccap; u16 cpar; + + struct nvm_id_lp_tbl lptbl; }; struct nvm_addr_format { @@ -125,28 +212,8 @@ struct nvm_tgt_instance { #define NVM_VERSION_MINOR 0 #define NVM_VERSION_PATCH 0 -#define NVM_BLK_BITS (16) -#define NVM_PG_BITS (16) -#define NVM_SEC_BITS (8) -#define NVM_PL_BITS (8) -#define NVM_LUN_BITS (8) -#define NVM_CH_BITS (8) - -struct ppa_addr { - /* Generic structure for all addresses */ - union { - struct { - u64 blk : NVM_BLK_BITS; - u64 pg : NVM_PG_BITS; - u64 sec : NVM_SEC_BITS; - u64 pl : NVM_PL_BITS; - u64 lun : NVM_LUN_BITS; - u64 ch : NVM_CH_BITS; - } g; - - u64 ppa; - }; -}; +struct nvm_rq; +typedef void (nvm_end_io_fn)(struct nvm_rq *); struct nvm_rq { struct nvm_tgt_instance *ins; @@ -164,9 +231,14 @@ struct nvm_rq { void *metadata; dma_addr_t dma_metadata; + struct completion *wait; + nvm_end_io_fn *end_io; + uint8_t opcode; uint16_t nr_pages; uint16_t flags; + + int error; }; static inline struct nvm_rq *nvm_rq_from_pdu(void *pdu) @@ -181,51 +253,31 @@ static inline void *nvm_rq_to_pdu(struct nvm_rq *rqdata) struct nvm_block; -typedef int (nvm_l2p_update_fn)(u64, u32, __le64 *, void *); -typedef int (nvm_bb_update_fn)(struct ppa_addr, int, u8 *, void *); -typedef int (nvm_id_fn)(struct nvm_dev *, struct nvm_id *); -typedef int (nvm_get_l2p_tbl_fn)(struct nvm_dev *, u64, u32, - nvm_l2p_update_fn *, void *); -typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, int, - nvm_bb_update_fn *, void *); -typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct nvm_rq *, int); -typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); -typedef int (nvm_erase_blk_fn)(struct nvm_dev *, struct nvm_rq *); -typedef void *(nvm_create_dma_pool_fn)(struct nvm_dev *, char *); -typedef void (nvm_destroy_dma_pool_fn)(void *); -typedef void *(nvm_dev_dma_alloc_fn)(struct nvm_dev *, void *, gfp_t, - dma_addr_t *); -typedef void (nvm_dev_dma_free_fn)(void *, void*, dma_addr_t); - -struct nvm_dev_ops { - nvm_id_fn *identity; - nvm_get_l2p_tbl_fn *get_l2p_tbl; - nvm_op_bb_tbl_fn *get_bb_tbl; - nvm_op_set_bb_fn *set_bb_tbl; - - nvm_submit_io_fn *submit_io; - nvm_erase_blk_fn *erase_block; - - nvm_create_dma_pool_fn *create_dma_pool; - nvm_destroy_dma_pool_fn *destroy_dma_pool; - nvm_dev_dma_alloc_fn *dev_dma_alloc; - nvm_dev_dma_free_fn *dev_dma_free; - - unsigned int max_phys_sect; -}; - struct nvm_lun { int id; int lun_id; int chnl_id; - unsigned int nr_inuse_blocks; /* Number of used blocks */ + /* It is up to the target to mark blocks as closed. 
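The ppa_addr union above packs channel, LUN, plane, sector, page and block into a single u64 so physical addresses can be passed by value and compared cheaply. A sketch using the field widths defined above:

	struct ppa_addr ppa = { .ppa = 0 };

	ppa.g.ch  = 1;		/* channel */
	ppa.g.lun = 2;
	ppa.g.blk = 511;	/* fits in NVM_BLK_BITS (16) */
	ppa.g.pg  = 63;
	/* the whole address is now available as the scalar ppa.ppa */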
If the target does + * not do it, all blocks are marked as open, and nr_open_blocks + * represents the number of blocks in use + */ + unsigned int nr_open_blocks; /* Number of used, writable blocks */ + unsigned int nr_closed_blocks; /* Number of used, read-only blocks */ unsigned int nr_free_blocks; /* Number of unused blocks */ unsigned int nr_bad_blocks; /* Number of bad blocks */ - struct nvm_block *blocks; spinlock_t lock; + + struct nvm_block *blocks; +}; + +enum { + NVM_BLK_ST_FREE = 0x1, /* Free block */ + NVM_BLK_ST_OPEN = 0x2, /* Open block - read-write */ + NVM_BLK_ST_CLOSED = 0x4, /* Closed block - read-only */ + NVM_BLK_ST_BAD = 0x8, /* Bad block */ }; struct nvm_block { @@ -234,7 +286,16 @@ struct nvm_block { unsigned long id; void *priv; - int type; + int state; +}; + +/* system block cpu representation */ +struct nvm_sb_info { + unsigned long seqnr; + unsigned long erase_cnt; + unsigned int version; + char mmtype[NVM_MMTYPE_LEN]; + struct ppa_addr fs_ppa; }; struct nvm_dev { @@ -247,6 +308,9 @@ struct nvm_dev { struct nvmm_type *mt; void *mp; + /* System blocks */ + struct nvm_sb_info sb; + /* Device information */ int nr_chnls; int nr_planes; @@ -256,6 +320,7 @@ struct nvm_dev { int blks_per_lun; int sec_size; int oob_size; + int mccap; struct nvm_addr_format ppaf; /* Calculated/Cached values. These do not reflect the actual usable @@ -268,6 +333,10 @@ struct nvm_dev { int sec_per_blk; int sec_per_lun; + /* lower page table */ + int lps_per_blk; + int *lptbl; + unsigned long total_pages; unsigned long total_blocks; int nr_luns; @@ -280,6 +349,8 @@ struct nvm_dev { /* Backend device */ struct request_queue *q; char name[DISK_NAME_LEN]; + + struct mutex mlock; }; static inline struct ppa_addr generic_to_dev_addr(struct nvm_dev *dev, @@ -345,9 +416,13 @@ static inline struct ppa_addr block_to_ppa(struct nvm_dev *dev, return ppa; } +static inline int ppa_to_slc(struct nvm_dev *dev, int slc_pg) +{ + return dev->lptbl[slc_pg]; +} + typedef blk_qc_t (nvm_tgt_make_rq_fn)(struct request_queue *, struct bio *); typedef sector_t (nvm_tgt_capacity_fn)(void *); -typedef int (nvm_tgt_end_io_fn)(struct nvm_rq *, int); typedef void *(nvm_tgt_init_fn)(struct nvm_dev *, struct gendisk *, int, int); typedef void (nvm_tgt_exit_fn)(void *); @@ -358,7 +433,7 @@ struct nvm_tgt_type { /* target entry points */ nvm_tgt_make_rq_fn *make_rq; nvm_tgt_capacity_fn *capacity; - nvm_tgt_end_io_fn *end_io; + nvm_end_io_fn *end_io; /* module-specific init/teardown */ nvm_tgt_init_fn *init; @@ -383,7 +458,6 @@ typedef int (nvmm_open_blk_fn)(struct nvm_dev *, struct nvm_block *); typedef int (nvmm_close_blk_fn)(struct nvm_dev *, struct nvm_block *); typedef void (nvmm_flush_blk_fn)(struct nvm_dev *, struct nvm_block *); typedef int (nvmm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); -typedef int (nvmm_end_io_fn)(struct nvm_rq *, int); typedef int (nvmm_erase_blk_fn)(struct nvm_dev *, struct nvm_block *, unsigned long); typedef struct nvm_lun *(nvmm_get_lun_fn)(struct nvm_dev *, int); @@ -397,6 +471,8 @@ struct nvmm_type { nvmm_unregister_fn *unregister_mgr; /* Block administration callbacks */ + nvmm_get_blk_fn *get_blk_unlocked; + nvmm_put_blk_fn *put_blk_unlocked; nvmm_get_blk_fn *get_blk; nvmm_put_blk_fn *put_blk; nvmm_open_blk_fn *open_blk; @@ -404,7 +480,6 @@ struct nvmm_type { nvmm_flush_blk_fn *flush_blk; nvmm_submit_io_fn *submit_io; - nvmm_end_io_fn *end_io; nvmm_erase_blk_fn *erase_blk; /* Configuration management */ @@ -418,6 +493,10 @@ struct nvmm_type { extern int nvm_register_mgr(struct 
nvmm_type *); extern void nvm_unregister_mgr(struct nvmm_type *); +extern struct nvm_block *nvm_get_blk_unlocked(struct nvm_dev *, + struct nvm_lun *, unsigned long); +extern void nvm_put_blk_unlocked(struct nvm_dev *, struct nvm_block *); + extern struct nvm_block *nvm_get_blk(struct nvm_dev *, struct nvm_lun *, unsigned long); extern void nvm_put_blk(struct nvm_dev *, struct nvm_block *); @@ -427,7 +506,36 @@ extern int nvm_register(struct request_queue *, char *, extern void nvm_unregister(char *); extern int nvm_submit_io(struct nvm_dev *, struct nvm_rq *); +extern void nvm_generic_to_addr_mode(struct nvm_dev *, struct nvm_rq *); +extern void nvm_addr_to_generic_mode(struct nvm_dev *, struct nvm_rq *); +extern int nvm_set_rqd_ppalist(struct nvm_dev *, struct nvm_rq *, + struct ppa_addr *, int); +extern void nvm_free_rqd_ppalist(struct nvm_dev *, struct nvm_rq *); +extern int nvm_erase_ppa(struct nvm_dev *, struct ppa_addr *, int); extern int nvm_erase_blk(struct nvm_dev *, struct nvm_block *); +extern void nvm_end_io(struct nvm_rq *, int); +extern int nvm_submit_ppa(struct nvm_dev *, struct ppa_addr *, int, int, int, + void *, int); + +/* sysblk.c */ +#define NVM_SYSBLK_MAGIC 0x4E564D53 /* "NVMS" */ + +/* system block on disk representation */ +struct nvm_system_block { + __be32 magic; /* magic signature */ + __be32 seqnr; /* sequence number */ + __be32 erase_cnt; /* erase count */ + __be16 version; /* version number */ + u8 mmtype[NVM_MMTYPE_LEN]; /* media manager name */ + __be64 fs_ppa; /* PPA for media manager + * superblock */ +}; + +extern int nvm_get_sysblock(struct nvm_dev *, struct nvm_sb_info *); +extern int nvm_update_sysblock(struct nvm_dev *, struct nvm_sb_info *); +extern int nvm_init_sysblock(struct nvm_dev *, struct nvm_sb_info *); + +extern int nvm_dev_factory(struct nvm_dev *, int flags); #else /* CONFIG_NVM */ struct nvm_dev_ops; diff --git a/include/linux/lru_cache.h b/include/linux/lru_cache.h index 46262284de47..04fc6e6c7ff0 100644 --- a/include/linux/lru_cache.h +++ b/include/linux/lru_cache.h @@ -264,7 +264,7 @@ extern unsigned int lc_put(struct lru_cache *lc, struct lc_element *e); extern void lc_committed(struct lru_cache *lc); struct seq_file; -extern size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc); +extern void lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc); extern void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char *utext, void (*detail) (struct seq_file *, struct lc_element *)); diff --git a/include/linux/mlx4/cmd.h b/include/linux/mlx4/cmd.h index 58391f2e0414..116b284bc4ce 100644 --- a/include/linux/mlx4/cmd.h +++ b/include/linux/mlx4/cmd.h @@ -206,7 +206,8 @@ enum { MLX4_SET_PORT_GID_TABLE = 0x5, MLX4_SET_PORT_PRIO2TC = 0x8, MLX4_SET_PORT_SCHEDULER = 0x9, - MLX4_SET_PORT_VXLAN = 0xB + MLX4_SET_PORT_VXLAN = 0xB, + MLX4_SET_PORT_ROCE_ADDR = 0xD }; enum { diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index d3133be12d92..430a929f048b 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -216,6 +216,7 @@ enum { MLX4_DEV_CAP_FLAG2_SKIP_OUTER_VLAN = 1LL << 30, MLX4_DEV_CAP_FLAG2_UPDATE_QP_SRC_CHECK_LB = 1ULL << 31, MLX4_DEV_CAP_FLAG2_LB_SRC_CHK = 1ULL << 32, + MLX4_DEV_CAP_FLAG2_ROCE_V1_V2 = 1ULL << 33, }; enum { @@ -267,12 +268,14 @@ enum { MLX4_BMME_FLAG_TYPE_2_WIN = 1 << 9, MLX4_BMME_FLAG_RESERVED_LKEY = 1 << 10, MLX4_BMME_FLAG_FAST_REG_WR = 1 << 11, + MLX4_BMME_FLAG_ROCE_V1_V2 = 1 << 19, MLX4_BMME_FLAG_PORT_REMAP = 1 << 24, 
MLX4_BMME_FLAG_VSD_INIT2RTR = 1 << 28, }; enum { - MLX4_FLAG_PORT_REMAP = MLX4_BMME_FLAG_PORT_REMAP + MLX4_FLAG_PORT_REMAP = MLX4_BMME_FLAG_PORT_REMAP, + MLX4_FLAG_ROCE_V1_V2 = MLX4_BMME_FLAG_ROCE_V1_V2 }; enum mlx4_event { @@ -979,14 +982,11 @@ struct mlx4_mad_ifc { for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++) \ if ((type) == (dev)->caps.port_mask[(port)]) -#define mlx4_foreach_non_ib_transport_port(port, dev) \ - for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++) \ - if (((dev)->caps.port_mask[port] != MLX4_PORT_TYPE_IB)) - #define mlx4_foreach_ib_transport_port(port, dev) \ - for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++) \ + for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++) \ if (((dev)->caps.port_mask[port] == MLX4_PORT_TYPE_IB) || \ - ((dev)->caps.flags & MLX4_DEV_CAP_FLAG_IBOE)) + ((dev)->caps.flags & MLX4_DEV_CAP_FLAG_IBOE) || \ + ((dev)->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2)) #define MLX4_INVALID_SLAVE_ID 0xFF #define MLX4_SINK_COUNTER_INDEX(dev) (dev->caps.max_counters - 1) @@ -1457,6 +1457,7 @@ int mlx4_get_base_gid_ix(struct mlx4_dev *dev, int slave, int port); int mlx4_config_vxlan_port(struct mlx4_dev *dev, __be16 udp_port); int mlx4_disable_rx_port_check(struct mlx4_dev *dev, bool dis); +int mlx4_config_roce_v2_port(struct mlx4_dev *dev, u16 udp_port); int mlx4_virt2phy_port_map(struct mlx4_dev *dev, u32 port1, u32 port2); int mlx4_vf_smi_enabled(struct mlx4_dev *dev, int slave, int port); int mlx4_vf_get_enable_smi_admin(struct mlx4_dev *dev, int slave, int port); diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h index fe052e234906..587cdf943b52 100644 --- a/include/linux/mlx4/qp.h +++ b/include/linux/mlx4/qp.h @@ -194,7 +194,7 @@ struct mlx4_qp_context { u8 mtu_msgmax; u8 rq_size_stride; u8 sq_size_stride; - u8 rlkey; + u8 rlkey_roce_mode; __be32 usr_page; __be32 local_qpn; __be32 remote_qpn; @@ -204,7 +204,8 @@ struct mlx4_qp_context { u32 reserved1; __be32 next_send_psn; __be32 cqn_send; - u32 reserved2[2]; + __be16 roce_entropy; + __be16 reserved2[3]; __be32 last_acked_psn; __be32 ssn; __be32 params2; @@ -487,4 +488,14 @@ static inline struct mlx4_qp *__mlx4_qp_lookup(struct mlx4_dev *dev, u32 qpn) void mlx4_qp_remove(struct mlx4_dev *dev, struct mlx4_qp *qp); +static inline u16 folded_qp(u32 q) +{ + u16 res; + + res = ((q & 0xff) ^ ((q & 0xff0000) >> 16)) | (q & 0xff00); + return res; +} + +u16 mlx4_qp_roce_entropy(struct mlx4_dev *dev, u32 qpn); + #endif /* MLX4_QP_H */ diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 7be845e30689..987764afa65c 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -223,6 +223,14 @@ enum { #define MLX5_UMR_MTT_MASK (MLX5_UMR_MTT_ALIGNMENT - 1) #define MLX5_UMR_MTT_MIN_CHUNK_SIZE MLX5_UMR_MTT_ALIGNMENT +#define MLX5_USER_INDEX_LEN (MLX5_FLD_SZ_BYTES(qpc, user_index) * 8) + +enum { + MLX5_EVENT_QUEUE_TYPE_QP = 0, + MLX5_EVENT_QUEUE_TYPE_RQ = 1, + MLX5_EVENT_QUEUE_TYPE_SQ = 2, +}; + enum mlx5_event { MLX5_EVENT_TYPE_COMP = 0x0, @@ -280,6 +288,26 @@ enum { }; enum { + MLX5_ROCE_VERSION_1 = 0, + MLX5_ROCE_VERSION_2 = 2, +}; + +enum { + MLX5_ROCE_VERSION_1_CAP = 1 << MLX5_ROCE_VERSION_1, + MLX5_ROCE_VERSION_2_CAP = 1 << MLX5_ROCE_VERSION_2, +}; + +enum { + MLX5_ROCE_L3_TYPE_IPV4 = 0, + MLX5_ROCE_L3_TYPE_IPV6 = 1, +}; + +enum { + MLX5_ROCE_L3_TYPE_IPV4_CAP = 1 << 1, + MLX5_ROCE_L3_TYPE_IPV6_CAP = 1 << 2, +}; + +enum { MLX5_OPCODE_NOP = 0x00, MLX5_OPCODE_SEND_INVAL = 0x01, MLX5_OPCODE_RDMA_WRITE = 0x08, @@ -446,7 +474,7 @@ 
struct mlx5_init_seg { __be32 rsvd2[880]; __be32 internal_timer_h; __be32 internal_timer_l; - __be32 rsrv3[2]; + __be32 rsvd3[2]; __be32 health_counter; __be32 rsvd4[1019]; __be64 ieee1588_clk; @@ -460,7 +488,9 @@ struct mlx5_eqe_comp { }; struct mlx5_eqe_qp_srq { - __be32 reserved[6]; + __be32 reserved1[5]; + u8 type; + u8 reserved2[3]; __be32 qp_srq_n; }; @@ -651,6 +681,12 @@ enum { }; enum { + MLX5_CQE_ROCE_L3_HEADER_TYPE_GRH = 0x0, + MLX5_CQE_ROCE_L3_HEADER_TYPE_IPV6 = 0x1, + MLX5_CQE_ROCE_L3_HEADER_TYPE_IPV4 = 0x2, +}; + +enum { CQE_L2_OK = 1 << 0, CQE_L3_OK = 1 << 1, CQE_L4_OK = 1 << 2, diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 5162f3533042..1e3006dcf35d 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -115,6 +115,11 @@ enum { MLX5_REG_HOST_ENDIANNESS = 0x7004, }; +enum { + MLX5_ATOMIC_OPS_CMP_SWAP = 1 << 0, + MLX5_ATOMIC_OPS_FETCH_ADD = 1 << 1, +}; + enum mlx5_page_fault_resume_flags { MLX5_PAGE_FAULT_RESUME_REQUESTOR = 1 << 0, MLX5_PAGE_FAULT_RESUME_WRITE = 1 << 1, @@ -341,9 +346,11 @@ struct mlx5_core_mr { }; enum mlx5_res_type { - MLX5_RES_QP, - MLX5_RES_SRQ, - MLX5_RES_XSRQ, + MLX5_RES_QP = MLX5_EVENT_QUEUE_TYPE_QP, + MLX5_RES_RQ = MLX5_EVENT_QUEUE_TYPE_RQ, + MLX5_RES_SQ = MLX5_EVENT_QUEUE_TYPE_SQ, + MLX5_RES_SRQ = 3, + MLX5_RES_XSRQ = 4, }; struct mlx5_core_rsc_common { @@ -651,13 +658,6 @@ extern struct workqueue_struct *mlx5_core_wq; .struct_offset_bytes = offsetof(struct ib_unpacked_ ## header, field), \ .struct_size_bytes = sizeof((struct ib_unpacked_ ## header *)0)->field -struct ib_field { - size_t struct_offset_bytes; - size_t struct_size_bytes; - int offset_bits; - int size_bits; -}; - static inline struct mlx5_core_dev *pci2mlx5_core_dev(struct pci_dev *pdev) { return pci_get_drvdata(pdev); diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 68d73f82e009..231ab6bcea76 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -67,6 +67,11 @@ enum { }; enum { + MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE = 0x0, + MLX5_SET_HCA_CAP_OP_MOD_ATOMIC = 0x3, +}; + +enum { MLX5_CMD_OP_QUERY_HCA_CAP = 0x100, MLX5_CMD_OP_QUERY_ADAPTER = 0x101, MLX5_CMD_OP_INIT_HCA = 0x102, @@ -573,21 +578,24 @@ enum { struct mlx5_ifc_atomic_caps_bits { u8 reserved_0[0x40]; - u8 atomic_req_endianness[0x1]; - u8 reserved_1[0x1f]; + u8 atomic_req_8B_endianess_mode[0x2]; + u8 reserved_1[0x4]; + u8 supported_atomic_req_8B_endianess_mode_1[0x1]; - u8 reserved_2[0x20]; + u8 reserved_2[0x19]; - u8 reserved_3[0x10]; - u8 atomic_operations[0x10]; + u8 reserved_3[0x20]; u8 reserved_4[0x10]; - u8 atomic_size_qp[0x10]; + u8 atomic_operations[0x10]; u8 reserved_5[0x10]; + u8 atomic_size_qp[0x10]; + + u8 reserved_6[0x10]; u8 atomic_size_dc[0x10]; - u8 reserved_6[0x720]; + u8 reserved_7[0x720]; }; struct mlx5_ifc_odp_cap_bits { @@ -850,7 +858,8 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_66[0x8]; u8 log_uar_page_sz[0x10]; - u8 reserved_67[0x40]; + u8 reserved_67[0x20]; + u8 device_frequency_mhz[0x20]; u8 device_frequency_khz[0x20]; u8 reserved_68[0x5f]; u8 cqe_zip[0x1]; @@ -2215,19 +2224,25 @@ struct mlx5_ifc_nic_vport_context_bits { u8 mtu[0x10]; - u8 reserved_3[0x640]; + u8 system_image_guid[0x40]; + u8 port_guid[0x40]; + u8 node_guid[0x40]; + + u8 reserved_3[0x140]; + u8 qkey_violation_counter[0x10]; + u8 reserved_4[0x430]; u8 promisc_uc[0x1]; u8 promisc_mc[0x1]; u8 promisc_all[0x1]; - u8 reserved_4[0x2]; + u8 reserved_5[0x2]; u8 allowed_list_type[0x3]; - u8 reserved_5[0xc]; + u8 
reserved_6[0xc]; u8 allowed_list_size[0xc]; struct mlx5_ifc_mac_address_layout_bits permanent_address; - u8 reserved_6[0x20]; + u8 reserved_7[0x20]; u8 current_uc_mac_address[0][0x40]; }; @@ -4199,6 +4214,13 @@ struct mlx5_ifc_modify_tis_out_bits { u8 reserved_1[0x40]; }; +struct mlx5_ifc_modify_tis_bitmask_bits { + u8 reserved_0[0x20]; + + u8 reserved_1[0x1f]; + u8 prio[0x1]; +}; + struct mlx5_ifc_modify_tis_in_bits { u8 opcode[0x10]; u8 reserved_0[0x10]; @@ -4211,7 +4233,7 @@ struct mlx5_ifc_modify_tis_in_bits { u8 reserved_3[0x20]; - u8 modify_bitmask[0x40]; + struct mlx5_ifc_modify_tis_bitmask_bits bitmask; u8 reserved_4[0x40]; diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index f079fb1a31f7..5b8c89ffaa58 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -85,7 +85,16 @@ enum mlx5_qp_state { MLX5_QP_STATE_ERR = 6, MLX5_QP_STATE_SQ_DRAINING = 7, MLX5_QP_STATE_SUSPENDED = 9, - MLX5_QP_NUM_STATE + MLX5_QP_NUM_STATE, + MLX5_QP_STATE, + MLX5_QP_STATE_BAD, +}; + +enum { + MLX5_SQ_STATE_NA = MLX5_SQC_STATE_ERR + 1, + MLX5_SQ_NUM_STATE = MLX5_SQ_STATE_NA + 1, + MLX5_RQ_STATE_NA = MLX5_RQC_STATE_ERR + 1, + MLX5_RQ_NUM_STATE = MLX5_RQ_STATE_NA + 1, }; enum { @@ -130,6 +139,9 @@ enum { MLX5_QP_BIT_RWE = 1 << 14, MLX5_QP_BIT_RAE = 1 << 13, MLX5_QP_BIT_RIC = 1 << 4, + MLX5_QP_BIT_CC_SLAVE_RECV = 1 << 2, + MLX5_QP_BIT_CC_SLAVE_SEND = 1 << 1, + MLX5_QP_BIT_CC_MASTER = 1 << 0 }; enum { @@ -248,8 +260,12 @@ struct mlx5_av { __be32 dqp_dct; u8 stat_rate_sl; u8 fl_mlid; - __be16 rlid; - u8 reserved0[10]; + union { + __be16 rlid; + __be16 udp_sport; + }; + u8 reserved0[4]; + u8 rmac[6]; u8 tclass; u8 hop_limit; __be32 grh_gid_fl; @@ -456,11 +472,16 @@ struct mlx5_qp_path { u8 static_rate; u8 hop_limit; __be32 tclass_flowlabel; - u8 rgid[16]; - u8 rsvd1[4]; - u8 sl; + union { + u8 rgid[16]; + u8 rip[16]; + }; + u8 f_dscp_ecn_prio; + u8 ecn_dscp; + __be16 udp_sport; + u8 dci_cfi_prio_sl; u8 port; - u8 rsvd2[6]; + u8 rmac[6]; }; struct mlx5_qp_context { @@ -620,8 +641,7 @@ int mlx5_core_create_qp(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp, struct mlx5_create_qp_mbox_in *in, int inlen); -int mlx5_core_qp_modify(struct mlx5_core_dev *dev, enum mlx5_qp_state cur_state, - enum mlx5_qp_state new_state, +int mlx5_core_qp_modify(struct mlx5_core_dev *dev, u16 operation, struct mlx5_modify_qp_mbox_in *in, int sqd_event, struct mlx5_core_qp *qp); int mlx5_core_destroy_qp(struct mlx5_core_dev *dev, @@ -639,6 +659,14 @@ void mlx5_debug_qp_remove(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp); int mlx5_core_page_fault_resume(struct mlx5_core_dev *dev, u32 qpn, u8 context, int error); #endif +int mlx5_core_create_rq_tracked(struct mlx5_core_dev *dev, u32 *in, int inlen, + struct mlx5_core_qp *rq); +void mlx5_core_destroy_rq_tracked(struct mlx5_core_dev *dev, + struct mlx5_core_qp *rq); +int mlx5_core_create_sq_tracked(struct mlx5_core_dev *dev, u32 *in, int inlen, + struct mlx5_core_qp *sq); +void mlx5_core_destroy_sq_tracked(struct mlx5_core_dev *dev, + struct mlx5_core_qp *sq); static inline const char *mlx5_qp_type_str(int type) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/transobj.h b/include/linux/mlx5/transobj.h index 74cae51436e4..88441f5ece25 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/transobj.h +++ b/include/linux/mlx5/transobj.h @@ -33,16 +33,20 @@ #ifndef __TRANSOBJ_H__ #define __TRANSOBJ_H__ -int mlx5_alloc_transport_domain(struct mlx5_core_dev *dev, u32 *tdn); -void mlx5_dealloc_transport_domain(struct mlx5_core_dev *dev, u32 tdn); 
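The transobj.h hunk in progress here moves the header from drivers/net/ethernet/mellanox/mlx5/core/ to include/linux/mlx5/ and gives the transport-domain helpers a mlx5_core_ prefix; the removed prototypes above reappear under their new names below. A minimal sketch of a caller updated for the rename (the example_ wrappers are hypothetical, shown only to illustrate the new entry points):

/* Hedged sketch: transport-domain setup/teardown with the renamed API. */
static int example_create_td(struct mlx5_core_dev *mdev, u32 *tdn)
{
	/* was mlx5_alloc_transport_domain() before this rename */
	return mlx5_core_alloc_transport_domain(mdev, tdn);
}

static void example_destroy_td(struct mlx5_core_dev *mdev, u32 tdn)
{
	/* was mlx5_dealloc_transport_domain() before this rename */
	mlx5_core_dealloc_transport_domain(mdev, tdn);
}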
+#include <linux/mlx5/driver.h> + +int mlx5_core_alloc_transport_domain(struct mlx5_core_dev *dev, u32 *tdn); +void mlx5_core_dealloc_transport_domain(struct mlx5_core_dev *dev, u32 tdn); int mlx5_core_create_rq(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *rqn); int mlx5_core_modify_rq(struct mlx5_core_dev *dev, u32 rqn, u32 *in, int inlen); void mlx5_core_destroy_rq(struct mlx5_core_dev *dev, u32 rqn); +int mlx5_core_query_rq(struct mlx5_core_dev *dev, u32 rqn, u32 *out); int mlx5_core_create_sq(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *sqn); int mlx5_core_modify_sq(struct mlx5_core_dev *dev, u32 sqn, u32 *in, int inlen); void mlx5_core_destroy_sq(struct mlx5_core_dev *dev, u32 sqn); +int mlx5_core_query_sq(struct mlx5_core_dev *dev, u32 sqn, u32 *out); int mlx5_core_create_tir(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *tirn); int mlx5_core_modify_tir(struct mlx5_core_dev *dev, u32 tirn, u32 *in, @@ -50,6 +54,8 @@ int mlx5_core_modify_tir(struct mlx5_core_dev *dev, u32 tirn, u32 *in, void mlx5_core_destroy_tir(struct mlx5_core_dev *dev, u32 tirn); int mlx5_core_create_tis(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *tisn); +int mlx5_core_modify_tis(struct mlx5_core_dev *dev, u32 tisn, u32 *in, + int inlen); void mlx5_core_destroy_tis(struct mlx5_core_dev *dev, u32 tisn); int mlx5_core_create_rmp(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *rmpn); diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index 638f2ca7a527..123771003e68 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -45,6 +45,11 @@ int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev, u16 vport, u8 *addr); int mlx5_modify_nic_vport_mac_address(struct mlx5_core_dev *dev, u16 vport, u8 *addr); +int mlx5_query_nic_vport_system_image_guid(struct mlx5_core_dev *mdev, + u64 *system_image_guid); +int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, u64 *node_guid); +int mlx5_query_nic_vport_qkey_viol_cntr(struct mlx5_core_dev *mdev, + u16 *qkey_viol_cntr); int mlx5_query_hca_vport_gid(struct mlx5_core_dev *dev, u8 other_vport, u8 port_num, u16 vf_num, u16 gid_index, union ib_gid *gid); @@ -85,4 +90,7 @@ int mlx5_modify_nic_vport_vlans(struct mlx5_core_dev *dev, u16 vlans[], int list_size); +int mlx5_nic_vport_enable_roce(struct mlx5_core_dev *mdev); +int mlx5_nic_vport_disable_roce(struct mlx5_core_dev *mdev); + #endif /* __MLX5_VPORT_H__ */ diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 3af5f454c04a..a55986f6fe38 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -17,20 +17,19 @@ #include <linux/types.h> -struct nvme_bar { - __u64 cap; /* Controller Capabilities */ - __u32 vs; /* Version */ - __u32 intms; /* Interrupt Mask Set */ - __u32 intmc; /* Interrupt Mask Clear */ - __u32 cc; /* Controller Configuration */ - __u32 rsvd1; /* Reserved */ - __u32 csts; /* Controller Status */ - __u32 nssr; /* Subsystem Reset */ - __u32 aqa; /* Admin Queue Attributes */ - __u64 asq; /* Admin SQ Base Address */ - __u64 acq; /* Admin CQ Base Address */ - __u32 cmbloc; /* Controller Memory Buffer Location */ - __u32 cmbsz; /* Controller Memory Buffer Size */ +enum { + NVME_REG_CAP = 0x0000, /* Controller Capabilities */ + NVME_REG_VS = 0x0008, /* Version */ + NVME_REG_INTMS = 0x000c, /* Interrupt Mask Set */ + NVME_REG_INTMC = 0x0010, /* Interrupt Mask Clear */ + NVME_REG_CC = 0x0014, /* Controller Configuration */ + NVME_REG_CSTS = 0x001c, /* Controller Status */ + NVME_REG_NSSR = 0x0020, /* NVM Subsystem Reset */ + NVME_REG_AQA = 0x0024, /* Admin Queue Attributes */ + NVME_REG_ASQ = 0x0028, /* Admin SQ Base Address */ + NVME_REG_ACQ = 0x0030, /* Admin CQ Base Address */ + NVME_REG_CMBLOC = 0x0038, /* Controller Memory Buffer Location */ + NVME_REG_CMBSZ = 0x003c, /* Controller Memory Buffer Size */ };
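With struct nvme_bar gone, drivers address the BAR through these byte offsets directly instead of dereferencing struct fields. A hedged sketch of reading a few controller registers under the new layout, assuming a bar pointer previously obtained from ioremap() (the dump helper itself is illustrative, not part of this patch):

/* Hedged sketch: offset-based register access replacing struct nvme_bar. */
static void example_dump_nvme_regs(void __iomem *bar)
{
	u64 cap;
	u32 vs, csts;

	/* 64-bit CAP read as two 32-bit halves to stay portable */
	cap = readl(bar + NVME_REG_CAP) |
	      ((u64)readl(bar + NVME_REG_CAP + 4) << 32);
	vs = readl(bar + NVME_REG_VS);
	csts = readl(bar + NVME_REG_CSTS);

	pr_info("nvme: MQES %llu, VS %#x, CSTS %#x\n",
		(unsigned long long)NVME_CAP_MQES(cap) + 1, vs, csts);
}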
#define NVME_CAP_MQES(cap) ((cap) & 0xffff) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 4d08b6c33557..92395a0a7dc5 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -361,6 +361,9 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start, unsigned int nr_pages, struct page **pages); unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, int tag, unsigned int nr_pages, struct page **pages); +unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, + int tag, unsigned int nr_entries, + struct page **entries, pgoff_t *indices); struct page *grab_cache_page_write_begin(struct address_space *mapping, pgoff_t index, unsigned flags); diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index eb8b8ac6df3c..24f5470d3944 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -42,6 +42,7 @@ struct pipe_buffer { * @fasync_readers: reader side fasync * @fasync_writers: writer side fasync * @bufs: the circular array of pipe buffers + * @user: the user who created this pipe **/ struct pipe_inode_info { struct mutex mutex; @@ -57,6 +58,7 @@ struct pipe_inode_info { struct fasync_struct *fasync_readers; struct fasync_struct *fasync_writers; struct pipe_buffer *bufs; + struct user_struct *user; }; /* @@ -123,6 +125,8 @@ void pipe_unlock(struct pipe_inode_info *); void pipe_double_lock(struct pipe_inode_info *, struct pipe_inode_info *); extern unsigned int pipe_max_size, pipe_min_size; +extern unsigned long pipe_user_pages_hard; +extern unsigned long pipe_user_pages_soft; int pipe_proc_fn(struct ctl_table *, int, void __user *, size_t *, loff_t *); diff --git a/include/linux/pmem.h b/include/linux/pmem.h index acfea8ce4a07..7c3d11a6b4ad 100644 --- a/include/linux/pmem.h +++ b/include/linux/pmem.h @@ -53,12 +53,18 @@ static inline void arch_clear_pmem(void __pmem *addr, size_t size) { BUG(); } + +static inline void arch_wb_cache_pmem(void __pmem *addr, size_t size) +{ + BUG(); +} #endif /* * Architectures that define ARCH_HAS_PMEM_API must provide * implementations for arch_memcpy_to_pmem(), arch_wmb_pmem(), - * arch_copy_from_iter_pmem(), arch_clear_pmem() and arch_has_wmb_pmem(). + * arch_copy_from_iter_pmem(), arch_clear_pmem(), arch_wb_cache_pmem() + * and arch_has_wmb_pmem(). */ static inline void memcpy_from_pmem(void *dst, void __pmem const *src, size_t size) { @@ -178,4 +184,18 @@ static inline void clear_pmem(void __pmem *addr, size_t size) else default_clear_pmem(addr, size); } + +/** + * wb_cache_pmem - write back processor cache for PMEM memory range + * @addr: virtual start address + * @size: number of bytes to write back + * + * Write back the processor cache range starting at 'addr' for 'size' bytes. + * This function requires explicit ordering with a wmb_pmem() call. + */ +static inline void wb_cache_pmem(void __pmem *addr, size_t size) +{ + if (arch_has_pmem_api()) + arch_wb_cache_pmem(addr, size); +} #endif /* __PMEM_H__ */
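As the kernel-doc above stresses, wb_cache_pmem() only starts the write-back and provides no ordering by itself. A plausible caller, sketched under the assumption that ARCH_HAS_PMEM_API is implemented, pairs it with wmb_pmem() before treating the data as durable (the record layout is hypothetical):

/* Hedged sketch: flush several pmem ranges, then order with wmb_pmem(). */
static void example_commit_records(void __pmem *base, size_t rec_size, int nr)
{
	int i;

	for (i = 0; i < nr; i++)	/* start write-back per record */
		wb_cache_pmem(base + i * rec_size, rec_size);

	wmb_pmem();	/* records may be considered durable only after this */
}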
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 57e7d87d2d4c..7c88ad156a29 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -51,6 +51,15 @@ #define RADIX_TREE_EXCEPTIONAL_ENTRY 2 #define RADIX_TREE_EXCEPTIONAL_SHIFT 2 +#define RADIX_DAX_MASK 0xf +#define RADIX_DAX_SHIFT 4 +#define RADIX_DAX_PTE (0x4 | RADIX_TREE_EXCEPTIONAL_ENTRY) +#define RADIX_DAX_PMD (0x8 | RADIX_TREE_EXCEPTIONAL_ENTRY) +#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_MASK) +#define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT)) +#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \ + RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE))) + static inline int radix_tree_is_indirect_ptr(void *ptr) { return (int)((unsigned long)ptr & RADIX_TREE_INDIRECT_PTR); diff --git a/include/linux/sched.h b/include/linux/sched.h index f1e81e128592..a10494a94cc3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -835,6 +835,7 @@ struct user_struct { #endif unsigned long locked_shm; /* How many pages of mlocked shm ? */ unsigned long unix_inflight; /* How many files in flight in unix sockets */ + atomic_long_t pipe_bufs; /* how many pages are allocated in pipe buffers */ #ifdef CONFIG_KEYS struct key *uid_keyring; /* UID specific keyring */ diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index a43f41cb3c43..4d4780c00d34 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -15,10 +15,7 @@ struct shmem_inode_info { unsigned int seals; /* shmem seals */ unsigned long flags; unsigned long alloced; /* data pages alloced to file */ - union { - unsigned long swapped; /* subtotal assigned to swap */ - char *symlink; /* unswappable short symlink */ - }; + unsigned long swapped; /* subtotal assigned to swap */ struct shared_policy policy; /* NUMA memory alloc policy */ struct list_head swaplist; /* chain of maybes on swap */ struct simple_xattrs xattrs; /* list of xattrs */ diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index f869807a0d0e..5322fea6fe4c 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -51,6 +51,7 @@ /* RPC/RDMA parameters and stats */ extern unsigned int svcrdma_ord; extern unsigned int svcrdma_max_requests; +extern unsigned int svcrdma_max_bc_requests; extern unsigned int svcrdma_max_req_size; extern atomic_t rdma_stat_recv; @@ -69,6 +70,7 @@ extern atomic_t rdma_stat_sq_prod; * completes.
*/ struct svc_rdma_op_ctxt { + struct list_head free; struct svc_rdma_op_ctxt *read_hdr; struct svc_rdma_fastreg_mr *frmr; int hdr_count; @@ -112,6 +114,7 @@ struct svc_rdma_fastreg_mr { struct list_head frmr_list; }; struct svc_rdma_req_map { + struct list_head free; unsigned long count; union { struct kvec sge[RPCSVC_MAXPAGES]; @@ -132,28 +135,32 @@ struct svcxprt_rdma { int sc_max_sge; int sc_max_sge_rd; /* max sge for read target */ - int sc_sq_depth; /* Depth of SQ */ atomic_t sc_sq_count; /* Number of SQ WR on queue */ - - int sc_max_requests; /* Depth of RQ */ + unsigned int sc_sq_depth; /* Depth of SQ */ + unsigned int sc_rq_depth; /* Depth of RQ */ + u32 sc_max_requests; /* Forward credits */ + u32 sc_max_bc_requests;/* Backward credits */ int sc_max_req_size; /* Size of each RQ WR buf */ struct ib_pd *sc_pd; atomic_t sc_dma_used; - atomic_t sc_ctxt_used; + spinlock_t sc_ctxt_lock; + struct list_head sc_ctxts; + int sc_ctxt_used; + spinlock_t sc_map_lock; + struct list_head sc_maps; + struct list_head sc_rq_dto_q; spinlock_t sc_rq_dto_lock; struct ib_qp *sc_qp; struct ib_cq *sc_rq_cq; struct ib_cq *sc_sq_cq; - struct ib_mr *sc_phys_mr; /* MR for server memory */ int (*sc_reader)(struct svcxprt_rdma *, struct svc_rqst *, struct svc_rdma_op_ctxt *, int *, u32 *, u32, u32, u64, bool); u32 sc_dev_caps; /* distilled device caps */ - u32 sc_dma_lkey; /* local dma key */ unsigned int sc_frmr_pg_list_len; struct list_head sc_frmr_q; spinlock_t sc_frmr_q_lock; @@ -179,8 +186,18 @@ struct svcxprt_rdma { #define RPCRDMA_MAX_REQUESTS 32 #define RPCRDMA_MAX_REQ_SIZE 4096 +/* Typical ULP usage of BC requests is NFSv4.1 backchannel. Our + * current NFSv4.1 implementation supports one backchannel slot. + */ +#define RPCRDMA_MAX_BC_REQUESTS 2 + #define RPCSVC_MAXPAYLOAD_RDMA RPCSVC_MAXPAYLOAD +/* svc_rdma_backchannel.c */ +extern int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, + struct rpcrdma_msg *rmsgp, + struct xdr_buf *rcvbuf); + /* svc_rdma_marshal.c */ extern int svc_rdma_xdr_decode_req(struct rpcrdma_msg **, struct svc_rqst *); extern int svc_rdma_xdr_encode_error(struct svcxprt_rdma *, @@ -206,6 +223,8 @@ extern int rdma_read_chunk_frmr(struct svcxprt_rdma *, struct svc_rqst *, u32, u32, u64, bool); /* svc_rdma_sendto.c */ +extern int svc_rdma_map_xdr(struct svcxprt_rdma *, struct xdr_buf *, + struct svc_rdma_req_map *); extern int svc_rdma_sendto(struct svc_rqst *); extern struct rpcrdma_read_chunk * svc_rdma_get_read_chunk(struct rpcrdma_msg *); @@ -214,13 +233,14 @@ extern struct rpcrdma_read_chunk * extern int svc_rdma_send(struct svcxprt_rdma *, struct ib_send_wr *); extern void svc_rdma_send_error(struct svcxprt_rdma *, struct rpcrdma_msg *, enum rpcrdma_errcode); -extern int svc_rdma_post_recv(struct svcxprt_rdma *); +extern int svc_rdma_post_recv(struct svcxprt_rdma *, gfp_t); extern int svc_rdma_create_listen(struct svc_serv *, int, struct sockaddr *); extern struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *); extern void svc_rdma_put_context(struct svc_rdma_op_ctxt *, int); extern void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt); -extern struct svc_rdma_req_map *svc_rdma_get_req_map(void); -extern void svc_rdma_put_req_map(struct svc_rdma_req_map *); +extern struct svc_rdma_req_map *svc_rdma_get_req_map(struct svcxprt_rdma *); +extern void svc_rdma_put_req_map(struct svcxprt_rdma *, + struct svc_rdma_req_map *); extern struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *); extern void svc_rdma_put_frmr(struct svcxprt_rdma *, struct 
svc_rdma_fastreg_mr *); @@ -234,6 +254,7 @@ extern struct svc_xprt_class svc_rdma_bc_class; #endif /* svc_rdma.c */ +extern struct workqueue_struct *svc_rdma_wq; extern int svc_rdma_init(void); extern void svc_rdma_cleanup(void); diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 613c29bd6baf..e13a1ace50e9 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -43,6 +43,9 @@ /* Default weight of a bound cooling device */ #define THERMAL_WEIGHT_DEFAULT 0 +/* use a value below 0K to indicate an invalid/uninitialized temperature */ +#define THERMAL_TEMP_INVALID -274000 + /* Unit conversion macros */ #define DECI_KELVIN_TO_CELSIUS(t) ({ \ long _t = (t); \ @@ -167,6 +170,7 @@ struct thermal_attr { * @forced_passive: If > 0, temperature at which to switch on all ACPI * processor cooling devices. Currently only used by the * step-wise governor. + * @need_update: if set to 1, thermal_zone_device_update needs to be invoked. * @ops: operations this &thermal_zone_device supports * @tzp: thermal zone parameters * @governor: pointer to the governor for this thermal zone @@ -194,6 +198,7 @@ struct thermal_zone_device { int emul_temperature; int passive; unsigned int forced_passive; + atomic_t need_update; struct thermal_zone_device_ops *ops; struct thermal_zone_params *tzp; struct thermal_governor *governor; diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h index 11528591d0d7..c34c9002460c 100644 --- a/include/rdma/ib_addr.h +++ b/include/rdma/ib_addr.h @@ -83,6 +83,8 @@ struct rdma_dev_addr { int bound_dev_if; enum rdma_transport_type transport; struct net *net; + enum rdma_network_type network; + int hoplimit; }; /** @@ -91,8 +93,8 @@ struct rdma_dev_addr { * * The dev_addr->net field must be initialized. */ -int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr, - u16 *vlan_id); +int rdma_translate_ip(const struct sockaddr *addr, + struct rdma_dev_addr *dev_addr, u16 *vlan_id); /** * rdma_resolve_ip - Resolve source and destination IP addresses to @@ -117,6 +119,10 @@ int rdma_resolve_ip(struct rdma_addr_client *client, struct rdma_dev_addr *addr, void *context), void *context); +int rdma_resolve_ip_route(struct sockaddr *src_addr, + const struct sockaddr *dst_addr, + struct rdma_dev_addr *addr); + void rdma_addr_cancel(struct rdma_dev_addr *addr); int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev, @@ -125,8 +131,10 @@ int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev, int rdma_addr_size(struct sockaddr *addr); int rdma_addr_find_smac_by_sgid(union ib_gid *sgid, u8 *smac, u16 *vlan_id); -int rdma_addr_find_dmac_by_grh(const union ib_gid *sgid, const union ib_gid *dgid, - u8 *smac, u16 *vlan_id, int if_index); +int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, + const union ib_gid *dgid, + u8 *smac, u16 *vlan_id, int *if_index, + int *hoplimit); static inline u16 ib_addr_get_pkey(struct rdma_dev_addr *dev_addr) {
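rdma_addr_find_dmac_by_grh() is replaced by rdma_addr_find_l2_eth_by_grh(), which additionally returns the egress ifindex and hop limit that RoCE v2 needs when building the encapsulating IP header. A hedged sketch of a caller resolving L2 details for a destination GID (the enclosing function and its logging are illustrative):

/* Hedged sketch: resolve Ethernet L2 details for a RoCE destination. */
static int example_resolve_l2(const union ib_gid *sgid,
			      const union ib_gid *dgid)
{
	u8 smac[ETH_ALEN];
	u16 vlan_id = 0xffff;
	int if_index = 0;
	int hoplimit = 0xff;
	int ret;

	ret = rdma_addr_find_l2_eth_by_grh(sgid, dgid, smac, &vlan_id,
					   &if_index, &hoplimit);
	if (ret)
		return ret;

	pr_debug("smac %pM vlan %u ifindex %d hoplimit %d\n",
		 smac, vlan_id, if_index, hoplimit);
	return 0;
}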
diff --git a/include/rdma/ib_cache.h b/include/rdma/ib_cache.h index 269a27cf0a46..e30f19bd4a41 100644 --- a/include/rdma/ib_cache.h +++ b/include/rdma/ib_cache.h @@ -60,6 +60,7 @@ int ib_get_cached_gid(struct ib_device *device, * a specified GID value occurs. * @device: The device to query. * @gid: The GID value to search for. + * @gid_type: The GID type to search for. * @ndev: In RoCE, the net device of the device. NULL means ignore. * @port_num: The port number of the device where the GID value was found. * @index: The index into the cached GID table where the GID was found. This @@ -70,6 +71,7 @@ int ib_get_cached_gid(struct ib_device *device, */ int ib_find_cached_gid(struct ib_device *device, const union ib_gid *gid, + enum ib_gid_type gid_type, struct net_device *ndev, u8 *port_num, u16 *index); @@ -79,6 +81,7 @@ int ib_find_cached_gid(struct ib_device *device, * GID value occurs * @device: The device to query. * @gid: The GID value to search for. + * @gid_type: The GID type to search for. * @port_num: The port number of the device where the GID value should be * searched. * @ndev: In RoCE, the net device of the device. Null means ignore. @@ -90,6 +93,7 @@ int ib_find_cached_gid, */ int ib_find_cached_gid_by_port(struct ib_device *device, const union ib_gid *gid, + enum ib_gid_type gid_type, u8 port_num, struct net_device *ndev, u16 *index); diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h index ec9b44dd3d80..0ff049bd9ad4 100644 --- a/include/rdma/ib_mad.h +++ b/include/rdma/ib_mad.h @@ -438,6 +438,7 @@ typedef void (*ib_mad_snoop_handler)(struct ib_mad_agent *mad_agent, /** * ib_mad_recv_handler - callback handler for a received MAD. * @mad_agent: MAD agent requesting the received MAD. + * @send_buf: Send buffer if found, else NULL * @mad_recv_wc: Received work completion information on the received MAD. * * MADs received in response to a send request operation will be handed to @@ -447,6 +448,7 @@ typedef void (*ib_mad_snoop_handler)(struct ib_mad_agent *mad_agent, * modify the data referenced by @mad_recv_wc. */ typedef void (*ib_mad_recv_handler)(struct ib_mad_agent *mad_agent, + struct ib_mad_send_buf *send_buf, struct ib_mad_recv_wc *mad_recv_wc); /** diff --git a/include/rdma/ib_pack.h b/include/rdma/ib_pack.h index e99d8f9a4551..0f3daae44bf9 100644 --- a/include/rdma/ib_pack.h +++ b/include/rdma/ib_pack.h @@ -41,6 +41,8 @@ enum { IB_ETH_BYTES = 14, IB_VLAN_BYTES = 4, IB_GRH_BYTES = 40, + IB_IP4_BYTES = 20, + IB_UDP_BYTES = 8, IB_BTH_BYTES = 12, IB_DETH_BYTES = 8 }; @@ -223,6 +225,27 @@ struct ib_unpacked_eth { __be16 type; }; +struct ib_unpacked_ip4 { + u8 ver; + u8 hdr_len; + u8 tos; + __be16 tot_len; + __be16 id; + __be16 frag_off; + u8 ttl; + u8 protocol; + __sum16 check; + __be32 saddr; + __be32 daddr; +}; + +struct ib_unpacked_udp { + __be16 sport; + __be16 dport; + __be16 length; + __be16 csum; +}; + struct ib_unpacked_vlan { __be16 tag; __be16 type; }; @@ -237,6 +260,10 @@ struct ib_ud_header { struct ib_unpacked_vlan vlan; int grh_present; struct ib_unpacked_grh grh; + int ipv4_present; + struct ib_unpacked_ip4 ip4; + int udp_present; + struct ib_unpacked_udp udp; struct ib_unpacked_bth bth; struct ib_unpacked_deth deth; int immediate_present; @@ -253,13 +280,17 @@ void ib_unpack(const struct ib_field *desc, void *buf, void *structure); -void ib_ud_header_init(int payload_bytes, - int lrh_present, - int eth_present, - int vlan_present, - int grh_present, - int immediate_present, - struct ib_ud_header *header); +__sum16 ib_ud_ip4_csum(struct ib_ud_header *header); + +int ib_ud_header_init(int payload_bytes, + int lrh_present, + int eth_present, + int vlan_present, + int grh_present, + int ip_version, + int udp_present, + int immediate_present, + struct ib_ud_header *header); int ib_ud_header_pack(struct ib_ud_header *header, void *buf);
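ib_ud_header_init() now takes ip_version and udp_present and returns an error code, so UD header construction can describe RoCE v2 packets (IPv4 or IPv6 plus UDP) as well as the classic GRH layout. A hedged sketch of initializing an IPv4 RoCE v2 header; the argument choices are illustrative, and a real driver fills the remaining ip4/udp fields from its address-resolution state:

/* Hedged sketch: set up a RoCE v2 IPv4/UDP UD header. */
static int example_init_roce_v2_header(int payload_bytes,
				       struct ib_ud_header *hdr)
{
	int ret;

	ret = ib_ud_header_init(payload_bytes,
				0,	/* lrh_present: none on Ethernet */
				1,	/* eth_present */
				0,	/* vlan_present */
				0,	/* grh_present: IPv4 used instead */
				4,	/* ip_version */
				1,	/* udp_present */
				0,	/* immediate_present */
				hdr);
	if (ret)
		return ret;

	hdr->udp.dport = htons(ROCE_V2_UDP_DPORT);
	/* compute the IPv4 checksum once the other ip4 fields are set */
	hdr->ip4.check = ib_ud_ip4_csum(hdr);
	return 0;
}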
diff --git a/include/rdma/ib_pma.h b/include/rdma/ib_pma.h index a5889f18807b..2f8a65c1fca7 100644 --- a/include/rdma/ib_pma.h +++ b/include/rdma/ib_pma.h @@ -42,6 +42,7 @@ */ #define IB_PMA_CLASS_CAP_ALLPORTSELECT cpu_to_be16(1 << 8) #define IB_PMA_CLASS_CAP_EXT_WIDTH cpu_to_be16(1 << 9) +#define IB_PMA_CLASS_CAP_EXT_WIDTH_NOIETF cpu_to_be16(1 << 10) #define IB_PMA_CLASS_CAP_XMIT_WAIT cpu_to_be16(1 << 12) #define IB_PMA_CLASS_PORT_INFO cpu_to_be16(0x0001) diff --git a/include/rdma/ib_sa.h b/include/rdma/ib_sa.h index 301969552d0a..cdc1c81aa275 100644 --- a/include/rdma/ib_sa.h +++ b/include/rdma/ib_sa.h @@ -160,6 +160,7 @@ struct ib_sa_path_rec { int ifindex; /* ignored in IB */ struct net *net; + enum ib_gid_type gid_type; }; static inline struct net_device *ib_get_ndev_from_path(struct ib_sa_path_rec *rec) @@ -402,6 +403,8 @@ int ib_sa_get_mcmember_rec(struct ib_device *device, u8 port_num, */ int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num, struct ib_sa_mcmember_rec *rec, + struct net_device *ndev, + enum ib_gid_type gid_type, struct ib_ah_attr *ah_attr); /** diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 120da1d7f57e..284b00c8fea4 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -49,13 +49,19 @@ #include <linux/scatterlist.h> #include <linux/workqueue.h> #include <linux/socket.h> +#include <linux/irq_poll.h> #include <uapi/linux/if_ether.h> +#include <net/ipv6.h> +#include <net/ip.h> +#include <linux/string.h> +#include <linux/slab.h> #include <linux/atomic.h> #include <linux/mmu_notifier.h> #include <asm/uaccess.h> extern struct workqueue_struct *ib_wq; +extern struct workqueue_struct *ib_comp_wq; union ib_gid { u8 raw[16]; @@ -67,7 +73,17 @@ union ib_gid { extern union ib_gid zgid; +enum ib_gid_type { + /* If link layer is Ethernet, this is RoCE V1 */ + IB_GID_TYPE_IB = 0, + IB_GID_TYPE_ROCE = 0, + IB_GID_TYPE_ROCE_UDP_ENCAP = 1, + IB_GID_TYPE_SIZE +}; + +#define ROCE_V2_UDP_DPORT 4791 struct ib_gid_attr { + enum ib_gid_type gid_type; struct net_device *ndev; }; @@ -98,6 +114,35 @@ enum rdma_protocol_type { __attribute_const__ enum rdma_transport_type rdma_node_get_transport(enum rdma_node_type node_type); +enum rdma_network_type { + RDMA_NETWORK_IB, + RDMA_NETWORK_ROCE_V1 = RDMA_NETWORK_IB, + RDMA_NETWORK_IPV4, + RDMA_NETWORK_IPV6 +}; + +static inline enum ib_gid_type ib_network_to_gid_type(enum rdma_network_type network_type) +{ + if (network_type == RDMA_NETWORK_IPV4 || + network_type == RDMA_NETWORK_IPV6) + return IB_GID_TYPE_ROCE_UDP_ENCAP; + + /* IB_GID_TYPE_IB same as RDMA_NETWORK_ROCE_V1 */ + return IB_GID_TYPE_IB; +} + +static inline enum rdma_network_type ib_gid_to_network_type(enum ib_gid_type gid_type, + union ib_gid *gid) +{ + if (gid_type == IB_GID_TYPE_IB) + return RDMA_NETWORK_IB; + + if (ipv6_addr_v4mapped((struct in6_addr *)gid)) + return RDMA_NETWORK_IPV4; + else + return RDMA_NETWORK_IPV6; +} + enum rdma_link_layer { IB_LINK_LAYER_UNSPECIFIED, IB_LINK_LAYER_INFINIBAND, @@ -105,24 +150,32 @@ enum rdma_link_layer { }; enum ib_device_cap_flags { - IB_DEVICE_RESIZE_MAX_WR = 1, - IB_DEVICE_BAD_PKEY_CNTR = (1<<1), - IB_DEVICE_BAD_QKEY_CNTR = (1<<2), - IB_DEVICE_RAW_MULTI = (1<<3), - IB_DEVICE_AUTO_PATH_MIG = (1<<4), - IB_DEVICE_CHANGE_PHY_PORT = (1<<5), - IB_DEVICE_UD_AV_PORT_ENFORCE = (1<<6), - IB_DEVICE_CURR_QP_STATE_MOD = (1<<7), - IB_DEVICE_SHUTDOWN_PORT = (1<<8), - IB_DEVICE_INIT_TYPE = (1<<9), - IB_DEVICE_PORT_ACTIVE_EVENT = (1<<10), - IB_DEVICE_SYS_IMAGE_GUID = (1<<11), - IB_DEVICE_RC_RNR_NAK_GEN = (1<<12), - IB_DEVICE_SRQ_RESIZE = (1<<13), - IB_DEVICE_N_NOTIFY_CQ = (1<<14), - IB_DEVICE_LOCAL_DMA_LKEY = (1<<15), - IB_DEVICE_RESERVED = (1<<16), /* old SEND_W_INV */ -
IB_DEVICE_MEM_WINDOW = (1<<17), + IB_DEVICE_RESIZE_MAX_WR = (1 << 0), + IB_DEVICE_BAD_PKEY_CNTR = (1 << 1), + IB_DEVICE_BAD_QKEY_CNTR = (1 << 2), + IB_DEVICE_RAW_MULTI = (1 << 3), + IB_DEVICE_AUTO_PATH_MIG = (1 << 4), + IB_DEVICE_CHANGE_PHY_PORT = (1 << 5), + IB_DEVICE_UD_AV_PORT_ENFORCE = (1 << 6), + IB_DEVICE_CURR_QP_STATE_MOD = (1 << 7), + IB_DEVICE_SHUTDOWN_PORT = (1 << 8), + IB_DEVICE_INIT_TYPE = (1 << 9), + IB_DEVICE_PORT_ACTIVE_EVENT = (1 << 10), + IB_DEVICE_SYS_IMAGE_GUID = (1 << 11), + IB_DEVICE_RC_RNR_NAK_GEN = (1 << 12), + IB_DEVICE_SRQ_RESIZE = (1 << 13), + IB_DEVICE_N_NOTIFY_CQ = (1 << 14), + + /* + * This device supports a per-device lkey or stag that can be + * used without performing a memory registration for the local + * memory. Note that ULPs should never check this flag, but + * should instead use the local_dma_lkey flag in the ib_pd structure, + * which will always contain a usable lkey. + */ + IB_DEVICE_LOCAL_DMA_LKEY = (1 << 15), + IB_DEVICE_RESERVED /* old SEND_W_INV */ = (1 << 16), + IB_DEVICE_MEM_WINDOW = (1 << 17), /* * Devices should set IB_DEVICE_UD_IP_SUM if they support * insertion of UDP and TCP checksum on outgoing UD IPoIB @@ -130,18 +183,35 @@ enum ib_device_cap_flags { * incoming messages. Setting this flag implies that the * IPoIB driver may set NETIF_F_IP_CSUM for datagram mode. */ - IB_DEVICE_UD_IP_CSUM = (1<<18), - IB_DEVICE_UD_TSO = (1<<19), - IB_DEVICE_XRC = (1<<20), - IB_DEVICE_MEM_MGT_EXTENSIONS = (1<<21), - IB_DEVICE_BLOCK_MULTICAST_LOOPBACK = (1<<22), - IB_DEVICE_MEM_WINDOW_TYPE_2A = (1<<23), - IB_DEVICE_MEM_WINDOW_TYPE_2B = (1<<24), - IB_DEVICE_RC_IP_CSUM = (1<<25), - IB_DEVICE_RAW_IP_CSUM = (1<<26), - IB_DEVICE_MANAGED_FLOW_STEERING = (1<<29), - IB_DEVICE_SIGNATURE_HANDOVER = (1<<30), - IB_DEVICE_ON_DEMAND_PAGING = (1<<31), + IB_DEVICE_UD_IP_CSUM = (1 << 18), + IB_DEVICE_UD_TSO = (1 << 19), + IB_DEVICE_XRC = (1 << 20), + + /* + * This device supports the IB "base memory management extension", + * which includes support for fast registrations (IB_WR_REG_MR, + * IB_WR_LOCAL_INV and IB_WR_SEND_WITH_INV verbs). This flag should + * also be set by any iWarp device which must support FRs to comply + * with the iWarp verbs spec. iWarp devices also support the + * IB_WR_RDMA_READ_WITH_INV verb for RDMA READs that invalidate the + * stag. + */ + IB_DEVICE_MEM_MGT_EXTENSIONS = (1 << 21), + IB_DEVICE_BLOCK_MULTICAST_LOOPBACK = (1 << 22), + IB_DEVICE_MEM_WINDOW_TYPE_2A = (1 << 23), + IB_DEVICE_MEM_WINDOW_TYPE_2B = (1 << 24), + IB_DEVICE_RC_IP_CSUM = (1 << 25), + IB_DEVICE_RAW_IP_CSUM = (1 << 26), + /* + * Devices should set IB_DEVICE_CROSS_CHANNEL if they + * support execution of WQEs that involve synchronization + * of I/O operations with a single completion queue managed + * by hardware. + */ + IB_DEVICE_CROSS_CHANNEL = (1 << 27), + IB_DEVICE_MANAGED_FLOW_STEERING = (1 << 29), + IB_DEVICE_SIGNATURE_HANDOVER = (1 << 30), + IB_DEVICE_ON_DEMAND_PAGING = (1 << 31), };
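Because the flag values below bit 27 are unchanged by this renumbering, existing drivers keep working; new code can test the flags against the attribute cache this patch adds to struct ib_device (the attrs member appears further down) instead of calling the removed ib_query_device(). A minimal sketch:

/* Hedged sketch: capability test via the new cached device attributes. */
static bool example_supports_fast_reg(struct ib_device *dev)
{
	return !!(dev->attrs.device_cap_flags &
		  IB_DEVICE_MEM_MGT_EXTENSIONS);
}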
enum ib_signature_prot_cap { @@ -184,6 +254,7 @@ struct ib_odp_caps { enum ib_cq_creation_flags { IB_CQ_FLAGS_TIMESTAMP_COMPLETION = 1 << 0, + IB_CQ_FLAGS_IGNORE_OVERRUN = 1 << 1, }; struct ib_cq_init_attr { @@ -393,6 +464,7 @@ union rdma_protocol_stats { #define RDMA_CORE_CAP_PROT_IB 0x00100000 #define RDMA_CORE_CAP_PROT_ROCE 0x00200000 #define RDMA_CORE_CAP_PROT_IWARP 0x00400000 +#define RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP 0x00800000 #define RDMA_CORE_PORT_IBA_IB (RDMA_CORE_CAP_PROT_IB \ | RDMA_CORE_CAP_IB_MAD \ @@ -405,6 +477,12 @@ union rdma_protocol_stats { | RDMA_CORE_CAP_IB_CM \ | RDMA_CORE_CAP_AF_IB \ | RDMA_CORE_CAP_ETH_AH) +#define RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP \ + (RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP \ + | RDMA_CORE_CAP_IB_MAD \ + | RDMA_CORE_CAP_IB_CM \ + | RDMA_CORE_CAP_AF_IB \ + | RDMA_CORE_CAP_ETH_AH) #define RDMA_CORE_PORT_IWARP (RDMA_CORE_CAP_PROT_IWARP \ | RDMA_CORE_CAP_IW_CM) #define RDMA_CORE_PORT_INTEL_OPA (RDMA_CORE_PORT_IBA_IB \ @@ -519,6 +597,17 @@ struct ib_grh { union ib_gid dgid; }; +union rdma_network_hdr { + struct ib_grh ibgrh; + struct { + /* The IB spec states that if it's IPv4, the IPv4 header + * is located in the last 20 bytes of the GRH. + */ + u8 reserved[20]; + struct iphdr roce4grh; + }; +}; + enum { IB_MULTICAST_QPN = 0xffffff }; @@ -734,7 +823,6 @@ enum ib_wc_opcode { IB_WC_RDMA_READ, IB_WC_COMP_SWAP, IB_WC_FETCH_ADD, - IB_WC_BIND_MW, IB_WC_LSO, IB_WC_LOCAL_INV, IB_WC_REG_MR, @@ -755,10 +843,14 @@ enum ib_wc_flags { IB_WC_IP_CSUM_OK = (1<<3), IB_WC_WITH_SMAC = (1<<4), IB_WC_WITH_VLAN = (1<<5), + IB_WC_WITH_NETWORK_HDR_TYPE = (1<<6), }; struct ib_wc { - u64 wr_id; + union { + u64 wr_id; + struct ib_cqe *wr_cqe; + }; enum ib_wc_status status; enum ib_wc_opcode opcode; u32 vendor_err; @@ -777,6 +869,7 @@ struct ib_wc { u8 port_num; /* valid only for DR SMPs on switches */ u8 smac[ETH_ALEN]; u16 vlan_id; + u8 network_hdr_type; }; enum ib_cq_notify_flags { @@ -866,6 +959,9 @@ enum ib_qp_type { enum ib_qp_create_flags { IB_QP_CREATE_IPOIB_UD_LSO = 1 << 0, IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK = 1 << 1, + IB_QP_CREATE_CROSS_CHANNEL = 1 << 2, + IB_QP_CREATE_MANAGED_SEND = 1 << 3, + IB_QP_CREATE_MANAGED_RECV = 1 << 4, IB_QP_CREATE_NETIF_QP = 1 << 5, IB_QP_CREATE_SIGNATURE_EN = 1 << 6, IB_QP_CREATE_USE_GFP_NOIO = 1 << 7, @@ -1027,7 +1123,6 @@ enum ib_wr_opcode { IB_WR_REG_MR, IB_WR_MASKED_ATOMIC_CMP_AND_SWP, IB_WR_MASKED_ATOMIC_FETCH_AND_ADD, - IB_WR_BIND_MW, IB_WR_REG_SIG_MR, /* reserve values for low level drivers' internal use. * These values will not be used at all in the ib core layer. @@ -1062,26 +1157,16 @@ struct ib_sge { u32 lkey; }; -/** - * struct ib_mw_bind_info - Parameters for a memory window bind operation. - * @mr: A memory region to bind the memory window to. - * @addr: The address where the memory window should begin. - * @length: The length of the memory window, in bytes. - * @mw_access_flags: Access flags from enum ib_access_flags for the window. - * - * This struct contains the shared parameters for type 1 and type 2 - * memory window bind operations.
- */ -struct ib_mw_bind_info { - struct ib_mr *mr; - u64 addr; - u64 length; - int mw_access_flags; +struct ib_cqe { + void (*done)(struct ib_cq *cq, struct ib_wc *wc); }; struct ib_send_wr { struct ib_send_wr *next; - u64 wr_id; + union { + u64 wr_id; + struct ib_cqe *wr_cqe; + }; struct ib_sge *sg_list; int num_sge; enum ib_wr_opcode opcode; @@ -1147,19 +1232,6 @@ static inline struct ib_reg_wr *reg_wr(struct ib_send_wr *wr) return container_of(wr, struct ib_reg_wr, wr); } -struct ib_bind_mw_wr { - struct ib_send_wr wr; - struct ib_mw *mw; - /* The new rkey for the memory window. */ - u32 rkey; - struct ib_mw_bind_info bind_info; -}; - -static inline struct ib_bind_mw_wr *bind_mw_wr(struct ib_send_wr *wr) -{ - return container_of(wr, struct ib_bind_mw_wr, wr); -} - struct ib_sig_handover_wr { struct ib_send_wr wr; struct ib_sig_attrs *sig_attrs; @@ -1175,7 +1247,10 @@ static inline struct ib_sig_handover_wr *sig_handover_wr(struct ib_send_wr *wr) struct ib_recv_wr { struct ib_recv_wr *next; - u64 wr_id; + union { + u64 wr_id; + struct ib_cqe *wr_cqe; + }; struct ib_sge *sg_list; int num_sge; }; @@ -1190,20 +1265,10 @@ enum ib_access_flags { IB_ACCESS_ON_DEMAND = (1<<6), }; -struct ib_phys_buf { - u64 addr; - u64 size; -}; - -struct ib_mr_attr { - struct ib_pd *pd; - u64 device_virt_addr; - u64 size; - int mr_access_flags; - u32 lkey; - u32 rkey; -}; - +/* + * XXX: these are apparently used for ->rereg_user_mr, no idea why they + * are hidden here instead of a uapi header! + */ enum ib_mr_rereg_flags { IB_MR_REREG_TRANS = 1, IB_MR_REREG_PD = (1<<1), @@ -1211,18 +1276,6 @@ enum ib_mr_rereg_flags { IB_MR_REREG_SUPPORTED = ((IB_MR_REREG_ACCESS << 1) - 1) }; -/** - * struct ib_mw_bind - Parameters for a type 1 memory window bind operation. - * @wr_id: Work request id. - * @send_flags: Flags from ib_send_flags enum. - * @bind_info: More parameters of the bind operation. 
- */ -struct ib_mw_bind { - u64 wr_id; - int send_flags; - struct ib_mw_bind_info bind_info; -}; - struct ib_fmr_attr { int max_pages; int max_maps; @@ -1307,6 +1360,12 @@ struct ib_ah { typedef void (*ib_comp_handler)(struct ib_cq *cq, void *cq_context); +enum ib_poll_context { + IB_POLL_DIRECT, /* caller context, no hw completions */ + IB_POLL_SOFTIRQ, /* poll from softirq context */ + IB_POLL_WORKQUEUE, /* poll from workqueue */ +}; + struct ib_cq { struct ib_device *device; struct ib_uobject *uobject; @@ -1315,6 +1374,12 @@ struct ib_cq { void *cq_context; int cqe; atomic_t usecnt; /* count number of work queues */ + enum ib_poll_context poll_ctx; + struct ib_wc *wc; + union { + struct irq_poll iop; + struct work_struct work; + }; }; struct ib_srq { @@ -1363,7 +1428,6 @@ struct ib_mr { u64 iova; u32 length; unsigned int page_size; - atomic_t usecnt; /* count number of MWs */ }; struct ib_mw { @@ -1724,11 +1788,6 @@ struct ib_device { int wc_cnt); struct ib_mr * (*get_dma_mr)(struct ib_pd *pd, int mr_access_flags); - struct ib_mr * (*reg_phys_mr)(struct ib_pd *pd, - struct ib_phys_buf *phys_buf_array, - int num_phys_buf, - int mr_access_flags, - u64 *iova_start); struct ib_mr * (*reg_user_mr)(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, @@ -1741,8 +1800,6 @@ struct ib_device { int mr_access_flags, struct ib_pd *pd, struct ib_udata *udata); - int (*query_mr)(struct ib_mr *mr, - struct ib_mr_attr *mr_attr); int (*dereg_mr)(struct ib_mr *mr); struct ib_mr * (*alloc_mr)(struct ib_pd *pd, enum ib_mr_type mr_type, @@ -1750,18 +1807,8 @@ struct ib_device { int (*map_mr_sg)(struct ib_mr *mr, struct scatterlist *sg, int sg_nents); - int (*rereg_phys_mr)(struct ib_mr *mr, - int mr_rereg_mask, - struct ib_pd *pd, - struct ib_phys_buf *phys_buf_array, - int num_phys_buf, - int mr_access_flags, - u64 *iova_start); struct ib_mw * (*alloc_mw)(struct ib_pd *pd, enum ib_mw_type type); - int (*bind_mw)(struct ib_qp *qp, - struct ib_mw *mw, - struct ib_mw_bind *mw_bind); int (*dealloc_mw)(struct ib_mw *mw); struct ib_fmr * (*alloc_fmr)(struct ib_pd *pd, int mr_access_flags, @@ -1823,6 +1870,7 @@ struct ib_device { u16 is_switch:1; u8 node_type; u8 phys_port_cnt; + struct ib_device_attr attrs; /** * The following mandatory functions are used only at device @@ -1888,6 +1936,31 @@ static inline int ib_copy_to_udata(struct ib_udata *udata, void *src, size_t len return copy_to_user(udata->outbuf, src, len) ? 
-EFAULT : 0; } +static inline bool ib_is_udata_cleared(struct ib_udata *udata, + size_t offset, + size_t len) +{ + const void __user *p = udata->inbuf + offset; + bool ret = false; + u8 *buf; + + if (len > USHRT_MAX) + return false; + + buf = kmalloc(len, GFP_KERNEL); + if (!buf) + return false; + + if (copy_from_user(buf, p, len)) + goto free; + + ret = !memchr_inv(buf, 0, len); + +free: + kfree(buf); + return ret; +} + /** * ib_modify_qp_is_ok - Check that the supplied attribute mask * contains all required attributes and no attributes not allowed for @@ -1912,9 +1985,6 @@ int ib_register_event_handler (struct ib_event_handler *event_handler); int ib_unregister_event_handler(struct ib_event_handler *event_handler); void ib_dispatch_event(struct ib_event *event); -int ib_query_device(struct ib_device *device, - struct ib_device_attr *device_attr); - int ib_query_port(struct ib_device *device, u8 port_num, struct ib_port_attr *port_attr); @@ -1968,6 +2038,17 @@ static inline bool rdma_protocol_ib(const struct ib_device *device, u8 port_num) static inline bool rdma_protocol_roce(const struct ib_device *device, u8 port_num) { + return device->port_immutable[port_num].core_cap_flags & + (RDMA_CORE_CAP_PROT_ROCE | RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP); +} + +static inline bool rdma_protocol_roce_udp_encap(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP; +} + +static inline bool rdma_protocol_roce_eth_encap(const struct ib_device *device, u8 port_num) +{ return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_ROCE; } @@ -1978,8 +2059,8 @@ static inline bool rdma_protocol_iwarp(const struct ib_device *device, u8 port_n static inline bool rdma_ib_or_roce(const struct ib_device *device, u8 port_num) { - return device->port_immutable[port_num].core_cap_flags & - (RDMA_CORE_CAP_PROT_IB | RDMA_CORE_CAP_PROT_ROCE); + return rdma_protocol_ib(device, port_num) || + rdma_protocol_roce(device, port_num); } /** @@ -2220,7 +2301,8 @@ int ib_modify_port(struct ib_device *device, struct ib_port_modify *port_modify); int ib_find_gid(struct ib_device *device, union ib_gid *gid, - struct net_device *ndev, u8 *port_num, u16 *index); + enum ib_gid_type gid_type, struct net_device *ndev, + u8 *port_num, u16 *index); int ib_find_pkey(struct ib_device *device, u8 port_num, u16 pkey, u16 *index); @@ -2454,6 +2536,11 @@ static inline int ib_post_recv(struct ib_qp *qp, return qp->device->post_recv(qp, recv_wr, bad_recv_wr); } +struct ib_cq *ib_alloc_cq(struct ib_device *dev, void *private, + int nr_cqe, int comp_vector, enum ib_poll_context poll_ctx); +void ib_free_cq(struct ib_cq *cq); +int ib_process_cq_direct(struct ib_cq *cq, int budget); + /** * ib_create_cq - Creates a CQ on the specified device. * @device: The device on which to create the CQ. @@ -2839,13 +2926,6 @@ static inline void ib_dma_free_coherent(struct ib_device *dev, } /** - * ib_query_mr - Retrieves information about a specific memory region. - * @mr: The memory region to retrieve information about. - * @mr_attr: The attributes of the specified memory region. - */ -int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr); - -/** * ib_dereg_mr - Deregisters a memory region and removes it from the * HCA translation table. * @mr: The memory region to deregister. @@ -2882,42 +2962,6 @@ static inline u32 ib_inc_rkey(u32 rkey) } /** - * ib_alloc_mw - Allocates a memory window. - * @pd: The protection domain associated with the memory window. 
- * @type: The type of the memory window (1 or 2). - */ -struct ib_mw *ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type); - -/** - * ib_bind_mw - Posts a work request to the send queue of the specified - * QP, which binds the memory window to the given address range and - * remote access attributes. - * @qp: QP to post the bind work request on. - * @mw: The memory window to bind. - * @mw_bind: Specifies information about the memory window, including - * its address range, remote access rights, and associated memory region. - * - * If there is no immediate error, the function will update the rkey member - * of the mw parameter to its new value. The bind operation can still fail - * asynchronously. - */ -static inline int ib_bind_mw(struct ib_qp *qp, - struct ib_mw *mw, - struct ib_mw_bind *mw_bind) -{ - /* XXX reference counting in corresponding MR? */ - return mw->device->bind_mw ? - mw->device->bind_mw(qp, mw, mw_bind) : - -ENOSYS; -} - -/** - * ib_dealloc_mw - Deallocates a memory window. - * @mw: The memory window to deallocate. - */ -int ib_dealloc_mw(struct ib_mw *mw); - -/** * ib_alloc_fmr - Allocates a unmapped fast memory region. * @pd: The protection domain associated with the unmapped region. * @mr_access_flags: Specifies the memory access rights. diff --git a/include/scsi/iser.h b/include/scsi/iser.h new file mode 100644 index 000000000000..2e678fa74eca --- /dev/null +++ b/include/scsi/iser.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2015 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ISCSI_ISER_H +#define ISCSI_ISER_H + +#define ISER_ZBVA_NOT_SUP 0x80 +#define ISER_SEND_W_INV_NOT_SUP 0x40 +#define ISERT_ZBVA_NOT_USED 0x80 +#define ISERT_SEND_W_INV_NOT_USED 0x40 + +#define ISCSI_CTRL 0x10 +#define ISER_HELLO 0x20 +#define ISER_HELLORPLY 0x30 + +#define ISER_VER 0x10 +#define ISER_WSV 0x08 +#define ISER_RSV 0x04 + +/** + * struct iser_cm_hdr - iSER CM header (from iSER Annex A12) + * + * @flags: flags support (zbva, send_w_inv) + * @rsvd: reserved + */ +struct iser_cm_hdr { + u8 flags; + u8 rsvd[3]; +} __packed; + +/** + * struct iser_ctrl - iSER header of iSCSI control PDU + * + * @flags: opcode and read/write valid bits + * @rsvd: reserved + * @write_stag: write rkey + * @write_va: write virtual address + * @read_stag: read rkey + * @read_va: read virtual address + */ +struct iser_ctrl { + u8 flags; + u8 rsvd[3]; + __be32 write_stag; + __be64 write_va; + __be32 read_stag; + __be64 read_va; +} __packed; + +#endif /* ISCSI_ISER_H */ diff --git a/include/sound/timer.h b/include/sound/timer.h index 7990469a44ce..c4d76ff056c6 100644 --- a/include/sound/timer.h +++ b/include/sound/timer.h @@ -104,6 +104,7 @@ struct snd_timer_instance { int event, struct timespec * tstamp, unsigned long resolution); + void (*disconnect)(struct snd_timer_instance *timeri); void *callback_data; unsigned long ticks; /* auto-load ticks when expired */ unsigned long cticks; /* current ticks */ diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 594b4b29a224..4e4b2fa78609 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -43,7 +43,7 @@ struct extent_status; { EXT4_GET_BLOCKS_METADATA_NOFAIL, "METADATA_NOFAIL" }, \ { EXT4_GET_BLOCKS_NO_NORMALIZE, "NO_NORMALIZE" }, \ { EXT4_GET_BLOCKS_KEEP_SIZE, "KEEP_SIZE" }, \ - { EXT4_GET_BLOCKS_NO_LOCK, "NO_LOCK" }) + { EXT4_GET_BLOCKS_ZERO, "ZERO" }) #define show_mflags(flags) __print_flags(flags, "", \ { EXT4_MAP_NEW, "N" }, \ diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h index 0f803d2783e3..47c6212d8f3c 100644 --- a/include/trace/events/huge_memory.h +++ b/include/trace/events/huge_memory.h @@ -46,10 +46,10 @@ SCAN_STATUS TRACE_EVENT(mm_khugepaged_scan_pmd, - TP_PROTO(struct mm_struct *mm, unsigned long pfn, bool writable, + TP_PROTO(struct mm_struct *mm, struct page *page, bool writable, bool referenced, int none_or_zero, int status), - TP_ARGS(mm, pfn, writable, referenced, none_or_zero, status), + TP_ARGS(mm, page, writable, referenced, none_or_zero, status), TP_STRUCT__entry( __field(struct mm_struct *, mm) @@ -62,7 +62,7 @@ TRACE_EVENT(mm_khugepaged_scan_pmd, TP_fast_assign( __entry->mm = mm; - __entry->pfn = pfn; + __entry->pfn = page ? page_to_pfn(page) : -1; __entry->writable = writable; __entry->referenced = referenced; __entry->none_or_zero = none_or_zero; @@ -104,10 +104,10 @@ TRACE_EVENT(mm_collapse_huge_page, TRACE_EVENT(mm_collapse_huge_page_isolate, - TP_PROTO(unsigned long pfn, int none_or_zero, + TP_PROTO(struct page *page, int none_or_zero, bool referenced, bool writable, int status), - TP_ARGS(pfn, none_or_zero, referenced, writable, status), + TP_ARGS(page, none_or_zero, referenced, writable, status), TP_STRUCT__entry( __field(unsigned long, pfn) @@ -118,7 +118,7 @@ TRACE_EVENT(mm_collapse_huge_page_isolate, ), TP_fast_assign( - __entry->pfn = pfn; + __entry->pfn = page ?
page_to_pfn(page) : -1; __entry->none_or_zero = none_or_zero; __entry->referenced = referenced; __entry->writable = writable; diff --git a/include/trace/events/irq.h b/include/trace/events/irq.h index ff8f6c091a15..f95f25e786ef 100644 --- a/include/trace/events/irq.h +++ b/include/trace/events/irq.h @@ -15,7 +15,7 @@ struct softirq_action; softirq_name(NET_TX) \ softirq_name(NET_RX) \ softirq_name(BLOCK) \ - softirq_name(BLOCK_IOPOLL) \ + softirq_name(IRQ_POLL) \ softirq_name(TASKLET) \ softirq_name(SCHED) \ softirq_name(HRTIMER) \ diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index c2e5d6cb34e3..ebd10e624598 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -307,7 +307,7 @@ header-y += nfs_mount.h header-y += nl80211.h header-y += n_r3964.h header-y += nubus.h -header-y += nvme.h +header-y += nvme_ioctl.h header-y += nvram.h header-y += omap3isp.h header-y += omapfb.h diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index b8a5b3b86ad6..41e0433b4a83 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -2,8 +2,11 @@ #define _UAPI_LINUX_FS_H /* - * This file has definitions for some important file table - * structures etc. + * This file has definitions for some important file table structures + * and constants and structures used by various generic file system + * ioctl's. Please do not make any changes in this file before + * sending patches for review to linux-fsdevel@vger.kernel.org and + * linux-api@vger.kernel.org. */ #include <linux/limits.h> @@ -146,6 +149,37 @@ struct inodes_stat_t { #define MS_MGC_VAL 0xC0ED0000 #define MS_MGC_MSK 0xffff0000 +/* + * Structure for FS_IOC_FSGETXATTR[A] and FS_IOC_FSSETXATTR. + */ +struct fsxattr { + __u32 fsx_xflags; /* xflags field value (get/set) */ + __u32 fsx_extsize; /* extsize field value (get/set)*/ + __u32 fsx_nextents; /* nextents field value (get) */ + __u32 fsx_projid; /* project identifier (get/set) */ + unsigned char fsx_pad[12]; +}; + +/* + * Flags for the fsx_xflags field + */ +#define FS_XFLAG_REALTIME 0x00000001 /* data in realtime volume */ +#define FS_XFLAG_PREALLOC 0x00000002 /* preallocated file extents */ +#define FS_XFLAG_IMMUTABLE 0x00000008 /* file cannot be modified */ +#define FS_XFLAG_APPEND 0x00000010 /* all writes append */ +#define FS_XFLAG_SYNC 0x00000020 /* all writes synchronous */ +#define FS_XFLAG_NOATIME 0x00000040 /* do not update access time */ +#define FS_XFLAG_NODUMP 0x00000080 /* do not include in backups */ +#define FS_XFLAG_RTINHERIT 0x00000100 /* create with rt bit set */ +#define FS_XFLAG_PROJINHERIT 0x00000200 /* create with parents projid */ +#define FS_XFLAG_NOSYMLINKS 0x00000400 /* disallow symlink creation */ +#define FS_XFLAG_EXTSIZE 0x00000800 /* extent size allocator hint */ +#define FS_XFLAG_EXTSZINHERIT 0x00001000 /* inherit inode extent size */ +#define FS_XFLAG_NODEFRAG 0x00002000 /* do not defragment */ +#define FS_XFLAG_FILESTREAM 0x00004000 /* use filestream allocator */ +#define FS_XFLAG_DAX 0x00008000 /* use DAX for IO */ +#define FS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */ + /* the read-only stuff doesn't really belong here, but any other place is probably as bad and I don't want to create yet another include file. 
*/ @@ -210,9 +244,28 @@ struct inodes_stat_t { #define FS_IOC32_SETFLAGS _IOW('f', 2, int) #define FS_IOC32_GETVERSION _IOR('v', 1, int) #define FS_IOC32_SETVERSION _IOW('v', 2, int) +#define FS_IOC_FSGETXATTR _IOR ('X', 31, struct fsxattr) +#define FS_IOC_FSSETXATTR _IOW ('X', 32, struct fsxattr) /* * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS) + * + * Note: for historical reasons, these flags were originally used and + * defined for use by ext2/ext3, and then other file systems started + * using these flags so they wouldn't need to write their own version + * of chattr/lsattr (which was shipped as part of e2fsprogs). You + * should think twice before trying to use these flags in new + * contexts, or trying to assign these flags, since they are used both + * as the UAPI and the on-disk encoding for ext2/3/4. Also, we are + * almost out of 32-bit flags. :-) + * + * We have recently hoisted FS_IOC_FSGETXATTR / FS_IOC_FSSETXATTR from + * XFS to the generic FS level interface. This uses a structure that + * has padding and hence has more room to grow, so it may be more + * appropriate for many new use cases. + * + * Please do not change these flags or interfaces before checking with + * linux-fsdevel@vger.kernel.org and linux-api@vger.kernel.org. */ #define FS_SECRM_FL 0x00000001 /* Secure deletion */ #define FS_UNRM_FL 0x00000002 /* Undelete */ @@ -226,8 +279,8 @@ struct inodes_stat_t { #define FS_DIRTY_FL 0x00000100 #define FS_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ #define FS_NOCOMP_FL 0x00000400 /* Don't compress */ -#define FS_ECOMPR_FL 0x00000800 /* Compression error */ /* End compression flags --- maybe not all used */ +#define FS_ENCRYPT_FL 0x00000800 /* Encrypted file */ #define FS_BTREE_FL 0x00001000 /* btree format dir */ #define FS_INDEX_FL 0x00001000 /* hash-indexed directory */ #define FS_IMAGIC_FL 0x00002000 /* AFS directory */ @@ -235,9 +288,12 @@ struct inodes_stat_t { #define FS_NOTAIL_FL 0x00008000 /* file tail should not be merged */ #define FS_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ #define FS_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ +#define FS_HUGE_FILE_FL 0x00040000 /* Reserved for ext4 */ #define FS_EXTENT_FL 0x00080000 /* Extents */ -#define FS_DIRECTIO_FL 0x00100000 /* Use direct i/o */ +#define FS_EA_INODE_FL 0x00200000 /* Inode used for large EA */ +#define FS_EOFBLOCKS_FL 0x00400000 /* Reserved for ext4 */ #define FS_NOCOW_FL 0x00800000 /* Do not cow file */ +#define FS_INLINE_DATA_FL 0x10000000 /* Reserved for ext4 */ #define FS_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ #define FS_RESERVED_FL 0x80000000 /* reserved for ext2 lib */ diff --git a/include/uapi/linux/lightnvm.h b/include/uapi/linux/lightnvm.h index 928f98997d8a..774a43128a7a 100644 --- a/include/uapi/linux/lightnvm.h +++ b/include/uapi/linux/lightnvm.h @@ -33,6 +33,7 @@ #define NVM_TTYPE_NAME_MAX 48 #define NVM_TTYPE_MAX 63 +#define NVM_MMTYPE_LEN 8 #define NVM_CTRL_FILE "/dev/lightnvm/control" @@ -100,6 +101,26 @@ struct nvm_ioctl_remove { __u32 flags; }; +struct nvm_ioctl_dev_init { + char dev[DISK_NAME_LEN]; /* open-channel SSD device */ + char mmtype[NVM_MMTYPE_LEN]; /* register to media manager */ + + __u32 flags; +}; + +enum { + NVM_FACTORY_ERASE_ONLY_USER = 1 << 0, /* erase only blocks used as + * host blks or grown blks */ + NVM_FACTORY_RESET_HOST_BLKS = 1 << 1, /* remove host blk marks */ + NVM_FACTORY_RESET_GRWN_BBLKS = 1 << 2, /* remove grown blk marks */ + NVM_FACTORY_NR_BITS = 1 << 3, /* stops here */ +}; + 
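(A usage illustration, not part of the patch: a minimal userspace sketch of a factory reset through the NVM_DEV_FACTORY ioctl and the struct nvm_ioctl_dev_factory defined just below. The device name is made up, and it assumes this uapi header makes DISK_NAME_LEN and NVM_CTRL_FILE visible to userspace builds.)

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/lightnvm.h>

int main(void)
{
	struct nvm_ioctl_dev_factory fact;
	int fd = open(NVM_CTRL_FILE, O_RDWR);	/* "/dev/lightnvm/control" */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	memset(&fact, 0, sizeof(fact));
	strncpy(fact.dev, "nvme0n1", DISK_NAME_LEN - 1);	/* illustrative device name */
	fact.flags = NVM_FACTORY_RESET_HOST_BLKS;	/* clear host-side block marks */
	if (ioctl(fd, NVM_DEV_FACTORY, &fact) < 0)
		perror("NVM_DEV_FACTORY");
	close(fd);
	return 0;
}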
+struct nvm_ioctl_dev_factory { + char dev[DISK_NAME_LEN]; + + __u32 flags; +}; /* The ioctl type, 'L', 0x20 - 0x2F documented in ioctl-number.txt */ enum { @@ -110,6 +131,12 @@ enum { /* device level cmds */ NVM_DEV_CREATE_CMD, NVM_DEV_REMOVE_CMD, + + /* Init a device to support LightNVM media managers */ + NVM_DEV_INIT_CMD, + + /* Factory reset device */ + NVM_DEV_FACTORY_CMD, }; #define NVM_IOCTL 'L' /* 0x4c */ @@ -122,6 +149,10 @@ enum { struct nvm_ioctl_create) #define NVM_DEV_REMOVE _IOW(NVM_IOCTL, NVM_DEV_REMOVE_CMD, \ struct nvm_ioctl_remove) +#define NVM_DEV_INIT _IOW(NVM_IOCTL, NVM_DEV_INIT_CMD, \ + struct nvm_ioctl_dev_init) +#define NVM_DEV_FACTORY _IOW(NVM_IOCTL, NVM_DEV_FACTORY_CMD, \ + struct nvm_ioctl_dev_factory) #define NVM_VERSION_MAJOR 1 #define NVM_VERSION_MINOR 0 diff --git a/include/xen/interface/io/blkif.h b/include/xen/interface/io/blkif.h index c33e1c489eb2..8b8cfadf7833 100644 --- a/include/xen/interface/io/blkif.h +++ b/include/xen/interface/io/blkif.h @@ -28,6 +28,54 @@ typedef uint16_t blkif_vdev_t; typedef uint64_t blkif_sector_t; /* + * Multiple hardware queues/rings: + * If supported, the backend will write the key "multi-queue-max-queues" to + * the directory for that vbd, and set its value to the maximum supported + * number of queues. + * Frontends that are aware of this feature and wish to use it can write the + * key "multi-queue-num-queues" with the number they wish to use, which must be + * greater than zero, and no more than the value reported by the backend in + * "multi-queue-max-queues". + * + * For frontends requesting just one queue, the usual event-channel and + * ring-ref keys are written as before, simplifying the backend processing + * to avoid distinguishing between a frontend that doesn't understand the + * multi-queue feature, and one that does, but requested only one queue. + * + * Frontends requesting two or more queues must not write the toplevel + * event-channel and ring-ref keys, instead writing those keys under sub-keys + * having the name "queue-N" where N is the integer ID of the queue/ring for + * which those keys belong. Queues are indexed from zero. + * For example, a frontend with two queues must write the following set of + * queue-related keys: + * + * /local/domain/1/device/vbd/0/multi-queue-num-queues = "2" + * /local/domain/1/device/vbd/0/queue-0 = "" + * /local/domain/1/device/vbd/0/queue-0/ring-ref = "<ring-ref#0>" + * /local/domain/1/device/vbd/0/queue-0/event-channel = "<evtchn#0>" + * /local/domain/1/device/vbd/0/queue-1 = "" + * /local/domain/1/device/vbd/0/queue-1/ring-ref = "<ring-ref#1>" + * /local/domain/1/device/vbd/0/queue-1/event-channel = "<evtchn#1>" + * + * It is also possible to use multiple queues/rings together with the + * multi-page ring buffer feature.
+ * For example, a frontend requesting two queues/rings, where each ring + * buffer is two pages in size, must write the following set of related keys: + * + * /local/domain/1/device/vbd/0/multi-queue-num-queues = "2" + * /local/domain/1/device/vbd/0/ring-page-order = "1" + * /local/domain/1/device/vbd/0/queue-0 = "" + * /local/domain/1/device/vbd/0/queue-0/ring-ref0 = "<ring-ref#0>" + * /local/domain/1/device/vbd/0/queue-0/ring-ref1 = "<ring-ref#1>" + * /local/domain/1/device/vbd/0/queue-0/event-channel = "<evtchn#0>" + * /local/domain/1/device/vbd/0/queue-1 = "" + * /local/domain/1/device/vbd/0/queue-1/ring-ref0 = "<ring-ref#2>" + * /local/domain/1/device/vbd/0/queue-1/ring-ref1 = "<ring-ref#3>" + * /local/domain/1/device/vbd/0/queue-1/event-channel = "<evtchn#1>" + * + */ + +/* * REQUEST CODES. */ #define BLKIF_OP_READ 0 diff --git a/ipc/mqueue.c b/ipc/mqueue.c index f4617cf07069..781c1399c6a3 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -795,7 +795,7 @@ SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, umode_t, mode, ro = mnt_want_write(mnt); /* we'll drop it in any case */ error = 0; - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); path.dentry = lookup_one_len(name->name, root, strlen(name->name)); if (IS_ERR(path.dentry)) { error = PTR_ERR(path.dentry); @@ -841,7 +841,7 @@ out_putfd: put_unused_fd(fd); fd = error; } - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); if (!ro) mnt_drop_write(mnt); out_putname: @@ -866,7 +866,7 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name) err = mnt_want_write(mnt); if (err) goto out_name; - mutex_lock_nested(&d_inode(mnt->mnt_root)->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(d_inode(mnt->mnt_root), I_MUTEX_PARENT); dentry = lookup_one_len(name->name, mnt->mnt_root, strlen(name->name)); if (IS_ERR(dentry)) { @@ -884,7 +884,7 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name) dput(dentry); out_unlock: - mutex_unlock(&d_inode(mnt->mnt_root)->i_mutex); + inode_unlock(d_inode(mnt->mnt_root)); if (inode) iput(inode); mnt_drop_write(mnt); diff --git a/ipc/sem.c b/ipc/sem.c index b471e5a3863d..cddd5b5fde51 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -1493,7 +1493,7 @@ out_rcu_wakeup: wake_up_sem_queue_do(&tasks); out_free: if (sem_io != fast_sem_io) - ipc_free(sem_io, sizeof(ushort)*nsems); + ipc_free(sem_io); return err; } diff --git a/ipc/util.c b/ipc/util.c index 0f401d94b7c6..798cad18dd87 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -414,17 +414,12 @@ void *ipc_alloc(int size) /** * ipc_free - free ipc space * @ptr: pointer returned by ipc_alloc - * @size: size of block * - * Free a block created with ipc_alloc(). The caller must know the size - * used in the allocation call. + * Free a block created with ipc_alloc(). */ -void ipc_free(void *ptr, int size) +void ipc_free(void *ptr) { - if (size > PAGE_SIZE) - vfree(ptr); - else - kfree(ptr); + kvfree(ptr); } /** diff --git a/ipc/util.h b/ipc/util.h index 3a8a5a0eca62..51f7ca58ac67 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -118,7 +118,7 @@ int ipcperms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, short flg); * both functions can sleep */ void *ipc_alloc(int size); -void ipc_free(void *ptr, int size); +void ipc_free(void *ptr); /* * For allocations that need to be freed by RCU.
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index 27c6046c2c3d..f84f8d06e1f6 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -95,7 +95,7 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa if (IS_ERR(dentry)) return (void *)dentry; /* returning an error */ inode = path.dentry->d_inode; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); audit_mark = kzalloc(sizeof(*audit_mark), GFP_KERNEL); if (unlikely(!audit_mark)) { diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 656c7e93ac0d..9f194aad0adc 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -364,7 +364,7 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent) struct dentry *d = kern_path_locked(watch->path, parent); if (IS_ERR(d)) return PTR_ERR(d); - mutex_unlock(&d_backing_inode(parent->dentry)->i_mutex); + inode_unlock(d_backing_inode(parent->dentry)); if (d_is_positive(d)) { /* update watch filter fields */ watch->dev = d_backing_inode(d)->i_sb->s_dev; diff --git a/kernel/events/core.c b/kernel/events/core.c index c0957416b32e..06ae52e99ac2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4872,9 +4872,9 @@ static int perf_fasync(int fd, struct file *filp, int on) struct perf_event *event = filp->private_data; int retval; - mutex_lock(&inode->i_mutex); + inode_lock(inode); retval = fasync_helper(fd, filp, on, &event->fasync); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (retval < 0) return retval; diff --git a/kernel/relay.c b/kernel/relay.c index 0b4570cfacae..074994bcfa9b 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1133,7 +1133,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, if (!desc->count) return 0; - mutex_lock(&file_inode(filp)->i_mutex); + inode_lock(file_inode(filp)); do { if (!relay_file_read_avail(buf, *ppos)) break; @@ -1153,7 +1153,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, *ppos = relay_file_read_end_pos(buf, read_start, ret); } } while (desc->count && ret); - mutex_unlock(&file_inode(filp)->i_mutex); + inode_unlock(file_inode(filp)); return desc->written; } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 44253adb3c36..63d3a24e081a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -222,9 +222,9 @@ sched_feat_write(struct file *filp, const char __user *ubuf, /* Ensure the static_key remains in a consistent state */ inode = file_inode(filp); - mutex_lock(&inode->i_mutex); + inode_lock(inode); i = sched_feat_set(cmp); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (i == __SCHED_FEAT_NR) return -EINVAL; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 91420362e0b3..97715fd9e790 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1757,6 +1757,20 @@ static struct ctl_table fs_table[] = { .proc_handler = &pipe_proc_fn, .extra1 = &pipe_min_size, }, + { + .procname = "pipe-user-pages-hard", + .data = &pipe_user_pages_hard, + .maxlen = sizeof(pipe_user_pages_hard), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "pipe-user-pages-soft", + .data = &pipe_user_pages_soft, + .maxlen = sizeof(pipe_user_pages_soft), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, { } }; diff --git a/lib/Kconfig b/lib/Kconfig index 5a0c1c83cdf0..133ebc0c1773 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -210,9 +210,11 @@ config RANDOM32_SELFTEST # compression support is select'ed if needed # config 842_COMPRESS + select CRC32 tristate config 
842_DECOMPRESS + select CRC32 tristate config ZLIB_INFLATE @@ -475,6 +477,11 @@ config DDR information. This data is useful for drivers handling DDR SDRAM controllers. +config IRQ_POLL + bool "IRQ polling library" + help + Helper library for interrupt mitigation using polling. + config MPILIB tristate select CLZ_TAB diff --git a/lib/Makefile b/lib/Makefile index 2d4bc33d09b4..a7c26a41a738 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -165,6 +165,7 @@ obj-$(CONFIG_GENERIC_NET_UTILS) += net_utils.o obj-$(CONFIG_SG_SPLIT) += sg_split.o obj-$(CONFIG_STMP_DEVICE) += stmp_device.o +obj-$(CONFIG_IRQ_POLL) += irq_poll.o libfdt_files = fdt.o fdt_ro.o fdt_wip.o fdt_rw.o fdt_sw.o fdt_strerror.o \ fdt_empty_tree.o diff --git a/block/blk-iopoll.c b/lib/irq_poll.c index 0736729d6494..836f7db4e548 100644 --- a/block/blk-iopoll.c +++ b/lib/irq_poll.c @@ -6,84 +6,84 @@ #include <linux/module.h> #include <linux/init.h> #include <linux/bio.h> -#include <linux/blkdev.h> #include <linux/interrupt.h> #include <linux/cpu.h> -#include <linux/blk-iopoll.h> +#include <linux/irq_poll.h> #include <linux/delay.h> -#include "blk.h" - -static unsigned int blk_iopoll_budget __read_mostly = 256; +static unsigned int irq_poll_budget __read_mostly = 256; static DEFINE_PER_CPU(struct list_head, blk_cpu_iopoll); /** - * blk_iopoll_sched - Schedule a run of the iopoll handler + * irq_poll_sched - Schedule a run of the iopoll handler * @iop: The parent iopoll structure * * Description: - * Add this blk_iopoll structure to the pending poll list and trigger the - * raise of the blk iopoll softirq. The driver must already have gotten a - * successful return from blk_iopoll_sched_prep() before calling this. + * Add this irq_poll structure to the pending poll list and trigger the + * raise of the irq_poll softirq. **/ -void blk_iopoll_sched(struct blk_iopoll *iop) +void irq_poll_sched(struct irq_poll *iop) { unsigned long flags; + if (test_bit(IRQ_POLL_F_DISABLE, &iop->state)) + return; + if (test_and_set_bit(IRQ_POLL_F_SCHED, &iop->state)) + return; + local_irq_save(flags); list_add_tail(&iop->list, this_cpu_ptr(&blk_cpu_iopoll)); - __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ); + __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ); local_irq_restore(flags); } -EXPORT_SYMBOL(blk_iopoll_sched); +EXPORT_SYMBOL(irq_poll_sched); /** - * __blk_iopoll_complete - Mark this @iop as un-polled again + * __irq_poll_complete - Mark this @iop as un-polled again * @iop: The parent iopoll structure * * Description: - * See blk_iopoll_complete(). This function must be called with interrupts + * See irq_poll_complete(). This function must be called with interrupts * disabled. **/ -void __blk_iopoll_complete(struct blk_iopoll *iop) +static void __irq_poll_complete(struct irq_poll *iop) { list_del(&iop->list); smp_mb__before_atomic(); - clear_bit_unlock(IOPOLL_F_SCHED, &iop->state); + clear_bit_unlock(IRQ_POLL_F_SCHED, &iop->state); } -EXPORT_SYMBOL(__blk_iopoll_complete); /** - * blk_iopoll_complete - Mark this @iop as un-polled again + * irq_poll_complete - Mark this @iop as un-polled again * @iop: The parent iopoll structure * * Description: * If a driver consumes less than the assigned budget in its run of the * iopoll handler, it'll end the polled mode by calling this function. The - * iopoll handler will not be invoked again before blk_iopoll_sched_prep() + * iopoll handler will not be invoked again before irq_poll_sched() * is called.
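 * (Note: the separate blk_iopoll_sched_prep() step is gone; as the hunk above shows, irq_poll_sched() now tests and sets the SCHED bit itself and returns early if a poll is already scheduled or the instance is disabled.)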
**/ -void blk_iopoll_complete(struct blk_iopoll *iop) +void irq_poll_complete(struct irq_poll *iop) { unsigned long flags; local_irq_save(flags); - __blk_iopoll_complete(iop); + __irq_poll_complete(iop); local_irq_restore(flags); } -EXPORT_SYMBOL(blk_iopoll_complete); +EXPORT_SYMBOL(irq_poll_complete); -static void blk_iopoll_softirq(struct softirq_action *h) +static void irq_poll_softirq(struct softirq_action *h) { struct list_head *list = this_cpu_ptr(&blk_cpu_iopoll); - int rearm = 0, budget = blk_iopoll_budget; + int rearm = 0, budget = irq_poll_budget; unsigned long start_time = jiffies; local_irq_disable(); while (!list_empty(list)) { - struct blk_iopoll *iop; + struct irq_poll *iop; int work, weight; /* @@ -101,11 +101,11 @@ static void blk_iopoll_softirq(struct softirq_action *h) * entries to the tail of this list, and only ->poll() * calls can remove this head entry from the list. */ - iop = list_entry(list->next, struct blk_iopoll, list); + iop = list_entry(list->next, struct irq_poll, list); weight = iop->weight; work = 0; - if (test_bit(IOPOLL_F_SCHED, &iop->state)) + if (test_bit(IRQ_POLL_F_SCHED, &iop->state)) work = iop->poll(iop, weight); budget -= work; @@ -121,72 +121,70 @@ static void blk_iopoll_softirq(struct softirq_action *h) * move the instance around on the list at-will. */ if (work >= weight) { - if (blk_iopoll_disable_pending(iop)) - __blk_iopoll_complete(iop); + if (test_bit(IRQ_POLL_F_DISABLE, &iop->state)) + __irq_poll_complete(iop); else list_move_tail(&iop->list, list); } } if (rearm) - __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ); + __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ); local_irq_enable(); } /** - * blk_iopoll_disable - Disable iopoll on this @iop + * irq_poll_disable - Disable iopoll on this @iop * @iop: The parent iopoll structure * * Description: * Disable io polling and wait for any pending callbacks to have completed. **/ -void blk_iopoll_disable(struct blk_iopoll *iop) +void irq_poll_disable(struct irq_poll *iop) { - set_bit(IOPOLL_F_DISABLE, &iop->state); - while (test_and_set_bit(IOPOLL_F_SCHED, &iop->state)) + set_bit(IRQ_POLL_F_DISABLE, &iop->state); + while (test_and_set_bit(IRQ_POLL_F_SCHED, &iop->state)) msleep(1); - clear_bit(IOPOLL_F_DISABLE, &iop->state); + clear_bit(IRQ_POLL_F_DISABLE, &iop->state); } -EXPORT_SYMBOL(blk_iopoll_disable); +EXPORT_SYMBOL(irq_poll_disable); /** - * blk_iopoll_enable - Enable iopoll on this @iop + * irq_poll_enable - Enable iopoll on this @iop * @iop: The parent iopoll structure * * Description: * Enable iopoll on this @iop. Note that the handler run will not be * scheduled, it will only mark it as active. **/ -void blk_iopoll_enable(struct blk_iopoll *iop) +void irq_poll_enable(struct irq_poll *iop) { - BUG_ON(!test_bit(IOPOLL_F_SCHED, &iop->state)); + BUG_ON(!test_bit(IRQ_POLL_F_SCHED, &iop->state)); smp_mb__before_atomic(); - clear_bit_unlock(IOPOLL_F_SCHED, &iop->state); + clear_bit_unlock(IRQ_POLL_F_SCHED, &iop->state); } -EXPORT_SYMBOL(blk_iopoll_enable); +EXPORT_SYMBOL(irq_poll_enable); /** - * blk_iopoll_init - Initialize this @iop + * irq_poll_init - Initialize this @iop * @iop: The parent iopoll structure * @weight: The default weight (or command completion budget) * @poll_fn: The handler to invoke * * Description: - * Initialize this blk_iopoll structure. Before being actively used, the - * driver must call blk_iopoll_enable(). + * Initialize and enable this irq_poll structure. 
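 *
 * A minimal usage sketch (hypothetical driver; process_completions() is an
 * assumed helper that reaps up to @budget completions and returns how many
 * it handled):
 *
 *	static int my_poll(struct irq_poll *iop, int budget)
 *	{
 *		int done = process_completions(budget);
 *
 *		if (done < budget)
 *			irq_poll_complete(iop);
 *		return done;
 *	}
 *
 *	irq_poll_init(&dev->iop, 64, my_poll);
 *	...
 *	irq_poll_sched(&dev->iop);	called from the device's hard interrupt handler
 *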
**/ -void blk_iopoll_init(struct blk_iopoll *iop, int weight, blk_iopoll_fn *poll_fn) +void irq_poll_init(struct irq_poll *iop, int weight, irq_poll_fn *poll_fn) { memset(iop, 0, sizeof(*iop)); INIT_LIST_HEAD(&iop->list); iop->weight = weight; iop->poll = poll_fn; - set_bit(IOPOLL_F_SCHED, &iop->state); } -EXPORT_SYMBOL(blk_iopoll_init); +EXPORT_SYMBOL(irq_poll_init); -static int blk_iopoll_cpu_notify(struct notifier_block *self, +static int irq_poll_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { /* @@ -199,26 +197,26 @@ static int blk_iopoll_cpu_notify(struct notifier_block *self, local_irq_disable(); list_splice_init(&per_cpu(blk_cpu_iopoll, cpu), this_cpu_ptr(&blk_cpu_iopoll)); - __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ); + __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ); local_irq_enable(); } return NOTIFY_OK; } -static struct notifier_block blk_iopoll_cpu_notifier = { - .notifier_call = blk_iopoll_cpu_notify, +static struct notifier_block irq_poll_cpu_notifier = { + .notifier_call = irq_poll_cpu_notify, }; -static __init int blk_iopoll_setup(void) +static __init int irq_poll_setup(void) { int i; for_each_possible_cpu(i) INIT_LIST_HEAD(&per_cpu(blk_cpu_iopoll, i)); - open_softirq(BLOCK_IOPOLL_SOFTIRQ, blk_iopoll_softirq); - register_hotcpu_notifier(&blk_iopoll_cpu_notifier); + open_softirq(IRQ_POLL_SOFTIRQ, irq_poll_softirq); + register_hotcpu_notifier(&irq_poll_cpu_notifier); return 0; } -subsys_initcall(blk_iopoll_setup); +subsys_initcall(irq_poll_setup); diff --git a/lib/libcrc32c.c b/lib/libcrc32c.c index 31ce853fbfb1..74a54b7f2562 100644 --- a/lib/libcrc32c.c +++ b/lib/libcrc32c.c @@ -75,3 +75,4 @@ module_exit(libcrc32c_mod_fini); MODULE_AUTHOR("Clay Haapala <chaapala@cisco.com>"); MODULE_DESCRIPTION("CRC32c (Castagnoli) calculations"); MODULE_LICENSE("GPL"); +MODULE_SOFTDEP("pre: crc32c"); diff --git a/lib/lru_cache.c b/lib/lru_cache.c index 028f5d996eef..28ba40b99337 100644 --- a/lib/lru_cache.c +++ b/lib/lru_cache.c @@ -238,7 +238,7 @@ void lc_reset(struct lru_cache *lc) * @seq: the seq_file to print into * @lc: the lru cache to print statistics of */ -size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc) +void lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc) { /* NOTE: * total calls to lc_get are @@ -250,8 +250,6 @@ size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc) seq_printf(seq, "\t%s: used:%u/%u hits:%lu misses:%lu starving:%lu locked:%lu changed:%lu\n", lc->name, lc->used, lc->nr_elements, lc->hits, lc->misses, lc->starving, lc->locked, lc->changed); - - return 0; } static struct hlist_head *lc_hash_slot(struct lru_cache *lc, unsigned int enr) diff --git a/lib/ratelimit.c b/lib/ratelimit.c index 40e03ea2a967..2c5de86460c5 100644 --- a/lib/ratelimit.c +++ b/lib/ratelimit.c @@ -49,7 +49,7 @@ int ___ratelimit(struct ratelimit_state *rs, const char *func) if (rs->missed) printk(KERN_WARNING "%s: %d callbacks suppressed\n", func, rs->missed); - rs->begin = 0; + rs->begin = jiffies; rs->printed = 0; rs->missed = 0; } diff --git a/mm/filemap.c b/mm/filemap.c index 847ee43c2806..bc943867d68c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -11,6 +11,7 @@ */ #include <linux/export.h> #include <linux/compiler.h> +#include <linux/dax.h> #include <linux/fs.h> #include <linux/uaccess.h> #include <linux/capability.h> @@ -123,9 +124,9 @@ static void page_cache_tree_delete(struct address_space *mapping, __radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot); if (shadow) { - mapping->nrshadows++; + 
mapping->nrexceptional++; /* - * Make sure the nrshadows update is committed before + * Make sure the nrexceptional update is committed before * the nrpages update so that final truncate racing * with reclaim does not see both counters 0 at the * same time and miss a shadow entry. @@ -481,6 +482,12 @@ int filemap_write_and_wait_range(struct address_space *mapping, { int err = 0; + if (dax_mapping(mapping) && mapping->nrexceptional) { + err = dax_writeback_mapping_range(mapping, lstart, lend); + if (err) + return err; + } + if (mapping->nrpages) { err = __filemap_fdatawrite_range(mapping, lstart, lend, WB_SYNC_ALL); @@ -579,9 +586,13 @@ static int page_cache_tree_insert(struct address_space *mapping, p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock); if (!radix_tree_exceptional_entry(p)) return -EEXIST; + + if (WARN_ON(dax_mapping(mapping))) + return -EINVAL; + if (shadowp) *shadowp = p; - mapping->nrshadows--; + mapping->nrexceptional--; if (node) workingset_node_shadows_dec(node); } @@ -1245,9 +1256,9 @@ repeat: if (radix_tree_deref_retry(page)) goto restart; /* - * A shadow entry of a recently evicted page, - * or a swap entry from shmem/tmpfs. Return - * it without attempting to raise page count. + * A shadow entry of a recently evicted page, a swap + * entry from shmem/tmpfs or a DAX entry. Return it + * without attempting to raise page count. */ goto export; } @@ -1494,6 +1505,74 @@ repeat: } EXPORT_SYMBOL(find_get_pages_tag); +/** + * find_get_entries_tag - find and return entries that match @tag + * @mapping: the address_space to search + * @start: the starting page cache index + * @tag: the tag index + * @nr_entries: the maximum number of entries + * @entries: where the resulting entries are placed + * @indices: the cache indices corresponding to the entries in @entries + * + * Like find_get_entries, except we only return entries which are tagged with + * @tag. + */ +unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, + int tag, unsigned int nr_entries, + struct page **entries, pgoff_t *indices) +{ + void **slot; + unsigned int ret = 0; + struct radix_tree_iter iter; + + if (!nr_entries) + return 0; + + rcu_read_lock(); +restart: + radix_tree_for_each_tagged(slot, &mapping->page_tree, + &iter, start, tag) { + struct page *page; +repeat: + page = radix_tree_deref_slot(slot); + if (unlikely(!page)) + continue; + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) { + /* + * Transient condition which can only trigger + * when entry at index 0 moves out of or back + * to root: none yet gotten, safe to restart. + */ + goto restart; + } + + /* + * A shadow entry of a recently evicted page, a swap + * entry from shmem/tmpfs or a DAX entry. Return it + * without attempting to raise page count. + */ + goto export; + } + if (!page_cache_get_speculative(page)) + goto repeat; + + /* Has the page moved? */ + if (unlikely(page != *slot)) { + page_cache_release(page); + goto repeat; + } +export: + indices[ret] = iter.index; + entries[ret] = page; + if (++ret == nr_entries) + break; + } + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL(find_get_entries_tag); + /* * CD/DVDs are error prone. When a medium error occurs, the driver may fail * a _large_ part of the i/o request. 
Imagine the worst scenario: @@ -2684,11 +2763,11 @@ ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = file->f_mapping->host; ssize_t ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret > 0) ret = __generic_file_write_iter(iocb, from); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (ret > 0) { ssize_t err; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 8ad580273521..fd3a07b3e6f4 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1560,7 +1560,8 @@ int madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, struct mm_struct *mm = tlb->mm; int ret = 0; - if (!pmd_trans_huge_lock(pmd, vma, &ptl)) + ptl = pmd_trans_huge_lock(pmd, vma); + if (!ptl) goto out_unlocked; orig_pmd = *pmd; @@ -1627,7 +1628,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t orig_pmd; spinlock_t *ptl; - if (!__pmd_trans_huge_lock(pmd, vma, &ptl)) + ptl = __pmd_trans_huge_lock(pmd, vma); + if (!ptl) return 0; /* * For architectures like ppc64 we look at deposited pgtable @@ -1690,7 +1692,8 @@ bool move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, * We don't have to worry about the ordering of src and dst * ptlocks because exclusive mmap_sem prevents deadlock. */ - if (__pmd_trans_huge_lock(old_pmd, vma, &old_ptl)) { + old_ptl = __pmd_trans_huge_lock(old_pmd, vma); + if (old_ptl) { new_ptl = pmd_lockptr(mm, new_pmd); if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); @@ -1724,7 +1727,8 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, spinlock_t *ptl; int ret = 0; - if (__pmd_trans_huge_lock(pmd, vma, &ptl)) { + ptl = __pmd_trans_huge_lock(pmd, vma); + if (ptl) { pmd_t entry; bool preserve_write = prot_numa && pmd_write(*pmd); ret = 1; @@ -1760,14 +1764,14 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, * Note that if it returns true, this routine returns without unlocking page * table lock. So callers must unlock it. 
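 * (With this patch the lock is conveyed through the return value instead: the function returns the held spinlock_t pointer on success and NULL when the pmd is not huge, as the hunk below shows.)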
*/ -bool __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, - spinlock_t **ptl) +spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { - *ptl = pmd_lock(vma->vm_mm, pmd); + spinlock_t *ptl; + ptl = pmd_lock(vma->vm_mm, pmd); if (likely(pmd_trans_huge(*pmd) || pmd_devmap(*pmd))) - return true; - spin_unlock(*ptl); - return false; + return ptl; + spin_unlock(ptl); + return NULL; } #define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE) @@ -2068,7 +2072,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, if (likely(writable)) { if (likely(referenced)) { result = SCAN_SUCCEED; - trace_mm_collapse_huge_page_isolate(page_to_pfn(page), none_or_zero, + trace_mm_collapse_huge_page_isolate(page, none_or_zero, referenced, writable, result); return 1; } @@ -2078,7 +2082,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, out: release_pte_pages(pte, _pte); - trace_mm_collapse_huge_page_isolate(page_to_pfn(page), none_or_zero, + trace_mm_collapse_huge_page_isolate(page, none_or_zero, referenced, writable, result); return 0; } @@ -2576,7 +2580,7 @@ out_unmap: collapse_huge_page(mm, address, hpage, vma, node); } out: - trace_mm_khugepaged_scan_pmd(mm, page_to_pfn(page), writable, referenced, + trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced, none_or_zero, result); return ret; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ca052f2a4a0b..d06cae2de783 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4638,7 +4638,8 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, pte_t *pte; spinlock_t *ptl; - if (pmd_trans_huge_lock(pmd, vma, &ptl)) { + ptl = pmd_trans_huge_lock(pmd, vma); + if (ptl) { if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) mc.precharge += HPAGE_PMD_NR; spin_unlock(ptl); @@ -4826,7 +4827,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, union mc_target target; struct page *page; - if (pmd_trans_huge_lock(pmd, vma, &ptl)) { + ptl = pmd_trans_huge_lock(pmd, vma); + if (ptl) { if (mc.precharge < HPAGE_PMD_NR) { spin_unlock(ptl); return 0; diff --git a/mm/mincore.c b/mm/mincore.c index 2a565ed8bb49..563f32045490 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -117,7 +117,8 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, unsigned char *vec = walk->private; int nr = (end - addr) >> PAGE_SHIFT; - if (pmd_trans_huge_lock(pmd, vma, &ptl)) { + ptl = pmd_trans_huge_lock(pmd, vma); + if (ptl) { memset(vec, 1, nr); spin_unlock(ptl); goto out; diff --git a/mm/mlock.c b/mm/mlock.c index e1e2b1207bf2..96f001041928 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -175,7 +175,7 @@ static void __munlock_isolation_failed(struct page *page) */ unsigned int munlock_vma_page(struct page *page) { - unsigned int nr_pages; + int nr_pages; struct zone *zone = page_zone(page); /* For try_to_munlock() and to serialize with page migration */ diff --git a/mm/percpu.c b/mm/percpu.c index 8a943b97a053..998607adf6eb 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -305,16 +305,12 @@ static void *pcpu_mem_zalloc(size_t size) /** * pcpu_mem_free - free memory * @ptr: memory to free - * @size: size of the area * * Free @ptr. @ptr should have been allocated using pcpu_mem_zalloc(). 
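 * (kvfree() below dispatches to kfree() or vfree() as appropriate, which is why the old @size hint can be dropped.)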
*/ -static void pcpu_mem_free(void *ptr, size_t size) +static void pcpu_mem_free(void *ptr) { - if (size <= PAGE_SIZE) - kfree(ptr); - else - vfree(ptr); + kvfree(ptr); } /** @@ -463,8 +459,8 @@ out_unlock: * pcpu_mem_free() might end up calling vfree() which uses * IRQ-unsafe lock and thus can't be called under pcpu_lock. */ - pcpu_mem_free(old, old_size); - pcpu_mem_free(new, new_size); + pcpu_mem_free(old); + pcpu_mem_free(new); return 0; } @@ -732,7 +728,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void) chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); if (!chunk->map) { - pcpu_mem_free(chunk, pcpu_chunk_struct_size); + pcpu_mem_free(chunk); return NULL; } @@ -753,8 +749,8 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk) { if (!chunk) return; - pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0])); - pcpu_mem_free(chunk, pcpu_chunk_struct_size); + pcpu_mem_free(chunk->map); + pcpu_mem_free(chunk); } /** diff --git a/mm/shmem.c b/mm/shmem.c index fa2ceb2d2655..440e2a7e6c1c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -701,8 +701,7 @@ static void shmem_evict_inode(struct inode *inode) list_del_init(&info->swaplist); mutex_unlock(&shmem_swaplist_mutex); } - } else - kfree(info->symlink); + } simple_xattrs_free(&info->xattrs); WARN_ON(inode->i_blocks); @@ -1902,7 +1901,7 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) if (whence != SEEK_DATA && whence != SEEK_HOLE) return generic_file_llseek_size(file, offset, whence, MAX_LFS_FILESIZE, i_size_read(inode)); - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* We're holding i_mutex so we can access i_size directly */ if (offset < 0) @@ -1926,7 +1925,7 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) if (offset >= 0) offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return offset; } @@ -2091,7 +2090,7 @@ int shmem_add_seals(struct file *file, unsigned int seals) if (seals & ~(unsigned int)F_ALL_SEALS) return -EINVAL; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (info->seals & F_SEAL_SEAL) { error = -EPERM; @@ -2114,7 +2113,7 @@ int shmem_add_seals(struct file *file, unsigned int seals) error = 0; unlock: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return error; } EXPORT_SYMBOL_GPL(shmem_add_seals); @@ -2164,7 +2163,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) return -EOPNOTSUPP; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (mode & FALLOC_FL_PUNCH_HOLE) { struct address_space *mapping = file->f_mapping; @@ -2277,7 +2276,7 @@ undone: inode->i_private = NULL; spin_unlock(&inode->i_lock); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return error; } @@ -2549,13 +2548,12 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s info = SHMEM_I(inode); inode->i_size = len-1; if (len <= SHORT_SYMLINK_LEN) { - info->symlink = kmemdup(symname, len, GFP_KERNEL); - if (!info->symlink) { + inode->i_link = kmemdup(symname, len, GFP_KERNEL); + if (!inode->i_link) { iput(inode); return -ENOMEM; } inode->i_op = &shmem_short_symlink_operations; - inode->i_link = info->symlink; } else { inode_nohighmem(inode); error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL); @@ -3132,6 +3130,7 @@ static struct inode *shmem_alloc_inode(struct super_block *sb) static void shmem_destroy_callback(struct rcu_head *head) { struct inode *inode = 
container_of(head, struct inode, i_rcu); + kfree(inode->i_link); kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); } diff --git a/mm/swapfile.c b/mm/swapfile.c index c43f654a7b64..d2c37365e2d6 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1956,9 +1956,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) set_blocksize(bdev, old_block_size); blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); } else { - mutex_lock(&inode->i_mutex); + inode_lock(inode); inode->i_flags &= ~S_SWAPFILE; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } filp_close(swap_file, NULL); @@ -2183,7 +2183,7 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode) p->flags |= SWP_BLKDEV; } else if (S_ISREG(inode->i_mode)) { p->bdev = inode->i_sb->s_bdev; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (IS_SWAPFILE(inode)) return -EBUSY; } else @@ -2416,7 +2416,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) mapping = swap_file->f_mapping; inode = mapping->host; - /* If S_ISREG(inode->i_mode) will do mutex_lock(&inode->i_mutex); */ + /* If S_ISREG(inode->i_mode) will do inode_lock(inode); */ error = claim_swapfile(p, inode); if (unlikely(error)) goto bad_swap; @@ -2561,7 +2561,7 @@ bad_swap: vfree(cluster_info); if (swap_file) { if (inode && S_ISREG(inode->i_mode)) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); inode = NULL; } filp_close(swap_file, NULL); @@ -2574,7 +2574,7 @@ out: if (name) putname(name); if (inode && S_ISREG(inode->i_mode)) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return error; } diff --git a/mm/truncate.c b/mm/truncate.c index 76e35ad97102..e3ee0e27cd17 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -9,6 +9,7 @@ #include <linux/kernel.h> #include <linux/backing-dev.h> +#include <linux/dax.h> #include <linux/gfp.h> #include <linux/mm.h> #include <linux/swap.h> @@ -34,31 +35,39 @@ static void clear_exceptional_entry(struct address_space *mapping, return; spin_lock_irq(&mapping->tree_lock); - /* - * Regular page slots are stabilized by the page lock even - * without the tree itself locked. These unlocked entries - * need verification under the tree lock. - */ - if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot)) - goto unlock; - if (*slot != entry) - goto unlock; - radix_tree_replace_slot(slot, NULL); - mapping->nrshadows--; - if (!node) - goto unlock; - workingset_node_shadows_dec(node); - /* - * Don't track node without shadow entries. - * - * Avoid acquiring the list_lru lock if already untracked. - * The list_empty() test is safe as node->private_list is - * protected by mapping->tree_lock. - */ - if (!workingset_node_shadows(node) && - !list_empty(&node->private_list)) - list_lru_del(&workingset_shadow_nodes, &node->private_list); - __radix_tree_delete_node(&mapping->page_tree, node); + + if (dax_mapping(mapping)) { + if (radix_tree_delete_item(&mapping->page_tree, index, entry)) + mapping->nrexceptional--; + } else { + /* + * Regular page slots are stabilized by the page lock even + * without the tree itself locked. These unlocked entries + * need verification under the tree lock. + */ + if (!__radix_tree_lookup(&mapping->page_tree, index, &node, + &slot)) + goto unlock; + if (*slot != entry) + goto unlock; + radix_tree_replace_slot(slot, NULL); + mapping->nrexceptional--; + if (!node) + goto unlock; + workingset_node_shadows_dec(node); + /* + * Don't track node without shadow entries. + * + * Avoid acquiring the list_lru lock if already untracked. 
+ * The list_empty() test is safe as node->private_list is + * protected by mapping->tree_lock. + */ + if (!workingset_node_shadows(node) && + !list_empty(&node->private_list)) + list_lru_del(&workingset_shadow_nodes, + &node->private_list); + __radix_tree_delete_node(&mapping->page_tree, node); + } unlock: spin_unlock_irq(&mapping->tree_lock); } @@ -228,7 +237,7 @@ void truncate_inode_pages_range(struct address_space *mapping, int i; cleancache_invalidate_inode(mapping); - if (mapping->nrpages == 0 && mapping->nrshadows == 0) + if (mapping->nrpages == 0 && mapping->nrexceptional == 0) return; /* Offsets within partial pages */ @@ -402,7 +411,7 @@ EXPORT_SYMBOL(truncate_inode_pages); */ void truncate_inode_pages_final(struct address_space *mapping) { - unsigned long nrshadows; + unsigned long nrexceptional; unsigned long nrpages; /* @@ -416,14 +425,14 @@ void truncate_inode_pages_final(struct address_space *mapping) /* * When reclaim installs eviction entries, it increases - * nrshadows first, then decreases nrpages. Make sure we see + * nrexceptional first, then decreases nrpages. Make sure we see * this in the right order or we might miss an entry. */ nrpages = mapping->nrpages; smp_rmb(); - nrshadows = mapping->nrshadows; + nrexceptional = mapping->nrexceptional; - if (nrpages || nrshadows) { + if (nrpages || nrexceptional) { /* * As truncation uses a lockless tree lookup, cycle * the tree lock to make sure any ongoing tree diff --git a/mm/vmscan.c b/mm/vmscan.c index bd620b65db52..eb3dd37ccd7c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -46,6 +46,7 @@ #include <linux/oom.h> #include <linux/prefetch.h> #include <linux/printk.h> +#include <linux/dax.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -671,9 +672,15 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, * inode reclaim needs to empty out the radix tree or * the nodes are lost. Don't plant shadows behind its * back. + * + * We also don't store shadows for DAX mappings because the + * only page cache pages found in these are zero pages + * covering holes, and because we don't want to mix DAX + * exceptional entries and shadow exceptional entries in the + * same page_tree. */ if (reclaimed && page_is_file_cache(page) && - !mapping_exiting(mapping)) + !mapping_exiting(mapping) && !dax_mapping(mapping)) shadow = workingset_eviction(mapping, page); __delete_from_page_cache(page, shadow, memcg); spin_unlock_irqrestore(&mapping->tree_lock, flags); diff --git a/mm/vmstat.c b/mm/vmstat.c index 64bd0aa13f75..40b2c74ddf16 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1408,17 +1408,7 @@ static void vmstat_update(struct work_struct *w) * Defer the checking for differentials to the * shepherd thread on a different processor. */ - int r; - /* - * Shepherd work thread does not race since it never - * changes the bit if its zero but the cpu - * online / off line code may race if - * worker threads are still allowed during - * shutdown / startup. 
- */ - r = cpumask_test_and_set_cpu(smp_processor_id(), - cpu_stat_off); - VM_BUG_ON(r); + cpumask_set_cpu(smp_processor_id(), cpu_stat_off); } } diff --git a/mm/workingset.c b/mm/workingset.c index aa017133744b..61ead9e5549d 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -351,8 +351,8 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, node->slots[i] = NULL; BUG_ON(node->count < (1U << RADIX_TREE_COUNT_SHIFT)); node->count -= 1U << RADIX_TREE_COUNT_SHIFT; - BUG_ON(!mapping->nrshadows); - mapping->nrshadows--; + BUG_ON(!mapping->nrexceptional); + mapping->nrexceptional--; } } BUG_ON(node->count); diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index bced8c074c12..7bc2208b6cc4 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -108,9 +108,7 @@ struct p9_poll_wait { * @unsent_req_list: accounting for requests that haven't been sent * @req: current request being processed (if any) * @tmp_buf: temporary buffer to read in header - * @rsize: amount to read for current frame - * @rpos: read position in current frame - * @rbuf: current read buffer + * @rc: temporary fcall for reading current frame * @wpos: write position for current frame * @wsize: amount of data to write for current frame * @wbuf: current write buffer @@ -131,9 +129,7 @@ struct p9_conn { struct list_head unsent_req_list; struct p9_req_t *req; char tmp_buf[7]; - int rsize; - int rpos; - char *rbuf; + struct p9_fcall rc; int wpos; int wsize; char *wbuf; @@ -305,69 +301,77 @@ static void p9_read_work(struct work_struct *work) if (m->err < 0) return; - p9_debug(P9_DEBUG_TRANS, "start mux %p pos %d\n", m, m->rpos); + p9_debug(P9_DEBUG_TRANS, "start mux %p pos %zd\n", m, m->rc.offset); - if (!m->rbuf) { - m->rbuf = m->tmp_buf; - m->rpos = 0; - m->rsize = 7; /* start by reading header */ + if (!m->rc.sdata) { + m->rc.sdata = m->tmp_buf; + m->rc.offset = 0; + m->rc.capacity = 7; /* start by reading header */ } clear_bit(Rpending, &m->wsched); - p9_debug(P9_DEBUG_TRANS, "read mux %p pos %d size: %d = %d\n", - m, m->rpos, m->rsize, m->rsize-m->rpos); - err = p9_fd_read(m->client, m->rbuf + m->rpos, - m->rsize - m->rpos); + p9_debug(P9_DEBUG_TRANS, "read mux %p pos %zd size: %zd = %zd\n", + m, m->rc.offset, m->rc.capacity, + m->rc.capacity - m->rc.offset); + err = p9_fd_read(m->client, m->rc.sdata + m->rc.offset, + m->rc.capacity - m->rc.offset); p9_debug(P9_DEBUG_TRANS, "mux %p got %d bytes\n", m, err); - if (err == -EAGAIN) { + if (err == -EAGAIN) goto end_clear; - } if (err <= 0) goto error; - m->rpos += err; + m->rc.offset += err; - if ((!m->req) && (m->rpos == m->rsize)) { /* header read in */ - u16 tag; + /* header read in */ + if ((!m->req) && (m->rc.offset == m->rc.capacity)) { p9_debug(P9_DEBUG_TRANS, "got new header\n"); - n = le32_to_cpu(*(__le32 *) m->rbuf); /* read packet size */ - if (n >= m->client->msize) { + err = p9_parse_header(&m->rc, NULL, NULL, NULL, 0); + if (err) { + p9_debug(P9_DEBUG_ERROR, + "error parsing header: %d\n", err); + goto error; + } + + if (m->rc.size >= m->client->msize) { p9_debug(P9_DEBUG_ERROR, - "requested packet size too big: %d\n", n); + "requested packet size too big: %d\n", + m->rc.size); err = -EIO; goto error; } - tag = le16_to_cpu(*(__le16 *) (m->rbuf+5)); /* read tag */ p9_debug(P9_DEBUG_TRANS, - "mux %p pkt: size: %d bytes tag: %d\n", m, n, tag); + "mux %p pkt: size: %d bytes tag: %d\n", + m, m->rc.size, m->rc.tag); - m->req = p9_tag_lookup(m->client, tag); + m->req = p9_tag_lookup(m->client, m->rc.tag); if (!m->req || (m->req->status != REQ_STATUS_SENT)) 
{ p9_debug(P9_DEBUG_ERROR, "Unexpected packet tag %d\n", - tag); + m->rc.tag); err = -EIO; goto error; } if (m->req->rc == NULL) { - m->req->rc = kmalloc(sizeof(struct p9_fcall) + - m->client->msize, GFP_NOFS); - if (!m->req->rc) { - m->req = NULL; - err = -ENOMEM; - goto error; - } + p9_debug(P9_DEBUG_ERROR, + "No recv fcall for tag %d (req %p), disconnecting!\n", + m->rc.tag, m->req); + m->req = NULL; + err = -EIO; + goto error; } - m->rbuf = (char *)m->req->rc + sizeof(struct p9_fcall); - memcpy(m->rbuf, m->tmp_buf, m->rsize); - m->rsize = n; + m->rc.sdata = (char *)m->req->rc + sizeof(struct p9_fcall); + memcpy(m->rc.sdata, m->tmp_buf, m->rc.capacity); + m->rc.capacity = m->rc.size; } - /* not an else because some packets (like clunk) have no payload */ - if ((m->req) && (m->rpos == m->rsize)) { /* packet is read in */ + /* packet is read in + * not an else because some packets (like clunk) have no payload + */ + if ((m->req) && (m->rc.offset == m->rc.capacity)) { p9_debug(P9_DEBUG_TRANS, "got new packet\n"); spin_lock(&m->client->lock); if (m->req->status != REQ_STATUS_ERROR) @@ -375,9 +379,9 @@ static void p9_read_work(struct work_struct *work) list_del(&m->req->req_list); spin_unlock(&m->client->lock); p9_client_cb(m->client, m->req, status); - m->rbuf = NULL; - m->rpos = 0; - m->rsize = 0; + m->rc.sdata = NULL; + m->rc.offset = 0; + m->rc.capacity = 0; m->req = NULL; } diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index 199bc76202d2..4acb1d5417aa 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -658,7 +658,7 @@ p9_virtio_create(struct p9_client *client, const char *devname, char *args) mutex_unlock(&virtio_9p_lock); if (!found) { - pr_err("no channels available\n"); + pr_err("no channels available for device %s\n", devname); return ret; } diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index 10d87753ed87..9e43a315e662 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -152,7 +152,6 @@ static int process_one_ticket(struct ceph_auth_client *ac, void *ticket_buf = NULL; void *tp, *tpend; void **ptp; - struct ceph_timespec new_validity; struct ceph_crypto_key new_session_key; struct ceph_buffer *new_ticket_blob; unsigned long new_expires, new_renew_after; @@ -193,8 +192,8 @@ static int process_one_ticket(struct ceph_auth_client *ac, if (ret) goto out; - ceph_decode_copy(&dp, &new_validity, sizeof(new_validity)); - ceph_decode_timespec(&validity, &new_validity); + ceph_decode_timespec(&validity, dp); + dp += sizeof(struct ceph_timespec); new_expires = get_seconds() + validity.tv_sec; new_renew_after = new_expires - (validity.tv_sec / 4); dout(" expires=%lu renew_after=%lu\n", new_expires, @@ -233,10 +232,10 @@ static int process_one_ticket(struct ceph_auth_client *ac, ceph_buffer_put(th->ticket_blob); th->session_key = new_session_key; th->ticket_blob = new_ticket_blob; - th->validity = new_validity; th->secret_id = new_secret_id; th->expires = new_expires; th->renew_after = new_renew_after; + th->have_key = true; dout(" got ticket service %d (%s) secret_id %lld len %d\n", type, ceph_entity_type_name(type), th->secret_id, (int)th->ticket_blob->vec.iov_len); @@ -384,6 +383,24 @@ bad: return -ERANGE; } +static bool need_key(struct ceph_x_ticket_handler *th) +{ + if (!th->have_key) + return true; + + return get_seconds() >= th->renew_after; +} + +static bool have_key(struct ceph_x_ticket_handler *th) +{ + if (th->have_key) { + if (get_seconds() >= th->expires) + th->have_key = false; + } + + return th->have_key; +} + static void 
ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed) { int want = ac->want_keys; @@ -402,20 +419,18 @@ static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed) continue; th = get_ticket_handler(ac, service); - if (IS_ERR(th)) { *pneed |= service; continue; } - if (get_seconds() >= th->renew_after) + if (need_key(th)) *pneed |= service; - if (get_seconds() >= th->expires) + if (!have_key(th)) xi->have_keys &= ~service; } } - static int ceph_x_build_request(struct ceph_auth_client *ac, void *buf, void *end) { @@ -667,14 +682,26 @@ static void ceph_x_destroy(struct ceph_auth_client *ac) ac->private = NULL; } -static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac, - int peer_type) +static void invalidate_ticket(struct ceph_auth_client *ac, int peer_type) { struct ceph_x_ticket_handler *th; th = get_ticket_handler(ac, peer_type); if (!IS_ERR(th)) - memset(&th->validity, 0, sizeof(th->validity)); + th->have_key = false; +} + +static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac, + int peer_type) +{ + /* + * We are to invalidate a service ticket in the hopes of + * getting a new, hopefully more valid, one. But, we won't get + * it unless our AUTH ticket is good, so invalidate AUTH ticket + * as well, just in case. + */ + invalidate_ticket(ac, peer_type); + invalidate_ticket(ac, CEPH_ENTITY_TYPE_AUTH); } static int calcu_signature(struct ceph_x_authorizer *au, diff --git a/net/ceph/auth_x.h b/net/ceph/auth_x.h index e8b7c6917d47..40b1a3cf7397 100644 --- a/net/ceph/auth_x.h +++ b/net/ceph/auth_x.h @@ -16,7 +16,7 @@ struct ceph_x_ticket_handler { unsigned int service; struct ceph_crypto_key session_key; - struct ceph_timespec validity; + bool have_key; u64 secret_id; struct ceph_buffer *ticket_blob; diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 9981039ef4ff..9cfedf565f5b 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -23,9 +23,6 @@ #include <linux/ceph/pagelist.h> #include <linux/export.h> -#define list_entry_next(pos, member) \ - list_entry(pos->member.next, typeof(*pos), member) - /* * Ceph uses the messenger to exchange ceph_msg messages with other * hosts in the system. The messenger provides ordered and reliable @@ -672,6 +669,8 @@ static void reset_connection(struct ceph_connection *con) } con->in_seq = 0; con->in_seq_acked = 0; + + con->out_skip = 0; } /* @@ -771,6 +770,8 @@ static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt) static void con_out_kvec_reset(struct ceph_connection *con) { + BUG_ON(con->out_skip); + con->out_kvec_left = 0; con->out_kvec_bytes = 0; con->out_kvec_cur = &con->out_kvec[0]; @@ -779,9 +780,9 @@ static void con_out_kvec_reset(struct ceph_connection *con) static void con_out_kvec_add(struct ceph_connection *con, size_t size, void *data) { - int index; + int index = con->out_kvec_left; - index = con->out_kvec_left; + BUG_ON(con->out_skip); BUG_ON(index >= ARRAY_SIZE(con->out_kvec)); con->out_kvec[index].iov_len = size; @@ -790,6 +791,27 @@ static void con_out_kvec_add(struct ceph_connection *con, con->out_kvec_bytes += size; } +/* + * Chop off a kvec from the end. Return residual number of bytes for + * that kvec, i.e. how many bytes would have been written if the kvec + * hadn't been nuked. 
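+ * (This helper is used by ceph_msg_revoke() further down to turn already-queued kvecs into a byte count for out_skip.)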
+ */ +static int con_out_kvec_skip(struct ceph_connection *con) +{ + int off = con->out_kvec_cur - con->out_kvec; + int skip = 0; + + if (con->out_kvec_bytes > 0) { + skip = con->out_kvec[off + con->out_kvec_left - 1].iov_len; + BUG_ON(con->out_kvec_bytes < skip); + BUG_ON(!con->out_kvec_left); + con->out_kvec_bytes -= skip; + con->out_kvec_left--; + } + + return skip; +} + #ifdef CONFIG_BLOCK /* @@ -1042,7 +1064,7 @@ static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data_cursor *cursor, /* Move on to the next page */ BUG_ON(list_is_last(&cursor->page->lru, &pagelist->head)); - cursor->page = list_entry_next(cursor->page, lru); + cursor->page = list_next_entry(cursor->page, lru); cursor->last_piece = cursor->resid <= PAGE_SIZE; return true; @@ -1166,7 +1188,7 @@ static bool ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, if (!cursor->resid && cursor->total_resid) { WARN_ON(!cursor->last_piece); BUG_ON(list_is_last(&cursor->data->links, cursor->data_head)); - cursor->data = list_entry_next(cursor->data, links); + cursor->data = list_next_entry(cursor->data, links); __ceph_msg_data_cursor_init(cursor); new_piece = true; } @@ -1197,7 +1219,6 @@ static void prepare_write_message_footer(struct ceph_connection *con) m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE; dout("prepare_write_message_footer %p\n", con); - con->out_kvec_is_msg = true; con->out_kvec[v].iov_base = &m->footer; if (con->peer_features & CEPH_FEATURE_MSG_AUTH) { if (con->ops->sign_message) @@ -1225,7 +1246,6 @@ static void prepare_write_message(struct ceph_connection *con) u32 crc; con_out_kvec_reset(con); - con->out_kvec_is_msg = true; con->out_msg_done = false; /* Sneak an ack in there first? If we can get it into the same @@ -1265,18 +1285,19 @@ static void prepare_write_message(struct ceph_connection *con) /* tag + hdr + front + middle */ con_out_kvec_add(con, sizeof (tag_msg), &tag_msg); - con_out_kvec_add(con, sizeof (m->hdr), &m->hdr); + con_out_kvec_add(con, sizeof(con->out_hdr), &con->out_hdr); con_out_kvec_add(con, m->front.iov_len, m->front.iov_base); if (m->middle) con_out_kvec_add(con, m->middle->vec.iov_len, m->middle->vec.iov_base); - /* fill in crc (except data pages), footer */ + /* fill in hdr crc and finalize hdr */ crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc)); con->out_msg->hdr.crc = cpu_to_le32(crc); - con->out_msg->footer.flags = 0; + memcpy(&con->out_hdr, &con->out_msg->hdr, sizeof(con->out_hdr)); + /* fill in front and middle crc, footer */ crc = crc32c(0, m->front.iov_base, m->front.iov_len); con->out_msg->footer.front_crc = cpu_to_le32(crc); if (m->middle) { @@ -1288,6 +1309,7 @@ static void prepare_write_message(struct ceph_connection *con) dout("%s front_crc %u middle_crc %u\n", __func__, le32_to_cpu(con->out_msg->footer.front_crc), le32_to_cpu(con->out_msg->footer.middle_crc)); + con->out_msg->footer.flags = 0; /* is there a data payload? */ con->out_msg->footer.data_crc = 0; @@ -1492,7 +1514,6 @@ static int write_partial_kvec(struct ceph_connection *con) } } con->out_kvec_left = 0; - con->out_kvec_is_msg = false; ret = 1; out: dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con, @@ -1584,6 +1605,7 @@ static int write_partial_skip(struct ceph_connection *con) { int ret; + dout("%s %p %d left\n", __func__, con, con->out_skip); while (con->out_skip > 0) { size_t size = min(con->out_skip, (int) PAGE_CACHE_SIZE); @@ -2506,13 +2528,13 @@ more: more_kvec: /* kvec data queued? 
*/ - if (con->out_skip) { - ret = write_partial_skip(con); + if (con->out_kvec_left) { + ret = write_partial_kvec(con); if (ret <= 0) goto out; } - if (con->out_kvec_left) { - ret = write_partial_kvec(con); + if (con->out_skip) { + ret = write_partial_skip(con); if (ret <= 0) goto out; } @@ -2805,13 +2827,17 @@ static bool con_backoff(struct ceph_connection *con) static void con_fault_finish(struct ceph_connection *con) { + dout("%s %p\n", __func__, con); + /* * in case we faulted due to authentication, invalidate our * current tickets so that we can get new ones. */ - if (con->auth_retry && con->ops->invalidate_authorizer) { - dout("calling invalidate_authorizer()\n"); - con->ops->invalidate_authorizer(con); + if (con->auth_retry) { + dout("auth_retry %d, invalidating\n", con->auth_retry); + if (con->ops->invalidate_authorizer) + con->ops->invalidate_authorizer(con); + con->auth_retry = 0; } if (con->ops->fault) @@ -3050,16 +3076,31 @@ void ceph_msg_revoke(struct ceph_msg *msg) ceph_msg_put(msg); } if (con->out_msg == msg) { - dout("%s %p msg %p - was sending\n", __func__, con, msg); - con->out_msg = NULL; - if (con->out_kvec_is_msg) { - con->out_skip = con->out_kvec_bytes; - con->out_kvec_is_msg = false; + BUG_ON(con->out_skip); + /* footer */ + if (con->out_msg_done) { + con->out_skip += con_out_kvec_skip(con); + } else { + BUG_ON(!msg->data_length); + if (con->peer_features & CEPH_FEATURE_MSG_AUTH) + con->out_skip += sizeof(msg->footer); + else + con->out_skip += sizeof(msg->old_footer); } + /* data, middle, front */ + if (msg->data_length) + con->out_skip += msg->cursor.total_resid; + if (msg->middle) + con->out_skip += con_out_kvec_skip(con); + con->out_skip += con_out_kvec_skip(con); + + dout("%s %p msg %p - was sending, will write %d skip %d\n", + __func__, con, msg, con->out_kvec_bytes, con->out_skip); msg->hdr.seq = 0; - + con->out_msg = NULL; ceph_msg_put(msg); } + mutex_unlock(&con->mutex); } @@ -3361,9 +3402,7 @@ static void ceph_msg_free(struct ceph_msg *m) static void ceph_msg_release(struct kref *kref) { struct ceph_msg *m = container_of(kref, struct ceph_msg, kref); - LIST_HEAD(data); - struct list_head *links; - struct list_head *next; + struct ceph_msg_data *data, *next; dout("%s %p\n", __func__, m); WARN_ON(!list_empty(&m->list_head)); @@ -3376,12 +3415,8 @@ static void ceph_msg_release(struct kref *kref) m->middle = NULL; } - list_splice_init(&m->data, &data); - list_for_each_safe(links, next, &data) { - struct ceph_msg_data *data; - - data = list_entry(links, struct ceph_msg_data, links); - list_del_init(links); + list_for_each_entry_safe(data, next, &m->data, links) { + list_del_init(&data->links); ceph_msg_data_destroy(data); } m->data_length = 0; diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index edda01626a45..de85dddc3dc0 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -364,10 +364,6 @@ static bool have_debugfs_info(struct ceph_mon_client *monc) return monc->client->have_fsid && monc->auth->global_id > 0; } -/* - * The monitor responds with mount ack indicate mount success. The - * included client ticket allows the client to talk to MDSs and OSDs. 
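/*
 * Aside (illustrative, not part of the patch): the ceph_msg_revoke() rework
 * above has to keep the byte stream framed -- once part of a message is on
 * the wire, the remaining front/middle/data/footer bytes must still be
 * emitted (as zeros) before the next message. A sketch of that accounting;
 * field names are stand-ins (the real footer size depends on whether the
 * peer speaks CEPH_FEATURE_MSG_AUTH).
 */
#include <stdbool.h>
#include <stddef.h>

struct revoke_state {
	size_t kvec_residual;  /* unsent bytes still queued as kvecs    */
	size_t data_residual;  /* unsent data-payload bytes             */
	size_t footer_len;     /* footer size negotiated with the peer  */
	bool   footer_queued;  /* footer already sits in the kvec array */
};

static size_t bytes_to_skip(const struct revoke_state *st)
{
	size_t skip = st->kvec_residual + st->data_residual;

	if (!st->footer_queued)
		skip += st->footer_len;  /* peer still expects a footer */
	return skip;
}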
- */ static void ceph_monc_handle_map(struct ceph_mon_client *monc, struct ceph_msg *msg) { diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 744e5936c10d..7aea0ccb6be6 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -289,10 +289,8 @@ static void __node_free_rcu(struct rcu_head *head) if (!n->tn_bits) kmem_cache_free(trie_leaf_kmem, n); - else if (n->tn_bits <= TNODE_KMALLOC_MAX) - kfree(n); else - vfree(n); + kvfree(n); } #define node_free(n) call_rcu(&tn_info(n)->rcu, __node_free_rcu) diff --git a/net/rds/ib.c b/net/rds/ib.c index f222885ac0c7..9481d55ff6cb 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -122,44 +122,34 @@ void rds_ib_dev_put(struct rds_ib_device *rds_ibdev) static void rds_ib_add_one(struct ib_device *device) { struct rds_ib_device *rds_ibdev; - struct ib_device_attr *dev_attr; /* Only handle IB (no iWARP) devices */ if (device->node_type != RDMA_NODE_IB_CA) return; - dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL); - if (!dev_attr) - return; - - if (ib_query_device(device, dev_attr)) { - rdsdebug("Query device failed for %s\n", device->name); - goto free_attr; - } - rds_ibdev = kzalloc_node(sizeof(struct rds_ib_device), GFP_KERNEL, ibdev_to_node(device)); if (!rds_ibdev) - goto free_attr; + return; spin_lock_init(&rds_ibdev->spinlock); atomic_set(&rds_ibdev->refcount, 1); INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free); - rds_ibdev->max_wrs = dev_attr->max_qp_wr; - rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE); + rds_ibdev->max_wrs = device->attrs.max_qp_wr; + rds_ibdev->max_sge = min(device->attrs.max_sge, RDS_IB_MAX_SGE); - rds_ibdev->fmr_max_remaps = dev_attr->max_map_per_fmr?: 32; - rds_ibdev->max_1m_fmrs = dev_attr->max_mr ? - min_t(unsigned int, (dev_attr->max_mr / 2), + rds_ibdev->fmr_max_remaps = device->attrs.max_map_per_fmr?: 32; + rds_ibdev->max_1m_fmrs = device->attrs.max_mr ? + min_t(unsigned int, (device->attrs.max_mr / 2), rds_ib_fmr_1m_pool_size) : rds_ib_fmr_1m_pool_size; - rds_ibdev->max_8k_fmrs = dev_attr->max_mr ? - min_t(unsigned int, ((dev_attr->max_mr / 2) * RDS_MR_8K_SCALE), + rds_ibdev->max_8k_fmrs = device->attrs.max_mr ? 
+ min_t(unsigned int, ((device->attrs.max_mr / 2) * RDS_MR_8K_SCALE), rds_ib_fmr_8k_pool_size) : rds_ib_fmr_8k_pool_size; - rds_ibdev->max_initiator_depth = dev_attr->max_qp_init_rd_atom; - rds_ibdev->max_responder_resources = dev_attr->max_qp_rd_atom; + rds_ibdev->max_initiator_depth = device->attrs.max_qp_init_rd_atom; + rds_ibdev->max_responder_resources = device->attrs.max_qp_rd_atom; rds_ibdev->dev = device; rds_ibdev->pd = ib_alloc_pd(device); @@ -183,7 +173,7 @@ static void rds_ib_add_one(struct ib_device *device) } rdsdebug("RDS/IB: max_mr = %d, max_wrs = %d, max_sge = %d, fmr_max_remaps = %d, max_1m_fmrs = %d, max_8k_fmrs = %d\n", - dev_attr->max_fmr, rds_ibdev->max_wrs, rds_ibdev->max_sge, + device->attrs.max_fmr, rds_ibdev->max_wrs, rds_ibdev->max_sge, rds_ibdev->fmr_max_remaps, rds_ibdev->max_1m_fmrs, rds_ibdev->max_8k_fmrs); @@ -202,8 +192,6 @@ static void rds_ib_add_one(struct ib_device *device) put_dev: rds_ib_dev_put(rds_ibdev); -free_attr: - kfree(dev_attr); } /* diff --git a/net/rds/iw.c b/net/rds/iw.c index 576f1825fc55..f4a9fff829e0 100644 --- a/net/rds/iw.c +++ b/net/rds/iw.c @@ -60,30 +60,20 @@ LIST_HEAD(iw_nodev_conns); static void rds_iw_add_one(struct ib_device *device) { struct rds_iw_device *rds_iwdev; - struct ib_device_attr *dev_attr; /* Only handle iwarp devices */ if (device->node_type != RDMA_NODE_RNIC) return; - dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL); - if (!dev_attr) - return; - - if (ib_query_device(device, dev_attr)) { - rdsdebug("Query device failed for %s\n", device->name); - goto free_attr; - } - rds_iwdev = kmalloc(sizeof *rds_iwdev, GFP_KERNEL); if (!rds_iwdev) - goto free_attr; + return; spin_lock_init(&rds_iwdev->spinlock); - rds_iwdev->dma_local_lkey = !!(dev_attr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY); - rds_iwdev->max_wrs = dev_attr->max_qp_wr; - rds_iwdev->max_sge = min(dev_attr->max_sge, RDS_IW_MAX_SGE); + rds_iwdev->dma_local_lkey = !!(device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY); + rds_iwdev->max_wrs = device->attrs.max_qp_wr; + rds_iwdev->max_sge = min(device->attrs.max_sge, RDS_IW_MAX_SGE); rds_iwdev->dev = device; rds_iwdev->pd = ib_alloc_pd(device); @@ -111,8 +101,7 @@ static void rds_iw_add_one(struct ib_device *device) list_add_tail(&rds_iwdev->list, &rds_iw_devices); ib_set_client_data(device, &rds_iw_client, rds_iwdev); - - goto free_attr; + return; err_mr: if (rds_iwdev->mr) @@ -121,8 +110,6 @@ err_pd: ib_dealloc_pd(rds_iwdev->pd); free_dev: kfree(rds_iwdev); -free_attr: - kfree(dev_attr); } static void rds_iw_remove_one(struct ib_device *device, void *client_data) diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 5e4f815c2b34..2b32fd602669 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -771,7 +771,7 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count, if (count == 0) return 0; - mutex_lock(&inode->i_mutex); /* protect against multiple concurrent + inode_lock(inode); /* protect against multiple concurrent * readers on this file */ again: spin_lock(&queue_lock); @@ -784,7 +784,7 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count, } if (rp->q.list.next == &cd->queue) { spin_unlock(&queue_lock); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); WARN_ON_ONCE(rp->offset); return 0; } @@ -838,7 +838,7 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count, } if (err == -EAGAIN) goto again; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err ? 
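/*
 * Aside (illustrative, not part of the patch): the rds_ib and rds_iw hunks
 * here -- and the xprtrdma hunks further down -- all apply one conversion:
 * device attributes are now cached in ib_device->attrs at registration
 * time, so ULPs read them directly instead of kmalloc()ing an
 * ib_device_attr and calling ib_query_device(). A stand-alone analog of
 * the resulting shape, with stub types instead of the RDMA core's:
 */
struct dev_attrs { int max_qp_wr; int max_sge; };
struct ib_dev    { struct dev_attrs attrs; };
struct ulp_dev   { int max_wrs; int max_sge; };

#define ULP_MAX_SGE 8

static void ulp_add_one(const struct ib_dev *device, struct ulp_dev *ud)
{
	/* Direct reads: no allocation, no query call, no error path. */
	ud->max_wrs = device->attrs.max_qp_wr;
	ud->max_sge = device->attrs.max_sge < ULP_MAX_SGE ?
		      device->attrs.max_sge : ULP_MAX_SGE;
}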
err : count; } @@ -909,9 +909,9 @@ static ssize_t cache_write(struct file *filp, const char __user *buf, if (!cd->cache_parse) goto out; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = cache_downcall(mapping, buf, count, cd); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); out: return ret; } diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 14f45bf0410c..31789ef3e614 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -172,7 +172,7 @@ rpc_close_pipes(struct inode *inode) int need_release; LIST_HEAD(free_list); - mutex_lock(&inode->i_mutex); + inode_lock(inode); spin_lock(&pipe->lock); need_release = pipe->nreaders != 0 || pipe->nwriters != 0; pipe->nreaders = 0; @@ -188,7 +188,7 @@ rpc_close_pipes(struct inode *inode) cancel_delayed_work_sync(&pipe->queue_timeout); rpc_inode_setowner(inode, NULL); RPC_I(inode)->pipe = NULL; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } static struct inode * @@ -221,7 +221,7 @@ rpc_pipe_open(struct inode *inode, struct file *filp) int first_open; int res = -ENXIO; - mutex_lock(&inode->i_mutex); + inode_lock(inode); pipe = RPC_I(inode)->pipe; if (pipe == NULL) goto out; @@ -237,7 +237,7 @@ rpc_pipe_open(struct inode *inode, struct file *filp) pipe->nwriters++; res = 0; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return res; } @@ -248,7 +248,7 @@ rpc_pipe_release(struct inode *inode, struct file *filp) struct rpc_pipe_msg *msg; int last_close; - mutex_lock(&inode->i_mutex); + inode_lock(inode); pipe = RPC_I(inode)->pipe; if (pipe == NULL) goto out; @@ -278,7 +278,7 @@ rpc_pipe_release(struct inode *inode, struct file *filp) if (last_close && pipe->ops->release_pipe) pipe->ops->release_pipe(inode); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return 0; } @@ -290,7 +290,7 @@ rpc_pipe_read(struct file *filp, char __user *buf, size_t len, loff_t *offset) struct rpc_pipe_msg *msg; int res = 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); pipe = RPC_I(inode)->pipe; if (pipe == NULL) { res = -EPIPE; @@ -322,7 +322,7 @@ rpc_pipe_read(struct file *filp, char __user *buf, size_t len, loff_t *offset) pipe->ops->destroy_msg(msg); } out_unlock: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return res; } @@ -332,11 +332,11 @@ rpc_pipe_write(struct file *filp, const char __user *buf, size_t len, loff_t *of struct inode *inode = file_inode(filp); int res; - mutex_lock(&inode->i_mutex); + inode_lock(inode); res = -EPIPE; if (RPC_I(inode)->pipe != NULL) res = RPC_I(inode)->pipe->ops->downcall(filp, buf, len); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return res; } @@ -349,12 +349,12 @@ rpc_pipe_poll(struct file *filp, struct poll_table_struct *wait) poll_wait(filp, &rpci->waitq, wait); - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (rpci->pipe == NULL) mask |= POLLERR | POLLHUP; else if (filp->private_data || !list_empty(&rpci->pipe->pipe)) mask |= POLLIN | POLLRDNORM; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return mask; } @@ -367,10 +367,10 @@ rpc_pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) switch (cmd) { case FIONREAD: - mutex_lock(&inode->i_mutex); + inode_lock(inode); pipe = RPC_I(inode)->pipe; if (pipe == NULL) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return -EPIPE; } spin_lock(&pipe->lock); @@ -381,7 +381,7 @@ rpc_pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) len += msg->len - msg->copied; } spin_unlock(&pipe->lock); - mutex_unlock(&inode->i_mutex); + 
inode_unlock(inode); return put_user(len, (int __user *)arg); default: return -EINVAL; @@ -617,9 +617,9 @@ int rpc_rmdir(struct dentry *dentry) parent = dget_parent(dentry); dir = d_inode(parent); - mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(dir, I_MUTEX_PARENT); error = __rpc_rmdir(dir, dentry); - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); dput(parent); return error; } @@ -701,9 +701,9 @@ static void rpc_depopulate(struct dentry *parent, { struct inode *dir = d_inode(parent); - mutex_lock_nested(&dir->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(dir, I_MUTEX_CHILD); __rpc_depopulate(parent, files, start, eof); - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); } static int rpc_populate(struct dentry *parent, @@ -715,7 +715,7 @@ static int rpc_populate(struct dentry *parent, struct dentry *dentry; int i, err; - mutex_lock(&dir->i_mutex); + inode_lock(dir); for (i = start; i < eof; i++) { dentry = __rpc_lookup_create_exclusive(parent, files[i].name); err = PTR_ERR(dentry); @@ -739,11 +739,11 @@ static int rpc_populate(struct dentry *parent, if (err != 0) goto out_bad; } - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); return 0; out_bad: __rpc_depopulate(parent, files, start, eof); - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); printk(KERN_WARNING "%s: %s failed to populate directory %pd\n", __FILE__, __func__, parent); return err; @@ -757,7 +757,7 @@ static struct dentry *rpc_mkdir_populate(struct dentry *parent, struct inode *dir = d_inode(parent); int error; - mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(dir, I_MUTEX_PARENT); dentry = __rpc_lookup_create_exclusive(parent, name); if (IS_ERR(dentry)) goto out; @@ -770,7 +770,7 @@ static struct dentry *rpc_mkdir_populate(struct dentry *parent, goto err_rmdir; } out: - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); return dentry; err_rmdir: __rpc_rmdir(dir, dentry); @@ -788,11 +788,11 @@ static int rpc_rmdir_depopulate(struct dentry *dentry, parent = dget_parent(dentry); dir = d_inode(parent); - mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(dir, I_MUTEX_PARENT); if (depopulate != NULL) depopulate(dentry); error = __rpc_rmdir(dir, dentry); - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); dput(parent); return error; } @@ -828,7 +828,7 @@ struct dentry *rpc_mkpipe_dentry(struct dentry *parent, const char *name, if (pipe->ops->downcall == NULL) umode &= ~S_IWUGO; - mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(dir, I_MUTEX_PARENT); dentry = __rpc_lookup_create_exclusive(parent, name); if (IS_ERR(dentry)) goto out; @@ -837,7 +837,7 @@ struct dentry *rpc_mkpipe_dentry(struct dentry *parent, const char *name, if (err) goto out_err; out: - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); return dentry; out_err: dentry = ERR_PTR(err); @@ -865,9 +865,9 @@ rpc_unlink(struct dentry *dentry) parent = dget_parent(dentry); dir = d_inode(parent); - mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(dir, I_MUTEX_PARENT); error = __rpc_rmpipe(dir, dentry); - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); dput(parent); return error; } diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 2e98f4a243e5..37edea6fa92d 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1425,3 +1425,4 @@ void xprt_put(struct rpc_xprt *xprt) if (atomic_dec_and_test(&xprt->count)) xprt_destroy(xprt); } +EXPORT_SYMBOL_GPL(xprt_put); diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile index 
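/*
 * Aside (illustrative, not part of the patch): the
 * mutex_lock(&inode->i_mutex) -> inode_lock(inode) conversions in these
 * sunrpc hunks (and in the security/ hunks further down) are mechanical.
 * At the time of this series the wrappers were thin aliases over i_mutex,
 * roughly as below -- see include/linux/fs.h for the authoritative
 * definitions; the indirection exists so the locking primitive can change
 * later without touching every caller.
 */
static inline void inode_lock(struct inode *inode)
{
	mutex_lock(&inode->i_mutex);
}

static inline void inode_unlock(struct inode *inode)
{
	mutex_unlock(&inode->i_mutex);
}

static inline void inode_lock_nested(struct inode *inode, unsigned subclass)
{
	mutex_lock_nested(&inode->i_mutex, subclass);
}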
33f99d3004f2..dc9f3b513a05 100644 --- a/net/sunrpc/xprtrdma/Makefile +++ b/net/sunrpc/xprtrdma/Makefile @@ -2,7 +2,7 @@ obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o rpcrdma-y := transport.o rpc_rdma.o verbs.o \ fmr_ops.o frwr_ops.o physical_ops.o \ - svc_rdma.o svc_rdma_transport.o \ + svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \ svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o \ module.o rpcrdma-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel.o diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index c6836844bd0e..e16567389e28 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -190,12 +190,11 @@ static int frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, struct rpcrdma_create_data_internal *cdata) { - struct ib_device_attr *devattr = &ia->ri_devattr; int depth, delta; ia->ri_max_frmr_depth = min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, - devattr->max_fast_reg_page_list_len); + ia->ri_device->attrs.max_fast_reg_page_list_len); dprintk("RPC: %s: device's max FR page list len = %u\n", __func__, ia->ri_max_frmr_depth); @@ -222,8 +221,8 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, } ep->rep_attr.cap.max_send_wr *= depth; - if (ep->rep_attr.cap.max_send_wr > devattr->max_qp_wr) { - cdata->max_requests = devattr->max_qp_wr / depth; + if (ep->rep_attr.cap.max_send_wr > ia->ri_device->attrs.max_qp_wr) { + cdata->max_requests = ia->ri_device->attrs.max_qp_wr / depth; if (!cdata->max_requests) return -EINVAL; ep->rep_attr.cap.max_send_wr = cdata->max_requests * diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index 1b7051bdbdc8..c846ca9f1eba 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -55,6 +55,7 @@ unsigned int svcrdma_ord = RPCRDMA_ORD; static unsigned int min_ord = 1; static unsigned int max_ord = 4096; unsigned int svcrdma_max_requests = RPCRDMA_MAX_REQUESTS; +unsigned int svcrdma_max_bc_requests = RPCRDMA_MAX_BC_REQUESTS; static unsigned int min_max_requests = 4; static unsigned int max_max_requests = 16384; unsigned int svcrdma_max_req_size = RPCRDMA_MAX_REQ_SIZE; @@ -71,10 +72,6 @@ atomic_t rdma_stat_rq_prod; atomic_t rdma_stat_sq_poll; atomic_t rdma_stat_sq_prod; -/* Temporary NFS request map and context caches */ -struct kmem_cache *svc_rdma_map_cachep; -struct kmem_cache *svc_rdma_ctxt_cachep; - struct workqueue_struct *svc_rdma_wq; /* @@ -243,17 +240,16 @@ void svc_rdma_cleanup(void) svc_unreg_xprt_class(&svc_rdma_bc_class); #endif svc_unreg_xprt_class(&svc_rdma_class); - kmem_cache_destroy(svc_rdma_map_cachep); - kmem_cache_destroy(svc_rdma_ctxt_cachep); } int svc_rdma_init(void) { dprintk("SVCRDMA Module Init, register RPC RDMA transport\n"); dprintk("\tsvcrdma_ord : %d\n", svcrdma_ord); - dprintk("\tmax_requests : %d\n", svcrdma_max_requests); - dprintk("\tsq_depth : %d\n", + dprintk("\tmax_requests : %u\n", svcrdma_max_requests); + dprintk("\tsq_depth : %u\n", svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT); + dprintk("\tmax_bc_requests : %u\n", svcrdma_max_bc_requests); dprintk("\tmax_inline : %d\n", svcrdma_max_req_size); svc_rdma_wq = alloc_workqueue("svc_rdma", 0, 0); @@ -264,39 +260,10 @@ int svc_rdma_init(void) svcrdma_table_header = register_sysctl_table(svcrdma_root_table); - /* Create the temporary map cache */ - svc_rdma_map_cachep = kmem_cache_create("svc_rdma_map_cache", - sizeof(struct svc_rdma_req_map), - 0, - SLAB_HWCACHE_ALIGN, - NULL); - if (!svc_rdma_map_cachep) { - printk(KERN_INFO "Could not allocate 
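/*
 * Aside (illustrative, not part of the patch): the frwr_op_open() hunk
 * above sizes the send queue by noting that each RPC may consume 'depth'
 * work requests, and the product must fit the device's max_qp_wr; when it
 * does not, the request count is scaled down instead. Restated
 * stand-alone:
 */
static int size_send_queue(unsigned int *max_requests, unsigned int depth,
			   unsigned int max_qp_wr, unsigned int *max_send_wr)
{
	*max_send_wr = *max_requests * depth;
	if (*max_send_wr > max_qp_wr) {
		*max_requests = max_qp_wr / depth;
		if (*max_requests == 0)
			return -1;  /* device cannot fit even one request */
		*max_send_wr = *max_requests * depth;
	}
	return 0;
}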
map cache.\n"); - goto err0; - } - - /* Create the temporary context cache */ - svc_rdma_ctxt_cachep = - kmem_cache_create("svc_rdma_ctxt_cache", - sizeof(struct svc_rdma_op_ctxt), - 0, - SLAB_HWCACHE_ALIGN, - NULL); - if (!svc_rdma_ctxt_cachep) { - printk(KERN_INFO "Could not allocate WR ctxt cache.\n"); - goto err1; - } - /* Register RDMA with the SVC transport switch */ svc_reg_xprt_class(&svc_rdma_class); #if defined(CONFIG_SUNRPC_BACKCHANNEL) svc_reg_xprt_class(&svc_rdma_bc_class); #endif return 0; - err1: - kmem_cache_destroy(svc_rdma_map_cachep); - err0: - unregister_sysctl_table(svcrdma_table_header); - destroy_workqueue(svc_rdma_wq); - return -ENOMEM; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c new file mode 100644 index 000000000000..65a7c232a345 --- /dev/null +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -0,0 +1,371 @@ +/* + * Copyright (c) 2015 Oracle. All rights reserved. + * + * Support for backward direction RPCs on RPC/RDMA (server-side). + */ + +#include <linux/sunrpc/svc_rdma.h> +#include "xprt_rdma.h" + +#define RPCDBG_FACILITY RPCDBG_SVCXPRT + +#undef SVCRDMA_BACKCHANNEL_DEBUG + +int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, struct rpcrdma_msg *rmsgp, + struct xdr_buf *rcvbuf) +{ + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct kvec *dst, *src = &rcvbuf->head[0]; + struct rpc_rqst *req; + unsigned long cwnd; + u32 credits; + size_t len; + __be32 xid; + __be32 *p; + int ret; + + p = (__be32 *)src->iov_base; + len = src->iov_len; + xid = rmsgp->rm_xid; + +#ifdef SVCRDMA_BACKCHANNEL_DEBUG + pr_info("%s: xid=%08x, length=%zu\n", + __func__, be32_to_cpu(xid), len); + pr_info("%s: RPC/RDMA: %*ph\n", + __func__, (int)RPCRDMA_HDRLEN_MIN, rmsgp); + pr_info("%s: RPC: %*ph\n", + __func__, (int)len, p); +#endif + + ret = -EAGAIN; + if (src->iov_len < 24) + goto out_shortreply; + + spin_lock_bh(&xprt->transport_lock); + req = xprt_lookup_rqst(xprt, xid); + if (!req) + goto out_notfound; + + dst = &req->rq_private_buf.head[0]; + memcpy(&req->rq_private_buf, &req->rq_rcv_buf, sizeof(struct xdr_buf)); + if (dst->iov_len < len) + goto out_unlock; + memcpy(dst->iov_base, p, len); + + credits = be32_to_cpu(rmsgp->rm_credit); + if (credits == 0) + credits = 1; /* don't deadlock */ + else if (credits > r_xprt->rx_buf.rb_bc_max_requests) + credits = r_xprt->rx_buf.rb_bc_max_requests; + + cwnd = xprt->cwnd; + xprt->cwnd = credits << RPC_CWNDSHIFT; + if (xprt->cwnd > cwnd) + xprt_release_rqst_cong(req->rq_task); + + ret = 0; + xprt_complete_rqst(req->rq_task, rcvbuf->len); + rcvbuf->len = 0; + +out_unlock: + spin_unlock_bh(&xprt->transport_lock); +out: + return ret; + +out_shortreply: + dprintk("svcrdma: short bc reply: xprt=%p, len=%zu\n", + xprt, src->iov_len); + goto out; + +out_notfound: + dprintk("svcrdma: unrecognized bc reply: xprt=%p, xid=%08x\n", + xprt, be32_to_cpu(xid)); + + goto out_unlock; +} + +/* Send a backwards direction RPC call. + * + * Caller holds the connection's mutex and has already marshaled + * the RPC/RDMA request. + * + * This is similar to svc_rdma_reply, but takes an rpc_rqst + * instead, does not support chunks, and avoids blocking memory + * allocation. + * + * XXX: There is still an opportunity to block in svc_rdma_send() + * if there are no SQ entries to post the Send. This may occur if + * the adapter has a small maximum SQ depth. 
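/*
 * Aside (illustrative, not part of the patch): in
 * svc_rdma_handle_bc_reply() above, the credit value granted by the peer
 * is sanitized before it becomes a congestion window -- never zero (which
 * would deadlock the client) and never more than what this side
 * provisioned. Just that clamp, restated:
 */
static unsigned int clamp_credits(unsigned int granted,
				  unsigned int provisioned)
{
	if (granted == 0)
		return 1;            /* never let the window collapse */
	if (granted > provisioned)
		return provisioned;  /* trust no more than we allocated */
	return granted;
}
/* The kernel code then sets cwnd = clamped credits << RPC_CWNDSHIFT. */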
+ */ +static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma, + struct rpc_rqst *rqst) +{ + struct xdr_buf *sndbuf = &rqst->rq_snd_buf; + struct svc_rdma_op_ctxt *ctxt; + struct svc_rdma_req_map *vec; + struct ib_send_wr send_wr; + int ret; + + vec = svc_rdma_get_req_map(rdma); + ret = svc_rdma_map_xdr(rdma, sndbuf, vec); + if (ret) + goto out_err; + + /* Post a recv buffer to handle the reply for this request. */ + ret = svc_rdma_post_recv(rdma, GFP_NOIO); + if (ret) { + pr_err("svcrdma: Failed to post bc receive buffer, err=%d.\n", + ret); + pr_err("svcrdma: closing transport %p.\n", rdma); + set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); + ret = -ENOTCONN; + goto out_err; + } + + ctxt = svc_rdma_get_context(rdma); + ctxt->pages[0] = virt_to_page(rqst->rq_buffer); + ctxt->count = 1; + + ctxt->wr_op = IB_WR_SEND; + ctxt->direction = DMA_TO_DEVICE; + ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey; + ctxt->sge[0].length = sndbuf->len; + ctxt->sge[0].addr = + ib_dma_map_page(rdma->sc_cm_id->device, ctxt->pages[0], 0, + sndbuf->len, DMA_TO_DEVICE); + if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr)) { + ret = -EIO; + goto out_unmap; + } + atomic_inc(&rdma->sc_dma_used); + + memset(&send_wr, 0, sizeof(send_wr)); + send_wr.wr_id = (unsigned long)ctxt; + send_wr.sg_list = ctxt->sge; + send_wr.num_sge = 1; + send_wr.opcode = IB_WR_SEND; + send_wr.send_flags = IB_SEND_SIGNALED; + + ret = svc_rdma_send(rdma, &send_wr); + if (ret) { + ret = -EIO; + goto out_unmap; + } + +out_err: + svc_rdma_put_req_map(rdma, vec); + dprintk("svcrdma: %s returns %d\n", __func__, ret); + return ret; + +out_unmap: + svc_rdma_unmap_dma(ctxt); + svc_rdma_put_context(ctxt, 1); + goto out_err; +} + +/* Server-side transport endpoint wants a whole page for its send + * buffer. The client RPC code constructs the RPC header in this + * buffer before it invokes ->send_request. + * + * Returns NULL if there was a temporary allocation failure. + */ +static void * +xprt_rdma_bc_allocate(struct rpc_task *task, size_t size) +{ + struct rpc_rqst *rqst = task->tk_rqstp; + struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt; + struct svcxprt_rdma *rdma; + struct page *page; + + rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt); + + /* Prevent an infinite loop: try to make this case work */ + if (size > PAGE_SIZE) + WARN_ONCE(1, "svcrdma: large bc buffer request (size %zu)\n", + size); + + page = alloc_page(RPCRDMA_DEF_GFP); + if (!page) + return NULL; + + return page_address(page); +} + +static void +xprt_rdma_bc_free(void *buffer) +{ + /* No-op: ctxt and page have already been freed. */ +} + +static int +rpcrdma_bc_send_request(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst) +{ + struct rpc_xprt *xprt = rqst->rq_xprt; + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct rpcrdma_msg *headerp = (struct rpcrdma_msg *)rqst->rq_buffer; + int rc; + + /* Space in the send buffer for an RPC/RDMA header is reserved + * via xprt->tsh_size. 
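/*
 * Aside (illustrative, not part of the patch): note the ordering in
 * svc_rdma_bc_sendto() above -- a receive buffer is posted *before* the
 * Send for the backchannel call. On an RC QP an incoming message with no
 * posted receive triggers RNR retries, so the reply's landing zone must
 * exist before the call leaves. Stub-typed shape of that ordering:
 */
struct conn;
struct request;
int post_recv(struct conn *c);
int post_send(struct conn *c, struct request *r);
void mark_close(struct conn *c);

static int bc_call(struct conn *c, struct request *req)
{
	int err = post_recv(c);      /* reply landing zone first */

	if (err) {
		mark_close(c);       /* as above: close the transport */
		return err;
	}
	return post_send(c, req);    /* then the call itself */
}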
+ */ + headerp->rm_xid = rqst->rq_xid; + headerp->rm_vers = rpcrdma_version; + headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_bc_max_requests); + headerp->rm_type = rdma_msg; + headerp->rm_body.rm_chunks[0] = xdr_zero; + headerp->rm_body.rm_chunks[1] = xdr_zero; + headerp->rm_body.rm_chunks[2] = xdr_zero; + +#ifdef SVCRDMA_BACKCHANNEL_DEBUG + pr_info("%s: %*ph\n", __func__, 64, rqst->rq_buffer); +#endif + + rc = svc_rdma_bc_sendto(rdma, rqst); + if (rc) + goto drop_connection; + return rc; + +drop_connection: + dprintk("svcrdma: failed to send bc call\n"); + xprt_disconnect_done(xprt); + return -ENOTCONN; +} + +/* Send an RPC call on the passive end of a transport + * connection. + */ +static int +xprt_rdma_bc_send_request(struct rpc_task *task) +{ + struct rpc_rqst *rqst = task->tk_rqstp; + struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt; + struct svcxprt_rdma *rdma; + int ret; + + dprintk("svcrdma: sending bc call with xid: %08x\n", + be32_to_cpu(rqst->rq_xid)); + + if (!mutex_trylock(&sxprt->xpt_mutex)) { + rpc_sleep_on(&sxprt->xpt_bc_pending, task, NULL); + if (!mutex_trylock(&sxprt->xpt_mutex)) + return -EAGAIN; + rpc_wake_up_queued_task(&sxprt->xpt_bc_pending, task); + } + + ret = -ENOTCONN; + rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt); + if (!test_bit(XPT_DEAD, &sxprt->xpt_flags)) + ret = rpcrdma_bc_send_request(rdma, rqst); + + mutex_unlock(&sxprt->xpt_mutex); + + if (ret < 0) + return ret; + return 0; +} + +static void +xprt_rdma_bc_close(struct rpc_xprt *xprt) +{ + dprintk("svcrdma: %s: xprt %p\n", __func__, xprt); +} + +static void +xprt_rdma_bc_put(struct rpc_xprt *xprt) +{ + dprintk("svcrdma: %s: xprt %p\n", __func__, xprt); + + xprt_free(xprt); + module_put(THIS_MODULE); +} + +static struct rpc_xprt_ops xprt_rdma_bc_procs = { + .reserve_xprt = xprt_reserve_xprt_cong, + .release_xprt = xprt_release_xprt_cong, + .alloc_slot = xprt_alloc_slot, + .release_request = xprt_release_rqst_cong, + .buf_alloc = xprt_rdma_bc_allocate, + .buf_free = xprt_rdma_bc_free, + .send_request = xprt_rdma_bc_send_request, + .set_retrans_timeout = xprt_set_retrans_timeout_def, + .close = xprt_rdma_bc_close, + .destroy = xprt_rdma_bc_put, + .print_stats = xprt_rdma_print_stats +}; + +static const struct rpc_timeout xprt_rdma_bc_timeout = { + .to_initval = 60 * HZ, + .to_maxval = 60 * HZ, +}; + +/* It shouldn't matter if the number of backchannel session slots + * doesn't match the number of RPC/RDMA credits. That just means + * one or the other will have extra slots that aren't used. 
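/*
 * Aside (illustrative, not part of the patch): an RPC task may not block
 * on the transport mutex, so xprt_rdma_bc_send_request() above (1) tries
 * the lock, (2) parks itself on the transport's waitqueue on failure,
 * then (3) retries once in case the holder released in the meantime --
 * and must dequeue itself again if the retry wins. Stub-typed sketch;
 * -1 stands in for -EAGAIN:
 */
struct waitq;
struct task;
struct mutex;
int  mutex_trylock_stub(struct mutex *m);
void queue_task(struct waitq *q, struct task *t);
void dequeue_task(struct waitq *q, struct task *t);

static int lock_for_send(struct mutex *m, struct waitq *q, struct task *t)
{
	if (!mutex_trylock_stub(m)) {
		queue_task(q, t);          /* woken when holder unlocks  */
		if (!mutex_trylock_stub(m))
			return -1;         /* stay parked; retried later */
		dequeue_task(q, t);        /* won the race after all     */
	}
	return 0;                          /* caller sends, then unlocks */
}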
+ */ +static struct rpc_xprt * +xprt_setup_rdma_bc(struct xprt_create *args) +{ + struct rpc_xprt *xprt; + struct rpcrdma_xprt *new_xprt; + + if (args->addrlen > sizeof(xprt->addr)) { + dprintk("RPC: %s: address too large\n", __func__); + return ERR_PTR(-EBADF); + } + + xprt = xprt_alloc(args->net, sizeof(*new_xprt), + RPCRDMA_MAX_BC_REQUESTS, + RPCRDMA_MAX_BC_REQUESTS); + if (!xprt) { + dprintk("RPC: %s: couldn't allocate rpc_xprt\n", + __func__); + return ERR_PTR(-ENOMEM); + } + + xprt->timeout = &xprt_rdma_bc_timeout; + xprt_set_bound(xprt); + xprt_set_connected(xprt); + xprt->bind_timeout = RPCRDMA_BIND_TO; + xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO; + xprt->idle_timeout = RPCRDMA_IDLE_DISC_TO; + + xprt->prot = XPRT_TRANSPORT_BC_RDMA; + xprt->tsh_size = RPCRDMA_HDRLEN_MIN / sizeof(__be32); + xprt->ops = &xprt_rdma_bc_procs; + + memcpy(&xprt->addr, args->dstaddr, args->addrlen); + xprt->addrlen = args->addrlen; + xprt_rdma_format_addresses(xprt, (struct sockaddr *)&xprt->addr); + xprt->resvport = 0; + + xprt->max_payload = xprt_rdma_max_inline_read; + + new_xprt = rpcx_to_rdmax(xprt); + new_xprt->rx_buf.rb_bc_max_requests = xprt->max_reqs; + + xprt_get(xprt); + args->bc_xprt->xpt_bc_xprt = xprt; + xprt->bc_xprt = args->bc_xprt; + + if (!try_module_get(THIS_MODULE)) + goto out_fail; + + /* Final put for backchannel xprt is in __svc_rdma_free */ + xprt_get(xprt); + return xprt; + +out_fail: + xprt_rdma_free_addresses(xprt); + args->bc_xprt->xpt_bc_xprt = NULL; + xprt_put(xprt); + xprt_free(xprt); + return ERR_PTR(-EINVAL); +} + +struct xprt_class xprt_rdma_bc = { + .list = LIST_HEAD_INIT(xprt_rdma_bc.list), + .name = "rdma backchannel", + .owner = THIS_MODULE, + .ident = XPRT_TRANSPORT_BC_RDMA, + .setup = xprt_setup_rdma_bc, +}; diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index ff4f01e527ec..c8b8a8b4181e 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -144,6 +144,7 @@ int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt, head->arg.pages[pg_no] = rqstp->rq_arg.pages[pg_no]; head->arg.page_len += len; + head->arg.len += len; if (!pg_off) head->count++; @@ -160,8 +161,7 @@ int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt, goto err; atomic_inc(&xprt->sc_dma_used); - /* The lkey here is either a local dma lkey or a dma_mr lkey */ - ctxt->sge[pno].lkey = xprt->sc_dma_lkey; + ctxt->sge[pno].lkey = xprt->sc_pd->local_dma_lkey; ctxt->sge[pno].length = len; ctxt->count++; @@ -567,6 +567,38 @@ static int rdma_read_complete(struct svc_rqst *rqstp, return ret; } +/* By convention, backchannel calls arrive via rdma_msg type + * messages, and never populate the chunk lists. This makes + * the RPC/RDMA header small and fixed in size, so it is + * straightforward to check the RPC header's direction field. + */ +static bool +svc_rdma_is_backchannel_reply(struct svc_xprt *xprt, struct rpcrdma_msg *rmsgp) +{ + __be32 *p = (__be32 *)rmsgp; + + if (!xprt->xpt_bc_xprt) + return false; + + if (rmsgp->rm_type != rdma_msg) + return false; + if (rmsgp->rm_body.rm_chunks[0] != xdr_zero) + return false; + if (rmsgp->rm_body.rm_chunks[1] != xdr_zero) + return false; + if (rmsgp->rm_body.rm_chunks[2] != xdr_zero) + return false; + + /* sanity */ + if (p[7] != rmsgp->rm_xid) + return false; + /* call direction */ + if (p[8] == cpu_to_be32(RPC_CALL)) + return false; + + return true; +} + /* * Set up the rqstp thread context to point to the RQ buffer. 
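/*
 * Aside (illustrative, not part of the patch): with all three chunk lists
 * empty, the RPC/RDMA header is fixed-size, so the embedded RPC message's
 * XID and direction words sit at known 32-bit offsets (7 and 8) -- which
 * is exactly what svc_rdma_is_backchannel_reply() exploits. A stand-alone
 * restatement; big-endian wire words are treated as opaque values, which
 * is safe because only equality and zero tests are made.
 */
#include <stdbool.h>
#include <stdint.h>

#define XDR_ZERO 0u
#define RPC_CALL 0u   /* direction word: 0 = call, 1 = reply */

static bool is_bc_reply(const uint32_t *p, uint32_t rdma_msg_type)
{
	if (p[3] != rdma_msg_type)                    /* rm_type           */
		return false;
	if (p[4] != XDR_ZERO || p[5] != XDR_ZERO ||
	    p[6] != XDR_ZERO)                         /* no chunk lists    */
		return false;
	if (p[7] != p[0])                             /* RPC xid == rm_xid */
		return false;
	return p[8] != RPC_CALL;                      /* a reply, not call */
}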
If * necessary, pull additional data from the client with an RDMA_READ @@ -632,6 +664,15 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) goto close_out; } + if (svc_rdma_is_backchannel_reply(xprt, rmsgp)) { + ret = svc_rdma_handle_bc_reply(xprt->xpt_bc_xprt, rmsgp, + &rqstp->rq_arg); + svc_rdma_put_context(ctxt, 0); + if (ret) + goto repost; + return ret; + } + /* Read read-list data. */ ret = rdma_read_chunks(rdma_xprt, rmsgp, rqstp, ctxt); if (ret > 0) { @@ -668,4 +709,15 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) set_bit(XPT_CLOSE, &xprt->xpt_flags); defer: return 0; + +repost: + ret = svc_rdma_post_recv(rdma_xprt, GFP_KERNEL); + if (ret) { + pr_err("svcrdma: could not post a receive buffer, err=%d.\n", + ret); + pr_err("svcrdma: closing transport %p.\n", rdma_xprt); + set_bit(XPT_CLOSE, &rdma_xprt->sc_xprt.xpt_flags); + ret = -ENOTCONN; + } + return ret; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 969a1ab75fc3..df57f3ce6cd2 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -50,9 +50,9 @@ #define RPCDBG_FACILITY RPCDBG_SVCXPRT -static int map_xdr(struct svcxprt_rdma *xprt, - struct xdr_buf *xdr, - struct svc_rdma_req_map *vec) +int svc_rdma_map_xdr(struct svcxprt_rdma *xprt, + struct xdr_buf *xdr, + struct svc_rdma_req_map *vec) { int sge_no; u32 sge_bytes; @@ -62,7 +62,7 @@ static int map_xdr(struct svcxprt_rdma *xprt, if (xdr->len != (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len)) { - pr_err("svcrdma: map_xdr: XDR buffer length error\n"); + pr_err("svcrdma: %s: XDR buffer length error\n", __func__); return -EIO; } @@ -97,9 +97,9 @@ static int map_xdr(struct svcxprt_rdma *xprt, sge_no++; } - dprintk("svcrdma: map_xdr: sge_no %d page_no %d " + dprintk("svcrdma: %s: sge_no %d page_no %d " "page_base %u page_len %u head_len %zu tail_len %zu\n", - sge_no, page_no, xdr->page_base, xdr->page_len, + __func__, sge_no, page_no, xdr->page_base, xdr->page_len, xdr->head[0].iov_len, xdr->tail[0].iov_len); vec->count = sge_no; @@ -265,7 +265,7 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, sge[sge_no].addr)) goto err; atomic_inc(&xprt->sc_dma_used); - sge[sge_no].lkey = xprt->sc_dma_lkey; + sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey; ctxt->count++; sge_off = 0; sge_no++; @@ -465,7 +465,7 @@ static int send_reply(struct svcxprt_rdma *rdma, int ret; /* Post a recv buffer to handle another request. */ - ret = svc_rdma_post_recv(rdma); + ret = svc_rdma_post_recv(rdma, GFP_KERNEL); if (ret) { printk(KERN_INFO "svcrdma: could not post a receive buffer, err=%d." 
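/*
 * Aside (illustrative, not part of the patch): before mapping,
 * svc_rdma_map_xdr() above checks that the XDR buffer's total length
 * equals the sum of its three segments. Restated stand-alone:
 */
#include <stddef.h>

static int xdr_len_consistent(size_t total, size_t head, size_t pagelen,
			      size_t tail)
{
	return total == head + pagelen + tail;  /* else -EIO in the kernel */
}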
@@ -480,7 +480,7 @@ static int send_reply(struct svcxprt_rdma *rdma, ctxt->count = 1; /* Prepare the SGE for the RPCRDMA Header */ - ctxt->sge[0].lkey = rdma->sc_dma_lkey; + ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey; ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp); ctxt->sge[0].addr = ib_dma_map_page(rdma->sc_cm_id->device, page, 0, @@ -504,7 +504,7 @@ static int send_reply(struct svcxprt_rdma *rdma, ctxt->sge[sge_no].addr)) goto err; atomic_inc(&rdma->sc_dma_used); - ctxt->sge[sge_no].lkey = rdma->sc_dma_lkey; + ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey; ctxt->sge[sge_no].length = sge_bytes; } if (byte_count != 0) { @@ -591,14 +591,17 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) /* Build an req vec for the XDR */ ctxt = svc_rdma_get_context(rdma); ctxt->direction = DMA_TO_DEVICE; - vec = svc_rdma_get_req_map(); - ret = map_xdr(rdma, &rqstp->rq_res, vec); + vec = svc_rdma_get_req_map(rdma); + ret = svc_rdma_map_xdr(rdma, &rqstp->rq_res, vec); if (ret) goto err0; inline_bytes = rqstp->rq_res.len; /* Create the RDMA response header */ - res_page = alloc_page(GFP_KERNEL | __GFP_NOFAIL); + ret = -ENOMEM; + res_page = alloc_page(GFP_KERNEL); + if (!res_page) + goto err0; rdma_resp = page_address(res_page); reply_ary = svc_rdma_get_reply_array(rdma_argp); if (reply_ary) @@ -630,14 +633,14 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) ret = send_reply(rdma, rqstp, res_page, rdma_resp, ctxt, vec, inline_bytes); - svc_rdma_put_req_map(vec); + svc_rdma_put_req_map(rdma, vec); dprintk("svcrdma: send_reply returns %d\n", ret); return ret; err1: put_page(res_page); err0: - svc_rdma_put_req_map(vec); + svc_rdma_put_req_map(rdma, vec); svc_rdma_put_context(ctxt, 0); return ret; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index b348b4adef29..5763825d09bf 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -153,18 +153,76 @@ static void svc_rdma_bc_free(struct svc_xprt *xprt) } #endif /* CONFIG_SUNRPC_BACKCHANNEL */ -struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) +static struct svc_rdma_op_ctxt *alloc_ctxt(struct svcxprt_rdma *xprt, + gfp_t flags) { struct svc_rdma_op_ctxt *ctxt; - ctxt = kmem_cache_alloc(svc_rdma_ctxt_cachep, - GFP_KERNEL | __GFP_NOFAIL); - ctxt->xprt = xprt; - INIT_LIST_HEAD(&ctxt->dto_q); + ctxt = kmalloc(sizeof(*ctxt), flags); + if (ctxt) { + ctxt->xprt = xprt; + INIT_LIST_HEAD(&ctxt->free); + INIT_LIST_HEAD(&ctxt->dto_q); + } + return ctxt; +} + +static bool svc_rdma_prealloc_ctxts(struct svcxprt_rdma *xprt) +{ + unsigned int i; + + /* Each RPC/RDMA credit can consume a number of send + * and receive WQEs. One ctxt is allocated for each. 
+ */ + i = xprt->sc_sq_depth + xprt->sc_rq_depth; + + while (i--) { + struct svc_rdma_op_ctxt *ctxt; + + ctxt = alloc_ctxt(xprt, GFP_KERNEL); + if (!ctxt) { + dprintk("svcrdma: No memory for RDMA ctxt\n"); + return false; + } + list_add(&ctxt->free, &xprt->sc_ctxts); + } + return true; +} + +struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) +{ + struct svc_rdma_op_ctxt *ctxt = NULL; + + spin_lock_bh(&xprt->sc_ctxt_lock); + xprt->sc_ctxt_used++; + if (list_empty(&xprt->sc_ctxts)) + goto out_empty; + + ctxt = list_first_entry(&xprt->sc_ctxts, + struct svc_rdma_op_ctxt, free); + list_del_init(&ctxt->free); + spin_unlock_bh(&xprt->sc_ctxt_lock); + +out: ctxt->count = 0; ctxt->frmr = NULL; - atomic_inc(&xprt->sc_ctxt_used); return ctxt; + +out_empty: + /* Either pre-allocation missed the mark, or send + * queue accounting is broken. + */ + spin_unlock_bh(&xprt->sc_ctxt_lock); + + ctxt = alloc_ctxt(xprt, GFP_NOIO); + if (ctxt) + goto out; + + spin_lock_bh(&xprt->sc_ctxt_lock); + xprt->sc_ctxt_used--; + spin_unlock_bh(&xprt->sc_ctxt_lock); + WARN_ONCE(1, "svcrdma: empty RDMA ctxt list?\n"); + return NULL; } void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt) @@ -174,11 +232,11 @@ void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt) for (i = 0; i < ctxt->count && ctxt->sge[i].length; i++) { /* * Unmap the DMA addr in the SGE if the lkey matches - * the sc_dma_lkey, otherwise, ignore it since it is + * the local_dma_lkey, otherwise, ignore it since it is * an FRMR lkey and will be unmapped later when the * last WR that uses it completes. */ - if (ctxt->sge[i].lkey == xprt->sc_dma_lkey) { + if (ctxt->sge[i].lkey == xprt->sc_pd->local_dma_lkey) { atomic_dec(&xprt->sc_dma_used); ib_dma_unmap_page(xprt->sc_cm_id->device, ctxt->sge[i].addr, @@ -190,35 +248,108 @@ void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt) void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages) { - struct svcxprt_rdma *xprt; + struct svcxprt_rdma *xprt = ctxt->xprt; int i; - xprt = ctxt->xprt; if (free_pages) for (i = 0; i < ctxt->count; i++) put_page(ctxt->pages[i]); - kmem_cache_free(svc_rdma_ctxt_cachep, ctxt); - atomic_dec(&xprt->sc_ctxt_used); + spin_lock_bh(&xprt->sc_ctxt_lock); + xprt->sc_ctxt_used--; + list_add(&ctxt->free, &xprt->sc_ctxts); + spin_unlock_bh(&xprt->sc_ctxt_lock); } -/* - * Temporary NFS req mappings are shared across all transport - * instances. These are short lived and should be bounded by the number - * of concurrent server threads * depth of the SQ. - */ -struct svc_rdma_req_map *svc_rdma_get_req_map(void) +static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt) +{ + while (!list_empty(&xprt->sc_ctxts)) { + struct svc_rdma_op_ctxt *ctxt; + + ctxt = list_first_entry(&xprt->sc_ctxts, + struct svc_rdma_op_ctxt, free); + list_del(&ctxt->free); + kfree(ctxt); + } +} + +static struct svc_rdma_req_map *alloc_req_map(gfp_t flags) { struct svc_rdma_req_map *map; - map = kmem_cache_alloc(svc_rdma_map_cachep, - GFP_KERNEL | __GFP_NOFAIL); + + map = kmalloc(sizeof(*map), flags); + if (map) + INIT_LIST_HEAD(&map->free); + return map; +} + +static bool svc_rdma_prealloc_maps(struct svcxprt_rdma *xprt) +{ + unsigned int i; + + /* One for each receive buffer on this connection. 
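/*
 * Aside (illustrative, not part of the patch): the ctxt hunks above
 * replace a global kmem_cache (with __GFP_NOFAIL allocations) by a
 * per-connection free list sized for the worst case at accept time, plus
 * an emergency allocation if the accounting was wrong; the request-map
 * pool that follows uses the same pattern. A userspace analog:
 */
#include <stdlib.h>

struct ctxt { struct ctxt *next; /* payload elided */ };
struct pool { struct ctxt *free; unsigned int used; };

static int pool_prealloc(struct pool *p, unsigned int n)
{
	while (n--) {
		struct ctxt *c = calloc(1, sizeof(*c));

		if (!c)
			return -1;
		c->next = p->free;
		p->free = c;
	}
	return 0;
}

static struct ctxt *pool_get(struct pool *p)
{
	struct ctxt *c = p->free;

	if (c) {
		p->free = c->next;
		p->used++;
		return c;
	}
	/* Pre-allocation missed the mark; fall back to a fresh allocation
	 * (the kernel uses GFP_NOIO here and warns once). */
	c = calloc(1, sizeof(*c));
	if (c)
		p->used++;
	return c;
}

static void pool_put(struct pool *p, struct ctxt *c)
{
	c->next = p->free;
	p->free = c;
	p->used--;
}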
*/ + i = xprt->sc_max_requests; + + while (i--) { + struct svc_rdma_req_map *map; + + map = alloc_req_map(GFP_KERNEL); + if (!map) { + dprintk("svcrdma: No memory for request map\n"); + return false; + } + list_add(&map->free, &xprt->sc_maps); + } + return true; +} + +struct svc_rdma_req_map *svc_rdma_get_req_map(struct svcxprt_rdma *xprt) +{ + struct svc_rdma_req_map *map = NULL; + + spin_lock(&xprt->sc_map_lock); + if (list_empty(&xprt->sc_maps)) + goto out_empty; + + map = list_first_entry(&xprt->sc_maps, + struct svc_rdma_req_map, free); + list_del_init(&map->free); + spin_unlock(&xprt->sc_map_lock); + +out: map->count = 0; return map; + +out_empty: + spin_unlock(&xprt->sc_map_lock); + + /* Pre-allocation amount was incorrect */ + map = alloc_req_map(GFP_NOIO); + if (map) + goto out; + + WARN_ONCE(1, "svcrdma: empty request map list?\n"); + return NULL; +} + +void svc_rdma_put_req_map(struct svcxprt_rdma *xprt, + struct svc_rdma_req_map *map) +{ + spin_lock(&xprt->sc_map_lock); + list_add(&map->free, &xprt->sc_maps); + spin_unlock(&xprt->sc_map_lock); } -void svc_rdma_put_req_map(struct svc_rdma_req_map *map) +static void svc_rdma_destroy_maps(struct svcxprt_rdma *xprt) { - kmem_cache_free(svc_rdma_map_cachep, map); + while (!list_empty(&xprt->sc_maps)) { + struct svc_rdma_req_map *map; + + map = list_first_entry(&xprt->sc_maps, + struct svc_rdma_req_map, free); + list_del(&map->free); + kfree(map); + } } /* ib_cq event handler */ @@ -386,46 +517,44 @@ static void rq_cq_reap(struct svcxprt_rdma *xprt) static void process_context(struct svcxprt_rdma *xprt, struct svc_rdma_op_ctxt *ctxt) { + struct svc_rdma_op_ctxt *read_hdr; + int free_pages = 0; + svc_rdma_unmap_dma(ctxt); switch (ctxt->wr_op) { case IB_WR_SEND: - if (ctxt->frmr) - pr_err("svcrdma: SEND: ctxt->frmr != NULL\n"); - svc_rdma_put_context(ctxt, 1); + free_pages = 1; break; case IB_WR_RDMA_WRITE: - if (ctxt->frmr) - pr_err("svcrdma: WRITE: ctxt->frmr != NULL\n"); - svc_rdma_put_context(ctxt, 0); break; case IB_WR_RDMA_READ: case IB_WR_RDMA_READ_WITH_INV: svc_rdma_put_frmr(xprt, ctxt->frmr); - if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) { - struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr; - if (read_hdr) { - spin_lock_bh(&xprt->sc_rq_dto_lock); - set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); - list_add_tail(&read_hdr->dto_q, - &xprt->sc_read_complete_q); - spin_unlock_bh(&xprt->sc_rq_dto_lock); - } else { - pr_err("svcrdma: ctxt->read_hdr == NULL\n"); - } - svc_xprt_enqueue(&xprt->sc_xprt); - } + + if (!test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) + break; + + read_hdr = ctxt->read_hdr; svc_rdma_put_context(ctxt, 0); - break; + + spin_lock_bh(&xprt->sc_rq_dto_lock); + set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); + list_add_tail(&read_hdr->dto_q, + &xprt->sc_read_complete_q); + spin_unlock_bh(&xprt->sc_rq_dto_lock); + svc_xprt_enqueue(&xprt->sc_xprt); + return; default: - printk(KERN_ERR "svcrdma: unexpected completion type, " - "opcode=%d\n", - ctxt->wr_op); + dprintk("svcrdma: unexpected completion opcode=%d\n", + ctxt->wr_op); break; } + + svc_rdma_put_context(ctxt, free_pages); } /* @@ -523,19 +652,15 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q); INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q); INIT_LIST_HEAD(&cma_xprt->sc_frmr_q); + INIT_LIST_HEAD(&cma_xprt->sc_ctxts); + INIT_LIST_HEAD(&cma_xprt->sc_maps); init_waitqueue_head(&cma_xprt->sc_send_wait); spin_lock_init(&cma_xprt->sc_lock); spin_lock_init(&cma_xprt->sc_rq_dto_lock); 
spin_lock_init(&cma_xprt->sc_frmr_q_lock); - - cma_xprt->sc_ord = svcrdma_ord; - - cma_xprt->sc_max_req_size = svcrdma_max_req_size; - cma_xprt->sc_max_requests = svcrdma_max_requests; - cma_xprt->sc_sq_depth = svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT; - atomic_set(&cma_xprt->sc_sq_count, 0); - atomic_set(&cma_xprt->sc_ctxt_used, 0); + spin_lock_init(&cma_xprt->sc_ctxt_lock); + spin_lock_init(&cma_xprt->sc_map_lock); if (listener) set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags); @@ -543,7 +668,7 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, return cma_xprt; } -int svc_rdma_post_recv(struct svcxprt_rdma *xprt) +int svc_rdma_post_recv(struct svcxprt_rdma *xprt, gfp_t flags) { struct ib_recv_wr recv_wr, *bad_recv_wr; struct svc_rdma_op_ctxt *ctxt; @@ -561,7 +686,9 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt) pr_err("svcrdma: Too many sges (%d)\n", sge_no); goto err_put_ctxt; } - page = alloc_page(GFP_KERNEL | __GFP_NOFAIL); + page = alloc_page(flags); + if (!page) + goto err_put_ctxt; ctxt->pages[sge_no] = page; pa = ib_dma_map_page(xprt->sc_cm_id->device, page, 0, PAGE_SIZE, @@ -571,7 +698,7 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt) atomic_inc(&xprt->sc_dma_used); ctxt->sge[sge_no].addr = pa; ctxt->sge[sge_no].length = PAGE_SIZE; - ctxt->sge[sge_no].lkey = xprt->sc_dma_lkey; + ctxt->sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey; ctxt->count = sge_no + 1; buflen += PAGE_SIZE; } @@ -886,11 +1013,9 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) struct rdma_conn_param conn_param; struct ib_cq_init_attr cq_attr = {}; struct ib_qp_init_attr qp_attr; - struct ib_device_attr devattr; - int uninitialized_var(dma_mr_acc); - int need_dma_mr = 0; - int ret; - int i; + struct ib_device *dev; + unsigned int i; + int ret = 0; listen_rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt); clear_bit(XPT_CONN, &xprt->xpt_flags); @@ -910,37 +1035,42 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) dprintk("svcrdma: newxprt from accept queue = %p, cm_id=%p\n", newxprt, newxprt->sc_cm_id); - ret = ib_query_device(newxprt->sc_cm_id->device, &devattr); - if (ret) { - dprintk("svcrdma: could not query device attributes on " - "device %p, rc=%d\n", newxprt->sc_cm_id->device, ret); - goto errout; - } + dev = newxprt->sc_cm_id->device; /* Qualify the transport resource defaults with the * capabilities of this particular device */ - newxprt->sc_max_sge = min((size_t)devattr.max_sge, + newxprt->sc_max_sge = min((size_t)dev->attrs.max_sge, (size_t)RPCSVC_MAXPAGES); - newxprt->sc_max_sge_rd = min_t(size_t, devattr.max_sge_rd, + newxprt->sc_max_sge_rd = min_t(size_t, dev->attrs.max_sge_rd, RPCSVC_MAXPAGES); - newxprt->sc_max_requests = min((size_t)devattr.max_qp_wr, - (size_t)svcrdma_max_requests); - newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_max_requests; + newxprt->sc_max_req_size = svcrdma_max_req_size; + newxprt->sc_max_requests = min_t(u32, dev->attrs.max_qp_wr, + svcrdma_max_requests); + newxprt->sc_max_bc_requests = min_t(u32, dev->attrs.max_qp_wr, + svcrdma_max_bc_requests); + newxprt->sc_rq_depth = newxprt->sc_max_requests + + newxprt->sc_max_bc_requests; + newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_rq_depth; + + if (!svc_rdma_prealloc_ctxts(newxprt)) + goto errout; + if (!svc_rdma_prealloc_maps(newxprt)) + goto errout; /* * Limit ORD based on client limit, local device limit, and * configured svcrdma limit. 
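/*
 * Aside (illustrative, not part of the patch): the svc_rdma_accept()
 * sizing above now provisions the receive queue for forward *and*
 * backchannel requests, with the send queue a fixed multiple of that, and
 * clamps each figure to the device's max_qp_wr. Restated stand-alone:
 */
static void size_queues(unsigned int max_qp_wr,
			unsigned int fwd, unsigned int bc,
			unsigned int sq_mult,
			unsigned int *rq_depth, unsigned int *sq_depth)
{
	if (fwd > max_qp_wr)
		fwd = max_qp_wr;
	if (bc > max_qp_wr)
		bc = max_qp_wr;
	*rq_depth = fwd + bc;            /* one recv WR per request  */
	*sq_depth = sq_mult * *rq_depth; /* RPCRDMA_SQ_DEPTH_MULT    */
}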
*/ - newxprt->sc_ord = min_t(size_t, devattr.max_qp_rd_atom, newxprt->sc_ord); + newxprt->sc_ord = min_t(size_t, dev->attrs.max_qp_rd_atom, newxprt->sc_ord); newxprt->sc_ord = min_t(size_t, svcrdma_ord, newxprt->sc_ord); - newxprt->sc_pd = ib_alloc_pd(newxprt->sc_cm_id->device); + newxprt->sc_pd = ib_alloc_pd(dev); if (IS_ERR(newxprt->sc_pd)) { dprintk("svcrdma: error creating PD for connect request\n"); goto errout; } cq_attr.cqe = newxprt->sc_sq_depth; - newxprt->sc_sq_cq = ib_create_cq(newxprt->sc_cm_id->device, + newxprt->sc_sq_cq = ib_create_cq(dev, sq_comp_handler, cq_event_handler, newxprt, @@ -949,8 +1079,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) dprintk("svcrdma: error creating SQ CQ for connect request\n"); goto errout; } - cq_attr.cqe = newxprt->sc_max_requests; - newxprt->sc_rq_cq = ib_create_cq(newxprt->sc_cm_id->device, + cq_attr.cqe = newxprt->sc_rq_depth; + newxprt->sc_rq_cq = ib_create_cq(dev, rq_comp_handler, cq_event_handler, newxprt, @@ -964,7 +1094,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) qp_attr.event_handler = qp_event_handler; qp_attr.qp_context = &newxprt->sc_xprt; qp_attr.cap.max_send_wr = newxprt->sc_sq_depth; - qp_attr.cap.max_recv_wr = newxprt->sc_max_requests; + qp_attr.cap.max_recv_wr = newxprt->sc_rq_depth; qp_attr.cap.max_send_sge = newxprt->sc_max_sge; qp_attr.cap.max_recv_sge = newxprt->sc_max_sge; qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; @@ -978,7 +1108,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) " cap.max_send_sge = %d\n" " cap.max_recv_sge = %d\n", newxprt->sc_cm_id, newxprt->sc_pd, - newxprt->sc_cm_id->device, newxprt->sc_pd->device, + dev, newxprt->sc_pd->device, qp_attr.cap.max_send_wr, qp_attr.cap.max_recv_wr, qp_attr.cap.max_send_sge, @@ -1014,9 +1144,9 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) * of an RDMA_READ. IB does not. 
*/ newxprt->sc_reader = rdma_read_chunk_lcl; - if (devattr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { + if (dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { newxprt->sc_frmr_pg_list_len = - devattr.max_fast_reg_page_list_len; + dev->attrs.max_fast_reg_page_list_len; newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG; newxprt->sc_reader = rdma_read_chunk_frmr; } @@ -1024,44 +1154,16 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) /* * Determine if a DMA MR is required and if so, what privs are required */ - if (!rdma_protocol_iwarp(newxprt->sc_cm_id->device, - newxprt->sc_cm_id->port_num) && - !rdma_ib_or_roce(newxprt->sc_cm_id->device, - newxprt->sc_cm_id->port_num)) + if (!rdma_protocol_iwarp(dev, newxprt->sc_cm_id->port_num) && + !rdma_ib_or_roce(dev, newxprt->sc_cm_id->port_num)) goto errout; - if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG) || - !(devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) { - need_dma_mr = 1; - dma_mr_acc = IB_ACCESS_LOCAL_WRITE; - if (rdma_protocol_iwarp(newxprt->sc_cm_id->device, - newxprt->sc_cm_id->port_num) && - !(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)) - dma_mr_acc |= IB_ACCESS_REMOTE_WRITE; - } - - if (rdma_protocol_iwarp(newxprt->sc_cm_id->device, - newxprt->sc_cm_id->port_num)) + if (rdma_protocol_iwarp(dev, newxprt->sc_cm_id->port_num)) newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_READ_W_INV; - /* Create the DMA MR if needed, otherwise, use the DMA LKEY */ - if (need_dma_mr) { - /* Register all of physical memory */ - newxprt->sc_phys_mr = - ib_get_dma_mr(newxprt->sc_pd, dma_mr_acc); - if (IS_ERR(newxprt->sc_phys_mr)) { - dprintk("svcrdma: Failed to create DMA MR ret=%d\n", - ret); - goto errout; - } - newxprt->sc_dma_lkey = newxprt->sc_phys_mr->lkey; - } else - newxprt->sc_dma_lkey = - newxprt->sc_cm_id->device->local_dma_lkey; - /* Post receive buffers */ - for (i = 0; i < newxprt->sc_max_requests; i++) { - ret = svc_rdma_post_recv(newxprt); + for (i = 0; i < newxprt->sc_rq_depth; i++) { + ret = svc_rdma_post_recv(newxprt, GFP_KERNEL); if (ret) { dprintk("svcrdma: failure posting receive buffers\n"); goto errout; @@ -1160,12 +1262,14 @@ static void __svc_rdma_free(struct work_struct *work) { struct svcxprt_rdma *rdma = container_of(work, struct svcxprt_rdma, sc_work); - dprintk("svcrdma: svc_rdma_free(%p)\n", rdma); + struct svc_xprt *xprt = &rdma->sc_xprt; + + dprintk("svcrdma: %s(%p)\n", __func__, rdma); /* We should only be called from kref_put */ - if (atomic_read(&rdma->sc_xprt.xpt_ref.refcount) != 0) + if (atomic_read(&xprt->xpt_ref.refcount) != 0) pr_err("svcrdma: sc_xprt still in use? (%d)\n", - atomic_read(&rdma->sc_xprt.xpt_ref.refcount)); + atomic_read(&xprt->xpt_ref.refcount)); /* * Destroy queued, but not processed read completions. Note @@ -1193,15 +1297,22 @@ static void __svc_rdma_free(struct work_struct *work) } /* Warn if we leaked a resource or under-referenced */ - if (atomic_read(&rdma->sc_ctxt_used) != 0) + if (rdma->sc_ctxt_used != 0) pr_err("svcrdma: ctxt still in use? (%d)\n", - atomic_read(&rdma->sc_ctxt_used)); + rdma->sc_ctxt_used); if (atomic_read(&rdma->sc_dma_used) != 0) pr_err("svcrdma: dma still in use? 
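/*
 * Aside (illustrative, not part of the patch): the hunks above delete the
 * all-of-physical-memory DMA MR. Since ib_alloc_pd() provides a
 * local_dma_lkey on every PD, each DMA-mapped SGE is keyed the same way
 * and the per-xprt sc_dma_lkey/sc_phys_mr bookkeeping disappears. The
 * resulting pattern, with stub types:
 */
struct sge { unsigned long long addr; unsigned int length; unsigned int lkey; };
struct pd  { unsigned int local_dma_lkey; };

static void fill_sge(struct sge *sge, const struct pd *pd,
		     unsigned long long dma_addr, unsigned int len)
{
	sge->addr   = dma_addr;
	sge->length = len;
	sge->lkey   = pd->local_dma_lkey;  /* was: per-xprt sc_dma_lkey */
}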
(%d)\n", atomic_read(&rdma->sc_dma_used)); - /* De-allocate fastreg mr */ + /* Final put of backchannel client transport */ + if (xprt->xpt_bc_xprt) { + xprt_put(xprt->xpt_bc_xprt); + xprt->xpt_bc_xprt = NULL; + } + rdma_dealloc_frmr_q(rdma); + svc_rdma_destroy_ctxts(rdma); + svc_rdma_destroy_maps(rdma); /* Destroy the QP if present (not a listener) */ if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) @@ -1213,9 +1324,6 @@ static void __svc_rdma_free(struct work_struct *work) if (rdma->sc_rq_cq && !IS_ERR(rdma->sc_rq_cq)) ib_destroy_cq(rdma->sc_rq_cq); - if (rdma->sc_phys_mr && !IS_ERR(rdma->sc_phys_mr)) - ib_dereg_mr(rdma->sc_phys_mr); - if (rdma->sc_pd && !IS_ERR(rdma->sc_pd)) ib_dealloc_pd(rdma->sc_pd); @@ -1321,7 +1429,9 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, int length; int ret; - p = alloc_page(GFP_KERNEL | __GFP_NOFAIL); + p = alloc_page(GFP_KERNEL); + if (!p) + return; va = page_address(p); /* XDR encode error */ @@ -1341,7 +1451,7 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, return; } atomic_inc(&xprt->sc_dma_used); - ctxt->sge[0].lkey = xprt->sc_dma_lkey; + ctxt->sge[0].lkey = xprt->sc_pd->local_dma_lkey; ctxt->sge[0].length = length; /* Prepare SEND WR */ diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 740bddcf3488..b1b009f10ea3 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -63,7 +63,7 @@ */ static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE; -static unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE; +unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE; static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE; static unsigned int xprt_rdma_inline_write_padding; static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR; @@ -143,12 +143,7 @@ static struct ctl_table sunrpc_table[] = { #endif -#define RPCRDMA_BIND_TO (60U * HZ) -#define RPCRDMA_INIT_REEST_TO (5U * HZ) -#define RPCRDMA_MAX_REEST_TO (30U * HZ) -#define RPCRDMA_IDLE_DISC_TO (5U * 60 * HZ) - -static struct rpc_xprt_ops xprt_rdma_procs; /* forward reference */ +static struct rpc_xprt_ops xprt_rdma_procs; /*forward reference */ static void xprt_rdma_format_addresses4(struct rpc_xprt *xprt, struct sockaddr *sap) @@ -174,7 +169,7 @@ xprt_rdma_format_addresses6(struct rpc_xprt *xprt, struct sockaddr *sap) xprt->address_strings[RPC_DISPLAY_NETID] = RPCBIND_NETID_RDMA6; } -static void +void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap) { char buf[128]; @@ -203,7 +198,7 @@ xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap) xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma"; } -static void +void xprt_rdma_free_addresses(struct rpc_xprt *xprt) { unsigned int i; @@ -499,7 +494,7 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size) if (req == NULL) return NULL; - flags = GFP_NOIO | __GFP_NOWARN; + flags = RPCRDMA_DEF_GFP; if (RPC_IS_SWAPPER(task)) flags = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN; @@ -642,7 +637,7 @@ drop_connection: return -ENOTCONN; /* implies disconnect */ } -static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) +void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); long idle_time = 0; @@ -743,6 +738,11 @@ void xprt_rdma_cleanup(void) rpcrdma_destroy_wq(); frwr_destroy_recovery_wq(); + + rc = xprt_unregister_transport(&xprt_rdma_bc); + if (rc) + dprintk("RPC: %s: 
xprt_unregister(bc) returned %i\n", + __func__, rc); } int xprt_rdma_init(void) @@ -766,6 +766,14 @@ int xprt_rdma_init(void) return rc; } + rc = xprt_register_transport(&xprt_rdma_bc); + if (rc) { + xprt_unregister_transport(&xprt_rdma); + rpcrdma_destroy_wq(); + frwr_destroy_recovery_wq(); + return rc; + } + dprintk("RPCRDMA Module Init, register RPC RDMA transport\n"); dprintk("Defaults:\n"); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 732c71ce5dca..878f1bfb1db9 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -462,7 +462,6 @@ int rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) { struct rpcrdma_ia *ia = &xprt->rx_ia; - struct ib_device_attr *devattr = &ia->ri_devattr; int rc; ia->ri_dma_mr = NULL; @@ -482,16 +481,10 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) goto out2; } - rc = ib_query_device(ia->ri_device, devattr); - if (rc) { - dprintk("RPC: %s: ib_query_device failed %d\n", - __func__, rc); - goto out3; - } - if (memreg == RPCRDMA_FRMR) { - if (!(devattr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) || - (devattr->max_fast_reg_page_list_len == 0)) { + if (!(ia->ri_device->attrs.device_cap_flags & + IB_DEVICE_MEM_MGT_EXTENSIONS) || + (ia->ri_device->attrs.max_fast_reg_page_list_len == 0)) { dprintk("RPC: %s: FRMR registration " "not supported by HCA\n", __func__); memreg = RPCRDMA_MTHCAFMR; @@ -566,24 +559,23 @@ int rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata) { - struct ib_device_attr *devattr = &ia->ri_devattr; struct ib_cq *sendcq, *recvcq; struct ib_cq_init_attr cq_attr = {}; unsigned int max_qp_wr; int rc, err; - if (devattr->max_sge < RPCRDMA_MAX_IOVS) { + if (ia->ri_device->attrs.max_sge < RPCRDMA_MAX_IOVS) { dprintk("RPC: %s: insufficient sge's available\n", __func__); return -ENOMEM; } - if (devattr->max_qp_wr <= RPCRDMA_BACKWARD_WRS) { + if (ia->ri_device->attrs.max_qp_wr <= RPCRDMA_BACKWARD_WRS) { dprintk("RPC: %s: insufficient wqe's available\n", __func__); return -ENOMEM; } - max_qp_wr = devattr->max_qp_wr - RPCRDMA_BACKWARD_WRS; + max_qp_wr = ia->ri_device->attrs.max_qp_wr - RPCRDMA_BACKWARD_WRS; /* check provider's send/recv wr limits */ if (cdata->max_requests > max_qp_wr) @@ -668,11 +660,11 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, /* Client offers RDMA Read but does not initiate */ ep->rep_remote_cma.initiator_depth = 0; - if (devattr->max_qp_rd_atom > 32) /* arbitrary but <= 255 */ + if (ia->ri_device->attrs.max_qp_rd_atom > 32) /* arbitrary but <= 255 */ ep->rep_remote_cma.responder_resources = 32; else ep->rep_remote_cma.responder_resources = - devattr->max_qp_rd_atom; + ia->ri_device->attrs.max_qp_rd_atom; ep->rep_remote_cma.retry_count = 7; ep->rep_remote_cma.flow_control = 0; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 728101ddc44b..38fe11b09875 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -55,6 +55,11 @@ #define RDMA_RESOLVE_TIMEOUT (5000) /* 5 seconds */ #define RDMA_CONNECT_RETRY_MAX (2) /* retries if no listener backlog */ +#define RPCRDMA_BIND_TO (60U * HZ) +#define RPCRDMA_INIT_REEST_TO (5U * HZ) +#define RPCRDMA_MAX_REEST_TO (30U * HZ) +#define RPCRDMA_IDLE_DISC_TO (5U * 60 * HZ) + /* * Interface Adapter -- one per transport instance */ @@ -68,7 +73,6 @@ struct rpcrdma_ia { struct completion ri_done; int ri_async_rc; unsigned int ri_max_frmr_depth; - 
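/*
 * Aside (illustrative, not part of the patch): xprt_rdma_init() and
 * xprt_rdma_cleanup() above now register and unregister the backchannel
 * transport class as a pair, unwinding the forward transport (and, in the
 * real code, the workqueues) if the second registration fails. Stub-typed
 * shape of that pairing:
 */
struct xprt_class_stub;
int  register_transport(struct xprt_class_stub *c);
void unregister_transport(struct xprt_class_stub *c);

static int init_transports(struct xprt_class_stub *fwd,
			   struct xprt_class_stub *bc)
{
	int rc = register_transport(fwd);

	if (rc)
		return rc;
	rc = register_transport(bc);
	if (rc)
		unregister_transport(fwd);  /* unwind on failure */
	return rc;
}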
struct ib_device_attr ri_devattr; struct ib_qp_attr ri_qp_attr; struct ib_qp_init_attr ri_qp_init_attr; }; @@ -142,6 +146,8 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb) return (struct rpcrdma_msg *)rb->rg_base; } +#define RPCRDMA_DEF_GFP (GFP_NOIO | __GFP_NOWARN) + /* * struct rpcrdma_rep -- this structure encapsulates state required to recv * and complete a reply, asychronously. It needs several pieces of @@ -309,6 +315,8 @@ struct rpcrdma_buffer { u32 rb_bc_srv_max_requests; spinlock_t rb_reqslock; /* protect rb_allreqs */ struct list_head rb_allreqs; + + u32 rb_bc_max_requests; }; #define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia) @@ -516,6 +524,10 @@ int rpcrdma_marshal_req(struct rpc_rqst *); /* RPC/RDMA module init - xprtrdma/transport.c */ +extern unsigned int xprt_rdma_max_inline_read; +void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap); +void xprt_rdma_free_addresses(struct rpc_xprt *xprt); +void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq); int xprt_rdma_init(void); void xprt_rdma_cleanup(void); @@ -531,11 +543,6 @@ void xprt_rdma_bc_free_rqst(struct rpc_rqst *); void xprt_rdma_bc_destroy(struct rpc_xprt *, unsigned int); #endif /* CONFIG_SUNRPC_BACKCHANNEL */ -/* Temporary NFS request map cache. Created in svc_rdma.c */ -extern struct kmem_cache *svc_rdma_map_cachep; -/* WR context cache. Created in svc_rdma.c */ -extern struct kmem_cache *svc_rdma_ctxt_cachep; -/* Workqueue created in svc_rdma.c */ -extern struct workqueue_struct *svc_rdma_wq; +extern struct xprt_class xprt_rdma_bc; #endif /* _LINUX_SUNRPC_XPRT_RDMA_H */ diff --git a/security/inode.c b/security/inode.c index 16622aef9bde..28414b0207ce 100644 --- a/security/inode.c +++ b/security/inode.c @@ -99,7 +99,7 @@ struct dentry *securityfs_create_file(const char *name, umode_t mode, dir = d_inode(parent); - mutex_lock(&dir->i_mutex); + inode_lock(dir); dentry = lookup_one_len(name, parent, strlen(name)); if (IS_ERR(dentry)) goto out; @@ -129,14 +129,14 @@ struct dentry *securityfs_create_file(const char *name, umode_t mode, } d_instantiate(dentry, inode); dget(dentry); - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); return dentry; out1: dput(dentry); dentry = ERR_PTR(error); out: - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); simple_release_fs(&mount, &mount_count); return dentry; } @@ -195,7 +195,7 @@ void securityfs_remove(struct dentry *dentry) if (!parent || d_really_is_negative(parent)) return; - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); if (simple_positive(dentry)) { if (d_is_dir(dentry)) simple_rmdir(d_inode(parent), dentry); @@ -203,7 +203,7 @@ void securityfs_remove(struct dentry *dentry) simple_unlink(d_inode(parent), dentry); dput(dentry); } - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); simple_release_fs(&mount, &mount_count); } EXPORT_SYMBOL_GPL(securityfs_remove); diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index c21f09bf8b99..9d96551d0196 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -121,7 +121,7 @@ static void ima_check_last_writer(struct integrity_iint_cache *iint, if (!(mode & FMODE_WRITE)) return; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (atomic_read(&inode->i_writecount) == 1) { if ((iint->version != inode->i_version) || (iint->flags & IMA_NEW_FILE)) { @@ -130,7 +130,7 @@ static void ima_check_last_writer(struct integrity_iint_cache *iint, ima_update_xattr(iint, 
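
The securityfs hunks above (and the IMA/SELinux ones that follow) convert open-coded mutex_lock(&inode->i_mutex) calls to inode_lock()/inode_unlock(). In this kernel series the wrapper is, as far as the conversion suggests, a thin static inline over i_mutex; the point is that call sites name the operation rather than the lock field, so the lock's representation can change later without touching every caller. A userspace sketch of that accessor idea, with a toy struct inode built on POSIX mutexes:

    /* Accessor wrappers hide the lock representation from callers. */
    #include <pthread.h>
    #include <stdio.h>

    struct inode {
        pthread_mutex_t i_mutex;   /* representation detail */
        long i_size;
    };

    static inline void inode_lock(struct inode *inode)
    {
        pthread_mutex_lock(&inode->i_mutex);
    }

    static inline void inode_unlock(struct inode *inode)
    {
        pthread_mutex_unlock(&inode->i_mutex);
    }

    int main(void)
    {
        struct inode ino = { .i_mutex = PTHREAD_MUTEX_INITIALIZER };

        inode_lock(&ino);          /* call sites never mention i_mutex */
        ino.i_size = 4096;
        inode_unlock(&ino);
        printf("size=%ld\n", ino.i_size);
        return 0;
    }
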
file); } } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } /** @@ -186,7 +186,7 @@ static int process_measurement(struct file *file, int mask, int function, if (action & IMA_FILE_APPRAISE) function = FILE_CHECK; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (action) { iint = integrity_inode_get(inode); @@ -250,7 +250,7 @@ out_free: if (pathbuf) __putname(pathbuf); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if ((rc && must_appraise) && (ima_appraise & IMA_APPRAISE_ENFORCE)) return -EACCES; return 0; diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index 732c1c77dccd..1b1fd27de632 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -380,9 +380,9 @@ static int sel_open_policy(struct inode *inode, struct file *filp) goto err; if (i_size_read(inode) != security_policydb_len()) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); i_size_write(inode, security_policydb_len()); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } rc = security_read_policy(&plm->data, &plm->len); diff --git a/sound/core/control.c b/sound/core/control.c index 196a6fe100ca..a85d45595d02 100644 --- a/sound/core/control.c +++ b/sound/core/control.c @@ -1405,6 +1405,8 @@ static int snd_ctl_tlv_ioctl(struct snd_ctl_file *file, return -EFAULT; if (tlv.length < sizeof(unsigned int) * 2) return -EINVAL; + if (!tlv.numid) + return -EINVAL; down_read(&card->controls_rwsem); kctl = snd_ctl_find_numid(card, tlv.numid); if (kctl == NULL) { diff --git a/sound/core/hrtimer.c b/sound/core/hrtimer.c index f845ecf7e172..656d9a9032dc 100644 --- a/sound/core/hrtimer.c +++ b/sound/core/hrtimer.c @@ -90,7 +90,7 @@ static int snd_hrtimer_start(struct snd_timer *t) struct snd_hrtimer *stime = t->private_data; atomic_set(&stime->running, 0); - hrtimer_cancel(&stime->hrt); + hrtimer_try_to_cancel(&stime->hrt); hrtimer_start(&stime->hrt, ns_to_ktime(t->sticks * resolution), HRTIMER_MODE_REL); atomic_set(&stime->running, 1); @@ -101,6 +101,7 @@ static int snd_hrtimer_stop(struct snd_timer *t) { struct snd_hrtimer *stime = t->private_data; atomic_set(&stime->running, 0); + hrtimer_try_to_cancel(&stime->hrt); return 0; } diff --git a/sound/core/pcm_compat.c b/sound/core/pcm_compat.c index b48b434444ed..9630e9f72b7b 100644 --- a/sound/core/pcm_compat.c +++ b/sound/core/pcm_compat.c @@ -255,10 +255,15 @@ static int snd_pcm_ioctl_hw_params_compat(struct snd_pcm_substream *substream, if (! 
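
The snd_ctl_tlv_ioctl() hunk above rejects a user-supplied numid of 0 before taking the controls lock and searching; 0 appears to denote an unassigned id in this interface, so validating early avoids a pointless (and previously mishandled) lookup. A toy sketch of the validate-before-search pattern; the table and lookup below are stand-ins, not ALSA structures.

    /* Early validation of a user-supplied id before table lookup. */
    #include <stdio.h>
    #include <errno.h>

    struct kctl { unsigned int numid; const char *name; };

    static const struct kctl table[] = {
        { 1, "Master" }, { 2, "PCM" },
    };

    static int find_ctl(unsigned int numid, const struct kctl **out)
    {
        size_t i;

        if (!numid)                /* 0: never a valid assigned id */
            return -EINVAL;
        for (i = 0; i < sizeof(table) / sizeof(table[0]); i++) {
            if (table[i].numid == numid) {
                *out = &table[i];
                return 0;
            }
        }
        return -ENOENT;
    }

    int main(void)
    {
        const struct kctl *k;

        printf("numid 0 -> %d\n", find_ctl(0, &k));   /* -EINVAL, no search */
        printf("numid 2 -> %d\n", find_ctl(2, &k));   /* found */
        return 0;
    }
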
(runtime = substream->runtime)) return -ENOTTY; - /* only fifo_size is different, so just copy all */ - data = memdup_user(data32, sizeof(*data32)); - if (IS_ERR(data)) - return PTR_ERR(data); + data = kmalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + /* only fifo_size (RO from userspace) is different, so just copy all */ + if (copy_from_user(data, data32, sizeof(*data32))) { + err = -EFAULT; + goto error; + } if (refine) err = snd_pcm_hw_refine(substream, data); diff --git a/sound/core/seq/seq_compat.c b/sound/core/seq/seq_compat.c index 81f7c109dc46..65175902a68a 100644 --- a/sound/core/seq/seq_compat.c +++ b/sound/core/seq/seq_compat.c @@ -49,11 +49,12 @@ static int snd_seq_call_port_info_ioctl(struct snd_seq_client *client, unsigned struct snd_seq_port_info *data; mm_segment_t fs; - data = memdup_user(data32, sizeof(*data32)); - if (IS_ERR(data)) - return PTR_ERR(data); + data = kmalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; - if (get_user(data->flags, &data32->flags) || + if (copy_from_user(data, data32, sizeof(*data32)) || + get_user(data->flags, &data32->flags) || get_user(data->time_queue, &data32->time_queue)) goto error; data->kernel = NULL; diff --git a/sound/core/timer.c b/sound/core/timer.c index cb25aded5349..af1f68f7e315 100644 --- a/sound/core/timer.c +++ b/sound/core/timer.c @@ -65,6 +65,7 @@ struct snd_timer_user { int qtail; int qused; int queue_size; + bool disconnected; struct snd_timer_read *queue; struct snd_timer_tread *tqueue; spinlock_t qlock; @@ -290,6 +291,9 @@ int snd_timer_open(struct snd_timer_instance **ti, mutex_unlock(®ister_mutex); return -ENOMEM; } + /* take a card refcount for safe disconnection */ + if (timer->card) + get_device(&timer->card->card_dev); timeri->slave_class = tid->dev_sclass; timeri->slave_id = slave_id; if (list_empty(&timer->open_list_head) && timer->hw.open) @@ -359,6 +363,9 @@ int snd_timer_close(struct snd_timer_instance *timeri) } spin_unlock(&timer->lock); spin_unlock_irq(&slave_active_lock); + /* release a card refcount for safe disconnection */ + if (timer->card) + put_device(&timer->card->card_dev); mutex_unlock(®ister_mutex); } out: @@ -474,6 +481,8 @@ int snd_timer_start(struct snd_timer_instance *timeri, unsigned int ticks) timer = timeri->timer; if (timer == NULL) return -EINVAL; + if (timer->card && timer->card->shutdown) + return -ENODEV; spin_lock_irqsave(&timer->lock, flags); timeri->ticks = timeri->cticks = ticks; timeri->pticks = 0; @@ -505,6 +514,10 @@ static int _snd_timer_stop(struct snd_timer_instance *timeri, int event) spin_lock_irqsave(&timer->lock, flags); list_del_init(&timeri->ack_list); list_del_init(&timeri->active_list); + if (timer->card && timer->card->shutdown) { + spin_unlock_irqrestore(&timer->lock, flags); + return 0; + } if ((timeri->flags & SNDRV_TIMER_IFLG_RUNNING) && !(--timer->running)) { timer->hw.stop(timer); @@ -565,6 +578,8 @@ int snd_timer_continue(struct snd_timer_instance *timeri) timer = timeri->timer; if (! 
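
The snd_timer_open()/snd_timer_close() hunks above pin the owning card with get_device() for the lifetime of the handle, so a disconnect cannot free the card while a timer instance still references it. A minimal refcount sketch of the pin-the-parent pattern; the types mimic, but are not, the ALSA structures.

    /* Pin a parent object across the lifetime of a child handle. */
    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct card {
        atomic_int refs;
    };

    static struct card *card_get(struct card *c)
    {
        atomic_fetch_add(&c->refs, 1);
        return c;
    }

    static void card_put(struct card *c)
    {
        if (atomic_fetch_sub(&c->refs, 1) == 1) {
            puts("last ref dropped, freeing card");
            free(c);
        }
    }

    struct timer_handle {
        struct card *card;         /* pinned while the handle is open */
    };

    int main(void)
    {
        struct card *c = calloc(1, sizeof(*c));
        struct timer_handle h;

        atomic_store(&c->refs, 1); /* creator's reference */
        h.card = card_get(c);      /* open: pin the card */
        card_put(c);               /* disconnect drops the creator's ref... */
        card_put(h.card);          /* ...but the card lives until close */
        return 0;
    }
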
timer) return -EINVAL; + if (timer->card && timer->card->shutdown) + return -ENODEV; spin_lock_irqsave(&timer->lock, flags); if (!timeri->cticks) timeri->cticks = 1; @@ -628,6 +643,9 @@ static void snd_timer_tasklet(unsigned long arg) unsigned long resolution, ticks; unsigned long flags; + if (timer->card && timer->card->shutdown) + return; + spin_lock_irqsave(&timer->lock, flags); /* now process all callbacks */ while (!list_empty(&timer->sack_list_head)) { @@ -668,6 +686,9 @@ void snd_timer_interrupt(struct snd_timer * timer, unsigned long ticks_left) if (timer == NULL) return; + if (timer->card && timer->card->shutdown) + return; + spin_lock_irqsave(&timer->lock, flags); /* remember the current resolution */ @@ -881,8 +902,15 @@ static int snd_timer_dev_register(struct snd_device *dev) static int snd_timer_dev_disconnect(struct snd_device *device) { struct snd_timer *timer = device->device_data; + struct snd_timer_instance *ti; + mutex_lock(®ister_mutex); list_del_init(&timer->device_list); + /* wake up pending sleepers */ + list_for_each_entry(ti, &timer->open_list_head, open_list) { + if (ti->disconnect) + ti->disconnect(ti); + } mutex_unlock(®ister_mutex); return 0; } @@ -893,6 +921,8 @@ void snd_timer_notify(struct snd_timer *timer, int event, struct timespec *tstam unsigned long resolution = 0; struct snd_timer_instance *ti, *ts; + if (timer->card && timer->card->shutdown) + return; if (! (timer->hw.flags & SNDRV_TIMER_HW_SLAVE)) return; if (snd_BUG_ON(event < SNDRV_TIMER_EVENT_MSTART || @@ -1051,6 +1081,8 @@ static void snd_timer_proc_read(struct snd_info_entry *entry, mutex_lock(®ister_mutex); list_for_each_entry(timer, &snd_timer_list, device_list) { + if (timer->card && timer->card->shutdown) + continue; switch (timer->tmr_class) { case SNDRV_TIMER_CLASS_GLOBAL: snd_iprintf(buffer, "G%i: ", timer->tmr_device); @@ -1185,6 +1217,14 @@ static void snd_timer_user_ccallback(struct snd_timer_instance *timeri, wake_up(&tu->qchange_sleep); } +static void snd_timer_user_disconnect(struct snd_timer_instance *timeri) +{ + struct snd_timer_user *tu = timeri->callback_data; + + tu->disconnected = true; + wake_up(&tu->qchange_sleep); +} + static void snd_timer_user_tinterrupt(struct snd_timer_instance *timeri, unsigned long resolution, unsigned long ticks) @@ -1558,6 +1598,7 @@ static int snd_timer_user_tselect(struct file *file, ? 
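
The snd_timer_start()/_snd_timer_stop()/interrupt hunks above all gain the same guard: every entry point re-checks a shutdown flag set at disconnect time and fails fast with -ENODEV rather than touching hardware that is gone. A compact sketch of that fail-fast guard, with toy types in place of the kernel's:

    /* Fail-fast shutdown guard at every entry point. */
    #include <stdbool.h>
    #include <stdio.h>
    #include <errno.h>

    struct card  { bool shutdown; };
    struct timer { struct card *card; int running; };

    static int timer_start(struct timer *t)
    {
        if (t->card && t->card->shutdown)
            return -ENODEV;        /* device is gone: don't poke hardware */
        t->running = 1;
        return 0;
    }

    static void card_disconnect(struct card *c)
    {
        c->shutdown = true;        /* all later entry points bail out */
    }

    int main(void)
    {
        struct card c = { .shutdown = false };
        struct timer t = { .card = &c };

        printf("start before disconnect: %d\n", timer_start(&t));
        card_disconnect(&c);
        printf("start after disconnect:  %d\n", timer_start(&t));
        return 0;
    }
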
snd_timer_user_tinterrupt : snd_timer_user_interrupt; tu->timeri->ccallback = snd_timer_user_ccallback; tu->timeri->callback_data = (void *)tu; + tu->timeri->disconnect = snd_timer_user_disconnect; } __err: @@ -1876,6 +1917,10 @@ static ssize_t snd_timer_user_read(struct file *file, char __user *buffer, remove_wait_queue(&tu->qchange_sleep, &wait); + if (tu->disconnected) { + err = -ENODEV; + break; + } if (signal_pending(current)) { err = -ERESTARTSYS; break; @@ -1925,6 +1970,8 @@ static unsigned int snd_timer_user_poll(struct file *file, poll_table * wait) mask = 0; if (tu->qused) mask |= POLLIN | POLLRDNORM; + if (tu->disconnected) + mask |= POLLERR; return mask; } diff --git a/sound/hda/hdac_i915.c b/sound/hda/hdac_i915.c index c50177fb469f..f6854dbd7d8d 100644 --- a/sound/hda/hdac_i915.c +++ b/sound/hda/hdac_i915.c @@ -306,7 +306,7 @@ out_master_del: out_err: kfree(acomp); bus->audio_component = NULL; - dev_err(dev, "failed to add i915 component master (%d)\n", ret); + dev_info(dev, "failed to add i915 component master (%d)\n", ret); return ret; } diff --git a/sound/pci/hda/hda_bind.c b/sound/pci/hda/hda_bind.c index 70671ad65d24..6efadbfb3fe3 100644 --- a/sound/pci/hda/hda_bind.c +++ b/sound/pci/hda/hda_bind.c @@ -174,14 +174,40 @@ static inline bool codec_probed(struct hda_codec *codec) return device_attach(hda_codec_dev(codec)) > 0 && codec->preset; } -/* try to auto-load and bind the codec module */ -static void codec_bind_module(struct hda_codec *codec) +/* try to auto-load codec module */ +static void request_codec_module(struct hda_codec *codec) { #ifdef MODULE char modalias[32]; + const char *mod = NULL; + + switch (codec->probe_id) { + case HDA_CODEC_ID_GENERIC_HDMI: +#if IS_MODULE(CONFIG_SND_HDA_CODEC_HDMI) + mod = "snd-hda-codec-hdmi"; +#endif + break; + case HDA_CODEC_ID_GENERIC: +#if IS_MODULE(CONFIG_SND_HDA_GENERIC) + mod = "snd-hda-codec-generic"; +#endif + break; + default: + snd_hdac_codec_modalias(&codec->core, modalias, sizeof(modalias)); + mod = modalias; + break; + } + + if (mod) + request_module(mod); +#endif /* MODULE */ +} - snd_hdac_codec_modalias(&codec->core, modalias, sizeof(modalias)); - request_module(modalias); +/* try to auto-load and bind the codec module */ +static void codec_bind_module(struct hda_codec *codec) +{ +#ifdef MODULE + request_codec_module(codec); if (codec_probed(codec)) return; #endif @@ -218,17 +244,13 @@ static int codec_bind_generic(struct hda_codec *codec) if (is_likely_hdmi_codec(codec)) { codec->probe_id = HDA_CODEC_ID_GENERIC_HDMI; -#if IS_MODULE(CONFIG_SND_HDA_CODEC_HDMI) - request_module("snd-hda-codec-hdmi"); -#endif + request_codec_module(codec); if (codec_probed(codec)) return 0; } codec->probe_id = HDA_CODEC_ID_GENERIC; -#if IS_MODULE(CONFIG_SND_HDA_GENERIC) - request_module("snd-hda-codec-generic"); -#endif + request_codec_module(codec); if (codec_probed(codec)) return 0; return -ENODEV; diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c index c0bef11afa7e..256e6cda218f 100644 --- a/sound/pci/hda/hda_intel.c +++ b/sound/pci/hda/hda_intel.c @@ -2078,9 +2078,11 @@ static int azx_probe_continue(struct azx *chip) * for other chips, still continue probing as other * codecs can be on the same link. 
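
The snd_timer_user_read()/poll() hunks above make blocked readers wake and error out with -ENODEV (and poll report POLLERR) when the device disappears, via the disconnect callback that broadcasts on the same wait queue the readers sleep on. A sketch of that wake-on-disconnect loop, using POSIX condition variables in place of the kernel wait-queue machinery:

    /* Reader sleeps for data but is also woken by disconnect. */
    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>
    #include <errno.h>
    #include <unistd.h>

    static pthread_mutex_t lock   = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  change = PTHREAD_COND_INITIALIZER;
    static int  queued;
    static bool disconnected;

    static int timer_read(void)
    {
        int ret;

        pthread_mutex_lock(&lock);
        while (!queued && !disconnected)
            pthread_cond_wait(&change, &lock);
        if (disconnected)
            ret = -ENODEV;         /* mirrors the read() hunk above */
        else
            ret = queued--;
        pthread_mutex_unlock(&lock);
        return ret;
    }

    static void *disconnect(void *arg)
    {
        (void)arg;
        sleep(1);
        pthread_mutex_lock(&lock);
        disconnected = true;
        pthread_cond_broadcast(&change);   /* wake pending sleepers */
        pthread_mutex_unlock(&lock);
        return NULL;
    }

    int main(void)
    {
        pthread_t t;

        pthread_create(&t, NULL, disconnect, NULL);
        printf("read returned %d\n", timer_read());   /* -ENODEV */
        pthread_join(t, NULL);
        return 0;
    }
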
*/ - if (CONTROLLER_IN_GPU(pci)) + if (CONTROLLER_IN_GPU(pci)) { + dev_err(chip->card->dev, + "HSW/BDW HD-audio HDMI/DP requires binding with gfx driver\n"); goto out_free; - else + } else goto skip_i915; } @@ -2149,9 +2151,17 @@ i915_power_fail: static void azx_remove(struct pci_dev *pci) { struct snd_card *card = pci_get_drvdata(pci); + struct azx *chip; + struct hda_intel *hda; + + if (card) { + /* flush the pending probing work */ + chip = card->private_data; + hda = container_of(chip, struct hda_intel, chip); + flush_work(&hda->probe_work); - if (card) snd_card_free(card); + } } static void azx_shutdown(struct pci_dev *pci) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 8143c0e24a27..33753244f48f 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -6566,6 +6566,7 @@ static const struct snd_pci_quirk alc662_fixup_tbl[] = { SND_PCI_QUIRK(0x1028, 0x069f, "Dell", ALC668_FIXUP_DELL_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x103c, 0x1632, "HP RP5800", ALC662_FIXUP_HP_RP5800), SND_PCI_QUIRK(0x1043, 0x11cd, "Asus N550", ALC662_FIXUP_BASS_1A), + SND_PCI_QUIRK(0x1043, 0x13df, "Asus N550JX", ALC662_FIXUP_BASS_1A), SND_PCI_QUIRK(0x1043, 0x1477, "ASUS N56VZ", ALC662_FIXUP_BASS_MODE4_CHMAP), SND_PCI_QUIRK(0x1043, 0x15a7, "ASUS UX51VZH", ALC662_FIXUP_BASS_16), SND_PCI_QUIRK(0x1043, 0x1b73, "ASUS N55SF", ALC662_FIXUP_BASS_16), diff --git a/sound/spi/at73c213.c b/sound/spi/at73c213.c index 39522367897c..fac7e6eb9529 100644 --- a/sound/spi/at73c213.c +++ b/sound/spi/at73c213.c @@ -221,6 +221,8 @@ static int snd_at73c213_pcm_open(struct snd_pcm_substream *substream) runtime->hw = snd_at73c213_playback_hw; chip->substream = substream; + clk_enable(chip->ssc->clk); + return 0; } @@ -228,6 +230,7 @@ static int snd_at73c213_pcm_close(struct snd_pcm_substream *substream) { struct snd_at73c213 *chip = snd_pcm_substream_chip(substream); chip->substream = NULL; + clk_disable(chip->ssc->clk); return 0; } @@ -897,6 +900,8 @@ static int snd_at73c213_dev_init(struct snd_card *card, chip->card = card; chip->irq = -1; + clk_enable(chip->ssc->clk); + retval = request_irq(irq, snd_at73c213_interrupt, 0, "at73c213", chip); if (retval) { dev_dbg(&chip->spi->dev, "unable to request irq %d\n", irq); @@ -935,6 +940,8 @@ out_irq: free_irq(chip->irq, chip); chip->irq = -1; out: + clk_disable(chip->ssc->clk); + return retval; } @@ -1012,7 +1019,9 @@ static int snd_at73c213_remove(struct spi_device *spi) int retval; /* Stop playback. */ + clk_enable(chip->ssc->clk); ssc_writel(chip->ssc->regs, CR, SSC_BIT(CR_TXDIS)); + clk_disable(chip->ssc->clk); /* Mute sound. 
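
The azx_remove() hunk above flushes the pending probe work before snd_card_free(), closing the race between deferred probing and device removal: the asynchronous work must finish (or be flushed) before the objects it touches are freed. A sketch of that flush-before-free ordering, with a plain thread standing in for the kernel workqueue:

    /* Flush deferred work before tearing down what it uses. */
    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>

    struct chip {
        pthread_t probe_work;
        int probed;
    };

    static void *probe_work_fn(void *arg)
    {
        struct chip *c = arg;

        sleep(1);                  /* slow, deferred probing */
        c->probed = 1;
        return NULL;
    }

    int main(void)
    {
        struct chip *c = calloc(1, sizeof(*c));

        pthread_create(&c->probe_work, NULL, probe_work_fn, c);

        /* remove(): flush the pending probe work first ... */
        pthread_join(c->probe_work, NULL);
        printf("probed=%d, now safe to free\n", c->probed);

        /* ... and only then tear the object down. */
        free(c);
        return 0;
    }
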
*/ retval = snd_at73c213_write_reg(chip, DAC_LMPG, 0x3f); @@ -1080,6 +1089,7 @@ static int snd_at73c213_suspend(struct device *dev) struct snd_at73c213 *chip = card->private_data; ssc_writel(chip->ssc->regs, CR, SSC_BIT(CR_TXDIS)); + clk_disable(chip->ssc->clk); clk_disable(chip->board->dac_clk); return 0; @@ -1091,6 +1101,7 @@ static int snd_at73c213_resume(struct device *dev) struct snd_at73c213 *chip = card->private_data; clk_enable(chip->board->dac_clk); + clk_enable(chip->ssc->clk); ssc_writel(chip->ssc->regs, CR, SSC_BIT(CR_TXEN)); return 0; diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c index ea69ce35e902..c3bd294a63d1 100644 --- a/tools/lib/traceevent/event-parse.c +++ b/tools/lib/traceevent/event-parse.c @@ -3746,7 +3746,7 @@ static const struct flag flags[] = { { "NET_TX_SOFTIRQ", 2 }, { "NET_RX_SOFTIRQ", 3 }, { "BLOCK_SOFTIRQ", 4 }, - { "BLOCK_IOPOLL_SOFTIRQ", 5 }, + { "IRQ_POLL_SOFTIRQ", 5 }, { "TASKLET_SOFTIRQ", 6 }, { "SCHED_SOFTIRQ", 7 }, { "HRTIMER_SOFTIRQ", 8 }, diff --git a/tools/perf/util/trace-event-parse.c b/tools/perf/util/trace-event-parse.c index 8ff7d620d942..33b52eaa39db 100644 --- a/tools/perf/util/trace-event-parse.c +++ b/tools/perf/util/trace-event-parse.c @@ -209,7 +209,7 @@ static const struct flag flags[] = { { "NET_TX_SOFTIRQ", 2 }, { "NET_RX_SOFTIRQ", 3 }, { "BLOCK_SOFTIRQ", 4 }, - { "BLOCK_IOPOLL_SOFTIRQ", 5 }, + { "IRQ_POLL_SOFTIRQ", 5 }, { "TASKLET_SOFTIRQ", 6 }, { "SCHED_SOFTIRQ", 7 }, { "HRTIMER_SOFTIRQ", 8 }, |
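
The traceevent and perf hunks above exist because those userspace decoders mirror the kernel's softirq enum by index: when the kernel renames vector 5 from BLOCK_IOPOLL_SOFTIRQ to IRQ_POLL_SOFTIRQ, the tables must be updated verbatim or traces decode with a stale name. A sketch of such an index-keyed name table and lookup (vectors 0 and 1 filled in from the kernel enum, not from the diff):

    /* Softirq vector-to-name table, kept in sync with the kernel enum. */
    #include <stdio.h>

    static const char * const softirq_names[] = {
        "HI_SOFTIRQ",        /* 0 */
        "TIMER_SOFTIRQ",     /* 1 */
        "NET_TX_SOFTIRQ",    /* 2 */
        "NET_RX_SOFTIRQ",    /* 3 */
        "BLOCK_SOFTIRQ",     /* 4 */
        "IRQ_POLL_SOFTIRQ",  /* 5: renamed from BLOCK_IOPOLL_SOFTIRQ */
        "TASKLET_SOFTIRQ",   /* 6 */
        "SCHED_SOFTIRQ",     /* 7 */
        "HRTIMER_SOFTIRQ",   /* 8 */
    };

    static const char *softirq_name(unsigned int vec)
    {
        if (vec >= sizeof(softirq_names) / sizeof(softirq_names[0]))
            return "UNKNOWN";
        return softirq_names[vec];
    }

    int main(void)
    {
        printf("vec 5 decodes as %s\n", softirq_name(5));
        return 0;
    }
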